123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627 |
- /*++
- Copyright (c) Alex Ionescu. All rights reserved.
- Module Name:
- lzmadec.c
- Abstract:
- This module implements the LZMA Decoding Logic responsible for decoding the
- three possible types of LZMA "packets": matches, repetitions (short \& long)
- and literals. The probability model for each type of packet is also stored
- in this file, along with the management of the previously seen packet types
- (which is tracked as the "sequence").
- Author:
- Alex Ionescu (@aionescu) 15-Apr-2020 - Initial version
- Environment:
- Windows & Linux, user mode and kernel mode.
- --*/
- #include "minlzlib.h"
- #include "lzmadec.h"
- //
- // Probability Bit Model for Lenghts in Rep and in Match sequences
- //
- typedef struct _LENGTH_DECODER_STATE
- {
- //
- // Bit Model for the choosing the type of length encoding
- //
- uint16_t Choice;
- uint16_t Choice2;
- //
- // Bit Model for each of the length encodings
- //
- uint16_t Low[LZMA_POSITION_COUNT][LZMA_MAX_LOW_LENGTH];
- uint16_t Mid[LZMA_POSITION_COUNT][LZMA_MAX_MID_LENGTH];
- uint16_t High[LZMA_MAX_HIGH_LENGTH];
- } LENGTH_DECODER_STATE, * PLENGTH_DECODER_STATE;
- //
- // State used for LZMA decoding
- //
- typedef struct _DECODER_STATE
- {
- //
- // Current type of sequence last decoded
- //
- LZMA_SEQUENCE_STATE Sequence;
- //
- // History of last 4 decoded distances
- //
- uint32_t Rep0;
- uint32_t Rep1;
- uint32_t Rep2;
- uint32_t Rep3;
- //
- // Pending length to repeat from dictionary
- //
- uint32_t Len;
- //
- // Probability Bit Models for all sequence types
- //
- union
- {
- struct
- {
- //
- // Literal model
- //
- uint16_t Literal[LZMA_LITERAL_CODERS][LZMA_LC_MODEL_SIZE];
- //
- // Last-used-distance based models
- //
- uint16_t Rep[LzmaMaxState];
- uint16_t Rep0[LzmaMaxState];
- uint16_t Rep0Long[LzmaMaxState][LZMA_POSITION_COUNT];
- uint16_t Rep1[LzmaMaxState];
- uint16_t Rep2[LzmaMaxState];
- LENGTH_DECODER_STATE RepLen;
- //
- // Explicit distance match based models
- //
- uint16_t Match[LzmaMaxState][LZMA_POSITION_COUNT];
- uint16_t DistSlot[LZMA_FIRST_CONTEXT_DISTANCE_SLOT][LZMA_DISTANCE_SLOTS];
- uint16_t Dist[(1 << 7) - LZMA_FIRST_FIXED_DISTANCE_SLOT];
- uint16_t Align[LZMA_DISTANCE_ALIGN_SLOTS];
- LENGTH_DECODER_STATE MatchLen;
- } BitModel;
- uint16_t RawProbabilities[LZMA_BIT_MODEL_SLOTS];
- } u;
- } DECODER_STATE, *PDECODER_STATE;
- DECODER_STATE Decoder;
- //
- // LZMA decoding uses 3 "properties" which determine how the probability
- // bit model will be laid out. These store the number of bits that are used
- // to pick the correct Literal Coder ("lc"), the number of Position bits to
- // select the Literal coder ("lp"), and the number of Position Bits used to
- // select various lengths ("pb"). In LZMA2, these properties are encoded in
- // a single byte with the formula: ((pb * 45) + lp * 9) + lc).
- //
- // We only support the default {lc = 3, lp = 0, pb = 2} properties, which
- // are what the main encoders out there use. This means that a total of 2
- // bits will be used for arithmetic-coded bit trees that are dependent on
- // the current position, and that a total of 3 bits will be used when we
- // pick the arithmetic-coded bit tree used for literal coding. The 0 means
- // this selection will _not_ be dependent on the position in the buffer.
- //
- const uint8_t k_LzSupportedProperties =
- (LZMA_PB * 45) + (LZMA_LP * 9) + (LZMA_LC);
- void
- LzSetLiteral (
- PLZMA_SEQUENCE_STATE State
- )
- {
- if (*State <= LzmaLitShortrepLitLitState)
- {
- //
- // States 0-3 represent packets with at least 2 back-to-back literals,
- // so another literal now takes us to state 0 (3 back-to-back literals)
- //
- *State = LzmaLitLitLitState;
- }
- else if (*State <= LzmaLitShortrepState)
- {
- //
- // States 4-6 represent packets with a literal at the end, so seeing
- // another literal now takes us to 2 back-to-back literals, which are
- // state packets 1-3.
- //
- // States 7-9 represent packets with a literal at the start, followed
- // by a match/rep/shortrep. Seeing another literal now drops this first
- // literal and takes us to having a literal at the end, which are state
- // packets 4-6 that we just described in the paragraph above.
- //
- *State = (LZMA_SEQUENCE_STATE)(*State - 3);
- }
- else
- {
- //
- // Finally, state 10 and 11 represent cases without a single literal in
- // the last 2 sequence packets, so seeing a literal now takes us to a
- // "literal at the end" state, either following a match or a rep.
- //
- *State = (LZMA_SEQUENCE_STATE)(*State - 6);
- }
- }
- bool
- LzIsLiteral (
- LZMA_SEQUENCE_STATE State
- )
- {
- //
- // States 0-6 describe literal packet sequences
- //
- return State < LzmaMaxLitState;
- }
- void
- LzSetMatch (
- PLZMA_SEQUENCE_STATE State
- )
- {
- //
- // Move to the appropriate "match" state based on current literal state
- //
- *State = LzIsLiteral(*State) ? LzmaLitMatchState : LzmaNonlitMatchState;
- }
- void
- LzSetLongRep (
- PLZMA_SEQUENCE_STATE State
- )
- {
- //
- // Move to the appropriate "long rep" state based on current literal state
- //
- *State = LzIsLiteral(*State) ? LzmaLitRepState : LzmaNonlitRepState;
- }
- void
- LzSetShortRep (
- PLZMA_SEQUENCE_STATE State
- )
- {
- //
- // Move to the appropriate "short rep" state based on current literal state
- //
- *State = LzIsLiteral(*State) ? LzmaLitShortrepState : LzmaNonlitRepState;
- }
- uint16_t*
- LzGetLiteralSlot (
- void
- )
- {
- uint8_t symbol;
- //
- // To pick the correct literal coder arithmetic-coded bit tree, LZMA uses
- // the "lc" parameter to choose the number of high bits from the previous
- // symbol (in the normal case, 3). It then combines that with the "lp"
- // parameter to choose the number of low bits from the current position in
- // the dictionary. However, since "lp" is normally 0, we can omit this.
- //
- symbol = DtGetSymbol(1);
- return Decoder.u.BitModel.Literal[symbol >> (8 - LZMA_LC)];
- }
- uint16_t*
- LzGetDistSlot (
- void
- )
- {
- uint8_t slotIndex;
- //
- // There are 4 different arithmetic-coded bit trees which are used to pick
- // the correct "distance slot" when doing match distance decoding. Each of
- // them is used based on the length of the symbol that is being repeated.
- // For lengths of 2, 3, 4 bytes, a dedicated set of distance slots is used.
- // For lengths of 5 bytes or above, a shared set of distance slots is used.
- //
- if (Decoder.Len < (LZMA_FIRST_CONTEXT_DISTANCE_SLOT + LZMA_MIN_LENGTH))
- {
- slotIndex = (uint8_t)(Decoder.Len - LZMA_MIN_LENGTH);
- }
- else
- {
- slotIndex = LZMA_FIRST_CONTEXT_DISTANCE_SLOT - 1;
- }
- return Decoder.u.BitModel.DistSlot[slotIndex];
- }
- void
- LzDecodeLiteral (
- void
- )
- {
- uint16_t* probArray;
- uint8_t symbol, matchByte;
- //
- // First, choose the correct arithmetic-coded bit tree (which is based on
- // the last symbol we just decoded), then see if we last decoded a literal.
- //
- // If so, simply get the symbol from the bit tree as normal. However, if
- // we didn't last see a literal, we need to read the "match byte" that is
- // "n" bytes away from the last decoded match. We previously stored this in
- // rep0.
- //
- // Based on this match byte, we'll then use 2 other potential bit trees,
- // see LzDecodeMatched for more information.
- //
- probArray = LzGetLiteralSlot();
- if (LzIsLiteral(Decoder.Sequence))
- {
- symbol = RcGetBitTree(probArray, (1 << 8));
- }
- else
- {
- matchByte = DtGetSymbol(Decoder.Rep0 + 1);
- symbol = RcDecodeMatchedBitTree(probArray, matchByte);
- }
- //
- // Write the symbol and indicate that the last sequence was a literal
- //
- DtPutSymbol(symbol);
- LzSetLiteral(&Decoder.Sequence);
- }
- void
- LzDecodeLen (
- PLENGTH_DECODER_STATE LenState,
- uint8_t PosBit
- )
- {
- uint16_t* probArray;
- uint16_t limit;
- //
- // Lenghts of 2 and higher are encoded in 3 possible types of arithmetic-
- // coded bit trees, depending on the size of the length.
- //
- // Lengths 2-9 are encoded in trees called "Low" using 3 bits of data.
- // Lengths 10-17 are encoded in trees called "Mid" using 3 bits of data.
- // Lengths 18-273 are encoded in a tree called "high" using 8 bits of data.
- //
- // The appropriate "Low" or "Mid" tree is selected based on the bottom 2
- // position bits (0-3) (in the LZMA standard, this is based on the "pb",
- // while the "High" tree is shared for all positions.
- //
- // Two arithmetic-coded bit trees, called "Choice" and "Choice2" tell us
- // the type of Length, so we can choose the right tree. {0, n} tells us
- // to use the Low trees, while {1, 0} tells us to use the Mid trees. Lastly
- // {1, 1} tells us to use the High tree.
- //
- Decoder.Len = LZMA_MIN_LENGTH;
- if (RcIsBitSet(&LenState->Choice))
- {
- if (RcIsBitSet(&LenState->Choice2))
- {
- probArray = LenState->High;
- limit = LZMA_MAX_HIGH_LENGTH;
- Decoder.Len += LZMA_MAX_LOW_LENGTH + LZMA_MAX_MID_LENGTH;
- }
- else
- {
- probArray = LenState->Mid[PosBit];
- limit = LZMA_MAX_MID_LENGTH;
- Decoder.Len += LZMA_MAX_LOW_LENGTH;
- }
- }
- else
- {
- probArray = LenState->Low[PosBit];
- limit = LZMA_MAX_LOW_LENGTH;
- }
- Decoder.Len += RcGetBitTree(probArray, limit);
- }
- void
- LzDecodeMatch (
- uint8_t PosBit
- )
- {
- uint16_t* probArray;
- uint8_t distSlot, distBits;
- //
- // Decode the length component of the "match" sequence. Then, since we're
- // about to decode a new distance, update our history by one level.
- //
- LzDecodeLen(&Decoder.u.BitModel.MatchLen, PosBit);
- Decoder.Rep3 = Decoder.Rep2;
- Decoder.Rep2 = Decoder.Rep1;
- Decoder.Rep1 = Decoder.Rep0;
- //
- // Read the first 6 bits, which make up the "distance slot"
- //
- probArray = LzGetDistSlot();
- distSlot = RcGetBitTree(probArray, LZMA_DISTANCE_SLOTS);
- if (distSlot < LZMA_FIRST_CONTEXT_DISTANCE_SLOT)
- {
- //
- // Slots 0-3 directly encode the distance as a literal number
- //
- Decoder.Rep0 = distSlot;
- }
- else
- {
- //
- // For slots 4-13, figure out how many "context encoded bits" are used
- // to encode this distance. The math works out such that slots 4-5 use
- // 1 bit, 6-7 use 2 bits, 8-9 use 3 bits, and so on and so forth until
- // slots 12-13 which use 5 bits.
- //
- // This gives us anywhere from 1-5 bits, plus the two upper bits which
- // can either be 0b10 or 0b11 (based on the bottom bit of the distance
- // slot). Thus, with the context encoded bits, we can represent lengths
- // anywhere from 0b10[0] to 0b11[11111] (i.e.: 4-127).
- //
- // For slots 14-63, we use "fixed 50% probability bits" which are also
- // called "direct bits". The formula below also tells us how many such
- // direct bits to use in this scenario. In other words, distBits can
- // either be the number of "context encoded bits" for slots 4-13, or it
- // can be the the number of "direct bits" for slots 14-63. This gives
- // us a range of of 2 to 26 bits, which are then used as middle bits.
- // Finally, the last 4 bits are called the "align" bits. The smallest
- // possible number we can encode is now going to be 0b10[00][0000] and
- // the highest is 0b11[1111111111111111111111111][1111], in other words
- // 128 to (2^31)-1.
- //
- distBits = (distSlot >> 1) - 1;
- Decoder.Rep0 = (0b10 | (distSlot & 1)) << distBits;
- //
- // Slots 4-13 have their own arithmetic-coded reverse bit trees. Slots
- // 14-63 encode the middle "direct bits" with fixed 50% probability and
- // the bottom 4 "align bits" with a shared arithmetic-coded reverse bit
- // tree.
- //
- if (distSlot < LZMA_FIRST_FIXED_DISTANCE_SLOT)
- {
- probArray = &Decoder.u.BitModel.Dist[Decoder.Rep0 - distSlot];
- }
- else
- {
- Decoder.Rep0 |= RcGetFixed(distBits - LZMA_DISTANCE_ALIGN_BITS) <<
- LZMA_DISTANCE_ALIGN_BITS;
- distBits = LZMA_DISTANCE_ALIGN_BITS;
- probArray = Decoder.u.BitModel.Align;
- }
- Decoder.Rep0 |= RcGetReverseBitTree(probArray, distBits);
- }
- //
- // Indicate that the last sequence was a "match"
- //
- LzSetMatch(&Decoder.Sequence);
- }
- void
- LzDecodeRepLen (
- uint8_t PosBit,
- bool IsLongRep
- )
- {
- //
- // Decode the length byte and indicate the last sequence was a "rep".
- // If this is a short rep, then the length is always hard-coded to 1.
- //
- if (IsLongRep)
- {
- LzDecodeLen(&Decoder.u.BitModel.RepLen, PosBit);
- LzSetLongRep(&Decoder.Sequence);
- }
- else
- {
- Decoder.Len = 1;
- LzSetShortRep(&Decoder.Sequence);
- }
- }
- void
- LzDecodeRep0(
- uint8_t PosBit
- )
- {
- uint8_t bit;
- //
- // This could be a "short rep" with a length of 1, or a "long rep0" with
- // a length that we have to decode. The next bit tells us this, using the
- // arithmetic-coded bit trees stored in "Rep0Long", with 1 tree for each
- // position bit (0-3).
- //
- bit = RcIsBitSet(&Decoder.u.BitModel.Rep0Long[Decoder.Sequence][PosBit]);
- LzDecodeRepLen(PosBit, bit);
- }
- void
- LzDecodeLongRep (
- uint8_t PosBit
- )
- {
- uint32_t newRep;
- //
- // Read the next 2 bits to figure out which of the recently used distances
- // we should use for this match. The following three states are possible :
- //
- // {0,n} - "Long rep1", where the length is stored in an arithmetic-coded
- // bit tree, and the distance is the 2nd most recently used distance (Rep1)
- //
- // {1,0} - "Long rep2", where the length is stored in an arithmetic-coded
- // bit tree, and the distance is the 3rd most recently used distance (Rep2)
- //
- // {1,1} - "Long rep3", where the length is stored in an arithmetic-coded
- // bit tree, and the distance is the 4th most recently used distance (Rep3)
- //
- // Once we have the right one, we must slide down each previously recently
- // used distance, so that the distance we're now using (Rep1, Rep2 or Rep3)
- // becomes "Rep0" again.
- //
- if (RcIsBitSet(&Decoder.u.BitModel.Rep1[Decoder.Sequence]))
- {
- if (RcIsBitSet(&Decoder.u.BitModel.Rep2[Decoder.Sequence]))
- {
- newRep = Decoder.Rep3;
- Decoder.Rep3 = Decoder.Rep2;
- }
- else
- {
- newRep = Decoder.Rep2;
- }
- Decoder.Rep2 = Decoder.Rep1;
- }
- else
- {
- newRep = Decoder.Rep1;
- }
- Decoder.Rep1 = Decoder.Rep0;
- Decoder.Rep0 = newRep;
- LzDecodeRepLen(PosBit, true);
- }
- void
- LzDecodeRep (
- uint8_t PosBit
- )
- {
- //
- // We know this is an LZ77 distance-length pair where the distance is based
- // on a history of up to 4 previously used distance (Rep0-3). To know which
- // distance to use, the following 5 bit positions are possible (keeping in
- // mind that we've already decoded the first 2 bits {1,1} in LzDecode which
- // got us here in the first place):
- //
- // {0,0} - "Short rep", where the length is always 1 and distance is always
- // the most recently used distance (Rep0).
- //
- // {0,1} - "Long rep0", where the length is stored in an arithmetic-coded
- // bit tree, and the distance is the most recently used distance (Rep0).
- //
- // Because both of these possibilities just use Rep0, LzDecodeRep0 handles
- // these two cases. Otherwise, we use LzDecodeLongRep to read up to two
- // additional bits to figure out which recently used distance (1, 2, or 3)
- // to use.
- //
- if (RcIsBitSet(&Decoder.u.BitModel.Rep0[Decoder.Sequence]))
- {
- LzDecodeLongRep(PosBit);
- }
- else
- {
- LzDecodeRep0(PosBit);
- }
- }
- bool
- LzDecode (
- void
- )
- {
- uint32_t position;
- uint8_t posBit;
- //
- // Get the current position in dictionary, making sure we have input bytes.
- // Once we run out of bytes, normalize the last arithmetic coded byte and
- // ensure there's no pending lengths that we haven't yet repeated.
- //
- while (DtCanWrite(&position) && RcCanRead())
- {
- //
- // An LZMA packet begins here, which can have 3 possible initial bit
- // sequences that correspond to the type of encoding that was chosen
- // to represent the next stream of symbols.
- //
- // {0, n} represents a "literal", which LzDecodeLiteral decodes.
- // Literals are a single byte encoded with arithmetic-coded bit trees
- //
- // {1, 0} represents a "match", which LzDecodeMatch decodes.
- // Matches are typical LZ77 sequences with explicit length and distance
- //
- // {1, 1} represents a "rep", which LzDecodeRep decodes.
- // Reps are LZ77 sequences where the distance is encoded as a reference
- // to a previously used distance (up to 4 -- called "Rep0-3").
- //
- // Once we've decoded either the "match" or the "rep', we now have the
- // distance in "Rep0" (the most recently used distance) and the length
- // in "Len", so we will use DtRepeatSymbol to go back in the dictionary
- // buffer "Rep0" bytes and repeat that character "Len" times.
- //
- posBit = position & (LZMA_POSITION_COUNT - 1);
- if (RcIsBitSet(&Decoder.u.BitModel.Match[Decoder.Sequence][posBit]))
- {
- if (RcIsBitSet(&Decoder.u.BitModel.Rep[Decoder.Sequence]))
- {
- LzDecodeRep(posBit);
- }
- else
- {
- LzDecodeMatch(posBit);
- }
- if (!DtRepeatSymbol(Decoder.Len, Decoder.Rep0 + 1))
- {
- return false;
- }
- Decoder.Len = 0;
- }
- else
- {
- LzDecodeLiteral();
- }
- }
- RcNormalize();
- return (Decoder.Len == 0);
- }
- void
- LzResetState (
- void
- )
- {
- //
- // Initialize decoder to default state in case we're called more than once.
- // The LZMA "Bit Model" is an adaptive arithmetic-coded probability-based
- // bit tree which encodes either a "0" or a "1".
- //
- Decoder.Sequence = LzmaLitLitLitState;
- Decoder.Rep0 = Decoder.Rep1 = Decoder.Rep2 = Decoder.Rep3 = 0;
- // static_assert((LZMA_BIT_MODEL_SLOTS * 2) == sizeof(Decoder.u.BitModel),
- // "Invalid size");
- for (int i = 0; i < LZMA_BIT_MODEL_SLOTS; i++)
- {
- RcSetDefaultProbability(&Decoder.u.RawProbabilities[i]);
- }
- }
- bool
- LzInitialize (
- uint8_t Properties
- )
- {
- if (Properties != k_LzSupportedProperties)
- {
- return false;
- }
- LzResetState();
- return true;
- }
|