lzmadec.c 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627
  1. /*++
  2. Copyright (c) Alex Ionescu. All rights reserved.
  3. Module Name:
  4. lzmadec.c
  5. Abstract:
  6. This module implements the LZMA Decoding Logic responsible for decoding the
  7. three possible types of LZMA "packets": matches, repetitions (short \& long)
  8. and literals. The probability model for each type of packet is also stored
  9. in this file, along with the management of the previously seen packet types
  10. (which is tracked as the "sequence").
  11. Author:
  12. Alex Ionescu (@aionescu) 15-Apr-2020 - Initial version
  13. Environment:
  14. Windows & Linux, user mode and kernel mode.
  15. --*/
  16. #include "minlzlib.h"
  17. #include "lzmadec.h"
  18. //
  19. // Probability Bit Model for Lenghts in Rep and in Match sequences
  20. //
  21. typedef struct _LENGTH_DECODER_STATE
  22. {
  23. //
  24. // Bit Model for the choosing the type of length encoding
  25. //
  26. uint16_t Choice;
  27. uint16_t Choice2;
  28. //
  29. // Bit Model for each of the length encodings
  30. //
  31. uint16_t Low[LZMA_POSITION_COUNT][LZMA_MAX_LOW_LENGTH];
  32. uint16_t Mid[LZMA_POSITION_COUNT][LZMA_MAX_MID_LENGTH];
  33. uint16_t High[LZMA_MAX_HIGH_LENGTH];
  34. } LENGTH_DECODER_STATE, * PLENGTH_DECODER_STATE;
  35. //
  36. // State used for LZMA decoding
  37. //
  38. typedef struct _DECODER_STATE
  39. {
  40. //
  41. // Current type of sequence last decoded
  42. //
  43. LZMA_SEQUENCE_STATE Sequence;
  44. //
  45. // History of last 4 decoded distances
  46. //
  47. uint32_t Rep0;
  48. uint32_t Rep1;
  49. uint32_t Rep2;
  50. uint32_t Rep3;
  51. //
  52. // Pending length to repeat from dictionary
  53. //
  54. uint32_t Len;
  55. //
  56. // Probability Bit Models for all sequence types
  57. //
  58. union
  59. {
  60. struct
  61. {
  62. //
  63. // Literal model
  64. //
  65. uint16_t Literal[LZMA_LITERAL_CODERS][LZMA_LC_MODEL_SIZE];
  66. //
  67. // Last-used-distance based models
  68. //
  69. uint16_t Rep[LzmaMaxState];
  70. uint16_t Rep0[LzmaMaxState];
  71. uint16_t Rep0Long[LzmaMaxState][LZMA_POSITION_COUNT];
  72. uint16_t Rep1[LzmaMaxState];
  73. uint16_t Rep2[LzmaMaxState];
  74. LENGTH_DECODER_STATE RepLen;
  75. //
  76. // Explicit distance match based models
  77. //
  78. uint16_t Match[LzmaMaxState][LZMA_POSITION_COUNT];
  79. uint16_t DistSlot[LZMA_FIRST_CONTEXT_DISTANCE_SLOT][LZMA_DISTANCE_SLOTS];
  80. uint16_t Dist[(1 << 7) - LZMA_FIRST_FIXED_DISTANCE_SLOT];
  81. uint16_t Align[LZMA_DISTANCE_ALIGN_SLOTS];
  82. LENGTH_DECODER_STATE MatchLen;
  83. } BitModel;
  84. uint16_t RawProbabilities[LZMA_BIT_MODEL_SLOTS];
  85. } u;
  86. } DECODER_STATE, *PDECODER_STATE;
  87. DECODER_STATE Decoder;
  88. //
  89. // LZMA decoding uses 3 "properties" which determine how the probability
  90. // bit model will be laid out. These store the number of bits that are used
  91. // to pick the correct Literal Coder ("lc"), the number of Position bits to
  92. // select the Literal coder ("lp"), and the number of Position Bits used to
  93. // select various lengths ("pb"). In LZMA2, these properties are encoded in
  94. // a single byte with the formula: ((pb * 45) + lp * 9) + lc).
  95. //
  96. // We only support the default {lc = 3, lp = 0, pb = 2} properties, which
  97. // are what the main encoders out there use. This means that a total of 2
  98. // bits will be used for arithmetic-coded bit trees that are dependent on
  99. // the current position, and that a total of 3 bits will be used when we
  100. // pick the arithmetic-coded bit tree used for literal coding. The 0 means
  101. // this selection will _not_ be dependent on the position in the buffer.
  102. //
  103. const uint8_t k_LzSupportedProperties =
  104. (LZMA_PB * 45) + (LZMA_LP * 9) + (LZMA_LC);
  105. void
  106. LzSetLiteral (
  107. PLZMA_SEQUENCE_STATE State
  108. )
  109. {
  110. if (*State <= LzmaLitShortrepLitLitState)
  111. {
  112. //
  113. // States 0-3 represent packets with at least 2 back-to-back literals,
  114. // so another literal now takes us to state 0 (3 back-to-back literals)
  115. //
  116. *State = LzmaLitLitLitState;
  117. }
  118. else if (*State <= LzmaLitShortrepState)
  119. {
  120. //
  121. // States 4-6 represent packets with a literal at the end, so seeing
  122. // another literal now takes us to 2 back-to-back literals, which are
  123. // state packets 1-3.
  124. //
  125. // States 7-9 represent packets with a literal at the start, followed
  126. // by a match/rep/shortrep. Seeing another literal now drops this first
  127. // literal and takes us to having a literal at the end, which are state
  128. // packets 4-6 that we just described in the paragraph above.
  129. //
  130. *State = (LZMA_SEQUENCE_STATE)(*State - 3);
  131. }
  132. else
  133. {
  134. //
  135. // Finally, state 10 and 11 represent cases without a single literal in
  136. // the last 2 sequence packets, so seeing a literal now takes us to a
  137. // "literal at the end" state, either following a match or a rep.
  138. //
  139. *State = (LZMA_SEQUENCE_STATE)(*State - 6);
  140. }
  141. }
  142. bool
  143. LzIsLiteral (
  144. LZMA_SEQUENCE_STATE State
  145. )
  146. {
  147. //
  148. // States 0-6 describe literal packet sequences
  149. //
  150. return State < LzmaMaxLitState;
  151. }
  152. void
  153. LzSetMatch (
  154. PLZMA_SEQUENCE_STATE State
  155. )
  156. {
  157. //
  158. // Move to the appropriate "match" state based on current literal state
  159. //
  160. *State = LzIsLiteral(*State) ? LzmaLitMatchState : LzmaNonlitMatchState;
  161. }
  162. void
  163. LzSetLongRep (
  164. PLZMA_SEQUENCE_STATE State
  165. )
  166. {
  167. //
  168. // Move to the appropriate "long rep" state based on current literal state
  169. //
  170. *State = LzIsLiteral(*State) ? LzmaLitRepState : LzmaNonlitRepState;
  171. }
  172. void
  173. LzSetShortRep (
  174. PLZMA_SEQUENCE_STATE State
  175. )
  176. {
  177. //
  178. // Move to the appropriate "short rep" state based on current literal state
  179. //
  180. *State = LzIsLiteral(*State) ? LzmaLitShortrepState : LzmaNonlitRepState;
  181. }
  182. uint16_t*
  183. LzGetLiteralSlot (
  184. void
  185. )
  186. {
  187. uint8_t symbol;
  188. //
  189. // To pick the correct literal coder arithmetic-coded bit tree, LZMA uses
  190. // the "lc" parameter to choose the number of high bits from the previous
  191. // symbol (in the normal case, 3). It then combines that with the "lp"
  192. // parameter to choose the number of low bits from the current position in
  193. // the dictionary. However, since "lp" is normally 0, we can omit this.
  194. //
  195. symbol = DtGetSymbol(1);
  196. return Decoder.u.BitModel.Literal[symbol >> (8 - LZMA_LC)];
  197. }
  198. uint16_t*
  199. LzGetDistSlot (
  200. void
  201. )
  202. {
  203. uint8_t slotIndex;
  204. //
  205. // There are 4 different arithmetic-coded bit trees which are used to pick
  206. // the correct "distance slot" when doing match distance decoding. Each of
  207. // them is used based on the length of the symbol that is being repeated.
  208. // For lengths of 2, 3, 4 bytes, a dedicated set of distance slots is used.
  209. // For lengths of 5 bytes or above, a shared set of distance slots is used.
  210. //
  211. if (Decoder.Len < (LZMA_FIRST_CONTEXT_DISTANCE_SLOT + LZMA_MIN_LENGTH))
  212. {
  213. slotIndex = (uint8_t)(Decoder.Len - LZMA_MIN_LENGTH);
  214. }
  215. else
  216. {
  217. slotIndex = LZMA_FIRST_CONTEXT_DISTANCE_SLOT - 1;
  218. }
  219. return Decoder.u.BitModel.DistSlot[slotIndex];
  220. }
  221. void
  222. LzDecodeLiteral (
  223. void
  224. )
  225. {
  226. uint16_t* probArray;
  227. uint8_t symbol, matchByte;
  228. //
  229. // First, choose the correct arithmetic-coded bit tree (which is based on
  230. // the last symbol we just decoded), then see if we last decoded a literal.
  231. //
  232. // If so, simply get the symbol from the bit tree as normal. However, if
  233. // we didn't last see a literal, we need to read the "match byte" that is
  234. // "n" bytes away from the last decoded match. We previously stored this in
  235. // rep0.
  236. //
  237. // Based on this match byte, we'll then use 2 other potential bit trees,
  238. // see LzDecodeMatched for more information.
  239. //
  240. probArray = LzGetLiteralSlot();
  241. if (LzIsLiteral(Decoder.Sequence))
  242. {
  243. symbol = RcGetBitTree(probArray, (1 << 8));
  244. }
  245. else
  246. {
  247. matchByte = DtGetSymbol(Decoder.Rep0 + 1);
  248. symbol = RcDecodeMatchedBitTree(probArray, matchByte);
  249. }
  250. //
  251. // Write the symbol and indicate that the last sequence was a literal
  252. //
  253. DtPutSymbol(symbol);
  254. LzSetLiteral(&Decoder.Sequence);
  255. }
  256. void
  257. LzDecodeLen (
  258. PLENGTH_DECODER_STATE LenState,
  259. uint8_t PosBit
  260. )
  261. {
  262. uint16_t* probArray;
  263. uint16_t limit;
  264. //
  265. // Lenghts of 2 and higher are encoded in 3 possible types of arithmetic-
  266. // coded bit trees, depending on the size of the length.
  267. //
  268. // Lengths 2-9 are encoded in trees called "Low" using 3 bits of data.
  269. // Lengths 10-17 are encoded in trees called "Mid" using 3 bits of data.
  270. // Lengths 18-273 are encoded in a tree called "high" using 8 bits of data.
  271. //
  272. // The appropriate "Low" or "Mid" tree is selected based on the bottom 2
  273. // position bits (0-3) (in the LZMA standard, this is based on the "pb",
  274. // while the "High" tree is shared for all positions.
  275. //
  276. // Two arithmetic-coded bit trees, called "Choice" and "Choice2" tell us
  277. // the type of Length, so we can choose the right tree. {0, n} tells us
  278. // to use the Low trees, while {1, 0} tells us to use the Mid trees. Lastly
  279. // {1, 1} tells us to use the High tree.
  280. //
  281. Decoder.Len = LZMA_MIN_LENGTH;
  282. if (RcIsBitSet(&LenState->Choice))
  283. {
  284. if (RcIsBitSet(&LenState->Choice2))
  285. {
  286. probArray = LenState->High;
  287. limit = LZMA_MAX_HIGH_LENGTH;
  288. Decoder.Len += LZMA_MAX_LOW_LENGTH + LZMA_MAX_MID_LENGTH;
  289. }
  290. else
  291. {
  292. probArray = LenState->Mid[PosBit];
  293. limit = LZMA_MAX_MID_LENGTH;
  294. Decoder.Len += LZMA_MAX_LOW_LENGTH;
  295. }
  296. }
  297. else
  298. {
  299. probArray = LenState->Low[PosBit];
  300. limit = LZMA_MAX_LOW_LENGTH;
  301. }
  302. Decoder.Len += RcGetBitTree(probArray, limit);
  303. }
  304. void
  305. LzDecodeMatch (
  306. uint8_t PosBit
  307. )
  308. {
  309. uint16_t* probArray;
  310. uint8_t distSlot, distBits;
  311. //
  312. // Decode the length component of the "match" sequence. Then, since we're
  313. // about to decode a new distance, update our history by one level.
  314. //
  315. LzDecodeLen(&Decoder.u.BitModel.MatchLen, PosBit);
  316. Decoder.Rep3 = Decoder.Rep2;
  317. Decoder.Rep2 = Decoder.Rep1;
  318. Decoder.Rep1 = Decoder.Rep0;
  319. //
  320. // Read the first 6 bits, which make up the "distance slot"
  321. //
  322. probArray = LzGetDistSlot();
  323. distSlot = RcGetBitTree(probArray, LZMA_DISTANCE_SLOTS);
  324. if (distSlot < LZMA_FIRST_CONTEXT_DISTANCE_SLOT)
  325. {
  326. //
  327. // Slots 0-3 directly encode the distance as a literal number
  328. //
  329. Decoder.Rep0 = distSlot;
  330. }
  331. else
  332. {
  333. //
  334. // For slots 4-13, figure out how many "context encoded bits" are used
  335. // to encode this distance. The math works out such that slots 4-5 use
  336. // 1 bit, 6-7 use 2 bits, 8-9 use 3 bits, and so on and so forth until
  337. // slots 12-13 which use 5 bits.
  338. //
  339. // This gives us anywhere from 1-5 bits, plus the two upper bits which
  340. // can either be 0b10 or 0b11 (based on the bottom bit of the distance
  341. // slot). Thus, with the context encoded bits, we can represent lengths
  342. // anywhere from 0b10[0] to 0b11[11111] (i.e.: 4-127).
  343. //
  344. // For slots 14-63, we use "fixed 50% probability bits" which are also
  345. // called "direct bits". The formula below also tells us how many such
  346. // direct bits to use in this scenario. In other words, distBits can
  347. // either be the number of "context encoded bits" for slots 4-13, or it
  348. // can be the the number of "direct bits" for slots 14-63. This gives
  349. // us a range of of 2 to 26 bits, which are then used as middle bits.
  350. // Finally, the last 4 bits are called the "align" bits. The smallest
  351. // possible number we can encode is now going to be 0b10[00][0000] and
  352. // the highest is 0b11[1111111111111111111111111][1111], in other words
  353. // 128 to (2^31)-1.
  354. //
  355. distBits = (distSlot >> 1) - 1;
  356. Decoder.Rep0 = (0b10 | (distSlot & 1)) << distBits;
  357. //
  358. // Slots 4-13 have their own arithmetic-coded reverse bit trees. Slots
  359. // 14-63 encode the middle "direct bits" with fixed 50% probability and
  360. // the bottom 4 "align bits" with a shared arithmetic-coded reverse bit
  361. // tree.
  362. //
  363. if (distSlot < LZMA_FIRST_FIXED_DISTANCE_SLOT)
  364. {
  365. probArray = &Decoder.u.BitModel.Dist[Decoder.Rep0 - distSlot];
  366. }
  367. else
  368. {
  369. Decoder.Rep0 |= RcGetFixed(distBits - LZMA_DISTANCE_ALIGN_BITS) <<
  370. LZMA_DISTANCE_ALIGN_BITS;
  371. distBits = LZMA_DISTANCE_ALIGN_BITS;
  372. probArray = Decoder.u.BitModel.Align;
  373. }
  374. Decoder.Rep0 |= RcGetReverseBitTree(probArray, distBits);
  375. }
  376. //
  377. // Indicate that the last sequence was a "match"
  378. //
  379. LzSetMatch(&Decoder.Sequence);
  380. }
  381. void
  382. LzDecodeRepLen (
  383. uint8_t PosBit,
  384. bool IsLongRep
  385. )
  386. {
  387. //
  388. // Decode the length byte and indicate the last sequence was a "rep".
  389. // If this is a short rep, then the length is always hard-coded to 1.
  390. //
  391. if (IsLongRep)
  392. {
  393. LzDecodeLen(&Decoder.u.BitModel.RepLen, PosBit);
  394. LzSetLongRep(&Decoder.Sequence);
  395. }
  396. else
  397. {
  398. Decoder.Len = 1;
  399. LzSetShortRep(&Decoder.Sequence);
  400. }
  401. }
  402. void
  403. LzDecodeRep0(
  404. uint8_t PosBit
  405. )
  406. {
  407. uint8_t bit;
  408. //
  409. // This could be a "short rep" with a length of 1, or a "long rep0" with
  410. // a length that we have to decode. The next bit tells us this, using the
  411. // arithmetic-coded bit trees stored in "Rep0Long", with 1 tree for each
  412. // position bit (0-3).
  413. //
  414. bit = RcIsBitSet(&Decoder.u.BitModel.Rep0Long[Decoder.Sequence][PosBit]);
  415. LzDecodeRepLen(PosBit, bit);
  416. }
  417. void
  418. LzDecodeLongRep (
  419. uint8_t PosBit
  420. )
  421. {
  422. uint32_t newRep;
  423. //
  424. // Read the next 2 bits to figure out which of the recently used distances
  425. // we should use for this match. The following three states are possible :
  426. //
  427. // {0,n} - "Long rep1", where the length is stored in an arithmetic-coded
  428. // bit tree, and the distance is the 2nd most recently used distance (Rep1)
  429. //
  430. // {1,0} - "Long rep2", where the length is stored in an arithmetic-coded
  431. // bit tree, and the distance is the 3rd most recently used distance (Rep2)
  432. //
  433. // {1,1} - "Long rep3", where the length is stored in an arithmetic-coded
  434. // bit tree, and the distance is the 4th most recently used distance (Rep3)
  435. //
  436. // Once we have the right one, we must slide down each previously recently
  437. // used distance, so that the distance we're now using (Rep1, Rep2 or Rep3)
  438. // becomes "Rep0" again.
  439. //
  440. if (RcIsBitSet(&Decoder.u.BitModel.Rep1[Decoder.Sequence]))
  441. {
  442. if (RcIsBitSet(&Decoder.u.BitModel.Rep2[Decoder.Sequence]))
  443. {
  444. newRep = Decoder.Rep3;
  445. Decoder.Rep3 = Decoder.Rep2;
  446. }
  447. else
  448. {
  449. newRep = Decoder.Rep2;
  450. }
  451. Decoder.Rep2 = Decoder.Rep1;
  452. }
  453. else
  454. {
  455. newRep = Decoder.Rep1;
  456. }
  457. Decoder.Rep1 = Decoder.Rep0;
  458. Decoder.Rep0 = newRep;
  459. LzDecodeRepLen(PosBit, true);
  460. }
  461. void
  462. LzDecodeRep (
  463. uint8_t PosBit
  464. )
  465. {
  466. //
  467. // We know this is an LZ77 distance-length pair where the distance is based
  468. // on a history of up to 4 previously used distance (Rep0-3). To know which
  469. // distance to use, the following 5 bit positions are possible (keeping in
  470. // mind that we've already decoded the first 2 bits {1,1} in LzDecode which
  471. // got us here in the first place):
  472. //
  473. // {0,0} - "Short rep", where the length is always 1 and distance is always
  474. // the most recently used distance (Rep0).
  475. //
  476. // {0,1} - "Long rep0", where the length is stored in an arithmetic-coded
  477. // bit tree, and the distance is the most recently used distance (Rep0).
  478. //
  479. // Because both of these possibilities just use Rep0, LzDecodeRep0 handles
  480. // these two cases. Otherwise, we use LzDecodeLongRep to read up to two
  481. // additional bits to figure out which recently used distance (1, 2, or 3)
  482. // to use.
  483. //
  484. if (RcIsBitSet(&Decoder.u.BitModel.Rep0[Decoder.Sequence]))
  485. {
  486. LzDecodeLongRep(PosBit);
  487. }
  488. else
  489. {
  490. LzDecodeRep0(PosBit);
  491. }
  492. }
  493. bool
  494. LzDecode (
  495. void
  496. )
  497. {
  498. uint32_t position;
  499. uint8_t posBit;
  500. //
  501. // Get the current position in dictionary, making sure we have input bytes.
  502. // Once we run out of bytes, normalize the last arithmetic coded byte and
  503. // ensure there's no pending lengths that we haven't yet repeated.
  504. //
  505. while (DtCanWrite(&position) && RcCanRead())
  506. {
  507. //
  508. // An LZMA packet begins here, which can have 3 possible initial bit
  509. // sequences that correspond to the type of encoding that was chosen
  510. // to represent the next stream of symbols.
  511. //
  512. // {0, n} represents a "literal", which LzDecodeLiteral decodes.
  513. // Literals are a single byte encoded with arithmetic-coded bit trees
  514. //
  515. // {1, 0} represents a "match", which LzDecodeMatch decodes.
  516. // Matches are typical LZ77 sequences with explicit length and distance
  517. //
  518. // {1, 1} represents a "rep", which LzDecodeRep decodes.
  519. // Reps are LZ77 sequences where the distance is encoded as a reference
  520. // to a previously used distance (up to 4 -- called "Rep0-3").
  521. //
  522. // Once we've decoded either the "match" or the "rep', we now have the
  523. // distance in "Rep0" (the most recently used distance) and the length
  524. // in "Len", so we will use DtRepeatSymbol to go back in the dictionary
  525. // buffer "Rep0" bytes and repeat that character "Len" times.
  526. //
  527. posBit = position & (LZMA_POSITION_COUNT - 1);
  528. if (RcIsBitSet(&Decoder.u.BitModel.Match[Decoder.Sequence][posBit]))
  529. {
  530. if (RcIsBitSet(&Decoder.u.BitModel.Rep[Decoder.Sequence]))
  531. {
  532. LzDecodeRep(posBit);
  533. }
  534. else
  535. {
  536. LzDecodeMatch(posBit);
  537. }
  538. if (!DtRepeatSymbol(Decoder.Len, Decoder.Rep0 + 1))
  539. {
  540. return false;
  541. }
  542. Decoder.Len = 0;
  543. }
  544. else
  545. {
  546. LzDecodeLiteral();
  547. }
  548. }
  549. RcNormalize();
  550. return (Decoder.Len == 0);
  551. }
  552. void
  553. LzResetState (
  554. void
  555. )
  556. {
  557. //
  558. // Initialize decoder to default state in case we're called more than once.
  559. // The LZMA "Bit Model" is an adaptive arithmetic-coded probability-based
  560. // bit tree which encodes either a "0" or a "1".
  561. //
  562. Decoder.Sequence = LzmaLitLitLitState;
  563. Decoder.Rep0 = Decoder.Rep1 = Decoder.Rep2 = Decoder.Rep3 = 0;
  564. // static_assert((LZMA_BIT_MODEL_SLOTS * 2) == sizeof(Decoder.u.BitModel),
  565. // "Invalid size");
  566. for (int i = 0; i < LZMA_BIT_MODEL_SLOTS; i++)
  567. {
  568. RcSetDefaultProbability(&Decoder.u.RawProbabilities[i]);
  569. }
  570. }
  571. bool
  572. LzInitialize (
  573. uint8_t Properties
  574. )
  575. {
  576. if (Properties != k_LzSupportedProperties)
  577. {
  578. return false;
  579. }
  580. LzResetState();
  581. return true;
  582. }