Source code of Windows XP (NT5)
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

235 lines
6.5 KiB

  1. /*
  2. * encdefs.h
  3. *
  4. * Encoder #define's and structure definitions.
  5. */
  6. /*
  7. * NOTES:
  8. *
  9. * To maximise compression one can set both BREAK_LENGTH
  10. * and FAST_DECISION_THRESHOLD to 250, define
  11. * INSERT_NEAR_LONG_MATCHES, and crank up EXTRA_SIZE to
  12. * a larger value (don't get too large, otherwise we
  13. * might overflow our ushort cumbits[]), but the improvement
  14. * is really marginal; e.g. 3600 bytes on winword.exe
  15. * (3.9 MB compressed). It really hurts performance too.
  16. */
  17. /*
  18. * See optenc.c
  19. *
  20. * EXTRA_SIZE is the amount of extra data we allocate in addition
  21. * to the window, and LOOK is the amount of data the optimal
  22. * parser will look ahead. LOOK is dependent on EXTRA_SIZE.
  23. *
  24. * Changing EXTRA_SIZE to 8K doesn't really do anything for
  25. * compression. 4K is a fairly optimal value.
  26. *
  27. */
  28. #define EXTRA_SIZE 16384
  29. #define LOOK (EXTRA_SIZE-MAX_MATCH-2)
  30. /*
  31. * Number of search trees used (for storing root nodes)
  32. */
  33. #define NUM_SEARCH_TREES 65536
  34. /*
  35. * Chunk size required by FCI
  36. */
  37. #define CHUNK_SIZE 32768
  38. /*
  39. * The maximum amount of data we will allow in our output buffer before
  40. * calling lzx_output_callback() to get rid of it. Since we do this
  41. * for every 32K of input data, the output buffer only has to be able
  42. * to contain 32K + some spillover, which won't be much, because we
  43. * output uncompressed blocks if we determine a block is going to be
  44. * too large.
  45. */
  46. #define OUTPUT_BUFFER_SIZE (CHUNK_SIZE+MAX_GROWTH)
  47. /*
  48. * Maximum allowable number of block splits per 32K of uncompressed
  49. * data; if increased, then MAX_GROWTH will have to be increased also.
  50. */
  51. #define MAX_BLOCK_SPLITS 4
  52. /*
  53. * Max growth is calculated as follows:
  54. *
  55. * TREE AND BLOCK INFO
  56. * ===================
  57. *
  58. * The very first time the encoder is run, it outputs a 32 bit
  59. * file translation size.
  60. *
  61. * 3 bits to output block type
  62. * 24 bits for block size in uncompressed bytes.
  63. *
  64. * Max size of a tree of n elements is 20*4 + 5*n bits
  65. *
  66. * There is a main tree of max 700 elements which is really encoded
  67. * as two separate trees of 256 and 444(max). There is also a
  68. * secondary length tree of 249 elements.
  69. *
  70. * That is 1360 bits, plus 2300 bits, plus 1325 bits.
  71. *
  72. * There may also be an aligned offset tree, which is 24 bits.
  73. *
  74. * Flushing output bit buffer; max 16 bits.
  75. *
  76. * Grand total: 5084 bits/block.
  77. *
  78. *
  79. * PARSER INFO
  80. * ===========
  81. *
  82. * Parser worst case scenario is with 2 MB buffer (50 position slots),
  83. * all matches of length 2, distributed over slots 32 and 33 (since
  84. * matches of length 2 further away than 128K are prohibited). These
  85. * slots have 15 verbatim bits. Maximum size per code is then
  86. * 2 bits to say which slot (taking into account that there will be
  87. * at least another code in the tree) plus 15 verbatim bits, for a
  88. * total of 17 bits. Max growth on 32K of input data is therefore
  89. * 1/16 * 32K, or 2K bytes.
  90. *
  91. * Alternatively, if there is only one match and everything else
  92. * is a character, then 255 characters will be length 8, and one
  93. * character and the match will be length 9. Assume the true
  94. * frequency of the demoted character is almost a 1 in 2^7
  95. * probability (it got remoted from a 2^8, but it was fairly
  96. * close to being 2^7). If there are 32768/256, or 128, occurrences
  97. * of each character, but, say, almost 256 for the demoted character,
  98. * then the demoted character will expand the data by less than
  99. * 1 bit * 256, or 256 bits. The match will take a little to
  100. * output, but max growth for "all characters" is about 256 bits.
  101. *
  102. *
  103. * END RESULT
  104. * ==========
  105. *
  106. * The maximum number of blocks which can be output is limited to
  107. * 4 per 32K of uncompressed data.
  108. *
  109. * Therefore, max growth is 4*5084 bits, plus 2K bytes, or 4590
  110. * bytes.
  111. */
  112. #define MAX_GROWTH 6144
  113. /*
  114. * Don't allow match length 2's which are further away than this
  115. * (see above)
  116. */
  117. #define MAX_LENGTH_TWO_OFFSET (128*1024)
  118. /*
  119. * When we find a match which is at least this long, prematurely
  120. * exit the binary search.
  121. *
  122. * This avoids us inserting huge match lengths of 257 zeroes, for
  123. * example. Compression will improve very *very* marginally by
  124. * increasing this figure, but it will seriously impact
  125. * performance.
  126. *
  127. * Don't make this number >= (MAX_MATCH-2); see bsearch.c.
  128. */
  129. #define BREAK_LENGTH 100
  130. /*
  131. * If this option is defined, the parser will insert all bytes of
  132. * matches with lengths >= 16 with a distance of 1; this is a bad
  133. * idea, since matches like that are generally zeroes, which we
  134. * want to avoid inserting into the search tree.
  135. */
  136. //#define INSERT_NEAR_LONG_MATCHES
  137. /*
  138. * If the optimal parser finds a match which is this long or
  139. * longer, it will take it automatically. The compression
  140. * penalty is basically zero, and it helps performance.
  141. */
  142. #define FAST_DECISION_THRESHOLD 100
  143. /*
  144. * Every TREE_CREATE_INTERVAL items, recreate the trees from
  145. * the literals we've encountered so far, to update our cost
  146. * estimations.
  147. *
  148. * 4K seems pretty optimal.
  149. */
  150. #define TREE_CREATE_INTERVAL 4096
  151. /*
  152. * When we're forced to break in our parsing (we exceed
  153. * our span), don't output a match length 2 if it is
  154. * further away than this.
  155. *
  156. * Could make this a variable rather than a constant
  157. *
  158. * On a bad binary file, two chars = 18 bits
  159. * On a good text file, two chars = 12 bits
  160. *
  161. * But match length two's are very uncommon on text files.
  162. */
  163. #define BREAK_MAX_LENGTH_TWO_OFFSET 2048
  164. /*
  165. * When MatchPos >= MPSLOT3_CUTOFF, extra_bits[MP_SLOT(MatchPos)] >= 3
  166. *
  167. * matchpos: 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
  168. * extrabits: 0,0,0,0,1,1,1,1,2,2, 2, 2, 2, 2, 2, 2, 3, ...
  169. *
  170. * Used for aligned offset blocks and statistics.
  171. */
  172. #define MPSLOT3_CUTOFF 16
  173. /*
  174. * Number of elements in the main tree
  175. */
  176. #define MAIN_TREE_ELEMENTS (NUM_CHARS+(((long) context->enc_num_position_slots) << NL_SHIFT))
  177. /*
  178. * Max number of literals to hold.
  179. *
  180. * Memory required is MAX_LITERAL_ITEMS for enc_LitData[] array,
  181. * plus MAX_LITERAL_ITEMS/8 for enc_ItemType[] array.
  182. *
  183. * Must not exceed 64K, since that will cause our ushort
  184. * frequencies to overflow.
  185. */
  186. #define MAX_LITERAL_ITEMS 65536
  187. /*
  188. * Max number of distances to hold
  189. *
  190. * Memory required is MAX_DIST_ITEMS*4 for enc_DistData[] array
  191. *
  192. * MAX_DIST_ITEMS should never be greater than MAX_LITERAL_ITEMS,
  193. * since that just wastes space.
  194. *
  195. * However, it's extremely unlikely that one will get 65536 match
  196. * length 2's! In any case, the literal and distance buffers
  197. * are checked independently, and a block is output if either
  198. * overflows.
  199. *
  200. * Bitmaps are highly redundant, though; lots of matches.
  201. */
  202. #define MAX_DIST_ITEMS 32768