Source code of Windows XP (NT5)
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

239 lines
6.7 KiB

  1. /*
  2. * encdefs.h
  3. *
  4. * Encoder #define's and structure definitions.
  5. */
  6. /*
  7. * NOTES:
  8. *
  9. * To maximise compression one can set both BREAK_LENGTH
  10. * and FAST_DECISION_THRESHOLD to 250, define
  11. * INSERT_NEAR_LONG_MATCHES, and crank up EXTRA_SIZE to
  12. * a larger value (don't get too large, otherwise we
  13. * might overflow our ushort cumbits[]), but the improvement
  14. * is really marginal; e.g. 3600 bytes on winword.exe
  15. * (3.9 MB compressed). It really hurts performance too.
  16. */
  17. /*
  18. * See optenc.c
  19. *
  20. * EXTRA_SIZE is the amount of extra data we allocate in addition
  21. * to the window, and LOOK is the amount of data the optimal
  22. * parser will look ahead. LOOK is dependent on EXTRA_SIZE.
  23. *
  24. * Changing EXTRA_SIZE to 8K doesn't really do anything for
  25. * compression. 4K is a fairly optimal value.
  26. *
  27. * Be careful; our cumbits[] array and counters are all
  28. * ushort's in optenc.c, so make sure they don't overflow
  29. * (e.g. outputting all LOOK bytes as 9 bit uncompressed
  30. * symbols, say). If necessary, change the typedef in optenc.c
  31. * to ulong.
  32. */
  33. #define EXTRA_SIZE 4096
  34. #define LOOK (EXTRA_SIZE-MAX_MATCH-2)
  35. /*
  36. * Number of search trees used (for storing root nodes)
  37. */
  38. #define NUM_SEARCH_TREES 65536
  39. /*
  40. * Chunk size required by FCI
  41. */
  42. #define CHUNK_SIZE 32768
  43. /*
  44. * The maximum amount of data we will allow in our output buffer before
  45. * calling lzx_output_callback() to get rid of it. Since we do this
  46. * for every 32K of input data, the output buffer only has to be able
  47. * to contain 32K + some spillover, which won't be much, because we
  48. * output uncompressed blocks if we determine a block is going to be
  49. * too large.
  50. */
  51. #define OUTPUT_BUFFER_SIZE (CHUNK_SIZE+MAX_GROWTH)
  52. /*
  53. * Maximum allowable number of block splits per 32K of uncompressed
  54. * data; if increased, then MAX_GROWTH will have to be increased also.
  55. */
  56. #define MAX_BLOCK_SPLITS 4
  57. /*
  58. * Max growth is calculated as follows:
  59. *
  60. * TREE AND BLOCK INFO
  61. * ===================
  62. *
  63. * The very first time the encoder is run, it outputs a 32 bit
  64. * file translation size.
  65. *
  66. * 3 bits to output block type
  67. * 24 bits for block size in uncompressed bytes.
  68. *
  69. * Max size of a tree of n elements is 20*4 + 5*n bits
  70. *
  71. * There is a main tree of max 700 elements which is really encoded
  72. * as two separate trees of 256 and 444(max). There is also a
  73. * secondary length tree of 249 elements.
  74. *
  75. * That is 1360 bits, plus 2300 bits, plus 1325 bits.
  76. *
  77. * There may also be an aligned offset tree, which is 24 bits.
  78. *
  79. * Flushing output bit buffer; max 16 bits.
  80. *
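  81. * Grand total: 32+3+24+1360+2300+1325+24+16 = 5084 bits/block (this counts the one-time 32 bit translation size for every block, so it over-estimates).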
  82. *
  83. *
  84. * PARSER INFO
  85. * ===========
  86. *
  87. * Parser worst case scenario is with 2 MB buffer (50 position slots),
  88. * all matches of length 2, distributed over slots 32 and 33 (since
  89. * matches of length 2 further away than 128K are prohibited). These
  90. * slots have 15 verbatim bits. Maximum size per code is then
  91. * 2 bits to say which slot (taking into account that there will be
  92. * at least another code in the tree) plus 15 verbatim bits, for a
  93. * total of 17 bits per 2 input bytes (16 bits). Max growth on 32K of input data is therefore
  94. * 1/16 * 32K, or 2K bytes.
  95. *
  96. * Alternatively, if there is only one match and everything else
  97. * is a character, then 255 characters will be length 8, and one
  98. * character and the match will be length 9. Assume the true
  99. * frequency of the demoted character is almost a 1 in 2^7
  100. * probability (it got demoted from a 2^8, but it was fairly
  101. * close to being 2^7). If there are 32768/256, or 128, occurrences
  102. * of each character, but, say, almost 256 for the demoted character,
  103. * then the demoted character will expand the data by less than
  104. * 1 bit * 256, or 256 bits. The match will take a little to
  105. * output, but max growth for "all characters" is about 256 bits.
  106. *
  107. *
  108. * END RESULT
  109. * ==========
  110. *
  111. * The maximum number of blocks which can be output is limited to
  112. * 4 per 32K of uncompressed data.
  113. *
  114. * Therefore, max growth is 4*5084 bits (2542 bytes), plus 2K bytes, or 4590
  115. * bytes; the MAX_GROWTH value of 6144 (6K) defined below exceeds this bound.
  116. */
  117. #define MAX_GROWTH 6144
  118. /*
  119. * Don't allow match length 2's which are further away than this
  120. * (see above)
  121. */
  122. #define MAX_LENGTH_TWO_OFFSET (128*1024)
  123. /*
  124. * When we find a match which is at least this long, prematurely
  125. * exit the binary search.
  126. *
  127. * This avoids us inserting huge match lengths of 257 zeroes, for
  128. * example. Compression will improve very *very* marginally by
  129. * increasing this figure, but it will seriously impact
  130. * performance.
  131. *
  132. * Don't make this number >= (MAX_MATCH-2); see bsearch.c.
  133. */
  134. #define BREAK_LENGTH 50
  135. /*
  136. * If this option is defined, the parser will insert all bytes of
  137. * matches with lengths >= 16 with a distance of 1; this is a bad
  138. * idea, since matches like that are generally zeroes, which we
  139. * want to avoid inserting into the search tree.
  140. */
  141. //#define INSERT_NEAR_LONG_MATCHES
  142. /*
  143. * If the optimal parser finds a match which is this long or
  144. * longer, it will take it automatically. The compression
  145. * penalty is basically zero, and it helps performance.
  146. */
  147. #define FAST_DECISION_THRESHOLD 50
  148. /*
  149. * Every TREE_CREATE_INTERVAL items, recreate the trees from
  150. * the literals we've encountered so far, to update our cost
  151. * estimations.
  152. *
  153. * 4K seems pretty optimal.
  154. */
  155. #define TREE_CREATE_INTERVAL 4096
  156. /*
  157. * When we're forced to break in our parsing (we exceed
  158. * our span), don't output a match length 2 if it is
  159. * further away than this.
  160. *
  161. * Could make this a variable rather than a constant
  162. *
  163. * On a bad binary file, two chars = 18 bits
  164. * On a good text file, two chars = 12 bits
  165. *
  166. * But match length two's are very uncommon on text files.
  167. */
  168. #define BREAK_MAX_LENGTH_TWO_OFFSET 2048
  169. /*
  170. * When MatchPos >= MPSLOT3_CUTOFF, extra_bits[MP_SLOT(MatchPos)] >= 3
  171. *
  172. * matchpos: 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
  173. * extrabits: 0,0,0,0,1,1,1,1,2,2, 2, 2, 2, 2, 2, 2, 3, ...
  174. *
  175. * Used for aligned offset blocks and statistics.
  176. */
  177. #define MPSLOT3_CUTOFF 16
  178. /*
  179. * Number of elements in the main tree: NUM_CHARS plus enc_num_position_slots shifted left by NL_SHIFT (NUM_CHARS and NL_SHIFT are not defined in this file). The expansion reads context->enc_num_position_slots, so a pointer named context must be in scope wherever this macro is used.
  180. */
  181. #define MAIN_TREE_ELEMENTS (NUM_CHARS+(((long) context->enc_num_position_slots) << NL_SHIFT))
  182. /*
  183. * Max number of literals to hold.
  184. *
  185. * Memory required is MAX_LITERAL_ITEMS for enc_LitData[] array,
  186. * plus MAX_LITERAL_ITEMS/8 for enc_ItemType[] array.
  187. *
  188. * Must not exceed 64K, since that will cause our ushort
  189. * frequencies to overflow.
  190. */
  191. #define MAX_LITERAL_ITEMS 65536
  192. /*
  193. * Max number of distances to hold
  194. *
  195. * Memory required is MAX_DIST_ITEMS*4 for enc_DistData[] array
  196. *
  197. * MAX_DIST_ITEMS should never be greater than MAX_LITERAL_ITEMS,
  198. * since that just wastes space.
  199. *
  200. * However, it's extremely unlikely that one will get 65536 match
  201. * length 2's! In any case, the literal and distance buffers
  202. * are checked independently, and a block is output if either
  203. * overflows.
  204. *
  205. * Bitmaps are highly redundant, though; lots of matches.
  206. */
  207. #define MAX_DIST_ITEMS 32768