Source code of Windows XP (NT5)
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

622 lines
18 KiB

  1. /*************************************************************************
  2. * *
  3. * ENCODE.C *
  4. * *
  5. * Copyright (C) Microsoft Corporation 1990-1994 *
  6. * All Rights reserved. *
  7. * *
  8. **************************************************************************
  9. * *
  10. * Module Intent *
  11. * General encoding & decoding techniques *
  12. * *
  13. **************************************************************************
  14. * *
  15. * Current Owner: BinhN *
  16. * *
  17. **************************************************************************
  18. * *
  19. * Released by Development: (date) *
  20. * *
  21. *************************************************************************/
  22. #include <mvopsys.h>
  23. #include <mem.h>
  24. #include <mvsearch.h>
  25. #include "common.h"
  26. #include "index.h"
  /*
   * Overlay types used to reach the individual 16-bit words and 8-bit bytes
   * of a DWORD in place.
   *
   * The member order below, combined with the accessor macros, assumes a
   * little-endian layout: w1 and b1 sit at the lowest address and are the
   * least significant word and byte.  With that layout BYTE1() is the most
   * significant byte of the value and BYTE4() the least significant one.
   */
  typedef struct {
      unsigned short w1;          /* lowest address  */
      unsigned short w2;          /* highest address */
  } TWOWORD;

  typedef struct {
      unsigned char b1;           /* lowest address  */
      unsigned char b2;
      unsigned char b3;
      unsigned char b4;           /* highest address */
  } FOURBYTE;

  typedef union {
      unsigned long dwVal;
      TWOWORD dw;
      FOURBYTE fb;
  } WORDLONG;

  /*
   * Accessors.  Each one yields an lvalue, so it can be read, assigned or
   * used with |=.  The argument must itself be an lvalue because its address
   * is taken; it is parenthesized so that any lvalue expression (for example
   * *lpdw or rgdw[i]) is taken as a whole.
   */
  #define HI_WORD(p) (((WORDLONG FAR *)&(p))->dw.w2)
  #define LO_WORD(p) (((WORDLONG FAR *)&(p))->dw.w1)
  #define BYTE1(p) (((WORDLONG FAR *)&(p))->fb.b4)
  #define BYTE2(p) (((WORDLONG FAR *)&(p))->fb.b3)
  #define BYTE3(p) (((WORDLONG FAR *)&(p))->fb.b2)
  #define BYTE4(p) (((WORDLONG FAR *)&(p))->fb.b1)
  49. /*************************************************************************
  50. *
  51. * INTERNAL PRIVATE FUNCTIONS
  52. *
  53. * All of them should be declared near
  54. *
  55. *************************************************************************/
  56. PRIVATE LPB PASCAL NEAR LongValPack (LPB, DWORD);
  57. PRIVATE LPB PASCAL NEAR LongValUnpack (LPB, LPDW);
  58. /*************************************************************************
  59. *
  60. * INTERNAL PUBLIC FUNCTIONS
  61. *
  62. * All of them should be declared far, unless we know they belong to
  63. * the same segment. They should be included in some include files
  64. *
  65. *************************************************************************/
  66. PUBLIC CB PASCAL NEAR CbBytePack(LPB, DWORD);
  67. PUBLIC CB PASCAL NEAR OccurrencePack (LPB, LPOCC, WORD);
  68. PUBLIC CB PASCAL NEAR CbCopySortPackedOcc (LPB, LPB, WORD);
  69. PUBLIC void PASCAL NEAR OccurrenceUnpack(LPOCC, LPB, OCCF);
  70. PUBLIC CBIT PASCAL NEAR CbitBitsDw (DWORD);
  71. /*************************************************************************
  72. *
  73. * @doc INTERNAL INDEX
  74. *
  75. * @func LPB PASCAL NEAR | LongValPack |
  76. * The function packs and writes out an encoded 4-bytes value.
  77. * The encoding scheme is as followed:
  78. * - High 3 bit: used to tell how many bytes are to follow
  79. * the current byte
  80. * - The packed value
  81. * Ex:
  82. * 0x1 will be output as 0x1
  83. * 0x1F 0x1F
  84. * 0x2F 0x202F (0010 0000 0010 1111)
  85. *
  86. * @parm LPB | lpbOut |
  87. * Pointer to the output buffer
  88. *
  89. * @parm DWORD | dwVal |
  90. * 4-bytes value to be packed and emitted
  91. *
  92. * @rdesc
  93. * The buffer pointer is advanced and returned.
  94. *
  95. * @comm No validity check is done for the the output buffer
  96. *************************************************************************/
  97. PRIVATE LPB PASCAL NEAR LongValPack (LPB lpbOut, DWORD dwVal)
  98. {
  99. if (HI_WORD(dwVal) > 0x1fff) {
  100. *lpbOut++ = 4 << 5; // 4 bytes follow this byte
  101. goto Copy4Bytes;
  102. }
  103. if (HI_WORD(dwVal) > 0x001f) {
  104. BYTE1(dwVal) |= 3 << 5; /* 3 bytes follows this byte */
  105. goto Copy4Bytes;
  106. }
  107. if (HI_WORD(dwVal) > 0 || LO_WORD(dwVal) > 0x1fff) {
  108. BYTE2(dwVal) |= 2 << 5; /* 2 bytes follows this byte */
  109. goto Copy3Bytes;
  110. }
  111. if (LO_WORD(dwVal) > 0x001f) {
  112. BYTE3(dwVal) |= 1 << 5; /* 1 bytes follows this byte */
  113. goto Copy2Bytes;
  114. }
  115. else
  116. goto Copy1Bytes;
  117. Copy4Bytes:
  118. *lpbOut ++ = BYTE1(dwVal);
  119. Copy3Bytes:
  120. *lpbOut ++ = BYTE2(dwVal);
  121. Copy2Bytes:
  122. *lpbOut ++ = BYTE3(dwVal);
  123. Copy1Bytes:
  124. *lpbOut ++ = BYTE4(dwVal);
  125. return lpbOut;
  126. }
  127. /*************************************************************************
  128. *
  129. * @doc INTERNAL INDEX
  130. *
  131. * @func LPB PASCAL NEAR | LongValUnpack |
  132. * This is the reverse on LongValPack. Given a buffer containing
  133. * a packed 4-byte value, the function will unpack and return the
  134. * value. The pointer to the input buffer is updated and returned
  135. *
  136. * @parm LPB | lpbIn |
  137. * Input buffer containing the packed value
  138. *
  139. * @parm LPDW | lpdw |
  140. * Place to store the unpacked value
  141. *
  142. * @rdesc The new updated input buffer pointer
  143. *
  144. * @comm No validity check for lpbIn is done because of speed
  145. *
  146. *************************************************************************/
  147. PRIVATE LPB PASCAL NEAR LongValUnpack (LPB lpbIn, LPDW lpdw)
  148. {
  149. DWORD dwVal = 0;
  150. register int cbByteCopied;
  151. /* Get the number of bytes to be copied */
  152. cbByteCopied = *lpbIn >> 5;
  153. *lpbIn &= 0x1f;
  154. switch (cbByteCopied) {
  155. case 4:
  156. lpbIn++;
  157. case 3:
  158. BYTE1(dwVal) = *lpbIn++;
  159. case 2:
  160. BYTE2(dwVal) = *lpbIn++;
  161. case 1:
  162. BYTE3(dwVal) = *lpbIn++;
  163. case 0:
  164. BYTE4(dwVal) = *lpbIn++;
  165. }
  166. *lpdw = dwVal;
  167. return lpbIn;
  168. }
  169. /*************************************************************************
  170. *
  171. * @doc INTERNAL INDEX
  172. *
  173. * @func CB PASCAL NEAR | OccurrencePack |
  174. * Packs and emits all occurrence's fields
  175. *
  176. * @parm LPB | lpbOut |
  177. * Place to store the packed occurrence's fields
  178. *
  179. * @parm LPOCC | lpOccIn |
  180. * Pointer to occurrence structure
  181. *
  182. * @parm WORD | occf |
  183. * Occurrence flags telling which fields are present
  184. *
  185. * @rdesc The number of bytes written
  186. *
  187. *************************************************************************/
  188. PUBLIC CB PASCAL NEAR OccurrencePack (register LPB lpbOut, LPOCC lpOccIn,
  189. register WORD occf)
  190. {
  191. DWORD dwVal;
  192. LPB lpbSaved = lpbOut;
  193. while (occf) {
  194. if (occf & OCCF_FIELDID) {
  195. dwVal = lpOccIn->dwFieldId;
  196. occf &= ~OCCF_FIELDID;
  197. }
  198. else if (occf & OCCF_TOPICID) {
  199. dwVal = lpOccIn->dwTopicID;
  200. occf &= ~OCCF_TOPICID;
  201. }
  202. else if (occf & OCCF_COUNT) {
  203. dwVal = lpOccIn->dwCount;
  204. occf &= ~OCCF_COUNT;
  205. }
  206. else if (occf & OCCF_OFFSET) {
  207. dwVal = lpOccIn->dwOffset;
  208. occf &= ~OCCF_OFFSET;
  209. }
  210. else if (occf & OCCF_LENGTH) {
  211. dwVal = lpOccIn->wWordLen;
  212. occf &= ~OCCF_LENGTH;
  213. }
  214. else {
  215. break;
  216. }
  217. if (HI_WORD(dwVal) > 0x1fff) {
  218. *lpbOut++ = 4 << 5; // 4 bytes follow this byte
  219. goto Copy4Bytes;
  220. }
  221. if (HI_WORD(dwVal) > 0x001f) {
  222. BYTE1(dwVal) |= 3 << 5; /* 3 bytes follows this byte */
  223. goto Copy4Bytes;
  224. }
  225. if (HI_WORD(dwVal) > 0 || LO_WORD(dwVal) > 0x1fff) {
  226. BYTE2(dwVal) |= 2 << 5; /* 2 bytes follows this byte */
  227. goto Copy3Bytes;
  228. }
  229. if (LO_WORD(dwVal) > 0x001f) {
  230. BYTE3(dwVal) |= 1 << 5; /* 1 bytes follows this byte */
  231. goto Copy2Bytes;
  232. }
  233. else
  234. goto Copy1Bytes;
  235. #if 1
  236. Copy4Bytes:
  237. *lpbOut ++ = BYTE1(dwVal);
  238. Copy3Bytes:
  239. *lpbOut ++ = BYTE2(dwVal);
  240. Copy2Bytes:
  241. *lpbOut ++ = BYTE3(dwVal);
  242. Copy1Bytes:
  243. *lpbOut ++ = BYTE4(dwVal);
  244. }
  245. return (CB)(lpbOut - lpbSaved);
  246. #else
  247. Copy4Bytes:
  248. *(LPDW)lpbOut = dwVal;
  249. lpbOut += 4;
  250. continue;
  251. Copy3Bytes:
  252. *lpbOut ++ = BYTE2(dwVal);
  253. Copy2Bytes:
  254. *(LPW)lpbOut = LO_WORD(dwVal);
  255. lpbOut += 2;
  256. continue;
  257. Copy1Bytes:
  258. *lpbOut ++ = BYTE4(dwVal);
  259. continue;
  260. }
  261. #endif
  262. return (CB)(lpbOut - lpbSaved);
  263. }
  264. /*************************************************************************
  265. * @doc INTERNAL INDEX
  266. *
  267. * @func CB PASCAL NEAR | CbCopySortPackedOcc |
  268. * Copy the packed occurrence structure
  269. *
  270. * @parm LPB | lpbDst |
  271. * Pointer to destination buffer
  272. * @parm LPB | lpbSrc |
  273. * Pointer to source buffer
  274. * @parm WORD | uiNumOcc |
  275. * Number of occurrence fields (>= 1)
  276. * @rdesc
  277. * return the number of bytes copied
  278. *************************************************************************/
  279. PUBLIC CB PASCAL NEAR CbCopySortPackedOcc (LPB lpbDst, LPB lpbSrc,
  280. WORD uiNumOcc)
  281. {
  282. register int cbByteCopied;
  283. LPB lpbSaved = lpbDst;
  284. do {
  285. for (cbByteCopied = *lpbSrc >> 5; cbByteCopied >= 0; cbByteCopied--)
  286. *lpbDst++ = *lpbSrc++;
  287. uiNumOcc--;
  288. } while (uiNumOcc > 0);
  289. return (CB)(lpbDst - lpbSaved);
  290. }
  291. PUBLIC void PASCAL NEAR OccurrenceUnpack(LPOCC lpOccOut,
  292. register LPB lpbIn, register OCCF occf)
  293. {
  294. DWORD dwVal = 0;
  295. LPDW lpdw;
  296. register int cbByteCopied;
  297. while (occf)
  298. {
  299. DWORD dwTmp;
  300. if (occf & OCCF_FIELDID) {
  301. lpdw = &lpOccOut->dwFieldId;
  302. occf &= ~OCCF_FIELDID;
  303. }
  304. else if (occf & OCCF_TOPICID) {
  305. lpdw = &lpOccOut->dwTopicID;
  306. occf &= ~OCCF_TOPICID;
  307. }
  308. else if (occf & OCCF_COUNT) {
  309. lpdw = &lpOccOut->dwCount;
  310. occf &= ~OCCF_COUNT;
  311. }
  312. else if (occf & OCCF_OFFSET) {
  313. lpdw = &lpOccOut->dwOffset;
  314. occf &= ~OCCF_OFFSET;
  315. }
  316. else if (occf & OCCF_LENGTH) {
  317. dwTmp = lpOccOut->wWordLen;
  318. lpdw = &dwTmp;
  319. occf &= ~OCCF_LENGTH;
  320. }
  321. else {
  322. break;
  323. }
  324. dwVal = 0;
  325. /* Get the number of bytes to be copied */
  326. cbByteCopied = *lpbIn >> 5;
  327. *lpbIn &= 0x1f;
  328. #if 1
  329. switch (cbByteCopied) {
  330. case 4:
  331. lpbIn++;
  332. case 3:
  333. BYTE1(dwVal) = *lpbIn++;
  334. case 2:
  335. BYTE2(dwVal) = *lpbIn++;
  336. case 1:
  337. BYTE3(dwVal) = *lpbIn++;
  338. case 0:
  339. BYTE4(dwVal) = *lpbIn++;
  340. }
  341. #else
  342. switch (cbByteCopied) {
  343. case 4:
  344. lpbIn++;
  345. case 3:
  346. dwVal = *(LPDW)lpbIn;
  347. lpbIn += 4;
  348. break;
  349. case 2:
  350. BYTE1(dwVal) = *lpbIn++;
  351. case 1:
  352. LO_WORD(dwVal) = *(LPW)lpbIn;
  353. lpbIn += 2;
  354. break;
  355. case 0:
  356. BYTE4(dwVal) = *lpbIn++;
  357. }
  358. #endif
  359. *lpdw = dwVal;
  360. }
  361. }
  362. PUBLIC CBIT PASCAL NEAR CbitBitsDw (DWORD dwVal)
  363. {
  364. register WORD wVal; //Value to be scanned
  365. register WORD cBitCount; // Number of bit
  366. if (HI_WORD(dwVal)) {
  367. /* We will look at the hi-word only, but add 16 to the result */
  368. cBitCount = 16;
  369. wVal = HI_WORD(dwVal);
  370. }
  371. else {
  372. /* We look at the lo-word only */
  373. cBitCount = 0;
  374. wVal = LO_WORD(dwVal);
  375. }
  376. /* Now do the shift */
  377. while (wVal) {
  378. cBitCount++;
  379. wVal >>= 1;
  380. }
  381. return cBitCount;
  382. }
  383. // - - - - - - - - -
  384. // This function figures out how best to encode a set of values. It
  385. // uses an array of statistics about the data in order to make this
  386. // determination. The array conveys to the algorithm the number of
  387. // values that require a particular number of bits to represent. For
  388. // the "fixed" and "bell" schemes, this is all the information that's
  389. // needed in order to make a judgment as to which scheme is best.
  390. //
  391. // The inner workings of this are very hard to understand, so you
  392. // should probably read any occurrence compression external documentation
  393. // you can find before you try to tackle this function.
  394. //
  395. // - - - - - - - - -
  396. //
  397. // Information about the "bitstream" scheme:
  398. //
  399. // The number of bits necessary to encode the values using the
  400. // "bitstream" scheme is spoon-fed into the algorithm via a parameter,
  401. // because it's not possible to derive this value using the statistics
  402. // array.
  403. //
  404. // - - - - - - - - -
  405. //
  406. // Information about the "bell" scheme:
  407. //
  408. // Here's a bell grid, which I hope will provide some documentation as
  409. // to the characteristics of the bell scheme. It is possible to figure
  410. // out how many bits a given sample will take to encode, given a
  411. // particular bell "center" value, but the algorithm is complicated and
  412. // non-intuitive.
  413. //
  414. // Bell Center
  415. //
  416. // 0 1 2 3 4 5 ... 31
  417. // +--------------------------------------------- ... ------
  418. // 0 | 1(c) 2 3 4 5 6 ... 32
  419. // 1 | 2(c) 2(c) 3 4 5 6 ... 32
  420. // 2 | 4 3(c) 3(c) 4 5 6 ... 32
  421. // Size in 3 | 6 5 4(c) 4(c) 5 6 ... 32
  422. // bits of 4 | 8 7 6 5(c) 5(c) 6 ... 32
  423. // value to 5 | 10 9 8 7 6(c) 6(c) .. 32
  424. // encode 6 | 12 11 10 9 8 7(c) .. 32
  425. // 7 | 14 13 12 11 10 9 ... 32
  426. // 8 | 16 15 14 13 12 11 ... 32
  427. // 9 | 18 17 16 15 14 13 ... 32
  428. // .. . .. .. .. .. .. .. ... ..
  429. // 32 | 64 63 62 61 60 59 ... 33(c)
  430. //
  431. // The numbers in this table represent the number of bits necessary to
  432. // encode a given value, using a given bell center. The "(c)" represents
  433. // the point of minimum waste. There are two of these for each "center".
  434. // The waste at (c) is guaranteed to be exactly one bit.
  435. //
  436. // It would be possible for the bell center to be equal to 32, but this
  437. // would mess up my life since I only store center values in 5 bits, and
  438. // 32 would take 6 bits. Upon examination, though, it can be shown that
  439. // there are no cases where a ceiling value of 32 is any better than a
  440. // ceiling value of 31, so I can rule out 32.
  441. //
  442. // - - - - - - - - -
  443. //
  444. // Information about the "fixed" scheme:
  445. //
  446. // The "center" as calculated by this algorithm is the number of bits
  447. // necessary to represent the largest value in the sample.
  448. //
  449. // Since this value can be 32, but I'm only using 5 bits to store center
  450. // values, I subtract one from this value, which I will add back in
  451. // during decompression. This means that I can't store zero, since
  452. // 0 - 1 = -1, which is 31 if we've got a 5-bit quantity. So I don't
  453. // allow the fixed scheme to use zero as a center. If the best value
  454. // comes up as zero, I make it one instead.
  455. // - - - - - - - - -
  456. PUBLIC void NEAR PASCAL VGetBestScheme(
  457. LPCKEY lpckey, // Output compression key.
  458. LRGDW lrgdwStats, // Each dword (N) in this array at
  459. // a given array index (M) represents
  460. // a count of the number of values in
  461. // the sample that require M bits to
  462. // store. If (lrgdwStats[6] == 17),
  463. // there were 17 values in the sample
  464. // that required 6 bits to store.
  465. DWORD lcbitRawBitstreamBits, // This is lcbitBITSTREAM_ILLEGAL if
  466. // bitstream packing is not allowed,
  467. // else it is equal to the number of
  468. // bits necessary to encode all of
  469. // the values using bitstream
  470. // encoding.
  471. int fNoFixedScheme) // Set if we don't want fixed scheme
  472. {
  473. register short iStats; // Scratch index.
  474. DWORD argdwBellBits[ // This is used to compute bell
  475. cbitCENTER_MAX]; // values. Its sole purpose is to
  476. // save a bunch of multiplies that
  477. // I'd have to do if it didn't exist.
  478. DWORD lcbitBell; // Total number of bits used if I
  479. // adopt the bell scheme to encode
  480. // this sample.
  481. DWORD lcbitFixed; // Total number of bits used if I
  482. // adopt the fixed scheme to encode
  483. // this sample.
  484. DWORD lcbitBitstream; // Total number of bits used if I
  485. // adopt the scheme scheme to encode
  486. // this sample.
  487. DWORD lcTotalEncodedValues; // The total number of values that I
  488. // have to encode.
  489. short idwCeiling; // The size of "lrgdwStats" if you
  490. // trim off all of the high-end zero
  491. // elements.
  492. short idwBellCeiling; // This is "idwCeiling" unless the
  493. // value of "idwCeiling" is
  494. // cbitCENTER_MAX, in which case
  495. // it's "idwCeiling - 1".
  496. CBIT cbitBellCenter; // This will be the best "center"
  497. // value found for the bell scheme.
  498. CBIT cbitFixedCenter; // This will be the "center" value for
  499. // the "fixed" scheme.
  500. //
  501. // Determine the value of "idwCeiling", which is used to trim off
  502. // consecutive zero values at the top end of the statistics
  503. // array.
  504. //
  505. for (iStats = cbitCENTER_MAX - 1; iStats >= 0; iStats--)
  506. if (lrgdwStats[iStats])
  507. break;
  508. idwCeiling = iStats + 1;
  509. //
  510. // Initialize variables used in bell computation.
  511. //
  512. for (iStats = 0; iStats < idwCeiling; iStats++)
  513. argdwBellBits[iStats] = lrgdwStats[iStats] *
  514. (DWORD)(iStats * 2 + 1);
  515. lcbitBell = (DWORD)-1L;
  516. cbitBellCenter = 0;
  517. lcTotalEncodedValues = 0L;
  518. idwBellCeiling = (idwCeiling == cbitCENTER_MAX) ?
  519. cbitCENTER_MAX - 1 : idwCeiling;
  520. //
  521. // Each pass through the following loop generates a value,
  522. // "lcbitBellTotal", which is equal to the number of bits
  523. // necessary to encode all of the values, using a "center" value
  524. // equal to the loop index ("iStats"). This value is checked
  525. // against "lcbitBell", if it's less it becomes the new
  526. // "lcbitBell", and the center is stored in "cbitBellCenter".
  527. //
  528. for (iStats = 0; iStats < idwBellCeiling; iStats++) {
  529. DWORD lcbitBellTotal;
  530. register short i;
  531. lcTotalEncodedValues += lrgdwStats[iStats];
  532. lcbitBellTotal = 0L;
  533. for (i = 0; i <= iStats; i++) { // Adjust values below center.
  534. lcbitBellTotal += argdwBellBits[i];
  535. argdwBellBits[i] += lrgdwStats[i];
  536. }
  537. for (; i < idwCeiling; i++) { // Adjust values above center.
  538. argdwBellBits[i] -= lrgdwStats[i];
  539. lcbitBellTotal += argdwBellBits[i];
  540. }
  541. if (lcbitBellTotal < lcbitBell) {
  542. lcbitBell = lcbitBellTotal;
  543. cbitBellCenter = iStats;
  544. }
  545. }
  546. //
  547. // As of this point the best bell center is stored in
  548. // "cbitBellCenter", although given the obscurity of the logic in
  549. // the above loop you might have to take my word for it. The
  550. // number of bits necessary to bell encode the values using
  551. // "cbitBellCenter" as the center is in "lcbitBell".
  552. //
  553. // This next bit of code figures out which scheme to use, and
  554. // sets up the returned compression key ("lpckey") with this
  555. // result.
  556. //
  557. lcbitBell += cbitWASTED_BELL;
  558. cbitFixedCenter = (idwCeiling <= 1) ? 1 : idwCeiling - 1;
  559. lcbitFixed = (DWORD)cbitFixedCenter * // Get total "fixed" bits.
  560. lcTotalEncodedValues + cbitWASTED_FIXED;
  561. lcbitBitstream = (lcbitRawBitstreamBits ==
  562. lcbitBITSTREAM_ILLEGAL) ?
  563. (DWORD)-1L : // Get total "bitstream" bits.
  564. lcbitRawBitstreamBits + cbitWASTED_BITSTREAM;
  565. if ((lcbitFixed <= lcbitBell && fNoFixedScheme == FALSE) &&
  566. (lcbitFixed <= lcbitBitstream)) {
  567. lpckey->cschScheme = CSCH_FIXED; // Best scheme was
  568. lpckey->ucCenter = // "fixed". Note
  569. (BYTE)(cbitFixedCenter - 1); // the "- 1".
  570. } else if (lcbitBitstream <= lcbitBell)
  571. lpckey->cschScheme = CSCH_NONE; // Best scheme was
  572. // "bitstream".
  573. else {
  574. lpckey->cschScheme = CSCH_BELL; // Best scheme was
  575. lpckey->ucCenter = // "bell".
  576. (BYTE)cbitBellCenter;
  577. }
  578. }