Windows NT 4.0 source code leak
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1063 lines
30 KiB

4 years ago
  1. #include "stdafx.h"
  2. #pragma hdrstop
  3. #include "ctable.h"
  4. #include "FTSIFace.h"
  5. #include "Memex.h"
  6. #include "FtsLex.h"
  7. #include "Bytemaps.h"
  8. #include <stdlib.h>
  9. extern char chSpaces[];
  10. extern char chNulls [];
  11. extern char gchNull [];
  12. BYTE acOneBits[16] =
  13. {
  14. 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4
  15. };
  16. // 12345678901234567890
  17. char chSpaces[SPACE_TOKEN_LIMIT + 1] = " ";
  18. char chNulls [SPACE_TOKEN_LIMIT + 1] = " ";
  19. char gchNull [1] = {(const char) 0x00};
  20. CCompressTable::CCompressTable(UINT iCharsetDefault)
  21. {
  22. m_psht = NULL;
  23. m_pavr = NULL;
  24. m_iCharSetDefault = iCharsetDefault;
  25. m_pWeightInfo = NULL;
  26. m_pbImages = NULL;
  27. }
  28. CCompressTable::~CCompressTable()
  29. {
  30. if (m_psht) delete m_psht;
  31. if (m_pavr) delete m_pavr;
  32. if (m_vb.Base) FreeVirtualBuffer(&m_vb);
  33. if (m_pWeightInfo ) VFree(m_pWeightInfo);
  34. if (m_pbImages ) VFree(m_pbImages );
  35. }
  36. CCompressTable *CCompressTable::NewCompressTable(UINT iCharsetDefault)
  37. {
  38. CCompressTable *pct= NULL;
  39. __try
  40. {
  41. __try
  42. {
  43. pct= New CCompressTable(iCharsetDefault);
  44. CreateVirtualBuffer(&(pct->m_vb), CB_BUFFER_COMMIT, CB_BUFFER_RESERVATION);
  45. pct->m_psht= CSegHashTable::NewSegHashTable(sizeof(ULONG), sizeof(ULONG));
  46. pct->m_pavr= CAValRef::NewValRef(C_TOKEN_BLOCK);
  47. }
  48. __finally
  49. {
  50. if (_abnormal_termination() && pct)
  51. {
  52. delete pct; pct= NULL;
  53. }
  54. }
  55. }
  56. __except(_exception_code() == STATUS_NO_MEMORY? EXCEPTION_EXECUTE_HANDLER : EXCEPTION_CONTINUE_SEARCH)
  57. {
  58. pct= NULL;
  59. }
  60. return pct;
  61. }
  62. INT CCompressTable::ScanString(PBYTE pbText, INT cbText, INT iCharSet)
  63. {
  64. CP cp = GetCPFromCharset(iCharSet);
  65. if (m_pWeightInfo) return ALREADY_WEIGHED;
  66. const UINT cwTokenBlock= 1024;
  67. PSTR apTokenStart[C_TOKEN_BLOCK];
  68. PSTR apTokenEnd [C_TOKEN_BLOCK];
  69. BYTE abType [C_TOKEN_BLOCK];
  70. for (m_pavr->DiscardRefs() ; cbText; m_pavr->DiscardRefs())
  71. {
  72. UINT cTokens= WordBreakA(cp, (PSTR*)&pbText, &cbText, apTokenStart, apTokenEnd, PBYTE(&abType), NULL, C_TOKEN_BLOCK, TOKENIZE_SPACES);
  73. PSTR *ppTokenStart = apTokenStart,
  74. *ppTokenEnd = apTokenEnd;
  75. for (; cTokens-- ; )
  76. {
  77. ASSERT(*ppTokenStart - *ppTokenEnd <= UINT(~USHORT(0)));
  78. m_pavr->AddValRef(*ppTokenStart, USHORT(*ppTokenEnd - *ppTokenStart));
  79. ppTokenStart++;
  80. ppTokenEnd++;
  81. }
  82. m_psht->Assimilate(m_pavr, abType, CCompressTable::IncrementCounter, CCompressTable::InitialCounter);
  83. }
  84. return 0;
  85. }
  86. void CCompressTable::IncrementCounter(UINT iValue, PVOID pvTag, PVOID pvEnvironment)
  87. {
  88. PUINT pui= PUINT(pvTag);
  89. PBYTE pb= PBYTE(pvEnvironment);
  90. ASSERT(pb[iValue]? ((INT) *pui) > 0 : ((INT) *pui) < 0);
  91. if (pb[iValue]) (*pui)++; // Positive counts mark symbols
  92. else (*pui)--; // Negative counts mark non-symbols
  93. }
  94. void CCompressTable::InitialCounter(UINT iValue, PVOID pvTag, PVOID pvEnvironment)
  95. {
  96. PUINT pui= PUINT(pvTag);
  97. PBYTE pb= PBYTE(pvEnvironment);
  98. ASSERT(!*pui);
  99. *pui= pb[iValue]? 1 : UINT(-1);
  100. }
  101. PUINT writebits(PUINT pNextCode, PINT pBitsLeft, int iBits, DWORD dwCode)
  102. {
  103. int iTmp;
  104. PUINT pNextTemp;
  105. if (iBits > *pBitsLeft)
  106. {
  107. iTmp = *pBitsLeft;
  108. pNextTemp = writebits(pNextCode, pBitsLeft, *pBitsLeft, dwCode);
  109. return(writebits(pNextTemp, pBitsLeft, iBits - iTmp, dwCode >> iTmp));
  110. }
  111. dwCode <<= 32 - iBits;
  112. *pNextCode >>= iBits;
  113. *pNextCode |= dwCode;
  114. *pBitsLeft -= iBits;
  115. if (*pBitsLeft != 0) return(pNextCode);
  116. *pBitsLeft = 32;
  117. pNextCode++;
  118. *pNextCode = 0;
  119. return(pNextCode);
  120. }
  121. ERRORCODE CCompressTable::GetPhraseTable(PUINT pcPhrases, PBYTE *ppbImage, PUINT pcbImage, PBYTE *ppbIndex, PUINT pcbIndex)
  122. {
  123. ERRORCODE ec= 0;
  124. PBYTE pbImages = NULL;
  125. PUINT pIndexCode = NULL;
  126. __try
  127. {
  128. __try
  129. {
  130. if (!m_pWeightInfo)
  131. {
  132. ec=ConstructPhraseEncoding();
  133. if (ec) __leave;
  134. ASSERT(m_cWeights);
  135. if (!m_cWeights)
  136. {
  137. ec= EMPTY_PHRASE_TABLE;
  138. __leave;
  139. }
  140. }
  141. ASSERT(m_cWeights);
  142. *pcPhrases= m_cWeights;
  143. *pcbImage = m_cbImageTotal;
  144. pbImages= PBYTE(malloc(m_cbImageTotal)); // BugBug: Change this to LocalAlloc!
  145. // Must change compiler at the same time.
  146. if (!pbImages) RaiseException(STATUS_NO_MEMORY, EXCEPTION_NONCONTINUABLE, 0, NULL);
  147. CopyMemory(pbImages, m_pbImages, m_cbImageTotal);
  148. UINT cbitsBasis = CBitsToRepresent((m_cbImageTotal - m_cWeights)/ m_cWeights);
  149. UINT basis = 1 << cbitsBasis;
  150. UINT cbitsEstimate = m_cWeights * (1 + cbitsBasis) + (m_cbImageTotal + basis - m_cWeights - 1) / basis;
  151. UINT cdwEstimate = (cbitsEstimate + 31) >> 5;
  152. UINT cdwBits = (m_cWeights + 31) >> 5;
  153. UINT cbTotal = sizeof(UINT) * (cdwBits + cdwEstimate) + sizeof(JINDEXHDR);
  154. pIndexCode= PUINT(malloc(cbTotal)); // BugBug: Change this to LocalAlloc!
  155. // Must change compiler at the same time.
  156. if (!pIndexCode) RaiseException(STATUS_NO_MEMORY, EXCEPTION_NONCONTINUABLE, 0, NULL);
  157. if (!pIndexCode) { free(pbImages); return OUT_OF_MEMORY; }
  158. ASSERT(cbitsBasis < 6);
  159. JINDEXHDR jIHdr;
  160. jIHdr.Magic = 'J';
  161. jIHdr.cCount = m_cWeights;
  162. jIHdr.cBits = cbitsBasis;
  163. UINT fBasisMask = basis - 1;
  164. CopyMemory(pIndexCode, (const void *) &jIHdr, sizeof(JINDEXHDR));
  165. ASSERT(sizeof(JINDEXHDR) == sizeof(INT));
  166. PUINT pNextCode = pIndexCode + 1;
  167. UINT cbitsLeft= 32;
  168. UINT ii;
  169. for (ii = 0; ii < m_cWeights; ii++)
  170. {
  171. ASSERT(m_pWeightInfo[ii].cbImage);
  172. UINT cb = m_pWeightInfo[ii].cbImage - 1;
  173. UINT dwRight = cb & fBasisMask;
  174. cb = (cb & ~fBasisMask) >> cbitsBasis;
  175. UINT cBits= 1 + cbitsBasis + cb;
  176. ASSERT(cBits < 33);
  177. UINT dwLeft = cb ? (((DWORD)(~0)) >> (32 - cb)) : 0;
  178. pNextCode = writebits(pNextCode, PINT(&cbitsLeft), cBits, (dwRight << (1 + cb)) | dwLeft);
  179. }
  180. if (cbitsLeft < 32)
  181. pNextCode= writebits(pNextCode, PINT(&cbitsLeft), cbitsLeft, 0);
  182. ASSERT(pNextCode - (pIndexCode+1) <= INT(cdwEstimate));
  183. PWeightInfo pwi = m_pWeightInfo;
  184. UINT dw = 0;
  185. UINT c = m_cWeights;
  186. for (cbitsLeft = 32; c--; )
  187. {
  188. UINT fSymbol= 1 & (((pwi++)->fSymbol) >> SYMBOL_SHIFT);
  189. dw= _rotr(dw | fSymbol, 1);
  190. if (!--cbitsLeft)
  191. {
  192. *pNextCode++= dw;
  193. dw= 0;
  194. cbitsLeft= 32;
  195. }
  196. }
  197. if (cbitsLeft < 32)
  198. *pNextCode++= _rotr(dw, cbitsLeft);
  199. ASSERT(cbTotal >= (pNextCode - pIndexCode) * sizeof(UINT));
  200. *pcbIndex= (pNextCode - pIndexCode) * sizeof(UINT);
  201. *ppbImage= pbImages; pbImages = NULL;
  202. *ppbIndex= PBYTE(pIndexCode); pIndexCode = NULL;
  203. ec= 0;
  204. __leave;
  205. }
  206. __finally
  207. {
  208. if (pbImages ) free(pbImages );
  209. if (pIndexCode) free(pIndexCode);
  210. }
  211. }
  212. __except(_exception_code() == STATUS_NO_MEMORY? EXCEPTION_EXECUTE_HANDLER : EXCEPTION_CONTINUE_SEARCH)
  213. {
  214. ec= OUT_OF_MEMORY;
  215. }
  216. return ec;
  217. }
  218. ERRORCODE CCompressTable::SetPhraseTable(PBYTE pbImage, UINT cbImage, PBYTE pbIndex, UINT cbIndex)
  219. {
  220. ERRORCODE ec= OUT_OF_MEMORY;
  221. PBYTE pbTokenImages = NULL;
  222. PWeightInfo pwiPhrases = NULL;
  223. CAValRef *pavr = NULL;
  224. __try
  225. {
  226. __try
  227. {
  228. if (UINT(pbIndex) & (sizeof(UINT)-1)) { ec= ALIGNMENT_ERROR; __leave; }
  229. if (cbIndex & (sizeof(UINT)-1)) { ec= ALIGNMENT_ERROR; __leave; }
  230. JINDEXHDR jIHdr= *(JINDEXHDR *) pbIndex;
  231. if (jIHdr.Magic != 'J' || jIHdr.cCount == 0 || jIHdr.cCount > 128 + 16 * 1024)
  232. { ec= INVALID_PHRASE_TABLE; __leave; }
  233. pbTokenImages= PBYTE(VAlloc(FALSE, cbImage));
  234. UINT cWeights= jIHdr.cCount;
  235. pwiPhrases= PWeightInfo(VAlloc(FALSE, cWeights * sizeof(WeightInfo)));
  236. pavr= CAValRef::NewValRef(cWeights);
  237. CopyMemory(pbTokenImages, pbImage, cbImage);
  238. ASSERT(sizeof(JINDEXHDR) == sizeof(UINT));
  239. UINT cbitsBasis = jIHdr.cBits;
  240. UINT basis = 1 << cbitsBasis;
  241. UINT cbitsEstimate = cWeights * (1 + cbitsBasis) + (cbImage + basis - cWeights - 1) / basis;
  242. UINT cdwEstimate = (cbitsEstimate + 31) >> 5;
  243. CJCode JCode(jIHdr.cBits, cWeights, PVOID(pbIndex + sizeof(JINDEXHDR)));
  244. PBYTE pb = pbTokenImages;
  245. PWeightInfo pwi = pwiPhrases;
  246. UINT c = cWeights;
  247. for (; c--; ++pwi)
  248. {
  249. UINT cb= JCode.GetNextDelta();
  250. pavr->AddValRef(pb, cb);
  251. UINT iCount= cWeights - c - 1;
  252. if (iCount < 128)
  253. {
  254. pwi->enc.fClass = NDX_LOW_CLASS | pwi->fSymbol;
  255. pwi->enc.abCode[0] = 0x0FF & (iCount << 1);
  256. }
  257. else
  258. {
  259. UINT iExcess= UINT(iCount - 128);
  260. pwi->enc.fClass = NDX_MEDIUM_CLASS | pwi->fSymbol;
  261. pwi->enc.abCode[1] = iExcess & 0x0FF;
  262. pwi->enc.abCode[0] = ((iExcess >> 6) & 0x3FC) | 0x01;
  263. }
  264. pwi->cbImage= cb;
  265. pwi->pbImage= pb;
  266. pb+= cb;
  267. }
  268. PUINT pdwBits= JCode.NextDWord();
  269. for (c= cWeights, pwi= pwiPhrases; c--; pwi++)
  270. {
  271. UINT iCount= cWeights - c - 1;
  272. pwi->fSymbol |= (1 & (pdwBits[iCount >> 5] >> (iCount & 31))) << SYMBOL_SHIFT;
  273. }
  274. if (m_pWeightInfo) { VFree(m_pWeightInfo); m_pWeightInfo= NULL; }
  275. if (m_pbImages ) { VFree(m_pbImages ); m_pbImages = NULL; }
  276. if (m_psht ) { delete m_psht; m_psht = NULL; }
  277. m_pWeightInfo = pwiPhrases; pwiPhrases = NULL;
  278. m_pbImages = pbTokenImages; pbTokenImages = NULL;
  279. m_cWeights = cWeights;
  280. m_cbImageTotal = cbImage;
  281. m_psht= CSegHashTable::NewSegHashTable(sizeof(ENCODE), sizeof(ENCODE));
  282. m_psht->Assimilate(pavr, m_pWeightInfo, NULL, CCompressTable::RecordEncoding);
  283. delete pavr; pavr= NULL;
  284. ASSERT(SPACE_TOKEN_LIMIT <= C_TOKEN_BLOCK);
  285. m_pavr->DiscardRefs();
  286. INT iCount;
  287. for (iCount= 1; iCount < SPACE_TOKEN_LIMIT; ++iCount)
  288. m_pavr->AddValRef(PBYTE(chSpaces), iCount);
  289. m_psht->Assimilate(m_pavr, NULL, FnAddSpaces, FnAddSpaces);
  290. m_pavr->DiscardRefs();
  291. ZeroMemory(chNulls, SPACE_TOKEN_LIMIT);
  292. for (iCount= 1; iCount < SPACE_TOKEN_LIMIT; ++iCount)
  293. m_pavr->AddValRef(PBYTE(chNulls), iCount);
  294. m_psht->Assimilate(m_pavr, NULL, FnAddNulls, FnAddNulls);
  295. m_pavr->DiscardRefs();
  296. ec= 0;
  297. __leave;
  298. }
  299. __finally
  300. {
  301. if (pbTokenImages) VFree(pbTokenImages);
  302. if (pwiPhrases ) VFree(pwiPhrases );
  303. if (pavr ) delete pavr;
  304. }
  305. }
  306. __except(_exception_code() == STATUS_NO_MEMORY? EXCEPTION_EXECUTE_HANDLER : EXCEPTION_CONTINUE_SEARCH)
  307. {
  308. ec= OUT_OF_MEMORY;
  309. }
  310. return ec;
  311. }
  312. void CCompressTable::RecordEncoding(UINT iValue, PVOID pvTag, PVOID pv)
  313. {
  314. *PENCODE(pvTag)= PWeightInfo(pv)[iValue].enc;
  315. }
  316. void CCompressTable::FnCompMergeToken(UINT iValue, PVOID pvTag, PVOID pv)
  317. {
  318. PENCODE penc= PENCODE(pvTag);
  319. PENCODE paenc= PENCODE(pv );
  320. paenc[iValue] = *penc;
  321. }
  322. void CCompressTable::FnCompAddToken(UINT iValue, PVOID pvTag, PVOID pv)
  323. {
  324. ENCODE enc;
  325. enc.fClass = LITERAL_CLASS;
  326. *PENCODE(pvTag)= enc;
  327. PENCODE paenc= PENCODE(pv);
  328. paenc[iValue]= enc;
  329. }
  330. INT CCompressTable::CompressString(PBYTE pbText, INT cbOrig, PBYTE *ppCompressed, UINT iCharset)
  331. {
  332. // This routine constructs an encoded representation of the text denoted by pbText, cbOrig, and iCharset.
  333. // The explicit result will be the length in bytes of the encoded form. If the encoded form is larger
  334. // than the original text, we malloc a suitable buffer, copy the output to that buffer, and return its
  335. // address in *ppCompressed. Otherwise we overwrite the pbText memory area with the compressed form.
  336. //
  337. // When the encoded length is > cbOrig, the calling code must free(*ppCompressed).
  338. ERRORCODE ec= 0;
  339. PBYTE pbCompressed = NULL;
  340. __try
  341. {
  342. __try
  343. {
  344. if (!m_pWeightInfo)
  345. {
  346. ec=ConstructPhraseEncoding();
  347. if (ec) __leave;
  348. }
  349. PWCHAR pwBase = PWCHAR(m_vb.Base);
  350. PCHAR pbOut = PCHAR(pwBase + cbOrig);
  351. PCHAR pbNext = pbOut;
  352. // Note: We use the m_vb area for two purposes --
  353. //
  354. // 1. As buffer for unicode characters.
  355. // 2. As a result area to store the "compressed" text.
  356. //
  357. // For the second case we assume that in all cases the "compressed"
  358. // text will never be larger than 2*cbOrig.
  359. CP cp = GetCPFromCharset(iCharset);
  360. PSTR pbScan= PSTR(pbText);
  361. INT cbText= cbOrig;
  362. const UINT cwTokenBlock= 1024;
  363. PSTR apTokenStart[C_TOKEN_BLOCK];
  364. PSTR apTokenEnd [C_TOKEN_BLOCK];
  365. ENCODE aenc [C_TOKEN_BLOCK];
  366. for (m_pavr->DiscardRefs() ; cbText; m_pavr->DiscardRefs())
  367. {
  368. UINT cTokens= WordBreakA(cp, (PSTR*)&pbScan, &cbText, apTokenStart, apTokenEnd, NULL, NULL, C_TOKEN_BLOCK, TOKENIZE_SPACES);
  369. PSTR *ppTokenStart = apTokenStart,
  370. *ppTokenEnd = apTokenEnd;
  371. UINT c= cTokens;
  372. for (; c-- ; )
  373. {
  374. PSTR pTokenStart = *ppTokenStart++;
  375. PSTR pTokenEnd = *ppTokenEnd++;
  376. ASSERT(pTokenEnd - pTokenStart <= UINT(~USHORT(0)));
  377. m_pavr->AddValRef(pTokenStart, USHORT(pTokenEnd - pTokenStart));
  378. }
  379. m_psht->Assimilate(m_pavr, aenc, CCompressTable::FnCompMergeToken, CCompressTable::FnCompAddToken);
  380. UINT i;
  381. BOOL bPrevTokenSymbol = FALSE;
  382. BOOL bNextTokenSymbol = FALSE;
  383. BOOL bCode;
  384. for (i= 0; i < cTokens; ++i)
  385. {
  386. bNextTokenSymbol= (cTokens > i+1)? aenc[i + 1].fClass & SYMBOL_TOKEN
  387. : FALSE;
  388. switch(aenc[i].fClass & CLASS_MASK)
  389. {
  390. default:
  391. ASSERT(FALSE);
  392. break;
  393. case NULL_CLASS:
  394. case NDX_LOW_CLASS:
  395. *pbNext++ = aenc[i].abCode[0];
  396. break;
  397. case SPACES_CLASS:
  398. bCode = aenc[i].abCode[0];
  399. if (!( (bCode == SINGLE_SPACE_CODE)
  400. && bPrevTokenSymbol
  401. && bNextTokenSymbol
  402. )
  403. ) *pbNext++ = bCode;
  404. else ASSERT(bCode == SINGLE_SPACE_CODE);
  405. break;
  406. case NDX_MEDIUM_CLASS:
  407. *pbNext++ = aenc[i].abCode[0];
  408. *pbNext++ = aenc[i].abCode[1];
  409. break;
  410. case LITERAL_CLASS:
  411. {
  412. const BYTE *pb;
  413. USHORT cbValue;
  414. BYTE bCode;
  415. m_pavr->GetValRef(i, &pb, &cbValue);
  416. ASSERT(cbValue);
  417. while (cbValue > 32)
  418. {
  419. *pbNext++ = BYTE(UINT(0xfb));
  420. CopyMemory(pbNext, pb, 32);
  421. pbNext += 32;
  422. pb += 32;
  423. cbValue -= 32;
  424. }
  425. bCode = (BYTE) (0x000000ff & (cbValue - 1));
  426. bCode <<= 3;
  427. bCode |= 0x03;
  428. *pbNext++ = bCode;
  429. CopyMemory(pbNext, pb, cbValue);
  430. pbNext += cbValue;
  431. }
  432. }
  433. bPrevTokenSymbol = aenc[i].fClass & SYMBOL_TOKEN;
  434. }
  435. }
  436. INT cbCompressed= pbNext - pbOut;
  437. ASSERT(cbCompressed > 0);
  438. if (cbOrig > cbCompressed)
  439. {
  440. #ifdef _DEBUG
  441. PBYTE pbDecomp= NULL;
  442. if (cbOrig <= 4096) pbDecomp= (PBYTE) _alloca(cbOrig);
  443. else pbDecomp= New BYTE[cbOrig];
  444. INT cbExp= DeCompressString(PBYTE(pbOut), pbDecomp, cbCompressed);
  445. ASSERT(cbExp == cbOrig);
  446. PBYTE pbOrig = pbText;
  447. PBYTE pbResult = pbDecomp;
  448. for (int c= cbOrig; c--; ++pbOrig, ++pbResult)
  449. ASSERT(*pbOrig == *pbResult);
  450. if (cbOrig > 4096) delete [] pbDecomp;
  451. #endif _DEBUG
  452. CopyMemory(pbText, pbOut, cbCompressed);
  453. }
  454. else
  455. if (ppCompressed)
  456. {
  457. pbCompressed= (PBYTE) malloc(cbCompressed); // BugBug: Change this to LocalAlloc!
  458. // Coordinate change w/ Compiler
  459. if (!pbCompressed) RaiseException(STATUS_NO_MEMORY, EXCEPTION_NONCONTINUABLE, 0, NULL);
  460. CopyMemory(pbCompressed, pbOut, cbCompressed);
  461. *ppCompressed= pbCompressed;
  462. pbCompressed= NULL;
  463. }
  464. ec= cbCompressed;
  465. __leave;
  466. }
  467. __finally
  468. {
  469. if (pbCompressed) free(pbCompressed);
  470. }
  471. }
  472. __except(_exception_code() == STATUS_NO_MEMORY? EXCEPTION_EXECUTE_HANDLER : EXCEPTION_CONTINUE_SEARCH)
  473. {
  474. ec= OUT_OF_MEMORY;
  475. }
  476. return ec;
  477. }
  478. INT CCompressTable::DeCompressString(PBYTE pbComp, PBYTE pbDecomp, int cbComp)
  479. {
  480. ERRORCODE ec= 0;
  481. __try
  482. {
  483. __try
  484. {
  485. if (!m_pWeightInfo)
  486. {
  487. ec=ConstructPhraseEncoding();
  488. if (ec) __leave;
  489. }
  490. PBYTE pbLimit = pbComp + cbComp;
  491. PBYTE pbStartDecomp = pbDecomp;
  492. BYTE bCode;
  493. BOOL bPrevTokenSymbol = FALSE;
  494. BOOL bNextTokenSymbol = FALSE;
  495. int iIndex;
  496. int cb;
  497. while(pbComp < pbLimit)
  498. {
  499. bCode = *pbComp++;
  500. switch( acOneBits[0x0f & bCode])
  501. {
  502. case NDX_LOW_CLASS:
  503. bCode >>= 1;
  504. iIndex = (int) bCode;
  505. ASSERT(iIndex > -1);
  506. bNextTokenSymbol = m_pWeightInfo[iIndex].fSymbol;
  507. if (bNextTokenSymbol && bPrevTokenSymbol) *pbDecomp++ = ' ';
  508. CopyMemory( pbDecomp, m_pWeightInfo[iIndex].pbImage, m_pWeightInfo[iIndex].cbImage);
  509. pbDecomp += m_pWeightInfo[iIndex].cbImage;
  510. break;
  511. case NDX_MEDIUM_CLASS:
  512. bCode >>= 2;
  513. iIndex= ((((int) bCode) << 8) | *pbComp++) + 128;
  514. ASSERT(iIndex > -1);
  515. bNextTokenSymbol = m_pWeightInfo[iIndex].fSymbol;
  516. if (bNextTokenSymbol && bPrevTokenSymbol) *pbDecomp++ = ' ';
  517. CopyMemory( pbDecomp, m_pWeightInfo[iIndex].pbImage, m_pWeightInfo[iIndex].cbImage);
  518. pbDecomp += m_pWeightInfo[iIndex].cbImage;
  519. break;
  520. case LITERAL_CLASS:
  521. bNextTokenSymbol = FALSE;
  522. bCode >>= 3;
  523. cb = (int) bCode + 1;
  524. CopyMemory( pbDecomp, pbComp, cb);
  525. pbDecomp += cb;
  526. pbComp += cb;
  527. break;
  528. case SPACES_CLASS:
  529. bNextTokenSymbol = FALSE;
  530. bCode >>= 4;
  531. cb = (int) bCode + 1;
  532. ASSERT(cb > 0);
  533. while (cb--) *pbDecomp++ = ' ';
  534. break;
  535. case NULL_CLASS:
  536. bNextTokenSymbol = FALSE;
  537. bCode >>= 4;
  538. cb = (int) bCode + 1;
  539. ASSERT(cb > 0);
  540. while (cb--) *pbDecomp++ = 0x00;
  541. break;
  542. }
  543. bPrevTokenSymbol = bNextTokenSymbol;
  544. }
  545. ec= pbDecomp - pbStartDecomp;
  546. __leave;
  547. }
  548. __finally
  549. {
  550. }
  551. }
  552. __except(_exception_code() == STATUS_NO_MEMORY? EXCEPTION_EXECUTE_HANDLER : EXCEPTION_CONTINUE_SEARCH)
  553. {
  554. ec= OUT_OF_MEMORY;
  555. }
  556. return ec;
  557. }
  558. typedef struct _WeightConstructionState
  559. {
  560. PWeightInfo pwi;
  561. } WeightConstructionState,
  562. *PWeightConstructionState;
  563. void CCompressTable::BuildWeightInfo(const BYTE *pbValue, UINT cbValue, void *pvTag, PVOID pvEnvironment)
  564. {
  565. #define pwcs PWeightConstructionState(pvEnvironment)
  566. ASSERT(cbValue);
  567. if (!*pbValue && cbValue < SPACE_TOKEN_LIMIT)
  568. {
  569. BOOL fAllNulls= TRUE;
  570. const BYTE *pb= pbValue;
  571. UINT cb= cbValue;
  572. for (; --cb; ) if (*++pb) fAllNulls= FALSE;
  573. if (fAllNulls) return;
  574. }
  575. if (' ' == *pbValue && cbValue < SPACE_TOKEN_LIMIT)
  576. {
  577. BOOL fAllSpaces= TRUE;
  578. const BYTE *pb= pbValue;
  579. UINT cb= cbValue;
  580. for (; --cb; ) if (' ' != *++pb) fAllSpaces= FALSE;
  581. if (fAllSpaces) return;
  582. }
  583. INT cRefs = *PINT(pvTag);
  584. BOOL fSymbol = FALSE;
  585. ASSERT(cRefs);
  586. ASSERT(sizeof(ENCODE) == sizeof(INT));
  587. PENCODE(pvTag)->fClass = LITERAL_CLASS;
  588. if (cRefs > 0) fSymbol= SYMBOL_TOKEN;
  589. else cRefs= - cRefs;
  590. if (cRefs == 1) return;
  591. PWeightInfo pwi = pwcs->pwi++;
  592. pwi->pbImage = PBYTE(pbValue);
  593. pwi->cbImage = cbValue;
  594. pwi->uiWeight = cRefs * cbValue;
  595. pwi->fSymbol = fSymbol;
  596. #undef pwcs
  597. }
  598. extern "C" int _cdecl WeightCompare(const void *pv1, const void *pv2)
  599. {
  600. PWeightInfo pw1 = *((PWeightInfo *) pv1);
  601. PWeightInfo pw2 = *((PWeightInfo *) pv2);
  602. return( pw2->uiWeight - pw1->uiWeight);
  603. }
  604. extern "C" int _cdecl WeightCompare2(const void *pv1, const void *pv2)
  605. {
  606. PWeightInfo pw1 = *((PWeightInfo *) pv1);
  607. PWeightInfo pw2 = *((PWeightInfo *) pv2);
  608. int cb = (pw1->cbImage < pw2->cbImage) ? pw1->cbImage : pw2->cbImage;
  609. int iResult= _strnicmp((const char *) pw1->pbImage, (const char *) pw2->pbImage, cb);
  610. if (iResult) return iResult;
  611. else return pw1->cbImage - pw2->cbImage;
  612. }
  613. ERRORCODE CCompressTable::ConstructPhraseEncoding()
  614. {
  615. ERRORCODE ec= 0;
  616. PWeightInfo pwiBase = NULL;
  617. PWeightInfo *papwi = NULL;
  618. PWeightInfo pWeightInfo = NULL;
  619. PBYTE pbImages = NULL;
  620. CAValRef *pavr = NULL;
  621. __try
  622. {
  623. UINT cItems= m_psht->EntryCount();
  624. if (!cItems) { ec= NO_TEXT_SCANNED; __leave; }
  625. pwiBase= PWeightInfo(VAlloc(FALSE, cItems * sizeof(WeightInfo)));
  626. // Now we'll preload the hash table with encoding for streams of
  627. // spaces and nulls.
  628. ASSERT(SPACE_TOKEN_LIMIT <= C_TOKEN_BLOCK);
  629. INT iCount;
  630. for (iCount= 1; iCount < SPACE_TOKEN_LIMIT; ++iCount)
  631. m_pavr->AddValRef(PBYTE(chSpaces), iCount);
  632. m_psht->Assimilate(m_pavr, NULL, FnAddSpaces, FnAddSpaces);
  633. m_pavr->DiscardRefs();
  634. ZeroMemory(chNulls, SPACE_TOKEN_LIMIT);
  635. for (iCount= 1; iCount < SPACE_TOKEN_LIMIT; ++iCount)
  636. m_pavr->AddValRef(PBYTE(chNulls), iCount);
  637. m_psht->Assimilate(m_pavr, NULL, FnAddNulls, FnAddNulls);
  638. m_pavr->DiscardRefs();
  639. // Note! We must never add an item to the hash table after this
  640. // point. This is because the code below stores the addresses
  641. // of the hash table value strings. If we add items, those
  642. // strings may move around.
  643. WeightConstructionState wcs;
  644. wcs.pwi= pwiBase;
  645. m_psht->DumpAll(&wcs, CCompressTable::BuildWeightInfo);
  646. UINT cWeights= wcs.pwi - pwiBase;
  647. ASSERT(cWeights);
  648. papwi= (PWeightInfo *) VAlloc(FALSE, cWeights * sizeof(PWeightInfo));
  649. UINT c = cWeights;
  650. PWeightInfo *ppwi = papwi,
  651. pwi = pwiBase;
  652. for (; c--; ) *ppwi++ = pwi++;
  653. qsort(papwi, cWeights, sizeof(PWeightInfo), WeightCompare);
  654. INT cCount;
  655. iCount= INT(cWeights);
  656. cCount= (iCount > 128)? 128 : iCount;
  657. qsort(papwi, cCount, sizeof(PWeightInfo), WeightCompare2);
  658. iCount -= 128;
  659. if (iCount > 1)
  660. {
  661. cCount= 16 * 1024;
  662. if (iCount < cCount) cCount= iCount;
  663. qsort(papwi + 128, cCount, sizeof(PWeightInfo), WeightCompare2);
  664. iCount -= cCount;
  665. if (iCount > 1) qsort(papwi + 128 + 16 * 1024, iCount, sizeof(PWeightInfo), WeightCompare2);
  666. }
  667. iCount = 128 + 16 * 1024;
  668. if (iCount < INT(cWeights)) cWeights= iCount;
  669. UINT cbImages = 0;
  670. PWeightInfo *ppwiSrc = papwi;
  671. for (c= cWeights; c--; ) cbImages += (*ppwiSrc++)->cbImage;
  672. pavr= CAValRef::NewValRef(cWeights);
  673. pWeightInfo = PWeightInfo(VAlloc(FALSE, cWeights * sizeof(WeightInfo)));
  674. pbImages = PBYTE (VAlloc(FALSE, cbImages));
  675. ASSERT(!m_pWeightInfo);
  676. ASSERT(!m_pbImages);
  677. m_pWeightInfo = pWeightInfo; pWeightInfo = NULL;
  678. m_pbImages = pbImages; pbImages = NULL;
  679. m_cWeights = cWeights;
  680. m_cbImageTotal = cbImages;
  681. PWeightInfo pwiDest = m_pWeightInfo;
  682. PBYTE pbDest = m_pbImages;
  683. for (ppwiSrc= papwi, c= cWeights; c--; )
  684. {
  685. PWeightInfo pwiSrc= *ppwiSrc++;
  686. CopyMemory(pbDest, pwiSrc->pbImage, pwiSrc->cbImage);
  687. pwiSrc->pbImage= pbDest;
  688. pbDest += pwiSrc->cbImage;
  689. *pwiDest++ = *pwiSrc;
  690. }
  691. // Now we've changed all the pbImage pointers in the weight info array
  692. // to point into the m_pbImages. We no longer need to keep the hash
  693. // table value addresses constant.
  694. for (pwi = m_pWeightInfo, iCount= 0; iCount < INT(cWeights); ++iCount, ++pwi)
  695. {
  696. pavr->AddValRef(pwi->pbImage, pwi->cbImage);
  697. if (iCount < 128)
  698. {
  699. pwi->enc.fClass = NDX_LOW_CLASS | pwi->fSymbol;
  700. pwi->enc.abCode[0] = 0x0FF & (iCount << 1);
  701. continue;
  702. }
  703. UINT iExcess= UINT(iCount - 128);
  704. pwi->enc.fClass = NDX_MEDIUM_CLASS | pwi->fSymbol;
  705. pwi->enc.abCode[1] = iExcess & 0x0FF;
  706. pwi->enc.abCode[0] = ((iExcess >> 6) & 0x3FC) | 0x01;
  707. }
  708. m_psht->Assimilate(pavr, m_pWeightInfo, NULL, FnAddTokens);
  709. ec= 0;
  710. __leave;
  711. }
  712. __finally
  713. {
  714. if (pwiBase ) VFree(pwiBase );
  715. if (papwi ) VFree(papwi );
  716. if (pWeightInfo) VFree(pWeightInfo);
  717. if (pbImages ) VFree(pbImages );
  718. if (pavr) delete pavr;
  719. }
  720. return ec;
  721. }
  722. void CCompressTable::FnAddSpaces(UINT iValue, PVOID pvTag, PVOID pv)
  723. {
  724. ENCODE enc;
  725. ASSERT(iValue < 16);
  726. enc.fClass = SPACES_CLASS;
  727. enc.abCode[0] = ((iValue & 0x0000000f) << 4) | 0x07;
  728. *PENCODE(pvTag) = enc;
  729. }
  730. void CCompressTable::FnAddNulls(UINT iValue, PVOID pvTag, PVOID pv)
  731. {
  732. ENCODE enc;
  733. ASSERT(iValue < 16);
  734. enc.fClass = NULL_CLASS;
  735. enc.abCode[0] = ((iValue & 0x0000000f) << 4) | 0x0f;
  736. *PENCODE(pvTag) = enc;
  737. }
  738. void CCompressTable::FnAddTokens(UINT iValue, PVOID pvTag, PVOID pv)
  739. {
  740. *PENCODE(pvTag)= PWeightInfo(pv)[iValue].enc;
  741. }