#include "stdafx.h"
#pragma hdrstop

#include "ctable.h"
#include "FTSIFace.h"
#include "Memex.h"
#include "FtsLex.h"
#include "Bytemaps.h"

// NOTE(review): the header name of the last #include was missing from this
// copy of the file.  The C runtime headers below declare the CRT routines
// this file calls (malloc/free, qsort, _rotr, _strnicmp, _alloca); confirm
// against the original source.
#include <stdlib.h>
#include <string.h>
#include <malloc.h>

extern char chSpaces[];
extern char chNulls [];
extern char gchNull [];

// acOneBits[n] is the number of consecutive one bits at the low end of the
// nibble n (0 -> 0, 1 -> 1, 3 -> 2, 7 -> 3, 15 -> 4).  DeCompressString
// switches on this value to find the class of a code byte.
BYTE acOneBits[16] = { 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4 };

// Scratch buffers used to build references to runs of blanks and runs of
// NUL bytes.  Their contents are (re)filled by QueueRunRefs before each use,
// so the initializers below are not relied upon.
char chSpaces[SPACE_TOKEN_LIMIT + 1] = " ";
char chNulls [SPACE_TOKEN_LIMIT + 1] = " ";
char gchNull [1] = {(const char) 0x00};

// Fills the first SPACE_TOKEN_LIMIT bytes of pchRun with bFill and appends
// to pavr one reference for every run length in [1, SPACE_TOKEN_LIMIT).
// The caller discards stale references beforehand and assimilates afterwards.
static void QueueRunRefs(CAValRef *pavr, char *pchRun, BYTE bFill)
{
    FillMemory(pchRun, SPACE_TOKEN_LIMIT, bFill);

    INT cch;

    for (cch = 1; cch < SPACE_TOKEN_LIMIT; ++cch)
        pavr->AddValRef(PBYTE(pchRun), cch);
}

// Puts the object into an empty state.  The hash table, reference array and
// virtual buffer are created later by NewCompressTable.
CCompressTable::CCompressTable(UINT iCharsetDefault)
{
    m_psht            = NULL;
    m_pavr            = NULL;
    m_iCharSetDefault = iCharsetDefault;
    m_pWeightInfo     = NULL;
    m_pbImages        = NULL;

    // The destructor tests m_vb.Base, so it must be well defined even when
    // CreateVirtualBuffer never ran or failed.
    m_vb.Base         = NULL;
    m_cWeights        = 0;
    m_cbImageTotal    = 0;
}

// Releases everything the table owns.
CCompressTable::~CCompressTable()
{
    if (m_psht) delete m_psht;
    if (m_pavr) delete m_pavr;

    if (m_vb.Base) FreeVirtualBuffer(&m_vb);

    if (m_pWeightInfo) VFree(m_pWeightInfo);
    if (m_pbImages   ) VFree(m_pbImages   );
}

// Factory: creates a table together with its virtual buffer, hash table and
// reference array.  Returns NULL when a STATUS_NO_MEMORY exception is raised
// during construction; other exceptions continue to propagate.
CCompressTable *CCompressTable::NewCompressTable(UINT iCharsetDefault)
{
    CCompressTable *pct = NULL;

    __try
    {
        __try
        {
            pct = New CCompressTable(iCharsetDefault);

            CreateVirtualBuffer(&(pct->m_vb), CB_BUFFER_COMMIT, CB_BUFFER_RESERVATION);

            pct->m_psht = CSegHashTable::NewSegHashTable(sizeof(ULONG), sizeof(ULONG));
            pct->m_pavr = CAValRef::NewValRef(C_TOKEN_BLOCK);
        }
        __finally
        {
            // Tear down a partially built object on any abnormal exit.
            if (_abnormal_termination() && pct)
            {
                delete pct;
                pct = NULL;
            }
        }
    }
    __except(_exception_code() == STATUS_NO_MEMORY ? EXCEPTION_EXECUTE_HANDLER
                                                  : EXCEPTION_CONTINUE_SEARCH)
    {
        pct = NULL;
    }

    return pct;
}

// Tokenizes cbText bytes at pbText and accumulates per-token reference
// counts in the hash table (see IncrementCounter / InitialCounter).
// Returns ALREADY_WEIGHED once a phrase encoding exists, otherwise 0.
INT CCompressTable::ScanString(PBYTE pbText, INT cbText, INT iCharSet)
{
    CP cp = GetCPFromCharset(iCharSet);

    if (m_pWeightInfo) return ALREADY_WEIGHED;

    PSTR apTokenStart[C_TOKEN_BLOCK];
    PSTR apTokenEnd  [C_TOKEN_BLOCK];
    BYTE abType      [C_TOKEN_BLOCK];

    for (m_pavr->DiscardRefs(); cbText; m_pavr->DiscardRefs())
    {
        UINT cTokens = WordBreakA(cp, (PSTR *) &pbText, &cbText, apTokenStart, apTokenEnd,
                                  PBYTE(abType), NULL, C_TOKEN_BLOCK, TOKENIZE_SPACES);

        PSTR *ppTokenStart = apTokenStart;
        PSTR *ppTokenEnd   = apTokenEnd;

        for (; cTokens--; ++ppTokenStart, ++ppTokenEnd)
        {
            // The length is stored as a USHORT below, so it must fit in 16 bits.
            ASSERT(*ppTokenEnd - *ppTokenStart <= 0xFFFF);

            m_pavr->AddValRef(*ppTokenStart, USHORT(*ppTokenEnd - *ppTokenStart));
        }

        m_psht->Assimilate(m_pavr, abType, CCompressTable::IncrementCounter,
                                           CCompressTable::InitialCounter);
    }

    return 0;
}

// Assimilate callback for a token that is already in the table.
// pvEnvironment is the per-token type array filled by WordBreakA.
void CCompressTable::IncrementCounter(UINT iValue, PVOID pvTag, PVOID pvEnvironment)
{
    PUINT pui = PUINT(pvTag);
    PBYTE pb  = PBYTE(pvEnvironment);

    ASSERT(pb[iValue]? ((INT) *pui) > 0 : ((INT) *pui) < 0);

    if (pb[iValue]) (*pui)++;   // Positive counts mark symbols
    else            (*pui)--;   // Negative counts mark non-symbols
}

// Assimilate callback for a token seen for the first time: the count starts
// at +1 for symbols and at -1 for non-symbols.
void CCompressTable::InitialCounter(UINT iValue, PVOID pvTag, PVOID pvEnvironment)
{
    PUINT pui = PUINT(pvTag);
    PBYTE pb  = PBYTE(pvEnvironment);

    ASSERT(!*pui);

    *pui = pb[iValue]? 1 : UINT(-1);
}

// Appends the low iBits bits of dwCode to a bit stream made of 32-bit words.
// New bits enter at the top of the current word and earlier bits shift down,
// so a completed word holds its first-written bit in bit 0.
//
//   pNextCode  word currently being filled
//   pBitsLeft  in/out: number of bits still free in *pNextCode (1..32)
//
// Returns the word to continue writing into.  When a word fills up, the next
// word is cleared, so the caller's buffer must extend one word past the data.
PUINT writebits(PUINT pNextCode, PINT pBitsLeft, int iBits, DWORD dwCode)
{
    if (iBits > *pBitsLeft)
    {
        // Split the write: finish the current word, then continue in the next.
        int   cbitsFirst = *pBitsLeft;
        PUINT pNextTemp  = writebits(pNextCode, pBitsLeft, cbitsFirst, dwCode);

        return writebits(pNextTemp, pBitsLeft, iBits - cbitsFirst, dwCode >> cbitsFirst);
    }

    if (iBits <= 0) return pNextCode;   // Nothing to write

    if (iBits >= 32)
        *pNextCode = dwCode;            // Whole-word write; avoids a shift by 32
    else
        *pNextCode = (*pNextCode >> iBits) | (dwCode << (32 - iBits));

    *pBitsLeft -= iBits;

    if (*pBitsLeft != 0) return pNextCode;

    *pBitsLeft = 32;
    ++pNextCode;
    *pNextCode = 0;

    return pNextCode;
}

// Produces a portable copy of the phrase table.
//
//   *pcPhrases           number of phrases
//   *ppbImage, *pcbImage malloc'ed copy of the concatenated phrase images
//   *ppbIndex, *pcbIndex malloc'ed index: a JINDEXHDR, the bit-packed phrase
//                        lengths, then one symbol bit per phrase
//
// The caller owns both buffers and releases them with free().  Returns 0 on
// success, EMPTY_PHRASE_TABLE when there are no phrases, OUT_OF_MEMORY on
// allocation failure, or the error from ConstructPhraseEncoding.
ERRORCODE CCompressTable::GetPhraseTable(PUINT pcPhrases, PBYTE *ppbImage, PUINT pcbImage,
                                         PBYTE *ppbIndex, PUINT pcbIndex)
{
    ERRORCODE ec         = 0;
    PBYTE     pbImages   = NULL;
    PUINT     pIndexCode = NULL;

    __try
    {
        __try
        {
            if (!m_pWeightInfo)
            {
                ec = ConstructPhraseEncoding();

                if (ec) __leave;
            }

            // Checked on every call: the length statistics below divide by
            // the phrase count.
            ASSERT(m_cWeights);

            if (!m_cWeights)
            {
                ec = EMPTY_PHRASE_TABLE;
                __leave;
            }

            *pcPhrases = m_cWeights;
            *pcbImage  = m_cbImageTotal;

            pbImages = PBYTE(malloc(m_cbImageTotal));  // BugBug: Change this to LocalAlloc!
                                                       //         Must change compiler at the same time.

            if (!pbImages) RaiseException(STATUS_NO_MEMORY, EXCEPTION_NONCONTINUABLE, 0, NULL);

            CopyMemory(pbImages, m_pbImages, m_cbImageTotal);

            // Each length (minus one) is stored as cbitsBasis low bits plus a
            // unary count of the remaining high part.  The basis is derived
            // from the average phrase length.
            UINT cbitsBasis    = CBitsToRepresent((m_cbImageTotal - m_cWeights) / m_cWeights);
            UINT basis         = 1 << cbitsBasis;
            UINT cbitsEstimate = m_cWeights * (1 + cbitsBasis)
                               + (m_cbImageTotal + basis - m_cWeights - 1) / basis;
            UINT cdwEstimate   = (cbitsEstimate + 31) >> 5;
            UINT cdwBits       = (m_cWeights    + 31) >> 5;
            UINT cbTotal       = sizeof(UINT) * (cdwBits + cdwEstimate) + sizeof(JINDEXHDR);

            pIndexCode = PUINT(malloc(cbTotal));       // BugBug: Change this to LocalAlloc!
                                                       //         Must change compiler at the same time.

            if (!pIndexCode) RaiseException(STATUS_NO_MEMORY, EXCEPTION_NONCONTINUABLE, 0, NULL);

            ASSERT(cbitsBasis < 6);

            JINDEXHDR jIHdr;

            jIHdr.Magic  = 'J';
            jIHdr.cCount = m_cWeights;
            jIHdr.cBits  = cbitsBasis;

            UINT fBasisMask = basis - 1;

            CopyMemory(pIndexCode, (const void *) &jIHdr, sizeof(JINDEXHDR));

            ASSERT(sizeof(JINDEXHDR) == sizeof(INT));

            PUINT pNextCode = pIndexCode + 1;
            UINT  cbitsLeft = 32;
            UINT  ii;

            for (ii = 0; ii < m_cWeights; ii++)
            {
                ASSERT(m_pWeightInfo[ii].cbImage);

                UINT cb      = m_pWeightInfo[ii].cbImage - 1;
                UINT dwRight = cb & fBasisMask;

                cb = (cb & ~fBasisMask) >> cbitsBasis;

                UINT cBits = 1 + cbitsBasis + cb;

                ASSERT(cBits < 33);

                // cb one bits, a terminating zero bit, then the basis bits.
                UINT dwLeft = cb ? (((DWORD)(~0)) >> (32 - cb)) : 0;

                pNextCode = writebits(pNextCode, PINT(&cbitsLeft), cBits,
                                      (dwRight << (1 + cb)) | dwLeft);
            }

            // Pad the last code word so the symbol bits start on a word boundary.
            if (cbitsLeft < 32)
                pNextCode = writebits(pNextCode, PINT(&cbitsLeft), cbitsLeft, 0);

            ASSERT(pNextCode - (pIndexCode + 1) <= INT(cdwEstimate));

            // One symbol bit per phrase; phrase i lands in bit (i & 31) of
            // word (i >> 5).
            PWeightInfo pwi = m_pWeightInfo;
            UINT        dw  = 0;
            UINT        c   = m_cWeights;

            for (cbitsLeft = 32; c--; )
            {
                UINT fSymbol = 1 & (((pwi++)->fSymbol) >> SYMBOL_SHIFT);

                dw = _rotr(dw | fSymbol, 1);

                if (!--cbitsLeft)
                {
                    *pNextCode++ = dw;
                    dw           = 0;
                    cbitsLeft    = 32;
                }
            }

            if (cbitsLeft < 32) *pNextCode++ = _rotr(dw, cbitsLeft);

            ASSERT(cbTotal >= (pNextCode - pIndexCode) * sizeof(UINT));

            *pcbIndex = (pNextCode - pIndexCode) * sizeof(UINT);

            // Ownership passes to the caller.
            *ppbImage = pbImages;           pbImages   = NULL;
            *ppbIndex = PBYTE(pIndexCode);  pIndexCode = NULL;

            ec = 0;
            __leave;
        }
        __finally
        {
            if (pbImages  ) free(pbImages  );
            if (pIndexCode) free(pIndexCode);
        }
    }
    __except(_exception_code() == STATUS_NO_MEMORY ? EXCEPTION_EXECUTE_HANDLER
                                                  : EXCEPTION_CONTINUE_SEARCH)
    {
        ec = OUT_OF_MEMORY;
    }

    return ec;
}

// Replaces the table contents with a phrase table in the format produced by
// GetPhraseTable.
//
//   pbImage, cbImage  concatenated phrase images
//   pbIndex, cbIndex  index; must be UINT aligned and a multiple of
//                     sizeof(UINT) in size
//
// Returns 0 on success, ALIGNMENT_ERROR, INVALID_PHRASE_TABLE or OUT_OF_MEMORY.
ERRORCODE CCompressTable::SetPhraseTable(PBYTE pbImage, UINT cbImage, PBYTE pbIndex, UINT cbIndex)
{
    ERRORCODE   ec            = OUT_OF_MEMORY;
    PBYTE       pbTokenImages = NULL;
    PWeightInfo pwiPhrases    = NULL;
    CAValRef   *pavr          = NULL;

    __try
    {
        __try
        {
            // size_t rather than UINT so the pointer is not truncated where
            // pointers are wider than 32 bits.
            if (size_t(pbIndex) & (sizeof(UINT) - 1))
            {
                ec = ALIGNMENT_ERROR;
                __leave;
            }

            if (cbIndex & (sizeof(UINT) - 1))
            {
                ec = ALIGNMENT_ERROR;
                __leave;
            }

            // The header is read below, so it must actually be present.
            if (!pbIndex || cbIndex < sizeof(JINDEXHDR))
            {
                ec = INVALID_PHRASE_TABLE;
                __leave;
            }

            JINDEXHDR jIHdr = *(JINDEXHDR *) pbIndex;

            if (jIHdr.Magic != 'J' || jIHdr.cCount == 0 || jIHdr.cCount > 128 + 16 * 1024)
            {
                ec = INVALID_PHRASE_TABLE;
                __leave;
            }

            pbTokenImages = PBYTE(VAlloc(FALSE, cbImage));

            UINT cWeights = jIHdr.cCount;

            pwiPhrases = PWeightInfo(VAlloc(FALSE, cWeights * sizeof(WeightInfo)));
            pavr       = CAValRef::NewValRef(cWeights);

            CopyMemory(pbTokenImages, pbImage, cbImage);

            ASSERT(sizeof(JINDEXHDR) == sizeof(UINT));

            CJCode JCode(jIHdr.cBits, cWeights, PVOID(pbIndex + sizeof(JINDEXHDR)));

            // Pass 1: decode the phrase lengths and carve up the image buffer.
            PBYTE       pb     = pbTokenImages;
            PWeightInfo pwi    = pwiPhrases;
            UINT        cbUsed = 0;
            UINT        c;

            for (c = cWeights; c--; ++pwi)
            {
                UINT cb = JCode.GetNextDelta();

                // A length that runs past the image data means a corrupt index.
                if (cb > cbImage - cbUsed)
                {
                    ec = INVALID_PHRASE_TABLE;
                    __leave;
                }

                pavr->AddValRef(pb, cb);

                pwi->cbImage  = cb;
                pwi->pbImage  = pb;
                pwi->uiWeight = 0;

                pb     += cb;
                cbUsed += cb;
            }

            // Pass 2: the symbol bit vector follows the length codes.  The
            // symbol flag has to be known before the encoding class is
            // composed, because fClass carries that flag.
            PUINT pdwBits = JCode.NextDWord();
            UINT  cdwBits = (cWeights + 31) >> 5;

            if (   PBYTE(pdwBits)           < pbIndex
                || PBYTE(pdwBits + cdwBits) > pbIndex + cbIndex
               )
            {
                ec = INVALID_PHRASE_TABLE;
                __leave;
            }

            UINT iPhrase;

            for (iPhrase = 0, pwi = pwiPhrases; iPhrase < cWeights; ++iPhrase, ++pwi)
            {
                pwi->fSymbol = (1 & (pdwBits[iPhrase >> 5] >> (iPhrase & 31))) << SYMBOL_SHIFT;

                if (iPhrase < 128)
                {
                    // One byte code: index in bits 1..7, bit 0 clear.
                    pwi->enc.fClass    = NDX_LOW_CLASS | pwi->fSymbol;
                    pwi->enc.abCode[0] = 0x0FF & (iPhrase << 1);
                }
                else
                {
                    // Two byte code: low bits 01, the high six index bits in
                    // the first byte, the low eight in the second.
                    UINT iExcess = iPhrase - 128;

                    pwi->enc.fClass    = NDX_MEDIUM_CLASS | pwi->fSymbol;
                    pwi->enc.abCode[1] = iExcess & 0x0FF;
                    pwi->enc.abCode[0] = ((iExcess >> 6) & 0x3FC) | 0x01;
                }
            }

            // Everything decoded; discard the previous contents.
            if (m_pWeightInfo) { VFree(m_pWeightInfo);  m_pWeightInfo = NULL; }
            if (m_pbImages   ) { VFree(m_pbImages   );  m_pbImages    = NULL; }
            if (m_psht       ) { delete m_psht;         m_psht        = NULL; }

            m_pWeightInfo  = pwiPhrases;     pwiPhrases    = NULL;
            m_pbImages     = pbTokenImages;  pbTokenImages = NULL;
            m_cWeights     = cWeights;
            m_cbImageTotal = cbImage;

            m_psht = CSegHashTable::NewSegHashTable(sizeof(ENCODE), sizeof(ENCODE));

            m_psht->Assimilate(pavr, m_pWeightInfo, NULL, CCompressTable::RecordEncoding);

            delete pavr;  pavr = NULL;

            // Preload the encodings for runs of blanks and runs of NULs.
            ASSERT(SPACE_TOKEN_LIMIT <= C_TOKEN_BLOCK);

            m_pavr->DiscardRefs();

            QueueRunRefs(m_pavr, chSpaces, ' ');
            m_psht->Assimilate(m_pavr, NULL, FnAddSpaces, FnAddSpaces);
            m_pavr->DiscardRefs();

            QueueRunRefs(m_pavr, chNulls, 0);
            m_psht->Assimilate(m_pavr, NULL, FnAddNulls, FnAddNulls);
            m_pavr->DiscardRefs();

            ec = 0;
            __leave;
        }
        __finally
        {
            if (pbTokenImages) VFree(pbTokenImages);
            if (pwiPhrases   ) VFree(pwiPhrases   );
            if (pavr         ) delete pavr;
        }
    }
    __except(_exception_code() == STATUS_NO_MEMORY ? EXCEPTION_EXECUTE_HANDLER
                                                  : EXCEPTION_CONTINUE_SEARCH)
    {
        ec = OUT_OF_MEMORY;
    }

    return ec;
}

// Assimilate callback: stores phrase iValue's encoding in the table tag.
void CCompressTable::RecordEncoding(UINT iValue, PVOID pvTag, PVOID pv)
{
    *PENCODE(pvTag) = PWeightInfo(pv)[iValue].enc;
}

// Assimilate callback for a token that is in the table: copies its encoding
// into slot iValue of the caller's ENCODE array.
void CCompressTable::FnCompMergeToken(UINT iValue, PVOID pvTag, PVOID pv)
{
    PENCODE penc  = PENCODE(pvTag);
    PENCODE paenc = PENCODE(pv   );

    paenc[iValue] = *penc;
}

// Assimilate callback for a token that is not in the table: it is marked as
// a literal both in the table and in the caller's ENCODE array.
void CCompressTable::FnCompAddToken(UINT iValue, PVOID pvTag, PVOID pv)
{
    ENCODE enc;

    enc.fClass = LITERAL_CLASS;

    *PENCODE(pvTag) = enc;

    PENCODE paenc = PENCODE(pv);

    paenc[iValue] = enc;
}

INT CCompressTable::CompressString(PBYTE pbText, INT cbOrig, PBYTE *ppCompressed, UINT iCharset)
{
    // This routine constructs an encoded representation of the text denoted
    // by pbText, cbOrig, and iCharset.  The explicit result is the length in
    // bytes of the encoded form, or an error code from
    // ConstructPhraseEncoding, or OUT_OF_MEMORY.
    //
    // If the encoded form is shorter than the original text, it overwrites
    // the pbText memory area.  Otherwise, when ppCompressed is not NULL, we
    // malloc a suitable buffer, copy the output to that buffer, and return
    // its address in *ppCompressed.
    //
    // When the encoded length is >= cbOrig, the calling code must
    // free(*ppCompressed).

    ERRORCODE ec           = 0;
    PBYTE     pbCompressed = NULL;

    __try
    {
        __try
        {
            if (!m_pWeightInfo)
            {
                ec = ConstructPhraseEncoding();

                if (ec) __leave;
            }

            PWCHAR pwBase = PWCHAR(m_vb.Base);
            PCHAR  pbOut  = PCHAR(pwBase + cbOrig);
            PCHAR  pbNext = pbOut;

            // Note: We use the m_vb area for two purposes --
            //
            //   1. As buffer for unicode characters.
            //   2. As a result area to store the "compressed" text.
            //
            // For the second case we assume that in all cases the "compressed"
            // text will never be larger than 2*cbOrig.

            CP   cp     = GetCPFromCharset(iCharset);
            PSTR pbScan = PSTR(pbText);
            INT  cbText = cbOrig;

            PSTR   apTokenStart[C_TOKEN_BLOCK];
            PSTR   apTokenEnd  [C_TOKEN_BLOCK];
            ENCODE aenc        [C_TOKEN_BLOCK];

            for (m_pavr->DiscardRefs(); cbText; m_pavr->DiscardRefs())
            {
                UINT cTokens = WordBreakA(cp, (PSTR *) &pbScan, &cbText, apTokenStart, apTokenEnd,
                                          NULL, NULL, C_TOKEN_BLOCK, TOKENIZE_SPACES);

                PSTR *ppTokenStart = apTokenStart;
                PSTR *ppTokenEnd   = apTokenEnd;
                UINT  c            = cTokens;

                for (; c--; )
                {
                    PSTR pTokenStart = *ppTokenStart++;
                    PSTR pTokenEnd   = *ppTokenEnd++;

                    // The length is stored as a USHORT below.
                    ASSERT(pTokenEnd - pTokenStart <= 0xFFFF);

                    m_pavr->AddValRef(pTokenStart, USHORT(pTokenEnd - pTokenStart));
                }

                // Fills aenc[i] with the encoding of token i.
                m_psht->Assimilate(m_pavr, aenc, CCompressTable::FnCompMergeToken,
                                                 CCompressTable::FnCompAddToken);

                UINT i;
                BOOL bPrevTokenSymbol = FALSE;
                BOOL bNextTokenSymbol = FALSE;
                BYTE bCode;

                for (i = 0; i < cTokens; ++i)
                {
                    bNextTokenSymbol = (cTokens > i + 1)? aenc[i + 1].fClass & SYMBOL_TOKEN
                                                        : FALSE;

                    switch (aenc[i].fClass & CLASS_MASK)
                    {
                    default:

                        ASSERT(FALSE);

                        break;

                    case NULL_CLASS:
                    case NDX_LOW_CLASS:

                        *pbNext++ = aenc[i].abCode[0];

                        break;

                    case SPACES_CLASS:

                        bCode = aenc[i].abCode[0];

                        // A single blank between two symbols is implied and
                        // is not emitted; DeCompressString reinserts it.
                        if (   bCode != SINGLE_SPACE_CODE
                            || !bPrevTokenSymbol
                            || !bNextTokenSymbol
                           )
                            *pbNext++ = bCode;

                        break;

                    case NDX_MEDIUM_CLASS:

                        *pbNext++ = aenc[i].abCode[0];
                        *pbNext++ = aenc[i].abCode[1];

                        break;

                    case LITERAL_CLASS:
                        {
                            const BYTE *pb;
                            USHORT      cbValue;
                            BYTE        bLiteral;

                            m_pavr->GetValRef(i, &pb, &cbValue);

                            ASSERT(cbValue);

                            // A literal code byte carries (length - 1) in its
                            // top five bits, so long tokens are emitted in
                            // chunks of 32 bytes (code 0xfb).
                            while (cbValue > 32)
                            {
                                *pbNext++ = BYTE(UINT(0xfb));

                                CopyMemory(pbNext, pb, 32);

                                pbNext  += 32;
                                pb      += 32;
                                cbValue -= 32;
                            }

                            bLiteral   = (BYTE) (0x000000ff & (cbValue - 1));
                            bLiteral <<= 3;
                            bLiteral  |= 0x03;

                            *pbNext++ = bLiteral;

                            CopyMemory(pbNext, pb, cbValue);

                            pbNext += cbValue;
                        }

                        break;
                    }

                    bPrevTokenSymbol = aenc[i].fClass & SYMBOL_TOKEN;
                }
            }

            INT cbCompressed = INT(pbNext - pbOut);

            ASSERT(cbCompressed > 0);

            if (cbOrig > cbCompressed)
            {
#ifdef _DEBUG
                // Round trip check: the encoded text must expand back to the
                // original bytes.
                PBYTE pbDecomp = NULL;

                if (cbOrig <= 4096) pbDecomp = (PBYTE) _alloca(cbOrig);
                else                pbDecomp = New BYTE[cbOrig];

                INT cbExp = DeCompressString(PBYTE(pbOut), pbDecomp, cbCompressed);

                ASSERT(cbExp == cbOrig);

                PBYTE pbOrig   = pbText;
                PBYTE pbResult = pbDecomp;

                for (int cbCheck = cbOrig; cbCheck--; ++pbOrig, ++pbResult)
                    ASSERT(*pbOrig == *pbResult);

                if (cbOrig > 4096) delete [] pbDecomp;
#endif // _DEBUG

                CopyMemory(pbText, pbOut, cbCompressed);
            }
            else if (ppCompressed)
            {
                pbCompressed = (PBYTE) malloc(cbCompressed);  // BugBug: Change this to LocalAlloc!
                                                              //         Coordinate change w/ Compiler

                if (!pbCompressed) RaiseException(STATUS_NO_MEMORY, EXCEPTION_NONCONTINUABLE, 0, NULL);

                CopyMemory(pbCompressed, pbOut, cbCompressed);

                *ppCompressed = pbCompressed;
                pbCompressed  = NULL;
            }

            ec = cbCompressed;
            __leave;
        }
        __finally
        {
            if (pbCompressed) free(pbCompressed);
        }
    }
    __except(_exception_code() == STATUS_NO_MEMORY ? EXCEPTION_EXECUTE_HANDLER
                                                  : EXCEPTION_CONTINUE_SEARCH)
    {
        ec = OUT_OF_MEMORY;
    }

    return ec;
}

// Expands cbComp bytes of encoded text at pbComp into pbDecomp and returns
// the number of bytes written (or an error code from ConstructPhraseEncoding,
// or OUT_OF_MEMORY).  The size of the output buffer is not passed in, so the
// caller must supply one large enough for the expanded text.
INT CCompressTable::DeCompressString(PBYTE pbComp, PBYTE pbDecomp, int cbComp)
{
    ERRORCODE ec = 0;

    __try
    {
        __try
        {
            if (!m_pWeightInfo)
            {
                ec = ConstructPhraseEncoding();

                if (ec) __leave;
            }

            PBYTE pbLimit          = pbComp + cbComp;
            PBYTE pbStartDecomp    = pbDecomp;
            BYTE  bCode;
            BOOL  bPrevTokenSymbol = FALSE;
            BOOL  bNextTokenSymbol = FALSE;
            int   iIndex;
            int   cb;

            while (pbComp < pbLimit)
            {
                bCode = *pbComp++;

                // The count of trailing one bits in the code byte selects
                // its class.
                switch (acOneBits[0x0f & bCode])
                {
                case NDX_LOW_CLASS:

                    bCode >>= 1;
                    iIndex  = (int) bCode;

                    ASSERT(UINT(iIndex) < m_cWeights);

                    bNextTokenSymbol = m_pWeightInfo[iIndex].fSymbol;

                    // Restore the blank that was elided between two symbols.
                    if (bNextTokenSymbol && bPrevTokenSymbol) *pbDecomp++ = ' ';

                    CopyMemory(pbDecomp, m_pWeightInfo[iIndex].pbImage,
                                         m_pWeightInfo[iIndex].cbImage);

                    pbDecomp += m_pWeightInfo[iIndex].cbImage;

                    break;

                case NDX_MEDIUM_CLASS:

                    ASSERT(pbComp < pbLimit);   // A second code byte follows

                    bCode >>= 2;
                    iIndex  = ((((int) bCode) << 8) | *pbComp++) + 128;

                    ASSERT(UINT(iIndex) < m_cWeights);

                    bNextTokenSymbol = m_pWeightInfo[iIndex].fSymbol;

                    if (bNextTokenSymbol && bPrevTokenSymbol) *pbDecomp++ = ' ';

                    CopyMemory(pbDecomp, m_pWeightInfo[iIndex].pbImage,
                                         m_pWeightInfo[iIndex].cbImage);

                    pbDecomp += m_pWeightInfo[iIndex].cbImage;

                    break;

                case LITERAL_CLASS:

                    bNextTokenSymbol = FALSE;

                    bCode >>= 3;
                    cb      = (int) bCode + 1;

                    ASSERT(pbComp + cb <= pbLimit);

                    CopyMemory(pbDecomp, pbComp, cb);

                    pbDecomp += cb;
                    pbComp   += cb;

                    break;

                case SPACES_CLASS:

                    bNextTokenSymbol = FALSE;

                    bCode >>= 4;
                    cb      = (int) bCode + 1;

                    ASSERT(cb > 0);

                    while (cb--) *pbDecomp++ = ' ';

                    break;

                case NULL_CLASS:

                    bNextTokenSymbol = FALSE;

                    bCode >>= 4;
                    cb      = (int) bCode + 1;

                    ASSERT(cb > 0);

                    while (cb--) *pbDecomp++ = 0x00;

                    break;
                }

                bPrevTokenSymbol = bNextTokenSymbol;
            }

            ec = pbDecomp - pbStartDecomp;
            __leave;
        }
        __finally
        {
        }
    }
    __except(_exception_code() == STATUS_NO_MEMORY ? EXCEPTION_EXECUTE_HANDLER
                                                  : EXCEPTION_CONTINUE_SEARCH)
    {
        ec = OUT_OF_MEMORY;
    }

    return ec;
}

// Environment handed to BuildWeightInfo: the next free WeightInfo slot.
typedef struct _WeightConstructionState
{
    PWeightInfo pwi;

} WeightConstructionState, *PWeightConstructionState;

// DumpAll callback.  Converts a token's reference count into a WeightInfo
// record, skipping short runs of blanks or NULs (they have dedicated codes)
// and tokens that were seen only once.  Every other token's tag is reset to
// LITERAL_CLASS.
void CCompressTable::BuildWeightInfo(const BYTE *pbValue, UINT cbValue, void *pvTag,
                                     PVOID pvEnvironment)
{
    PWeightConstructionState pwcs = PWeightConstructionState(pvEnvironment);

    ASSERT(cbValue);

    if (!*pbValue && cbValue < SPACE_TOKEN_LIMIT)
    {
        BOOL        fAllNulls = TRUE;
        const BYTE *pb        = pbValue;
        UINT        cb        = cbValue;

        for (; --cb; )
            if (*++pb) fAllNulls = FALSE;

        if (fAllNulls) return;
    }

    if (' ' == *pbValue && cbValue < SPACE_TOKEN_LIMIT)
    {
        BOOL        fAllSpaces = TRUE;
        const BYTE *pb         = pbValue;
        UINT        cb         = cbValue;

        for (; --cb; )
            if (' ' != *++pb) fAllSpaces = FALSE;

        if (fAllSpaces) return;
    }

    // Read the count before the tag is overwritten with an encoding.
    INT  cRefs   = *PINT(pvTag);
    BOOL fSymbol = FALSE;

    ASSERT(cRefs);
    ASSERT(sizeof(ENCODE) == sizeof(INT));

    PENCODE(pvTag)->fClass = LITERAL_CLASS;

    // Positive counts mark symbols, negative counts mark non-symbols.
    if (cRefs > 0) fSymbol = SYMBOL_TOKEN;
    else           cRefs   = - cRefs;

    if (cRefs == 1) return;     // Seen once: stays a literal

    PWeightInfo pwi = pwcs->pwi++;

    pwi->pbImage  = PBYTE(pbValue);
    pwi->cbImage  = cbValue;
    pwi->uiWeight = cRefs * cbValue;
    pwi->fSymbol  = fSymbol;
}

// qsort comparator over PWeightInfo elements: descending weight.
extern "C" int _cdecl WeightCompare(const void *pv1, const void *pv2)
{
    PWeightInfo pw1 = *((PWeightInfo *) pv1);
    PWeightInfo pw2 = *((PWeightInfo *) pv2);

    // Compare instead of subtracting: the difference of two unsigned weights
    // does not always fit in an int with the right sign.
    if (pw1->uiWeight < pw2->uiWeight) return  1;
    if (pw1->uiWeight > pw2->uiWeight) return -1;

    return 0;
}

// qsort comparator over PWeightInfo elements: case-insensitive order of the
// images, with the shorter image first when one is a prefix of the other.
extern "C" int _cdecl WeightCompare2(const void *pv1, const void *pv2)
{
    PWeightInfo pw1 = *((PWeightInfo *) pv1);
    PWeightInfo pw2 = *((PWeightInfo *) pv2);

    int cb = (pw1->cbImage < pw2->cbImage) ? pw1->cbImage : pw2->cbImage;

    int iResult = _strnicmp((const char *) pw1->pbImage, (const char *) pw2->pbImage, cb);

    if (iResult) return iResult;

    return pw1->cbImage - pw2->cbImage;
}

// Turns the token statistics gathered by ScanString into the phrase table:
// the tokens are ranked by weight (references * length), at most
// 128 + 16K of them are kept, their images are copied into m_pbImages and
// each phrase gets a one or two byte code.
//
// Returns 0 on success or NO_TEXT_SCANNED when the hash table is empty.
// There is no exception filter here; STATUS_NO_MEMORY propagates to the
// caller after the __finally cleanup.
ERRORCODE CCompressTable::ConstructPhraseEncoding()
{
    ERRORCODE    ec          = 0;
    PWeightInfo  pwiBase     = NULL;
    PWeightInfo *papwi       = NULL;
    PWeightInfo  pWeightInfo = NULL;
    PBYTE        pbImages    = NULL;
    CAValRef    *pavr        = NULL;

    __try
    {
        UINT cItems = m_psht->EntryCount();

        if (!cItems)
        {
            ec = NO_TEXT_SCANNED;
            __leave;
        }

        pwiBase = PWeightInfo(VAlloc(FALSE, cItems * sizeof(WeightInfo)));

        // Now we'll preload the hash table with encoding for streams of
        // spaces and nulls.

        ASSERT(SPACE_TOKEN_LIMIT <= C_TOKEN_BLOCK);

        m_pavr->DiscardRefs();

        QueueRunRefs(m_pavr, chSpaces, ' ');
        m_psht->Assimilate(m_pavr, NULL, FnAddSpaces, FnAddSpaces);
        m_pavr->DiscardRefs();

        QueueRunRefs(m_pavr, chNulls, 0);
        m_psht->Assimilate(m_pavr, NULL, FnAddNulls, FnAddNulls);
        m_pavr->DiscardRefs();

        // Note! We must never add an item to the hash table after this
        //       point. This is because the code below stores the addresses
        //       of the hash table value strings. If we add items, those
        //       strings may move around.

        WeightConstructionState wcs;

        wcs.pwi = pwiBase;

        m_psht->DumpAll(&wcs, CCompressTable::BuildWeightInfo);

        UINT cWeights = UINT(wcs.pwi - pwiBase);

        ASSERT(cWeights);

        papwi = (PWeightInfo *) VAlloc(FALSE, cWeights * sizeof(PWeightInfo));

        UINT         c    = cWeights;
        PWeightInfo *ppwi = papwi;
        PWeightInfo  pwi  = pwiBase;

        for (; c--; ) *ppwi++ = pwi++;

        // Rank by weight, then order each code-size tier by image text.
        qsort(papwi, cWeights, sizeof(PWeightInfo), WeightCompare);

        INT iCount = INT(cWeights);
        INT cCount = (iCount > 128)? 128 : iCount;

        qsort(papwi, cCount, sizeof(PWeightInfo), WeightCompare2);

        iCount -= 128;

        if (iCount > 1)
        {
            cCount = 16 * 1024;

            if (iCount < cCount) cCount = iCount;

            qsort(papwi + 128, cCount, sizeof(PWeightInfo), WeightCompare2);

            iCount -= cCount;

            if (iCount > 1)
                qsort(papwi + 128 + 16 * 1024, iCount, sizeof(PWeightInfo), WeightCompare2);
        }

        // Only 128 one-byte codes and 16K two-byte codes exist.
        iCount = 128 + 16 * 1024;

        if (iCount < INT(cWeights)) cWeights = iCount;

        UINT         cbImages = 0;
        PWeightInfo *ppwiSrc  = papwi;

        for (c = cWeights; c--; ) cbImages += (*ppwiSrc++)->cbImage;

        pavr        = CAValRef::NewValRef(cWeights);
        pWeightInfo = PWeightInfo(VAlloc(FALSE, cWeights * sizeof(WeightInfo)));
        pbImages    = PBYTE      (VAlloc(FALSE, cbImages));

        ASSERT(!m_pWeightInfo);
        ASSERT(!m_pbImages);

        m_pWeightInfo  = pWeightInfo;  pWeightInfo = NULL;
        m_pbImages     = pbImages;     pbImages    = NULL;
        m_cWeights     = cWeights;
        m_cbImageTotal = cbImages;

        PWeightInfo pwiDest = m_pWeightInfo;
        PBYTE       pbDest  = m_pbImages;

        for (ppwiSrc = papwi, c = cWeights; c--; )
        {
            PWeightInfo pwiSrc = *ppwiSrc++;

            CopyMemory(pbDest, pwiSrc->pbImage, pwiSrc->cbImage);

            pwiSrc->pbImage = pbDest;
            pbDest         += pwiSrc->cbImage;

            *pwiDest++ = *pwiSrc;
        }

        // Now we've changed all the pbImage pointers in the weight info array
        // to point into the m_pbImages. We no longer need to keep the hash
        // table value addresses constant.

        for (pwi = m_pWeightInfo, iCount = 0; iCount < INT(cWeights); ++iCount, ++pwi)
        {
            pavr->AddValRef(pwi->pbImage, pwi->cbImage);

            if (iCount < 128)
            {
                // One byte code: index in bits 1..7, bit 0 clear.
                pwi->enc.fClass    = NDX_LOW_CLASS | pwi->fSymbol;
                pwi->enc.abCode[0] = 0x0FF & (iCount << 1);

                continue;
            }

            // Two byte code: low bits 01, the high six index bits in the
            // first byte, the low eight in the second.
            UINT iExcess = UINT(iCount - 128);

            pwi->enc.fClass    = NDX_MEDIUM_CLASS | pwi->fSymbol;
            pwi->enc.abCode[1] = iExcess & 0x0FF;
            pwi->enc.abCode[0] = ((iExcess >> 6) & 0x3FC) | 0x01;
        }

        // NOTE(review): these phrases are already present in the hash table,
        // yet the merge callback passed here is NULL.  Confirm how Assimilate
        // treats a NULL merge callback.
        m_psht->Assimilate(pavr, m_pWeightInfo, NULL, FnAddTokens);

        ec = 0;
        __leave;
    }
    __finally
    {
        if (pwiBase    ) VFree(pwiBase    );
        if (papwi      ) VFree(papwi      );
        if (pWeightInfo) VFree(pWeightInfo);
        if (pbImages   ) VFree(pbImages   );

        if (pavr) delete pavr;
    }

    return ec;
}

// Assimilate callback: tags reference iValue (a run of iValue + 1 blanks)
// with its one byte code -- low nibble 0x7, run length minus one on top.
void CCompressTable::FnAddSpaces(UINT iValue, PVOID pvTag, PVOID pv)
{
    ENCODE enc;

    ASSERT(iValue < 16);

    enc.fClass    = SPACES_CLASS;
    enc.abCode[0] = ((iValue & 0x0000000f) << 4) | 0x07;

    *PENCODE(pvTag) = enc;
}

// Assimilate callback: tags reference iValue (a run of iValue + 1 NULs)
// with its one byte code -- low nibble 0xf, run length minus one on top.
void CCompressTable::FnAddNulls(UINT iValue, PVOID pvTag, PVOID pv)
{
    ENCODE enc;

    ASSERT(iValue < 16);

    enc.fClass    = NULL_CLASS;
    enc.abCode[0] = ((iValue & 0x0000000f) << 4) | 0x0f;

    *PENCODE(pvTag) = enc;
}

// Assimilate callback: stores phrase iValue's encoding in the table tag.
void CCompressTable::FnAddTokens(UINT iValue, PVOID pvTag, PVOID pv)
{
    *PENCODE(pvTag) = PWeightInfo(pv)[iValue].enc;
}