mirror of https://github.com/lianthony/NT4.0
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
1063 lines
30 KiB
1063 lines
30 KiB
#include "stdafx.h"
|
|
#pragma hdrstop
|
|
|
|
#include "ctable.h"
|
|
#include "FTSIFace.h"
|
|
#include "Memex.h"
|
|
#include "FtsLex.h"
|
|
#include "Bytemaps.h"
|
|
#include <stdlib.h>
|
|
|
|
// Forward declarations of the shared token buffers; their definitions appear
// further down in this file.
extern char chSpaces[], chNulls[], gchNull[];
|
|
|
|
// Lookup table indexed by the low nibble of a code byte. Each entry is the
// number of consecutive 1 bits counted upward from bit 0 of that nibble.
// DeCompressString switches on this value to classify a code byte.
BYTE acOneBits[16] =
{
    0, 1, 0, 2,     // 0000 0001 0010 0011
    0, 1, 0, 3,     // 0100 0101 0110 0111
    0, 1, 0, 2,     // 1000 1001 1010 1011
    0, 1, 0, 4      // 1100 1101 1110 1111
};
|
|
|
|
// 12345678901234567890
|
|
char chSpaces[SPACE_TOKEN_LIMIT + 1] = " ";
|
|
char chNulls [SPACE_TOKEN_LIMIT + 1] = " ";
|
|
char gchNull [1] = {(const char) 0x00};
|
|
|
|
// Constructs an empty compression table. No resources are acquired here;
// NewCompressTable creates the virtual buffer, hash table and value-reference
// list afterwards.
//
// iCharsetDefault -- stored in m_iCharSetDefault.
CCompressTable::CCompressTable(UINT iCharsetDefault)
{
    m_iCharSetDefault = iCharsetDefault;

    m_psht        = NULL;
    m_pavr        = NULL;
    m_pWeightInfo = NULL;
    m_pbImages    = NULL;

    // The destructor tests m_vb.Base, and it can run before
    // CreateVirtualBuffer has filled m_vb in (NewCompressTable deletes the
    // object when creation raises), so the field must start out NULL.
    m_vb.Base = NULL;

    // Give the phrase-table counters a defined value until a table is built
    // or loaded.
    m_cWeights     = 0;
    m_cbImageTotal = 0;
}
|
|
|
|
// Releases everything the table owns: the hash table, the value-reference
// list, the virtual buffer, and the phrase weight/image arrays.
CCompressTable::~CCompressTable()
{
    if (m_psht != NULL)
    {
        delete m_psht;
    }

    if (m_pavr != NULL)
    {
        delete m_pavr;
    }

    if (m_vb.Base)
    {
        FreeVirtualBuffer(&m_vb);
    }

    if (m_pWeightInfo != NULL)
    {
        VFree(m_pWeightInfo);
    }

    if (m_pbImages != NULL)
    {
        VFree(m_pbImages);
    }
}
|
|
|
|
// Factory: allocates a CCompressTable and the helper objects it needs.
//
// Returns the new table, or NULL when a STATUS_NO_MEMORY exception is raised
// while building it. Any other exception keeps propagating to the caller; in
// both cases the partially built object is deleted first.
CCompressTable *CCompressTable::NewCompressTable(UINT iCharsetDefault)
{
    CCompressTable *pctNew = NULL;

    __try
    {
        __try
        {
            pctNew = New CCompressTable(iCharsetDefault);

            CreateVirtualBuffer(&pctNew->m_vb, CB_BUFFER_COMMIT, CB_BUFFER_RESERVATION);

            pctNew->m_psht = CSegHashTable::NewSegHashTable(sizeof(ULONG), sizeof(ULONG));
            pctNew->m_pavr = CAValRef::NewValRef(C_TOKEN_BLOCK);
        }
        __finally
        {
            // Only tear the object down when we are unwinding because of an
            // exception; on the normal path the caller takes ownership.
            if (pctNew && _abnormal_termination())
            {
                delete pctNew;
                pctNew = NULL;
            }
        }
    }
    __except((_exception_code() == STATUS_NO_MEMORY) ? EXCEPTION_EXECUTE_HANDLER
                                                     : EXCEPTION_CONTINUE_SEARCH)
    {
        pctNew = NULL;
    }

    return pctNew;
}
|
|
|
|
// Tokenizes a block of text and accumulates per-token reference counts in the
// hash table (see IncrementCounter / InitialCounter).
//
// pbText, cbText -- the text to scan and its length in bytes.
// iCharSet       -- character set, mapped to a code page for WordBreakA.
//
// Returns 0 on success, or ALREADY_WEIGHED when the phrase weights have
// already been built (m_pWeightInfo is set).
INT CCompressTable::ScanString(PBYTE pbText, INT cbText, INT iCharSet)
{
    CP cp = GetCPFromCharset(iCharSet);

    if (m_pWeightInfo) return ALREADY_WEIGHED;

    PSTR apTokenStart[C_TOKEN_BLOCK];
    PSTR apTokenEnd  [C_TOKEN_BLOCK];
    BYTE abType      [C_TOKEN_BLOCK];

    // WordBreakA advances pbText and decrements cbText; each pass handles up
    // to C_TOKEN_BLOCK tokens. The reference list is emptied before every
    // pass and once more when the loop finishes.
    for (m_pavr->DiscardRefs(); cbText; m_pavr->DiscardRefs())
    {
        UINT cTokens = WordBreakA(cp, (PSTR*)&pbText, &cbText, apTokenStart, apTokenEnd,
                                  PBYTE(&abType), NULL, C_TOKEN_BLOCK, TOKENIZE_SPACES);

        for (UINT iToken = 0; iToken < cTokens; ++iToken)
        {
            // The token length is stored as a USHORT below, so it must be
            // non-negative and fit in 16 bits. (The previous assertion
            // subtracted in the wrong direction and compared against
            // 0xFFFFFFFF, so it could never fire.)
            ASSERT(apTokenEnd[iToken] >= apTokenStart[iToken]);
            ASSERT(apTokenEnd[iToken] - apTokenStart[iToken] <= INT(USHORT(~0)));

            m_pavr->AddValRef(apTokenStart[iToken],
                              USHORT(apTokenEnd[iToken] - apTokenStart[iToken]));
        }

        m_psht->Assimilate(m_pavr, abType, CCompressTable::IncrementCounter,
                                           CCompressTable::InitialCounter);
    }

    return 0;
}
|
|
|
|
// Assimilate callback used by ScanString for a token that is already in the
// hash table. The tag is a reference counter whose sign records the token
// kind: it moves away from zero by one on every reference.
//
// iValue        -- index of the token within the current block.
// pvTag         -- the token's UINT counter.
// pvEnvironment -- per-token type bytes; nonzero marks a symbol.
void CCompressTable::IncrementCounter(UINT iValue, PVOID pvTag, PVOID pvEnvironment)
{
    UINT &cRefs   = *PUINT(pvTag);
    BOOL  fSymbol = PBYTE(pvEnvironment)[iValue];

    if (fSymbol)
    {
        // Positive counts mark symbols
        ASSERT(INT(cRefs) > 0);
        ++cRefs;
    }
    else
    {
        // Negative counts mark non-symbols
        ASSERT(INT(cRefs) < 0);
        --cRefs;
    }
}
|
|
|
|
// Assimilate callback used by ScanString for a token seen for the first time.
// Starts the counter at +1 for a symbol and at -1 (stored as UINT(-1)) for a
// non-symbol.
//
// iValue        -- index of the token within the current block.
// pvTag         -- the token's UINT counter; expected to be zero on entry.
// pvEnvironment -- per-token type bytes; nonzero marks a symbol.
void CCompressTable::InitialCounter(UINT iValue, PVOID pvTag, PVOID pvEnvironment)
{
    UINT       *pcRefs  = PUINT(pvTag);
    const BYTE *pabType = PBYTE(pvEnvironment);

    ASSERT(*pcRefs == 0);

    if (pabType[iValue])
    {
        *pcRefs = 1;
    }
    else
    {
        *pcRefs = UINT(-1);
    }
}
|
|
|
|
// Appends the low iBits bits of dwCode to a packed bit stream of 32-bit
// words. Bits enter each word at the top and are shifted down as more bits
// arrive, so the first bit written ends up in bit 0 once the word is full.
//
// pNextCode -- the word currently being filled.
// pBitsLeft -- in/out: free bits remaining in *pNextCode (1..32).
// iBits     -- number of bits to append (0..32).
// dwCode    -- the bits to append, starting with bit 0.
//
// Returns the word to continue writing into. Whenever a word fills up, the
// following word is zeroed and becomes the current one, so the caller's
// buffer needs one word of slack past the last full word.
PUINT writebits(PUINT pNextCode, PINT pBitsLeft, int iBits, DWORD dwCode)
{
    // Nothing to append. This also keeps the shift below from being a shift
    // by 32, which is undefined.
    if (iBits <= 0) return pNextCode;

    if (iBits > *pBitsLeft)
    {
        // The code straddles a word boundary: finish the current word with
        // the low bits, then put the remainder in the next word.
        int   cbitsFirst = *pBitsLeft;
        PUINT pNextTemp  = writebits(pNextCode, pBitsLeft, cbitsFirst, dwCode);

        return writebits(pNextTemp, pBitsLeft, iBits - cbitsFirst, dwCode >> cbitsFirst);
    }

    if (iBits >= 32)
    {
        // A whole word at once. Shifting a 32-bit value by 32 is undefined
        // (x86 treats it as a shift by 0), so just store the word.
        *pNextCode = dwCode;
    }
    else
    {
        *pNextCode = (*pNextCode >> iBits) | (dwCode << (32 - iBits));
    }

    *pBitsLeft -= iBits;

    if (*pBitsLeft != 0) return pNextCode;

    // Current word is full: advance and start a fresh, zeroed word.
    *pBitsLeft = 32;
    ++pNextCode;
    *pNextCode = 0;

    return pNextCode;
}
|
|
|
|
// Exports the phrase table as two malloc'ed buffers: the concatenated phrase
// images, and an index holding a JINDEXHDR, the bit-packed phrase lengths and
// one "is symbol" bit per phrase.
//
// pcPhrases          -- receives the number of phrases.
// ppbImage, pcbImage -- receive the image buffer and its size in bytes.
// ppbIndex, pcbIndex -- receive the index buffer and its size in bytes.
//
// Returns 0 on success (the caller then owns both buffers and must free()
// them), EMPTY_PHRASE_TABLE when there are no phrases, OUT_OF_MEMORY on
// allocation failure, or whatever ConstructPhraseEncoding reports. The
// buffer pointers are only stored on success.
ERRORCODE CCompressTable::GetPhraseTable(PUINT pcPhrases, PBYTE *ppbImage, PUINT pcbImage, PBYTE *ppbIndex, PUINT pcbIndex)
{
    ERRORCODE ec= 0;

    PBYTE pbImages   = NULL;
    PUINT pIndexCode = NULL;

    __try
    {
        __try
        {
            if (!m_pWeightInfo)
            {
                ec= ConstructPhraseEncoding();

                if (ec) __leave;
            }

            ASSERT(m_cWeights);

            // Checked on every call, not only right after construction: the
            // size computation below divides by m_cWeights.
            if (!m_cWeights)
            {
                ec= EMPTY_PHRASE_TABLE;

                __leave;
            }

            *pcPhrases= m_cWeights;
            *pcbImage = m_cbImageTotal;

            pbImages= PBYTE(malloc(m_cbImageTotal)); // BugBug: Change this to LocalAlloc!
                                                     // Must change compiler at the same time.

            if (!pbImages) RaiseException(STATUS_NO_MEMORY, EXCEPTION_NONCONTINUABLE, 0, NULL);

            CopyMemory(pbImages, m_pbImages, m_cbImageTotal);

            // Each length (minus one) is split into a unary quotient and a
            // cbitsBasis-bit remainder; the basis is derived from the average
            // phrase length.
            UINT cbitsBasis    = CBitsToRepresent((m_cbImageTotal - m_cWeights)/ m_cWeights);
            UINT basis         = 1 << cbitsBasis;
            UINT cbitsEstimate = m_cWeights * (1 + cbitsBasis) + (m_cbImageTotal + basis - m_cWeights - 1) / basis;
            UINT cdwEstimate   = (cbitsEstimate + 31) >> 5;
            UINT cdwBits       = (m_cWeights + 31) >> 5;
            UINT cbTotal       = sizeof(UINT) * (cdwBits + cdwEstimate) + sizeof(JINDEXHDR);

            pIndexCode= PUINT(malloc(cbTotal)); // BugBug: Change this to LocalAlloc!
                                                // Must change compiler at the same time.

            // The exception is non-continuable, so control never gets past
            // this line with a NULL pointer; __finally releases pbImages.
            if (!pIndexCode) RaiseException(STATUS_NO_MEMORY, EXCEPTION_NONCONTINUABLE, 0, NULL);

            ASSERT(cbitsBasis < 6);

            JINDEXHDR jIHdr;

            jIHdr.Magic  = 'J';
            jIHdr.cCount = m_cWeights;
            jIHdr.cBits  = cbitsBasis;

            UINT fBasisMask = basis - 1;

            CopyMemory(pIndexCode, (const void *) &jIHdr, sizeof(JINDEXHDR));

            ASSERT(sizeof(JINDEXHDR) == sizeof(INT));

            PUINT pNextCode = pIndexCode + 1;

            // malloc does not clear memory; start the first code word from a
            // known value.
            *pNextCode = 0;

            INT cbitsLeft= 32;

            UINT ii;

            for (ii = 0; ii < m_cWeights; ii++)
            {
                ASSERT(m_pWeightInfo[ii].cbImage);

                UINT cb      = m_pWeightInfo[ii].cbImage - 1;
                UINT dwRight = cb & fBasisMask;

                cb = (cb & ~fBasisMask) >> cbitsBasis;

                UINT cBits= 1 + cbitsBasis + cb;

                ASSERT(cBits < 33);

                // cb one-bits, then a zero bit, then the remainder bits.
                UINT dwLeft = cb ? (((DWORD)(~0)) >> (32 - cb)) : 0;

                pNextCode = writebits(pNextCode, &cbitsLeft, cBits, (dwRight << (1 + cb)) | dwLeft);
            }

            // Pad the last, partially filled code word with zero bits.
            if (cbitsLeft < 32)
                pNextCode= writebits(pNextCode, &cbitsLeft, cbitsLeft, 0);

            ASSERT(pNextCode - (pIndexCode+1) <= INT(cdwEstimate));

            // Append one bit per phrase taken from its fSymbol field; phrase
            // 0 lands in bit 0 of the first word.
            PWeightInfo pwi = m_pWeightInfo;
            UINT        dw  = 0;
            UINT        c   = m_cWeights;

            for (cbitsLeft = 32; c--; )
            {
                UINT fSymbol= 1 & (((pwi++)->fSymbol) >> SYMBOL_SHIFT);

                dw= _rotr(dw | fSymbol, 1);

                if (!--cbitsLeft)
                {
                    *pNextCode++= dw;
                    dw= 0;
                    cbitsLeft= 32;
                }
            }

            if (cbitsLeft < 32)
                *pNextCode++= _rotr(dw, cbitsLeft);

            ASSERT(cbTotal >= (pNextCode - pIndexCode) * sizeof(UINT));

            *pcbIndex= (pNextCode - pIndexCode) * sizeof(UINT);

            // Hand both buffers to the caller; clearing the locals keeps
            // __finally from freeing them.
            *ppbImage= pbImages;          pbImages   = NULL;
            *ppbIndex= PBYTE(pIndexCode); pIndexCode = NULL;

            ec= 0;

            __leave;
        }
        __finally
        {
            if (pbImages  ) free(pbImages  );
            if (pIndexCode) free(pIndexCode);
        }
    }
    __except(_exception_code() == STATUS_NO_MEMORY? EXCEPTION_EXECUTE_HANDLER : EXCEPTION_CONTINUE_SEARCH)
    {
        ec= OUT_OF_MEMORY;
    }

    return ec;
}
|
|
|
|
// Loads a phrase table previously produced by GetPhraseTable and rebuilds the
// hash table so that each phrase maps to its encoding.
//
// pbImage, cbImage -- concatenated phrase images.
// pbIndex, cbIndex -- index: a JINDEXHDR, the packed phrase lengths, and one
//                     "is symbol" bit per phrase. Pointer and size must both
//                     be multiples of sizeof(UINT).
//
// Returns 0 on success, ALIGNMENT_ERROR, INVALID_PHRASE_TABLE, or
// OUT_OF_MEMORY. The members are not touched until the new table has been
// decoded successfully.
ERRORCODE CCompressTable::SetPhraseTable(PBYTE pbImage, UINT cbImage, PBYTE pbIndex, UINT cbIndex)
{
    ERRORCODE ec= OUT_OF_MEMORY;

    PBYTE       pbTokenImages = NULL;
    PWeightInfo pwiPhrases    = NULL;
    CAValRef   *pavr          = NULL;

    __try
    {
        __try
        {
            if (UINT(pbIndex) & (sizeof(UINT)-1)) { ec= ALIGNMENT_ERROR; __leave; }

            if (cbIndex & (sizeof(UINT)-1)) { ec= ALIGNMENT_ERROR; __leave; }

            // The header is read next, so it has to be there.
            if (!pbIndex || !pbImage || cbIndex < sizeof(JINDEXHDR))
                { ec= INVALID_PHRASE_TABLE; __leave; }

            JINDEXHDR jIHdr= *(JINDEXHDR *) pbIndex;

            if (jIHdr.Magic != 'J' || jIHdr.cCount == 0 || jIHdr.cCount > 128 + 16 * 1024)
                { ec= INVALID_PHRASE_TABLE; __leave; }

            pbTokenImages= PBYTE(VAlloc(FALSE, cbImage));

            UINT cWeights= jIHdr.cCount;

            pwiPhrases= PWeightInfo(VAlloc(FALSE, cWeights * sizeof(WeightInfo)));

            pavr= CAValRef::NewValRef(cWeights);

            CopyMemory(pbTokenImages, pbImage, cbImage);

            ASSERT(sizeof(JINDEXHDR) == sizeof(UINT));

            CJCode JCode(jIHdr.cBits, cWeights, PVOID(pbIndex + sizeof(JINDEXHDR)));

            // Pass 1: decode the phrase lengths and carve the image buffer
            // into consecutive phrases.
            PBYTE       pb     = pbTokenImages;
            UINT        cbLeft = cbImage;
            PWeightInfo pwi    = pwiPhrases;
            UINT        iPhrase;

            for (iPhrase= 0; iPhrase < cWeights; ++iPhrase, ++pwi)
            {
                UINT cb= JCode.GetNextDelta();

                // The lengths must not run past the end of the image data.
                if (cb > cbLeft) { ec= INVALID_PHRASE_TABLE; __leave; }

                pavr->AddValRef(pb, cb);

                pwi->cbImage= cb;
                pwi->pbImage= pb;

                pb     += cb;
                cbLeft -= cb;
            }

            // Pass 2: the symbol bits follow the packed lengths. The encoding
            // class includes the symbol flag, so it is built only after the
            // flag has been decoded. (Previously fSymbol was read before it
            // had been written.)
            PUINT pdwBits= JCode.NextDWord();

            for (iPhrase= 0, pwi= pwiPhrases; iPhrase < cWeights; ++iPhrase, ++pwi)
            {
                pwi->fSymbol = (1 & (pdwBits[iPhrase >> 5] >> (iPhrase & 31))) << SYMBOL_SHIFT;

                if (iPhrase < 128)
                {
                    // One-byte code: index in bits 1..7, bit 0 clear.
                    pwi->enc.fClass    = NDX_LOW_CLASS | pwi->fSymbol;
                    pwi->enc.abCode[0] = 0x0FF & (iPhrase << 1);
                }
                else
                {
                    // Two-byte code for (index - 128): upper bits in bits
                    // 2..7 of the first byte (low bits 01), low 8 bits in the
                    // second byte.
                    UINT iExcess= iPhrase - 128;

                    pwi->enc.fClass    = NDX_MEDIUM_CLASS | pwi->fSymbol;
                    pwi->enc.abCode[1] = iExcess & 0x0FF;
                    pwi->enc.abCode[0] = ((iExcess >> 6) & 0x3FC) | 0x01;
                }
            }

            // Everything decoded; replace the current table.
            if (m_pWeightInfo) { VFree(m_pWeightInfo); m_pWeightInfo= NULL; }
            if (m_pbImages   ) { VFree(m_pbImages   ); m_pbImages   = NULL; }
            if (m_psht       ) { delete m_psht;        m_psht       = NULL; }

            m_pWeightInfo  = pwiPhrases;    pwiPhrases    = NULL;
            m_pbImages     = pbTokenImages; pbTokenImages = NULL;
            m_cWeights     = cWeights;
            m_cbImageTotal = cbImage;

            m_psht= CSegHashTable::NewSegHashTable(sizeof(ENCODE), sizeof(ENCODE));

            m_psht->Assimilate(pavr, m_pWeightInfo, NULL, CCompressTable::RecordEncoding);

            delete pavr; pavr= NULL;

            // Preload the encodings for short runs of spaces and of nulls.
            ASSERT(SPACE_TOKEN_LIMIT <= C_TOKEN_BLOCK);

            m_pavr->DiscardRefs();

            // Make sure the buffer really holds blanks for every prefix
            // length used below, mirroring what is done for chNulls.
            FillMemory(chSpaces, SPACE_TOKEN_LIMIT, ' ');

            INT iCount;

            for (iCount= 1; iCount < SPACE_TOKEN_LIMIT; ++iCount)
                m_pavr->AddValRef(PBYTE(chSpaces), iCount);

            m_psht->Assimilate(m_pavr, NULL, FnAddSpaces, FnAddSpaces);

            m_pavr->DiscardRefs();

            ZeroMemory(chNulls, SPACE_TOKEN_LIMIT);

            for (iCount= 1; iCount < SPACE_TOKEN_LIMIT; ++iCount)
                m_pavr->AddValRef(PBYTE(chNulls), iCount);

            m_psht->Assimilate(m_pavr, NULL, FnAddNulls, FnAddNulls);

            m_pavr->DiscardRefs();

            ec= 0;

            __leave;
        }
        __finally
        {
            if (pbTokenImages) VFree(pbTokenImages);
            if (pwiPhrases   ) VFree(pwiPhrases   );
            if (pavr         ) delete pavr;
        }
    }
    __except(_exception_code() == STATUS_NO_MEMORY? EXCEPTION_EXECUTE_HANDLER : EXCEPTION_CONTINUE_SEARCH)
    {
        ec= OUT_OF_MEMORY;
    }

    return ec;
}
|
|
|
|
// Assimilate callback used by SetPhraseTable: stores the encoding of phrase
// number iValue (taken from the WeightInfo array passed as pv) in the
// token's hash-table tag.
void CCompressTable::RecordEncoding(UINT iValue, PVOID pvTag, PVOID pv)
{
    PWeightInfo pwiPhrase = PWeightInfo(pv) + iValue;
    PENCODE     pencTag   = PENCODE(pvTag);

    *pencTag = pwiPhrase->enc;
}
|
|
|
|
// Assimilate callback used by CompressString for a token that is already in
// the hash table: copies the token's stored encoding into slot iValue of the
// ENCODE array passed as pv.
void CCompressTable::FnCompMergeToken(UINT iValue, PVOID pvTag, PVOID pv)
{
    PENCODE(pv)[iValue] = *PENCODE(pvTag);
}
|
|
|
|
// Assimilate callback used by CompressString for a token that is not yet in
// the hash table: marks it as a literal, both in its new hash-table tag and
// in slot iValue of the ENCODE array passed as pv.
void CCompressTable::FnCompAddToken(UINT iValue, PVOID pvTag, PVOID pv)
{
    ENCODE enc;

    // Only fClass is meaningful for a literal; clear the rest so that no
    // uninitialized stack bytes are copied into the table.
    ZeroMemory(&enc, sizeof(enc));

    enc.fClass = LITERAL_CLASS;

    *PENCODE(pvTag) = enc;

    PENCODE(pv)[iValue] = enc;
}
|
|
|
|
// Builds the encoded form of the text given by pbText, cbOrig and iCharset.
//
// The result is the length in bytes of the encoded form.
//
//   * If the encoded form is strictly smaller than the original, it is
//     copied over the pbText memory area.
//   * Otherwise (same size or larger) pbText is left untouched and, when
//     ppCompressed is not NULL, the encoded form is copied into a malloc'ed
//     buffer whose address is stored in *ppCompressed. The calling code must
//     free(*ppCompressed) in that case, i.e. whenever the result is >= cbOrig.
//
// On failure the result is OUT_OF_MEMORY or the error reported by
// ConstructPhraseEncoding.
INT CCompressTable::CompressString(PBYTE pbText, INT cbOrig, PBYTE *ppCompressed, UINT iCharset)
{
    ERRORCODE ec= 0;

    PBYTE pbCompressed = NULL;

    __try
    {
        __try
        {
            if (!m_pWeightInfo)
            {
                ec= ConstructPhraseEncoding();

                if (ec) __leave;
            }

            PWCHAR pwBase = PWCHAR(m_vb.Base);
            PCHAR  pbOut  = PCHAR(pwBase + cbOrig);
            PCHAR  pbNext = pbOut;

            // Note: We use the m_vb area for two purposes --
            //
            //   1. As buffer for unicode characters.
            //   2. As a result area to store the "compressed" text.
            //
            // For the second case we assume that in all cases the "compressed"
            // text will never be larger than 2*cbOrig.

            CP cp = GetCPFromCharset(iCharset);

            PSTR pbScan = PSTR(pbText);
            INT  cbText = cbOrig;

            PSTR   apTokenStart[C_TOKEN_BLOCK];
            PSTR   apTokenEnd  [C_TOKEN_BLOCK];
            ENCODE aenc        [C_TOKEN_BLOCK];

            // Each pass tokenizes up to C_TOKEN_BLOCK tokens, looks up their
            // encodings, and appends the encoded bytes at pbNext.
            for (m_pavr->DiscardRefs() ; cbText; m_pavr->DiscardRefs())
            {
                UINT cTokens= WordBreakA(cp, (PSTR*)&pbScan, &cbText, apTokenStart, apTokenEnd, NULL, NULL, C_TOKEN_BLOCK, TOKENIZE_SPACES);

                UINT i;

                for (i= 0; i < cTokens; ++i)
                {
                    // The length is stored as a USHORT, so it must fit in 16
                    // bits. (The previous bound evaluated to 0xFFFFFFFF and
                    // could never fail.)
                    ASSERT(apTokenEnd[i] >= apTokenStart[i]);
                    ASSERT(apTokenEnd[i] - apTokenStart[i] <= INT(USHORT(~0)));

                    m_pavr->AddValRef(apTokenStart[i], USHORT(apTokenEnd[i] - apTokenStart[i]));
                }

                // Known tokens get their table encoding copied into aenc;
                // unknown ones are entered as literals.
                m_psht->Assimilate(m_pavr, aenc, CCompressTable::FnCompMergeToken, CCompressTable::FnCompAddToken);

                BOOL bPrevTokenSymbol = FALSE;
                BOOL bNextTokenSymbol = FALSE;

                for (i= 0; i < cTokens; ++i)
                {
                    bNextTokenSymbol= (cTokens > i+1)? aenc[i + 1].fClass & SYMBOL_TOKEN
                                                     : FALSE;

                    switch(aenc[i].fClass & CLASS_MASK)
                    {
                    default:

                        ASSERT(FALSE);
                        break;

                    case NULL_CLASS:
                    case NDX_LOW_CLASS:

                        *pbNext++ = aenc[i].abCode[0];
                        break;

                    case SPACES_CLASS:
                        {
                            BYTE bSpaceCode = aenc[i].abCode[0];

                            // A single space between two symbol tokens is
                            // implied and not emitted; DeCompressString puts
                            // it back.
                            BOOL fImplied =    (bSpaceCode == SINGLE_SPACE_CODE)
                                            && bPrevTokenSymbol
                                            && bNextTokenSymbol;

                            if (!fImplied) *pbNext++ = bSpaceCode;
                        }
                        break;

                    case NDX_MEDIUM_CLASS:

                        *pbNext++ = aenc[i].abCode[0];
                        *pbNext++ = aenc[i].abCode[1];
                        break;

                    case LITERAL_CLASS:
                        {
                            const BYTE *pb;
                            USHORT      cbValue;
                            BYTE        bLiteralCode;

                            m_pavr->GetValRef(i, &pb, &cbValue);
                            ASSERT(cbValue);

                            // A literal header byte carries (length - 1) in
                            // its upper five bits, so at most 32 bytes fit in
                            // one chunk; 0xfb is the header for a full chunk.
                            while (cbValue > 32)
                            {
                                *pbNext++ = BYTE(UINT(0xfb));

                                CopyMemory(pbNext, pb, 32);

                                pbNext  += 32;
                                pb      += 32;
                                cbValue -= 32;
                            }

                            bLiteralCode   = (BYTE) (0x000000ff & (cbValue - 1));
                            bLiteralCode <<= 3;
                            bLiteralCode  |= 0x03;

                            *pbNext++ = bLiteralCode;

                            CopyMemory(pbNext, pb, cbValue);

                            pbNext += cbValue;
                        }
                        break;
                    }

                    bPrevTokenSymbol = aenc[i].fClass & SYMBOL_TOKEN;
                }
            }

            INT cbCompressed= pbNext - pbOut;

            ASSERT(cbCompressed > 0);

            if (cbOrig > cbCompressed)
            {
#ifdef _DEBUG
                // Debug builds expand the result again and compare it with
                // the original text byte by byte.
                PBYTE pbDecomp= NULL;

                if (cbOrig <= 4096) pbDecomp= (PBYTE) _alloca(cbOrig);
                else                pbDecomp= New BYTE[cbOrig];

                INT cbExp= DeCompressString(PBYTE(pbOut), pbDecomp, cbCompressed);

                ASSERT(cbExp == cbOrig);

                PBYTE pbOrig   = pbText;
                PBYTE pbResult = pbDecomp;

                for (int c= cbOrig; c--; ++pbOrig, ++pbResult)
                    ASSERT(*pbOrig == *pbResult);

                if (cbOrig > 4096) delete [] pbDecomp;
#endif // _DEBUG

                CopyMemory(pbText, pbOut, cbCompressed);
            }
            else
                if (ppCompressed)
                {
                    pbCompressed= (PBYTE) malloc(cbCompressed); // BugBug: Change this to LocalAlloc!
                                                                // Coordinate change w/ Compiler

                    if (!pbCompressed) RaiseException(STATUS_NO_MEMORY, EXCEPTION_NONCONTINUABLE, 0, NULL);

                    CopyMemory(pbCompressed, pbOut, cbCompressed);

                    // Ownership passes to the caller.
                    *ppCompressed= pbCompressed;
                    pbCompressed= NULL;
                }

            ec= cbCompressed;

            __leave;
        }
        __finally
        {
            if (pbCompressed) free(pbCompressed);
        }
    }
    __except(_exception_code() == STATUS_NO_MEMORY? EXCEPTION_EXECUTE_HANDLER : EXCEPTION_CONTINUE_SEARCH)
    {
        ec= OUT_OF_MEMORY;
    }

    return ec;
}
|
|
|
|
// Expands text produced by CompressString.
//
// pbComp, cbComp -- the encoded bytes and their count.
// pbDecomp       -- destination buffer. No size is passed in, so the caller
//                   must supply a buffer large enough for the expanded text.
//
// Returns the number of bytes written to pbDecomp, or OUT_OF_MEMORY / the
// error reported by ConstructPhraseEncoding.
//
// The class of each code byte is selected through acOneBits from its low
// nibble. Malformed input is only diagnosed by ASSERTs (debug builds).
INT CCompressTable::DeCompressString(PBYTE pbComp, PBYTE pbDecomp, int cbComp)
{
    ERRORCODE ec= 0;

    __try
    {
        if (!m_pWeightInfo)
        {
            ec= ConstructPhraseEncoding();

            if (ec) __leave;
        }

        PBYTE pbLimit       = pbComp + cbComp;
        PBYTE pbStartDecomp = pbDecomp;

        BOOL bPrevTokenSymbol = FALSE;
        BOOL bNextTokenSymbol = FALSE;

        while (pbComp < pbLimit)
        {
            BYTE bCode  = *pbComp++;
            int  iIndex = -1;       // phrase index; stays -1 for non-phrase codes
            int  cb;

            switch (acOneBits[0x0f & bCode])
            {
            case NDX_LOW_CLASS:

                // One-byte phrase code: index in bits 1..7.
                iIndex = int(bCode >> 1);
                break;

            case NDX_MEDIUM_CLASS:

                // Two-byte phrase code: upper bits of (index - 128) in bits
                // 2..7 of this byte, low 8 bits in the next byte.
                ASSERT(pbComp < pbLimit);

                iIndex = ((int(bCode >> 2) << 8) | *pbComp++) + 128;
                break;

            case LITERAL_CLASS:

                // (length - 1) in bits 3..7, followed by the literal bytes.
                bNextTokenSymbol = FALSE;

                cb = int(bCode >> 3) + 1;

                ASSERT(pbComp + cb <= pbLimit);

                CopyMemory(pbDecomp, pbComp, cb);

                pbDecomp += cb;
                pbComp   += cb;
                break;

            case SPACES_CLASS:

                // (count - 1) in bits 4..7.
                bNextTokenSymbol = FALSE;

                cb = int(bCode >> 4) + 1;

                ASSERT(cb > 0);

                while (cb--) *pbDecomp++ = ' ';
                break;

            case NULL_CLASS:

                // (count - 1) in bits 4..7.
                bNextTokenSymbol = FALSE;

                cb = int(bCode >> 4) + 1;

                ASSERT(cb > 0);

                while (cb--) *pbDecomp++ = 0x00;
                break;
            }

            if (iIndex >= 0)
            {
                // The index must refer to an existing phrase. (The previous
                // assertion, iIndex > -1, could never fail.)
                ASSERT(UINT(iIndex) < m_cWeights);

                PWeightInfo pwi = m_pWeightInfo + iIndex;

                bNextTokenSymbol = pwi->fSymbol;

                // CompressString drops a single space between two symbol
                // tokens; put it back.
                if (bNextTokenSymbol && bPrevTokenSymbol) *pbDecomp++ = ' ';

                CopyMemory(pbDecomp, pwi->pbImage, pwi->cbImage);

                pbDecomp += pwi->cbImage;
            }

            bPrevTokenSymbol = bNextTokenSymbol;
        }

        ec= pbDecomp - pbStartDecomp;
    }
    __except(_exception_code() == STATUS_NO_MEMORY? EXCEPTION_EXECUTE_HANDLER : EXCEPTION_CONTINUE_SEARCH)
    {
        ec= OUT_OF_MEMORY;
    }

    return ec;
}
|
|
|
|
typedef struct _WeightConstructionState
|
|
{
|
|
PWeightInfo pwi;
|
|
|
|
} WeightConstructionState,
|
|
*PWeightConstructionState;
|
|
|
|
void CCompressTable::BuildWeightInfo(const BYTE *pbValue, UINT cbValue, void *pvTag, PVOID pvEnvironment)
|
|
{
|
|
#define pwcs PWeightConstructionState(pvEnvironment)
|
|
|
|
ASSERT(cbValue);
|
|
|
|
if (!*pbValue && cbValue < SPACE_TOKEN_LIMIT)
|
|
{
|
|
BOOL fAllNulls= TRUE;
|
|
const BYTE *pb= pbValue;
|
|
UINT cb= cbValue;
|
|
|
|
for (; --cb; ) if (*++pb) fAllNulls= FALSE;
|
|
|
|
if (fAllNulls) return;
|
|
}
|
|
|
|
if (' ' == *pbValue && cbValue < SPACE_TOKEN_LIMIT)
|
|
{
|
|
BOOL fAllSpaces= TRUE;
|
|
const BYTE *pb= pbValue;
|
|
UINT cb= cbValue;
|
|
|
|
for (; --cb; ) if (' ' != *++pb) fAllSpaces= FALSE;
|
|
|
|
if (fAllSpaces) return;
|
|
}
|
|
|
|
INT cRefs = *PINT(pvTag);
|
|
BOOL fSymbol = FALSE;
|
|
|
|
ASSERT(cRefs);
|
|
|
|
ASSERT(sizeof(ENCODE) == sizeof(INT));
|
|
|
|
PENCODE(pvTag)->fClass = LITERAL_CLASS;
|
|
|
|
if (cRefs > 0) fSymbol= SYMBOL_TOKEN;
|
|
else cRefs= - cRefs;
|
|
|
|
if (cRefs == 1) return;
|
|
|
|
PWeightInfo pwi = pwcs->pwi++;
|
|
|
|
pwi->pbImage = PBYTE(pbValue);
|
|
pwi->cbImage = cbValue;
|
|
pwi->uiWeight = cRefs * cbValue;
|
|
pwi->fSymbol = fSymbol;
|
|
|
|
#undef pwcs
|
|
}
|
|
|
|
// qsort comparator over an array of PWeightInfo: orders entries by
// descending uiWeight.
//
// The weights are compared explicitly rather than subtracted, because a
// difference that does not fit in an int would produce the wrong sign.
extern "C" int _cdecl WeightCompare(const void *pv1, const void *pv2)
{
    PWeightInfo pw1 = *((PWeightInfo *) pv1);
    PWeightInfo pw2 = *((PWeightInfo *) pv2);

    if (pw1->uiWeight > pw2->uiWeight) return -1;
    if (pw1->uiWeight < pw2->uiWeight) return  1;

    return 0;
}
|
|
|
|
// qsort comparator over an array of PWeightInfo: orders entries by their
// image text, compared case-insensitively over the common prefix length;
// when the prefixes match, the shorter image sorts first.
extern "C" int _cdecl WeightCompare2(const void *pv1, const void *pv2)
{
    PWeightInfo pwLeft  = *((PWeightInfo *) pv1);
    PWeightInfo pwRight = *((PWeightInfo *) pv2);

    int cbCommon = pwLeft->cbImage;

    if (!(pwLeft->cbImage < pwRight->cbImage)) cbCommon = pwRight->cbImage;

    int iOrder = _strnicmp((const char *) pwLeft->pbImage, (const char *) pwRight->pbImage, cbCommon);

    if (iOrder == 0)
    {
        return pwLeft->cbImage - pwRight->cbImage;
    }

    return iOrder;
}
|
|
|
|
// Builds the phrase table from the token statistics gathered by ScanString.
//
//   1. Preloads the hash table with the encodings for short runs of spaces
//      and of nulls.
//   2. Collects a WeightInfo record for every candidate token
//      (BuildWeightInfo) and ranks the records by descending weight.
//   3. Keeps at most 128 + 16K phrases: the top 128 get one-byte codes, the
//      rest two-byte codes. Each group is put in text order.
//   4. Copies the phrase images into m_pbImages, fills m_pWeightInfo, and
//      offers the encodings to the hash table.
//
// Returns 0 on success or NO_TEXT_SCANNED when the hash table is empty.
// Exceptions raised by the allocators are not handled here; the temporary
// buffers are released on the way out and the caller deals with the
// exception.
ERRORCODE CCompressTable::ConstructPhraseEncoding()
{
    ERRORCODE ec= 0;

    PWeightInfo  pwiBase     = NULL;
    PWeightInfo *papwi       = NULL;
    PWeightInfo  pWeightInfo = NULL;
    PBYTE        pbImages    = NULL;
    CAValRef    *pavr        = NULL;

    __try
    {
        UINT cItems= m_psht->EntryCount();

        if (!cItems) { ec= NO_TEXT_SCANNED; __leave; }

        pwiBase= PWeightInfo(VAlloc(FALSE, cItems * sizeof(WeightInfo)));

        // Now we'll preload the hash table with encoding for streams of
        // spaces and nulls.

        ASSERT(SPACE_TOKEN_LIMIT <= C_TOKEN_BLOCK);

        m_pavr->DiscardRefs();

        // Make sure the buffer really holds blanks for every prefix length
        // used below, mirroring what is done for chNulls.
        FillMemory(chSpaces, SPACE_TOKEN_LIMIT, ' ');

        INT iCount;

        for (iCount= 1; iCount < SPACE_TOKEN_LIMIT; ++iCount)
            m_pavr->AddValRef(PBYTE(chSpaces), iCount);

        m_psht->Assimilate(m_pavr, NULL, FnAddSpaces, FnAddSpaces);

        m_pavr->DiscardRefs();

        ZeroMemory(chNulls, SPACE_TOKEN_LIMIT);

        for (iCount= 1; iCount < SPACE_TOKEN_LIMIT; ++iCount)
            m_pavr->AddValRef(PBYTE(chNulls), iCount);

        m_psht->Assimilate(m_pavr, NULL, FnAddNulls, FnAddNulls);

        m_pavr->DiscardRefs();

        // Note! We must never add an item to the hash table after this
        // point. This is because the code below stores the addresses
        // of the hash table value strings. If we add items, those
        // strings may move around.

        WeightConstructionState wcs;

        wcs.pwi= pwiBase;

        m_psht->DumpAll(&wcs, CCompressTable::BuildWeightInfo);

        UINT cWeights= wcs.pwi - pwiBase;

        ASSERT(cWeights);

        // Sort an array of pointers rather than the records themselves.
        papwi= (PWeightInfo *) VAlloc(FALSE, cWeights * sizeof(PWeightInfo));

        UINT c;

        for (c= 0; c < cWeights; ++c) papwi[c]= pwiBase + c;

        // Heaviest phrases first ...
        qsort(papwi, cWeights, sizeof(PWeightInfo), WeightCompare);

        // ... then text order within the one-byte group and within the
        // two-byte group. Entries beyond the two groups are dropped below,
        // so they are not sorted.
        const UINT cLowCodes    = 128;
        const UINT cMediumCodes = 16 * 1024;

        UINT cLow= (cWeights > cLowCodes)? cLowCodes : cWeights;

        qsort(papwi, cLow, sizeof(PWeightInfo), WeightCompare2);

        if (cWeights > cLowCodes)
        {
            UINT cMedium= cWeights - cLowCodes;

            if (cMedium > cMediumCodes) cMedium= cMediumCodes;

            qsort(papwi + cLowCodes, cMedium, sizeof(PWeightInfo), WeightCompare2);
        }

        if (cWeights > cLowCodes + cMediumCodes) cWeights= cLowCodes + cMediumCodes;

        UINT cbImages= 0;

        for (c= 0; c < cWeights; ++c) cbImages += papwi[c]->cbImage;

        pavr= CAValRef::NewValRef(cWeights);

        pWeightInfo = PWeightInfo(VAlloc(FALSE, cWeights * sizeof(WeightInfo)));
        pbImages    = PBYTE      (VAlloc(FALSE, cbImages));

        ASSERT(!m_pWeightInfo);
        ASSERT(!m_pbImages);

        // The members own the arrays from here on; clearing the locals keeps
        // __finally from freeing them.
        m_pWeightInfo  = pWeightInfo; pWeightInfo = NULL;
        m_pbImages     = pbImages;    pbImages    = NULL;
        m_cWeights     = cWeights;
        m_cbImageTotal = cbImages;

        PWeightInfo pwiDest = m_pWeightInfo;
        PBYTE       pbDest  = m_pbImages;

        for (c= 0; c < cWeights; ++c)
        {
            PWeightInfo pwiSrc= papwi[c];

            CopyMemory(pbDest, pwiSrc->pbImage, pwiSrc->cbImage);

            pwiSrc->pbImage= pbDest;

            pbDest += pwiSrc->cbImage;

            *pwiDest++ = *pwiSrc;
        }

        // Now we've changed all the pbImage pointers in the weight info array
        // to point into the m_pbImages. We no longer need to keep the hash
        // table value addresses constant.

        PWeightInfo pwi= m_pWeightInfo;

        for (c= 0; c < cWeights; ++c, ++pwi)
        {
            pavr->AddValRef(pwi->pbImage, pwi->cbImage);

            if (c < cLowCodes)
            {
                // One-byte code: index in bits 1..7, bit 0 clear.
                pwi->enc.fClass    = NDX_LOW_CLASS | pwi->fSymbol;
                pwi->enc.abCode[0] = 0x0FF & (c << 1);
            }
            else
            {
                // Two-byte code for (index - 128): upper bits in bits 2..7
                // of the first byte (low bits 01), low 8 bits in the second
                // byte.
                UINT iExcess= c - cLowCodes;

                pwi->enc.fClass    = NDX_MEDIUM_CLASS | pwi->fSymbol;
                pwi->enc.abCode[1] = iExcess & 0x0FF;
                pwi->enc.abCode[0] = ((iExcess >> 6) & 0x3FC) | 0x01;
            }
        }

        // NOTE(review): elsewhere in this file the third Assimilate callback
        // handles tokens already in the table and the fourth handles new
        // ones. These phrases came from the table, so confirm that passing
        // NULL as the third callback still records their encodings.
        m_psht->Assimilate(pavr, m_pWeightInfo, NULL, FnAddTokens);

        ec= 0;

        __leave;
    }
    __finally
    {
        if (pwiBase    ) VFree(pwiBase    );
        if (papwi      ) VFree(papwi      );
        if (pWeightInfo) VFree(pWeightInfo);
        if (pbImages   ) VFree(pbImages   );

        if (pavr) delete pavr;
    }

    return ec;
}
|
|
|
|
// Assimilate callback that stores the encoding for a run of spaces in the
// token's tag. iValue is the position of the run in the reference list (run
// length minus one); it goes into the upper nibble of the code byte, and the
// lower nibble is 0x7.
void CCompressTable::FnAddSpaces(UINT iValue, PVOID pvTag, PVOID pv)
{
    ENCODE enc;

    ASSERT(iValue < 16);

    // Clear the bytes that are not assigned below so that no uninitialized
    // stack data ends up in the hash table.
    ZeroMemory(&enc, sizeof(enc));

    enc.fClass    = SPACES_CLASS;
    enc.abCode[0] = ((iValue & 0x0000000f) << 4) | 0x07;

    *PENCODE(pvTag) = enc;
}
|
|
|
|
// Assimilate callback that stores the encoding for a run of null bytes in the
// token's tag. iValue is the position of the run in the reference list (run
// length minus one); it goes into the upper nibble of the code byte, and the
// lower nibble is 0xf.
void CCompressTable::FnAddNulls(UINT iValue, PVOID pvTag, PVOID pv)
{
    ENCODE enc;

    ASSERT(iValue < 16);

    // Clear the bytes that are not assigned below so that no uninitialized
    // stack data ends up in the hash table.
    ZeroMemory(&enc, sizeof(enc));

    enc.fClass    = NULL_CLASS;
    enc.abCode[0] = ((iValue & 0x0000000f) << 4) | 0x0f;

    *PENCODE(pvTag) = enc;
}
|
|
|
|
// Assimilate callback used by ConstructPhraseEncoding: stores the encoding of
// phrase number iValue (taken from the WeightInfo array passed as pv) in the
// token's hash-table tag.
void CCompressTable::FnAddTokens(UINT iValue, PVOID pvTag, PVOID pv)
{
    PWeightInfo pwiPhrase = PWeightInfo(pv) + iValue;
    PENCODE     pencTag   = PENCODE(pvTag);

    *pencTag = pwiPhrase->enc;
}
|
|
|