#include "stdafx.h"
#include "resource.h"
#include "vmbuffer.h"
#include "saveload.h"
#include "ftslex.h"
#include "Memex.h"
#include "dict.h"

// bitmasks for bit manipulations
extern DWORD bitMask32[];
extern BYTE  bitMask8[];

// Top two bits of GetVersion(), sampled by the CDictionary constructor.
// The constructor re-samples for as long as the cached value is still 0.
UINT g_os_version= 0;

// Value of g_os_version for which EnterWord takes its "Chicago" code path.
#define OS_CHICAGO 0x03

// Constructors
/*************************************************************************
*  FUNCTION :   CDictionary::CDictionary
*
*  RETURNS :    NOTHING.
*
*  PURPOSE :    Puts a new dictionary object into the UNINSERTABLE state.
*
*  PARAMETERS : NONE.
*
*  SIDE EFFECTS : Sets g_os_version from GetVersion() if it is still 0.
*
*  DESCRIPTION : Clears every counter and every virtual-buffer base
*                pointer. No memory is allocated here; see Initial()
*                and StartDictInsertions().
*
*  HISTORY :
*
*************************************************************************/
CDictionary::CDictionary()
{
    // The following track the state of the dictionary.
    m_bDictState = UNINSERTABLE;
    m_cWordChars = 0;
    m_cStemChars = 0;
    m_cWords     = 0;
    m_cStopWords = 0;
    m_cStems     = 0;
    m_cMaxWords  = 0;
    m_cMaxStems  = 0;

    m_lpfnStemmer      = NULL;
    m_hStemmerInstance = NULL;

    // Initially we do not have the concept id and next word fields compressed.
    // Use the full DWORD to hold the values. Later, we will reduce this.
    m_fWordsCompressed = FALSE;
    m_cConceptIdBits   = m_cpNextWordBits = 8*sizeof(DWORD);

    m_fLoadedFromDisk = FALSE;

    // ADDED TO TEST "WORDS OF COMMON STEM"
    m_ConIdInContext = m_LastOccurrenceOfConId = EOL;

    m_vbpImage.Base = m_vbConceptId.Base = m_vbpNextWord.Base = m_vbStems.Base = m_vbWordHashBuckets.Base = NULL;
    m_vbStemHashBuckets.Base = m_vbWordBuffer.Base = m_vbStemBuffer.Base = 0;

    // The destructor tests these two base pointers before freeing them, so
    // they must have a defined value even if Initial() is never reached.
    m_vbpCopyOfWord.Base = m_vbpCopyOfWord2.Base = NULL;

    if (!g_os_version)
        g_os_version = (GetVersion() >> 30) & 0x0003;
}

// Creates the two scratch buffers that EnterWord uses to build the
// lower-cased copy of a word: 2*256 bytes requested up front and
// 2*0xFFFF bytes as the upper limit for each buffer.
void CDictionary::Initial()
{
    CreateVirtualBuffer(&m_vbpCopyOfWord,  2*256, 2*0xFFFF);
    CreateVirtualBuffer(&m_vbpCopyOfWord2, 2*256, 2*0xFFFF);
}

// Builds an empty, insertable dictionary and, when fLoadStopWords is TRUE,
// enters the space-separated stop words from string resource IDS_STOPLIST.
//
// Returns the new dictionary. If the guarded section terminates abnormally
// the partially built dictionary is deleted in the __finally handler before
// the exception continues to propagate.
CDictionary *CDictionary::NewDictionary(BOOL fLoadStopWords)
{
    CDictionary *pDict         = NULL;
    char        *pszStopWords  = NULL;
    PWCHAR       pszWStopWords = NULL;

    extern HINSTANCE hinstDLL;

    __try
    {
        pDict= New CDictionary;

        pDict->Initial();

        // BugBug : Find a way to come up with reasonable limits on the
        // number of unique words, number of documents etc. Until then, use reasonably
        // large values.
        // 1st arg is estimated # of words, 2nd arg is maximum number of words
        // 3rd arg is estimated # of characters in unique words
        // 4th arg is maximum # of characters in unique words
        // Estimates are used to commit memory and maximums are used to reserve memory
        pDict->StartDictInsertions(1024, 2000000, 10000, 10000000);

        if (!fLoadStopWords) __leave;

        PWCHAR pwStopWord;

#if 0
        UINT uErr= 0;
        HRSRC hrsrc= FindResource(hinstDLL, MAKEINTRESOURCE(IDS_STOPLIST), RT_STRING);
        uErr= GetLastError();
        UINT cbStopList= SizeofResource(hinstDLL, hrsrc);
        uErr= GetLastError();
        ASSERT(cbStopList);
#else // 0
        UINT cbStopList= 8192;
#endif // 0

        if (cbStopList)
        {
            ++cbStopList; // To account for the trailing null.

            pszStopWords  = (char *) VAlloc(FALSE, cbStopList );
            pszWStopWords = (PWCHAR) VAlloc(FALSE, cbStopList * sizeof(WCHAR) );

            int cchLoaded = LoadString(hinstDLL, IDS_STOPLIST, pszStopWords, cbStopList);

            // Enter stop words only when you have them.
            // Convert just the loaded, null-terminated string (-1) instead of
            // the whole buffer: the bytes after the terminator were never
            // written by LoadString. The wide buffer has one WCHAR per source
            // byte, which is always enough for the result and its terminator.
            if (cchLoaded && MultiByteToWideChar(GetACP(), 0, pszStopWords, -1, pszWStopWords, cbStopList))
            {
                const WCHAR wSpace = (WCHAR) (BYTE) ' ';

                pwStopWord = pszWStopWords;

                for (;;)
                {
                    // Skip the blanks in front of the next word.
                    while (*pwStopWord && *pwStopWord == wSpace)
                        ++pwStopWord;

                    if (!*pwStopWord) break;

                    // Find the end of the word: the next blank or the terminator.
                    PWCHAR pwLimit = pwStopWord;

                    while (*pwLimit && *pwLimit != wSpace)
                        ++pwLimit;

                    pDict->EnterWord(pwStopWord, (WORD) (pwLimit - pwStopWord), TRUE);

                    pwStopWord = pwLimit;
                }
            }
        }
    }
    __finally
    {
        if (_abnormal_termination() && pDict)
        {
            delete pDict;
            pDict= NULL;
        }

        if (pszStopWords)
        {
            VFree(pszStopWords);
            pszStopWords = NULL;
        }

        if (pszWStopWords)
        {
            VFree(pszWStopWords);
            pszWStopWords = NULL;
        }
    }

    return pDict;
}

// Destructor
/*************************************************************************
*  FUNCTION :   CDictionary::~CDictionary
*
*  RETURNS :    NOTHING.
*
*  PURPOSE :    Cleans up after the class.
*
*  PARAMETERS : NONE.
*
*  SIDE EFFECTS :  All memory allocations are freed.
* * * * DESCRIPTION : * * * * HISTORY : * * * * Author Date Action * * ------ ---- ------ * * * * KrishnaN 4/23/94 Creation. * * * *************************************************************************/ CDictionary::~CDictionary() { // free any allocated memory if (m_vbpCopyOfWord.Base) FreeVirtualBuffer(&m_vbpCopyOfWord); if (m_vbpCopyOfWord2.Base) FreeVirtualBuffer(&m_vbpCopyOfWord2); // If we were loaded from disk, we do not need to free the remaining objects. if (m_fLoadedFromDisk) return; if (m_vbpImage.Base) FreeVirtualBuffer(&m_vbpImage); if (m_vbConceptId.Base) FreeVirtualBuffer(&m_vbConceptId); if (m_vbpNextWord.Base) FreeVirtualBuffer(&m_vbpNextWord); if (m_vbStems.Base) FreeVirtualBuffer(&m_vbStems); if (m_vbWordHashBuckets.Base) FreeVirtualBuffer(&m_vbWordHashBuckets); if (m_vbStemHashBuckets.Base) FreeVirtualBuffer(&m_vbStemHashBuckets); if (m_vbWordBuffer.Base) FreeVirtualBuffer(&m_vbWordBuffer); if (m_vbStemBuffer.Base) FreeVirtualBuffer(&m_vbStemBuffer); } /************************************************************************* * FUNCTION : CDictionary::StartDictInsertions * * * * RETURNS : * * * * PURPOSE : Allocates memory to enable insertions into the dictionary. * * * * PARAMETERS : * * * * SIDE EFFECTS : * * * * DESCRIPTION : * * * * HISTORY : * * * * Author Date Action * * ------ ---- ------ * * * * KrishnaN 4/23/94 Creation. 
* * * *************************************************************************/ // Access Functions: VOID CDictionary::StartDictInsertions(DWORD cInEstWords, DWORD cInMaxWords, DWORD cInEstWordBufferSize, DWORD cInMaxWordBufferSize) { char lpStemmer[15]; ASSERT(m_fWordsCompressed == FALSE); // Construct the stemmer name from the language id from the user's default locale wsprintf(lpStemmer, "STEM%04X.DLL", LANGIDFROMLCID(GetUserDefaultLCID())); m_hStemmerInstance = LoadLibrary(lpStemmer); if (m_hStemmerInstance) m_lpfnStemmer = (FPSTEMMER)GetProcAddress(m_hStemmerInstance, "Stemmer"); // This routine is used to transition the dictionary from an DICT_UNUSABLE entity to a DICT_USABLE // entity. Any other use is not permitted. ASSERT(m_bDictState == UNINSERTABLE); // validate input ASSERT(cInMaxWords != 0 && cInMaxWordBufferSize != 0 && cInMaxWords >= cInEstWords && cInMaxWordBufferSize >= cInEstWordBufferSize); CreateVirtualBuffer(&m_vbpImage , cInEstWords * sizeof(DWORD ), cInMaxWords * sizeof(DWORD )); CreateVirtualBuffer(&m_vbConceptId , cInEstWords * sizeof(DWORD ), cInMaxWords * sizeof(DWORD )); CreateVirtualBuffer(&m_vbpNextWord , cInEstWords * sizeof(DWORD ), cInMaxWords * sizeof(DWORD )); CreateVirtualBuffer(&m_vbStems , cInEstWords * sizeof(StemStruct), cInMaxWords * sizeof(StemStruct)); CreateVirtualBuffer(&m_vbWordHashBuckets, HASHTABLE_SIZE * sizeof(DWORD ), HASHTABLE_SIZE * sizeof(DWORD )); CreateVirtualBuffer(&m_vbStemHashBuckets, HASHTABLE_SIZE * sizeof(DWORD ), HASHTABLE_SIZE * sizeof(DWORD )); // IMPORTANT : The buffer sizes for words and stems are in characters. Since we need to allocate space in number // of bytes, and since there is no way of knowing how many bytes all the strings will occupy // (DBCS enabled characters can be one or two bytes long, we will allocate 2 bytes for each character. // DOCUMENT THIS FACT. 
CreateVirtualBuffer(&m_vbWordBuffer, 2*cInEstWordBufferSize, 2*cInMaxWordBufferSize); CreateVirtualBuffer(&m_vbStemBuffer, 2*cInEstWordBufferSize, 2*cInMaxWordBufferSize); // Initialize the allocated memory // VritualAlloc zeroes all memory it commits, so we don't have to worry about zeroing the virtual buffers // all hash buckets initially have EOL = 0xFFFFFFFF to indicate that they have nothing in the list memset(m_vbWordHashBuckets.Base, 0xFF, HASHTABLE_SIZE * sizeof(DWORD)); memset(m_vbStemHashBuckets.Base, 0xFF, HASHTABLE_SIZE * sizeof(DWORD)); // Successful memory allocation. The dictionary is now ready for insertions. m_bDictState = INSERTABLE; // <:=) } /************************************************************************* * FUNCTION : * * * * RETURNS : * * * * PURPOSE : * * * * PARAMETERS : * * * * SIDE EFFECTS : * * * * DESCRIPTION : * * * * HISTORY : * * * * Author Date Action * * ------ ---- ------ * * * * KrishnaN 4/23/94 Creation. * * * *************************************************************************/ VOID CDictionary::EndDictInsertions() { ASSERT(m_fWordsCompressed == FALSE); ASSERT(m_bDictState == INSERTABLE); if (m_hStemmerInstance) FreeLibrary(m_hStemmerInstance); if (m_vbpCopyOfWord.Base) FreeVirtualBuffer(&m_vbpCopyOfWord); if (m_vbpCopyOfWord2.Base) FreeVirtualBuffer(&m_vbpCopyOfWord2); m_bDictState = STORABLE; // Get rid of the memory used for the stems. All dictionary look up // in the future will be based on words in the documents/query. FreeVirtualBuffer(&m_vbStems ); FreeVirtualBuffer(&m_vbStemHashBuckets); FreeVirtualBuffer(&m_vbStemBuffer ); // Now is the time to get rid of any over committed memory. // Compress the ConceptId field ASSERT(m_fWordsCompressed == FALSE); BYTE m, bitPos, i, highBitPos; DWORD dwIndex, cByte, dwValue; // First figure out the number of bits we need. Do this by finding // the logbase2 of m_cStems. // Account for the case where m_cStems is 0. That could happen! 
m = 0; if (m_cStems) for (; m < 32 && !(bitMask32[m] & m_cStems); m++); ASSERT(m < 32); m_cConceptIdBits = 32 - m; for (dwIndex = cByte = 0, bitPos = 0; dwIndex < m_cWords; dwIndex++) { // get the dwIndex'th Concept Id and hold on to it. Then zero out that location. dwValue = ConceptId(dwIndex); // encode a STOPWORD as m_cStems. Since all the valid concept id values are from 0 to m_cStems - 1, // using m_cStems for this abnormal value will not be a problem. if (dwValue == STOPWORD) dwValue = m_cStems; else ASSERT(dwValue < m_cStems); ConceptId(dwIndex) = 0L; // now code the dwValue in the stream. for (highBitPos = m, i = 0; i < m_cConceptIdBits; i++) { if (bitMask32[highBitPos++] & dwValue) // if true, we have a 1 bit ConceptStreamByte(cByte) |= bitMask8[bitPos]; /* WE DO NOT HAVE TO ADD A 0 BIT, BECAUSE WE ALREADY ZEROED OUT THE ENTIRE THING. else // we have a 0 bit ConceptStreamByte(cByte) &= ~bitMask8[bitPos]; */ bitPos = (bitPos + 1) % 8; if (bitPos == 0) cByte++; } } // Compress the pNextWord field // First find the number of bits needed to represent all the values. // Account for the case where m_cWords is 0. That could happen! m = 0; if (m_cWords) for (m = 0; m < 32 && !(bitMask32[m] & m_cWords); m++); ASSERT(m < 32); m_cpNextWordBits = 32 - m; for (dwIndex = cByte = 0, bitPos = 0; dwIndex < m_cWords; dwIndex++) { // get the dwIndex'th Concept Id and hold on to it. Then zero out that location. dwValue = pNextWord(dwIndex); // encode EOL as m_cWords. Since all the valid word values are from 0 to m_cWords - 1, // using m_cWords for this abnormal value will not be a problem. if (dwValue == EOL) dwValue = m_cWords; else ASSERT(dwValue < m_cWords); pNextWord(dwIndex) = 0L; // now code the dwValue in the stream. for (highBitPos = m, i = 0; i < m_cpNextWordBits; i++) { if (bitMask32[highBitPos++] & dwValue) // if true, we have a 1 bit pNextWordStreamByte(cByte) |= bitMask8[bitPos]; /* WE DO NOT HAVE TO ADD A 0 BIT, BECAUSE WE ALREADY ZEROED OUT THE ENTIRE THING. 
else // we have a 0 bit pNextWordStreamByte(cByte) &= ~bitMask8[bitPos]; */ bitPos = (bitPos + 1) % 8; if (bitPos == 0) cByte++; } } m_fWordsCompressed = TRUE; } /************************************************************************* * FUNCTION : * * * * RETURNS : * * * * PURPOSE : * * * * PARAMETERS : * * * * SIDE EFFECTS : * * * * DESCRIPTION : * * * * HISTORY : * * * * Author Date Action * * ------ ---- ------ * * * * KrishnaN 4/23/94 Creation. * * * *************************************************************************/ DWORD CDictionary::EnterWord(PWCHAR pWord, WORD cCharsInWord, BOOL fStopWord, BOOL fLookup) { DWORD dwHashKey; DWORD pNextWord, pCurrWord; if (cCharsInWord == 0) return STOPWORD; __try { ZeroMemory(m_vbpCopyOfWord.Base, (cCharsInWord + 1) << 1); if (g_os_version == OS_CHICAGO) { __try { // zero out the word that follows the string ZeroMemory(m_vbpCopyOfWord2.Base, (cCharsInWord + 1)<< 1); } __except (VirtualBufferExceptionFilter(GetExceptionCode(), GetExceptionInformation(), &m_vbpCopyOfWord2)) { RaiseException(STATUS_NO_MEMORY, EXCEPTION_NONCONTINUABLE, 0, NULL); } // The only way in chicago to convert a Unicode string to lower case is to first convert // it to multibyte, use LCMapStringA and then convert it back to Unicode using MultiByteToWideChar. 
WideCharToMultiByte(GetACP(), NULL, pWord, cCharsInWord, (LPSTR)m_vbpCopyOfWord2.Base, (cCharsInWord + 1) << 1, NULL, NULL); /* int i = LCMapStringA(GetUserDefaultLCID(), LCMAP_LOWERCASE, (LPSTR)m_vbpCopyOfWord2.Base, cCharsInWord, (LPSTR)m_vbpCopyOfWord2.Base, cCharsInWord); */ CharLowerBuff((LPSTR)m_vbpCopyOfWord2.Base, cCharsInWord); MultiByteToWideChar(GetACP(), NULL, (LPSTR)m_vbpCopyOfWord2.Base, cCharsInWord, (PWCHAR)m_vbpCopyOfWord.Base, cCharsInWord); } else { // zero out the word that follows the string ZeroMemory(m_vbpCopyOfWord.Base, (cCharsInWord + 1)<< 1); // copy the string CopyMemory(m_vbpCopyOfWord.Base, (LPVOID)pWord, cCharsInWord << 1); CharLowerBuffW((PWCHAR)m_vbpCopyOfWord.Base, cCharsInWord); } } __except (VirtualBufferExceptionFilter(GetExceptionCode(), GetExceptionInformation(), &m_vbpCopyOfWord)) { RaiseException(STATUS_NO_MEMORY, EXCEPTION_NONCONTINUABLE, 0, NULL); } // Get the pointer to the first word entry in the collision resolution chain for this bucket dwHashKey = HASHMASK & ComputeHashKey((PWCHAR)m_vbpCopyOfWord.Base, cCharsInWord); pNextWord = WordHashBucket(dwHashKey); if (pNextWord == EOL && fLookup) // Are we only looking for a word? return EOL; else if (pNextWord == EOL) // If we are not looking up, we are entering a word. { ASSERT(m_fWordsCompressed == FALSE); // words haven't yet been compressed // Make the first entry for the resolution chain for this hash bucket. WordHashBucket(dwHashKey) = pNextWord = AddWordToDict((PWCHAR)m_vbpCopyOfWord.Base, cCharsInWord); // For words that are not stop words, we need to get a concept (by stemming) id if (fStopWord) m_cStopWords++; else { // stem the word in place and assign the concept id to the word. 
StemWord((PWCHAR)m_vbpCopyOfWord.Base, cCharsInWord); ConceptId(pNextWord) = EnterStem((PWCHAR)m_vbpCopyOfWord.Base); } return ConceptId(pNextWord); } // Walk the collision resolution chain for this hash bucket to find the word while (pNextWord != EOL && wcscmp((PWCHAR)m_vbpCopyOfWord.Base, (PWCHAR)m_vbWordBuffer.Base + GetpImage(pNextWord))) { pCurrWord = pNextWord; pNextWord = GetpNextWord(pNextWord); } if (pNextWord == EOL && fLookup) return EOL; else if (pNextWord == EOL) { ASSERT(m_fWordsCompressed == FALSE); // The word doesn't exist in the chain // Make an entry at the tail of the resolution chain for this hash bucket. pNextWord(pCurrWord) = pNextWord = AddWordToDict((PWCHAR)m_vbpCopyOfWord.Base, cCharsInWord); // For words that are not stop words, we need to get a concept (by stemming) id if (!fStopWord) { // stem the word in place and assign the concept id to the word. StemWord((PWCHAR)m_vbpCopyOfWord.Base, cCharsInWord);; ConceptId(pNextWord) = EnterStem((PWCHAR)m_vbpCopyOfWord.Base); } return ConceptId(pNextWord); } // The word already exists. Return the concept id! 
return GetConceptId(pNextWord); } VOID CDictionary::StoreImage(CPersist *pDiskImage) { EndDictInsertions(); DictHdr *pdh = (DictHdr *) (pDiskImage->ReserveTableSpace(sizeof(DictHdr))); pdh->cWordChars = m_cWordChars; pdh->offWordChars = pDiskImage->NextOffset(); pDiskImage->WriteWords(PWCHAR(m_vbWordBuffer.Base), m_cWordChars); pdh->cHashBuckets = HASHTABLE_SIZE; pdh->offHashBuckets = pDiskImage->NextOffset(); pDiskImage->WriteDWords(PUINT(m_vbWordHashBuckets.Base), HASHTABLE_SIZE); pdh->cWords = m_cWords; pdh->cConceptIdBits = m_cConceptIdBits; pdh->cpNextWordBits = m_cpNextWordBits; pdh->offpImage = pDiskImage->NextOffset(); pDiskImage->WriteDWords(PUINT(m_vbpImage.Base), m_cWords); pdh->offConceptId = pDiskImage->NextOffset(); pDiskImage->WriteBytes(PBYTE(m_vbConceptId.Base), (m_cWords*m_cConceptIdBits + 7) / 8); pdh->offpNextWord = pDiskImage->NextOffset(); pDiskImage->WriteBytes(PBYTE(m_vbpNextWord.Base), (m_cWords*m_cpNextWordBits + 7) / 8); pdh->cStems = m_cStems; pdh->cStopWords = m_cStopWords; } CDictionary * CDictionary::CreateImage(CPersist *pDiskImage) { CDictionary *pdict= NULL; DictHdr *pdh = (DictHdr *) (pDiskImage->ReserveTableSpace(sizeof(DictHdr))); __try { pdict= New CDictionary; pdict->Initial(); pdict->ConnectImage(pdh, pDiskImage); } __finally { if (_abnormal_termination() && pdict) { delete pdict; pdict= NULL; } } return pdict; } VOID CDictionary::ConnectImage(DictHdr *pdh, CPersist *pDiskImage) { m_cWords = pdh->cWords; m_cWordChars = pdh->cWordChars; m_cStems = pdh->cStems; m_cStopWords = pdh->cStopWords; m_fWordsCompressed = m_fLoadedFromDisk = TRUE; m_cConceptIdBits = pdh->cConceptIdBits; m_cpNextWordBits = pdh->cpNextWordBits; m_vbWordHashBuckets.Base = LPVOID(pDiskImage->LocationOf(pdh->offHashBuckets)); m_vbpImage.Base = LPVOID(pDiskImage->LocationOf(pdh->offpImage)); m_vbConceptId.Base = LPVOID(pDiskImage->LocationOf(pdh->offConceptId)); m_vbpNextWord.Base = LPVOID(pDiskImage->LocationOf(pdh->offpNextWord)); m_vbWordBuffer.Base = 
LPVOID(pDiskImage->LocationOf(pdh->offWordChars)); // ready to use! m_bDictState = DICT_USABLE; } /************************************************************************* * FUNCTION : * * * * RETURNS : * * * * PURPOSE : * * * * PARAMETERS : * * * * SIDE EFFECTS : * * * * DESCRIPTION : * * * * HISTORY : * * * * Author Date Action * * ------ ---- ------ * * * * KrishnaN 4/23/94 Creation. * * * *************************************************************************/ // Internal functions. __inline void CDictionary::StemWord(PWCHAR pWord, WORD cCharsInWord) { if (m_lpfnStemmer) m_lpfnStemmer(pWord, cCharsInWord); } /************************************************************************* * FUNCTION : * * * * RETURNS : * * * * PURPOSE : * * * * PARAMETERS : * * * * SIDE EFFECTS : * * * * DESCRIPTION : * * * * HISTORY : * * * * Author Date Action * * ------ ---- ------ * * * * KrishnaN 4/23/94 Creation. * * * *************************************************************************/ _inline DWORD CDictionary::ComputeHashKey(PWCHAR Word, WORD cCharsInWord) { register WORD i; register DWORD hv; PWCHAR pString; pString = Word; hv = -(*pString); for (i = 1; i < cCharsInWord; i++) { pString++; hv = _rotl(hv, 5) - *pString; } return hv; } /************************************************************************* * FUNCTION : * * * * RETURNS : * * * * PURPOSE : * * * * PARAMETERS : * * * * SIDE EFFECTS : * * * * DESCRIPTION : * * * * HISTORY : * * * * Author Date Action * * ------ ---- ------ * * * * KrishnaN 4/23/94 Creation. * * * *************************************************************************/ DWORD CDictionary::EnterStem(PWCHAR pStem) { DWORD dwHashKey; WORD cCharsInStem; DWORD pNextStem, pCurrStem; DWORD ConceptId; ASSERT(m_fWordsCompressed == FALSE); // THE CONCEPT ID ASSOCIATED WITH A STEM IS THE INDEX OF THAT STEM IN THE STEMSTRUCT ARRAY. // SINCE pNextStem IS THE INDEX OF A STEM, RETURNING THAT IS EQUIVALENT TO RETURNING THE // CONCEPT ID. 
cCharsInStem = wcslen(pStem); dwHashKey = HASHMASK & ComputeHashKey(pStem, cCharsInStem); // Get the pointer to the first stem entry in the collision resolution chain for this bucket pNextStem = StemHashBucket(dwHashKey); // Add the stem if it doesn't already exist if (pNextStem == EOL) { // Make the first entry for the resolution chain for this hash bucket.. StemHashBucket(dwHashKey) = ConceptId = AddStemToDict(pStem, cCharsInStem); return ConceptId; } // Walk the collision resolution chain for this hash bucket to find the stem while (pNextStem != EOL && wcscmp(pStem, (PWCHAR)m_vbStemBuffer.Base + Stem(pNextStem)->pImage)) { pCurrStem = pNextStem; pNextStem = Stem(pNextStem)->pNextStem; } if (pNextStem == EOL) { // The stem doesn't exist in the chain. // Make an entry at the tail end of the resolution chain for this hash bucket. Stem(pCurrStem)->pNextStem = ConceptId = AddStemToDict(pStem, cCharsInStem); return ConceptId; } else // The current stem already exists in the stem dictionary. Return the concept id. return pNextStem; } /************************************************************************* * FUNCTION : * * * * RETURNS : * * * * PURPOSE : * * * * PARAMETERS : * * * * SIDE EFFECTS : * * * * DESCRIPTION : * * * * HISTORY : * * * * Author Date Action * * ------ ---- ------ * * * * KrishnaN 4/23/94 Creation. * * * *************************************************************************/ DWORD CDictionary::AddWordToDict(PWCHAR pWord, WORD cCharsInWord) { ASSERT(m_fWordsCompressed == FALSE); __try { pImage(m_cWords) = m_cWordChars; } __except (VirtualBufferExceptionFilter(GetExceptionCode(), GetExceptionInformation(), &m_vbpImage)) { RaiseException(STATUS_NO_MEMORY, EXCEPTION_NONCONTINUABLE, 0, NULL); } __try { // mark it as a stop word. if it is not a stopword, the code that calls this routine will over // write this field, so we won't have to worry about it. 
ConceptId(m_cWords) = STOPWORD; } __except (VirtualBufferExceptionFilter(GetExceptionCode(), GetExceptionInformation(), &m_vbConceptId)) { RaiseException(STATUS_NO_MEMORY, EXCEPTION_NONCONTINUABLE, 0, NULL); } __try { // mark it as a stop word. if it is not a stopword, the code that calls this routine will over // write this field, so we won't have to worry about it. pNextWord(m_cWords) = EOL; } __except (VirtualBufferExceptionFilter(GetExceptionCode(), GetExceptionInformation(), &m_vbpNextWord)) { RaiseException(STATUS_NO_MEMORY, EXCEPTION_NONCONTINUABLE, 0, NULL); } __try { wcscpy((PWCHAR)m_vbWordBuffer.Base + m_cWordChars, pWord); } __except (VirtualBufferExceptionFilter(GetExceptionCode(), GetExceptionInformation(), &m_vbWordBuffer)) { RaiseException(STATUS_NO_MEMORY, EXCEPTION_NONCONTINUABLE, 0, NULL); } m_cWordChars += cCharsInWord + 1; // 1 accounts for the string terminator. m_cWords++; // this return value is placed in the pNextWord pointer of the node before this node. return (m_cWords - 1); } /************************************************************************* * FUNCTION : * * * * RETURNS : * * * * PURPOSE : * * * * PARAMETERS : * * * * SIDE EFFECTS : * * * * DESCRIPTION : * * * * HISTORY : * * * * Author Date Action * * ------ ---- ------ * * * * KrishnaN 4/23/94 Creation. 
* * * *************************************************************************/ DWORD CDictionary::AddStemToDict(PWCHAR pStem, WORD cCharsInStem) { ASSERT(m_fWordsCompressed == FALSE); __try { Stem(m_cStems)->pImage = 2*m_cStemChars; Stem(m_cStems)->pNextStem = EOL; } __except (VirtualBufferExceptionFilter(GetExceptionCode(), GetExceptionInformation(), &m_vbStems)) { RaiseException(STATUS_NO_MEMORY, EXCEPTION_NONCONTINUABLE, 0, NULL); } __try { wcscpy((PWCHAR)m_vbStemBuffer.Base + 2*m_cStemChars, pStem); } __except (VirtualBufferExceptionFilter(GetExceptionCode(), GetExceptionInformation(), &m_vbStemBuffer)) { RaiseException(STATUS_NO_MEMORY, EXCEPTION_NONCONTINUABLE, 0, NULL); } m_cStemChars += cCharsInStem + 1; // 1 accounts for the string terminator. m_cStems++; // this return value is placed in the pNextStem pointer of the node before this node. return (m_cStems - 1); } // ADDED TO SUPPORT "WORDS OF COMMON STEM" DWORD CDictionary::GetWordCountOfConcept(DWORD dwConId) { ASSERT(m_fWordsCompressed); // When the caller passes EOL for dwConId, we return the number of words in the // dictionary. This feature has been added to optimize the search time for // words of the same stem. if (dwConId == EOL) return m_cWords; if (dwConId > m_cStems) return 0; DWORD i, j; for (i = j = 0; i < m_cWords; i++) if (GetConceptId(i) == dwConId) j++; return j; } PWCHAR CDictionary::GetFirstWordOfConcept(DWORD dwConId) { DWORD i; ASSERT(m_fWordsCompressed); if (dwConId == EOL) { m_LastOccurrenceOfConId = 0; return ((PWCHAR)m_vbWordBuffer.Base + GetpImage(m_LastOccurrenceOfConId)); } for (i = 0; i < m_cWords; i++) if (GetConceptId(i) == dwConId) { m_ConIdInContext = dwConId; m_LastOccurrenceOfConId = i; return ((PWCHAR)m_vbWordBuffer.Base + GetpImage(i)); } // could not find a word with this concept id m_ConIdInContext = m_LastOccurrenceOfConId = EOL; return NULL; } PWCHAR CDictionary::GetNextWordOfConcept(DWORD dwConId) { // When given a EOL, simply return the next word. 
if (dwConId == EOL) { ASSERT(m_LastOccurrenceOfConId < m_cWords); m_LastOccurrenceOfConId++; return ((PWCHAR)m_vbWordBuffer.Base + GetpImage(m_LastOccurrenceOfConId)); } // If we are asked to get the next occurrence of this conid, make sure we were tracking it if (dwConId != m_ConIdInContext) return NULL; DWORD i; for (i = m_LastOccurrenceOfConId+1; i < m_cWords; i++) if (GetConceptId(i) == dwConId) { m_LastOccurrenceOfConId = i; return ((PWCHAR)m_vbWordBuffer.Base + GetpImage(i)); } return NULL; } DWORD CDictionary::GetpImage(DWORD i) { // This is currently not compressed return *((LPDWORD)m_vbpImage.Base + i); } DWORD CDictionary::GetConceptId(DWORD i) { if (!m_fWordsCompressed) return *((LPDWORD)m_vbConceptId.Base + i); LPBYTE pb = ((LPBYTE)m_vbConceptId.Base + i*m_cConceptIdBits / 8); DWORD dwConId = 0; BYTE index; BYTE bitPos = BYTE(i*m_cConceptIdBits % 8); // If true, place a 1 bit in the lowest bit position // If false, you already have a 0 bit in the lowest bit position if (*pb & bitMask8[bitPos]) dwConId |= bitMask32[31]; for ( index = 1; index < m_cConceptIdBits; index++ ) { bitPos = (bitPos + 1) % 8; if (bitPos == 0) pb++; dwConId <<= 1; // If true, place a 1 bit in the lowest bit position // If false, you already have a 0 bit in the lowest bit position if (*pb & bitMask8[bitPos]) dwConId |= bitMask32[31]; } // If we have a stopword, return STOPWORD return ( (dwConId == m_cStems) ? 
STOPWORD : dwConId ); } DWORD CDictionary::GetpNextWord(DWORD i) { if (!m_fWordsCompressed) return *((LPDWORD)m_vbpNextWord.Base + i); LPBYTE pb = ((LPBYTE)m_vbpNextWord.Base + i*m_cpNextWordBits / 8); DWORD dwNextWord = 0; BYTE index; BYTE bitPos = BYTE(i*m_cpNextWordBits % 8); // If true, place a 1 bit in the lowest bit position // If false, you already have a 0 bit in the lowest bit position if (*pb & bitMask8[bitPos]) dwNextWord |= bitMask32[31]; for ( index = 1; index < m_cpNextWordBits; index++ ) { bitPos = (bitPos + 1) % 8; if (bitPos == 0) pb++; dwNextWord <<= 1; // If true, place a 1 bit in the lowest bit position // If false, you already have a 0 bit in the lowest bit position if (*pb & bitMask8[bitPos]) dwNextWord |= bitMask32[31]; } return ( (dwNextWord == m_cWords) ? EOL : dwNextWord ); }