|
|
/*************************************************************************
* * * COLLECT.C * * * * Copyright (C) Microsoft Corporation 1990-1994 * * All Rights reserved. * * * ************************************************************************** * * * Module Intent * * * * This modules is the first stage in the index building process. The * * primary functoin of stage 1 is to collect and sort all of the words * * to be indexed. Before processing can begin, the user must call * * IndexInitiate to initialize the indexing variables (IPB). Words are * * added via a call to IndexAddWord and are stored in a Balanced Tree * * until an OOM condition occurrs. The tree is dumped and reset to * * receive further words. * * * ************************************************************************** * * * Current Owner: BinhN * * * **************************************************************************/ #include <mvopsys.h>
#include <mem.h>
#include <memory.h>
#include <math.h>
#include <orkin.h>
#include <mvsearch.h>
#include "common.h"
#include "index.h"
#ifdef _DEBUG
static BYTE NEAR s_aszModule[] = __FILE__; /* Used by error return functions.*/ #endif
#define MAX_OCCDATA 5
#define ISBUFFER_SIZE 0xFFFC // Size of OUTPUT buffers for collect2.c
// The output is DWORD aligned
// And the buffer *MUST* BE a multiple of 4
// Min size: size of largest index word
#define MIN_REQUIRED_MEM 0x400000 // 4-meg minimum
/*************************************************************************
* * INTERNAL PUBLIC FUNCTIONS * * All of them should be declared far, unless we know they belong to * the same segment. They should be included in some include file * *************************************************************************/
PUBLIC VOID FAR PASCAL FreeISI (LPIPB); PUBLIC void FAR PASCAL FreeEsi (LPIPB);
/*************************************************************************
* * INTERNAL PRIVATE FUNCTIONS * *************************************************************************/
PRIVATE PBTNODE NEAR PASCAL AddNode (_LPIPB, LST, LPOCC, PHRESULT); PRIVATE HRESULT NEAR PASCAL AddTopic (_LPIPB, PSTRDATA, LPOCC); PRIVATE void NEAR PASCAL AddOccurrence (PTOPICDATA, POCCDATA, int); PRIVATE HRESULT NEAR PASCAL WriteBuffer (_LPIPB, LPB); PRIVATE HRESULT NEAR PASCAL TraverseWrite (_LPIPB, PBTNODE, int); PRIVATE void NEAR PASCAL BalanceTree (LPISI, PBTNODE); PRIVATE void NEAR PASCAL LeftRotate (LPISI, PBTNODE); PRIVATE void NEAR PASCAL RightRotate (LPISI, PBTNODE); PRIVATE HRESULT PASCAL NEAR IndexBlockAllocate (LPIPB lpipb, LONG lMemSize); PRIVATE void NEAR PASCAL VerifyTree (PBTNODE pRoot);
/*************************************************************************
* * PUBLIC API FUNCTIONS * * All of them should be declared far and included in some .DEF file * *************************************************************************/
PUBLIC LPIPB EXPORT_API FAR PASCAL MVIndexInitiate(PINDEXINFO pIndexInfo, PHRESULT phr); PUBLIC void EXPORT_API FAR PASCAL MVIndexDispose (LPIPB); PUBLIC HRESULT EXPORT_API FAR PASCAL MVIndexAddWord (LPIPB, LST, LPOCC); PUBLIC LPDWORD EXPORT_API PASCAL FAR TotalIndexedWord (LPIPB);
/*************************************************************************
* * INTERNAL PUBLIC FUNCTIONS * * All of them should be declared far and included in some .h file * *************************************************************************/
PUBLIC HRESULT FAR PASCAL SortFlushISI (_LPIPB); PUBLIC int FAR PASCAL CompareOccurrence (LPDW, LPDW, int); PUBLIC int FAR PASCAL StrCmp2BytePascal (LPB, LPB); PUBLIC HRESULT FAR PASCAL FlushTree (_LPIPB);
/*************************************************************************
* * @doc API EXTERNAL INDEXING * * @func LPIPB FAR PASCAL | MVIndexInitiate | * The function allocates a index parameter block. The block is used * in all places during indexing. This function must be called * prior to any other indexing funtion. * * @parm PINDEXINFO | pIndexInfo | * Pointer to the index information data * * @parm PHRESULT | phr | * Pointer to error buffer. * * @rdesc Pointer to the block, or NULL if error. The error buffer * contains the description of the error * *************************************************************************/ PUBLIC LPIPB EXPORT_API FAR PASCAL MVIndexInitiate(PINDEXINFO pIndexInfo, PHRESULT phr) { _LPIPB lpipb; // Pointer to index paramet block
HRESULT fRet;
// foNil should, of course, be nil
// In this case foNil is only used by incremental update
ITASSERT(0 == foNil.dwOffset && 0 == foNil.dwHigh);
if (pIndexInfo == NULL) { SetErrCode (phr, E_INVALIDARG); return(NULL); } // Allocate the block. All the fields are initialized to 0
if ((lpipb = GlobalLockedStructMemAlloc (sizeof (IPB))) == NULL) { SetErrCode (phr, E_OUTOFMEMORY); return (NULL); }
// Initialize "idxf", make sure that "occf" has "OCCF_TOPICID" set.
lpipb->idxf = (WORD)(pIndexInfo->Idxf); lpipb->occf = (WORD)(pIndexInfo->Occf | OCCF_TOPICID);
// Initialize some fields
lpipb->dwLastIndexedTopic = (DWORD)-1; // Set the number of occurrence fields in the occurrence block
if (pIndexInfo->Occf & OCCF_COUNT) lpipb->ucNumOccDataFields++; if (pIndexInfo->Occf & OCCF_OFFSET) lpipb->ucNumOccDataFields++;
// Clear sort file handle
lpipb->dwUniqueWord = 0; lpipb->esi.lpesbRoot = NULL; // Allocate all the necessary memory block
if ((lpipb->dwMemAllowed = pIndexInfo->dwMemSize) < MIN_REQUIRED_MEM) lpipb->dwMemAllowed = MIN_REQUIRED_MEM; if ((fRet = IndexBlockAllocate (lpipb, lpipb->dwMemAllowed)) != S_OK) { SetErrCode (phr, fRet); GlobalLockedStructMemFree (lpipb); return (NULL); }
if (pIndexInfo->dwBlockSize <= BTREE_NODE_SIZE) lpipb->BTreeData.Header.dwBlockSize = BTREE_NODE_SIZE; else lpipb->BTreeData.Header.dwBlockSize = pIndexInfo->dwBlockSize;
lpipb->BTreeData.Header.dwCodePageID = pIndexInfo->dwCodePageID; lpipb->BTreeData.Header.lcid = pIndexInfo->lcid; lpipb->BTreeData.Header.dwBreakerInstID = pIndexInfo->dwBreakerInstID;
// Set the callback key
lpipb->dwKey = CALLBACKKEY;
return (lpipb); }
/*************************************************************************
* * @doc API EXTERNAL INDEXING * * @func void FAR PASCAL | MVIndexDispose | * Release all memory associated with the index parameter block. * Must be called after indexing is complete. * * @parm _LPIPB | lpipb | * Pointer to index parameter block * *************************************************************************/ PUBLIC void EXPORT_API FAR PASCAL MVIndexDispose(_LPIPB lpipb) { // Sanity check
if (lpipb == NULL) return; // Free all memory associated with internal sort
FreeISI(lpipb);
// Free all memory associated with external sort
FreeEsi(lpipb);
GlobalLockedStructMemFree (lpipb); }
/*************************************************************************
* * @doc PRIVATE INDEXING * * @func VOID PASCAL NEAR | FreeISI | * Free all blocks, and temporary file associated with the internal * sort * * @parm _LPIPB | lpipb | * Pointer to index parameter block * *************************************************************************/ PUBLIC VOID PASCAL NEAR FreeISI (_LPIPB lpipb) { // Release temporary file buffer
FreeHandle (lpipb->isi.hSortBuffer); lpipb->isi.hSortBuffer = NULL; if (lpipb->isi.hfpb) { FileClose (lpipb->isi.hfpb); lpipb->isi.hfpb = NULL; }
if (lpipb->pDataBlock) { BlockFree (lpipb->pDataBlock); lpipb->pDataBlock = NULL; } if (lpipb->BTNodeBlock.pBlockMgr) { BlockFree (lpipb->BTNodeBlock.pBlockMgr); lpipb->BTNodeBlock.pBlockMgr = NULL; lpipb->BTNodeBlock.pFreeList = NULL; // Free list of Btnode
} if (lpipb->TopicBlock.pBlockMgr) { BlockFree (lpipb->TopicBlock.pBlockMgr); lpipb->TopicBlock.pBlockMgr = NULL; lpipb->TopicBlock.pFreeList = NULL; // Free list of topic node
} if (lpipb->OccBlock.pBlockMgr) { BlockFree (lpipb->OccBlock.pBlockMgr); lpipb->OccBlock.pBlockMgr = NULL; lpipb->OccBlock.pFreeList = NULL; // Free list of occurrence nodes
} }
/*************************************************************************
* @doc API EXTERNAL INDEXING * * @func HRESULT FAR PASCAL | MVIndexAddWord | * This function will add a word into the index. * * @parm LPIPB | lpipb | * Index parameter block being operated on * * @parm LST | lstWord | * Word being indexed. (Pascal style with 2-byte header) * * @parm LPOCC | lpocc | * Occurence data associated with this word. It is assumed that the * occurrence block contains NO UNINITIALIZED DATA, ie. non-used * fields must be set to 0 * * @rdesc S_OK, if successful, else other error * * @comm * The data are copied into the buffer managed by the block manager * and arranged as a Red/Black tree to speed sorting. *************************************************************************/
static OCC NullOcc = { 0 };
PUBLIC HRESULT EXPORT_API FAR PASCAL MVIndexAddWord (_LPIPB lpipb, LST lstWord, LPOCC lpOcc) { // Local replacement variables
ERRB errb; // Pointer to error variable
LPISI pIsi; // Internal Sort Information
PBTNODE pRoot; // Root of the Balanced Tree
// Working variables
PBTNODE pNode; // Used to traverse the tree to find
// to find the insertion point
PBTNODE FAR *ppNode; // Used to add children to the tree
int result; // String compare results
int wLen; // Word length
LST lstStart; // Saved starting position
#ifdef _DEBUG
char Buffer[200]; #endif
#ifdef _DEBUGREDBLACK
int iLeft = 0; int iRight = 0; #endif
// Various flags
int fCompareField;
// Sanity check
if (lpipb == NULL) return(E_INVALIDARG); // Handle null case
if (lstWord == NULL) return(S_OK);
fCompareField = lpipb->occf & OCCF_FIELDID; pIsi = &lpipb->isi; // Internal Sort Information
pRoot = pIsi->pBalanceTree; // Root of the Balanced Tree
// Working variables
ppNode = NULL; // Used to add children to the tree
lstStart = lstWord; // Saved starting position
if (lpOcc == NULL) lpOcc = &NullOcc;
// Get statistics
lpipb->dwIndexedWord++; // Count unique TopicId's
if (lpipb->dwLastIndexedTopic != lpOcc->dwTopicID) { lpipb->lcTopics++; lpipb->dwLastIndexedTopic = lpOcc->dwTopicID; } if (lpOcc->dwTopicID > lpipb->dwMaxTopicId) { lpipb->dwMaxTopicId = lpOcc->dwTopicID; }
wLen = GETWORD((LPUW)(lstStart = lstWord)); // Save statistical information about the total length of all words
if (wLen > 2) lpipb->dwTotal3bWordLen += wLen; else lpipb->dwTotal2bWordLen += wLen; lstWord += sizeof(WORD); #ifdef _DEBUG
if (wLen >= 200) { strncpy (Buffer, lstWord, 198); Buffer[199] = 0; } else { strncpy (Buffer, lstWord, wLen); Buffer[wLen] = 0; } // if (STRICMP (Buffer, "erin") == 0)
// _asm int 3;
#endif
// Call the user callback every once in a while
if (!(lpipb->dwIndexedWord % 65536L) && (lpipb->CallbackInfo.dwFlags & ERRFLAG_STATUS)) { PFCALLBACK_MSG pCallbackInfo = &lpipb->CallbackInfo; CALLBACKINFO Info; HRESULT err;
Info.dwPhase = 1; Info.dwIndex = lpipb->dwIndexedWord; err = (*pCallbackInfo->MessageFunc) (ERRFLAG_STATUS, pCallbackInfo->pUserData, &Info); if (S_OK != err) return (err); }
SubmitWord:
// Is this the first word in the tree?
if (pRoot == NULL) { if ((pRoot = AddNode (lpipb, lstStart, lpOcc, &errb)) == NULL) return (SetErrCode (NULL, E_OUTOFMEMORY)); // Adjust tree data
pRoot->color = BLACK; pRoot->pParent = NULL; pIsi->pBalanceTree = pRoot; // Set statistical info
lpipb->dwByteCount = GETWORD ((LPUW)pRoot->StringData.pText); lpipb->dwMaxFieldId = pRoot->StringData.dwField; return (S_OK); }
// Set traversal node to root node
pNode = pRoot;
for (; ; ) // Traverse the tree forever
{ int len; // Used for string compare block
LPB lpbWord1, lpbWord2; // Used for string compare block
PSTRDATA pString;
/**********************************************
* This section of code does a string compare **********************************************/
lpbWord1 = lstWord; pString = &pNode->StringData; lpbWord2 = pString->pText;
// Get the minimum length
if ((result = wLen - GETWORD ((LPUW)lpbWord2)) > 0) len = GETWORD ((LPUW)lpbWord2); else len = wLen;
// Skip the lengths
lpbWord2 += sizeof (WORD); // Start compare byte per byte
for (; len > 0; len--, lpbWord1++, lpbWord2++) { if (*lpbWord1 != *lpbWord2) break; } if (len != 0) result = *lpbWord1 - *lpbWord2; /**********************************
* COMPARE FIELDID AND WORD LENGTH **********************************/ if (result == 0) { // If the WordLength and FieldId are the same as the current
// nodes' then we update the current record
if (fCompareField) result = lpOcc->dwFieldId - pString->dwField; if (result == 0) result = lpOcc->wWordLen - (WORD)pString->dwWordLength; if (result == 0) { if (AddTopic (lpipb, pString, lpOcc) == S_OK) return (S_OK); // Add failed. Flush the tree to disk & resubmit word
if ((result = FlushTree(lpipb)) == S_OK) { pRoot = pIsi->pBalanceTree; goto SubmitWord; } return (SetErrCode (NULL, (HRESULT)result)); } // Fall through in case result is non-zero
} // Descend tree or add new node
if (result < 0) { if (pNode->pLeft != NULL) { pNode = pNode->pLeft; #ifdef _DEBUGREDBLACK
iLeft++; #endif
continue; } else ppNode = &pNode->pLeft; } else { if (pNode->pRight != NULL) { pNode = pNode->pRight; #ifdef _DEBUGREDBLACK
iRight++; #endif
continue; } else ppNode = &pNode->pRight; } #ifdef _DEBUGREDBLACK
_DPF3("Added node '%s' at left %d, right %d\n", Buffer, iLeft, iRight); #endif
// Add the new node to the tree
*ppNode = AddNode (lpipb, lstStart, lpOcc, &errb); // If node is NULL we will flush the tree and resubmit the word
if (*ppNode == NULL) { if ((result = FlushTree(lpipb)) != S_OK) return (result);
pRoot = pIsi->pBalanceTree; ppNode = NULL; goto SubmitWord; } (*ppNode)->pParent = pNode; // This is the only place that the nodes get balanced
BalanceTree (pIsi, *ppNode); #ifdef _DEBUGREDBLACK
VerifyTree (pIsi->pBalanceTree); #endif
return (S_OK); } }
/*************************************************************************
* @doc API EXTERNAL INDEXING * * @func LPDWORD PASCAL FAR | TotalIndexedWord | * Return the total number of words indexed (for statistical purpose * only) * * @parm LPIPB | lpipb | * Pointer to index parameter block * * @rdesc Return pointer to the total number of words indexed *************************************************************************/
PUBLIC LPDWORD PASCAL FAR TotalIndexedWord(_LPIPB lpipb) { return (&lpipb->dwUniqueWord); }
/*************************************************************************
* * @doc PRIVATE INDEXING * * @func void NEAR PASCAL | FreeEsi | * Gets rid of all external-sort blocks attached to an IPB. * These blocks are formed into a single linked list * Also closes the file associated with the external sort. * * @parm _LPIPB | lpipb | * Pointer to index parameter block where all the info is stored * *************************************************************************/
PUBLIC VOID FAR PASCAL FreeEsi(_LPIPB lpipb) { LPESB lpesb; /* Linked-list walk pointer. */ LPESB lpesbNext; /* Next ESB in chain. */ LPESI lpesi; /* Pointer to external sort info struct */ /* Get pointer to the ESI block */ lpesi = &lpipb->esi; for (lpesb = lpesi->lpesbRoot; lpesb != NULL; lpesb = lpesbNext) { /* Get pointer to the next block */ lpesbNext = lpesb->lpesbNext;
if (lpesb->hMem) { _GLOBALUNLOCK(lpesb->hMem); _GLOBALFREE(lpesb->hMem); } /* Free the block */ GlobalLockedStructMemFree (lpesb); } lpesi->lpesbRoot = NULL; /* No more chain. */ lpesi->cesb = 0; /* Everyone freed */
// Delete the internal sorting result file
if ((lpipb->idxf & KEEP_TEMP_FILE) == 0) FileUnlink (NULL, lpipb->isi.aszTempName, REGULAR_FILE); }
/*************************************************************************
* * @doc PRIVATE INDEXING * * @func PBTNODE NEAR PASCAL | AddNode | * Inserts a new node into the tree. * * @parm _LPIPB | lpipb | * Pointer to index parameter block * * @parm LST | lpb | * Word being indexed. * * @parm LPOCC | lpOcc | * Pointer to occurrence data * * @parm PHRESULT | phr | * Pointer to error structure * * @rdesc Pointer to the newly created node * * @comm * The nodes parent pointer must be set externally. * *************************************************************************/
PBTNODE NEAR PASCAL AddNode (_LPIPB lpipb, LST lpbWord, LPOCC lpOcc, PHRESULT phr) { // Local replacement variables
LPV pDataBlock = lpipb->pDataBlock; // Pointer to Block Manager
int occf = lpipb->occf;
// Working variables
PBTNODE pNode; // This will point to the new node
PSTRDATA pString; // Pointer to string block
PTOPICDATA pTopic; // Pointer to topic block
POCCDATA pOcc; LPDW lpDw;
// Create space for new node & topic & occ & copy the string
#if 0
if ((pNode = (PBTNODE)GetBlockNode (&lpipb->BTNodeBlock)) == NULL || (pTopic = (PTOPICDATA)GetBlockNode (&lpipb->TopicBlock)) == NULL || #else
if ((pNode = (PBTNODE)BlockGetBlock(pDataBlock, sizeof(BTNODE))) == NULL || (pTopic = (PTOPICDATA)BlockGetBlock (pDataBlock, sizeof(TOPICDATA))) == NULL || #endif
(pNode->StringData.pText = (LPB)BlockCopy (lpipb->pDataBlock, lpbWord, GETWORD((LPUW)lpbWord) + sizeof (SHORT), 0)) == NULL) { return (NULL); }
pString = &pNode->StringData; /* Initialize all the fields */ // Node Information. Parent field is set outside of this function
pNode->pLeft = pNode->pRight = NULL; pNode->color = RED; /* Set the string fields */ pString->pTopic = pString->pLastTopic = pTopic; pString->dwTopicCount = 1;
// It doesn't hurt to copy the data even if we don't use it
// It also saves a compare just to set it
pString->dwField = lpOcc->dwFieldId; pString->dwWordLength = lpOcc->wWordLen; // Set the topic fields data
pTopic->pNext = NULL; pTopic->dwTopicId = lpOcc->dwTopicID; if (occf & (OCCF_COUNT | OCCF_OFFSET)) { #if 1
if ((pOcc = (POCCDATA)BlockGetBlock (pDataBlock, sizeof(OCCDATA) * lpipb->ucNumOccDataFields)) == NULL) return(NULL); #else
if ((pOcc = (POCCDATA)GetBlockNode (&lpipb->OccBlock)) == NULL ) return(NULL); #endif
// Set the occ fields
pOcc->pNext = NULL; // Generate occ data block
lpDw = pOcc->OccData; if (occf & OCCF_COUNT) *lpDw++ = lpOcc->dwCount; if (occf & OCCF_OFFSET) *lpDw = lpOcc->dwOffset; pTopic->pLastOccData = pTopic->pOccData = pOcc; pTopic->dwOccCount = 1; } else { pTopic->pLastOccData = pTopic->pOccData = NULL; pTopic->dwOccCount = 0; }
// Set Statistical information
if (lpipb->dwMaxWLen < GETWORD ((LPUW)pString->pText)) lpipb->dwMaxWLen = GETWORD ((LPUW)pString->pText); if (lpipb->dwMaxFieldId < pString->dwField) lpipb->dwMaxFieldId = pString->dwField; lpipb->dwUniqueWord++; lpipb->dwByteCount += GETWORD ((LPUW)pString->pText);
return (pNode); }
/*************************************************************************
* * @doc PRIVATE INDEXING * * @func int FAR PASCAL | CompareOccurrence | * Compares two Occurrence data pointers starting from the first * element and continuing until the elements are unequal. * * @parm LPB | lpStr1 | * Pointer to the first Occurence to compare * * @parm LPB | pOcc2 | * Pointer to the second Occurence to compare * * @parm int | max | * The number of occurrence fields to compare * * @rdesc * negative value : If pOcc1 is less than pOcc2 * 0 : if pOcc1 is equal to pOcc2 * positive value : If pOcc1 is greater than pOcc2 * * @comm * The use of switch statment is for speed since this function is * called for so many times *************************************************************************/
int FAR PASCAL CompareOccurrence (LPDW pOcc1, LPDW pOcc2, int max) { int result;
switch (max) { case 5: if (result = (int)(*pOcc1 - *pOcc2)) return (result); pOcc1++; pOcc2++; case 4: if (result = (int)(*pOcc1 - *pOcc2)) return (result); pOcc1++; pOcc2++; case 3: if (result = (int)(*pOcc1 - *pOcc2)) return (result); pOcc1++; pOcc2++; case 2: if (result = (int)(*pOcc1 - *pOcc2)) return (result); pOcc1++; pOcc2++; case 1: return ((int)(*pOcc1 - *pOcc2));
default: // This can only an error, since we knows that max
// can never be > 5
return (0); } }
/*************************************************************************
* * @doc INTERNAL INDEXING * * @func HRESULT | AddTopic | * Inserts a new topic into a nodes topic list or a new occurrence * if a topic with the same TopicId already exists. * * @parm _LPIPB | lpipb | * Pointer to index parameter block * * @parm PSTRDATA | pString | * Pointer to node structure * * @parm LPOCC | lpOcc | * Pointer occurrence data * * @rdesc S_OK, or errors if failed * *************************************************************************/
HRESULT NEAR PASCAL AddTopic (_LPIPB lpipb, PSTRDATA pString, LPOCC lpOcc) { // Local replacement variables
LPV pDataBlock = lpipb->pDataBlock; int occf = lpipb->occf; DWORD dwNewTopicId = lpOcc->dwTopicID; POCCDATA pOcc;
// Working variables
// int topicCount; // Iterates through current topics
PTOPICDATA pTopic, pPrevTopic; LPDW lpDw; int fResult; /* Set up a new occurrence block */ if (occf & (OCCF_COUNT | OCCF_OFFSET)) { if ((pOcc = (POCCDATA)BlockGetBlock (pDataBlock, sizeof(OCCDATA) * lpipb->ucNumOccDataFields)) == NULL) return (E_OUTOFMEMORY);
lpDw = pOcc->OccData; if (occf & OCCF_COUNT) *lpDw++ = lpOcc->dwCount; if (occf & OCCF_OFFSET) *lpDw = lpOcc->dwOffset; pOcc->pNext = NULL; } else pOcc = NULL; // Check from last point of insertion
pTopic = pString->pLastTopic;
if (pTopic->dwTopicId == dwNewTopicId) { append_occ_info: // Match. We don't have to do anything. That's is the majority
// of the case. Just add the occdata to the end
if (pOcc) { pTopic->pLastOccData->pNext = pOcc; pTopic->pLastOccData = pOcc; pTopic->dwOccCount++; } goto Update; }
if (pTopic->dwTopicId < dwNewTopicId) { // kevynct: scan ahead to insertion point. Usually with sorted lists
// this won't be far at all.
pPrevTopic = pTopic; if (pTopic->pNext) { for (; (fResult = pTopic->dwTopicId - dwNewTopicId) < 0 && pTopic->pNext; pPrevTopic = pTopic, pTopic = pTopic->pNext) ; // empty loop!
if (fResult == 0) { pString->pLastTopic = pTopic; goto append_occ_info; } }
if ((pTopic = (PTOPICDATA)BlockGetBlock (pDataBlock, sizeof(TOPICDATA))) == NULL) return (E_OUTOFMEMORY);
// Set the topic fields data
if (pOcc) { pTopic->pLastOccData = pTopic->pOccData = pOcc; pTopic->dwOccCount = 1; } else { pTopic->pLastOccData = pTopic->pOccData = NULL; pTopic->dwOccCount = 0; }
pTopic->dwTopicId = dwNewTopicId;
insert_middle_or_end: // Add to middle or end of list
pTopic->pNext = pPrevTopic->pNext; pPrevTopic->pNext = pTopic; pString->dwTopicCount++;
pString->pLastTopic = pTopic; goto Update; } // It means that topics are not inserted
// in order. It can only happen if somebody is using the
// indexer for some special, non-topic related index build
// Move to the right node
pPrevTopic = NULL; for (pTopic = pString->pTopic; (fResult = pTopic->dwTopicId - dwNewTopicId) < 0 && pTopic->pNext; pPrevTopic = pTopic, pTopic = pTopic->pNext); if (fResult == 0) { // Match. Just add the occdata to the end
if (pOcc) { pTopic->pLastOccData->pNext = pOcc; pTopic->pLastOccData = pOcc; pTopic->dwOccCount++; } } else { // A new topic node is needed
if ((pTopic = (PTOPICDATA)BlockGetBlock (pDataBlock, sizeof(TOPICDATA))) == NULL) return (E_OUTOFMEMORY);
// Set the topic fields data
if (pOcc) { pTopic->pLastOccData = pTopic->pOccData = pOcc; pTopic->dwOccCount = 1; } else { pTopic->pLastOccData = pTopic->pOccData = NULL; pTopic->dwOccCount = 0; } pTopic->dwTopicId = dwNewTopicId;
// Add to the beginning if empty
if (pPrevTopic == NULL) { // Add to the beginning
pTopic->pNext = pString->pTopic; pString->pTopic = pTopic; pString->dwTopicCount++;
pString->pLastTopic = pTopic; goto Update; }
goto insert_middle_or_end; }
Update: // Update statistical information
if (lpipb->dwMaxWCount < lpOcc->dwCount) lpipb->dwMaxWCount = lpOcc->dwCount; if (lpipb->dwMaxOffset < lpOcc->dwOffset) lpipb->dwMaxOffset = lpOcc->dwOffset;
return S_OK; }
/*************************************************************************
* * @doc INTERNAL INDEXING * * @func int | StrCmp2BytePascal | * Compares two Pascal style strings against eachother. * The strings must have a 2 byte length field, *NOT* 1 byte. * * @parm LPB | lpStr1 | * Pointer to string one * * @parm LPB | lpStr2 | * Pointer to string two * * @rdesc * negative value : If pOcc1 is less than pOcc2 * 0 : if pOcc1 is equal to pOcc2 * positive value : If pOcc1 is greater than pOcc2 * *************************************************************************/
int FAR PASCAL StrCmp2BytePascal(LPB lpStr1, LPB lpStr2) { int fRet; int register len;
// Get the minimum length
if ((fRet = GETWORD ((LPUW)lpStr1) - GETWORD ((LPUW)lpStr2)) > 0) len = GETWORD ((LPUW)lpStr2); else len = GETWORD ((LPUW)lpStr1);
// Skip the lengths */
lpStr1 += sizeof (SHORT); lpStr2 += sizeof (SHORT);
// Start compare byte per byte */
for (; len > 0; len--, lpStr1++, lpStr2++) { if (*lpStr1 != *lpStr2) break; }
if (len == 0) return (fRet); return (*lpStr1 - *lpStr2); }
/*************************************************************************
* * @doc INTERNAL INDEXING * * @func HRESULT | FlushTree | * Flushes the tree to disk. * * @parm _LPIPB | lpipb | * Pointer to index parameter block * * @rdesc S_OK, or errors if failed * * @comm * This function holds the output file open until the tree has been * completely written to disk. The physical offset of the written * data is stored in the ESB blocks so that the word can be merged * in the next index phase. * *************************************************************************/ PUBLIC HRESULT FAR PASCAL FlushTree(_LPIPB lpipb) { // Local replacement variables
LPISI pIsi = &lpipb->isi; LPESI pEsi = &lpipb->esi; PBTNODE pBalanceTree = pIsi->pBalanceTree; ERRB errb; PHRESULT phr = &errb;
// Local working variables
LPESB pNewEsb; HRESULT fRet;
// Make sure that the tree actually has nodes
if (pBalanceTree == NULL) return (S_OK);
// Open output file & clear working variables
if (pIsi->hfpb == NULL) { // Allocate output buffer
if ((pIsi->hSortBuffer = _GLOBALALLOC (DLLGMEM_ZEROINIT, ISBUFFER_SIZE)) == NULL) return (E_OUTOFMEMORY); pIsi->pSortBuffer = (LPB)_GLOBALLOCK (pIsi->hSortBuffer);
// Get temp filename & open file
GETTEMPFILENAME ((char)0, (LPB)"iso", (WORD)0, pIsi->aszTempName); if ((pIsi->hfpb = FileOpen (NULL, pIsi->aszTempName, REGULAR_FILE, READ_WRITE, phr)) == NULL) return (*phr); pIsi->dwRecLength = 0; pEsi->cesb = 0; }
// Allocate new ESB structure & set starting values
if ((pNewEsb = GlobalLockedStructMemAlloc (sizeof (ESB))) == NULL) return (E_OUTOFMEMORY); // Remember the starting offset
pNewEsb->lfo = pIsi->lfo;
// Reset the current insertion point
pIsi->pCurPtr = pIsi->pSortBuffer; // Actually ouput the tree data
if ((fRet = TraverseWrite(lpipb, pBalanceTree, 0)) != S_OK) return (fRet); // Flush remaining buffer to disk
if ((fRet = WriteBuffer(lpipb, pIsi->pCurPtr)) != S_OK) return(fRet);
// Set the ESB maximum record length
pNewEsb->dwEsbSize = pIsi->dwMaxEsbRecSize; pIsi->dwMaxEsbRecSize = 0; // Store end offset in list
pNewEsb->lfoMax = pIsi->lfo; // Update the fileoffset
pIsi->lfo = pNewEsb->lfoMax; if (pEsi->lpesbRoot == NULL) pNewEsb->lpesbNext = NULL; else pNewEsb->lpesbNext = pEsi->lpesbRoot; pEsi->lpesbRoot = pNewEsb; pEsi->cesb++;
// Reset tree heap & root node
BlockReset (lpipb->pDataBlock); BlockReset (lpipb->BTNodeBlock.pBlockMgr); lpipb->BTNodeBlock.pFreeList = (PLIST)BlockGetLinkedList(lpipb->BTNodeBlock.pBlockMgr); BlockReset (lpipb->TopicBlock.pBlockMgr); lpipb->TopicBlock.pFreeList = (PLIST)BlockGetLinkedList(lpipb->TopicBlock.pBlockMgr); BlockReset (lpipb->OccBlock.pBlockMgr); lpipb->OccBlock.pFreeList = (PLIST)BlockGetLinkedList(lpipb->OccBlock.pBlockMgr); pIsi->pBalanceTree = NULL;
return (S_OK); }
/*************************************************************************
* * @doc INTERNAL INDEXING * * @func HRESULT | WriteBuffer | * Physically writes buffer to disk. This will write from the beginning * of the sort buffer to pStartRec. It then copies whatever left * in the sort buffer back to the beginning of it * * @parm _LPIPB | lpipb | * Pointer to index parameter block * * @parm LPB | copyEnd | * Pointer to the end of the next block of data to copy * * @rdesc S_OK or errors *************************************************************************/
HRESULT NEAR PASCAL WriteBuffer (_LPIPB lpipb, LPB copyEnd) { // Local replacement variables
LPISI pIsi = &lpipb->isi; LPB pSortBuffer; ERRB errb; PHRESULT phr = &errb;
DWORD cbWritten; // Number of bytes to write to disk (bytes)
DWORD cbCopied; // Size of extra data to move to buffers front
LPB copyStart;
pSortBuffer = (LPB)pIsi->pSortBuffer;
// Find what should be left in the buffer.
// copyStart will pointer to the beginning of data to be recopied, ie.
// the beginning of a record
// - if pIsi->pStartRec == -1 : there is no beginning of record
// so we have nothing to recopy
// - if pIsi->pStartRec == pSortBuffer, again the whole thing is
// to be written out, and there is nothing to recopy
if ((copyStart = pIsi->pStartRec) == (LPB)-1 || copyStart == pSortBuffer) copyStart = copyEnd; if ((cbWritten = (DWORD)(copyStart - pSortBuffer)) == 0) return(S_OK); // Nothing to copy
cbCopied = (DWORD)(copyEnd - copyStart);
// Update backpatching data
if (pIsi->pStartRec == pSortBuffer) { pIsi->pStartRec = (LPB)-1; // The pointer is invalid
pIsi->lfoRecBackPatch = pIsi->lfo; // Remember the place for backpatch
}
// Write the buffer to disk
if (cbWritten != (DWORD) FileWrite(pIsi->hfpb, pSortBuffer, cbWritten, phr)) { return (*phr); } pIsi->lfo = FoAddDw (pIsi->lfo, cbWritten); // Only copy if extra data exists
if (cbCopied) { MEMMOVE(pSortBuffer, copyStart, cbCopied); if (pIsi->pStartRec == copyStart) pIsi->pStartRec = pSortBuffer; } // Reset pStartRec & pCurPtr
pIsi->pCurPtr = pSortBuffer + cbCopied; return S_OK; }
/*************************************************************************
* * @doc INTERNAL INDEXING * * @func HRESULT | TraverseWrite | * Copies the node data to the output buffer. * * @parm _LPIPB | lpipb | * Pointer to index parameter block * * @parm PBTNODE | node | * Node to copy to buffer * * @parm LPB | pBuffer | * Buffer to copy node into * * @parm int | Level | * Current tree level (starting with 1) * * @rdesc S_OK, or errors if failed * * @comm * This is currently a recursive routine. It should probably be * changed to be non-recursive to save on speed at run-time. * *************************************************************************/
HRESULT NEAR PASCAL TraverseWrite (_LPIPB lpipb, PBTNODE node, int Level) { // Local replacement pointers
PSTRDATA pString = &node->StringData; LPISI pIsi = &lpipb->isi; // Internal sort information
LPB pText = pString->pText; // The word string
POCCDATA pOccData; WORD ucNumOccDataFields = lpipb->ucNumOccDataFields; PTOPICDATA pTopic = pString->pTopic; ERRB errb; PHRESULT phr = &errb;
// Working variables
DWORD topicLoop, occLoop; // Loop counters
WORD wLength; // DWORD aligned length of string
BYTE filledBuffer = 0; // Count if record fills entire buffer
LPB pBaseBuffer; // Start of entire buffer
LPB pCurPtr; LPB pMaxPtr; HRESULT fRet;
// Keep track of how deep the tree is
if (Level > pIsi->DeepLevel) pIsi->DeepLevel = (BYTE) Level;
#ifdef _DEBUG
if (Level >= 65) { // This would be a HUGE tree!
return SetErrCode (phr, E_ASSERT); } #endif
// Traverse the left sub tree
if (node->pLeft != NULL) { if ((fRet = TraverseWrite(lpipb, node->pLeft, Level + 1)) != S_OK) return(fRet); }
/* Initialize */ pBaseBuffer = (LPB)pIsi->pSortBuffer; pMaxPtr = pBaseBuffer + ISBUFFER_SIZE - sizeof(DWORD); // Leave some room
pCurPtr = pIsi->pCurPtr; // Get starting point
// Reset the record length field
pIsi->dwRecLength = 0;
// Get the Pascal string length
wLength = GETWORD ((LPUW)pText) + sizeof (SHORT); //wLength = (wLength + 3) & (~3);
// Check for minimum room
if (pMaxPtr <= pCurPtr + wLength + // String length
sizeof (DWORD) + // Record length
sizeof (DWORD) + // FieldId
sizeof (WORD) + // Word length
sizeof (DWORD) ) // TopicCount
{ if ((fRet = WriteBuffer (lpipb, pCurPtr)) != S_OK) return fRet; pCurPtr = pIsi->pCurPtr; // Reset insertion point
} // Remember record length position to be backpatched
pIsi->pStartRec = pCurPtr; pCurPtr += sizeof (DWORD);
MEMCPY(pCurPtr, pText, wLength); pCurPtr += wLength; // Add aligned offset
// Copy the Word Length only if flag is set
if (lpipb->occf & OCCF_LENGTH) pCurPtr += CbBytePack (pCurPtr, pString->dwWordLength);
// Copy FieldId only if flag is set
if (lpipb->occf & OCCF_FIELDID) pCurPtr += CbBytePack (pCurPtr, pString->dwField);
// Topic Count
if (lpipb->occf & OCCF_TOPICID) pCurPtr += CbBytePack (pCurPtr, pString->dwTopicCount); else pString->dwTopicCount = 0;
// Add in all topics
for (topicLoop = pString->dwTopicCount; topicLoop > 0; --topicLoop) { // Check buffer overflow
if (pMaxPtr <= pCurPtr + sizeof (DWORD) // TopicId
+ sizeof (DWORD)) // Occurrence count
{ pIsi->dwRecLength += (DWORD)(pCurPtr - pIsi->pCurPtr); if ((fRet = WriteBuffer (lpipb, pCurPtr)) != S_OK) return fRet; pCurPtr = pIsi->pCurPtr; // Reset insertion point
} pCurPtr += CbBytePack (pCurPtr, pTopic->dwTopicId); if (occLoop = pTopic->dwOccCount) { pCurPtr += CbBytePack (pCurPtr, pTopic->dwOccCount); pOccData = pTopic->pOccData;
// Add in all occurrence data
for (occLoop = pTopic->dwOccCount; occLoop > 0; --occLoop) { LPDW lpDw; // Check buffer overflow
if (pMaxPtr <= pCurPtr + MAX_OCCDATA * sizeof (DWORD)) { pIsi->dwRecLength += (DWORD)(pCurPtr - pIsi->pCurPtr); if ((fRet = WriteBuffer (lpipb, pCurPtr)) != S_OK) return fRet; pCurPtr = pIsi->pCurPtr; // Reset insertion point
} lpDw = (LPDW)pOccData->OccData; switch (ucNumOccDataFields) { case 5: pCurPtr += CbBytePack (pCurPtr, *lpDw++); case 4: pCurPtr += CbBytePack (pCurPtr, *lpDw++); case 3: pCurPtr += CbBytePack (pCurPtr, *lpDw++); case 2: pCurPtr += CbBytePack (pCurPtr, *lpDw++); case 1: pCurPtr += CbBytePack (pCurPtr, *lpDw++); } pOccData = pOccData->pNext; } } pTopic = pTopic->pNext; }
// Update the record length
pIsi->dwRecLength += (DWORD)(pCurPtr - pIsi->pCurPtr); // Keep track of the maximum record size for merging.
// - 1 for the current ESB. This helps speeding up the merging sequence
// since we don't have to worry about a record being split
if (pIsi->dwRecLength > pIsi->dwMaxEsbRecSize) pIsi->dwMaxEsbRecSize = pIsi->dwRecLength; // Set record length
if (pIsi->pStartRec != (LPB)-1) { // Everything is still in memory
*(LPUL)pIsi->pStartRec = pIsi->dwRecLength; } else { // We have to do backpatching
if (sizeof (DWORD) != FileSeekWrite (pIsi->hfpb, &pIsi->dwRecLength, pIsi->lfoRecBackPatch, sizeof (DWORD), phr)) return *phr; FileSeek (lpipb->isi.hfpb, pIsi->lfo, 0, phr); }
// Update the current insertion point, and prepare for the next record
pIsi->pStartRec = pIsi->pCurPtr = pCurPtr; if (node->pRight != NULL) return TraverseWrite(lpipb, node->pRight, Level + 1); return(S_OK); }
/*************************************************************************
* * @doc INTERNAL INDEXING * * @func VOID NEAR PASCAL | BalanceTree | * Balances the tree using a Red/Black algorithm. * * @parm LPISI | pIsi | * Pointer to Internal sort data * * @parm PBTNODE | node | * Pointer to the node that was just inserted * * @comm * This routine must be called after EVERY new node is inserted in * the tree to maintain proper balance. * A Red/Black tree must maintain the following conditions: * Every node is colored either red or black * Every leaf node must be black * If a node is red, then both of its children must be black * Every path from the root to a leaf must contain the same * number of black nodes * *************************************************************************/ void NEAR PASCAL BalanceTree(LPISI pIsi, PBTNODE node) { PBTNODE y; PBTNODE pParentNode; node->color = RED; while (node != pIsi->pBalanceTree && node->pParent->color == RED) { pParentNode = node->pParent; if (pParentNode != NULL && pParentNode->pParent != NULL && pParentNode == pParentNode->pParent->pLeft) { y = pParentNode->pParent->pRight; if (y != NULL && y->color == RED) { pParentNode->color = BLACK; y->color = BLACK; pParentNode->pParent->color = RED; node = pParentNode->pParent; pParentNode = node->pParent; } else { if (node == pParentNode->pRight) { node = pParentNode; // LeftRotate change the parent node
LeftRotate(pIsi, node); pParentNode = node->pParent; } pParentNode->color = BLACK; pParentNode->pParent->color = RED; // RightRotate change the parent node
RightRotate(pIsi, pParentNode); pParentNode = node->pParent; } } else { if (pParentNode != NULL && pParentNode->pParent != NULL) y = pParentNode->pParent->pLeft; else y = NULL; if (y != NULL && y->color == RED) { pParentNode->color = BLACK; y->color = BLACK; pParentNode->pParent->color = RED; node = pParentNode->pParent; pParentNode = node->pParent; } else { if (node == pParentNode->pLeft) { // RightRotate change the parent node
RightRotate(pIsi, node); node->color = BLACK; node = node->pRight; pParentNode = node->pParent; } pParentNode->color = BLACK; pParentNode->pParent->color = RED; // LeftRotste change the parent node
LeftRotate(pIsi, pParentNode->pParent); pParentNode = node->pParent; } } } pIsi->pBalanceTree->color = BLACK; }
/*************************************************************************
* * @doc INTERNAL INDEXING * * @func VOID NEAR PASCAL | LeftRotate | * Rotates two nodes in the tree. * * @parm _LPIPB | lpipb | * Pointer to index parameter block * * @parm PBTNODE | node | * The X node to process (see notes) * * @comm * * X Y * / \ / \ * a Y ---> X c * / \ / \ * b c a b *************************************************************************/
void NEAR PASCAL LeftRotate(LPISI pIsi, PBTNODE node) { PBTNODE y = node->pRight;
node->pRight = y->pLeft; if (y->pLeft != NULL) y->pLeft->pParent = node; y->pParent = node->pParent; if (y->pParent == NULL) pIsi->pBalanceTree = y; else { if (node == node->pParent->pLeft) node->pParent->pLeft = y; else node->pParent->pRight = y; } y->pLeft = node; node->pParent = y; }
/*************************************************************************
* * @doc INTERNAL INDEXING * * @func VOID NEAR PASCAL | RightRotate | * Rotates two nodes in the tree. * * @parm _LPIPB | lpipb | * Pointer to index parameter block * * @parm PBTNODE | node | * The X node to process (see notes) * * @comm * * Y X * / \ / \ * X c ---> a Y * / \ / \ * a b b c *************************************************************************/
void NEAR PASCAL RightRotate(LPISI pIsi, PBTNODE node) { PBTNODE y = node->pParent;
y->pLeft = node->pRight; if (y->pLeft != NULL) y->pLeft->pParent = y;
node->pParent = y->pParent; if (node->pParent == NULL) pIsi->pBalanceTree = node; else { if (y == node->pParent->pLeft) node->pParent->pLeft = node; else node->pParent->pRight = node; } node->pRight = y; y->pParent = node; }
/************************************************************************
* @doc PRIVATE * @func HRESULT PASCAL NEAR | IndexBlockAllocate | * Set the memory allocation based on the memory of the machine * @parm DWORD | dwmemSize | * Memory allocated for the indexer * @rdesc S_OK, or E_OUTOFMEMORY ************************************************************************/
PRIVATE HRESULT PASCAL NEAR IndexBlockAllocate (_LPIPB lpipb, LONG lMemSize) { if ((lpipb->pDataBlock = BlockInitiate (MAX_BLOCK_SIZE, 0, (WORD)(lMemSize/MAX_BLOCK_SIZE), USE_VIRTUAL_MEMORY)) == NULL) return(E_OUTOFMEMORY); return(S_OK); }
#ifdef _DEBUGREDBLACK
/*
* @comm * This routine must be called after EVERY new node is inserted in * the tree to maintain proper balance. * A Red/Black tree must maintain the following conditions: * Every node is colored either red or black * Every leaf node must be black * If a node is red, then both of its children must be black * Every path from the root to a leaf must contain the same * number of black nodes */ void PreOrdTrav (PBTNODE pNode, int iLevel, char cChildType) { if (pNode == NULL) { OutputDebugString ("*\n"); return; }
_DPF4 ("Chl: %c Col: %c Lev: %d\n", cChildType, pNode->color == RED ? 'R' : 'B', iLevel);
iLevel++; PreOrdTrav (pNode->pLeft, iLevel, 'L'); PreOrdTrav (pNode->pRight, iLevel, 'R'); }
void NEAR PASCAL VerifyTree (PBTNODE pRoot) {
PreOrdTrav (pRoot, 0, 'R'); OutputDebugString ("End Tree\n");
} #endif /* _DEBUG */
|