|
|
/*************************************************************************
* * * PERMIND2.C * * * * Copyright (C) Microsoft Corporation 1990-1994 * * All Rights reserved. * * * ************************************************************************** * * * Module Intent * * This is the final stage of the index building process. This module * * converts the input data into a permanent B-Tree file. * * * * Stem node structure: * * CbLeft |* Word | PointerToNode *| Slack * * * * Leaf node structure: * * NxtBlkPtr|CbLeft|*Word|FieldId|TopicCnt|PointerToNode|DataSize*|Slack * * * * Data node structure: * * |* Topic | OccBlkCnt |* OccBlk *| *| Slack * * * * Fields between |* *| repeat based on count values * * * ************************************************************************** * * * Current Owner: BinhN * * * **************************************************************************/
#include <mvopsys.h>
#include <mem.h>
#include <memory.h>
#include <math.h>
#include <orkin.h>
#include <mvsearch.h>
#include "common.h"
#include "index.h"
#ifdef _DEBUG
static BYTE NEAR s_aszModule[] = __FILE__; /* Used by error return functions.*/ #endif
/*************************************************************************
* * PRIVATE PUBLIC FUNCTIONS * * All of them should be declared far, unless we know they belong to * the same segment. They should be included in some include file * *************************************************************************/
PUBLIC HRESULT FAR PASCAL BuildBTree (HFPB, _LPIPB, LPB, HFPB, LPSTR); PUBLIC PNODEINFO FAR PASCAL AllocBTreeNode (_LPIPB); PUBLIC VOID PASCAL FAR FreeBTreeNode (PNODEINFO pNode); PUBLIC int FAR PASCAL PrefixCompressWord (LPB, LPB, LPB, int); PUBLIC HRESULT FAR PASCAL FWriteBits(PFILEDATA, DWORD, BYTE); PUBLIC DWORD FAR PASCAL WriteDataNode (_LPIPB, DWORD, PHRESULT);
/*************************************************************************
* * PRIVATE PRIVATE FUNCTIONS * *************************************************************************/
PRIVATE HRESULT NEAR PASCAL AddRecordToLeaf (_LPIPB); PRIVATE HRESULT NEAR PASCAL AddRecordToStem (_LPIPB, LPB); PRIVATE int NEAR PASCAL CompressDword (PFILEDATA, DWORD); PRIVATE HRESULT NEAR PASCAL WriteStemNode (_LPIPB, PNODEINFO); PRIVATE HRESULT NEAR PASCAL WriteLeafNode (_LPIPB); PRIVATE HRESULT NEAR PASCAL FlushAllNodes (_LPIPB); // Compression functions
// PRIVATE HRESULT NEAR PASCAL FAddDword (PFILEDATA, DWORD, CKEY);
PRIVATE HRESULT NEAR PASCAL FWriteBool(PFILEDATA, BOOL);
// This table is used to avoid the calculation "(1L << v) - 1". Instead
// you say "argdwBits[v]", which should be faster. The table is useful
// other places, too.
DWORD argdwBits[] = { 0x00000000, 0x00000001, 0x00000003, 0x00000007, 0x0000000F, 0x0000001F, 0x0000003F, 0x0000007F, 0x000000FF, 0x000001FF, 0x000003FF, 0x000007FF, 0x00000FFF, 0x00001FFF, 0x00003FFF, 0x00007FFF, 0x0000FFFF, 0x0001FFFF, 0x0003FFFF, 0x0007FFFF, 0x000FFFFF, 0x001FFFFF, 0x003FFFFF, 0x007FFFFF, 0x00FFFFFF, 0x01FFFFFF, 0x03FFFFFF, 0x07FFFFFF, 0x0FFFFFFF, 0x1FFFFFFF, 0x3FFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF, };
PRIVATE HRESULT PASCAL NEAR WriteBitStreamDWord (PFILEDATA, DWORD, int); PRIVATE HRESULT PASCAL NEAR WriteFixedDWord (PFILEDATA, DWORD, int); PRIVATE HRESULT PASCAL NEAR WriteBellDWord (PFILEDATA, DWORD, int);
FENCODE EncodeTable[] = { WriteBitStreamDWord, WriteFixedDWord, WriteBellDWord, NULL, };
#define FAddDword(p,dw,key) (EncodeTable[(key).cschScheme]((p), (dw), (key).ucCenter))
#define SAFE_SLACK 256
/*************************************************************************
 *
 *  @doc    PRIVATE INDEXING
 *
 *  @func   HRESULT | BuildBTree |
 *      Allocates required memory and opens input files to create a B-Tree.
 *      Parses incoming words and calls AddRecordToLeaf to process them.
 *      After the last record it flushes all nodes, optionally appends the
 *      sigma (weight) table, and writes the index header at file offset 0.
 *
 *  @parm   HFPB | hfpbFileSys |
 *      If not NULL, handle to an already opened sysfile
 *      (not referenced by this function)
 *
 *  @parm   _LPIPB | lpipb |
 *      Pointer to the index parameter block
 *
 *  @parm   LPB | lpszTemp |
 *      Filename of the temporary input file.  The file is deleted on
 *      exit unless KEEP_TEMP_FILE is set in lpipb->idxf.
 *
 *  @parm   HFPB | hfpbPerm |
 *      Handle for the permanent B-Tree file.  If it is already a subfile
 *      handle it is written to directly; otherwise lszFilename is opened
 *      relative to it (or as a regular file when it is NULL).
 *
 *  @parm   LPSTR | lszFilename |
 *      Name of the permanent B-Tree file to open when hfpbPerm is not
 *      already a subfile handle
 *
 *  @rdesc  Returns S_OK on success or errors if failed
 *
 *************************************************************************/
HRESULT FAR PASCAL BuildBTree (HFPB hfpbFileSys, _LPIPB lpipb, LPB lpszTemp,
    HFPB hfpbPerm, LPSTR lszFilename/*IStream *pistmPerm*/)
{
    PFILEDATA   pOutFile;                           // Pointer to output data
    PFILEDATA   pInFile;                            // Pointer to input data
    DWORD       iNode;                              // Walks rgpNodeInfo on cleanup
    PBTREEDATA  pTreeData = &lpipb->BTreeData;      // Structure defining BTree
    PIH20       pHeader = &pTreeData->Header;       // Replacement variable
    HRESULT     fRet;                               // Return value
    PNODEINFO   pNode;                              // Pointer to the leaf node
    ERRB        errb = S_OK;
    PHRESULT    phr = &errb;
    int         iIndex;                             // Index into the compressed key
    DWORD       dwUniqueTerm = 0;                   // Callback variable
    BOOL        fOpenedFile;                        // TRUE if we have to close the file

    // Open input file
    pInFile = &lpipb->InFile;
    if ((pInFile->fFile = FileOpen (NULL, lpszTemp, REGULAR_FILE,
        READ, phr)) == NULL)
        return *phr;

    // Allocate input buffer
    pInFile->dwMax = FILE_BUFFER;
    if ((pInFile->hMem = _GLOBALALLOC (DLLGMEM_ZEROINIT,
        pInFile->dwMax + SAFE_SLACK)) == NULL)
    {
        fRet = E_OUTOFMEMORY;
        goto CloseInput;
    }
    pInFile->pMem = _GLOBALLOCK (pInFile->hMem);
    pInFile->pCurrent = pInFile->pMem;

    pOutFile = &lpipb->OutFile;

    /* Open subfile if necessary, (and system file if necessary) */
    // NOTE(review): the file is opened with READ although it is written
    // to below - confirm the intended access mode.
    pOutFile->fFile = hfpbPerm;
    if ((fOpenedFile = FsTypeFromHfpb(hfpbPerm) != FS_SUBFILE) &&
        (pOutFile->fFile = (HANDLE)FileOpen (hfpbPerm, lszFilename,
            hfpbPerm ? FS_SUBFILE : REGULAR_FILE, READ, phr)) == 0)
    {
        // Nothing was opened, so skip the output-file close step.
        SetErrCode (&fRet, E_FILENOTFOUND);
        goto FreeInputBuffer;
    }

    // Allocate output buffer, at least enough for one block
    pOutFile->dwMax = FILE_BUFFER;
    if (pOutFile->dwMax < (LONG)lpipb->BTreeData.Header.dwBlockSize)
        pOutFile->dwMax = lpipb->BTreeData.Header.dwBlockSize;
    if ((pOutFile->hMem = _GLOBALALLOC (DLLGMEM_ZEROINIT,
        pOutFile->dwMax + SAFE_SLACK)) == NULL)
    {
        fRet = E_OUTOFMEMORY;
        goto CloseOutput;
    }
    pOutFile->pMem = _GLOBALLOCK (pOutFile->hMem);

    // Reserve FILE_HEADER bytes at the front to hold header information
    pOutFile->pCurrent = pOutFile->pMem + FILE_HEADER;
    pOutFile->cbLeft = pOutFile->dwMax - FILE_HEADER;
    pOutFile->foPhysicalOffset.dwOffset = FILE_HEADER;
    pOutFile->ibit = cbitBYTE - 1;

    // Allocate first leaf node
    if ((pTreeData->rgpNodeInfo[0] = AllocBTreeNode (lpipb)) == NULL)
    {
        fRet = E_OUTOFMEMORY;
        goto FreeOutputBuffer;
    }
    pHeader->nidLast = 1;
    pHeader->cIdxLevels = 1;

    // pNode points to the leaf node structure
    pNode = pTreeData->rgpNodeInfo[0];
    pNode->Slack = LEAF_SLACK;

    // Set the bytes left in node block
    pNode->cbLeft = lpipb->BTreeData.Header.dwBlockSize - FOFFSET_SIZE -
        sizeof(WORD);

    // Set the word length flag
    if (lpipb->occf & OCCF_LENGTH)
        pTreeData->fOccfLength = 1;

    // Load the input buffer & repeat until all records are processed
    pInFile->dwMax = pInFile->cbLeft =
        FileRead (pInFile->fFile, pInFile->pMem, pInFile->dwMax, phr);
    do
    {
        // Call the user callback every once in a while
        if (!(++dwUniqueTerm % 8192L)
            && (lpipb->CallbackInfo.dwFlags & ERRFLAG_STATUS))
        {
            PFCALLBACK_MSG pCallbackInfo = &lpipb->CallbackInfo;
            CALLBACKINFO Info;

            Info.dwPhase = 3;
            // Guard the percentage against a zero unique-word count.
            Info.dwIndex = (lpipb->dwUniqueWord == 0) ? 0 :
                (DWORD)((float)dwUniqueTerm / lpipb->dwUniqueWord * 100);
            fRet = (*pCallbackInfo->MessageFunc)
                (ERRFLAG_STATUS, pCallbackInfo->pUserData, &Info);
            if (S_OK != fRet)
                goto FreeNodes;
        }

        if ((fRet = AddRecordToLeaf (lpipb)) != S_OK)
            goto FreeNodes;

        // pInFile->pCurrent points to the record size
        // SAFE_SLACK is just an arbitrary amount of slack to minimize
        // running out of data.
        // kevynct: pCurrent points to a record length which does not include
        // the DWORD record len size, so we add this when checking. Actually,
        // we add twice that to be safe.
        if (pInFile->cbLeft <= SAFE_SLACK ||
            (LONG)(GETLONG ((LPUL)(pInFile->pCurrent)) +
                2 * sizeof(DWORD)) >= pInFile->cbLeft)
        {
            // Slide the unread tail to the front and top the buffer up
            MEMMOVE (pInFile->pMem, pInFile->pCurrent, pInFile->cbLeft);
            if ((pInFile->cbLeft += FileRead (pInFile->fFile,
                pInFile->pMem + pInFile->cbLeft,
                pInFile->dwMax - pInFile->cbLeft, phr)) < 0)
            {
                fRet = *phr;
                goto FreeNodes;
            }
            pInFile->dwMax = pInFile->cbLeft;
            pInFile->pCurrent = pInFile->pMem;
        }
    } while (fRet == S_OK && pInFile->cbLeft);

    // Flush anything left in the output buffer
    if ((fRet = FlushAllNodes (lpipb)) != S_OK)
        goto FreeNodes;

    // Write out the sigma table
    if (lpipb->idxf & IDXF_NORMALIZE)
    {
        pHeader->WeightTabOffset = pOutFile->foPhysicalOffset;
        pHeader->WeightTabSize =
            (LCB)((lpipb->dwMaxTopicId + 1) * sizeof (SIGMA));
        if (FileWrite (pOutFile->fFile, lpipb->wi.hrgsigma,
            pHeader->WeightTabSize, phr) != (LONG)pHeader->WeightTabSize)
        {
            fRet = *phr;
            goto FreeNodes;
        }
        pOutFile->foStartOffset =
            FoAddDw (pOutFile->foStartOffset, pHeader->WeightTabSize);
    }

    // Copy info to header
    pHeader->FileStamp = INDEX_STAMP;
    pHeader->version = VERCURRENT;
    pHeader->occf = lpipb->occf;
    pHeader->idxf = lpipb->idxf;
    pHeader->lcTopics = lpipb->lcTopics;
    pHeader->dwMaxTopicId = lpipb->dwMaxTopicId;
    pHeader->dwMaxFieldId = lpipb->dwMaxFieldId;
    pHeader->dwMaxWCount = lpipb->dwMaxWCount;
    pHeader->dwMaxOffset = lpipb->dwMaxOffset;
    pHeader->dwMaxWLen = lpipb->dwMaxWLen;
    pHeader->dwTotalWords = lpipb->dwIndexedWord;       // Total indexed words
    pHeader->dwUniqueWords = lpipb->dwUniqueWord;       // Total unique words
    pHeader->dwTotal2bWordLen = lpipb->dwTotal2bWordLen;
    pHeader->dwTotal3bWordLen = lpipb->dwTotal3bWordLen;
    pHeader->dwUniqueWordLen = lpipb->dwTotalUniqueWordLen;
    pHeader->ckeyTopicId = lpipb->cKey[CKEY_TOPIC_ID];
    pHeader->ckeyOccCount = lpipb->cKey[CKEY_OCC_COUNT];
    iIndex = CKEY_OCC_BASE;
    if (pHeader->occf & OCCF_COUNT)
        pHeader->ckeyWordCount = lpipb->cKey[iIndex++];
    if (pHeader->occf & OCCF_OFFSET)
        pHeader->ckeyOffset = lpipb->cKey[iIndex];

    if (FileSeekWrite (pOutFile->fFile, (LPB)pHeader, MakeFo (0, 0),
        sizeof (IH20), phr) != sizeof (IH20))
    {
        fRet = *phr;
        goto FreeNodes;
    }

    // Tell the user callback that this phase is complete
    if (lpipb->CallbackInfo.dwFlags & ERRFLAG_STATUS)
    {
        PFCALLBACK_MSG pCallbackInfo = &lpipb->CallbackInfo;
        CALLBACKINFO Info;

        Info.dwPhase = 3;
        Info.dwIndex = 100;
        fRet = (*pCallbackInfo->MessageFunc)
            (ERRFLAG_STATUS, pCallbackInfo->pUserData, &Info);
        if (S_OK != fRet)
            goto FreeNodes;
    }
    fRet = S_OK;

    // Cleanup ladder: success and failure both fall through from the
    // entry point that matches the resources acquired so far.
FreeNodes:
    // Free all node blocks (the array is NULL terminated)
    for (iNode = 0; pTreeData->rgpNodeInfo[iNode] != NULL; iNode++)
    {
        FreeBTreeNode (pTreeData->rgpNodeInfo[iNode]);
        pTreeData->rgpNodeInfo[iNode] = NULL;   // Leave no dangling pointers
    }

FreeOutputBuffer:
    FreeHandle (pOutFile->hMem);

CloseOutput:
    // Close the handle we opened ourselves, never the caller's hfpbPerm
    if (fOpenedFile)
        FileClose (pOutFile->fFile);

FreeInputBuffer:
    FreeHandle (pInFile->hMem);

CloseInput:
    FileClose (pInFile->fFile);
    if ((lpipb->idxf & KEEP_TEMP_FILE) == 0)
        FileUnlink (NULL, lpszTemp, REGULAR_FILE);
    return fRet;
} /* BuildBTree */
/*************************************************************************
* * @doc PRIVATE INDEXING * * @func HRESULT | AddRecordToLeaf | * Add the record pointed to by pDtreeData->OutFile->pCurrent to the B-Tree * contained in the structure. * * @parm _LPIPB | lpipb | * Pointer to the index parameter block * * @rdesc Returns S_OK on success or errors if failed * *************************************************************************/ #ifdef _DEBUG
static BYTE LastWord[4000] = {0}; static BYTE CurWord[4000] = {0}; #endif
HRESULT PASCAL AddRecordToLeaf (_LPIPB lpipb) { // Local Replacement Variables
PBTREEDATA pTreeData = &lpipb->BTreeData; PFILEDATA pOutFile = &lpipb->OutFile; // Output data
PFILEDATA pInFile = &lpipb->InFile; // Input data
HFPB fOutput = pOutFile->fFile; // Output file
HFPB fInput = lpipb->InFile.fFile; // Input file
LPB pInCurPtr = lpipb->InFile.pCurrent; // Input buffer
PNODEINFO pNode; LPB lpbWord; // Pointer to the word string
OCCF occf = lpipb->occf; // Working Variables
DWORD dwTopicCount; // Number of topic in record
DWORD dwFieldId; DWORD dwBlockSize; // Size of the entire occ block
LPB pDest; WORD uStringSize; ERRB errb;
// We always start from the leaf node
pNode = pTreeData->rgpNodeInfo[0];
// Set pointer to working buffer
pDest = pNode->pTmpResult; // Advance input buffer to the word string
pInCurPtr += sizeof (DWORD); lpbWord = pInCurPtr;
// Insert the word into the buffer
pDest += PrefixCompressWord (pDest, pInCurPtr, pNode->pLastWord, pTreeData->fOccfLength); // Get the word length
uStringSize = GETWORD((LPUW)pInCurPtr); lpipb->dwTotalUniqueWordLen += uStringSize; // Adjust for the word length storage
uStringSize += sizeof(SHORT); // Skip the word
pInCurPtr += uStringSize; #ifdef _DEBUG
STRCPY (LastWord, CurWord); MEMCPY (CurWord, lpbWord + 2, GETWORD((LPUW)lpbWord)); CurWord[GETWORD((LPUW)lpbWord)] = 0; if (STRCMP (LastWord, CurWord) > 0) SetErrCode (NULL, E_ASSERT); // if (STRCMP (CurWord, "forbidden") == 0)
// _asm int 3;
#endif
// If OccfLength is set skip it now
// (It has already been appended to the compressed word)
if (pTreeData->fOccfLength) pInCurPtr += CbByteUnpack(&dwBlockSize, pInCurPtr);
// Copy the FieldID
if (occf & OCCF_FIELDID) { CbByteUnpack (&dwFieldId, pInCurPtr); do { *pDest++ = *pInCurPtr; } while (*pInCurPtr++ & 0x80); }
// Get Topic Count
#if 0
CbByteUnpack (&dwTopicCount, pInCurPtr); do { *pDest++ = *pInCurPtr; } while (*pInCurPtr++ & 0x80); #else
dwTopicCount = GETLONG((LPUL)pInCurPtr); pInCurPtr += sizeof(DWORD); pDest += CbBytePack(pDest, dwTopicCount); #endif
// Check to see if this entry will fit in the leaf node
// We can't write the data block until we know where the entry
// will be stored. We must add in FOFFSET_SIZE to our current location
// to determine size. We ignore the block size field, so we might encroach
// on the slack by a few bytes.
if (pNode->cbLeft - pNode->Slack < (SHORT)(pDest -pNode->pTmpResult +FOFFSET_SIZE)) { HRESULT fRet;
if ((fRet = AddRecordToStem (lpipb, lpbWord)) != S_OK) return(fRet); // If the prefix count is zero, no problem
// Else we have to re-copy the word, since we are in a new leaf node
if (0 != pNode->pTmpResult[1]) { dwBlockSize = PrefixCompressWord (pNode->pTmpResult, lpbWord, pNode->pLastWord, pTreeData->fOccfLength); pDest = pNode->pTmpResult + dwBlockSize; if (occf & OCCF_FIELDID) pDest += CbBytePack (pDest, dwFieldId); pDest += CbBytePack (pDest, dwTopicCount); } } // Save new word as last word
MEMCPY (pNode->pLastWord, lpbWord, uStringSize + 2);
// Set pointer to beginning of data block
pDest += CopyFileOffset (pDest, pOutFile->foPhysicalOffset); // Update the bytes left
pInFile->cbLeft -= (LONG) (pInCurPtr - pInFile->pCurrent); #ifdef _DEBUG
if (pInFile->cbLeft <= 0) SetErrCode (NULL, E_ASSERT); #endif
// Compress data block to output buffer and store it's compressed size
pInFile->pCurrent = pInCurPtr; if ((dwBlockSize = WriteDataNode (lpipb, dwTopicCount, &errb)) == 0) return errb; pDest += CbBytePack (pDest, dwBlockSize);
// Copy the temp buffer to the real node
dwBlockSize = (DWORD)(pDest - pNode->pTmpResult); MEMCPY (pNode->pCurPtr, pNode->pTmpResult, dwBlockSize); pNode->pCurPtr += dwBlockSize; pNode->cbLeft -= (WORD)dwBlockSize;
return S_OK; }
/*************************************************************************
 *
 *  @doc    PRIVATE INDEXING
 *
 *  @func   HRESULT | AddRecordToStem |
 *      Called when the leaf node is full.  Adds the leaf's last word as
 *      a key to the first stem node, creating/flushing stem nodes as
 *      necessary, then flushes and resets the leaf node.
 *
 *  @parm   _LPIPB | lpipb |
 *      Pointer to the index parameter block
 *
 *  @parm   LPB | lpbWord |
 *      The word that triggered the split.  Not referenced here: the key
 *      stored in the stem is the last word of the full leaf node.
 *
 *  @rdesc  S_OK if successful, or errors if failed
 *
 *************************************************************************/
HRESULT PASCAL AddRecordToStem (_LPIPB lpipb, LPB lpbWord)
{
    SHORT       CurLevel = 0;
    PNODEINFO   pStemNode;
    PNODEINFO   pLastNode;
    PBTREEDATA  pTreeData = &lpipb->BTreeData;
    PNODEINFO   pLeafNode = pTreeData->rgpNodeInfo[0];
    LPB         pLastWord;
    int         cbTemp;
    HRESULT     fRet;

    // Move up through stem nodes until space can be found/made
    pStemNode = pLeafNode;
    do
    {
        pLastWord = pStemNode->pLastWord;
        pStemNode = pTreeData->rgpNodeInfo[++CurLevel];
        if (pStemNode == NULL)
        {
            // Create a new stem node
            if ((pStemNode = pTreeData->rgpNodeInfo[CurLevel] =
                AllocBTreeNode (lpipb)) == NULL)
                return SetErrCode (NULL, E_OUTOFMEMORY);
            pStemNode->Slack = STEM_SLACK;
            pStemNode->cbLeft =
                lpipb->BTreeData.Header.dwBlockSize - sizeof(WORD);
            if (++pTreeData->Header.cIdxLevels > MAX_TREE_HEIGHT)
                return E_TREETOOBIG;
        }
        pTreeData->Header.nidLast++;
    } while (pStemNode->cbLeft - pStemNode->Slack <
        (SHORT)(GETWORD ((LPUW)pLastWord) + sizeof (SHORT) + FOFFSET_SIZE));

    // Work back down through the nodes clearing them to disk
    while (CurLevel > 1)
    {
        pLastNode = pTreeData->rgpNodeInfo[--CurLevel];
        pLastWord = pLastNode->pLastWord;

        // Copy word to stem node.  A zero length means nothing was
        // stored; report it instead of returning S_OK.
        if ((cbTemp = PrefixCompressWord (pStemNode->pCurPtr, pLastWord,
            pStemNode->pLastWord, pTreeData->fOccfLength)) == 0)
        {
            return SetErrCode (NULL, E_ASSERT);
        }
        pStemNode->pCurPtr += cbTemp;

        // Update the last word in the stem node
        MEMCPY (pStemNode->pLastWord, pLastWord,
            GETWORD((LPUW)pLastWord) + 2 * sizeof(WORD));

        // Set pointer in stem node: the lower node is written at the
        // current physical offset just below
        CopyFileOffset (pStemNode->pCurPtr, lpipb->OutFile.foPhysicalOffset);
        pStemNode->pCurPtr += FOFFSET_SIZE;
        pStemNode->cbLeft -= FOFFSET_SIZE + cbTemp;
#ifdef _DEBUG
        if (pStemNode->cbLeft <= 0)
            SetErrCode (NULL, E_ASSERT);
#endif

        // Flush the lower stem node; it becomes the node to fill next
        pStemNode = pTreeData->rgpNodeInfo[CurLevel];
        if ((fRet = WriteStemNode (lpipb, pStemNode)) != S_OK)
            return(fRet);
    }

    // Clear the leaf node into the first stem node & reset it
    // Copy last word to stem node
    if ((cbTemp = PrefixCompressWord (pStemNode->pCurPtr,
        pLeafNode->pLastWord, pStemNode->pLastWord,
        pTreeData->fOccfLength)) == 0)
    {
        return SetErrCode (NULL, E_ASSERT);
    }
    pStemNode->pCurPtr += cbTemp;
    pStemNode->cbLeft -= cbTemp;
#ifdef _DEBUG
    if (pStemNode->cbLeft <= 0)
        SetErrCode (NULL, E_ASSERT);
#endif

    // Update the last word in the stem node
    MEMCPY (pStemNode->pLastWord, pLeafNode->pLastWord,
        GETWORD((LPUW)(pLeafNode->pLastWord)) + 2 * sizeof(WORD));

    // Set pointer to the leaf node
    CopyFileOffset (pStemNode->pCurPtr, lpipb->OutFile.foPhysicalOffset);
    pStemNode->pCurPtr += FOFFSET_SIZE;
    pStemNode->cbLeft -= FOFFSET_SIZE;
#ifdef _DEBUG
    if (pStemNode->cbLeft <= 0)
        SetErrCode (NULL, E_ASSERT);
#endif

    // Flush leaf node to output buffer and reset it
    return WriteLeafNode (lpipb);
}
/*************************************************************************
 *
 *  @doc    PRIVATE INDEXING
 *
 *  @func   int | CompressDword |
 *      Compresses a value into the output buffer using a high bit
 *      encoding method: seven data bits per byte, low-order group first,
 *      with bit 7 set on every byte except the last.  If the buffer is
 *      nearly full it is flushed to the file first.
 *
 *  @parm   PFILEDATA | pOutput |
 *      Pointer to output buffer info
 *
 *  @parm   DWORD | dwValue |
 *      The value to compress
 *
 *  @rdesc  Returns the number of compressed bytes buffered (at least 1)
 *
 *************************************************************************/
int PASCAL CompressDword (PFILEDATA pOutput, DWORD dwValue)
{
    LPB     pDest = pOutput->pCurrent;
    int     cBytes = 0;         // Count of compressed bytes
    ERRB    errb;

    // Any room left in output buffer?  Compare as signed so that a
    // cbLeft that has already gone negative still forces a flush.
    if (pOutput->cbLeft <= (LONG)(sizeof(DWORD) * 2))
    {
        DWORD dwSize = (DWORD)(pDest - pOutput->pMem);

        // NOTE(review): a failed write is not reported - the int return
        // value has no error channel.
        FileWrite (pOutput->fFile, pOutput->pMem, dwSize, &errb);
        pDest = pOutput->pMem;
        pOutput->cbLeft = pOutput->dwMax;
        pOutput->foStartOffset = FoAddDw (pOutput->foStartOffset, dwSize);
    }

    // Always emit at least one byte so that zero is stored as a single
    // 0x00 byte; unpacking consumes bytes until one has bit 7 clear.
    do
    {
        *pDest = (BYTE)(dwValue & 0x7F);
        cBytes++;
        dwValue >>= 7;
        if (dwValue != 0)
            *pDest |= 0x80;
        pDest++;
    } while (dwValue);

    pOutput->pCurrent = pDest;
    pOutput->foPhysicalOffset =
        FoAddDw (pOutput->foPhysicalOffset, (DWORD)cBytes);
    pOutput->cbLeft -= cBytes;
#ifdef _DEBUG
    if (pOutput->cbLeft <= 0)
        SetErrCode (NULL, E_ASSERT);
#endif
    return cBytes;
}
/*************************************************************************
* * @doc PRIVATE INDEXING * * @func DWORD | WriteDataNode | * Compresses the input stream into the output buffer. If the buffer * is full it will be flushed to a file. * * @parm _LPIPB | lpipb | * Pointer to global buffer * * @parm DWORD | dwTopicCount | * The number of topics in the input stream * * @parm PHRESULT | phr | * Error buffer * * @rdesc Returns the number of compressed bytes written * *************************************************************************/
PUBLIC DWORD PASCAL FAR WriteDataNode (_LPIPB lpipb, DWORD dwTopicCount, PHRESULT phr) { // Local replacement Variables
PBTREEDATA pTreeData = &lpipb->BTreeData; PFILEDATA pOutput = &lpipb->OutFile; // Output data structure
PFILEDATA pInFile = &lpipb->InFile; // Input data structre
HFPB fFile = pOutput->fFile; // Output file handle
// Working Variables
DWORD dwBlockSize; // Size of block to compress
DWORD dwEncodedSize = 0; // Size of encoded block
DWORD dwTopicIdDelta; // Really only used for weight values
DWORD TopicLoop; DWORD dwSlackSize; DWORD loop; DWORD dwTemp; FILEOFFSET foStart; // Physical beginning of bit compression block
FLOAT rTerm; // Only used when IDXF_NORMALIZE is set
FLOAT rWeight; // Only used when IDXF_NORMALIZE is set
WORD wWeight; // Only used when IDXF_NORMALIZE is set
DWORD dwTopicId = 0; // Only used when IDXF_NORMALIZE is set
int cbTemp; // # of compressed bytes that uncompressed
OCCF occf = lpipb->occf; HRESULT fRet;
foStart = pOutput->foPhysicalOffset; wWeight = 0; // UNDONE: Don't need it
for (TopicLoop = dwTopicCount; TopicLoop > 0; --TopicLoop) { // Move to the byte boundary
if (pOutput->ibit != cbitBYTE - 1) { pOutput->ibit = cbitBYTE - 1; if (--pOutput->cbLeft) { pOutput->pCurrent++; pOutput->foPhysicalOffset = FoAddDw (pOutput->foPhysicalOffset, 1); } else { if (FileWrite (pOutput->fFile, pOutput->pMem, dwTemp = (DWORD)(pOutput->pCurrent - pOutput->pMem), phr) != (LONG)dwTemp) return(0); pOutput->pCurrent = pOutput->pMem; pOutput->cbLeft = pOutput->dwMax; pOutput->foStartOffset = FoAddDw(pOutput->foStartOffset, dwTemp); #ifdef _DEBUG
MEMSET (pOutput->pMem, 0, pOutput->dwMax); #endif
} } // Store TopicId as necessary
if (pInFile->cbLeft < 2 * sizeof (DWORD)) { MEMMOVE (pInFile->pMem, pInFile->pCurrent, pInFile->cbLeft); pInFile->cbLeft += FileRead (pInFile->fFile, pInFile->pMem + pInFile->cbLeft, pInFile->dwMax - pInFile->cbLeft, phr); pInFile->dwMax = pInFile->cbLeft; pInFile->pCurrent = pInFile->pMem; } cbTemp = CbByteUnpack (&dwTopicIdDelta, pInFile->pCurrent); dwTopicId += dwTopicIdDelta; // Get the real TopicID
if ((fRet = FAddDword (pOutput, dwTopicIdDelta, lpipb->cKey[CKEY_TOPIC_ID])) != S_OK) { SetErrCode(phr, fRet); return(0); } pInFile->pCurrent += cbTemp; pInFile->cbLeft -= cbTemp; if (occf & OCCF_HAVE_OCCURRENCE) { // Get number of occ data records for this topic
if (pInFile->cbLeft < 2 * sizeof (DWORD)) { MEMMOVE (pInFile->pMem, pInFile->pCurrent, pInFile->cbLeft); pInFile->cbLeft += FileRead (pInFile->fFile, pInFile->pMem + pInFile->cbLeft, pInFile->dwMax - pInFile->cbLeft, phr); pInFile->dwMax = pInFile->cbLeft; pInFile->pCurrent = pInFile->pMem; } cbTemp = CbByteUnpack (&dwBlockSize, pInFile->pCurrent); pInFile->pCurrent += cbTemp; pInFile->cbLeft -= cbTemp; }
// If we are term weighing we have to calculate the weight
if (lpipb->idxf & IDXF_NORMALIZE) { #ifndef ISBU_IR_CHANGE
// log10(x/y) == log10 (x) - log10 (y). Since x in our case is a known constant,
// 100,000,000, I'm replacing that with its equivalent log10 value of 8.0 and subtracting
// the log10(y) from it
rTerm = (float) (8.0 - log10((double) dwTopicCount)); // In extreme cases, rTerm could be 0 or even -ve (when dwTopicCount approaches or
// exceeds 100,000,000)
if (rTerm <= (float) 0.0) rTerm = cVerySmallWt; // very small value. == log(100 mil/ 95 mil)
// NOTE : rWeight for the doc term would be as follows:
// rWeight = float(min(4096, dwBlockSize)) * rTerm / lpipb->wi.hrgsigma[dwTopicId]
//
// Since rTerm needs to be recomputed again for the query term weight computation,
// and since rTerm will be the same value for the current term ('cos N and n of log(N/n)
// are the same (N = 100 million and n is whatever the doc term freq is for the term),
// we will factor in the second rTerm at index time. This way, we don't have to deal
// with rTerm at search time (reduces computation and query time shortens)
//
// MV 2.0 initially did the same thing. However, BinhN removed the second rTerm
// because he decided to remove the rTerm altogether from the query term weight. He
// did that to keep the scores reasonably high.
rWeight = ((float) min(cTFThreshold, dwBlockSize)) * rTerm * rTerm / lpipb->wi.hrgsigma[dwTopicId]; // without the additional rTerm, we would probably be between 0.0 and 1.0
if (rWeight > rTerm) wWeight = 0xFFFF; else wWeight = (WORD) ((float)0xFFFF * rWeight / rTerm); #else
rTerm = (float) (65535.0 * 8) / (float)dwTopicCount; rWeight = (float)dwBlockSize * rTerm / lpipb->wi.hrgsigma[dwTopicId]; if (rWeight >= 65535.0) wWeight = 65335; else wWeight = (WORD)rWeight; #endif // ISBU_IR_CHANGE
// Write the weight to the output buffer
if ((fRet = FWriteBits (&lpipb->OutFile, (DWORD)wWeight, (BYTE)(sizeof (WORD) * cbitBYTE))) != S_OK) { SetErrCode (phr, fRet); return(0); } }
// Don't do anything else if there is nothing else to do!!!
if ((occf & OCCF_HAVE_OCCURRENCE) == 0) continue; // Write the OccCount
if ((fRet = FAddDword (pOutput, dwBlockSize, lpipb->cKey[CKEY_OCC_COUNT])) != S_OK) { SetErrCode (phr, fRet); return(0); }
// Encode the occ block
for (loop = dwBlockSize; loop > 0; loop--) { int iIndex; iIndex = CKEY_OCC_BASE; // Make sure input buffer holds enough data
if (pInFile->cbLeft < 5 * sizeof (DWORD)) { MEMMOVE (pInFile->pMem, pInFile->pCurrent, pInFile->cbLeft); pInFile->cbLeft += FileRead (pInFile->fFile, pInFile->pMem + pInFile->cbLeft, pInFile->dwMax - pInFile->cbLeft, phr); pInFile->dwMax = pInFile->cbLeft; pInFile->pCurrent = pInFile->pMem; }
if (occf & OCCF_COUNT) { cbTemp = CbByteUnpack (&dwTemp, pInFile->pCurrent); pInFile->pCurrent += cbTemp; pInFile->cbLeft -= cbTemp; if ((fRet = FAddDword (pOutput, dwTemp, lpipb->cKey[iIndex])) != S_OK) { SetErrCode (phr, fRet); return(0); } iIndex++; } if (occf & OCCF_OFFSET) { cbTemp = CbByteUnpack (&dwTemp, pInFile->pCurrent); pInFile->pCurrent += cbTemp; pInFile->cbLeft -= cbTemp; if ((fRet = FAddDword (pOutput, dwTemp, lpipb->cKey[iIndex])) != S_OK) { SetErrCode (phr, fRet); return(0); } } } } // Advance to next byte (we are partially through a byte now)
pOutput->ibit = cbitBYTE - 1; pOutput->pCurrent++; pOutput->foPhysicalOffset = FoAddDw (pOutput->foPhysicalOffset, 1); pOutput->cbLeft--; #ifdef _DEBUG
if (pOutput->cbLeft <= 0) SetErrCode (NULL, E_ASSERT); #endif
dwEncodedSize += DwSubFo (pOutput->foPhysicalOffset, foStart); // Leave slack space, but not for uncommon words
if (dwTopicCount <= 2) dwSlackSize = 0; else dwSlackSize = dwEncodedSize / 10;
dwEncodedSize += dwSlackSize; // Keep a running total of all allocated slack space
pTreeData->Header.dwSlackCount += dwSlackSize;
while (dwSlackSize) { if (pOutput->cbLeft < (LONG)dwSlackSize) { // The slack block doesn't fit in the output buffer
// Write as much as we can then flush the buffer and write the rest
// MEMSET (pOutput->pCurrent, 0, pOutput->cbLeft);
DWORD dwSize; dwSlackSize -= pOutput->cbLeft; if (0 == FileWrite (fFile, pOutput->pMem, dwSize = pOutput->dwMax, phr)) { return 0; } pOutput->pCurrent = pOutput->pMem; pOutput->foPhysicalOffset = FoAddDw (pOutput->foPhysicalOffset, pOutput->cbLeft); pOutput->cbLeft = pOutput->dwMax; pOutput->foStartOffset = FoAddDw(pOutput->foStartOffset, dwSize); } else { // The slack fits, no problems
MEMSET (pOutput->pCurrent, 0, dwSlackSize); pOutput->pCurrent += dwSlackSize; pOutput->foPhysicalOffset = FoAddDw (pOutput->foPhysicalOffset, dwSlackSize); pOutput->cbLeft -= dwSlackSize; #ifdef _DEBUG
if (pOutput->cbLeft <= 0) SetErrCode (NULL, E_ASSERT); #endif
dwSlackSize = 0; } } return dwEncodedSize; }
/*************************************************************************
 *
 *  @doc    PRIVATE INDEXING
 *
 *  @func   HRESULT | WriteStemNode |
 *      Flushes a stem node in the BTree to the output buffer.  The node
 *      goes out as a 2-byte cbLeft value followed by the first
 *      (block size - 2) bytes of the node buffer.  Once flushed, the
 *      node is reset to the beginning and filled with zeros.
 *
 *  @parm   _LPIPB | lpipb |
 *      Pointer the IPB structure
 *
 *  @parm   PNODEINFO | pNode |
 *      Pointer to the node to flush
 *
 *  @rdesc  S_OK, or the FileWrite error if the output buffer had to be
 *      flushed to the file and that write failed
 *
 *************************************************************************/
PRIVATE HRESULT PASCAL WriteStemNode (_LPIPB lpipb, PNODEINFO pNode)
{
    PBTREEDATA  pTree = &lpipb->BTreeData;
    PFILEDATA   pOut = &lpipb->OutFile;         // Output structure
    DWORD       cbBlock = lpipb->BTreeData.Header.dwBlockSize;
    DWORD       cbPayload;                      // Node bytes after cbLeft
    LPB         pbWrite;                        // Where the payload goes
    ERRB        errb;

    // cbLeft is stored in 2 bytes to simplify the work of update
    *(LPUW)(pOut->pCurrent) = (WORD)pNode->cbLeft;
    pOut->pCurrent += sizeof(WORD);
    pOut->cbLeft -= sizeof(WORD);
    pOut->foPhysicalOffset =
        FoAddDw (pOut->foPhysicalOffset, (DWORD)sizeof(WORD));

    cbPayload = cbBlock - sizeof(WORD);
    pbWrite = pOut->pCurrent;

    // Keep a running total of all allocated slack space
    pTree->Header.dwSlackCount += pNode->cbLeft;

    // The payload is copied in one piece, which is why the buffer must
    // be >= BTREE_NODE_SIZE.  A loop would lift that restriction, but
    // it is probably not worth it.  (See also WriteLeafNode)
    if (pOut->cbLeft < (LONG)cbPayload)
    {
        LONG cbPending = (DWORD)(pbWrite - pOut->pMem);

        if (FileWrite (pOut->fFile, pOut->pMem, cbPending, &errb)
            != cbPending)
        {
            return(errb);
        }
        pbWrite = pOut->pMem;
        pOut->cbLeft = pOut->dwMax;
        pOut->foStartOffset = FoAddDw (pOut->foStartOffset, cbPending);
    }

    MEMCPY (pbWrite, pNode->pBuffer, cbPayload);
    pOut->foPhysicalOffset = FoAddDw (pOut->foPhysicalOffset, cbPayload);
    pOut->cbLeft -= cbPayload;
#ifdef _DEBUG
    if (pOut->cbLeft <= 0)
        SetErrCode (NULL, E_ASSERT);
#endif

    // Set the external variable
    pOut->pCurrent = pbWrite + cbPayload;

    // Set to all zeros so we know when we have reached the end of data
    // later, then rewind the node for reuse
    MEMSET (pNode->pBuffer, 0, cbBlock);
    pNode->cbLeft = cbBlock - sizeof(WORD);
    pNode->pCurPtr = pNode->pBuffer;
    *(PUSHORT)pNode->pLastWord = 0;
    return(S_OK);
}
/*************************************************************************
 *
 *  @doc    PRIVATE INDEXING
 *
 *  @func   HRESULT | WriteLeafNode |
 *      Flushes the leaf node of the BTree (rgpNodeInfo[0]) to the output
 *      buffer. Once flushed, the node buffer is filled with zeros and
 *      reset to its beginning.
 *
 *  @parm   _LPIPB | lpipb |
 *      Pointer to index block
 *
 *  @rdesc  S_OK on success, otherwise the ERRB value filled in by the
 *      failing FileSeekWrite / FileWrite call
 *
 *  @comm
 *      Bytes emitted for one leaf, in order:
 *      - FOFFSET_SIZE zero bytes reserved for the pointer to the next
 *        leaf (backpatched the next time this function runs),
 *      - a 2-byte copy of the node's cbLeft,
 *      - dwBlockSize - FOFFSET_SIZE - sizeof(WORD) bytes of node data.
 *************************************************************************/
PRIVATE HRESULT PASCAL NEAR WriteLeafNode (_LPIPB lpipb)
{
    // Local Replacement Variables
    PBTREEDATA pTreeData = &lpipb->BTreeData;
    PFILEDATA  pOutput = &lpipb->OutFile;           // Output data structure
    LPB        pDest = pOutput->pCurrent;           // Output buffer
    FILEOFFSET OffsetPointer = pTreeData->OffsetPointer;
    FILEOFFSET foPhysicalOffset = pOutput->foPhysicalOffset;
    PNODEINFO  pNode = pTreeData->rgpNodeInfo[0];   // Leaf node
    LPB        pStart = pNode->pBuffer;             // Beginning of the node buffer

    // Working Variables
    DWORD      dwLeft;
    FILEOFFSET StartOffset;     // Physical offset of the beginning
                                // of the output buffer
    ERRB       errb;

    // Backpatch the current offset to the last node's pointer
    if (!FoIsNil (OffsetPointer))
    {
        StartOffset = FoSubFo (foPhysicalOffset,
            MakeFo ((DWORD)(pDest - pOutput->pMem), 0));

        // Is the backpatch location in the output buffer?
        if (FoCompare (OffsetPointer, StartOffset) >= 0)
        {
            CopyFileOffset (pOutput->pMem + DwSubFo (OffsetPointer, StartOffset),
                foPhysicalOffset);
        }
        else
        {
            // The location has already been written out: patch it in the
            // file, then seek back to where the buffer will be written.
            // NOTE(review): only sizeof (DWORD) bytes are patched here while
            // the in-buffer path goes through CopyFileOffset -- confirm the
            // two encodings agree.
            if (FileSeekWrite (pOutput->fFile, &foPhysicalOffset, OffsetPointer,
                sizeof (DWORD), &errb) != sizeof (DWORD))
                return(errb);
            FileSeek (pOutput->fFile, StartOffset, 0, NULL);
        }
    }

    // Set the backpatch location for next time
    pTreeData->OffsetPointer = foPhysicalOffset;

    // Skip the record pointer for this record (will be backpatched next time)
    if (pOutput->cbLeft <= 0)
    {
        // Compute the length first: assigning it inside the FileWrite
        // argument list and reading it on the other side of != is
        // unsequenced.
        LONG dwSize = (DWORD)(pDest - pOutput->pMem);

        if (FileWrite (pOutput->fFile, pOutput->pMem, dwSize, &errb) != dwSize)
            return(errb);
        pDest = pOutput->pMem;
        pOutput->cbLeft = pOutput->dwMax;
        pOutput->foStartOffset = FoAddDw(pOutput->foStartOffset, dwSize);
    }
    MEMSET (pDest, 0, FOFFSET_SIZE);
    pOutput->cbLeft -= FOFFSET_SIZE;
#ifdef _DEBUG
    if (pOutput->cbLeft <= 0)
        SetErrCode (NULL, E_ASSERT);
#endif
    pOutput->pCurrent = pDest + FOFFSET_SIZE;
    pOutput->foPhysicalOffset = FoAddDw (foPhysicalOffset, FOFFSET_SIZE);

    // cbLeft is stored as a plain 2-byte value (not compressed) to simplify
    // the work of update
    *(LPUW)(pOutput->pCurrent) = (WORD)pNode->cbLeft;
    pOutput->foPhysicalOffset = FoAddDw (pOutput->foPhysicalOffset,
        (DWORD)sizeof(WORD));
    pOutput->cbLeft -= sizeof(WORD);
    dwLeft = pTreeData->Header.dwBlockSize - FOFFSET_SIZE - sizeof(WORD);
    pOutput->pCurrent += sizeof(WORD);
    pDest = pOutput->pCurrent;

    // Keep a running total of all allocated slack space
    pTreeData->Header.dwSlackCount += pNode->cbLeft;

    // This is why the buffer must be >= BTREE_NODE_SIZE
    // This could be put in a loop to avoid that restriction, but it
    // is probably not worth it. (See also WriteStemNode)
    if (pOutput->cbLeft < (LONG)dwLeft)
    {
        LONG dwSize = (DWORD)(pDest - pOutput->pMem);

        if (FileWrite (pOutput->fFile, pOutput->pMem, dwSize, &errb) != dwSize)
            return(errb);
        pDest = pOutput->pMem;
        pOutput->cbLeft = pOutput->dwMax;
        pOutput->foStartOffset = FoAddDw(pOutput->foStartOffset, dwSize);
    }
    MEMCPY (pDest, pStart, dwLeft);
    pOutput->foPhysicalOffset = FoAddDw (pOutput->foPhysicalOffset, dwLeft);
    pOutput->cbLeft -= dwLeft;
#ifdef _DEBUG
    if (pOutput->cbLeft <= 0)
        SetErrCode (NULL, E_ASSERT);
#endif
    pOutput->pCurrent = pDest + dwLeft;

    // Reset buffer back to beginning
    MEMSET (pNode->pBuffer, 0, pTreeData->Header.dwBlockSize);
    pNode->pCurPtr = pNode->pBuffer;

    // Set the bytes left in node block
    pNode->cbLeft = pTreeData->Header.dwBlockSize - FOFFSET_SIZE - sizeof(WORD);

    // Empty "last word" so the next word in this node is stored in full
    *(PUSHORT)pNode->pLastWord = 0;
    return(S_OK);
}
/*************************************************************************
 *  @doc    PRIVATE INDEXING
 *
 *  @func   PNODEINFO | AllocBTreeNode |
 *      Allocates a node structure together with the three buffers it
 *      owns: the node data buffer, the last-word buffer and the temporary
 *      result buffer. Each buffer is dwBlockSize bytes and zero
 *      initialised.
 *
 *  @parm   _LPIPB | lpipb |
 *      Pointer to index parameter block; supplies the BTree block size
 *
 *  @rdesc  Pointer to the newly allocated node, or NULL if any
 *      allocation failed (E_OUTOFMEMORY is reported through SetErrCode
 *      and everything allocated so far is released)
 *************************************************************************/
PUBLIC PNODEINFO PASCAL FAR AllocBTreeNode (_LPIPB lpipb)
{
    PNODEINFO pNode;

    // Node structure
    pNode = GlobalLockedStructMemAlloc (sizeof (NODEINFO));
    if (pNode == NULL)
        goto fail;

    // Node data buffer
    pNode->dwBlockSize = lpipb->BTreeData.Header.dwBlockSize;
    pNode->hMem = _GLOBALALLOC (DLLGMEM_ZEROINIT, pNode->dwBlockSize);
    if (pNode->hMem == NULL)
        goto fail_node;
    pNode->pBuffer = (LPB)_GLOBALLOCK (pNode->hMem);
    pNode->pCurPtr = pNode->pBuffer;

    // Last-word buffer; the maximum word length is the block size
    pNode->hLastWord = _GLOBALALLOC (DLLGMEM_ZEROINIT, pNode->dwBlockSize);
    if (pNode->hLastWord == NULL)
        goto fail_mem;
    pNode->pLastWord = (LPB)_GLOBALLOCK (pNode->hLastWord);

    // Temporary result buffer
    pNode->hTmp = _GLOBALALLOC (DLLGMEM_ZEROINIT, pNode->dwBlockSize);
    if (pNode->hTmp == NULL)
        goto fail_lastword;
    pNode->pTmpResult = (LPB)_GLOBALLOCK (pNode->hTmp);

    return pNode;

    // Failure ladder: undo the allocations in reverse order
fail_lastword:
    FreeHandle (pNode->hLastWord);
fail_mem:
    FreeHandle (pNode->hMem);
fail_node:
    GlobalLockedStructMemFree(pNode);
fail:
    SetErrCode (NULL, E_OUTOFMEMORY);
    return NULL;
}
/*************************************************************************
 *  @doc    PRIVATE INDEXING
 *
 *  @func   VOID | FreeBTreeNode |
 *      Releases the three buffers owned by a BTree node and then the node
 *      structure itself. A NULL node is ignored.
 *
 *  @parm   PNODEINFO | pNode |
 *      BTree node to be freed (may be NULL)
 *************************************************************************/
PUBLIC VOID PASCAL FAR FreeBTreeNode (PNODEINFO pNode)
{
    if (pNode != NULL)
    {
        FreeHandle (pNode->hTmp);
        FreeHandle (pNode->hMem);
        FreeHandle (pNode->hLastWord);
        GlobalLockedStructMemFree(pNode);
    }
}
/*************************************************************************
 *
 *  @doc    PRIVATE INDEXING
 *
 *  @func   int | PrefixCompressWord |
 *      Writes a word to a record, compressed against the last word
 *      stored in the node.
 *
 *  @parm   LPB | pDest |
 *      Pointer to the destination buffer
 *
 *  @parm   LPB | lpbWord |
 *      Pointer to the word to add. The format is:
 *      - 2-byte: string length
 *      - n-byte: the string itself
 *      - CbBytePack'ed real word length (read only when
 *        fOccfLengthSet is non-zero)
 *
 *  @parm   LPB | pLastWord |
 *      Pointer to the last word entered in the destination buffer, with
 *      the same 2-byte length header
 *
 *  @parm   int | fOccfLengthSet |
 *      Set to 1 if OCCF_LENGTH field is set, else 0
 *
 *  @rdesc  Number of bytes written to the destination buffer
 *
 *  @rcomm
 *      Strings are compressed based on how many leading bytes (prefix)
 *      they have in common with the previous word. The output is:
 *      - CbBytePack'ed count: postfix length + 1 (for the prefix byte)
 *      - Prefix length: 1 byte (0 - 127). If the high bit is set the
 *        real word length follows the word
 *      - Word: the bytes after the shared prefix
 *      - CbBytePack'ed real word length: present only if the prefix
 *        byte's high bit is set
 *************************************************************************/
PUBLIC int PASCAL FAR PrefixCompressWord (LPB pDest, LPB lpbWord,
    LPB pLastWord, int fOccfLengthSet)
{
    LPB          pOut = pDest;      // Write cursor
    unsigned int cbTail;            // Word length, later the postfix length
    USHORT       cbLimit;           // Upper bound for the compare loop
    USHORT       cbWord;            // Saved length of the new word
    int          ib;                // Byte index including the 2-byte header
    int          cbShared;          // Number of leading bytes in common
    DWORD        dwRealLength;      // The real length of the word

    // The shared prefix cannot be longer than the shorter of the two words
    cbTail = GETWORD ((LPUW)lpbWord);
    cbLimit = GETWORD ((LPUW)pLastWord);
    if (cbLimit > cbTail)
        cbLimit = (USHORT) cbTail;

    // Indices run over the raw buffers, so the string bytes occupy
    // [2 .. length + 1]; bump the limit once instead of adding per pass
    cbLimit++;
    ib = 2;
    while (ib <= cbLimit && lpbWord[ib] == pLastWord[ib])
        ib++;
    cbShared = ib - 2;

    // Prefix must be <= 127 (high bit is used to indicate fOccfLength field)
    if (cbShared > 127)
        cbShared = 127;

    cbWord = (USHORT) cbTail;       // Save the word length
    cbTail -= cbShared;             // What remains after the prefix

    // Byte count of what follows: the postfix plus one for the prefix byte
    pOut += (USHORT)CbBytePack (pOut, (DWORD)(cbTail + 1));

    // If WordLen == string length then don't write WordLen
    if (fOccfLengthSet)
    {
        CbByteUnpack (&dwRealLength, lpbWord + sizeof(WORD) + cbWord);
        if (dwRealLength == cbWord)
            fOccfLengthSet = FALSE;
    }

    // Prefix size, with the high bit flagging a trailing WordLen
    *pOut = (BYTE)(fOccfLengthSet ? (cbShared | 0x80) : cbShared);
    pOut++;

    // Copy the postfix string over
    MEMCPY (pOut, lpbWord + (cbShared + sizeof (SHORT)), cbTail);
    pOut += cbTail;

    // Append WordLen to the end of the word when flagged
    // (WordLen field follows word in input stream)
    if (fOccfLengthSet)
        pOut += CbBytePack (pOut, dwRealLength);

    return (int)(pOut - pDest);
}
/*************************************************************************
 *
 *  @doc    PRIVATE INDEXING
 *
 *  @func   HRESULT | FlushAllNodes |
 *      Flushes the remaining nodes to disk when the tree is completely
 *      built. Walking up from the leaf, each level's last word and the
 *      current physical offset are appended to its parent before the
 *      level is written; the topmost node is written last and recorded
 *      as the index root. Finally the output buffer is written out.
 *
 *  @parm   _LPIPB | lpipb |
 *      Pointer to index block
 *
 *  @rdesc  S_OK on success or errors if failed
 *
 *************************************************************************/
HRESULT PASCAL FlushAllNodes (_LPIPB lpipb)
{
    PBTREEDATA pTreeData = &lpipb->BTreeData;
    PFILEDATA  pOutput = &lpipb->OutFile;
    PNODEINFO  pLeafNode;           // Level being flushed
    PNODEINFO  pStemNode;           // Its parent level
    int        WordSize;
    BYTE       curLevel = 0;
    ERRB       errb = S_OK;
    HRESULT    fRet;
    LONG       dwSize;

    pStemNode = pTreeData->rgpNodeInfo[0];

    while (pTreeData->rgpNodeInfo[++curLevel] != NULL)
    {
        pLeafNode = pStemNode;
        pStemNode = pTreeData->rgpNodeInfo[curLevel];

        // Add the child's last word to the parent
        // NOTE(review): errb is still S_OK here, so a zero WordSize makes
        // this function report success -- confirm that is intended.
        if ((WordSize = PrefixCompressWord (pStemNode->pCurPtr,
            pLeafNode->pLastWord, pStemNode->pLastWord,
            pTreeData->fOccfLength)) == 0)
        {
            return errb;
        }

        // Save new word as last word
        MEMCPY (pStemNode->pLastWord, pLeafNode->pLastWord,
            GETWORD ((LPUW)pLeafNode->pLastWord) + 2);

        pStemNode->pCurPtr += WordSize;
        pStemNode->cbLeft -= WordSize;
#ifdef _DEBUG
        // NOTE(review): this tests the output buffer's cbLeft although the
        // stem node's cbLeft was just decremented -- confirm which was meant.
        if (pOutput->cbLeft <= 0)
            SetErrCode (NULL, E_ASSERT);
#endif

        // Follow the word with the offset the child is about to be
        // written at
        CopyFileOffset (pStemNode->pCurPtr, lpipb->OutFile.foPhysicalOffset);
        pStemNode->pCurPtr += FOFFSET_SIZE;
        pStemNode->cbLeft -= FOFFSET_SIZE;
#ifdef _DEBUG
        if (pOutput->cbLeft <= 0)
            SetErrCode (NULL, E_ASSERT);
#endif

        // Level 0 is the leaf; everything above it is a stem
        if (curLevel == 1)
        {
            if ((fRet = WriteLeafNode (lpipb)) != S_OK)
                return(fRet);
        }
        else
        {
            if ((fRet = WriteStemNode (lpipb, pLeafNode)) != S_OK)
                return(fRet);
        }
    }

    // Set the pointer to the top stem node
    pTreeData->Header.foIdxRoot = pOutput->foPhysicalOffset;
    pTreeData->Header.nidIdxRoot = pOutput->foPhysicalOffset.dwOffset;

    // curLevel is still 1 when the tree consists of the leaf only
    if (curLevel == 1)
    {
        if ((fRet = WriteLeafNode (lpipb)) != S_OK)
            return(fRet);
    }
    else
    {
        if ((fRet = WriteStemNode (lpipb, pStemNode)) != S_OK)
            return(fRet);
    }

    // Flush the output buffer. The length is computed before the call:
    // assigning it inside the argument list and reading it on the other
    // side of != is unsequenced.
    dwSize = (DWORD)(pOutput->pCurrent - pOutput->pMem);
    if (FileWrite (pOutput->fFile, pOutput->pMem, dwSize, &errb) != dwSize)
        return(errb);
    pOutput->foStartOffset = FoAddDw(pOutput->foStartOffset, dwSize);

    return S_OK;
}
/*************************************************************************
 *  WriteBitStreamDWord
 *      Bitstream scheme: writes "dw" one-bits followed by a zero-bit.
 *      The one-bits are emitted in runs of at most
 *      cbitBYTE * sizeof(DWORD) bits per FWriteBits call.
 *      ckeyCenter is not used by this scheme.
 *
 *  Returns S_OK, or the first error returned by FWriteBits / FWriteBool.
 *************************************************************************/
PRIVATE HRESULT PASCAL NEAR WriteBitStreamDWord (PFILEDATA pOutput, DWORD dw,
    int ckeyCenter)
{
    BYTE    ucBits;     // Number of one-bits emitted in this pass
    HRESULT fRet;

    while (dw != 0)
    {
        if (dw >= cbitBYTE * sizeof(DWORD))
        {
            // A full DWORD's worth of ones
            ucBits = cbitBYTE * sizeof(DWORD);
            dw -= cbitBYTE * sizeof(DWORD);
        }
        else
        {
            // Final, partial run
            ucBits = (BYTE)dw;
            dw = 0;
        }

        fRet = FWriteBits(pOutput, argdwBits[ucBits], (BYTE)ucBits);
        if (fRet != S_OK)
            return fRet;
    }

    // Terminating zero-bit
    return FWriteBool(pOutput, 0);
}

/*************************************************************************
 *  WriteFixedDWord
 *      Fixed scheme: writes the low "ckeyCenter + 1" bits of dw.
 *
 *  Returns the result of FWriteBits.
 *************************************************************************/
PRIVATE HRESULT PASCAL NEAR WriteFixedDWord (PFILEDATA pOutput, DWORD dw,
    int ckeyCenter)
{
    BYTE cBits = (BYTE)(ckeyCenter + 1);

    return (FWriteBits (pOutput, dw, cBits));
}
/*************************************************************************
 *  WriteBellDWord
 *      "BELL" scheme. With ucBits = CbitBitsDw(dw):
 *      - ucBits <= ckeyCenter: a zero-bit, then dw in ckeyCenter bits.
 *      - otherwise: (ucBits - ckeyCenter) one-bits, then dw in ucBits
 *        bits with its high bit cleared.
 *
 *  Returns S_OK, or the first error returned by FWriteBits / FWriteBool.
 *************************************************************************/
PRIVATE HRESULT PASCAL NEAR WriteBellDWord (PFILEDATA pOutput, DWORD dw,
    int ckeyCenter)
{
    BYTE    ucBits = (BYTE)CbitBitsDw(dw);  // Bits the value really takes
    HRESULT fRet;

    if (ucBits > ckeyCenter)
    {
        //
        // The value won't fit in "ckeyCenter" bits.
        // First, write out "ucBits - ckeyCenter" one-bits.
        //
        fRet = FWriteBits(pOutput, argdwBits[ucBits - ckeyCenter],
            (BYTE)(ucBits - ckeyCenter));
        if (fRet != S_OK)
            return fRet;

        //
        // Now, write out the value in "ucBits" bits,
        // but zero the high-bit first.
        //
        return FWriteBits(pOutput, dw & argdwBits[ucBits - 1], ucBits);
    }

    //
    // Small value: a zero-bit, then "ckeyCenter" bits of the value,
    // which is guaranteed to be enough.
    //
    fRet = FWriteBool(pOutput, 0);
    if (fRet != S_OK)
        return fRet;
    return FWriteBits(pOutput, dw, (BYTE)(ckeyCenter));
}
/*************************************************************************
 *
 *  @doc    PRIVATE INDEXING
 *
 *  @func   HRESULT | FWriteBits |
 *      Writes a bunch of bits into the output buffer, most significant
 *      bit first, continuing at bit position pOutput->ibit of the
 *      current byte.
 *
 *  @parm   PFILEDATA | pOutput |
 *      Pointer to the output data structure
 *
 *  @parm   DWORD | dwVal |
 *      DWORD value to write
 *
 *  @parm   BYTE | cBits |
 *      Number of bits to write from dwVal (the low cBits bits)
 *
 *  @rdesc  Returns S_OK on success or errors if failed
 *
 *************************************************************************/
PUBLIC HRESULT FAR PASCAL FWriteBits (PFILEDATA pOutput, DWORD dwVal,
    BYTE cBits)
{
    BYTE cbitThisPassBits;      // Bits stored in the current pass
    BYTE bThis;                 // Byte being assembled
    ERRB errb;

    // Loop until no bits left. Each pass either moves on to a fresh byte
    // or stores as many bits as fit into the current one.
    while (cBits)
    {
        if (pOutput->ibit < 0)
        {
            // This byte is full, point to a new byte
            pOutput->pCurrent++;
            pOutput->foPhysicalOffset = FoAddDw (pOutput->foPhysicalOffset, 1);
            pOutput->cbLeft--;
#ifdef _DEBUG
            if (pOutput->cbLeft <= 0)
                SetErrCode (NULL, E_ASSERT);
#endif

            // Room left in output buffer?
            if (pOutput->cbLeft <= 256)
            {
                // Compute the length before the call: assigning it in the
                // argument list and reading it on the other side of != is
                // unsequenced.
                LONG dwSize = (DWORD)(pOutput->pCurrent - pOutput->pMem);

                if (FileWrite (pOutput->fFile, pOutput->pMem, dwSize,
                    &errb) != dwSize)
                    return(errb);
                pOutput->cbLeft = pOutput->dwMax;
                pOutput->pCurrent = pOutput->pMem;
                pOutput->foStartOffset = FoAddDw(pOutput->foStartOffset, dwSize);
            }
            pOutput->ibit = cbitBYTE - 1;
        }
        else
        {
            // Write some bits: whatever still fits in this byte, or all
            // that remain if fewer.
            cbitThisPassBits = (pOutput->ibit + 1 < cBits) ?
                pOutput->ibit + 1 : cBits;

            // A brand-new byte starts from zero instead of buffer contents
            bThis = (pOutput->ibit == cbitBYTE - 1) ? 0 : *pOutput->pCurrent;

            // Take the top cbitThisPassBits of the remaining value and
            // align them just below the bits already used in this byte
            bThis |= ((dwVal >> (cBits - cbitThisPassBits)) <<
                (pOutput->ibit - cbitThisPassBits + 1));
            *pOutput->pCurrent = (BYTE)bThis;
            pOutput->ibit -= cbitThisPassBits;
            cBits -= (BYTE)cbitThisPassBits;
        }
    }
    return S_OK;
}
/*************************************************************************
 *
 *  @doc    PRIVATE INDEXING
 *
 *  @func   HRESULT | FWriteBool |
 *      Writes a single bit into the output buffer at bit position
 *      pOutput->ibit of the current byte.
 *
 *  @parm   PFILEDATA | pOutput |
 *      Pointer to the output data structure
 *
 *  @parm   BOOL | fVal |
 *      BOOL value to write
 *
 *  @rdesc  Returns S_OK on success or errors if failed
 *
 *************************************************************************/
PRIVATE HRESULT NEAR PASCAL FWriteBool (PFILEDATA pOutput, BOOL fVal)
{
    ERRB errb;

    if (pOutput->ibit < 0)
    {
        // This byte is full, point to a new byte
        pOutput->pCurrent++;
        pOutput->foPhysicalOffset = FoAddDw (pOutput->foPhysicalOffset, 1);
        pOutput->cbLeft--;
#ifdef _DEBUG
        if (pOutput->cbLeft <= 0)
            SetErrCode (NULL, E_ASSERT);
#endif

        // Room left in output buffer?
        // The threshold is cast to LONG so the comparison stays signed
        // (cbLeft is compared against (LONG) values elsewhere in this file).
        // Against a bare sizeof, a negative cbLeft would be converted to a
        // huge unsigned value and the flush would be skipped.
        if (pOutput->cbLeft <= (LONG)sizeof(DWORD))
        {
            // Compute the length before the call: assigning it in the
            // argument list and reading it on the other side of != is
            // unsequenced.
            LONG dwSize = (DWORD)(pOutput->pCurrent - pOutput->pMem);

            if (FileWrite (pOutput->fFile, pOutput->pMem, dwSize,
                &errb) != dwSize)
                return(errb);
            pOutput->pCurrent = pOutput->pMem;
            pOutput->cbLeft = pOutput->dwMax;
            pOutput->foStartOffset = FoAddDw(pOutput->foStartOffset, dwSize);
#ifdef _DEBUG
            MEMSET (pOutput->pMem, 0, pOutput->dwMax);
#endif
        }
        pOutput->ibit = cbitBYTE - 1;
    }

    // Zero out a brand-new byte.
    if (pOutput->ibit == cbitBYTE - 1)
        *pOutput->pCurrent = (BYTE)0;

    // Write my boolean.
    if (fVal)
        *pOutput->pCurrent |= 1 << pOutput->ibit;
    pOutput->ibit--;

    return S_OK;    // Fine.
}
/*************************************************************************
 *
 *  @doc    PRIVATE INDEXING
 *
 *  @func   HRESULT | BuildBtreeFromEso |
 *      Builds the permanent index from an existing external sort output
 *      file. A fresh index block is created, the statistics saved in the
 *      ESI file are copied into it, and BuildBTree is run on the ESO
 *      file.
 *
 *  @parm   HFPB | hfpb |
 *      Passed through to BuildBTree
 *
 *  @parm   LPSTR | pstrFilename |
 *      Passed through to BuildBTree
 *
 *  @parm   LPB | lpbEsiFile |
 *      Name of the file holding the saved IPB structure
 *
 *  @parm   LPB | lpbEsoFile |
 *      Passed through to BuildBTree
 *
 *  @parm   PINDEXINFO | pIndexInfo |
 *      Index information used to create the index block
 *
 *  @rdesc  The result of BuildBTree, or:
 *      - E_OUTOFMEMORY if the index block or a work buffer cannot be
 *        allocated
 *      - E_NOTEXIST if the ESI file cannot be opened
 *      - E_FAIL if the ESI file does not contain a complete IPB
 *
 *  @comm
 *      The ESI file is closed as soon as it has been read, and the index
 *      block is disposed on every path once it has been created.
 *      BuildBTree's result is returned as-is; it used to be replaced by
 *      S_OK when IDXF_NORMALIZE was not set.
 *************************************************************************/
HRESULT PASCAL FAR BuildBtreeFromEso (HFPB hfpb, LPSTR pstrFilename,
    LPB lpbEsiFile, LPB lpbEsoFile, PINDEXINFO pIndexInfo)
{
    _LPIPB  lpipb;
    HRESULT fRet;
    ERRB    errb;
    IPB     ipb;
    HFILE   hFile;
    UINT    cbRead;

    if ((lpipb = MVIndexInitiate(pIndexInfo, NULL)) == NULL)
        return E_OUTOFMEMORY;

    /* Read in the external sort buffer info */
    if ((hFile = _lopen (lpbEsiFile, READ)) == HFILE_ERROR)
    {
        fRet = E_NOTEXIST;
        goto cleanup;
    }

    /* Read old IPB info; the file is not needed after this */
    cbRead = _lread (hFile, &ipb, sizeof(IPB));
    _lclose (hFile);
    if (cbRead != sizeof(IPB))
    {
        // Short read or HFILE_ERROR: ipb is not fully initialised
        fRet = E_FAIL;
        goto cleanup;
    }

    /* Transfer meaningful data */
    lpipb->dwIndexedWord = ipb.dwIndexedWord;
    lpipb->dwUniqueWord = ipb.dwUniqueWord;
    lpipb->dwByteCount = ipb.dwByteCount;
    lpipb->dwOccOffbits = ipb.dwOccOffbits;
    lpipb->dwOccExtbits = ipb.dwOccExtbits;
    lpipb->dwMaxFieldId = ipb.dwMaxFieldId;
    lpipb->dwMaxWCount = ipb.dwMaxWCount;
    lpipb->dwMaxOffset = ipb.dwMaxOffset;
    lpipb->dwTotal3bWordLen = ipb.dwTotal3bWordLen;
    lpipb->dwTotal2bWordLen = ipb.dwTotal2bWordLen;
    lpipb->dwTotalUniqueWordLen = ipb.dwTotalUniqueWordLen;
    lpipb->lcTopics = ipb.lcTopics;
    lpipb->dwMaxTopicId = ipb.dwMaxTopicId;
    // lpipb->dwMemAllowed = ipb.dwMemAllowed;
    lpipb->dwMaxRecordSize = ipb.dwMaxRecordSize;
    lpipb->dwMaxEsbRecSize = ipb.dwMaxEsbRecSize;
    lpipb->dwMaxWLen = ipb.dwMaxWLen;
    lpipb->idxf = ipb.idxf;

    if (lpipb->idxf & IDXF_NORMALIZE)
    {
        // One SIGMA entry per topic id
        if ((lpipb->wi.hSigma = _GLOBALALLOC (DLLGMEM_ZEROINIT,
            (LCB)((lpipb->dwMaxTopicId + 1) * sizeof (SIGMA)))) == NULL)
        {
            fRet = SetErrCode (&errb, E_OUTOFMEMORY);
            goto cleanup;
        }
        lpipb->wi.hrgsigma = (HRGSIGMA)_GLOBALLOCK (lpipb->wi.hSigma);

        // Log table. It is only allocated here: the code that locked and
        // filled it was compiled out (#if 0) in the original source.
        if ((lpipb->wi.hLog = _GLOBALALLOC (DLLGMEM_ZEROINIT,
            (CB)(cLOG_MAX * sizeof (FLOAT)))) == NULL)
        {
            SetErrCode (&errb, (HRESULT)(fRet = E_OUTOFMEMORY));
            goto cleanup;
        }
    }

    // Build the permanent index
    fRet = BuildBTree(NULL, lpipb, lpbEsoFile, hfpb, pstrFilename);

    if (lpipb->idxf & IDXF_NORMALIZE)
    {
        FreeHandle (lpipb->wi.hLog);
    }

cleanup:
    // As in the original exit path, hSigma is released unconditionally,
    // including when it was never allocated here.
    // NOTE(review): this relies on MVIndexInitiate leaving hSigma in a state
    // FreeHandle accepts -- confirm.
    FreeHandle (lpipb->wi.hSigma);
    MVIndexDispose (lpipb);
    return fRet;
}
|