|
|
/*************************************************************************
*                                                                        *
*  UPDATE.C                                                              *
*                                                                        *
*  Copyright (C) Microsoft Corporation 1990-1994                         *
*  All Rights reserved.                                                  *
*                                                                        *
**************************************************************************
*                                                                        *
*  Module Intent                                                         *
*                                                                        *
**************************************************************************
*                                                                        *
*  Current Owner: BinhN                                                  *
*                                                                        *
**************************************************************************/
#include <mvopsys.h>
#include <math.h>
#include <mem.h>
#include <orkin.h>
#include <mvsearch.h>
#include "common.h"
#include "index.h"
#ifdef _DEBUG
/* Name of this source file (__FILE__); used by the error return functions.
 * Only defined in debug builds. */
static BYTE NEAR s_aszModule[] = __FILE__;
#endif
/* Extra safety bytes added on top of every file buffer allocation. */
#define SAFE_SLACK 48

/* Size of the output file buffer.
 * This must be at least the size of the largest word + 12,
 * or word + 14 if OCCF_LENGTH is set. */
#define ESOUTPUT_BUFFER 0xFFFC

/* Size of the input file buffers.
 * Each ESB block gets its own input buffer.
 * Minimum size: size of an index word + ~8 bytes. */
#define ESINPUT_BUFFER 0x7FFC

/* Side on which a newly created node is placed relative to the current one. */
#define NEW_NODE_ON_LEFT 0
#define NEW_NODE_ON_RIGHT 1

/* Encoder / decoder dispatch tables, indexed by compression scheme
 * (defined elsewhere). */
extern FENCODE EncodeTable[];
extern FDECODE DecodeTable[];

/* Calls the EncodeTable entry selected by key.cschScheme with
 * (p, dw, key.ucCenter). */
#define FAddDword(p,dw,key) EncodeTable[(key).cschScheme]((p), (dw), (key).ucCenter)

/* Calls the DecodeTable entry selected by b.cschScheme with (a, b, c).
 * NOTE: the arguments are not parenthesised in the expansion, so pass
 * simple expressions only. */
#define FGetDword(a,b,c) (*DecodeTable[b.cschScheme])(a, b, c)
/* Per-word working state. UpdateIndexBTree fills it from each input record
 * and AddWordToBTree completes it from the matching leaf entry before the
 * record-insertion helpers are called. */
typedef struct WORDINFO
{
    DWORD dwWordLen;            /* Unpacked from the input record when OCCF_LENGTH
                                 * is set; otherwise the byte length of the word */
    DWORD dwFieldId;            /* Unpacked from the input record when OCCF_FIELDID
                                 * is set */
    DWORD dwNewTopicCount;      /* Topic count read from the input record */
    DWORD dwIndexTopicCount;    /* Topic count unpacked from the leaf entry being
                                 * examined in AddWordToBTree */
    DWORD dwMergeTopicCount;
    DWORD dwOldTopicId;
    DWORD dwNewTopicId;
    DWORD dwIndexTopicId;
    DWORD dwDataSize;           /* Data size unpacked from the leaf entry */
    FILEOFFSET dataLocation;    /* Data location unpacked from the leaf entry */
    WORD fFlag;
    WORD pad;
} WORDINFO, FAR *PWORDINFO;
/* Free-block descriptor; passed to GetFreeBlock (see its prototype below).
 * NOTE(review): field meanings are inferred from their names only --
 * presumably the size and file offset of a reusable block; confirm against
 * GetFreeBlock. */
typedef struct FREEBLOCK
{
    DWORD dwBlockSize;
    FILEOFFSET foBlockOffset;
}FREEBLOCK, FAR *PFREEBLOCK;
/* All-zero word. AddRecordToBTree passes it to PrefixCompressWord as the
 * "previous word" for the first entry written into a new top node. */
BYTE EmptyWord[4] = { 0 };

#ifdef _DEBUG
/* Debug-only counters (not referenced in the visible part of this file). */
DWORD dwOldDataLoss = 0;
DWORD dwNewDataSize = 0;
DWORD dwOldDataNeed = 0;
DWORD dwNewNodeSize = 0;
#endif
// Flags used when adding a record to the B-tree. The numeric suffix of each
// name repeats its bit value.

// Flag to denote that the current entry is to be replaced by the new entry.
// This happens when:
//  - A repeated entry in the leaf node
//  - The last entry in the stem node that has to be changed to the last
//    word of the leaf node
#define REPLACE_WORD_01 0x0001

// Flag to denote that the last word buffer actually contains the word
// before last. This is needed when we have to replace the last word
// with the new word. In this case we need the word before last to do
// compression
#define ONE_WORD_BEHIND_02 0x0002

// Flag to denote updating the offset field with the temp node offset
#define USE_TEMP_NODE_04 0x0004

// Flag to denote that only the node offset address is to be updated. Since
// this is a fixed record size, this will speed up the update.
#define UPDATE_NODE_ADDRESS_08 0x0008

// rgpTmpNodeInfo is the new right node if set, else it is the left node
#define USE_TEMP_FOR_RIGHT_NODE_10 0x0010

// Flag to denote that we have to skip the next word before inserting a new
// word. This happens when adding a new word to the end of the block, where
// pCurPtr is pointing to the beginning of the last word
#define SKIP_NEXT_WORD_20 0x0020

// Both nodes, rgpNodeInfo and rgpTmpNodeInfo, are used as left and right
// children. This happens when a new top node is created
#define USE_BOTH_NODE_40 0x0040
/*************************************************************************
 *
 *                    INTERNAL PRIVATE FUNCTIONS
 *
 *      All of them should be declared near
 *
 *************************************************************************/

/* External-sort helpers */
PRIVATE HRESULT NEAR PASCAL ESFlushBuffer (LPESI);
PRIVATE HRESULT NEAR PASCAL ESFillBuffer (_LPIPB, LPESB);
PRIVATE void NEAR PASCAL ESMemory2Disk (_LPIPB, PMERGEHEADER);
PRIVATE HRESULT NEAR PASCAL ProcessFiles (_LPIPB lpipb, LPMERGEPARAMS);
PRIVATE int NEAR PASCAL CompareRecordBuffers (_LPIPB, LPB, LPB);
PRIVATE VOID NEAR PASCAL PQueueUp (_LPIPB, LPESB FAR *, LONG);
PRIVATE VOID NEAR PASCAL PQueueDown (_LPIPB);
PRIVATE PTOPICDATA PASCAL NEAR MergeTopicNode (PMERGEHEADER, PTOPICDATA, int);
PRIVATE VOID NEAR MergeOccurrence (PTOPICDATA, PTOPICDATA, int);

/* B-tree update */
PRIVATE HRESULT NEAR PASCAL UpdateIndexBTree (_LPIPB, HFPB, LPB, LPB);
VOID SetQueue (LPESI pEsi);
PRIVATE HRESULT NEAR PASCAL AddWordToBTree (_LPIPB, LPB, PWORDINFO);
PRIVATE HRESULT PASCAL NEAR NewDataInsert(LPIPB lpipb, PFILEDATA pInfile, PNODEINFO FAR *rgpNodeInfo, LPB pWord, PWORDINFO pWordInfo);
PRIVATE HRESULT PASCAL NEAR CreateNewNode(_LPIPB lpipb, int cLevel, int fIsStemNode, int fAfter);
/* NOTE: declared without an explicit return type (implicit int); the
 * definition is written the same way. */
PRIVATE PASCAL NEAR AddRecordToBTree (_LPIPB lpipb, LPB pWord, PWORDINFO pWordInfo, int cLevel, int fReplaceWord);
PRIVATE HRESULT PASCAL NEAR WriteNewDataRecord (_LPIPB, PWORDINFO);
PRIVATE HRESULT GetFreeBlock (_LPIPB, PFREEBLOCK, DWORD);
PRIVATE HRESULT PASCAL NEAR CopyBlockFile (PFILEDATA, HFPB, FILEOFFSET, DWORD);
PRIVATE HRESULT PASCAL FAR EmitOldData (_LPIPB, PNODEINFO, PWORDINFO);
PRIVATE HRESULT PASCAL FAR EmitNewData (_LPIPB, PWORDINFO, BOOL);
PRIVATE HRESULT PASCAL NEAR UpdateDataNode (_LPIPB lpipb, PWORDINFO pWordInfo);
PRIVATE int PASCAL NEAR SplitNodeAndAddData (_LPIPB lpipb, LPB pWord, PWORDINFO pWordInfo, int cLevel, int fFlag, int fIsStemNode);
PRIVATE int PASCAL NEAR CopyNewDataToStemNode (_LPIPB lpipb, PNODEINFO pTmpNode, LPB pWord, LPB pLastWord, int cLevel, int fFlag);
PRIVATE int PASCAL NEAR CopyNewDataToLeafNode (_LPIPB lpipb, PNODEINFO pTmpNode,
    PWORDINFO pWordInfo, LPB pWord, LPB pLastWord);
VOID GetLastWordInNode (_LPIPB lpipb, PNODEINFO pNodeinfo, BOOL flag);
PRIVATE HRESULT PASCAL FAR SkipNewData (_LPIPB lpipb, PWORDINFO pWordInfo);
HRESULT CheckLeafNode (PNODEINFO pNodeInfo, int occf);
HRESULT CheckStemNode (PNODEINFO pNodeInfo);
/*************************************************************************
 *
 *                    INTERNAL PUBLIC FUNCTIONS
 *
 *      All of them should be declared far, unless we know they belong to
 *      the same segment. They should be included in some include file
 *
 *************************************************************************/
HRESULT FAR PASCAL FlushTree(_LPIPB lpipb);
PUBLIC HRESULT FAR PASCAL MergeSortTreeFile (_LPIPB, LPMERGEPARAMS);
PUBLIC HRESULT FAR PASCAL FillInputBuffer (LPESB, HFPB);
PUBLIC VOID PASCAL FAR FreeBTreeNode (PNODEINFO pNode);
PUBLIC PNODEINFO PASCAL FAR AllocBTreeNode (_LPIPB lpipb);
/* NOTE: declared without an explicit return type (implicit int). */
PUBLIC PASCAL FAR PrefixCompressWord (LPB, LPB, LPB, int);
PUBLIC DWORD PASCAL FAR WriteDataNode (_LPIPB, DWORD, PHRESULT);
PUBLIC HRESULT PASCAL FAR IndexOpenRW (LPIPB, HFPB, LSZ);
PUBLIC HRESULT PASCAL FAR SkipOldData (_LPIPB, PNODEINFO);
PUBLIC LONG PASCAL FAR CompareDWord (DWORD, DWORD, LPV lpParm);

#ifdef _DEBUG
/* Debug-only call counter; its only visible use is inside a disabled
 * (#if 0) block in AddWordToBTree. */
static LONG Count = 0;
#endif
/*************************************************************************
 *
 *  @doc    EXTERNAL API INDEX
 *
 *  @func   HRESULT FAR PASCAL | MVIndexUpdate |
 *      Updates an index file from the information collected in the
 *      index parameter block. This is MVIndexUpdateEx with no list of
 *      topics to pre-delete.
 *
 *  @parm   HFPB | hSysFile |
 *      System file handle, or 0. When non-zero the index is opened as a
 *      sub-file of it; when 0 lszFilename is opened as a regular file.
 *
 *  @parm   _LPIPB | lpipb |
 *      Pointer to the index parameter block holding everything needed
 *      for the update.
 *
 *  @parm   LSZ | lszFilename |
 *      Index filename.
 *
 *  @rdesc  Whatever MVIndexUpdateEx returns: S_OK on success, an error
 *      code otherwise.
 *
 *************************************************************************/
PUBLIC HRESULT EXPORT_API FAR PASCAL MVIndexUpdate (HFPB hSysFile,
    _LPIPB lpipb, LSZ lszFilename)
{
    HRESULT hrUpdate;

    /* NULL array / zero count: nothing is deleted before the update. */
    hrUpdate = MVIndexUpdateEx (hSysFile, lpipb, lszFilename, NULL, 0);
    return hrUpdate;
}
/*************************************************************************
 *
 *  @doc    EXTERNAL API INDEX
 *
 *  @func   HRESULT FAR PASCAL | MVIndexUpdateEx |
 *      Updates an existing index file from the information collected in
 *      the index parameter block, optionally "pre-deleting" the topics in
 *      the given list from the collected data before the update. This is
 *      useful where new topics are continuously added before knowledge of
 *      out-dated topics is available (e.g. netnews): it allows a
 *      single-pass update once the deletes are known.
 *
 *      Steps: flush the in-memory sort tree, open the index read/write
 *      (IndexOpenRW), merge the sorted runs (MergeSortTreeFile), then
 *      apply the merged records to the B-tree (UpdateIndexBTree).
 *
 *  @parm   HFPB | hSysFile |
 *      System file handle, or 0. When non-zero the index is opened as a
 *      sub-file of it; when 0 lszFilename is opened as a regular file.
 *
 *  @parm   _LPIPB | lpipb |
 *      Pointer to the index parameter block. Must not be NULL.
 *
 *  @parm   LSZ | lszFilename |
 *      Index filename. The index must already exist: IndexOpenRW first
 *      opens it for reading and fails if that is not possible.
 *
 *  @parm   DWORD FAR * | rgTopicId |
 *      Array of topic ids to be deleted, or NULL. The array is sorted in
 *      place.
 *
 *  @parm   DWORD | dwCount |
 *      Number of entries in rgTopicId. The list is ignored when either
 *      rgTopicId is NULL or dwCount is 0.
 *
 *  @rdesc  S_OK on success (including the case where there is nothing to
 *      process), or an error code.
 *
 *************************************************************************/
PUBLIC HRESULT EXPORT_API FAR PASCAL MVIndexUpdateEx (HFPB hSysFile,
    _LPIPB lpipb, LSZ lszFilename, DWORD FAR *rgTopicId, DWORD dwCount)
{
    ERRB        errb;
    PHRESULT    phr = &errb;
    PFILEDATA   pOutFile;
    MERGEPARAMS mp;
    HRESULT     fRet;           // Return value from this function.
    int         fHaveDeletes = (rgTopicId && dwCount);

    // Flush the internal sort: writes any records still in the tree to disk
    fRet = FlushTree (lpipb);

    // Free all memory blocks, whether or not the flush succeeded
    FreeISI (lpipb);
    if (fRet != S_OK)
        return fRet;

    // Nothing to process, the index file is left untouched
    if (lpipb->esi.cesb == 0)
        return S_OK;

    // Set the state flag; IndexOpenRW loads the free list in this state
    lpipb->bState = UPDATING_STATE;

    // Open the index file. On failure IndexOpenRW has already closed it.
    if ((fRet = IndexOpenRW (lpipb, hSysFile, lszFilename)) != S_OK)
        goto FreeWeights;

    if (fHaveDeletes)
    {
        // Sort the incoming array
        if ((fRet = HugeDataSort ((LPV HUGE*)rgTopicId, dwCount,
            (FCOMPARE)CompareDWord, NULL, NULL, NULL)) != S_OK)
        {
            // The index file is open at this point and must be closed
            goto CloseIndex;
        }
        mp.rgTopicId = rgTopicId;
        mp.dwCount = dwCount;
        mp.lpTopicIdLast = rgTopicId;
    }

    if ((fRet = MergeSortTreeFile (lpipb, fHaveDeletes ? &mp : NULL)) != S_OK)
    {
        fRet = SetErrCode (phr, fRet);
        goto CloseIndex;
    }

    // Open output file
    FileUnlink (NULL, lpipb->isi.aszTempName, REGULAR_FILE);
    pOutFile = &lpipb->OutFile;
    if ((pOutFile->fFile = FileCreate (NULL, lpipb->isi.aszTempName,
        REGULAR_FILE, phr)) == NULL)
    {
        // fRet is still S_OK here; report the error FileCreate stored
        fRet = *phr;
        goto CloseIndex;
    }

    // Allocate output buffer
    pOutFile->dwMax = FILE_BUFFER;
    pOutFile->cbLeft = FILE_BUFFER;
    if ((pOutFile->hMem = _GLOBALALLOC (DLLGMEM_ZEROINIT,
        pOutFile->dwMax + SAFE_SLACK)) == NULL)
    {
        fRet = E_OUTOFMEMORY;
        goto CloseOutput;
    }
    pOutFile->pCurrent = pOutFile->pMem = _GLOBALLOCK (pOutFile->hMem);

    // Build the permanent index
    fRet = UpdateIndexBTree (lpipb, hSysFile, lpipb->esi.aszTempName,
        lszFilename);

    _GLOBALUNLOCK (pOutFile->hMem);
    _GLOBALFREE (pOutFile->hMem);
    pOutFile->hMem = NULL;

    // Success and failure share the same cleanup from here on

CloseOutput:
    FileClose (lpipb->hfpbIdxFile);
    FileClose (pOutFile->fFile);
    FileUnlink (NULL, lpipb->isi.aszTempName, REGULAR_FILE);
    goto FreeWeights;

CloseIndex:
    FileClose (lpipb->hfpbIdxFile);

FreeWeights:
    if (lpipb->idxf & IDXF_NORMALIZE)
    {
        FreeHandle (lpipb->wi.hSigma);
        FreeHandle (lpipb->wi.hLog);
        lpipb->wi.hSigma = lpipb->wi.hLog = NULL;
    }
    return fRet;
}
/*************************************************************************
 *
 *  @doc    PRIVATE INDEXING
 *
 *  @func   HRESULT | UpdateIndexBTree |
 *      Allocates the required memory, opens the merged input file and
 *      feeds every word record it contains to AddWordToBTree. Afterwards
 *      it writes back the B-tree nodes still marked TO_BE_UPDATE, the
 *      sigma (weight) table when IDXF_NORMALIZE is set, the free list and
 *      the index header. All resources acquired here are released before
 *      returning, and the input file is closed and deleted, on success
 *      and on failure alike.
 *
 *  @parm   _LPIPB | lpipb |
 *      Pointer to the index parameter block. lpipb->hfpbIdxFile must be
 *      the index file opened for read/write.
 *
 *  @parm   HFPB | hSysFile |
 *      Not used by this function.
 *
 *  @parm   LPB | lpszTemp |
 *      Filename of the temporary input file
 *
 *  @parm   LPB | szIndexFilename |
 *      Not used by this function.
 *
 *  @rdesc  Returns S_OK on success or errors if failed
 *
 *************************************************************************/
PRIVATE HRESULT NEAR PASCAL UpdateIndexBTree (_LPIPB lpipb, HFPB hSysFile,
    LPB lpszTemp, LPB szIndexFilename)
{
    PFILEDATA pInFile;                  // Pointer to input data
    PNODEINFO FAR * rgpNodeInfo;
    PNODEINFO FAR * rgpTmpNodeInfo;
    PNODEINFO pIndexDataNode;
    ERRB      errb;
    PHRESULT  phr = &errb;
    PIH20     pHeader;
    int       cTreeLevel;
    int       iIndex;
    LPB       pWord;
    WORDINFO  WordInfo;
    OCCF      occf;
    HRESULT   fRet;                     // Return value
    LONG      cbIo;                     // Result of the last file read
    FILEOFFSET foFreeListOffset;        // File offset where the free list is saved
    DWORD     dwSizeFreeList;           // Size of the saved free list

    rgpNodeInfo = lpipb->BTreeData.rgpNodeInfo;
    rgpTmpNodeInfo = lpipb->BTreeData.rgpTmpNodeInfo;
    MEMSET (&WordInfo, 0, sizeof (WORDINFO));

    // Recorded in the header when no free list gets saved: "no free list"
    // (IndexOpenRW creates a fresh one when dwFreeListSize is 0)
    foFreeListOffset = foNil;
    dwSizeFreeList = 0;

    // Open input file
    pInFile = &lpipb->InFile;
    if ((pInFile->fFile = FileOpen (NULL, lpszTemp, REGULAR_FILE,
        READ, phr)) == NULL)
        return *phr;

    // Allocate input buffer
    pInFile->dwMax = FILE_BUFFER;
    if ((pInFile->hMem = _GLOBALALLOC (DLLGMEM_ZEROINIT,
        pInFile->dwMax + SAFE_SLACK)) == NULL)
    {
        fRet = E_OUTOFMEMORY;
        goto CloseInput;
    }
    pInFile->pMem = _GLOBALLOCK (pInFile->hMem);
    pInFile->pCurrent = pInFile->pMem;

    pHeader = &lpipb->BTreeData.Header;

    // Clear the node slots first so that the cleanup code can tell which
    // ones were really allocated if an allocation fails half way
    for (cTreeLevel = pHeader->cIdxLevels - 1; cTreeLevel >= 0; cTreeLevel--)
    {
        rgpNodeInfo[cTreeLevel] = NULL;
        rgpTmpNodeInfo[cTreeLevel] = NULL;
    }

    // Allocate BTree block.
    for (cTreeLevel = pHeader->cIdxLevels - 1; cTreeLevel >= 0; cTreeLevel--)
    {
        if ((rgpNodeInfo[cTreeLevel] = AllocBTreeNode (lpipb)) == NULL ||
            (rgpTmpNodeInfo[cTreeLevel] = AllocBTreeNode (lpipb)) == NULL)
        {
            fRet = E_OUTOFMEMORY;
            goto FreeNodes;
        }
    }

    if (((lpipb->pIndexDataNode = pIndexDataNode =
        AllocBTreeNode (lpipb))) == NULL)
    {
        fRet = E_OUTOFMEMORY;
        goto FreeNodes;
    }

    // Reallocate a bigger buffer (FILE_BUFFER bytes) for the data node.
    // BTREE_NODE_SIZE is only good for btree node
    _GLOBALUNLOCK (pIndexDataNode->hMem);
    _GLOBALFREE (pIndexDataNode->hMem);
    if ((pIndexDataNode->hMem = _GLOBALALLOC (DLLGMEM_ZEROINIT,
        pIndexDataNode->dwBlockSize = FILE_BUFFER)) == NULL)
    {
        fRet = E_OUTOFMEMORY;
        // NOTE(review): the node structure itself is not released here
        // because it is not visible whether FreeBTreeNode accepts a node
        // whose buffer handle is NULL.
        goto FreeNodes;
    }
    pIndexDataNode->pCurPtr = pIndexDataNode->pBuffer =
        _GLOBALLOCK (pIndexDataNode->hMem);
    lpipb->pIndexDataNode->hfpbIdx = lpipb->hfpbIdxFile; // Index file to read from

    // Remember the file offset of this node
    rgpNodeInfo[0]->nodeOffset = pHeader->foIdxRoot;

    // Read in data for the top node; it is a leaf when the tree has one level
    if ((fRet = ReadNewNode (lpipb->hfpbIdxFile, rgpNodeInfo[0],
        pHeader->cIdxLevels > 1 ? FALSE : TRUE)) != S_OK)
        goto FreeDataNode;

    // Allocate temporary buffer for word. The buffer is allocated as followed:
    //   - Max word length * 2: for maximum word length. Minimum is 1024
    //   - 3 byte: word length
    //   - 5 byte: Field Id
    //   - 5 byte: Topic count
    //   - 6 byte: data pointer
    // Twice that size is allocated; lpipb->pWord points at the second half.
    // iIndex is used as a tmp
    iIndex = (WORD)(lpipb->BTreeData.Header.dwMaxWLen * 2);
    if (iIndex < 1024)
        iIndex = 1024;
    iIndex += 3 + 5 + 5 + 6;
    if ((lpipb->hTmpBuf = _GLOBALALLOC (DLLGMEM_ZEROINIT, iIndex * 2)) == NULL)
    {
        fRet = E_OUTOFMEMORY;
        goto FreeDataNode;
    }
    lpipb->pTmpBuf = (LPB)_GLOBALLOCK (lpipb->hTmpBuf);
    lpipb->pWord = lpipb->pTmpBuf + iIndex;

    // Allocate a big buffer for data
    if ((lpipb->hData = _GLOBALALLOC (DLLGMEM_ZEROINIT,
        lpipb->dwDataSize = 0x80000)) == NULL)
    {
        fRet = E_OUTOFMEMORY;
        goto FreeTmpBuf;
    }
    lpipb->pDataBuffer = _GLOBALLOCK (lpipb->hData);

    // Load the input buffer. A negative count is taken to mean a read
    // error, as in the refill code below.
    cbIo = FileRead (pInFile->fFile, pInFile->pMem, pInFile->dwMax, phr);
    if (cbIo < 0)
    {
        fRet = *phr;
        goto FreeAll;
    }
    pInFile->dwMax = pInFile->cbLeft = cbIo;

    fRet = S_OK;
    pWord = lpipb->pWord;
    occf = lpipb->BTreeData.Header.occf;

    // Repeat until all records are processed. An empty input file
    // contributes no record at all.
    while (pInFile->cbLeft > 0)
    {
        LPB  pSrcPtr;
        WORD wLen;

        // Make sure a complete record header is in memory
        if (pInFile->cbLeft < CB_MAX_WORD_LEN * sizeof (DWORD) * 8)
        {
            MEMMOVE (pInFile->pMem, pInFile->pCurrent, pInFile->cbLeft);
            cbIo = FileRead (pInFile->fFile, pInFile->pMem + pInFile->cbLeft,
                pInFile->dwMax - pInFile->cbLeft, phr);
            if (cbIo < 0)
            {
                fRet = *phr;
                goto FreeAll;
            }
            pInFile->cbLeft += cbIo;
            pInFile->dwMax = pInFile->cbLeft;
            pInFile->pCurrent = pInFile->pMem;
        }

        // Extract the word and its info
        pSrcPtr = pInFile->pCurrent + sizeof (DWORD);   // Skip reclength

        // Copy the word together with its 2-byte length prefix
        wLen = GETWORD ((LPUW)pSrcPtr) + 2;
        MEMCPY (pWord, pSrcPtr, wLen);
        pSrcPtr += GETWORD ((LPUW)pSrcPtr) + 2;

        if (occf & OCCF_LENGTH)
        {
            // The word length follows the word, and is appended to the copy
            pSrcPtr += CbByteUnpack (&WordInfo.dwWordLen, pSrcPtr);
            CbBytePack (pWord + wLen, WordInfo.dwWordLen);
        }
        else
        {
            WordInfo.dwWordLen = wLen - 2;
        }

        if (occf & OCCF_FIELDID)
            pSrcPtr += CbByteUnpack (&WordInfo.dwFieldId, pSrcPtr);

        WordInfo.dwNewTopicCount = GETLONG ((LPUL)pSrcPtr);
        pSrcPtr += sizeof (DWORD);

        pInFile->pCurrent = pSrcPtr;
        pInFile->cbLeft = (LONG)(pInFile->dwMax - (pSrcPtr - pInFile->pMem));

#if 0
        if (STRNICMP(pWord+2, "cylindeeer", 10) == 0)
            _asm int 3;
#endif
#if 0
        else
        {
            SkipNewData (lpipb, &WordInfo);
            continue;
        }
#endif

        // Find/Add the record
        if ((fRet = AddWordToBTree (lpipb, pWord, &WordInfo)) != S_OK)
            goto FreeAll;

        // pInFile->pCurrent points to the record size. Refill when the
        // next record is not completely in memory.
        if (pInFile->cbLeft <= SAFE_SLACK ||
            (LONG)GETLONG ((LPUL)pInFile->pCurrent) >= pInFile->cbLeft)
        {
            MEMMOVE (pInFile->pMem, pInFile->pCurrent, pInFile->cbLeft);
            cbIo = FileRead (pInFile->fFile, pInFile->pMem + pInFile->cbLeft,
                pInFile->dwMax - pInFile->cbLeft, phr);
            if (cbIo < 0)
            {
                fRet = *phr;
                goto FreeAll;
            }
            pInFile->cbLeft += cbIo;
            pInFile->dwMax = pInFile->cbLeft;
            pInFile->pCurrent = pInFile->pMem;
        }
    }

    // Write back every cached node that was modified
    for (cTreeLevel = pHeader->cIdxLevels - 1; cTreeLevel >= 0; cTreeLevel--)
    {
        if (rgpNodeInfo[cTreeLevel]->fFlag == TO_BE_UPDATE)
        {
            if ((FileSeekWrite (lpipb->hfpbIdxFile,
                rgpNodeInfo[cTreeLevel]->pBuffer,
                rgpNodeInfo[cTreeLevel]->nodeOffset,
                lpipb->BTreeData.Header.dwBlockSize, phr)) !=
                (LONG)lpipb->BTreeData.Header.dwBlockSize)
            {
                fRet = *phr;
                goto FreeAll;
            }
        }
    }

    if (lpipb->idxf & IDXF_NORMALIZE)
    {
        LONG loop;

        // Take the square root of every entry, then save the table at the
        // current end of the index file
        for (loop = lpipb->dwMaxTopicId; loop >= 0; loop--)
        {
            lpipb->wi.hrgsigma[loop] =
                (float)sqrt ((double)lpipb->wi.hrgsigma[loop]);
        }

        pHeader->WeightTabSize = (lpipb->dwMaxTopicId + 1) * sizeof (float);
        if (FileSeekWrite (lpipb->hfpbIdxFile, lpipb->wi.hrgsigma,
            lpipb->foMaxOffset, pHeader->WeightTabSize, phr) !=
            (LONG)pHeader->WeightTabSize)
        {
            fRet = *phr;
            goto FreeAll;
        }
        pHeader->WeightTabOffset = lpipb->foMaxOffset;
    }

    // ERIC: 1/ Save the freelist info to the end of the file
    //       2/ Update the header with the new freelist offset/size
    if (lpipb->hFreeList)
    {
        LPBYTE lpbFreeList;

        dwSizeFreeList = FreeListSize (lpipb->hFreeList, phr);

        // Reuse a free block for the list itself when one fits,
        // otherwise put it at lpipb->foMaxOffset
        foFreeListOffset = FreeListGetBestFit (lpipb->hFreeList,
            MakeFo (dwSizeFreeList, 0), phr);
        if (FoIsNil (foFreeListOffset))
            foFreeListOffset = lpipb->foMaxOffset;

        if ((lpbFreeList = (LPBYTE)_GLOBALALLOCPTR (DLLGMEM_ZEROINIT,
            dwSizeFreeList)) == NULL)
        {
            fRet = E_OUTOFMEMORY;
            goto FreeAll;
        }

        FreeListGetMem (lpipb->hFreeList, (LPVOID)lpbFreeList);
        FileSeekWrite (lpipb->hfpbIdxFile, (LPBYTE)lpbFreeList,
            foFreeListOffset, dwSizeFreeList, phr);

        // The high bit records that the list was written at foMaxOffset;
        // IndexOpenRW subtracts the list size from the file size when set
        if (FoEquals (foFreeListOffset, lpipb->foMaxOffset))
            dwSizeFreeList |= 0x80000000;

        FreeListDestroy (lpipb->hFreeList);
        lpipb->hFreeList = (HFREELIST) NULL;
        _GLOBALFREEPTR (lpbFreeList);
    }

    // Copy info to header; the maxima never shrink
    if (pHeader->lcTopics < lpipb->lcTopics)
        pHeader->lcTopics = lpipb->lcTopics;
    if (pHeader->dwMaxFieldId < lpipb->dwMaxFieldId)
        pHeader->dwMaxFieldId = lpipb->dwMaxFieldId;
    if (pHeader->dwMaxWCount < lpipb->dwMaxWCount)
        pHeader->dwMaxWCount = lpipb->dwMaxWCount;
    if (pHeader->dwMaxOffset < lpipb->dwMaxOffset)
        pHeader->dwMaxOffset = lpipb->dwMaxOffset;
    if (pHeader->dwMaxWLen < lpipb->dwMaxWLen)
        pHeader->dwMaxWLen = lpipb->dwMaxWLen;
    pHeader->dwMaxTopicId = lpipb->dwMaxTopicId;

    // ERIC: Garbage Collection
    pHeader->foFreeListOffset = foFreeListOffset;
    pHeader->dwFreeListSize = dwSizeFreeList;

    // The header is the last thing written; a failure here is reported
    if (FileSeekWrite (lpipb->hfpbIdxFile, (LPB)pHeader, MakeFo (0, 0),
        sizeof (IH20), phr) != (LONG)sizeof (IH20))
        fRet = *phr;
    else
        fRet = S_OK;

    // Cleanup, shared by the success and failure paths. Each label
    // releases what was acquired from that point on.

FreeAll:
    _GLOBALUNLOCK (lpipb->hData);
    _GLOBALFREE (lpipb->hData);
    lpipb->hData = 0;

FreeTmpBuf:
    _GLOBALUNLOCK (lpipb->hTmpBuf);
    _GLOBALFREE (lpipb->hTmpBuf);
    lpipb->hTmpBuf = 0;

FreeDataNode:
    FreeBTreeNode (lpipb->pIndexDataNode);

FreeNodes:
    FreeHandle (pInFile->hMem);
    for (cTreeLevel = pHeader->cIdxLevels - 1; cTreeLevel >= 0; cTreeLevel--)
    {
        if (rgpNodeInfo[cTreeLevel] != NULL)
            FreeBTreeNode (rgpNodeInfo[cTreeLevel]);
        if (rgpTmpNodeInfo[cTreeLevel] != NULL)
            FreeBTreeNode (rgpTmpNodeInfo[cTreeLevel]);
    }

CloseInput:
    FileClose (pInFile->fFile);
    FileUnlink (NULL, lpszTemp, REGULAR_FILE);
    return fRet;
}
/*********************************************************************
 *  @func   HRESULT PASCAL | SwitchBTreeNode |
 *      Makes the cached node pNode hold the on-disk node at foWanted.
 *      Nothing happens when it already does. Otherwise the current
 *      contents are written back first if they are marked TO_BE_UPDATE,
 *      the wanted node is read in with ReadNewNode and the update flag
 *      is cleared.
 *
 *  @parm   _LPIPB | lpipb |
 *      Pointer to index info
 *
 *  @parm   PNODEINFO | pNode |
 *      Cached node to be (re)loaded
 *
 *  @parm   FILEOFFSET | foWanted |
 *      File offset of the node that is needed
 *
 *  @parm   int | fLeaf |
 *      TRUE if the node is a leaf node; passed on to ReadNewNode
 *
 *  @rdesc  S_OK, or the error of the failed write or read
 *********************************************************************/
PRIVATE HRESULT NEAR PASCAL SwitchBTreeNode (_LPIPB lpipb, PNODEINFO pNode,
    FILEOFFSET foWanted, int fLeaf)
{
    ERRB    errb;
    HRESULT hr;
    LONG    cbBlock = lpipb->BTreeData.Header.dwBlockSize;

    if (FoEquals (pNode->nodeOffset, foWanted))
        return S_OK;

    if (pNode->fFlag == TO_BE_UPDATE)
    {
        if (FileSeekWrite (lpipb->hfpbIdxFile, pNode->pBuffer,
            pNode->nodeOffset, cbBlock, &errb) != cbBlock)
            return errb;
    }

    pNode->nodeOffset = foWanted;
    hr = ReadNewNode (lpipb->hfpbIdxFile, pNode, fLeaf);
    if (hr != S_OK)
        return SetErrCode (&errb, hr);

    pNode->fFlag = 0;
    return S_OK;
}

/*********************************************************************
 *  @func   HRESULT PASCAL | AddWordToBTree |
 *      Walks the cached B-tree from the top node down to the leaf that
 *      should contain the word, recording the insertion point
 *      (pCurPtr) in every node on the way, then hands the word to
 *      AddRecordToBTree: as a replacement when an equal entry exists
 *      in the leaf, as a new entry otherwise.
 *
 *  @parm   _LPIPB | lpipb |
 *      Pointer to index info
 *
 *  @parm   LPB | pWord |
 *      Word to be searched for. This is a 2-byte preceded Pascal string
 *
 *  @parm   PWORDINFO | pWordInfo |
 *      Pointer to word's info. The topic count, data location and data
 *      size of each leaf entry examined are unpacked into it.
 *
 *  @rdesc
 *      S_OK or other errors
 *********************************************************************/
PRIVATE HRESULT NEAR PASCAL AddWordToBTree (_LPIPB lpipb, LPB pWord,
    PWORDINFO pWordInfo)
{
    PNODEINFO FAR *rgpNodes = lpipb->BTreeData.rgpNodeInfo;
    OCCF       occf = lpipb->occf;
    int        iLeafLevel;      // 0-based level of the leaf nodes
    int        iLevel;
    int        nOrder;          // <0, 0, >0: word is before/equal/after entry
    HRESULT    hr;
    WORD       cchEntry;        // Word length stored with the entry
    LPB        pScan;           // Read position in the current node
    LPB        pLimit;          // End of the used part of the node
    LPB        pEntryWord;      // Decompressed word of the current entry
    PNODEINFO  pNode;
    PNODEINFO  pChild;
    FILEOFFSET foTarget;        // Node to descend into

#if 0
    Count++;
    if (STRNICMP(pWord+2, "approeeaching", 11) == 0 ||
        STRNICMP(pWord+2, "authenteeic", 11) == 0 ||
        STRNICMP(pWord+2, "eastleeand", 10) == 0)
        _asm int 3;
#endif

    iLeafLevel = lpipb->BTreeData.Header.cIdxLevels - 1;

    // Start from the top node currently in the cache
    foTarget = rgpNodes[0]->nodeOffset;

    /* Search in the stem nodes.
     * Stem node format: cbLeft | (Word | PointerToNode)* | Slack
     */
    for (iLevel = 0; iLevel < iLeafLevel; iLevel++)
    {
        pNode = rgpNodes[iLevel];
        pChild = rgpNodes[iLevel + 1];
        pChild->prevNodeOffset = foNil;
        pEntryWord = pNode->pTmpResult;

        // Reload the node if necessary
        hr = SwitchBTreeNode (lpipb, pNode, foTarget, FALSE);
        if (hr != S_OK)
            return hr;

        pLimit = pNode->pMaxAddress;

        // pCurPtr points to the LAST ACCESSED word in the block
        for (pScan = pNode->pCurPtr; pScan < pLimit - 1; )
        {
            // Save the last location. This would be the insertion
            // point for update
            pNode->pCurPtr = pScan;

            // Get the compressed word, starting from a zero length
            cchEntry = 0;
            pScan = ExtractWord (pEntryWord, pScan, &cchEntry);

            /* Read in NodeId record */
            pScan += ReadFileOffset (&foTarget, pScan);

            nOrder = StrCmpPascal2 (pWord, pEntryWord);
            if (nOrder == 0)
                nOrder = (int)((WORD)pWordInfo->dwWordLen - cchEntry);

            // We found the location of the word
            if (nOrder <= 0)
                break;

            // Not there yet: remember this entry and keep searching
            if (pScan < pNode->pMaxAddress - 1)
            {
                MEMCPY (pNode->pLastWord, pEntryWord,
                    *(LPUW)pEntryWord + sizeof (WORD)); // erinfox RISC patch
            }
            pChild->prevNodeOffset = foTarget;
        }
    }

    // At this point, foTarget is the node id of the leaf that
    // is supposed to contain the searched word.
    pNode = rgpNodes[iLeafLevel];
    hr = SwitchBTreeNode (lpipb, pNode, foTarget, TRUE);
    if (hr != S_OK)
        return hr;

    // Continue from wherever the node's cursor currently is
    pScan = pNode->pCurPtr;
    pEntryWord = pNode->pTmpResult;
    pLimit = pNode->pMaxAddress;

    // Reset the last word
    *(LPWORD)pNode->pLastWord = 0;

    // Leaf node structure:
    //   (Word|FieldId|TopicCnt|PointerToNode|DataSize)*
    for (;;)
    {
        DWORD dwEntryFieldId;

        // Save the last location. This would be the insertion point for
        // update
        pNode->pCurPtr = pScan;

        if (pScan >= pLimit)
        {
            // Add to the end of the node
            hr = WriteNewDataRecord (lpipb, pWordInfo);
            if (hr != S_OK)
                return hr;
            return AddRecordToBTree (lpipb, pWord, pWordInfo, iLeafLevel, 0);
        }

        // Get the compressed word
        pScan = ExtractWord (pEntryWord, pScan, &cchEntry);

        // Get field id and topic count
        if (occf & OCCF_FIELDID)
            pScan += CbByteUnpack (&dwEntryFieldId, pScan);
        pScan += CbByteUnpack (&pWordInfo->dwIndexTopicCount, pScan);

        // Get the data location and size
        pScan += ReadFileOffset (&pWordInfo->dataLocation, pScan);
        pScan += CbByteUnpack (&pWordInfo->dwDataSize, pScan);

        // Order by word, then by length, then by field id
        nOrder = StrCmpPascal2 (pWord, pEntryWord);
        if (nOrder == 0)
        {
            if (occf & OCCF_LENGTH)
                nOrder = (int)((WORD)pWordInfo->dwWordLen - cchEntry);
            if (nOrder == 0 && (occf & OCCF_FIELDID))
                nOrder = (int)(pWordInfo->dwFieldId - dwEntryFieldId);
        }

        if (nOrder > 0)
        {
            // We didn't find the location of the word yet
            // Continue searching
            MEMCPY (pNode->pLastWord, pEntryWord,
                *(LPUW)pEntryWord+sizeof(WORD) + sizeof(WORD)); // erinfox RISC patch
            continue;
        }

        if (nOrder == 0)
        {
            // Same word already in the index: update its data in place
            hr = UpdateDataNode (lpipb, pWordInfo);
            if (hr != S_OK)
                return hr;
            return AddRecordToBTree (lpipb, pWord, pWordInfo, iLeafLevel,
                REPLACE_WORD_01);
        }

        // The word goes in front of the current entry
        hr = WriteNewDataRecord (lpipb, pWordInfo);
        if (hr != S_OK)
            return hr;
        return AddRecordToBTree (lpipb, pWord, pWordInfo, iLevel, 0);
    }
}
/*************************************************************************
 *  @doc    INTERNAL
 *
 *  @func   HRESULT PASCAL | ReadNewNode |
 *      Reads the node at pNodeInfo->nodeOffset from the index file into
 *      the node's buffer and resets the node's bookkeeping: the next-node
 *      offset, the free byte count, the cursor, the end-of-data address
 *      and the two scratch words.
 *
 *  @parm   HFPB | hfpb |
 *      Index file to read from
 *
 *  @parm   PNODEINFO | pNodeInfo |
 *      Node to fill; nodeOffset and dwBlockSize must be set on entry
 *
 *  @parm   int | fLeafNode |
 *      TRUE if this is a leaf node
 *
 *  @rdesc  S_OK if successful, E_BADFILE if a full block could not be
 *      read. On exit pCurPtr points just past the node header.
 *
 *  @rcomm  The format of the leaf node is different from a stem node
 *      Stem node structure:
 *          CbLeft |* Word | PointerToNode *| Slack
 *
 *      Leaf node structure:
 *          NxtBlkPtr|CbLeft|*Word|FieldId|TopicCnt|PointerToNode|DataSize*|Slack
 *
 *************************************************************************/
PUBLIC HRESULT PASCAL FAR ReadNewNode (HFPB hfpb, PNODEINFO pNodeInfo,
    int fLeafNode)
{
    ERRB errb;
    LPB  pHeaderPos;    // Walks over the node header

    if (FileSeekRead (hfpb, pNodeInfo->pBuffer, pNodeInfo->nodeOffset,
        pNodeInfo->dwBlockSize, &errb) != (long)pNodeInfo->dwBlockSize)
    {
        return E_BADFILE;
    }

    pHeaderPos = pNodeInfo->pBuffer;

    // Only leaf nodes start with a link to the next node
    if (!fLeafNode)
    {
        pNodeInfo->nextNodeOffset = foNil;
    }
    else
    {
        pHeaderPos += ReadFileOffset (&pNodeInfo->nextNodeOffset,
            pNodeInfo->pBuffer);
    }

    // Free byte count, then the first entry
    pNodeInfo->cbLeft = *(LPUW)(pHeaderPos);    // erinfox RISC patch
    pNodeInfo->pCurPtr = pHeaderPos + sizeof (WORD);

    // The used part of the block ends where the free bytes begin
    pNodeInfo->pMaxAddress = pNodeInfo->pBuffer + pNodeInfo->dwBlockSize -
        pNodeInfo->cbLeft;

    // Both scratch words start out empty
    *(LPUW)(pNodeInfo->pTmpResult) = 0;
    *(LPUW)(pNodeInfo->pLastWord) = 0;
    return S_OK;
}
/*************************************************************************
 *  @doc    INTERNAL
 *
 *  @func   HRESULT PASCAL | IndexOpenRW |
 *      Opens an existing index file for read/write and prepares the
 *      index parameter block for an update or a delete:
 *      reads and validates the index header, copies the compression
 *      keys, synchronises the maximum topic id, records the file size,
 *      loads the sigma table when IDXF_NORMALIZE is set (squaring it in
 *      DELETING_STATE) and, in UPDATING_STATE, loads or creates the
 *      free list.
 *
 *  @parm   _LPIPB | lpipb |
 *      Pointer to the index parameter block. On success
 *      lpipb->hfpbIdxFile is the open index file.
 *
 *  @parm   HFPB | hfpbSysFile |
 *      System file handle, or 0. When non-zero the index is opened as a
 *      sub-file of it, otherwise as a regular file.
 *
 *  @parm   LSZ | lszFilename |
 *      Index filename
 *
 *  @rdesc  S_OK on success. On failure the index file is closed again
 *      and an error is returned (E_BADVERSION, E_BADINDEXFLAGS, or the
 *      error of the failing call).
 *************************************************************************/
PUBLIC HRESULT PASCAL FAR IndexOpenRW (_LPIPB lpipb, HFPB hfpbSysFile,
    LSZ lszFilename)
{
    HFPB     hfpb;          // Handle used for the existence check
    HRESULT  fRet;
    ERRB     errb;
    PHRESULT phr = &errb;
    PIH20    pHeader;
    int      iIndex;
    LONG     i;

    // Check the existence of the file
    if ((hfpb = FileOpen (hfpbSysFile, lszFilename,
        hfpbSysFile ? FS_SUBFILE : REGULAR_FILE, READ, phr)) == 0)
    {
        return *phr;
    }
    FileClose (hfpb);

    // Reopen the file for read/write. This can fail even though the file
    // exists; without the check a NULL handle would be used below.
    if ((lpipb->hfpbIdxFile = FileOpen (hfpbSysFile, lszFilename,
        hfpbSysFile ? FS_SUBFILE : REGULAR_FILE, READ_WRITE, phr)) == NULL)
    {
        return *phr;
    }

    if ((fRet = ReadIndexHeader (lpipb->hfpbIdxFile,
        pHeader = &lpipb->BTreeData.Header)) != S_OK)
        goto Fail;

    if (pHeader->version != VERCURRENT || pHeader->FileStamp != INDEX_STAMP)
    {
        fRet = E_BADVERSION;
        goto Fail;
    }

    // incoming index and occurrence flags must match those in original index
    if (pHeader->occf != lpipb->occf || pHeader->idxf != lpipb->idxf)
    {
        fRet = E_BADINDEXFLAGS;
        goto Fail;
    }

    // Update the compression key to be used by WriteDataNode later
    lpipb->cKey[CKEY_TOPIC_ID] = pHeader->ckeyTopicId;
    lpipb->cKey[CKEY_OCC_COUNT] = pHeader->ckeyOccCount;
    iIndex = CKEY_OCC_BASE;
    if (pHeader->occf & OCCF_COUNT)
        lpipb->cKey[iIndex++] = pHeader->ckeyWordCount;
    if (pHeader->occf & OCCF_OFFSET)
        lpipb->cKey[iIndex] = pHeader->ckeyOffset;

    // Update the maximum TopicId: both sides end up with the larger value
    if (pHeader->dwMaxTopicId < lpipb->dwMaxTopicId)
        pHeader->dwMaxTopicId = lpipb->dwMaxTopicId;
    else
        lpipb->dwMaxTopicId = pHeader->dwMaxTopicId;

    // Get the file size.
    lpipb->foMaxOffset = FileSize (lpipb->hfpbIdxFile, phr);

    if (lpipb->idxf & IDXF_NORMALIZE)
    {
        // Load the sigma table
        if (FoEquals (pHeader->WeightTabOffset, foNil))
        {
            fRet = SetErrCode (phr, E_ASSERT);
            goto Fail;
        }
        if ((fRet = AllocSigmaTable (lpipb)) != S_OK)
            goto Fail;

        if (FileSeekRead (lpipb->hfpbIdxFile, lpipb->wi.hrgsigma,
            pHeader->WeightTabOffset, pHeader->WeightTabSize, phr) !=
            (LONG)pHeader->WeightTabSize)
        {
            fRet = errb;
            goto Fail;
        }

        if (lpipb->bState == DELETING_STATE)
        {
            // Square the sigma table
            // erinfox: off by one bug. change i = lpipb->dwMaxTopicId + 1
            // to lpipb->dwMaxTopicId because we have only allocated
            // (dwMaxTopicId + 1)*sizeof(float) bytes
            for (i = lpipb->dwMaxTopicId; i >= 0; i--)
            {
                lpipb->wi.hrgsigma[i] = lpipb->wi.hrgsigma[i] *
                    lpipb->wi.hrgsigma[i];
            }
        }
    }

    /* ERIC */
    // Load or create a freelist (dwSize = 0)
    if (lpipb->bState == UPDATING_STATE)
    {
        // If a freelist is existing, read it, otherwise, create it.
        if (pHeader->dwFreeListSize)
        {
            LPBYTE lpbFreeList;

            // The high bit marks a list stored at the very end of the
            // file: strip the flag and do not count the list as index data
            if (pHeader->dwFreeListSize & 0x80000000)
            {
                pHeader->dwFreeListSize &= 0x7FFFFFFF;
                lpipb->foMaxOffset = FoSubFo (lpipb->foMaxOffset,
                    MakeFo (pHeader->dwFreeListSize, 0));
            }

            if (!(lpbFreeList = (LPBYTE)_GLOBALALLOCPTR (DLLGMEM_ZEROINIT,
                pHeader->dwFreeListSize)))
            {
                fRet = SetErrCode (phr, E_OUTOFMEMORY);
                goto Fail;
            }

            // NOTE(review): the result of this read is not checked; a
            // short read leaves (part of) the zero-initialised buffer
            // to be parsed by FreeListInitFromMem.
            FileSeekRead (lpipb->hfpbIdxFile, (LPBYTE)lpbFreeList,
                pHeader->foFreeListOffset, pHeader->dwFreeListSize, phr);

            lpipb->hFreeList = FreeListInitFromMem (lpbFreeList, phr);
            _GLOBALFREEPTR (lpbFreeList);
        }
        else
        {
            lpipb->hFreeList = FreeListInit (wDefaultFreeListSize, phr);
        }
    }

    return S_OK;

Fail:
    // Record the error, then close the file opened above
    SetErrCode (phr, fRet);
    FileClose (lpipb->hfpbIdxFile);
    return fRet;
}
/*************************************************************************
 *  AddRecordToBTree
 *
 *  Insert (or patch) one entry in the B-tree node cached at level cLevel.
 *
 *  Parameters:
 *      lpipb     - index parameter block; supplies the per-level node
 *                  caches (BTreeData.rgpNodeInfo / rgpTmpNodeInfo), the
 *                  index file handle and the current end-of-file offset.
 *      pWord     - the word to add.
 *      pWordInfo - leaf record data for the word (passed through to
 *                  CopyNewDataToLeafNode); callers pass NULL when adding
 *                  to a stem node.
 *      cLevel    - tree level to modify. -1 means "grow the tree": a new
 *                  root is created above the current level 0.
 *      fFlag     - combination of REPLACE_WORD_01, USE_TEMP_NODE_04,
 *                  UPDATE_NODE_ADDRESS_08, USE_TEMP_FOR_RIGHT_NODE_10,
 *                  SKIP_NEXT_WORD_20 and USE_BOTH_NODE_40.
 *
 *  Returns:
 *      S_OK on success; E_TREETOOBIG, E_OUTOFMEMORY, a file-write error,
 *      or whatever the recursive call / SplitNodeAndAddData returns.
 *************************************************************************/
PRIVATE PASCAL NEAR AddRecordToBTree (_LPIPB lpipb, LPB pWord,
    PWORDINFO pWordInfo, int cLevel, int fFlag)
{
    PNODEINFO pNodeInfo;
    PNODEINFO pTmpNodeInfo;
    LPB  pInsertPtr;            // Pointer to insertion point
    LPB  pWordStorage;
    LPB  pLastWord;
    LPB  pBuffer;
    BYTE fIsStemNode;
    WORD wWLen;
    WORD wNewRecSize;           // New record size
    LONG cbByteMoved;           // Number of bytes moved to leave room for new rec
    OCCF occf = lpipb->occf;    // Occurrence field flags
    BYTE fLength = occf & OCCF_LENGTH;
    WORD cbLeft;                // How many bytes are left in the current node?
    LONG dwBlockSize = lpipb->BTreeData.Header.dwBlockSize;
    BYTE cbSkip;
    BYTE fEndNode;
    ERRB errb;

    if (cLevel == -1)
    {
        // The tree's level has increased by one
        int i;

        if (lpipb->BTreeData.Header.cIdxLevels >= MAX_TREE_HEIGHT - 1)
            return E_TREETOOBIG;

        /* Move down the entries to make room for the top node */
        for (i = lpipb->BTreeData.Header.cIdxLevels; i > 0 ; i-- )
        {
            lpipb->BTreeData.rgpNodeInfo[i] =
                lpipb->BTreeData.rgpNodeInfo[i-1];
            lpipb->BTreeData.rgpTmpNodeInfo[i] =
                lpipb->BTreeData.rgpTmpNodeInfo[i-1];
        }

        // Increase tree level
        lpipb->BTreeData.Header.cIdxLevels ++;

        if ((pNodeInfo = lpipb->BTreeData.rgpNodeInfo[0] =
            AllocBTreeNode (lpipb)) == NULL)
            return(E_OUTOFMEMORY);
        if ((pTmpNodeInfo = lpipb->BTreeData.rgpTmpNodeInfo[0] =
            AllocBTreeNode (lpipb)) == NULL)
            return(E_OUTOFMEMORY);

        // The first WORD of a stem node holds its free-byte count
        pWordStorage = (pBuffer = pNodeInfo->pBuffer) + sizeof(WORD);

        if (fFlag & USE_BOTH_NODE_40)
        {
            if (fFlag & USE_TEMP_FOR_RIGHT_NODE_10)
            {
                // Link to the left child node
                pWordStorage += PrefixCompressWord (pWordStorage,
                    lpipb->BTreeData.rgpNodeInfo[1]->pTmpResult,
                    EmptyWord, fLength);
                pWordStorage += CopyFileOffset (pWordStorage,
                    lpipb->BTreeData.rgpNodeInfo[1]->nodeOffset);

                // Link to the right child node
                pWordStorage += PrefixCompressWord (pWordStorage,
                    lpipb->BTreeData.rgpTmpNodeInfo[1]->pTmpResult,
                    lpipb->BTreeData.rgpNodeInfo[1]->pTmpResult, fLength);
                pWordStorage += CopyFileOffset (pWordStorage,
                    lpipb->BTreeData.rgpTmpNodeInfo[1]->nodeOffset);
            }
            else
            {
                // Link to the left child node
                pWordStorage += PrefixCompressWord (pWordStorage,
                    lpipb->BTreeData.rgpTmpNodeInfo[1]->pTmpResult,
                    EmptyWord, fLength);
                pWordStorage += CopyFileOffset (pWordStorage,
                    lpipb->BTreeData.rgpTmpNodeInfo[1]->nodeOffset);

                // Link to the right child node
                pWordStorage += PrefixCompressWord (pWordStorage,
                    lpipb->BTreeData.rgpNodeInfo[1]->pTmpResult,
                    lpipb->BTreeData.rgpTmpNodeInfo[1]->pTmpResult, fLength);
                pWordStorage += CopyFileOffset (pWordStorage,
                    lpipb->BTreeData.rgpNodeInfo[1]->nodeOffset);
            }
        }
        else
        {
            // Link to the right child node
            pWordStorage += PrefixCompressWord (pWordStorage, pWord,
                EmptyWord, fLength);
            pWordStorage += CopyFileOffset (pWordStorage,
                lpipb->BTreeData.rgpTmpNodeInfo[1]->nodeOffset);
        }

        // Set all the parameters
        pNodeInfo->pCurPtr = pBuffer + sizeof(WORD);
        pNodeInfo->cbLeft = (LONG)(pBuffer - pWordStorage + dwBlockSize);
        pNodeInfo->pMaxAddress = pBuffer + dwBlockSize - pNodeInfo->cbLeft;
        SETWORD(pBuffer, (WORD)pNodeInfo->cbLeft);

        // Write out the new node at the current end of the index file
        if ((FileSeekWrite(lpipb->hfpbIdxFile, pBuffer,
            lpipb->foMaxOffset, dwBlockSize, &errb)) != (LONG)dwBlockSize)
            return(errb);

        // Remember the offset of this node
        // Set the pointer to the top stem node
        lpipb->BTreeData.Header.foIdxRoot = pNodeInfo->nodeOffset =
            lpipb->foMaxOffset;
        lpipb->BTreeData.Header.nidIdxRoot = pNodeInfo->nodeOffset.dwOffset;
        lpipb->foMaxOffset = FoAddDw (lpipb->foMaxOffset, dwBlockSize);
#if 0
        return CheckStemNode (pNodeInfo);
#else
        return(S_OK);
#endif
    }

    // Initialize data
    pNodeInfo = lpipb->BTreeData.rgpNodeInfo[cLevel];
    pTmpNodeInfo = lpipb->BTreeData.rgpTmpNodeInfo[cLevel];
    pLastWord = pNodeInfo->pLastWord;
    pBuffer = pNodeInfo->pBuffer;

    // Every level above the last one is a stem node. Leaf nodes carry an
    // extra file offset in front of the free-byte count.
    fIsStemNode = (cLevel < lpipb->BTreeData.Header.cIdxLevels - 1);
    if (fIsStemNode)
        cbSkip = sizeof(WORD);
    else
        cbSkip = sizeof(WORD) + FOFFSET_SIZE;

    fEndNode = (pNodeInfo->pCurPtr >= pNodeInfo->pMaxAddress);

    // Calculate how many bytes are left in the old node
    pInsertPtr = pNodeInfo->pCurPtr;    // Pointer to insertion point
    cbLeft = (WORD)pNodeInfo->cbLeft;

    // Handle special simple cases
    if (fFlag & UPDATE_NODE_ADDRESS_08)
    {
        // Skip the next word, then overwrite the child offset following it
        pInsertPtr = ExtractWord(pTmpNodeInfo->pTmpResult,
            pInsertPtr, &wWLen);
        if (fFlag & USE_TEMP_NODE_04)
        {
            CopyFileOffset (pInsertPtr,
                lpipb->BTreeData.rgpTmpNodeInfo[cLevel + 1]->nodeOffset);
        }
        else
        {
            CopyFileOffset (pInsertPtr,
                lpipb->BTreeData.rgpNodeInfo[cLevel + 1]->nodeOffset);
        }
#if 0
        return(fIsStemNode ? CheckStemNode (pNodeInfo) :
            CheckLeafNode (pNodeInfo, occf));
#else
        return(S_OK);
#endif
    }

    if (fFlag & (REPLACE_WORD_01 | SKIP_NEXT_WORD_20))
    {
        // We get more room from the replaced word
        DWORD dwTemp;

        // Skip the next word
        if (fFlag & SKIP_NEXT_WORD_20)
        {
            pInsertPtr = ExtractWord(pLastWord, pInsertPtr, &wWLen);
        }
        else
        {
            pInsertPtr = ExtractWord(pTmpNodeInfo->pTmpResult,
                pInsertPtr, &wWLen);
        }

        // Skip the data
        if (fIsStemNode)
            pInsertPtr += FOFFSET_SIZE;
        else
        {
            // Skip field id, topic count, fileoffset, datasize
            if (occf & OCCF_FIELDID)
                pInsertPtr += CbByteUnpack (&dwTemp, pInsertPtr); // FieldId
            if (occf & OCCF_TOPICID)
            {
                pInsertPtr += CbByteUnpack (&dwTemp, pInsertPtr);
                pInsertPtr += FOFFSET_SIZE;
                pInsertPtr += CbByteUnpack (&dwTemp, pInsertPtr);
            }
        }

        if (fFlag & SKIP_NEXT_WORD_20)
            pNodeInfo->pCurPtr = pInsertPtr;
        else
        {
            // Remove the old data
            cbByteMoved = (LONG)(pNodeInfo->pMaxAddress - pInsertPtr);
            MEMMOVE (pNodeInfo->pCurPtr, pInsertPtr, cbByteMoved);
            pNodeInfo->pMaxAddress =
                (pInsertPtr = pNodeInfo->pCurPtr) + cbByteMoved;
            cbLeft = (WORD)(dwBlockSize - (pNodeInfo->pMaxAddress - pBuffer));
        }

        if (pInsertPtr >= pNodeInfo->pMaxAddress)
            fEndNode = TRUE;
    }

    // Calculate the approximate number of bytes needed for the
    // new data by compressing it into the temporary block
    if (fIsStemNode)
    {
        if (pInsertPtr <= pNodeInfo->pBuffer + sizeof(WORD))
        {
            // This is the first word, there is no previous one
            *(LPWORD)pLastWord = 0;
        }
        wNewRecSize = (WORD) CopyNewDataToStemNode (lpipb, pTmpNodeInfo,
            pWord, pLastWord, cLevel, fFlag);
    }
    else
    {
        if (pInsertPtr <= pNodeInfo->pBuffer + sizeof(WORD) + FOFFSET_SIZE)
        {
            // This is the first word, there is no previous one
            *(LPWORD)pLastWord = 0;
        }
        wNewRecSize = (WORD) CopyNewDataToLeafNode (lpipb, pTmpNodeInfo,
            pWordInfo, pWord, pLastWord);
    }

    wNewRecSize -= cbSkip;

    // About 4 bytes are reserved to ensure that when we think we have
    // enough room we really do. Compression may change the size of the
    // record, causing us to run out of room when copying the new data
    // over.
    //
    // The comparison is written as an addition on purpose: the former
    // "cbLeft - sizeof(DWORD) > wNewRecSize" was evaluated in unsigned
    // arithmetic, so with fewer than sizeof(DWORD) bytes free it wrapped
    // around to a huge value and wrongly reported that the record fits.
    if ((DWORD)cbLeft > (DWORD)wNewRecSize + sizeof(DWORD))
    {
        // We have enough room for the new data. Just insert the new data
        pWordStorage = pTmpNodeInfo->pCurPtr;

        if (!fEndNode)
        {
            // We need to recompress the next word against the new one
            MEMCPY (pTmpNodeInfo->pTmpResult, pWord,
                *(LPUW)pWord + sizeof(WORD) + sizeof(WORD)); // erinfox RISC patch
            pInsertPtr = ExtractWord(pTmpNodeInfo->pTmpResult,
                pInsertPtr, &wWLen);
            cbByteMoved = PrefixCompressWord (pWordStorage,
                pTmpNodeInfo->pTmpResult, pWord, fLength);
            wNewRecSize += (WORD)cbByteMoved;

            // Reset the last word for pBTreeWord
            MEMCPY (pNodeInfo->pTmpResult, pLastWord,
                *(LPUW)pLastWord + sizeof(WORD) + sizeof(WORD)); // erinfox RISC patch
        }

        // Make room for the new data
        cbByteMoved = (LONG)(pNodeInfo->pMaxAddress - pInsertPtr);
        if (cbByteMoved <= 0)
            cbByteMoved = 0;
        else
            MEMMOVE(pNodeInfo->pCurPtr + wNewRecSize, pInsertPtr,
                cbByteMoved);

        // Copy the new data
        MEMCPY (pNodeInfo->pCurPtr, pTmpNodeInfo->pBuffer + cbSkip,
            wNewRecSize);

        // Update data
        pNodeInfo->pMaxAddress = pNodeInfo->pCurPtr + wNewRecSize +
            cbByteMoved;
        pNodeInfo->cbLeft = cbLeft =
            (WORD)(dwBlockSize - (pNodeInfo->pMaxAddress - pBuffer));
        SETWORD(pNodeInfo->pBuffer + cbSkip - sizeof(WORD), (WORD)cbLeft);
        pNodeInfo->fFlag = TO_BE_UPDATE;

        // The last key of this node changed: change the parent node
        if (fEndNode && cLevel)
        {
            return (AddRecordToBTree (lpipb, pWord, pWordInfo,
                cLevel - 1, REPLACE_WORD_01));
        }
#if 0
        return(fIsStemNode ? CheckStemNode (pNodeInfo) :
            CheckLeafNode (pNodeInfo, occf));
#else
        return(S_OK);
#endif
    }

    // Case 3: Add to the middle. This is a complex one, since we have
    // to split the node into 2.
    return(SplitNodeAndAddData (lpipb, pWord, pWordInfo, cLevel,
        fFlag, fIsStemNode));
}

/*************************************************************************
 *  SplitNodeAndAddData
 *
 *  Called by AddRecordToBTree when the node at cLevel has no room for the
 *  new record. The node is split in two around the insertion point: the
 *  smaller half, together with the new record, is assembled in the
 *  level's temporary node and written as a new block by CreateNewNode;
 *  the other half stays in the existing node. The parent level is then
 *  updated through AddRecordToBTree (cLevel - 1).
 *
 *  Parameters:
 *      lpipb, pWord, pWordInfo, cLevel, fFlag - as for AddRecordToBTree.
 *      fIsStemNode - non-zero when the node being split is a stem node.
 *
 *  Returns:
 *      S_OK, or the error from CreateNewNode / AddRecordToBTree.
 *************************************************************************/
PRIVATE int PASCAL NEAR SplitNodeAndAddData (_LPIPB lpipb, LPB pWord,
    PWORDINFO pWordInfo, int cLevel, int fFlag, int fIsStemNode)
{
    PNODEINFO pNodeInfo;
    PNODEINFO pTmpNodeInfo;
    LONG  cbByteMoved;
    WORD  leftSize;
    WORD  rightSize;
    WORD  wWLen;
    LPB   pInsertPtr;
    LPB   pWordStorage;
    int   cbSkip;
    DWORD dwBlockSize;
    HRESULT fRet;
    BYTE  fLength = lpipb->occf & OCCF_LENGTH;
    LPB   pLastWord;
    LPB   pTemp;
    LPB   pBuffer;

    // Here cbSkip is only the leaf-node link offset; the free-byte WORD
    // is added explicitly where needed.
    if (fIsStemNode)
        cbSkip = 0;
    else
        cbSkip = FOFFSET_SIZE;

    // Variable initialization
    pNodeInfo = lpipb->BTreeData.rgpNodeInfo[cLevel];
    pBuffer = pNodeInfo->pBuffer;
    pTmpNodeInfo = lpipb->BTreeData.rgpTmpNodeInfo[cLevel];
    pInsertPtr = pNodeInfo->pCurPtr;
    dwBlockSize = lpipb->BTreeData.Header.dwBlockSize;
    pLastWord = pNodeInfo->pLastWord;

    // Calculate approximately the left & right side node sizes
    leftSize = (WORD)(pInsertPtr - pBuffer - cbSkip - sizeof(WORD));
    rightSize = (WORD)(pNodeInfo->pMaxAddress - pNodeInfo->pCurPtr);

    if (leftSize >= rightSize)
    {
        // We add to the right. The new data will be 1st
        // Example:
        //    Add 4 into 1 2 3 5 --> 1 2 3 and 4 5
        if (fIsStemNode)
        {
            CopyNewDataToStemNode (lpipb, pTmpNodeInfo, pWord, EmptyWord,
                cLevel, fFlag);
            pTemp = pTmpNodeInfo->pBuffer + sizeof(WORD);
        }
        else
        {
            CopyNewDataToLeafNode (lpipb, pTmpNodeInfo, pWordInfo, pWord,
                EmptyWord);
            pTemp = pTmpNodeInfo->pBuffer + sizeof(WORD) + FOFFSET_SIZE;
        }
        pWordStorage = pTmpNodeInfo->pCurPtr;

        // Move back the pointer to the beginning of the word
        // for future reference
        pTmpNodeInfo->pCurPtr = pTemp;

        if (rightSize > 0)
        {
            // Extract the word on the right of the insertion point
            // NOTE(review): every other copy of this kind in the file
            // copies length + 2 * sizeof(WORD); this one copies one WORD
            // less. Left unchanged -- confirm whether that is intended.
            MEMCPY (pTmpNodeInfo->pTmpResult, pWord,
                *(LPUW)pWord + sizeof(WORD)); // erinfox RISC patch
            pInsertPtr = ExtractWord(pTmpNodeInfo->pTmpResult,
                pInsertPtr, &wWLen);
            pWordStorage += PrefixCompressWord (pWordStorage,
                pTmpNodeInfo->pTmpResult, pWord, fLength);

            // Copy data on the right of the insertion point to the new node
            cbByteMoved = (LONG)(pNodeInfo->pMaxAddress - pInsertPtr);
            MEMCPY (pWordStorage, pInsertPtr, cbByteMoved);
            pWordStorage += cbByteMoved;
        }

        pTmpNodeInfo->pMaxAddress = pWordStorage;

        // Update the right node
        pTmpNodeInfo->cbLeft =
            (LONG)(dwBlockSize - (pWordStorage - pTmpNodeInfo->pBuffer));
        SETWORD(pTmpNodeInfo->pBuffer + cbSkip, (WORD)pTmpNodeInfo->cbLeft);
        pTmpNodeInfo->pMaxAddress = pTmpNodeInfo->pBuffer + dwBlockSize -
            pTmpNodeInfo->cbLeft;
#if 0
        if (fIsStemNode)
            CheckStemNode (pTmpNodeInfo);
        else
            CheckLeafNode (pTmpNodeInfo, lpipb->occf);
#endif
        MEMSET (pWordStorage, 0, pTmpNodeInfo->cbLeft);
        if ((fRet = CreateNewNode (lpipb, cLevel, fIsStemNode,
            NEW_NODE_ON_RIGHT)) != S_OK)
            return(fRet);

        // Update the left node: it now ends at the insertion point
        pNodeInfo->fFlag = TO_BE_UPDATE;
        pNodeInfo->cbLeft =
            (LONG)(dwBlockSize - (pNodeInfo->pCurPtr - pBuffer));
        SETWORD(pBuffer + cbSkip, (WORD)pNodeInfo->cbLeft);
#ifdef _DEBUG
        MEMSET (pNodeInfo->pCurPtr, 0, pNodeInfo->cbLeft);
#endif
        pNodeInfo->pMaxAddress = pBuffer + dwBlockSize - pNodeInfo->cbLeft;
        pNodeInfo->fFlag = TO_BE_UPDATE;
#if 0
        if (fIsStemNode)
            CheckStemNode (pNodeInfo);
        else
            CheckLeafNode (pNodeInfo, lpipb->occf);
#endif
        if (cLevel == 0)
        {
            // The root itself was split: build a new root above it
            if (pNodeInfo->pCurPtr >= pNodeInfo->pMaxAddress - 1)
                pNodeInfo->pCurPtr = pNodeInfo->pBuffer + cbSkip +
                    sizeof(WORD);
            GetLastWordInNode (lpipb, pNodeInfo, fIsStemNode);
            GetLastWordInNode (lpipb, pTmpNodeInfo, fIsStemNode);
            return AddRecordToBTree (lpipb, pWord, NULL, cLevel - 1,
                USE_BOTH_NODE_40 | USE_TEMP_FOR_RIGHT_NODE_10);
        }

        if (rightSize > 0)
        {
            if ((fRet = AddRecordToBTree (lpipb, pWord, NULL, cLevel - 1,
                USE_TEMP_NODE_04 | UPDATE_NODE_ADDRESS_08)) != S_OK)
                return fRet;
            return AddRecordToBTree (lpipb, pNodeInfo->pLastWord, NULL,
                cLevel - 1, 0);
        }

        if (fFlag & REPLACE_WORD_01)
        {
            // rightSize == 0 means that we are adding to the end of the block.
            // REPLACE_WORD means that we are replacing the same word, so basically
            // we have to add a new entry for the left block
            if ((fRet = AddRecordToBTree (lpipb, pWord, NULL, cLevel - 1,
                USE_TEMP_NODE_04 | REPLACE_WORD_01)) != S_OK)
                return fRet;
            return AddRecordToBTree (lpipb, pNodeInfo->pLastWord, NULL,
                cLevel - 1, 0);
        }

        // Add to the end
        return AddRecordToBTree (lpipb, pWord, NULL, cLevel - 1,
            USE_TEMP_NODE_04 | SKIP_NEXT_WORD_20);
    }

    //**********************************************
    //
    // Add the new data to the end of the left node
    //
    //**********************************************
    // We add to the left. The new data will be last
    // Example:
    //    Add 2 into 1 3 4 5 --> 1 2 and 3 4 5
    pTmpNodeInfo->pCurPtr = pWordStorage =
        pTmpNodeInfo->pBuffer + cbSkip + sizeof(WORD);

    // Copy the data on the left to the new node
    cbByteMoved = leftSize;
    if (cbByteMoved)
    {
        MEMCPY(pWordStorage, pBuffer + cbSkip + sizeof(WORD), cbByteMoved);
        pWordStorage += cbByteMoved;
    }

    // Emit new data
    pWordStorage += PrefixCompressWord (pWordStorage, pWord, pLastWord,
        lpipb->occf & OCCF_LENGTH);

    if (fIsStemNode)
    {
        if (fFlag & USE_TEMP_NODE_04)
        {
            pWordStorage += CopyFileOffset (pWordStorage,
                lpipb->BTreeData.rgpTmpNodeInfo[cLevel+1]->nodeOffset);
        }
        else
        {
            pWordStorage += CopyFileOffset (pWordStorage,
                lpipb->BTreeData.rgpNodeInfo[cLevel+1]->nodeOffset);
        }
    }
    else
    {
        // Emit field id, topic count, fileoffset, datasize
        if (lpipb->occf & OCCF_FIELDID)
            pWordStorage += CbBytePack (pWordStorage, pWordInfo->dwFieldId);
        pWordStorage += CbBytePack (pWordStorage,
            pWordInfo->dwMergeTopicCount);
        pWordStorage += CopyFileOffset (pWordStorage,
            pWordInfo->dataLocation);
        pWordStorage += CbBytePack (pWordStorage, pWordInfo->dwDataSize);
    }

    pTmpNodeInfo->cbLeft = (LONG)(pNodeInfo->dwBlockSize -
        (pWordStorage - pTmpNodeInfo ->pBuffer));
    SETWORD (pTmpNodeInfo->pBuffer + cbSkip, (WORD)pTmpNodeInfo->cbLeft);
    pTmpNodeInfo->pMaxAddress = pWordStorage;

    if ((fRet = CreateNewNode (lpipb, cLevel, fIsStemNode,
        NEW_NODE_ON_LEFT)) != S_OK)
        return(fRet);

    // Update the right node
    if (leftSize > 0)
    {
        // Slide the remaining entries down to the start of the node
        pNodeInfo->pCurPtr = pBuffer + cbSkip + sizeof(WORD);
        MEMMOVE(pNodeInfo->pCurPtr, pInsertPtr,
            (size_t)(pNodeInfo->pMaxAddress - pInsertPtr));
        pNodeInfo->pMaxAddress -= cbByteMoved;

        // Reconstruct the 1st word in the node.
        if (fFlag & REPLACE_WORD_01)
        {
            MEMCPY (pTmpNodeInfo->pTmpResult, pWord,
                *(LPUW)pWord + sizeof(WORD) + sizeof(WORD)); // erinfox RISC patch
        }
        else
        {
            MEMCPY (pTmpNodeInfo->pTmpResult, pLastWord,
                *(LPUW)pLastWord + sizeof(WORD) + sizeof(WORD)); // erinfox RISC patch
        }
    }

    pInsertPtr = pNodeInfo->pCurPtr;
    pTemp = pInsertPtr;
    pInsertPtr = ExtractWord(pTmpNodeInfo->pTmpResult, pTemp, &wWLen);
    cbByteMoved = (LONG)(pInsertPtr - pTemp);

    // Recompress the word using pLastWord of pTmpNodeInfo
    wWLen = (WORD) PrefixCompressWord (pTmpNodeInfo->pLastWord,
        pTmpNodeInfo->pTmpResult, EmptyWord, fLength);

    // Reserve room for the word
    pWordStorage = pBuffer + cbSkip + sizeof(WORD);
    MEMMOVE (pWordStorage + wWLen, pInsertPtr,
        (size_t)(pNodeInfo->pMaxAddress - pInsertPtr));

    // Copy down the word
    MEMCPY(pWordStorage, pTmpNodeInfo->pLastWord, wWLen);
    pNodeInfo->pMaxAddress += wWLen - cbByteMoved;

    // Update the right node
    pNodeInfo->cbLeft =
        (WORD)(dwBlockSize - (pNodeInfo->pMaxAddress - pBuffer));
    SETWORD(pBuffer + cbSkip, (WORD)pNodeInfo->cbLeft);
    pNodeInfo->fFlag = TO_BE_UPDATE;
#ifdef _DEBUG
    MEMSET (pNodeInfo->pMaxAddress, 0, pNodeInfo->cbLeft);
#endif
    if (cLevel == 0)
    {
        // The root itself was split: build a new root above it
        GetLastWordInNode (lpipb, pNodeInfo, fIsStemNode);
        GetLastWordInNode (lpipb, pTmpNodeInfo, fIsStemNode);
        return AddRecordToBTree (lpipb, pWord, NULL, cLevel - 1,
            USE_BOTH_NODE_40);
    }
    return AddRecordToBTree (lpipb, pWord, NULL, cLevel - 1,
        USE_TEMP_NODE_04);
}
/*************************************************************************
 *  GetLastWordInNode
 *
 *  Walk the entries of a node, starting at pNodeInfo->pCurPtr, decoding
 *  each prefix-compressed word into pNodeInfo->pTmpResult. When the walk
 *  reaches the end of the used part of the node, pTmpResult holds the
 *  last word decoded (or the empty word if there was nothing to walk).
 *
 *  Parameters:
 *      lpipb       - index parameter block (only occf is read).
 *      pNodeInfo   - node to scan; pTmpResult is overwritten.
 *      fIsStemNode - non-zero for a stem node, whose entries are a word
 *                    followed by a file offset only.
 *************************************************************************/
VOID GetLastWordInNode (_LPIPB lpipb, PNODEINFO pNodeInfo, BOOL fIsStemNode)
{
    LPB   pScan = pNodeInfo->pCurPtr;
    WORD  cbWord;
    DWORD dwIgnored;

    // Start from the empty word so the first entry decompresses cleanly
    MEMCPY (pNodeInfo->pTmpResult, EmptyWord, 4);

    for (;;)
    {
        if (pScan >= pNodeInfo->pMaxAddress - 1)
            break;

        pScan = ExtractWord(pNodeInfo->pTmpResult, pScan, &cbWord);

        if (fIsStemNode)
        {
            // Stem entry: word + child file offset
            pScan += FOFFSET_SIZE;
            continue;
        }

        // Leaf entry: word [+ field id] [+ topic count] + file offset
        // + data size
        if (lpipb->occf & OCCF_FIELDID)
            pScan += CbByteUnpack (&dwIgnored, pScan);
        if (lpipb->occf & OCCF_TOPICID)
            pScan += CbByteUnpack (&dwIgnored, pScan);  // Topic count
        pScan += FOFFSET_SIZE;                          // FileOffset
        pScan += CbByteUnpack (&dwIgnored, pScan);      // Data size
    }
}
/*************************************************************************
 *  CreateNewNode
 *
 *  Append the level's temporary node to the index file as a new block
 *  and, for leaf nodes, splice it into the leaf linked list.
 *
 *  Parameters:
 *      lpipb       - index parameter block.
 *      cLevel      - tree level whose temporary node is written.
 *      fIsStemNode - non-zero for stem nodes (no sibling links).
 *      fAfter      - NEW_NODE_ON_RIGHT (non-zero): the new node follows
 *                    the existing one; NEW_NODE_ON_LEFT (0): it precedes it.
 *
 *  Returns:
 *      S_OK, or the error reported by FileSeekWrite.
 *************************************************************************/
PRIVATE HRESULT PASCAL NEAR CreateNewNode(_LPIPB lpipb, int cLevel,
    int fIsStemNode, int fAfter)
{
    LONG      cbNode  = lpipb->BTreeData.Header.dwBlockSize;
    PNODEINFO pOldNode = lpipb->BTreeData.rgpNodeInfo[cLevel];
    PNODEINFO pNewNode = lpipb->BTreeData.rgpTmpNodeInfo[cLevel];
    ERRB      errb;

#ifdef _DEBUG
    dwNewNodeSize += cbNode;
#endif

    if (!fIsStemNode)
    {
        // Set the new node's forward link: it either inherits the old
        // node's successor or points at the old node itself.
        if (fAfter)
            CopyFileOffset (pNewNode->pBuffer, pOldNode->nextNodeOffset);
        else
            CopyFileOffset (pNewNode->pBuffer, pOldNode->nodeOffset);
    }

    // Write out the new node at the end of the file
    if ((FileSeekWrite(lpipb->hfpbIdxFile, pNewNode->pBuffer,
        lpipb->foMaxOffset, cbNode, &errb)) != (LONG)cbNode)
    {
        return(errb);
    }

    // Remember the offset of this node
    pNewNode->nodeOffset = lpipb->foMaxOffset;

    if (!fIsStemNode)
    {
        if (fAfter)
        {
            // The old node now points forward at the new node
            CopyFileOffset (pOldNode->pBuffer, lpipb->foMaxOffset);
            pOldNode->fFlag = TO_BE_UPDATE;
        }
        else if (!FoEquals(pOldNode->prevNodeOffset, foNil))
        {
            // Update the previous node's link directly on disk
            BYTE TempBuf[FOFFSET_SIZE + 1];

            CopyFileOffset (TempBuf,lpipb->foMaxOffset);
            if ((FileSeekWrite(lpipb->hfpbIdxFile, TempBuf,
                pOldNode->prevNodeOffset, FOFFSET_SIZE, &errb)) !=
                FOFFSET_SIZE)
            {
                return(errb);
            }
        }
    }

    lpipb->foMaxOffset = FoAddDw (lpipb->foMaxOffset, cbNode);
    return(S_OK);
}
/*************************************************************************
 *  WriteNewDataRecord
 *
 *  Encode the data block of a brand-new word into the temporary output
 *  file, then copy it into the index file -- into a free-list block when
 *  GetFreeBlock finds one, otherwise at the end of the file.
 *
 *  Parameters:
 *      lpipb     - index parameter block.
 *      pWordInfo - word record; dwMergeTopicCount, dwDataSize and
 *                  dataLocation are filled in.
 *
 *  Returns:
 *      S_OK, or the error from WriteDataNode / FileWrite / CopyBlockFile.
 *************************************************************************/
PRIVATE HRESULT PASCAL NEAR WriteNewDataRecord (_LPIPB lpipb,
    PWORDINFO pWordInfo)
{
    PFILEDATA pOutFile = &lpipb->OutFile;
    DWORD     dwBlockSize;
    LONG      cbPending;
    ERRB      errb;
    HRESULT   fRet;
    FREEBLOCK FreeBlock;

    // Reset the characteristic of the file
    pOutFile->pCurrent = pOutFile->pMem;
    pOutFile->cbLeft = pOutFile->dwMax;
    pOutFile->ibit = cbitBYTE - 1;
    pOutFile->foPhysicalOffset = foNil;
    FileSeek (pOutFile->fFile, pOutFile->foPhysicalOffset, 0, &errb);

    // Write out the data into the temp file
    pWordInfo->dwMergeTopicCount = pWordInfo->dwNewTopicCount;
    dwBlockSize = WriteDataNode (lpipb, pWordInfo->dwMergeTopicCount, &errb);
    if (dwBlockSize == 0)
        return errb;

    // Write out the output buffer
    cbPending = (LONG)(pOutFile->pCurrent - pOutFile->pMem);
    if (FileWrite (pOutFile->fFile, pOutFile->pMem, cbPending, &errb) !=
        cbPending)
    {
        return(errb);
    }

    // if ((errb.err = FileFlush (pOutFile->fFile)) != S_OK)
    //     return(errb.err);

    pWordInfo->dwDataSize = dwBlockSize;

    // Find the smallest free block that fits the new data
    if (GetFreeBlock (lpipb, &FreeBlock, dwBlockSize) == S_OK)
    {
        // There is a free block large enough to store the data
        fRet = CopyBlockFile (pOutFile, lpipb->hfpbIdxFile,
            FreeBlock.foBlockOffset, dwBlockSize);
        if (fRet != S_OK)
            return fRet;

        pWordInfo->dataLocation = FreeBlock.foBlockOffset;
        return S_OK;
    }

#ifdef _DEBUGFREE
    _DPF2("GetFreeBlock failed. Requested %ld bytes, appending to EOF(%ld)\n", dwBlockSize, lpipb->foMaxOffset.dwOffset);
#endif

    // There is no free block large enough to store the data:
    // append it to the end of the index file
    fRet = CopyBlockFile (pOutFile, lpipb->hfpbIdxFile,
        lpipb->foMaxOffset, dwBlockSize);
    if (fRet != S_OK)
        return fRet;

    pWordInfo->dataLocation = lpipb->foMaxOffset;
    lpipb->foMaxOffset = FoAddDw (lpipb->foMaxOffset, dwBlockSize);

#ifdef _DEBUG
    dwNewDataSize += dwBlockSize;
#endif

    return(S_OK);
}
/*************************************************************************
 *  GetFreeBlock  (erinfox)
 *
 *  Ask the free list for the best-fitting block of at least dwBlockSize
 *  bytes.
 *
 *  Parameters:
 *      lpipb       - index parameter block (owns hFreeList).
 *      pFreeBlock  - on success, foBlockOffset receives the block's file
 *                    offset. dwBlockSize of the structure is not touched.
 *      dwBlockSize - number of bytes required.
 *
 *  Returns:
 *      S_OK when a block was found, otherwise the error set by
 *      FreeListGetBestFit.
 *************************************************************************/
PRIVATE HRESULT GetFreeBlock (_LPIPB lpipb, PFREEBLOCK pFreeBlock,
    DWORD dwBlockSize)
{
    ERRB       errb;
    FILEOFFSET foBestFit;

    foBestFit = FreeListGetBestFit(lpipb->hFreeList,
        MakeFo(dwBlockSize,0), &errb);

    if (!FoIsNil(foBestFit))
    {
        pFreeBlock->foBlockOffset = foBestFit;
        return S_OK;
    }

    // A nil offset means no suitable block: report the lookup's error
    return errb;
}
/*************************************************************************
 *  CopyBlockFile
 *
 *  Copy the first dwBlockSize bytes of a temporary file to a given
 *  offset of the destination file, using the temporary file's own memory
 *  buffer (pMem, dwMax bytes) as the transfer buffer.
 *
 *  Parameters:
 *      pFileData   - source file description (fFile, pMem, dwMax).
 *      hfpbDest    - destination file handle.
 *      foOffset    - destination offset.
 *      dwBlockSize - number of bytes to copy.
 *
 *  Returns:
 *      S_OK; a seek error; E_FILEREAD or E_FILEWRITE on a short transfer.
 *************************************************************************/
PRIVATE HRESULT PASCAL NEAR CopyBlockFile (PFILEDATA pFileData, HFPB hfpbDest,
    FILEOFFSET foOffset, DWORD dwBlockSize)
{
    LONG cbChunk;
    ERRB errb = S_OK;

    // Position the source at its start and the destination at foOffset
    FileSeek (pFileData->fFile, foNil, 0, &errb);
    if (errb != S_OK)
        return(errb);

    FileSeek (hfpbDest, foOffset, 0, &errb);
    if (errb != S_OK)
        return(errb);

    // Transfer one buffer-sized chunk at a time
    for (; dwBlockSize; dwBlockSize -= cbChunk)
    {
        cbChunk = dwBlockSize;
        if (cbChunk > pFileData->dwMax)
            cbChunk = pFileData->dwMax;

        if (FileRead (pFileData->fFile, pFileData->pMem, cbChunk,
            &errb) != cbChunk)
            return(E_FILEREAD);

        if (FileWrite(hfpbDest, pFileData->pMem, cbChunk,
            &errb) != cbChunk)
            return(E_FILEWRITE);
    }
    return(S_OK);
}
/*************************************************************************
 *  UpdateDataNode
 *
 *  Merge the existing data block of a word (read from the index file
 *  through lpipb->pIndexDataNode) with the word's new data (read from
 *  lpipb->InFile) and write the merged block back to the index file.
 *
 *  Both inputs are walked in increasing topic-id order:
 *    - topic only in the index          -> EmitOldData
 *    - topic in both                    -> old data skipped, EmitNewData
 *                                          with fnewData == FALSE
 *    - topic only in the new data       -> EmitNewData with fnewData ==
 *                                          TRUE, dwIndexTopicCount++
 *  The merged stream is built in lpipb->OutFile. If it is larger than
 *  the old block (pWordInfo->dwDataSize), the old block is handed to the
 *  free list and the data goes to the best-fit free block or to the end
 *  of the file; otherwise it overwrites the old block in place.
 *
 *  Parameters:
 *      lpipb     - index parameter block.
 *      pWordInfo - word record; dataLocation / dwDataSize describe the
 *                  old block on entry and may be updated; on success
 *                  dwMergeTopicCount receives the merged topic count.
 *
 *  Returns:
 *      S_OK, or the first error returned by a helper.
 *
 *  NOTE(review): the FileRead calls that refill pInFile do not check the
 *  return value or errb -- confirm how FileRead reports a failure.
 *************************************************************************/
PRIVATE HRESULT PASCAL NEAR UpdateDataNode (_LPIPB lpipb, PWORDINFO pWordInfo) { // Local replacement Variables
PBTREEDATA pTreeData = &lpipb->BTreeData; PFILEDATA pOutFile = &lpipb->OutFile; // Output data structure
PFILEDATA pInFile = &lpipb->InFile; // Input data structre
HFPB fFile = pOutFile->fFile; // Output file handle
PNODEINFO pIndexDataNode = lpipb->pIndexDataNode; DWORD dwNewDataSize; ERRB errb;
// Working Variables
DWORD dwEncodedSize = 0; // Size of encoded block
DWORD dwTopicIdDelta; // Really only used for weight values
DWORD dwNewTopicId = 0; DWORD dwIndexTopicId = 0; DWORD dwNewTopicCount; DWORD dwIndexTopicCount; DWORD dwTopicCount; FILEOFFSET foStart; // Physical beginning of bit compression block
WORD wWeight = 0; // Only used when IDXF_NORMALIZE is set
DWORD dwTopicId = 0; // Only used when IDXF_NORMALIZE is set
int cbTemp; // # of compressed bytes that uncompressed
OCCF occf = lpipb->occf; BYTE fetchOldData; BYTE fetchNewData; PIH20 pHeader = &lpipb->BTreeData.Header; HRESULT fRet;
// Initialize variables
wWeight = 0; // UNDONE: Don't need it
// Reset the file pointer
// The remainder of the next line resets the output buffer, loads the
// topic counts and clears the running topic ids.
FileSeek (pOutFile->fFile, foStart = pOutFile->foPhysicalOffset = foNil, 0, &errb); pOutFile->pCurrent = pOutFile->pMem; pOutFile->cbLeft = pOutFile->dwMax; pOutFile->ibit = cbitBYTE - 1; dwIndexTopicCount = pWordInfo->dwIndexTopicCount; dwNewTopicCount = pWordInfo->dwNewTopicCount; fetchOldData = fetchNewData = TRUE; pWordInfo->dwOldTopicId = pWordInfo->dwNewTopicId = dwTopicCount = 0; // Initialize pIndexDataNode structure
// Phase 1: merge while both the index and the new data still have topics.
pIndexDataNode->nodeOffset = pWordInfo->dataLocation; pIndexDataNode->dwDataSizeLeft = pWordInfo->dwDataSize; if ((fRet = ReadNewData(pIndexDataNode)) != S_OK) return(fRet); while (dwIndexTopicCount && dwNewTopicCount) { // Get the topicId from the new file
if (fetchNewData) { if (pInFile->cbLeft < 2 * sizeof (DWORD)) { MEMMOVE (pInFile->pMem, pInFile->pCurrent, pInFile->cbLeft); pInFile->cbLeft += FileRead (pInFile->fFile, pInFile->pMem + pInFile->cbLeft, pInFile->dwMax - pInFile->cbLeft, &errb); pInFile->dwMax = pInFile->cbLeft; pInFile->pCurrent = pInFile->pMem; } cbTemp = CbByteUnpack (&dwTopicIdDelta, pInFile->pCurrent); pInFile->pCurrent += cbTemp; pInFile->cbLeft -= cbTemp;
pWordInfo->dwNewTopicId = (dwNewTopicId += dwTopicIdDelta); fetchNewData = FALSE; } if (fetchOldData) { if (pIndexDataNode->ibit < cbitBYTE - 1) { pIndexDataNode->ibit = cbitBYTE - 1; pIndexDataNode->pCurPtr ++; } // Get the topicId from the index file
// Phase 2 (second loop on the next line): only old index topics remain.
if ((fRet = FGetDword(pIndexDataNode, pHeader->ckeyTopicId, &dwTopicIdDelta)) != S_OK) return fRet; pWordInfo->dwIndexTopicId = (dwIndexTopicId += dwTopicIdDelta); fetchOldData = FALSE; } if (dwIndexTopicId < dwNewTopicId) { if ((fRet = EmitOldData (lpipb, pIndexDataNode, pWordInfo)) != S_OK) return(fRet); fetchOldData = TRUE; dwTopicCount++; dwIndexTopicCount --; } else if (dwIndexTopicId == dwNewTopicId) { DWORD dwTmp; if (lpipb->idxf & IDXF_NORMALIZE) { if ((fRet = FGetBits(pIndexDataNode, &dwTmp, sizeof (USHORT) * cbitBYTE)) != S_OK) return fRet; } if (occf & OCCF_HAVE_OCCURRENCE) { if ((fRet = SkipOldData (lpipb, pIndexDataNode)) != S_OK) return(fRet); } fetchOldData = TRUE; dwIndexTopicCount --; if ((fRet = EmitNewData (lpipb, pWordInfo, FALSE)) != S_OK) return(fRet); dwNewTopicCount --; fetchNewData = TRUE; dwTopicCount++; } else { if ((fRet = EmitNewData (lpipb, pWordInfo, TRUE)) != S_OK) return(fRet); dwNewTopicCount --; fetchNewData = TRUE; pWordInfo->dwIndexTopicCount++; dwTopicCount++; } } while (dwIndexTopicCount) { if (fetchOldData) { if (pIndexDataNode->ibit < cbitBYTE - 1) { pIndexDataNode->ibit = cbitBYTE - 1; pIndexDataNode->pCurPtr ++; } // Get the topicId from the index file
// Phase 3 (loop at the end of the next line): only new topics remain.
if ((fRet = FGetDword(pIndexDataNode, pHeader->ckeyTopicId, &dwTopicIdDelta)) != S_OK) return fRet; pWordInfo->dwIndexTopicId = (dwIndexTopicId += dwTopicIdDelta); fetchOldData = FALSE; } if ((fRet = EmitOldData (lpipb, pIndexDataNode, pWordInfo)) != S_OK) return(fRet); fetchOldData = TRUE; dwIndexTopicCount --; dwTopicCount++; } while (dwNewTopicCount) { // Get the topicId from the new file
if (fetchNewData) { if (pInFile->cbLeft < 2 * sizeof (DWORD)) { MEMMOVE (pInFile->pMem, pInFile->pCurrent, pInFile->cbLeft); pInFile->cbLeft += FileRead (pInFile->fFile, pInFile->pMem + pInFile->cbLeft, pInFile->dwMax - pInFile->cbLeft, &errb); pInFile->dwMax = pInFile->cbLeft; pInFile->pCurrent = pInFile->pMem; } cbTemp = CbByteUnpack (&dwTopicIdDelta, pInFile->pCurrent); pInFile->pCurrent += cbTemp; pInFile->cbLeft -= cbTemp; pWordInfo->dwNewTopicId = (dwNewTopicId += dwTopicIdDelta); fetchNewData = FALSE; } if ((fRet = EmitNewData (lpipb, pWordInfo, TRUE)) != S_OK) return(fRet); fetchNewData = TRUE; dwNewTopicCount --; dwTopicCount++; pWordInfo->dwIndexTopicCount++; } // Adjust for some bits used
// A partially filled last byte still counts as one full output byte.
if (pOutFile->ibit < cbitBYTE - 1) { pOutFile->pCurrent++; pOutFile->cbLeft--; pOutFile->foPhysicalOffset = FoAddDw (pOutFile->foPhysicalOffset, 1); }
// Flush the output buffer
// After the flush, the merged size is the distance written since foStart;
// the block has to be relocated only when it outgrew its old slot.
if (FileWrite (pOutFile->fFile, pOutFile->pMem, (LONG)(pOutFile->pCurrent - pOutFile->pMem), &errb) != (LONG)(pOutFile->pCurrent - pOutFile->pMem)) return(errb); dwNewDataSize = DwSubFo(pOutFile->foPhysicalOffset, foStart); if (pWordInfo->dwDataSize < dwNewDataSize) { // ERIC: Find the best fit block here
// - Add the block pointed by pWordInfo into the free list
// - Find a new block in the freelist
// if ((fRet = CopyBlockFile (pOutFile, lpipb->hfpbIdxFile,
// foNewDataOffset, dwNewDataSize)) != S_OK)
// where foNewDataOffset may be the max offset or the freelist
// block offset
FILEOFFSET foOffset1, foNewDataOffset; WORD wNumBlocksTemp; WORD wMaxBlocksTemp; // Before adding that block to the FreeList,
// look if we need to change the size of the FreeList
QFREELIST qFreeList = _GLOBALLOCK(lpipb->hFreeList); wNumBlocksTemp = qFreeList->flh.wNumBlocks; wMaxBlocksTemp = qFreeList->flh.wMaxBlocks; _GLOBALUNLOCK(lpipb->hFreeList); // we use a count of two in the test below, in case not only old block is added but
// also an entry for the unused portion of the new block (later).
if (wMaxBlocksTemp < 2 || wNumBlocksTemp >= wMaxBlocksTemp - 2) { HFREELIST hFreeListTemp;
// if the free list can't grow, fall through to FreeListAdd, where the
// smallest free entry will be overwritten and re-used
if (wMaxBlocksTemp < MAXWORD - wDefaultFreeListSize) { hFreeListTemp = FreeListRealloc(lpipb->hFreeList, (WORD)(wMaxBlocksTemp + wDefaultFreeListSize), &errb); if (errb != S_OK) return errb; lpipb->hFreeList = hFreeListTemp; } }
// Release the old block first, then look for a block that fits the
// merged data (this may be the block just released plus others).
FreeListAdd(lpipb->hFreeList, pWordInfo->dataLocation, MakeFo(pWordInfo->dwDataSize,0)); foNewDataOffset = FreeListGetBestFit(lpipb->hFreeList, MakeFo(dwNewDataSize,0), &errb);
if (FoIsNil(foNewDataOffset)) { #ifdef _DEBUGFREE
_DPF2("UpdateDataNode: Grow from %ld to %ld failed: appending to EOF\n", pWordInfo->dwDataSize,\ dwNewDataSize); #endif
foNewDataOffset = lpipb->foMaxOffset; } else { #ifdef _DEBUGFREE
_DPF3("UpdateDataNode: Grow from %ld to %ld uses free block at %ld\n", pWordInfo->dwDataSize,\ dwNewDataSize, foNewDataOffset.dwOffset ); #endif
// NOTE(review): foOffset1 is treated as the size of the chosen free
// block; the part beyond dwNewDataSize is handed back to the free list
// -- confirm against FreeListGetBlockAt.
foOffset1 = FreeListGetBlockAt(lpipb->hFreeList, foNewDataOffset, &errb); if (FoCompare(foOffset1,MakeFo(sizeof(FREELIST),0)) > 0) FreeListAdd(lpipb->hFreeList, FoAddDw(foNewDataOffset,dwNewDataSize), FoSubFo(foOffset1,MakeFo(dwNewDataSize,0))); }
if ((fRet = CopyBlockFile (pOutFile, lpipb->hfpbIdxFile, foNewDataOffset, dwNewDataSize)) != S_OK) return fRet;
pWordInfo->dataLocation = foNewDataOffset;
//if ((fRet = CopyBlockFile (pOutFile, lpipb->hfpbIdxFile,
// lpipb->foMaxOffset, dwNewDataSize)) != S_OK)
// return fRet;
//pWordInfo->dataLocation = lpipb->foMaxOffset;
// ERIC: Only increase the size of the file if foMaxOffset is used
if (FoEquals(foNewDataOffset,lpipb->foMaxOffset)) { lpipb->foMaxOffset = FoAddDw (lpipb->foMaxOffset, dwNewDataSize);
#ifdef _DEBUG
dwOldDataLoss += pWordInfo->dwDataSize; dwOldDataNeed += dwNewDataSize; #endif
}
// In the else branch below the merged data still fits, so it is copied
// over the old block in place and dwDataSize is left unchanged.
pWordInfo->dwDataSize = dwNewDataSize; } else { if ((fRet = CopyBlockFile (pOutFile, lpipb->hfpbIdxFile, pWordInfo->dataLocation, dwNewDataSize)) != S_OK) return fRet; } pWordInfo->dwMergeTopicCount = dwTopicCount; return(S_OK); }
/*************************************************************************
 *  SkipOldData
 *
 *  Advance the index data stream past the occurrence list of the current
 *  topic: read the occurrence count, then decode and discard the word
 *  count and/or byte offset of each occurrence, as selected by occf.
 *
 *  Parameters:
 *      lpipb          - index parameter block (occf and the header keys).
 *      pIndexDataNode - stream positioned at the occurrence count.
 *
 *  Returns:
 *      S_OK, or the first decoding error.
 *************************************************************************/
PUBLIC HRESULT PASCAL FAR SkipOldData (_LPIPB lpipb, PNODEINFO pIndexDataNode)
{
    PIH20   pHeader = &lpipb->BTreeData.Header;
    OCCF    occf = lpipb->occf;
    DWORD   cOccLeft;
    DWORD   dwDiscard;      // Decoded values are thrown away
    HRESULT fRet;

    // Get the number of occurrences
    fRet = FGetDword(pIndexDataNode, pHeader->ckeyOccCount, &cOccLeft);
    if (fRet != S_OK)
        return fRet;

    // One pass for each occurrence in the current sub-list
    while (cOccLeft)
    {
        // Keeping word-counts? If so, consume it.
        if (occf & OCCF_COUNT)
        {
            fRet = FGetDword(pIndexDataNode, pHeader->ckeyWordCount,
                &dwDiscard);
            if (fRet != S_OK)
                return fRet;
        }

        // Keeping byte-offsets? If so, consume it.
        if (occf & OCCF_OFFSET)
        {
            fRet = FGetDword(pIndexDataNode, pHeader->ckeyOffset,
                &dwDiscard);
            if (fRet != S_OK)
                return fRet;
        }

        cOccLeft--;
    }
    return S_OK;
}
/*************************************************************************
 *  EmitNewData
 *
 *  Encode one topic of the new data into the output buffer: the topic-id
 *  delta, then (when IDXF_NORMALIZE is set) a 16-bit term weight, then
 *  (when occurrences are kept) the occurrence count followed by each
 *  occurrence's word count and/or byte offset, which are read from the
 *  input buffer lpipb->InFile.
 *
 *  Parameters:
 *      lpipb     - index parameter block.
 *      pWordInfo - word record; dwNewTopicId is the topic being emitted,
 *                  dwOldTopicId is updated to it, dwIndexTopicCount is
 *                  used for the weight.
 *      fnewData  - TRUE when the topic did not exist in the index before;
 *                  only then is the topic's sigma term updated.
 *
 *  Returns:
 *      S_OK, or the first encoding error.
 *
 *  NOTE(review): the FileRead calls that refill the input buffer do not
 *  check the return value or errb -- confirm how FileRead reports errors.
 *************************************************************************/
PRIVATE HRESULT PASCAL FAR EmitNewData (_LPIPB lpipb, PWORDINFO pWordInfo,
    BOOL fnewData)
{
    DWORD     dwTopicDelta;
    DWORD     dwOccs = 0;
    DWORD     dwTemp;
    WORD      wWeight = 0;
    PFILEDATA pInFile = &lpipb->InFile;
    PFILEDATA pOutFile = &lpipb->OutFile;
    OCCF      occf = lpipb->occf;
    PIH20     pHeader = &lpipb->BTreeData.Header;
    int       cbTemp;
    ERRB      errb;
    HRESULT   fRet;

    // Set the delta
    dwTopicDelta = pWordInfo->dwNewTopicId - pWordInfo->dwOldTopicId;
    pWordInfo->dwOldTopicId = pWordInfo->dwNewTopicId;

    // Each topic starts on a byte boundary: skip a partially used byte
    if (pOutFile->ibit < cbitBYTE - 1)
    {
        pOutFile->pCurrent++;
        pOutFile->cbLeft--;
        pOutFile->foPhysicalOffset = FoAddDw (pOutFile->foPhysicalOffset, 1);
        pOutFile->ibit = cbitBYTE - 1;
    }

    // The encoder result used to be ignored here, unlike every other
    // FAddDword call in this function and in EmitOldData.
    if ((fRet = FAddDword (pOutFile, dwTopicDelta,
        pHeader->ckeyTopicId)) != S_OK)
        return(fRet);

    if (occf & OCCF_HAVE_OCCURRENCE)
    {
        // Get number of occ data records for this topic
        if (pInFile->cbLeft < 2 * sizeof (DWORD))
        {
            MEMMOVE (pInFile->pMem, pInFile->pCurrent, pInFile->cbLeft);
            pInFile->cbLeft += FileRead (pInFile->fFile,
                pInFile->pMem + pInFile->cbLeft,
                pInFile->dwMax - pInFile->cbLeft, &errb);
            pInFile->dwMax = pInFile->cbLeft;
            pInFile->pCurrent = pInFile->pMem;
        }
        cbTemp = CbByteUnpack (&dwOccs, pInFile->pCurrent);
        pInFile->pCurrent += cbTemp;
        pInFile->cbLeft -= cbTemp;
    }

    // If we are term weighing we have to calculate the weight
    if (lpipb->idxf & IDXF_NORMALIZE)
    {
        FLOAT rLog;
        FLOAT rTerm;
        FLOAT rWeight;
        FLOAT fOcc;

#ifndef ISBU_IR_CHANGE
        rLog = (float) log10(cHundredMillion/(double)pWordInfo->dwIndexTopicCount);
        rTerm = rLog*rLog;

        if (fnewData)
        {
            fOcc = (float) min(cTFThreshold, dwOccs);

            // Add the new factor into the sigma term
            lpipb->wi.hrgsigma[pWordInfo->dwNewTopicId] *=
                lpipb->wi.hrgsigma[pWordInfo->dwNewTopicId];
            lpipb->wi.hrgsigma[pWordInfo->dwNewTopicId] +=
                fOcc * fOcc * rTerm;
            lpipb->wi.hrgsigma[pWordInfo->dwNewTopicId] =
                (float)(sqrt((double)lpipb->wi.hrgsigma[pWordInfo->dwNewTopicId]));
        }

        // NOTE : The following weight computation, until the assignment to wWeight, is
        // very similar to the weight computation in WriteDataNode() of permind2.c file.
        // Read the explanation there for the hard coded figures and logic appearing below.
        rTerm = (float) (8.0 - log10((double)pWordInfo->dwIndexTopicCount));

        // In extreme cases, rTerm could be 0 or even -ve (when dwTopicCount approaches or
        // exceeds 100,000,000)
        if (rTerm <= (float) 0.0)
            rTerm = cVerySmallWt;   // very small value. == log(100 mil/ 95 mil)

        rWeight = ((float) min(cTFThreshold, dwOccs)) * rTerm * rTerm /
            lpipb->wi.hrgsigma[pWordInfo->dwNewTopicId];

        // without the additional rTerm, we would probably be between 0.0 and 1.0
        if (rWeight > rTerm)
            wWeight = 0xFFFF;
        else
            wWeight = (WORD) ((float)0xFFFF * rWeight / rTerm);
#else
        rLog = (float)(1.0) / (float)pWordInfo->dwIndexTopicCount;
        rTerm = rLog * rLog;

        if (fnewData)
        {
            // Add the new factor into the sigma term
            lpipb->wi.hrgsigma[pWordInfo->dwNewTopicId] *=
                lpipb->wi.hrgsigma[pWordInfo->dwNewTopicId];
            lpipb->wi.hrgsigma[pWordInfo->dwNewTopicId] += dwOccs * rTerm;
            lpipb->wi.hrgsigma[pWordInfo->dwNewTopicId] =
                (float)(sqrt((double)lpipb->wi.hrgsigma[pWordInfo->dwNewTopicId]));
        }

        rTerm = rTerm * (float)65535.0;
        rWeight = (float)dwOccs * rTerm /
            (float)(lpipb->wi.hrgsigma[pWordInfo->dwNewTopicId]);

        // Saturate at the largest 16-bit weight (was mistyped as 65335)
        if (rWeight >= 65535.0)
            wWeight = 65535;
        else
            wWeight = (WORD)rWeight;
#endif // ISBU_IR_CHANGE

        // Write the weight to the output buffer
        if ((fRet = FWriteBits (pOutFile, (DWORD)wWeight,
            (BYTE)(sizeof (WORD) * cbitBYTE))) != S_OK)
            return fRet;
    }

    if ((occf & OCCF_HAVE_OCCURRENCE) == 0)
        return(S_OK);

    // Write the OccCount
    if ((fRet = FAddDword (pOutFile, dwOccs,
        pHeader->ckeyOccCount)) != S_OK)
        return(fRet);

    // Encode the occ block
    for (; dwOccs; dwOccs--)
    {
        // Make sure input buffer holds enough data
        if (pInFile->cbLeft < 5 * sizeof (DWORD))
        {
            MEMMOVE (pInFile->pMem, pInFile->pCurrent, pInFile->cbLeft);
            pInFile->cbLeft += FileRead (pInFile->fFile,
                pInFile->pMem + pInFile->cbLeft,
                pInFile->dwMax - pInFile->cbLeft, &errb);
            pInFile->dwMax = pInFile->cbLeft;
            pInFile->pCurrent = pInFile->pMem;
        }

        if (occf & OCCF_COUNT)
        {
            cbTemp = CbByteUnpack (&dwTemp, pInFile->pCurrent);
            pInFile->pCurrent += cbTemp;
            pInFile->cbLeft -= cbTemp;
            if ((fRet = FAddDword (pOutFile, dwTemp,
                pHeader->ckeyWordCount)) != S_OK)
                return(fRet);
        }

        if (occf & OCCF_OFFSET)
        {
            cbTemp = CbByteUnpack (&dwTemp, pInFile->pCurrent);
            pInFile->pCurrent += cbTemp;
            pInFile->cbLeft -= cbTemp;
            if ((fRet = FAddDword (pOutFile, dwTemp,
                pHeader->ckeyOffset)) != S_OK)
                return(fRet);
        }
    }
    return(S_OK);
}
/*************************************************************************
 *  EmitOldData
 *
 *  Re-encodes one topic entry, decoded from an existing index data node,
 *  into the output file.
 *
 *  The output position is first moved to the next byte if the current
 *  byte is partially used. The topic id is written as the delta between
 *  pWordInfo->dwIndexTopicId and pWordInfo->dwOldTopicId, and
 *  dwOldTopicId is then set to dwIndexTopicId. Depending on the index
 *  flags this is followed by the 16-bit weight, the occurrence count and,
 *  for every occurrence, the word-count and/or offset value. Each value
 *  is decoded from pIndexDataNode and re-encoded with the matching key
 *  of the B-tree header.
 *
 *  Parameters:
 *      lpipb           Index parameter block; supplies OutFile, occf,
 *                      idxf and BTreeData.Header.
 *      pIndexDataNode  Node the old data is decoded from.
 *      pWordInfo       dwIndexTopicId is read; dwOldTopicId is read and
 *                      then overwritten with dwIndexTopicId.
 *
 *  Returns:
 *      S_OK on success, otherwise the first non-S_OK code returned by
 *      one of the decode/encode calls.
 *************************************************************************/
PRIVATE HRESULT PASCAL FAR EmitOldData (_LPIPB lpipb,
    PNODEINFO pIndexDataNode, PWORDINFO pWordInfo)
{
    PFILEDATA pOutFile = &lpipb->OutFile;
    PIH20     pHeader  = &lpipb->BTreeData.Header;
    OCCF      occf     = lpipb->occf;
    HRESULT   fRet;
    DWORD     dwTopicDelta;
    DWORD     dwOccs;
    DWORD     dwTmp = 0;
    WORD      wWeight;

    // Start the entry on a byte boundary: if the current output byte is
    // partially used, step over it.
    if (pOutFile->ibit < cbitBYTE - 1)
    {
        pOutFile->pCurrent++;
        pOutFile->cbLeft--;
        pOutFile->foPhysicalOffset = FoAddDw (pOutFile->foPhysicalOffset, 1);
        pOutFile->ibit = cbitBYTE - 1;
    }

    // Topic ids are stored as deltas from the previously emitted topic id
    dwTopicDelta = pWordInfo->dwIndexTopicId - pWordInfo->dwOldTopicId;
    pWordInfo->dwOldTopicId = pWordInfo->dwIndexTopicId;
    if ((fRet = FAddDword (pOutFile, dwTopicDelta,
        pHeader->ckeyTopicId)) != S_OK)
        return(fRet);

    // Term weighting: copy the stored weight of the old entry
    if (lpipb->idxf & IDXF_NORMALIZE)
    {
        if ((fRet = FGetBits(pIndexDataNode, &dwTmp,
            sizeof (USHORT) * cbitBYTE)) != S_OK)
            return(fRet);

        // The weight that was just decoded is the one to write back.
        // Previously the decoded value was dropped and a variable that
        // was never assigned after its 0 initializer was written, so
        // every re-emitted entry ended up with a weight of 0.
        // NOTE(review): the weight is carried over unchanged, it is not
        // recomputed for the merged index - confirm this is acceptable.
        wWeight = (WORD)dwTmp;

        // Write the weight to the output buffer
        if ((fRet = FWriteBits (pOutFile, (DWORD)wWeight,
            (BYTE)(sizeof (WORD) * cbitBYTE))) != S_OK)
            return(fRet);
    }

    // Don't do anything else if there is nothing else to do
    if ((occf & OCCF_HAVE_OCCURRENCE) == 0)
        return S_OK;

    // Copy the occurrence count
    if ((fRet = FGetDword(pIndexDataNode, pHeader->ckeyOccCount,
        &dwOccs)) != S_OK)
        return fRet;
    if ((fRet = FAddDword (pOutFile, dwOccs,
        pHeader->ckeyOccCount)) != S_OK)
        return(fRet);

    // Copy the occurrence block, one occurrence at a time
    for (; dwOccs; dwOccs--)
    {
        if (occf & OCCF_COUNT)
        {
            if ((fRet = FGetDword(pIndexDataNode, pHeader->ckeyWordCount,
                &dwTmp)) != S_OK)
                return fRet;
            if ((fRet = FAddDword (pOutFile, dwTmp,
                pHeader->ckeyWordCount)) != S_OK)
                return(fRet);
        }
        if (occf & OCCF_OFFSET)
        {
            if ((fRet = FGetDword(pIndexDataNode, pHeader->ckeyOffset,
                &dwTmp)) != S_OK)
                return fRet;
            if ((fRet = FAddDword (pOutFile, dwTmp,
                pHeader->ckeyOffset)) != S_OK)
                return(fRet);
        }
    }
    return(S_OK);
}
/*************************************************************************
 *  CopyNewDataToStemNode
 *
 *  Writes a single stem entry at the front of pTmpNode's buffer: the
 *  word (prefix-compressed against pLastWord) followed by the file
 *  offset of the node at level cLevel + 1.
 *
 *  Parameters:
 *      lpipb       Index parameter block (occf, BTreeData).
 *      pTmpNode    Node whose buffer receives the entry; pCurPtr is set
 *                  to the first byte after the entry.
 *      pWord       Word to emit.
 *      pLastWord   Word the new word is compressed against.
 *      cLevel      Tree level of this stem node; the child offset is
 *                  taken from level cLevel + 1.
 *      fFlag       If USE_TEMP_NODE_04 is set the child offset comes
 *                  from rgpTmpNodeInfo, otherwise from rgpNodeInfo.
 *
 *  Returns:
 *      Number of bytes from the start of the buffer to the end of the
 *      entry (this includes the leading WORD).
 *************************************************************************/
PRIVATE int PASCAL NEAR CopyNewDataToStemNode (_LPIPB lpipb,
    PNODEINFO pTmpNode, LPB pWord, LPB pLastWord, int cLevel, int fFlag)
{
    LPB pBase = pTmpNode->pBuffer;
    LPB pOut  = pBase + sizeof(WORD);   // entry starts after the leading WORD

    // Word, compressed against the previous one
    pOut += PrefixCompressWord (pOut, pWord, pLastWord,
        lpipb->occf & OCCF_LENGTH);

    // File offset of the child node
    if ((fFlag & USE_TEMP_NODE_04) == 0)
    {
        pOut += CopyFileOffset (pOut,
            lpipb->BTreeData.rgpNodeInfo[cLevel+1]->nodeOffset);
    }
    else
    {
        pOut += CopyFileOffset (pOut,
            lpipb->BTreeData.rgpTmpNodeInfo[cLevel+1]->nodeOffset);
    }

    pTmpNode->pCurPtr = pOut;

    // Leading WORD = block size minus the bytes used so far
    SETWORD (pBase,
        (WORD)(lpipb->BTreeData.Header.dwBlockSize - (pOut - pBase)));

    return (int)(pOut - pBase);
}
/*************************************************************************
 *  CopyNewDataToLeafNode
 *
 *  Writes a single leaf entry into pTmpNode's buffer, starting after the
 *  first FOFFSET_SIZE + sizeof(WORD) bytes: the word (prefix-compressed
 *  against pLastWord), the field id (only when OCCF_FIELDID is set), the
 *  merged topic count, the data location and the data size.
 *
 *  Parameters:
 *      lpipb       Index parameter block (occf, BTreeData.Header).
 *      pTmpNode    Node whose buffer receives the entry; pCurPtr and
 *                  cbLeft are updated.
 *      pWordInfo   Supplies dwFieldId, dwMergeTopicCount, dataLocation
 *                  and dwDataSize.
 *      pWord       Word to emit.
 *      pLastWord   Word the new word is compressed against.
 *
 *  Returns:
 *      Number of bytes from the start of the buffer to the end of the
 *      entry (this includes the leading FOFFSET_SIZE + sizeof(WORD)).
 *************************************************************************/
PRIVATE int PASCAL NEAR CopyNewDataToLeafNode (_LPIPB lpipb,
    PNODEINFO pTmpNode, PWORDINFO pWordInfo, LPB pWord, LPB pLastWord)
{
    LPB pBase = pTmpNode->pBuffer;
    LPB pOut  = pBase + FOFFSET_SIZE + sizeof(WORD);

    // Word, compressed against the previous one
    pOut += PrefixCompressWord (pOut, pWord, pLastWord,
        lpipb->occf & OCCF_LENGTH);

    // Field id (optional), topic count, data location, data size
    if (lpipb->occf & OCCF_FIELDID)
        pOut += CbBytePack (pOut, pWordInfo->dwFieldId);
    pOut += CbBytePack (pOut, pWordInfo->dwMergeTopicCount);
    pOut += CopyFileOffset (pOut, pWordInfo->dataLocation);
    pOut += CbBytePack (pOut, pWordInfo->dwDataSize);

    pTmpNode->pCurPtr = pOut;

    // Block size minus the bytes used so far is kept in the node info
    // and also stored in the WORD that follows the first FOFFSET_SIZE
    // bytes of the buffer.
    pTmpNode->cbLeft =
        (LONG)(lpipb->BTreeData.Header.dwBlockSize - (pOut - pBase));
    SETWORD (pBase + FOFFSET_SIZE, (WORD)pTmpNode->cbLeft);

    return (int)(pOut - pBase);
}
PRIVATE HRESULT PASCAL FAR SkipNewData (_LPIPB lpipb, PWORDINFO pWordInfo) { DWORD dwOccs; DWORD dwTemp; PBTREEDATA pTreeData = &lpipb->BTreeData; PFILEDATA pInFile = &lpipb->InFile; PFILEDATA pOutFile = &lpipb->OutFile; OCCF occf = lpipb->occf; PIH20 pHeader = &lpipb->BTreeData.Header; int cbTemp; ERRB errb; // Don't do anything else if there is nothing else to do!!!
if ((occf & OCCF_HAVE_OCCURRENCE) == 0) return S_OK; // Get number of occ data records for this topic
if (pInFile->cbLeft < 2 * sizeof (DWORD)) { MEMMOVE (pInFile->pMem, pInFile->pCurrent, pInFile->cbLeft); pInFile->cbLeft += FileRead (pInFile->fFile, pInFile->pMem + pInFile->cbLeft, pInFile->dwMax - pInFile->cbLeft, &errb); pInFile->dwMax = pInFile->cbLeft; pInFile->pCurrent = pInFile->pMem; } cbTemp = CbByteUnpack (&dwOccs, pInFile->pCurrent); pInFile->pCurrent += cbTemp; pInFile->cbLeft -= cbTemp; // Encode the occ block
for (; dwOccs; dwOccs--) { // Make sure input buffer holds enough data
if (pInFile->cbLeft < 5 * sizeof (DWORD)) { MEMMOVE (pInFile->pMem, pInFile->pCurrent, pInFile->cbLeft); pInFile->cbLeft += FileRead (pInFile->fFile, pInFile->pMem + pInFile->cbLeft, pInFile->dwMax - pInFile->cbLeft, &errb); pInFile->dwMax = pInFile->cbLeft; pInFile->pCurrent = pInFile->pMem; }
if (occf & OCCF_COUNT) { cbTemp = CbByteUnpack (&dwTemp, pInFile->pCurrent); pInFile->pCurrent += cbTemp; pInFile->cbLeft -= cbTemp; } if (occf & OCCF_OFFSET) { cbTemp = CbByteUnpack (&dwTemp, pInFile->pCurrent); pInFile->pCurrent += cbTemp; pInFile->cbLeft -= cbTemp; } } return(S_OK); }
// Word scratch buffers. In the code that follows they are used by the
// disabled CheckStemNode()/CheckLeafNode() routines.
// NOTE(review): check for other users before changing or removing them.
BYTE CurrentWord [1000];
BYTE LastWord [1000];
#if 0
/*************************************************************************
 *  CheckStemNode  (disabled)
 *
 *  Walks the entries of a stem node (word followed by a file offset),
 *  starting after the leading WORD, and fails with ERR_FAILED as soon as
 *  a word compares lower than the word before it.
 *
 *  Returns S_OK when no such word is found before pMaxAddress.
 *************************************************************************/
HRESULT CheckStemNode (PNODEINFO pNodeInfo)
{
    LPB        pEntry = pNodeInfo->pBuffer + sizeof(WORD);
    LPB        pLimit = pNodeInfo->pMaxAddress;
    WORD       cbWord;
    FILEOFFSET foChild;

    // Start with an empty "previous" word
    *(LPWORD)LastWord = 0;

    do
    {
        pEntry = ExtractWord(CurrentWord, pEntry, &cbWord);
        if (StrCmpPascal2(LastWord, CurrentWord) > 0)
        {
            // _asm int 3;
            return(SetErrCode (NULL, ERR_FAILED));
        }

        // Step over the child offset and remember this word
        pEntry += ReadFileOffset (&foChild, pEntry);
        MEMCPY(LastWord, CurrentWord, cbWord + 2);
    } while (pEntry < pLimit);

    return(S_OK);
}

/*************************************************************************
 *  CheckLeafNode  (disabled)
 *
 *  Walks the entries of a leaf node, starting after the leading
 *  sizeof(WORD) + FOFFSET_SIZE bytes. Each entry is a word followed by
 *  the field id (only when OCCF_FIELDID is set in occf), the topic
 *  count, the data location and the data size. Fails with ERR_FAILED as
 *  soon as a word compares lower than the word before it.
 *
 *  Returns S_OK when no such word is found before pMaxAddress.
 *************************************************************************/
HRESULT CheckLeafNode (PNODEINFO pNodeInfo, int occf)
{
    LPB        pEntry = pNodeInfo->pBuffer + sizeof(WORD) + FOFFSET_SIZE;
    LPB        pLimit = pNodeInfo->pMaxAddress;
    WORD       cbWord;
    FILEOFFSET foData;
    DWORD      dwSkipped;

    // Start with an empty "previous" word
    *(LPWORD)LastWord = 0;

    do
    {
        pEntry = ExtractWord(CurrentWord, pEntry, &cbWord);
        if (StrCmpPascal2(LastWord, CurrentWord) > 0)
        {
            // _asm int 3;
            return(SetErrCode (NULL, ERR_FAILED));
        }
        MEMCPY(LastWord, CurrentWord, cbWord + 2);

        // Field id (optional) and topic count
        if (occf & OCCF_FIELDID)
            pEntry += CbByteUnpack (&dwSkipped, pEntry);
        pEntry += CbByteUnpack (&dwSkipped, pEntry);

        // Data location and size
        pEntry += ReadFileOffset (&foData, pEntry);
        pEntry += CbByteUnpack(&dwSkipped, pEntry);
    } while (pEntry < pLimit);

    return(S_OK);
}
#endif
|