|
|
#define VER3
/*************************************************************************
*                                                                        *
*  INDEX.C                                                               *
*                                                                        *
*  Copyright (C) Microsoft Corporation 1990-1994                         *
*  All Rights reserved.                                                  *
*                                                                        *
**************************************************************************
*                                                                        *
*  Module Intent                                                         *
*                                                                        *
*  This is the second stage of the index building process. After all    *
*  of the words have been added in stage 1, IndexBuild will be called.  *
*  IndexBuild starts the second stage. We will merge-sort the temp file *
*  generated in phase 1 to create a second temp file to send to phase 3.*
*                                                                        *
**************************************************************************
*                                                                        *
*  Current Owner: BinhN                                                  *
*                                                                        *
**************************************************************************/
#include <mvopsys.h>
#include <mem.h>
#include <memory.h>
#include <io.h>
#include <math.h>
#include <mvsearch.h>
#include <orkin.h>
#include "common.h"
#include "index.h"
#ifdef _DEBUG
static BYTE NEAR s_aszModule[] = __FILE__;  /* Used by error return functions. */
#endif

/*
 * Size of the output file buffer, in bytes.
 * This must be at least the size of the largest word + 12,
 * or word + 14 if OCCF_LENGTH is set.
 */
#ifndef _32BIT
#define ESOUTPUT_BUFFER     0xFFFC
#else
#define ESOUTPUT_BUFFER     0xFFFFC
#endif

/* Flush modes passed as the 'flag' argument of ESMemory2Disk() */
#define FLUSH_NEW_RECORD    1   /* Word is complete: flush every topic      */
#define FLUSH_EXCEPT_LAST   2   /* Partial flush: keep the last topic node  */
/*************************************************************************
* * INTERNAL PRIVATE FUNCTIONS * * All of them should be declared near * *************************************************************************/ PRIVATE HRESULT NEAR PASCAL FillInputBuffer (LPESB, HFPB); PRIVATE HRESULT NEAR PASCAL ESFlushBuffer (LPESI); PRIVATE HRESULT NEAR PASCAL ESFillBuffer (_LPIPB, LPESB); PRIVATE HRESULT NEAR PASCAL ESMemory2Disk (_LPIPB, PMERGEHEADER, int); PRIVATE HRESULT NEAR PASCAL ProcessFiles (_LPIPB lpipb, LPMERGEPARAMS); PRIVATE int NEAR PASCAL CompareRecordBuffers (_LPIPB, LPB, LPB); PRIVATE VOID NEAR PASCAL PQueueUp (_LPIPB, LPESB FAR *, LONG); PRIVATE VOID NEAR PASCAL PQueueDown (_LPIPB); PRIVATE PTOPICDATA PASCAL NEAR MergeTopicNode (PMERGEHEADER, PTOPICDATA, int); PRIVATE VOID NEAR MergeOccurrence (PTOPICDATA, PTOPICDATA, int); PRIVATE LPV PASCAL NEAR GetBlockNode (PBLKCOMBO lpBlockCombo); PRIVATE VOID PASCAL NEAR SetQueue (LPESI pEsi); PRIVATE HRESULT PASCAL NEAR ESBBlockAllocate (_LPIPB lpipb, DWORD lMemSize); PRIVATE BOOL PASCAL LoadEsiTemp (_LPIPB lpipb, LPESI lpesi, LPB lpbEsiFile, LPB lpbIsiFile, PHRESULT phr); PRIVATE VOID PASCAL NEAR SaveEsiTemp (LPIPB lpipb, LPESI lpesi); PRIVATE VOID PASCAL NEAR UpdateEsiTemp (LPIPB lpipb); PRIVATE BOOL PASCAL NEAR FindTopic(LPMERGEPARAMS lpmp, DWORD dwTopicId);
/*************************************************************************
* * INTERNAL PUBLIC FUNCTIONS * * All of them should be declared far, unless we know they belong to * the same segment. They should be included in some include file * *************************************************************************/ PUBLIC HRESULT FAR PASCAL FlushTree(_LPIPB lpipb); PUBLIC HRESULT FAR PASCAL MergeSortTreeFile (_LPIPB, LPMERGEPARAMS); HRESULT FAR PASCAL AllocSigmaTable (_LPIPB lpipb);
/*************************************************************************
 *
 *  @doc    EXTERNAL API INDEX
 *
 *  @func   HRESULT FAR PASCAL | MVIndexBuild |
 *      This function will build an index file based on the information
 *      collected in the Index parameter block.
 *
 *  @parm   HFPB | hSysFile |
 *      If it is non-null, it is the handle of an already opened system
 *      file. In this case the index is a subfile of the opened system
 *      file. If it is 0, the index file is a regular DOS file.
 *
 *  @parm   LPIPB | lpipb |
 *      Pointer to Index Parameter Block. This structure contains all the
 *      information necessary to build the index file.
 *
 *  @parm   HFPB | hfpb |
 *      Index hfpb if pstrFile is NULL
 *
 *  @parm   LPSTR | pstrFile |
 *      Index filename if hfpb is NULL
 *
 *  @rdesc  S_OK, or other errors
 *
 *  @xref   MVIndexInitiate()
 *************************************************************************/
/*
 *  This operates in three main steps:
 *
 *  1. Send finish to first phase to dump the buffer. Then merge-sort
 *     that file into a temporary index. Keep statistics on the
 *     information written to this temporary index.
 *
 *  2. Analyze the statistics gathered during the temporary index
 *     building phase. This analysis results in the choice of
 *     compression processes that will be used in the next step.
 *
 *  3. Permanent index building phase. During this phase, the
 *     temporary index is read, compressed like crazy, and written
 *     to a permanent index file. Unlike the temporary index, the
 *     permanent index contains directory nodes as well as leaf
 *     nodes.
 *
 *************************************************************************/
/*
 * MVIndexBuild: flush the in-memory sort tree, merge-sort the temp file,
 * choose the compression schemes from the collected bit statistics and
 * build the permanent B-tree index.
 *
 * Returns E_INVALIDARG when lpipb is NULL or when both hfpb and pstrFile
 * are NULL, S_OK when there is nothing to index (esi.cesb == 0), otherwise
 * the result of the failing step or of BuildBTree().
 *
 * When IDXF_NORMALIZE is set, the sigma and log tables set up by
 * AllocSigmaTable() are released on every exit taken after they were
 * allocated, including the MergeSortTreeFile() failure path.
 */
PUBLIC HRESULT EXPORT_API FAR PASCAL MVIndexBuild (HFPB hSysFile,
    _LPIPB lpipb, HFPB hfpb, LPSTR pstrFile)
{
    ERRB     errb;
    PHRESULT phr = &errb;
    BYTE     bKeyIndex = CKEY_OCC_BASE;  // Index into cKey array for compression
    HRESULT  fRet;                       // Return value from this function.
    DWORD    loop;

    // Sanity check
    if (lpipb == NULL || (NULL == hfpb && NULL == pstrFile))
        return E_INVALIDARG;

    // Flush the internal sort: any records still in the tree go to disk
    fRet = FlushTree(lpipb);

    // Free all memory blocks, whether or not the flush succeeded
    FreeISI (lpipb);
    if (fRet != S_OK)
        return(fRet);

    if (lpipb->esi.cesb == 0)
    {
        // Nothing to process, there will be no index file
        return S_OK;
    }

    if (lpipb->idxf & KEEP_TEMP_FILE)
        SaveEsiTemp (lpipb, &lpipb->esi);

    // If we're doing term-weighting, set up a huge array to contain the
    // sigma terms. The size of the array depends on the total # of topics.
    // We also create an array of LOG values to save calculations later.
    if (lpipb->idxf & IDXF_NORMALIZE)
    {
        if ((fRet = AllocSigmaTable (lpipb)) != S_OK)
            return(fRet);
    }

    if ((fRet = MergeSortTreeFile (lpipb, NULL)) != S_OK)
    {
        // The weighting tables are already allocated at this point, so
        // leave through the common cleanup instead of returning directly.
        fRet = SetErrCode (phr, fRet);
        goto cleanup;
    }

    if ((lpipb->idxf & KEEP_TEMP_FILE) == 0)
        FileUnlink (NULL, lpipb->isi.aszTempName, REGULAR_FILE);

    // If we are doing term-weighting we have to square root all sigma values
    // (ISBU_IR_CHANGE not needed here because computing sqrt is necessary
    // in both cases)
    if (lpipb->idxf & IDXF_NORMALIZE)
    {
        for (loop = 0; loop < lpipb->dwMaxTopicId + 1; ++loop)
        {
            lpipb->wi.hrgsigma[loop] =
                (float)sqrt ((double)lpipb->wi.hrgsigma[loop]);
        }
    }

    // Analyze data to get the best compression scheme

    // TopicId
    // Note: We can't use fixed field compression for topic, since they
    // can be modified by update. A fixed field format may become
    // insufficient to store larger values of topic differences
    VGetBestScheme(&lpipb->cKey[CKEY_TOPIC_ID],
        &lpipb->BitCount[CKEY_TOPIC_ID][0], lcbitBITSTREAM_ILLEGAL, TRUE);

    // Occurrence Count
    VGetBestScheme(&lpipb->cKey[CKEY_OCC_COUNT],
        &lpipb->BitCount[CKEY_OCC_COUNT][0], lcbitBITSTREAM_ILLEGAL, TRUE);

    // Optional occurrence fields occupy consecutive slots from CKEY_OCC_BASE
    if (lpipb->occf & OCCF_COUNT)
    {
        VGetBestScheme(&lpipb->cKey[bKeyIndex],
            &lpipb->BitCount[bKeyIndex][0], lcbitBITSTREAM_ILLEGAL, TRUE);
        bKeyIndex++;
    }

    if (lpipb->occf & OCCF_OFFSET)
    {
        VGetBestScheme(&lpipb->cKey[bKeyIndex],
            &lpipb->BitCount[bKeyIndex][0], lcbitBITSTREAM_ILLEGAL, TRUE);
        bKeyIndex++;
    }

    if (lpipb->idxf & KEEP_TEMP_FILE)
        UpdateEsiTemp (lpipb);

    // Build the permanent index
    fRet = BuildBTree(hSysFile, lpipb, lpipb->esi.aszTempName, hfpb, pstrFile);

cleanup:
    if (lpipb->idxf & IDXF_NORMALIZE)
    {
        // A handle may be NULL here if growing the sigma table failed
        // during the merge, so only release what is still held.
        if (lpipb->wi.hSigma != NULL)
        {
            FreeHandle (lpipb->wi.hSigma);
        }
        if (lpipb->wi.hLog != NULL)
        {
            FreeHandle (lpipb->wi.hLog);
        }
    }
    return fRet;
}
/*************************************************************************
* * @doc INDEX * * @func HRESULT NEAR PASCAL | FillInputBuffer | * Fills the buffer by reading from the specified file. * * @parm PESB | pEsb | * Pointer to external sort block to fill * * @parm HFPB | hFile | * Handle to the input file * * @rdesc S_OK, or errors if failed * *************************************************************************/
/*
 * FillInputBuffer: read pEsb->dwEsbSize bytes from hFile at offset
 * pEsb->lfo into pEsb->lrgbMem.
 *
 * When at least one byte is read, the file offset is advanced by the
 * number of bytes read, dwEsbSize is set to that number, the read cursor
 * (ibBuf) is reset to 0 and S_OK is returned. When nothing is read, the
 * error value filled in by FileSeekRead is returned and pEsb is untouched.
 */
HRESULT NEAR PASCAL FillInputBuffer(LPESB pEsb, HFPB hFile)
{
    ERRB  errb;
    DWORD cbGot;

    // Read in data
    cbGot = FileSeekRead (hFile, (LPB)pEsb->lrgbMem, pEsb->lfo,
        pEsb->dwEsbSize, &errb);

    if (cbGot != 0)
    {
        // Update utility variables
        pEsb->lfo       = FoAddDw(pEsb->lfo, cbGot);
        pEsb->dwEsbSize = (CB)cbGot;
        pEsb->ibBuf     = 0;
        return S_OK;
    }

    return errb;
}
/*************************************************************************
* * @doc INDEX * * @func HRESULT NEAR PASCAL | ESFlushBuffer | * Flushes the output buffer to disk and resets it. * * @parm LPESI | pEsi | * Pointer to ESI block * * @rdesc S_OK, or errors if failed * *************************************************************************/
/*
 * ESFlushBuffer: write the first pEsi->ibBuf bytes of the output buffer
 * to pEsi->hfpb.
 *
 * On a complete write, lfoTempOffset is advanced by the number of bytes
 * written, ibBuf is reset to 0 and S_OK is returned. If FileWrite reports
 * a different byte count, the error value it filled in is returned and
 * the ESI state is left unchanged.
 */
HRESULT NEAR PASCAL ESFlushBuffer(LPESI pEsi)
{
    ERRB  errb;
    DWORD cbPending = pEsi->ibBuf;
    DWORD cbWritten;

    cbWritten = (DWORD)FileWrite (pEsi->hfpb, pEsi->pOutputBuffer,
        cbPending, &errb);
    if (cbWritten != cbPending)
        return errb;

    // Everything is on disk: account for it and empty the buffer
    pEsi->lfoTempOffset = FoAddDw (pEsi->lfoTempOffset, cbPending);
    pEsi->ibBuf = 0;
    return S_OK;
}
/*************************************************************************
* * @doc INDEX * * @func HRESULT NEAR PASCAL | ESFillBuffer | * Updates the input buffer with new data from the input file. * * @parm _LPIPB | lpipb | * Pointer to index parameter block * * @parm LPESB | pEsb | * Pointer to ESB block to be filled * * @rdesc S_OK, or other errors *************************************************************************/
/*
 * ESFillBuffer: refill an input sort block from lpipb->isi.hfpb.
 *
 * The bytes not yet consumed (dwEsbSize - ibBuf) are moved to the start of
 * the buffer, then up to ibBuf more bytes (bounded by lfoMax - lfo) are
 * read in behind them. On return ibBuf is 0, lfo is advanced by the number
 * of bytes read and dwEsbSize is the number of valid bytes in the buffer.
 *
 * Returns S_OK, or the error reported by FileSeekRead when nothing could
 * be read. A zero-byte read with no error reported is not a failure.
 */
HRESULT NEAR PASCAL ESFillBuffer(_LPIPB lpipb, LPESB pEsb)
{
    // errb is tested below even when the read legitimately returns 0
    // bytes, so it must start out as "no error" rather than indeterminate.
    ERRB  errb = S_OK;
    DWORD dwExtra = pEsb->dwEsbSize - pEsb->ibBuf;  // Unprocessed byte count
    DWORD dwBytesRead;

    // Read either the entire free space or whatever is left in this run
    dwBytesRead = DwSubFo (pEsb->lfoMax, pEsb->lfo);
    if (dwBytesRead > pEsb->dwEsbSize - dwExtra)
        dwBytesRead = pEsb->dwEsbSize - dwExtra;

    // Save unprocessed information to beginning of buffer.
    // The regions may overlap, hence MEMMOVE.
    if (dwExtra)
        MEMMOVE ((LPB)pEsb->lrgbMem, pEsb->lrgbMem + pEsb->ibBuf, dwExtra);

    // Read in the new data
    dwBytesRead = FileSeekRead (lpipb->isi.hfpb,
        (LPB)(pEsb->lrgbMem + dwExtra), pEsb->lfo, dwBytesRead, &errb);
    if (dwBytesRead == 0 && errb != S_OK)
        return(errb);

    pEsb->lfo = FoAddDw(pEsb->lfo, dwBytesRead);
    pEsb->ibBuf = 0;
    pEsb->dwEsbSize = dwBytesRead + dwExtra;
    return(S_OK);
}
/*************************************************************************
 *
 *  @doc    INTERNAL INDEXING
 *
 *  @func   HRESULT FAR PASCAL | MergeSortTreeFile |
 *      Sorts the file generated from the tree output into one
 *      list of sorted elements.
 *
 *  @parm   _LPIPB | lpipb |
 *      Pointer to index parameter block
 *
 *  @parm   LPMERGEPARAMS | lpmp |
 *      Merge parameters, passed through to ProcessFiles. May be NULL.
 *
 *************************************************************************/
/*
 * MergeSortTreeFile: open the internal-sort temp file, give every external
 * sort block (ESB) its own input buffer primed from disk, build the
 * priority queue over the blocks and run ProcessFiles() on them.
 *
 * Returns E_INVALIDARG if lpipb is NULL, the FileOpen error if the temp
 * file cannot be opened, E_OUTOFMEMORY on allocation failure, the read
 * error if a buffer cannot be primed, otherwise the ProcessFiles() result.
 * Once the input file is open, every exit releases the ESI data
 * (FreeEsi) and closes the file.
 */
PUBLIC HRESULT PASCAL FAR MergeSortTreeFile (_LPIPB lpipb, LPMERGEPARAMS lpmp)
{
    LPESI       pEsi;               // Pointer to external sort info
    LPISI       pIsi;               // Pointer to internal sort info
    HFPB        hInputFile;         // Handle to input file
    ERRB        errb;
    PHRESULT    phr = &errb;
    LPESB FAR*  lrgPriorityQueue;   // Pointer to Priority Queue
    // Count of entries in the queue. LONG, matching the PQueueUp()
    // parameter, so that more than 65535 blocks do not wrap the index.
    LONG        lQueueSize = 0;
    DWORD       dwBufferSize;
    HRESULT     fRet;
    LPESB       pEsb;               // Walks the linked list of ESBs

    // Sanity check
    if (lpipb == NULL)
        return E_INVALIDARG;

    pEsi = &lpipb->esi;
    pIsi = &lpipb->isi;

    // Open input file
    if ((pIsi->hfpb = FileOpen (NULL, pIsi->aszTempName, REGULAR_FILE,
        READ, phr)) == NULL)
        return *phr;
    hInputFile = pIsi->hfpb;

    // Allocate & fill input buffers
    for (pEsb = pEsi->lpesbRoot; pEsb != NULL; pEsb = pEsb->lpesbNext)
    {
        DWORD cbRead;

        // 6/8 of the allowed memory is shared equally between the blocks
        dwBufferSize = (lpipb->dwMemAllowed * 6) / (8 * pEsi->cesb);

        // Allocate buffer space
        if ((pEsb->hMem = _GLOBALALLOC (DLLGMEM_ZEROINIT,
            dwBufferSize)) == NULL)
        {
            fRet = E_OUTOFMEMORY;
            goto cleanup;
        }
        pEsb->lrgbMem = (LRGB)_GLOBALLOCK (pEsb->hMem);

        // Never read past the end of this block's run in the file
        if ((cbRead = DwSubFo(pEsb->lfoMax, pEsb->lfo)) > dwBufferSize)
            cbRead = dwBufferSize;

        // Fill buffer from disk
        if (FileSeekRead (hInputFile, pEsb->lrgbMem, pEsb->lfo,
            cbRead, phr) != (LONG)cbRead)
        {
            fRet = *phr;
            _GLOBALUNLOCK(pEsb->hMem);
            _GLOBALFREE(pEsb->hMem);
            pEsb->hMem = NULL;
            goto cleanup;
        }

        pEsb->dwEsbSize = cbRead;
        pEsb->ibBuf = 0;
        pEsb->lfo = FoAddDw (pEsb->lfo, cbRead);
    }

    // Allocate a priority queue array. The size of the array
    // is the number of external sort info blocks plus 1, since
    // location 0 is not used.
    if ((pEsi->hPriorityQueue = _GLOBALALLOC (DLLGMEM_ZEROINIT,
        (DWORD)(pEsi->cesb + 1) * sizeof (LPB))) == NULL)
    {
        fRet = E_OUTOFMEMORY;
        goto cleanup;
    }
    pEsi->lrgPriorityQueue =
        (LPESB FAR *)_GLOBALLOCK (pEsi->hPriorityQueue);
    lrgPriorityQueue = pEsi->lrgPriorityQueue;

    // Attach input buffers to Priority Queue,
    // remembering to start at offset 1 NOT 0 (PQ's have a null 0 element)
    for (pEsb = pEsi->lpesbRoot; pEsb != NULL; pEsb = pEsb->lpesbNext)
    {
        lrgPriorityQueue[++lQueueSize] = pEsb;
        PQueueUp (lpipb, lrgPriorityQueue, lQueueSize);
    }
    pEsi->uiQueueSize = lQueueSize;

    fRet = ProcessFiles(lpipb, lpmp);

    _GLOBALUNLOCK (pEsi->hPriorityQueue);
    _GLOBALFREE (pEsi->hPriorityQueue);
    pEsi->hPriorityQueue = NULL;

cleanup:
    // Shared by the success path and every failure after the file is open
    FreeEsi (lpipb);
    FileClose(hInputFile);
    pIsi->hfpb = NULL;
    return fRet;
}
/*************************************************************************
 *
 *  @doc    INDEX
 *
 *  @func   HRESULT NEAR PASCAL | ESMemory2Disk |
 *      Copies the in-memory record (word header plus its topic and
 *      occurrence lists) to the output buffer, flushing the buffer to
 *      disk whenever it fills up. Topic and occurrence nodes that were
 *      written are returned to their free lists.
 *
 *  @parm   _LPIPB | lpipb |
 *      Pointer to index parameter block
 *
 *  @parm   PMERGEHEADER | pHeader |
 *      Pointer to header to flush
 *
 *  @parm   int | flag |
 *      - if FLUSH_NEW_RECORD, the flush is due to a new record; we flush
 *        everything and the topic count in the record is final
 *      - if FLUSH_EXCEPT_LAST, we don't flush the last topic; the
 *        position of the topic count is remembered so that it can be
 *        backpatched by a later FLUSH_NEW_RECORD call
 *
 *  @rdesc  S_OK, or other errors. E_OUTOFMEMORY is returned if the sigma
 *      table cannot be grown; in that case the existing table is kept
 *      and stays locked.
 *************************************************************************/
PRIVATE HRESULT NEAR PASCAL ESMemory2Disk
    (_LPIPB lpipb, PMERGEHEADER pHeader, int flag)
{
    // Local replacement variables
    LPESI       pEsi = &lpipb->esi;
    LPB         pMax = pEsi->pOutputBuffer + ESOUTPUT_BUFFER -
                    2 * sizeof(DWORD);
    DWORD       dwOccCount;
    LPB         pOutputBuffer = pEsi->pOutputBuffer;
    ERRB        errb;
    PHRESULT    phr = &errb;
    HRESULT     fRet;
    BYTE        cNumOcc;
    OCCF        occf;

    // Working variables
    PTOPICDATA  pTopic;         // Temp var to traverse the topic linked list
    DWORD       loop, sub;      // Various loop counters
    DWORD       dwTopicIdDelta;
    DWORD       OccDelta[5];    // Delta base for all occurrence data
    DWORD       LastOcc[5];
    FLOAT       rLog;           // (1/n) - IDXF_NORMALIZE is set
    FLOAT       rLogSquared;    // (1/n)^2 - IDXF_NORMALIZE is set
    LPB         pStart;
    LPB         pCurPtr;

    // Set up pointers
    pStart = pCurPtr = pOutputBuffer + pEsi->ibBuf;

    // Variable replacement
    occf = lpipb->occf;

    // Size of string
    loop = pHeader->dwStrLen;

    // Make sure the string, FileId, Topic Count and Record Size fit.
    // We add in an extra DWORD for 5 byte compression problems and
    // to cover the Word Length if there is one.
    if ((pStart + loop + sizeof (DWORD) * 5) >= pMax)
    {
        if ((fRet = ESFlushBuffer (pEsi)) != S_OK)
            return(fRet);
        pStart = pCurPtr = pOutputBuffer;
    }

    if (pHeader->fEmitRecord == FALSE)
    {
        // The record header was never emitted, so emit it now

        // Reset the flag
        pHeader->fEmitRecord = TRUE;

        // Skip record size field
        pCurPtr += sizeof (DWORD);

        // Pascal string
        MEMCPY (pCurPtr, pHeader->lpbWord, loop);
        pCurPtr += loop;

        // Word Length
        if (occf & OCCF_LENGTH)
            pCurPtr += CbBytePack (pCurPtr, pHeader->dwWordLength);

        // FieldId
        if (occf & OCCF_FIELDID)
            pCurPtr += CbBytePack (pCurPtr, pHeader->dwFieldId);

        // Topic Count
        if (flag & FLUSH_NEW_RECORD)
        {
            // This is the whole record. dwTopicCount value is correct
            SETLONG((LPUL)pCurPtr, pHeader->dwTopicCount);
        }
        else
        {
            // Save the file offset and the buffer address of the field
            // for backpatching
            pHeader->foTopicCount = FoAddDw (pEsi->lfoTempOffset,
                (DWORD)(pCurPtr - pOutputBuffer));
            pHeader->pTopicCount = pCurPtr;
        }
        pCurPtr += sizeof(DWORD);

        // Write Record Length
        *(LPUL)pStart = (DWORD)(pCurPtr - pStart - sizeof (DWORD));
    }
    else if (flag & FLUSH_NEW_RECORD)
    {
        // The record header was emitted by an earlier partial flush
        // (pHeader->fEmitRecord == TRUE): backpatch the topic count
        if (FoCompare(pHeader->foTopicCount, pEsi->lfoTempOffset) >= 0)
        {
            // Everything is still in memory, just do local backpatch
            SETLONG((LPUL)(pHeader->pTopicCount), pHeader->dwTopicCount);
        }
        else
        {
            // Do backpatch in the file by seeking back to the right place
            if (FileSeekWrite(pEsi->hfpb, &pHeader->dwTopicCount,
                pHeader->foTopicCount, sizeof(DWORD), phr) != sizeof(DWORD))
                return(*phr);

            // Restore the current file offset
            FileSeek(pEsi->hfpb, pEsi->lfoTempOffset, 0, phr);
        }
    }

    // Convert all occ data to delta values & compress them
    pTopic = pHeader->pTopic;
    cNumOcc = lpipb->ucNumOccDataFields;

    for (; pTopic;)
    {
        POCCDATA    pOccData;
        PTOPICDATA  pReleased;

        // A partial flush leaves the last topic node in memory
        if ((flag & FLUSH_EXCEPT_LAST) && pTopic->pNext == NULL)
            break;

        // Set TopicId delta
        dwTopicIdDelta = pTopic->dwTopicId - pHeader->dwLastTopicId;
        pHeader->dwLastTopicId = pTopic->dwTopicId;

        // Save bit size to the statistics array
        lpipb->BitCount[CKEY_TOPIC_ID][CbitBitsDw (dwTopicIdDelta)] += 1;

        // Write TopicID Delta
        if (pCurPtr > pMax)
        {
            pEsi->ibBuf = (DWORD)(pCurPtr - pOutputBuffer);
            if ((fRet = ESFlushBuffer (pEsi)) != S_OK)
                return(fRet);
            pCurPtr = pOutputBuffer;
        }
        pCurPtr += CbBytePack (pCurPtr, dwTopicIdDelta);

        if (cNumOcc == 0)
        {
            // No occurrence data: just release the node and move on
            pReleased = pTopic;
            pTopic = pTopic->pNext;

            // Add the released to the freed linked list
            pReleased->pNext = (PTOPICDATA)lpipb->TopicBlock.pFreeList;
            lpipb->TopicBlock.pFreeList = (PLIST)pReleased;
            lpipb->TopicBlock.dwCount--;
            continue;
        }

        if ((dwOccCount = pTopic->dwOccCount) != 0)
        {
            // Reset count occdata delta for every new topic
            MEMSET (OccDelta, 0, 5 * sizeof (DWORD));
            MEMSET (LastOcc, 0, 5 * sizeof (DWORD));

            // Copy Occurrence Count
            if (pCurPtr > pMax)
            {
                pEsi->ibBuf = (DWORD)(pCurPtr - pOutputBuffer);
                if ((fRet = ESFlushBuffer (pEsi)) != S_OK)
                    return(fRet);
                pCurPtr = pOutputBuffer;
            }
            pCurPtr += CbBytePack (pCurPtr, dwOccCount);

            // Save bit size to the statistics array
            // NOTE(review): slot 1 is presumably CKEY_OCC_COUNT - confirm
            lpipb->BitCount[1][CbitBitsDw (dwOccCount)] += 1;

            // Repeat for each occurrence block
            for (pOccData = pTopic->pOccData, sub = dwOccCount;
                sub > 0 && pOccData; --sub)
            {
                LPDW        lpDw;
                int         iIndex;
                POCCDATA    pFreedOcc;

                if (pCurPtr + 5 * sizeof(DWORD) > pMax)
                {
                    pEsi->ibBuf = (DWORD)(pCurPtr - pOutputBuffer);
                    if ((fRet = ESFlushBuffer (pEsi)) != S_OK)
                        return(fRet);
                    pStart = pCurPtr = pOutputBuffer;
                }

                lpDw = &pOccData->OccData[0];
                iIndex = CKEY_OCC_BASE;

                if (occf & OCCF_COUNT)
                {
                    // Convert each value to a delta value
                    OccDelta[iIndex] = *lpDw - LastOcc[iIndex];
                    LastOcc[iIndex] = *lpDw;
                    lpDw++;

                    // Save to bit size to the statistics array
                    lpipb->BitCount[iIndex]
                        [CbitBitsDw (OccDelta[iIndex])] += 1;

                    // Compress occurrence field to buffer
                    pCurPtr += CbBytePack (pCurPtr, OccDelta[iIndex]);
                    iIndex++;
                }

                if (occf & OCCF_OFFSET)
                {
                    // Convert each value to a delta value
                    OccDelta[iIndex] = *lpDw - LastOcc[iIndex];
                    LastOcc[iIndex] = *lpDw;
                    lpDw++;

                    // Save to bit size to the statistics array
                    lpipb->BitCount[iIndex]
                        [CbitBitsDw (OccDelta[iIndex])] += 1;

                    // Compress occurrence field to buffer
                    pCurPtr += CbBytePack (pCurPtr, OccDelta[iIndex]);
                    iIndex++;
                }

                // Return the occurrence node to the free list
                pFreedOcc = pOccData;
                pOccData = pOccData->pNext;
                pFreedOcc->pNext = (POCCDATA)lpipb->OccBlock.pFreeList;
                lpipb->OccBlock.pFreeList = (PLIST)pFreedOcc;
                lpipb->OccBlock.dwCount--;
            }

            // Check for mismatch between count and links
#ifdef _DEBUG
            if (sub)
                SetErrCode (phr, E_ASSERT);
            if (pOccData)
                SetErrCode (phr, E_ASSERT);
#endif
        }

        // Update the sigma values if we are doing term weighing
        // erinfox: remove test against flag. Sometimes sigma never
        // got calculated for a topic and that caused a divide by zero
        // later on.
        if (lpipb->idxf & IDXF_NORMALIZE)
        {
            if (pTopic->dwTopicId > lpipb->dwMaxTopicId)
            {
                // Increase the size of the sigma table. This can happen
                // when updating with new topics
                HANDLE hNewSigma;

                _GLOBALUNLOCK (lpipb->wi.hSigma);
                hNewSigma = _GLOBALREALLOC (lpipb->wi.hSigma,
                    (pTopic->dwTopicId + 1) * sizeof(float),
                    DLLGMEM_ZEROINIT);
                if (hNewSigma == NULL)
                {
                    // The old block is still owned by wi.hSigma: keep the
                    // handle and re-lock it so that the table remains
                    // usable and can be released by the caller.
                    lpipb->wi.hrgsigma =
                        (HRGSIGMA)_GLOBALLOCK(lpipb->wi.hSigma);
                    return (SetErrCode(phr, E_OUTOFMEMORY));
                }
                lpipb->wi.hSigma = hNewSigma;
                lpipb->wi.hrgsigma = (HRGSIGMA)_GLOBALLOCK(lpipb->wi.hSigma);
                lpipb->dwMaxTopicId = pTopic->dwTopicId;
            }

            if (lpipb->bState == INDEXING_STATE)
            {
#ifndef ISBU_IR_CHANGE
                FLOAT fOcc;

                if (pHeader->dwTopicCount >= cLOG_MAX)
                {
                    // we have to guard against the possibility of the log
                    // resulting in a value <= 0.0. Very rare, but possible
                    // in the future. This happens if dwTopicCount
                    // approaches or exceeds the N we are using
                    // (N == 100 million)
                    if (pHeader->dwTopicCount >= cNintyFiveMillion)
                        rLog = cVerySmallWt;    // log10(100 mil/ 95 mil) == 0.02
                    else
                        rLog = (float) (8.0 -
                            log10((double)pHeader->dwTopicCount));
                    rLogSquared = rLog*rLog;
                }
                else
                    rLogSquared =
                        lpipb->wi.lrgrLog[(WORD)pHeader->dwTopicCount];

                // Update sigma value
                // NOTE : We are bounding dwOccCount by a value of
                // cTFThreshold. The RHS of the equation below has an
                // upperbound of 2 power 30.
                fOcc = (float) min(cTFThreshold, dwOccCount);
                lpipb->wi.hrgsigma[pTopic->dwTopicId] +=
                    (SIGMA) fOcc*fOcc*rLogSquared;
#else
                // Failed for update : UNDONE
                if (pHeader->dwTopicCount >= cLOG_MAX)
                {
                    rLog = (float)1.0 / (float)pHeader->dwTopicCount;
                    rLogSquared = rLog * rLog;
                }
                else
                    rLogSquared =
                        lpipb->wi.lrgrLog[(WORD)pHeader->dwTopicCount];

                // Update sigma value
                lpipb->wi.hrgsigma[pTopic->dwTopicId] +=
                    (SIGMA)(dwOccCount * dwOccCount) * rLogSquared;
#endif // ISBU_IR_CHANGE
            }
        }

        pReleased = pTopic;
        pTopic = pTopic->pNext;

        // Add the released to the freed linked list
        pReleased->pNext = (PTOPICDATA)lpipb->TopicBlock.pFreeList;
        lpipb->TopicBlock.pFreeList = (PLIST)pReleased;
        lpipb->TopicBlock.dwCount--;
    }

    // Whatever was not flushed (NULL, or the last topic) stays in the header
    pHeader->pTopic = pHeader->pLastTopic = pTopic;

    // Update output offset
    pEsi->ibBuf = (DWORD)(pCurPtr - pOutputBuffer);
    return(S_OK);
}
/*************************************************************************
 *
 *  @doc    INDEX
 *
 *  @func   HRESULT NEAR PASCAL | ProcessFiles |
 *      Sorts the file generated from the tree output into one
 *      list of sorted elements.
 *
 *  @parm   _LPIPB | lpipb |
 *      Pointer to index parameter block
 *
 *  @rdesc  S_OK, or errors if failed
 *
 *  @notes
 *      This function processes the input buffers and uses dynamic
 *      memory allocation to sort each word as it comes in. Once a
 *      word stops repeating, it is flushed to disk and the memory is
 *      reset for the next word.
 *************************************************************************/
/*
 * ProcessFiles: merge the records of all input sort blocks, smallest
 * first (taken from slot 1 of the priority queue), into the "eso" temp
 * file named in pEsi->aszTempName.
 *
 * Records for the same word/field are accumulated in a MERGEHEADER; when
 * the word or field changes the accumulated record is written through
 * ESMemory2Disk(). If lpmp is not NULL, topics found by FindTopic() are
 * skipped and not counted.
 *
 * Returns S_OK or the first error met. Once the output buffer has been
 * allocated, every exit (success included) unwinds the word buffer, the
 * merge header, the output file handle and the output buffer, in that
 * order.
 */
HRESULT NEAR PASCAL ProcessFiles(_LPIPB lpipb, LPMERGEPARAMS lpmp)
{
    // Local replacement variables
    LPESI       pEsi = &lpipb->esi;
    LPESB FAR * lrgPriorityQueue = pEsi->lrgPriorityQueue;
    LONG        uiQueueSize = pEsi->uiQueueSize;
    LPB         pQueuePtr;
    WORD        cNumOcc = lpipb->ucNumOccDataFields;
    int         occf = lpipb->occf;
    LPB         pBufMax;
    HANDLE      hWord;
    LPB         lpbWord;
    DWORD       dwUniqueTerm = 0;   // Used for callback function
#ifdef _DEBUG
    BYTE        astWord[300];
    BYTE        astLastWord[300];
#endif

    // Working variables
    PMERGEHEADER pHeader;           // Pointer to merge header
    LPESB       pEsb;               // Temp ESB pointer
    PTOPICDATA  pNewTopic;          // Used to create new topic
    DWORD       loop;               // Temp loop counter
    HANDLE      hHeader;
    HFPB        hOutputFile;        // Handle to output file
    HRESULT     fRet;               // Return value
    USHORT      uStringSize;        // Size of Pascal String
    ERRB        errb;
    PHRESULT    phr = &errb;
    static long Count = 0;

    // Setup Block Manager
    if ((fRet = ESBBlockAllocate (lpipb, lpipb->dwMemAllowed / 4)) != S_OK)
        return(fRet);

    // Allocate output buffer
    if ((pEsi->hBuf = _GLOBALALLOC (DLLGMEM_ZEROINIT,
        ESOUTPUT_BUFFER)) == NULL)
        return E_OUTOFMEMORY;
    pEsi->pOutputBuffer = (LPB)_GLOBALLOCK (pEsi->hBuf);
    pEsi->ibBuf = 0;

    // Create output file
    GETTEMPFILENAME ((char)0, "eso", 0, pEsi->aszTempName);
    if ((pEsi->hfpb = FileOpen(NULL, pEsi->aszTempName, REGULAR_FILE,
        WRITE, &errb)) == NULL)
    {
        fRet = E_FILECREATE;
        goto free_output_buffer;
    }
    hOutputFile = pEsi->hfpb;

    // Setup new record in memory
    if ((hHeader = _GLOBALALLOC (DLLGMEM_ZEROINIT,
        sizeof (MERGEHEADER))) == NULL)
    {
        fRet = E_OUTOFMEMORY;
        goto close_output_file;
    }
    pHeader = (PMERGEHEADER)_GLOBALLOCK (hHeader);

    // Allocate buffer for a word, which include 64K + sizeof(WORD) + slack
    if ((hWord = _GLOBALALLOC(DLLGMEM_ZEROINIT, 0x10004)) == NULL)
    {
        // fRet still holds S_OK from the block allocation above, so the
        // failure has to be recorded explicitly.
        fRet = E_OUTOFMEMORY;
        goto free_header;
    }
    pHeader->lpbWord = lpbWord = (LPB)_GLOBALLOCK(hWord);

#ifdef _DEBUG
    astWord[0] = 0;
#endif

    // Process all input buffers
    do
    {
        DWORD   dwWordLength;
        DWORD   dwFieldId;
        LPB     lpStart;
        DWORD   dwTopicCount;

#ifdef _DEBUG
        Count++;
#endif
        // Grab smallest record and send to buffer
        pEsb = lrgPriorityQueue[1];

        // Set the fill limit
        pBufMax = pEsb->lrgbMem + pEsb->dwEsbSize - 256;
        if ((pQueuePtr = pEsb->lrgbMem + pEsb->ibBuf) >= pBufMax)
        {
            if ((fRet = ESFillBuffer (lpipb, pEsb)) != S_OK)
                goto free_word;
            pQueuePtr = pEsb->lrgbMem;
        }

        // Skip the record length and save the record beginning
        pQueuePtr += sizeof(DWORD);
        lpStart = pQueuePtr;

        // Get string: 2-byte length prefix followed by the characters
        uStringSize = GETWORD ((LPUW)pQueuePtr) + sizeof (SHORT);
        pQueuePtr += uStringSize;
#ifdef _DEBUG
        if (pQueuePtr > pEsb->lrgbMem + pEsb->dwEsbSize)
            SetErrCode (phr, E_ASSERT);
#endif

        if (occf & OCCF_LENGTH)
            pQueuePtr += CbByteUnpack (&dwWordLength, pQueuePtr);
        else
            dwWordLength = 0;
#ifdef _DEBUG
        if (pQueuePtr >= pEsb->lrgbMem + pEsb->dwEsbSize)
            SetErrCode (phr, E_ASSERT);
#endif

        if (occf & OCCF_FIELDID)
            pQueuePtr += CbByteUnpack (&dwFieldId, pQueuePtr);
        else
            dwFieldId = 0;
#ifdef _DEBUG
        if (pQueuePtr > pEsb->lrgbMem + pEsb->dwEsbSize)
            SetErrCode (phr, E_ASSERT);
#endif

        // Is the word in the buffer equal to the new word?
        // If it is not then flush the old word
        if (*(LPUW)pHeader->lpbWord)
        {
            int fNewWord = (StrCmp2BytePascal (pHeader->lpbWord, lpStart) ||
                dwWordLength > pHeader->dwWordLength);

            if (fNewWord == 0)
            {
                // Same word, reduce the unique words count
                lpipb->dwUniqueWord--;
            }

            if (fNewWord || dwFieldId > pHeader->dwFieldId)
            {
#if defined(_DEBUG) && !defined(_MAC)
                // Word out of order
                if (StrCmp2BytePascal (pHeader->lpbWord, lpStart) > 0)
                    assert(FALSE);
#endif
                if ((fRet = ESMemory2Disk (lpipb, pHeader,
                    FLUSH_NEW_RECORD)) != S_OK)
                    goto free_word;

                // Reset pHeader
                MEMSET (pHeader, 0, sizeof (MERGEHEADER));

                // Set the word buffer
                pHeader->lpbWord = lpbWord;
#ifdef _DEBUG
                STRCPY(astLastWord, astWord);
#endif
                // Call the user callback every once in a while
                if (!(++dwUniqueTerm % 8192L)
                    && (lpipb->CallbackInfo.dwFlags & ERRFLAG_STATUS))
                {
                    PFCALLBACK_MSG pCallbackInfo = &lpipb->CallbackInfo;
                    CALLBACKINFO Info;

                    Info.dwPhase = 2;
                    Info.dwIndex = (DWORD)((float)dwUniqueTerm /
                        lpipb->dwUniqueWord * 100);
                    fRet = (*pCallbackInfo->MessageFunc)
                        (ERRFLAG_STATUS, pCallbackInfo->pUserData, &Info);
                    if (S_OK != fRet)
                        goto free_word;
                }
            }
        }

        // Update the data
        pHeader->dwFieldId = dwFieldId;
        pHeader->dwWordLength = dwWordLength;
        pHeader->dwStrLen = uStringSize;

        // Copy word and header info
        MEMCPY (pHeader->lpbWord, (LPB)lpStart, uStringSize);
#ifdef _DEBUG
        // Keep a NUL-terminated, truncated copy of the word for debugging
        if (uStringSize >= 300)
            uStringSize = 300;
        MEMCPY (astWord, lpStart + 2, uStringSize - 2);
        astWord[uStringSize - 2] = 0;
#endif

        pQueuePtr += CbByteUnpack (&dwTopicCount, pQueuePtr);
        pHeader->dwTopicCount += dwTopicCount;
#ifdef _DEBUG
        if (pQueuePtr > pEsb->lrgbMem + pEsb->dwEsbSize)
            SetErrCode (phr, E_ASSERT);
#endif

        pNewTopic = NULL;

        // Copy topic(s) to memory
        for (loop = dwTopicCount; loop > 0; loop--)
        {
            DWORD dwTopicId;

            // Get the topic id
            pQueuePtr += CbByteUnpack (&dwTopicId, pQueuePtr);

            // kevynct: if there is a to-delete list, and this topic is
            // on it, skip it
            if (lpmp && FindTopic(lpmp, dwTopicId))
            {
                // Get the occ count
                if (cNumOcc)
                {
                    DWORD dwOccCount;
                    DWORD dwT;

                    pQueuePtr += CbByteUnpack (&dwOccCount, pQueuePtr);
#ifdef _DEBUG
                    if (pQueuePtr > pEsb->lrgbMem + pEsb->dwEsbSize)
                        SetErrCode (phr, E_ASSERT);
#endif
                    for (; dwOccCount > 0; dwOccCount--)
                    {
                        // Fill up the buffer if run out of data
                        if (pQueuePtr >= pBufMax)
                        {
                            pEsb->ibBuf = (DWORD)(pQueuePtr - pEsb->lrgbMem);
                            if ((fRet = ESFillBuffer (lpipb, pEsb)) != S_OK)
                                goto free_word;
                            pQueuePtr = pEsb->lrgbMem;
                        }

                        // Decode and discard cNumOcc packed fields
                        switch (cNumOcc)
                        {
                            case 5:
                                pQueuePtr += CbByteUnpack (&dwT, pQueuePtr);
                                /* fall through */
                            case 4:
                                pQueuePtr += CbByteUnpack (&dwT, pQueuePtr);
                                /* fall through */
                            case 3:
                                pQueuePtr += CbByteUnpack (&dwT, pQueuePtr);
                                /* fall through */
                            case 2:
                                pQueuePtr += CbByteUnpack (&dwT, pQueuePtr);
                                /* fall through */
                            case 1:
                                pQueuePtr += CbByteUnpack (&dwT, pQueuePtr);
                        }
#ifdef _DEBUG
                        if (pQueuePtr > pEsb->lrgbMem + pEsb->dwEsbSize)
                            SetErrCode (phr, E_ASSERT);
#endif
                    }   // end occ loop
                }   // end if occ non-zero

                pHeader->dwTopicCount--;
                continue;
            }   // end of to-delete condition

            // Allocate a topicdata node. If the block manager is out of
            // nodes, flush everything but the last topic and retry once.
            if ((pNewTopic == NULL) &&
                (pNewTopic = GetBlockNode (&lpipb->TopicBlock)) == NULL)
            {
                if ((fRet = ESMemory2Disk(lpipb, pHeader,
                    FLUSH_EXCEPT_LAST)) != S_OK)
                    goto free_word;

                if ((pNewTopic = GetBlockNode (&lpipb->TopicBlock)) == NULL)
                {
                    // Extremely weird, since we just release a bunch of
                    // memory
                    fRet = E_ASSERT;
                    goto free_word;
                }
            }

            pNewTopic->dwTopicId = dwTopicId;
#ifdef _DEBUG
            if (pQueuePtr > pEsb->lrgbMem + pEsb->dwEsbSize)
                SetErrCode (phr, E_ASSERT);
#endif
            // Set the other fields
            pNewTopic->pOccData = pNewTopic->pLastOccData = NULL;

            // Get the occ count
            if (cNumOcc)
            {
                DWORD    dwOccCount;
                POCCDATA pOccData;
                LPDW     lpDw;

                pQueuePtr += CbByteUnpack (&pNewTopic->dwOccCount,
                    pQueuePtr);
#ifdef _DEBUG
                if (pQueuePtr > pEsb->lrgbMem + pEsb->dwEsbSize)
                    SetErrCode (phr, E_ASSERT);
#endif
                for (dwOccCount = pNewTopic->dwOccCount; dwOccCount > 0;
                    dwOccCount--)
                {
                    // Get all occ fields
                    if ((pOccData = (POCCDATA)GetBlockNode
                        (&lpipb->OccBlock)) == NULL )
                    {
                        if ((fRet = ESMemory2Disk(lpipb, pHeader,
                            FLUSH_EXCEPT_LAST)) != S_OK)
                            goto free_word;

                        if ((pOccData = (POCCDATA)GetBlockNode
                            (&lpipb->OccBlock)) == NULL)
                        {
                            // Extremely weird, since we just release a
                            // bunch of memory, unless there are so many
                            // duplicates of the same word in the topic
                            fRet = E_TOOMANYDUPS;
                            goto free_word;
                        }
                    }

                    // Fill up the buffer if run out of data
                    if (pQueuePtr >= pBufMax)
                    {
                        pEsb->ibBuf = (DWORD) (pQueuePtr - pEsb->lrgbMem);
                        if ((fRet = ESFillBuffer (lpipb, pEsb)) != S_OK)
                            goto free_word;
                        pQueuePtr = pEsb->lrgbMem;
                    }

                    // Decode cNumOcc packed fields into the node
                    lpDw = (LPDW)&pOccData->OccData;
                    switch (cNumOcc)
                    {
                        case 5:
                            pQueuePtr += CbByteUnpack (lpDw++, pQueuePtr);
                            /* fall through */
                        case 4:
                            pQueuePtr += CbByteUnpack (lpDw++, pQueuePtr);
                            /* fall through */
                        case 3:
                            pQueuePtr += CbByteUnpack (lpDw++, pQueuePtr);
                            /* fall through */
                        case 2:
                            pQueuePtr += CbByteUnpack (lpDw++, pQueuePtr);
                            /* fall through */
                        case 1:
                            pQueuePtr += CbByteUnpack (lpDw++, pQueuePtr);
                    }
#ifdef _DEBUG
                    if (pQueuePtr > pEsb->lrgbMem + pEsb->dwEsbSize)
                        SetErrCode (phr, E_ASSERT);
#endif
                    // Attach to the linked list
                    // Note that we are assuming that the occurrences are
                    // already sorted, so no checking is done here
                    if (pNewTopic->pOccData == NULL)
                    {
                        pNewTopic->pLastOccData = pNewTopic->pOccData =
                            pOccData;
                    }
                    else
                    {
                        // Add to the end of the linked list
                        pNewTopic->pLastOccData->pNext = pOccData;
                        pNewTopic->pLastOccData = pOccData;
                    }
                    pOccData->pNext = NULL;
                }
            }

            // A non-NULL result is kept as the node for the next topic
            // of this record, and the topic count is reduced by one.
            if ((pNewTopic = MergeTopicNode (pHeader, pNewTopic,
                cNumOcc)) != NULL)
                pHeader->dwTopicCount --;
        }

        // Update the offset
        pEsb->ibBuf = (DWORD) (pQueuePtr - pEsb->lrgbMem);

        // If next record doesn't fit in buffer
        // then reset to beginning and load data
        if (pEsb->dwEsbSize - pEsb->ibBuf <= sizeof(DWORD) ||
            pEsb->dwEsbSize - pEsb->ibBuf <=
                GETLONG((LPUL)pQueuePtr) + 2 * sizeof(DWORD))
        {
            if ((fRet = ESFillBuffer (lpipb, pEsb)) != S_OK)
                goto free_word;
        }

        // Adjust priority queue
        // NOTE(review): the "block is finished" test below requires
        // DwSubFo(lfo, lfoMax) != 0 - confirm against DwSubFo's contract.
        if (uiQueueSize > 1)
        {
            if (DwSubFo (pEsb->lfo, pEsb->lfoMax) != 0 &&
                pEsb->ibBuf >= pEsb->dwEsbSize)
            {
                // Replace first record with last
                lrgPriorityQueue[1] = lrgPriorityQueue[uiQueueSize];
                lrgPriorityQueue[uiQueueSize] = NULL;
                uiQueueSize--;
                pEsi->uiQueueSize = uiQueueSize;
            }
            PQueueDown(lpipb);  // Maintain sort order
        }
        else if (DwSubFo (pEsb->lfo, pEsb->lfoMax) != 0 &&
            pEsb->ibBuf >= pEsb->dwEsbSize)
        {
            // Last block is finished: write out the final record
            uiQueueSize--;
            pEsi->uiQueueSize = uiQueueSize;
            if ((fRet = ESMemory2Disk (lpipb, pHeader,
                FLUSH_NEW_RECORD)) != S_OK)
                goto free_word;
        }
    } while (uiQueueSize);

    fRet = ESFlushBuffer(pEsi);

    // Cleanup chain, shared by the success path and every failure
free_word:
    _GLOBALUNLOCK(hWord);
    _GLOBALFREE(hWord);
free_header:
    _GLOBALUNLOCK(hHeader);
    _GLOBALFREE (hHeader);
close_output_file:
    FileClose (hOutputFile);
free_output_buffer:
    FreeHandle (pEsi->hBuf);
    pEsi->hBuf = NULL;
    return fRet;
}
/*************************************************************************
 *  @doc    PRIVATE
 *  @func   BOOL | FindTopic |
 *      Look up a topic id in the topic id table of the merge parameters.
 *      The search starts from lpTopicIdLast (the position remembered from
 *      the previous hit) and only rewinds to the start of the table when
 *      the remembered entry is greater than the requested id.
 *      NOTE(review): the range rejection against the first and last
 *      entries implies the table is sorted ascending - confirm with the
 *      code that fills rgTopicId.
 *  @parm   LPMERGEPARAMS | lpmp |
 *      Merge parameters holding rgTopicId, dwCount and lpTopicIdLast
 *  @parm   DWORD | dwTopicId |
 *      Topic id to look for
 *  @rdesc  TRUE if the id is in the table, FALSE otherwise. On a hit
 *      lpTopicIdLast is moved to the matching entry
 *************************************************************************/
BOOL PASCAL NEAR FindTopic(LPMERGEPARAMS lpmp, DWORD dwTopicId)
{
    LPDW pFirst;        // First entry of the table
    LPDW pFinal;        // Last valid entry of the table
    LPDW pScan;         // Current search position

    Assert(lpmp->dwCount > 0);
    Assert(lpmp->lpTopicIdLast >= lpmp->rgTopicId);
    Assert(lpmp->lpTopicIdLast < lpmp->rgTopicId + lpmp->dwCount);

    // Reject ids that fall outside the [first, last] range of the table
    pFirst = lpmp->rgTopicId;
    if (*pFirst > dwTopicId)
        return FALSE;

    pFinal = pFirst + lpmp->dwCount - 1;
    if (*pFinal < dwTopicId)
        return FALSE;

    // Fast path: same topic as the previous hit
    pScan = lpmp->lpTopicIdLast;
    if (*pScan == dwTopicId)
        return TRUE;

    if (*pScan > dwTopicId)
    {
        // The remembered position is already past the id: restart at
        // the beginning (the rewind is kept even if nothing is found)
        pScan = pFirst;
        lpmp->lpTopicIdLast = pFirst;
    }

    while (pScan <= pFinal)
    {
        if (*pScan == dwTopicId)
        {
            lpmp->lpTopicIdLast = pScan;
            return TRUE;
        }
        pScan++;
    }

    return FALSE;
}
/*************************************************************************
 *
 *  @doc    INTERNAL INDEXING
 *
 *  @func   int | CompareRecordBuffers |
 *      Called from PQueueUp/PQueueDown to order the input buffers. The
 *      records are compared first by word string, then by word length
 *      (if OCCF_LENGTH is set), then by field id (if OCCF_FIELDID is
 *      set), then by first topic id, and finally by occurrence data.
 *
 *  @parm   _LPIPB | lpipb |
 *      Pointer to the index parameter block
 *
 *  @parm   LPB | pBufferA |
 *      Pointer to the first input buffer
 *
 *  @parm   LPB | pBufferB |
 *      Pointer to the second input buffer
 *
 *  @rdesc
 *      If pBufferA < pBufferB  return < 0
 *      If pBufferA == pBufferB return = 0
 *      If pBufferA > pBufferB  return > 0
 *************************************************************************/
/* Three-way comparison of two unsigned values: -1, 0 or +1.
 * Subtracting two DWORDs and casting the difference to int gives the
 * wrong sign when the values differ by more than INT_MAX, and on a
 * 16-bit int it even reports "equal" for values that differ by a
 * multiple of 65536.
 */
#define DWORD_ORDER(a, b)   (((a) > (b)) - ((a) < (b)))

/* Compare the records at the head of two input buffers.
 *
 * Record layout as read by this routine: DWORD record length, word
 * string with a 2-byte length prefix, then byte-packed values: word
 * length (only if OCCF_LENGTH), field id (only if OCCF_FIELDID), topic
 * count (skipped), first topic id, occurrence count, occurrence data.
 *
 * Returns < 0, 0 or > 0. Only the sign is meaningful.
 */
int PASCAL NEAR CompareRecordBuffers (_LPIPB lpipb, LPB pBufferA, LPB pBufferB)
{
    // Local Replacement Variables
    int     occf = lpipb->occf;
    int     cNumOcc = lpipb->ucNumOccDataFields;
    DWORD   dwOccMin;

    // Working Variables
    int     fRet;
    int     Len;
    DWORD   dwDataA;
    DWORD   dwDataB;

    pBufferA += sizeof (DWORD);     // Skip record length
    pBufferB += sizeof (DWORD);     // Skip record length

    // Compare Pascal strings
    if ((fRet = StrCmp2BytePascal(pBufferA, pBufferB)) != 0)
        return fRet;

    // Strings are equal, so one length is enough to skip both of them
    Len = GETWORD ((LPUW)pBufferA) + sizeof (SHORT);
    pBufferA += Len;
    pBufferB += Len;

    // Compare Word Lengths
    if (occf & OCCF_LENGTH)
    {
        pBufferA += CbByteUnpack (&dwDataA, pBufferA);
        pBufferB += CbByteUnpack (&dwDataB, pBufferB);
        if ((fRet = DWORD_ORDER (dwDataA, dwDataB)) != 0)
            return fRet;
    }

    // Compare FieldIds
    if (occf & OCCF_FIELDID)
    {
        pBufferA += CbByteUnpack (&dwDataA, pBufferA);
        pBufferB += CbByteUnpack (&dwDataB, pBufferB);
        if ((fRet = DWORD_ORDER (dwDataA, dwDataB)) != 0)
            return fRet;
    }

    // Skip topic count
    pBufferA += CbByteUnpack (&dwDataA, pBufferA);
    pBufferB += CbByteUnpack (&dwDataB, pBufferB);

    // Compare 1st topic Id
    pBufferA += CbByteUnpack (&dwDataA, pBufferA);
    pBufferB += CbByteUnpack (&dwDataB, pBufferB);
    if ((fRet = DWORD_ORDER (dwDataA, dwDataB)) != 0)
        return fRet;

    // Get the occurrence count. fRet keeps the order of the two counts;
    // it is the result when no occurrence value gets compared below.
    pBufferA += CbByteUnpack (&dwDataA, pBufferA);
    pBufferB += CbByteUnpack (&dwDataB, pBufferB);
    fRet = DWORD_ORDER (dwDataA, dwDataB);
    dwOccMin = (fRet < 0) ? dwDataA : dwDataB;

    // NOTE(review): as in the original switch (every case ended with a
    // break), exactly ONE packed value is compared per iteration, so only
    // the first dwOccMin packed values are examined, not
    // dwOccMin * cNumOcc. Confirm whether a fall-through was intended
    // before changing it.
    if (cNumOcc >= 1 && cNumOcc <= 5)
    {
        for (; dwOccMin; dwOccMin--)
        {
            pBufferA += CbByteUnpack (&dwDataA, pBufferA);
            pBufferB += CbByteUnpack (&dwDataB, pBufferB);
            if ((fRet = DWORD_ORDER (dwDataA, dwDataB)) != 0)
                return fRet;
        }
    }
    return fRet;
}

#undef DWORD_ORDER
/*************************************************************************
 *
 *  @doc    INTERNAL INDEXING
 *
 *  @func   VOID | PQueueUp |
 *      The function restores the heap condition of a PQ, i.e. the parent
 *      node must be less than its children. When a node is inserted,
 *      the heap condition may be violated if the inserted node
 *      is smaller than its parent. In this case the nodes have to
 *      be switched.
 *
 *  @parm   _LPIPB | lpipb |
 *      Pointer to the index parameter block
 *
 *  @parm   LPESB FAR * | lrgPriorityQueue |
 *      The priority queue array that holds the inserted node
 *
 *  @parm   LONG | index |
 *      Index of the inserted node
 *
 *************************************************************************/
VOID PASCAL NEAR PQueueUp (_LPIPB lpipb, LPESB FAR *lrgPriorityQueue,
    LONG index)
{
    LPESB   pInserted;      // Node being moved toward the top
    LPESB   pParent;        // Parent of the current slot
    WORD    iParent;        // Index of that parent (0 = no parent)

    pInserted = lrgPriorityQueue [index];

    // Slot 1 is the top of the heap: nothing to do when there is no parent
    iParent = (WORD)(index / 2);
    if (iParent == 0)
        return;

    /* Pull each parent down one level for as long as it compares
     * greater than the inserted node, then drop the inserted node
     * into the slot that was freed.
     */
    for (;;)
    {
        pParent = lrgPriorityQueue [iParent];
        if (CompareRecordBuffers (lpipb,
            (LPB)pParent->lrgbMem + pParent->ibBuf,
            (LPB)pInserted->lrgbMem + pInserted->ibBuf) <= 0)
            break;

        lrgPriorityQueue [index] = pParent;
        index = iParent;
        iParent = (WORD)(index / 2);
        if (iParent == 0)
            break;          // Reached the top of the heap
    }
    lrgPriorityQueue [index] = pInserted;

#if BINHN
    SetQueue (&lpipb->esi);
#endif
}
/*************************************************************************
 *
 *  @doc    INTERNAL INDEXING
 *
 *  @func   VOID | PQueueDown |
 *      The function restores the heap condition of a PQ, i.e. the parent
 *      node must be less than its children. When the top node is replaced,
 *      the heap condition may be violated if the resulting node
 *      is greater than its children. In this case the nodes have to
 *      be switched.
 *
 *  @parm   _LPIPB | lpipb |
 *      Pointer to the index parameter block. Its external sort info
 *      (lpipb->esi) holds the priority queue and its size
 *
 *************************************************************************/
PRIVATE VOID PASCAL NEAR PQueueDown (_LPIPB lpipb)
{
    LPESI       lpesi = &lpipb->esi;
    LPESB FAR  *rgHeap = lpesi->lrgPriorityQueue;
    LPESB       pSinking = rgHeap[1];           // Node that sat at the top
    int         cEntries = lpesi->uiQueueSize;  // Highest used slot
    int         iLastParent = cEntries / 2;     // Last slot with a child
    int         iHole = 1;                      // Slot being filled
    int         iChild;
    LPESB       pLeft;
    LPESB       pRight;
    LPESB       pChild;

    while (iHole <= iLastParent)
    {
        iChild = iHole * 2;

        // When a right child exists, continue with the smaller of the
        // two children (the right one wins a tie)
        if (iChild < cEntries)
        {
            pRight = rgHeap[iChild + 1];
            if (pRight != NULL)
            {
                pLeft = rgHeap[iChild];
                if (CompareRecordBuffers (lpipb,
                    (LPB)pLeft->lrgbMem + pLeft->ibBuf,
                    (LPB)pRight->lrgbMem + pRight->ibBuf) >= 0)
                    iChild++;
            }
        }

        if (iChild > cEntries)
            break;

        // Heap condition met once the sinking node is below its child
        pChild = rgHeap[iChild];
        if (CompareRecordBuffers (lpipb,
            (LPB)pSinking->lrgbMem + pSinking->ibBuf,
            (LPB)pChild->lrgbMem + pChild->ibBuf) < 0)
            break;

        // Move the child up and keep descending
        rgHeap[iHole] = pChild;
        iHole = iChild;
    }
    rgHeap[iHole] = pSinking;

#if _BINHN
    SetQueue (lpesi);
#endif
}
/*************************************************************************
 *  @doc    PRIVATE
 *  @func   PTOPICDATA | MergeTopicNode |
 *      Add a topic node to the topic list of the merge header, keeping
 *      the list in ascending topic id order. If a node with the same
 *      topic id already exists, the occurrences of the new node are
 *      merged into it (when cNumOcc is non-zero).
 *  @parm   PMERGEHEADER | pHeader |
 *      Header owning the list (pTopic = first node, pLastTopic = last)
 *  @parm   PTOPICDATA | pNewTopic |
 *      Node to add
 *  @parm   int | cNumOcc |
 *      Number of occurrence fields; 0 means there is nothing to merge
 *  @rdesc  NULL if pNewTopic was linked into the list, else pNewTopic
 *      itself (its topic already existed, so the caller may reuse it)
 *************************************************************************/
PRIVATE PTOPICDATA PASCAL NEAR MergeTopicNode (PMERGEHEADER pHeader,
    PTOPICDATA pNewTopic, int cNumOcc)
{
    PTOPICDATA pTopic, pPrevTopic;

    if ((pTopic = pHeader->pLastTopic) == NULL)
    {
        // The list is empty
        pHeader->pTopic = pHeader->pLastTopic = pNewTopic;
        pNewTopic->pNext = NULL;
        return(NULL);
    }

    // The ids are compared directly. The difference of two topic ids
    // stored in an int can have the wrong sign (or be 0 on a 16-bit
    // int) for ids that are far apart, which would splice the node in
    // front of the tail and break the ordering of the list.
    if (pTopic->dwTopicId < pNewTopic->dwTopicId)
    {
        // New node. Add to the end
        pNewTopic->pNext = NULL;
        pHeader->pLastTopic->pNext = pNewTopic;
        pHeader->pLastTopic = pNewTopic;
        return NULL;
    }

    if (pTopic->dwTopicId == pNewTopic->dwTopicId)
    {
        // Same topic. Return pNewTopic for reuse
        if (cNumOcc)
            MergeOccurrence (pTopic, pNewTopic, cNumOcc);
        return(pNewTopic);
    }

    // If we get to this point, the list is out of order.
    // Try to find the insertion point. The last node is never examined
    // by the loop: it is already known to be greater than the new one.
    pTopic = pHeader->pTopic;
    pPrevTopic = NULL;

    for (; pTopic->pNext; pTopic = pTopic->pNext)
    {
        if (pTopic->dwTopicId >= pNewTopic->dwTopicId)
        {
            /* We passed the insertion point */
            break;
        }
        pPrevTopic = pTopic;
    }

    if (pTopic->dwTopicId == pNewTopic->dwTopicId)
    {
        // Same topic. Return pNewTopic for reuse
        if (cNumOcc)
            MergeOccurrence (pTopic, pNewTopic, cNumOcc);
        return(pNewTopic);
    }

    if (pPrevTopic == NULL)
    {
        /* Insert at the beginning */
        pNewTopic->pNext = pHeader->pTopic;
        pHeader->pTopic = pNewTopic;
    }
    else
    {
        /* Insert in the middle, i.e. right after pPrevTopic */
        pNewTopic->pNext = pPrevTopic->pNext;
        pPrevTopic->pNext = pNewTopic;
    }

    // Update the last topic
    while (pTopic->pNext)
    {
        pTopic = pTopic->pNext;
    }
    pHeader->pLastTopic = pTopic;
    return(NULL);
}
/*************************************************************************
 *  @doc    PRIVATE
 *  @func   void | MergeOccurrence |
 *      Move the occurrence list of pNewTopic into pOldTopic. The new
 *      list is either appended to or put in front of the old list; the
 *      two lists are never interleaved. When neither placement applies,
 *      only E_ASSERT is recorded (in a local ERRB) and pOldTopic is left
 *      untouched.
 *  @parm   PTOPICDATA | pOldTopic |
 *      Topic that receives the occurrences
 *  @parm   PTOPICDATA | pNewTopic |
 *      Topic whose occurrence list is taken over
 *  @parm   int | cOccNum |
 *      Count passed on to CompareOccurrence
 *************************************************************************/
PRIVATE VOID NEAR MergeOccurrence (PTOPICDATA pOldTopic,
    PTOPICDATA pNewTopic, int cOccNum)
{
    ERRB        errb;
    POCCDATA    pNewHead = pNewTopic->pOccData;
    POCCDATA    pNewTail = pNewTopic->pLastOccData;

    if (CompareOccurrence (&pOldTopic->pLastOccData->OccData[0],
        &pNewHead->OccData[0], cOccNum) <= 0)
    {
        // Expected case: the last old occurrence does not come after
        // the first new one, so the new list is chained to the end
        pOldTopic->pLastOccData->pNext = pNewHead;
        pOldTopic->pLastOccData = pNewTail;
        pOldTopic->dwOccCount += pNewTopic->dwOccCount;
    }
    else if (CompareOccurrence (&pNewHead->OccData[0],
        &pOldTopic->pOccData->OccData[0], cOccNum) <= 0)
    {
        // Unusual case: the first new occurrence does not come after
        // the first old one, so the new list goes in front.
        // NOTE(review): only the heads of the two lists are compared
        // here; confirm that the new list can never overlap the old one.
        pNewTail->pNext = pOldTopic->pOccData;
        pOldTopic->pOccData = pNewHead;
        pOldTopic->dwOccCount += pNewTopic->dwOccCount;
    }
    else
    {
        SetErrCode (&errb, E_ASSERT);
    }
}
/*====================================================================*/
#ifdef BINHN
/* Debugging aid: for each of the first 20 queue slots (fewer if cesb is
 * smaller) that is in use, record in lpbQueueStr a pointer 6 bytes into
 * the current record of that slot.
 * NOTE(review): the offset 6 presumably skips the DWORD record length
 * and the 2-byte string length so the pointer lands on the word text -
 * confirm against the record layout.
 */
PRIVATE VOID PASCAL NEAR SetQueue (LPESI pEsi)
{
    LPESB FAR  *rgQueue = pEsi->lrgPriorityQueue;
    unsigned int iSlot;

    for (iSlot = 0; iSlot < 20; iSlot++)
    {
        if (!(iSlot < pEsi->cesb))
            break;
        if (rgQueue[iSlot] == NULL)
            continue;
        pEsi->lpbQueueStr[iSlot] =
            rgQueue[iSlot]->lrgbMem + rgQueue[iSlot]->ibBuf + 6;
    }
}
#endif
/************************************************************************
* @doc PRIVATE * @func HRESULT PASCAL NEAR | ESBBlockAllocate | * Set the memory allocation based on the memory of the machine * @parm DWORD | lMemSize | * Memory allocated for the indexer * @rdesc S_OK, or E_OUTOFMEMORY ************************************************************************/
/* Create the block managers for the topic nodes and the occurrence
 * nodes used by the merge phase, and fetch their initial free lists.
 *
 * Returns S_OK, or E_OUTOFMEMORY (through SetErrCode) if either block
 * manager cannot be created. On failure no topic block manager is left
 * behind.
 */
PRIVATE HRESULT PASCAL NEAR ESBBlockAllocate (_LPIPB lpipb, DWORD lMemSize)
{
    DWORD   dwTopicSize;
    DWORD   dwOccSize;
    WORD    OccNodeSize = sizeof (OCCDATA) - 1 + sizeof(DWORD) *
        lpipb->ucNumOccDataFields;   // About 24bytes

    // Round the node size up to a multiple of 4 bytes
    OccNodeSize = (OccNodeSize + 3) & ~3;

    /* The memory is for topic block and occurrence blocks, which
     * should be in the ratio 1:1.5
     */
    dwTopicSize = (lMemSize * 2) / 5;
    dwOccSize = lMemSize - dwTopicSize;

#if 0
    /* Don't do anything if things are too small */
    if (dwTopicSize < MAX_BLOCK_SIZE || dwOccSize < MAX_BLOCK_SIZE)
        return(E_OUTOFMEMORY);
#endif

    // Allocate a block manager for topic node
    if ((lpipb->TopicBlock.pBlockMgr =
        BlockInitiate ((MAX_BLOCK_SIZE * sizeof(TOPICDATA)/sizeof(TOPICDATA)),
        sizeof(TOPICDATA), (WORD)(dwTopicSize/MAX_BLOCK_SIZE),
        USE_VIRTUAL_MEMORY | THREADED_ELEMENT)) == NULL)
    {
        return SetErrCode (NULL, E_OUTOFMEMORY);
    }
    lpipb->TopicBlock.pFreeList =
        (PLIST)BlockGetLinkedList(lpipb->TopicBlock.pBlockMgr);

    // Allocate a block manager for occ node
    // NOTE(review): the block count is derived from lMemSize, not from
    // dwOccSize, so the 1:1.5 split computed above does not limit the
    // occurrence blocks - confirm which budget is intended.
    if ((lpipb->OccBlock.pBlockMgr =
        BlockInitiate((MAX_BLOCK_SIZE * OccNodeSize)/OccNodeSize,
        OccNodeSize, (WORD)(lMemSize / MAX_BLOCK_SIZE),
        USE_VIRTUAL_MEMORY | THREADED_ELEMENT)) == NULL)
    {
        // Release the topic block manager created just above. (This path
        // used to free BTNodeBlock, which this function never allocates,
        // and left the topic block manager behind.)
        BlockFree(lpipb->TopicBlock.pBlockMgr);
        lpipb->TopicBlock.pBlockMgr = NULL;
        lpipb->TopicBlock.pFreeList = NULL;
        return SetErrCode (NULL, E_OUTOFMEMORY);
    }
    lpipb->OccBlock.pFreeList =
        (PLIST)BlockGetLinkedList(lpipb->OccBlock.pBlockMgr);
    return (S_OK);
}
/* Take one node off the free list of a block combo and count it in
 * dwCount. When the free list is empty the block manager is asked to
 * grow first.
 *
 * Returns the node, or NULL if BlockGrowth does not return S_OK.
 */
PRIVATE LPV PASCAL NEAR GetBlockNode (PBLKCOMBO pBlockCombo)
{
    PLIST pNode = pBlockCombo->pFreeList;

    if (pNode == NULL)
    {
        // Nothing left: grow the block, then use its linked list
        if (BlockGrowth (pBlockCombo->pBlockMgr) != S_OK)
            return (NULL);
        pNode = (PLIST)BlockGetLinkedList(pBlockCombo->pBlockMgr);
    }

    // Unlink the head. The node's own pNext is left as it is.
    pBlockCombo->pFreeList = pNode->pNext;
    pBlockCombo->dwCount ++;
    return (pNode);
}
/*************************************************************************
* * @doc INTERNAL * * @func BOOL FAR PASCAL | BuildIndexFile | * This function is for debugging purpose only. In normal indexing, * it will never be called. Since collecting words and indexing can * take a long time, debugging the index phase can become a hassle that * take several hours per shot. To minimize the index time for debugging, * all the intermediate files are saved, which are: * - the internal sorted result file, which contains all words and * their occurrences sorted * - the external sorted result file, which is a snap shot of the * ESI structures and its ESB blocks * The only steps left will be processing the occurrence list and doing * permanent index * * To use the function, add the following lines in the app: * * extern HRESULT PASCAL FAR BuildIndexFile (LPSTR, LPSTR, LPSTR, WORD, WORD, * WORD, INTERRUPT_FUNC, VOID FAR *, STATUS_FUNC, VOID FAR*, PHRESULT); * * int fDotest; * * if (fDotest) { * return BuildIndexFile ((LPSTR)"c:/tmp/test.mvb!MVINDEX", * (LPSTR)"c:/tmp/esi.tmp", (LPSTR)"c:/tmp/iso.tmp", * OCCF_TOPICID, IDXF_NORMALIZE, 0, (INTERRUPT_FUNC)lpfnInterruptFunc, * (LPV)NULL, * (STATUS_FUNC)lpfnStatusFunc, (LPV)hwndGlobal, * NULL); * } * * @parm HFPB | hfpb | * HFPB for index file if pstrIndexFile is NULL * * @parm LPB | pstrIndexFile | * The .MVB + index file, usually with the format TEST.MVB!MVINDEX * * @parm LPB | lpbEsiFile | * The external sort info file * * @parm LPB | lpbIsiFile | * The internal sorted info filename * * @parm PINDEXINFO | pIndexInfo | * IndexInfo * * @rdesc S_OK if succeeded, else other non-zero error codes *************************************************************************/
/* Debug entry point: rebuild the permanent index from the saved
 * intermediate files (see the description above).
 *
 * Every exit after MVIndexInitiate goes through exit0, which frees the
 * sigma and log tables if they were allocated here and disposes of the
 * index parameter block. Several error paths used to return directly
 * and left those resources behind.
 */
PUBLIC HRESULT PASCAL EXPORT_API FAR BuildIndexFile (HFPB hfpb,
    LPSTR pstrIndexFile, LPB lpbEsiFile, LPB lpbIsiFile,
    PINDEXINFO pIndexInfo)
{
    _LPIPB  lpipb;
    LPESI   lpesi;
    HRESULT fRet;
    ERRB    errb;
    DWORD   loop;
    FLOAT   rLog;
    BYTE    bKeyIndex = 0;
    BOOL    fSigmaAllocated = FALSE;    // wi.hSigma must be freed
    BOOL    fLogAllocated = FALSE;      // wi.hLog must be freed

    if ((lpipb = MVIndexInitiate(pIndexInfo, NULL)) == NULL)
        return E_FAIL;

    lpesi = &lpipb->esi;

    if (LoadEsiTemp (lpipb, lpesi, lpbEsiFile, lpbIsiFile, NULL) != S_OK)
    {
        fRet = E_FAIL;
        goto exit0;
    }

    if (lpipb->idxf & IDXF_NORMALIZE)
    {
        // Allocate a huge buffer to contain all the sigma terms
        if ((lpipb->wi.hSigma = _GLOBALALLOC (DLLGMEM_ZEROINIT,
            (LCB)((lpipb->dwMaxTopicId + 1) * sizeof (SIGMA)))) == NULL)
        {
            fRet = SetErrCode (&errb, E_OUTOFMEMORY);
            goto exit0;
        }
        fSigmaAllocated = TRUE;
        lpipb->wi.hrgsigma = (HRGSIGMA)_GLOBALLOCK (lpipb->wi.hSigma);

        // Small buffer containing pre-calculated values
        if ((lpipb->wi.hLog = _GLOBALALLOC (DLLGMEM_ZEROINIT,
            (CB)(cLOG_MAX * sizeof (FLOAT)))) == NULL)
        {
            SetErrCode (&errb, (HRESULT)(fRet = E_OUTOFMEMORY));
            goto exit0;
        }
        fLogAllocated = TRUE;
        lpipb->wi.lrgrLog = (FLOAT FAR *)_GLOBALLOCK (lpipb->wi.hLog);

        // Initialize the array (entry 0 keeps its zero-initialized value)
        for (loop = cLOG_MAX - 1; loop > 0; --loop)
        {
#ifndef ISBU_IR_CHANGE
            rLog = (float) log10(cHundredMillion/(double)loop);
#else
            rLog = (float)1.0 / (float)loop;
#endif // ISBU_IR_CHANGE
            lpipb->wi.lrgrLog[loop] = rLog * rLog;
        }
    }

    if ((fRet = MergeSortTreeFile (lpipb, NULL)) != S_OK)
    {
        fRet = SetErrCode (&errb, (HRESULT)fRet);
        goto exit0;
    }

    if ((lpipb->idxf & KEEP_TEMP_FILE) == 0)
        FileUnlink (NULL, lpipb->isi.aszTempName, REGULAR_FILE);

    // If we are doing term-weighting we have to square root all sigma values
    if (lpipb->idxf & IDXF_NORMALIZE)
    {
        // ISBU_IR_CHANGE not necessary 'cos sqrt computation is necessary
        // in both cases
        for (loop = 0; loop < lpipb->dwMaxTopicId + 1; ++loop)
            lpipb->wi.hrgsigma[loop] =
                (float)sqrt ((double)lpipb->wi.hrgsigma[loop]);
    }

    // Analyze data to get the best compression scheme
    // TopicId
    VGetBestScheme(&lpipb->cKey[CKEY_TOPIC_ID],
        &lpipb->BitCount[CKEY_TOPIC_ID][0], lcbitBITSTREAM_ILLEGAL, TRUE);

    // Occurrence Count
    VGetBestScheme(&lpipb->cKey[CKEY_OCC_COUNT],
        &lpipb->BitCount[CKEY_OCC_COUNT][0], lcbitBITSTREAM_ILLEGAL, TRUE);

    // NOTE(review): bKeyIndex starts at 0 here. Confirm that the slots it
    // selects are not the CKEY_TOPIC_ID / CKEY_OCC_COUNT slots that were
    // just computed above.
    if (lpipb->occf & OCCF_COUNT)
    {
        VGetBestScheme(&lpipb->cKey[bKeyIndex],
            &lpipb->BitCount[bKeyIndex][0], lcbitBITSTREAM_ILLEGAL, TRUE);
        bKeyIndex++;
    }

    if (lpipb->occf & OCCF_OFFSET)
    {
        VGetBestScheme(&lpipb->cKey[bKeyIndex],
            &lpipb->BitCount[bKeyIndex][0], lcbitBITSTREAM_ILLEGAL, TRUE);
        bKeyIndex++;
    }

    // Report the status to the user callback, which may stop the build
    if (lpipb->CallbackInfo.dwFlags & ERRFLAG_STATUS)
    {
        PFCALLBACK_MSG pCallbackInfo = &lpipb->CallbackInfo;
        CALLBACKINFO Info;

        Info.dwPhase = 2;
        Info.dwIndex = 100;
        fRet = (*pCallbackInfo->MessageFunc)
            (ERRFLAG_STATUS, pCallbackInfo->pUserData, &Info);
        if (S_OK != fRet)
            goto exit0;
    }

    // Build the permanent index
    fRet = BuildBTree(NULL, lpipb, lpipb->esi.aszTempName, hfpb,
        pstrIndexFile);

exit0:
    // Braces are kept around FreeHandle in case it is a multi-statement
    // macro
    if (fLogAllocated)
    {
        FreeHandle (lpipb->wi.hLog);
    }
    if (fSigmaAllocated)
    {
        FreeHandle (lpipb->wi.hSigma);
    }
    MVIndexDispose (lpipb);
    return fRet;
}
/* Debugging aid: write a snapshot of the index parameter block followed
 * by every ESB of the external sort info to a new temporary file.
 *
 * The file name is stored in lpipb->szEsiTemp only if every write
 * succeeded; otherwise the partial file is deleted. The IPB write used
 * to go unchecked, so a failed header write still produced a snapshot
 * that was recorded as valid.
 */
PRIVATE VOID PASCAL NEAR SaveEsiTemp (_LPIPB lpipb, LPESI lpesi)
{
    GHANDLE hfpb;
    LPESB   lpesb;
    char    szEsi[100];
    BOOL    fWritten;

    GETTEMPFILENAME ((char)0, "foo", 0, szEsi);
    if ((hfpb = FileOpen(NULL, szEsi, REGULAR_FILE, READ_WRITE,
        NULL)) == NULL)
        return;

    // Header first, then the ESB chain; stop at the first short write
    fWritten = (FileWrite(hfpb, lpipb, sizeof(IPB), NULL) == sizeof(IPB));

    for (lpesb = lpesi->lpesbRoot; fWritten && lpesb;
        lpesb = lpesb->lpesbNext)
    {
        fWritten = (FileWrite(hfpb, lpesb, sizeof(ESB), NULL) ==
            sizeof(ESB));
    }

    FileClose (hfpb);

    if (!fWritten)
    {
        FileUnlink (NULL, szEsi, REGULAR_FILE);
        return;
    }
    MEMCPY (lpipb->szEsiTemp, szEsi, sizeof(szEsi));
}
/* Debugging aid: write the current index parameter block to the file
 * named by lpipb->szEsiTemp. Best effort: nothing is reported if the
 * file cannot be opened or the write fails.
 */
PRIVATE VOID PASCAL NEAR UpdateEsiTemp (_LPIPB lpipb)
{
    GHANDLE hfpb = FileOpen(NULL, lpipb->szEsiTemp, REGULAR_FILE,
        READ_WRITE, NULL);

    if (hfpb != NULL)
    {
        FileWrite(hfpb, lpipb, sizeof(IPB), NULL);
        FileClose (hfpb);
    }
}
/* Debugging aid: reload the state saved in an ESI snapshot file: the
 * statistics of the index parameter block followed by the ESB records.
 *
 * lpipb        Index parameter block that receives the saved statistics
 * lpesi        External sort info; every ESB read is allocated and
 *              pushed on the front of lpesi->lpesbRoot
 * lpbEsiFile   Name of the ESI snapshot file
 * lpbIsiFile   Name of the internal sort file, copied to the ISI block
 * phr          Error buffer handed to SetErrCode
 *
 * Returns S_OK, E_NOTEXIST if the file cannot be opened, E_FAIL if the
 * saved IPB cannot be read completely, or E_OUTOFMEMORY. ESBs already
 * linked into lpesi stay there on failure.
 */
PRIVATE BOOL PASCAL LoadEsiTemp (_LPIPB lpipb, LPESI lpesi, LPB lpbEsiFile,
    LPB lpbIsiFile, PHRESULT phr)
{
    LPESB   lpesb;
    HFILE   hFile;
    ESB     esb;
    HANDLE  hesb;
    HRESULT fRet;
    IPB     ipb;
    LPISI   pIsi = &lpipb->isi;     // Pointer to internal sort info

    /* Copy the internal sort info filename, terminator included */
    MEMCPY (pIsi->aszTempName, lpbIsiFile, lstrlen(lpbIsiFile) + 1);

    /* Read in the external sort buffer info */
    if ((hFile = _lopen (lpbEsiFile, READ)) == HFILE_ERROR)
        return E_NOTEXIST;

    /* Read old IPB info. A short read would leave ipb partly
     * uninitialized, so it is treated as a failure.
     */
    if (_lread (hFile, &ipb, sizeof(IPB)) != sizeof(IPB))
    {
        fRet = SetErrCode (phr, E_FAIL);
        goto exit0;
    }

    /* Transfer meaningful data */
    lpipb->dwIndexedWord = ipb.dwIndexedWord;
    lpipb->dwUniqueWord = ipb.dwUniqueWord;
    lpipb->dwByteCount = ipb.dwByteCount;
    lpipb->dwOccOffbits = ipb.dwOccOffbits;
    lpipb->dwOccExtbits = ipb.dwOccExtbits;
    lpipb->dwMaxFieldId = ipb.dwMaxFieldId;
    lpipb->dwMaxWCount = ipb.dwMaxWCount;
    lpipb->dwMaxOffset = ipb.dwMaxOffset;
    lpipb->dwTotal3bWordLen = ipb.dwTotal3bWordLen;
    lpipb->dwTotal2bWordLen = ipb.dwTotal2bWordLen;
    lpipb->dwTotalUniqueWordLen = ipb.dwTotalUniqueWordLen;
    lpipb->lcTopics = ipb.lcTopics;
    lpipb->dwMaxTopicId = ipb.dwMaxTopicId;
    // lpipb->dwMemAllowed = ipb.dwMemAllowed;
    lpipb->dwMaxRecordSize = ipb.dwMaxRecordSize;
    lpipb->dwMaxEsbRecSize = ipb.dwMaxEsbRecSize;
    lpipb->dwMaxWLen = ipb.dwMaxWLen;
    lpipb->idxf = ipb.idxf;

    fRet = S_OK;
    while ((_lread (hFile, &esb, sizeof(ESB))) == sizeof(ESB))
    {
        if ((hesb = _GLOBALALLOC(GMEM_MOVEABLE | GMEM_ZEROINIT,
            sizeof(ESB))) == NULL)
        {
            fRet = SetErrCode (phr, E_OUTOFMEMORY);
            break;
        }

        lpesb = (LPESB)_GLOBALLOCK (hesb);

        /* Copy the ESB information */
        *lpesb = esb;

        /* Update the structure */
        lpesb->hStruct = hesb;

        /* Push on the front of the list */
        lpesb->lpesbNext = lpesi->lpesbRoot;
        lpesi->lpesbRoot = lpesb;
        lpesi->cesb ++;
    }

exit0:
    /* Single close for every path (the success path used to close the
     * handle twice)
     */
    _lclose (hFile);
    return fRet;
}
/* Allocate and lock the two tables kept in lpipb->wi:
 *  - the sigma table, zero-initialized, dwMaxTopicId + 1 entries
 *  - the log table, cLOG_MAX entries; entries 1 .. cLOG_MAX - 1 are
 *    filled with squared pre-calculated values, entry 0 stays zero
 *
 * Returns S_OK, or E_OUTOFMEMORY (through SetErrCode) when an
 * allocation fails; in that case no table is left allocated.
 */
HRESULT FAR PASCAL AllocSigmaTable (_LPIPB lpipb)
{
    ERRB    errb;
    DWORD   loop;
    float   rLog;

    // Sigma table
    lpipb->wi.hSigma = _GLOBALALLOC (DLLGMEM_ZEROINIT,
        (LCB)((lpipb->dwMaxTopicId + 1) * sizeof (SIGMA)));
    if (lpipb->wi.hSigma == NULL)
        return SetErrCode (&errb, E_OUTOFMEMORY);
    lpipb->wi.hrgsigma = (HRGSIGMA)_GLOBALLOCK (lpipb->wi.hSigma);

    // Log table
    lpipb->wi.hLog = _GLOBALALLOC (DLLGMEM_ZEROINIT,
        (CB)(cLOG_MAX * sizeof (FLOAT)));
    if (lpipb->wi.hLog == NULL)
    {
        FreeHandle (lpipb->wi.hSigma);
        return SetErrCode (&errb, E_OUTOFMEMORY);
    }
    lpipb->wi.lrgrLog = (FLOAT FAR *)_GLOBALLOCK (lpipb->wi.hLog);

    // Initialize the array, from the last entry down to entry 1
    loop = cLOG_MAX - 1;
    while (loop > 0)
    {
#ifndef ISBU_IR_CHANGE
        rLog = (float) log10(cHundredMillion/(double)loop);
#else
        rLog = (float)1.0 / (float)loop;
#endif // ISBU_IR_CHANGE
        lpipb->wi.lrgrLog[loop] = rLog * rLog;
        --loop;
    }
    return(S_OK);
}
|