mirror of https://github.com/tongzx/nt5src
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
2058 lines
66 KiB
2058 lines
66 KiB
#define VER3
|
|
/*************************************************************************
|
|
* *
|
|
* INDEX.C *
|
|
* *
|
|
* Copyright (C) Microsoft Corporation 1990-1994 *
|
|
* All Rights reserved. *
|
|
* *
|
|
**************************************************************************
|
|
* *
|
|
* Module Intent *
|
|
* This is the second stage of the index building process. After all *
|
|
* of the word have been add in stage 1, IndexBuild will be called. *
|
|
* IndexBuild starts the second stage. We will merge-sort the temp file *
|
|
* generated in phase 1 to create a second temp file to send to phase 3. *
|
|
* *
|
|
**************************************************************************
|
|
* *
|
|
* Current Owner: BinhN *
|
|
* *
|
|
**************************************************************************/
|
|
|
|
#include <mvopsys.h>
|
|
#include <mem.h>
|
|
#include <memory.h>
|
|
#include <io.h>
|
|
#include <math.h>
|
|
#include <mvsearch.h>
|
|
#include <orkin.h>
|
|
#include "common.h"
|
|
#include "index.h"
|
|
|
|
#ifdef _DEBUG
|
|
static BYTE NEAR s_aszModule[] = __FILE__; /* Used by error return functions.*/
|
|
#endif
|
|
|
|
|
|
#ifndef _32BIT
|
|
#define ESOUTPUT_BUFFER 0xFFFC // Size of output file buffer
|
|
// This must be at the size of the largest word + 12
|
|
// or word + 14 if OCCF_LENGTH is set
|
|
#else
|
|
#define ESOUTPUT_BUFFER 0xFFFFC // Size of output file buffer
|
|
// This must be at the size of the largest word + 12
|
|
// or word + 14 if OCCF_LENGTH is set
|
|
#endif
|
|
|
|
#define FLUSH_NEW_RECORD 1
|
|
#define FLUSH_EXCEPT_LAST 2
|
|
|
|
/*************************************************************************
|
|
*
|
|
* INTERNAL PRIVATE FUNCTIONS
|
|
*
|
|
* All of them should be declared near
|
|
*
|
|
*************************************************************************/
|
|
PRIVATE HRESULT NEAR PASCAL FillInputBuffer (LPESB, HFPB);
|
|
PRIVATE HRESULT NEAR PASCAL ESFlushBuffer (LPESI);
|
|
PRIVATE HRESULT NEAR PASCAL ESFillBuffer (_LPIPB, LPESB);
|
|
PRIVATE HRESULT NEAR PASCAL ESMemory2Disk (_LPIPB, PMERGEHEADER, int);
|
|
PRIVATE HRESULT NEAR PASCAL ProcessFiles (_LPIPB lpipb, LPMERGEPARAMS);
|
|
PRIVATE int NEAR PASCAL CompareRecordBuffers (_LPIPB, LPB, LPB);
|
|
PRIVATE VOID NEAR PASCAL PQueueUp (_LPIPB, LPESB FAR *, LONG);
|
|
PRIVATE VOID NEAR PASCAL PQueueDown (_LPIPB);
|
|
PRIVATE PTOPICDATA PASCAL NEAR MergeTopicNode (PMERGEHEADER, PTOPICDATA, int);
|
|
PRIVATE VOID NEAR MergeOccurrence (PTOPICDATA, PTOPICDATA, int);
|
|
PRIVATE LPV PASCAL NEAR GetBlockNode (PBLKCOMBO lpBlockCombo);
|
|
PRIVATE VOID PASCAL NEAR SetQueue (LPESI pEsi);
|
|
PRIVATE HRESULT PASCAL NEAR ESBBlockAllocate (_LPIPB lpipb, DWORD lMemSize);
|
|
PRIVATE BOOL PASCAL LoadEsiTemp (_LPIPB lpipb, LPESI lpesi, LPB lpbEsiFile,
|
|
LPB lpbIsiFile, PHRESULT phr);
|
|
PRIVATE VOID PASCAL NEAR SaveEsiTemp (LPIPB lpipb, LPESI lpesi);
|
|
PRIVATE VOID PASCAL NEAR UpdateEsiTemp (LPIPB lpipb);
|
|
PRIVATE BOOL PASCAL NEAR FindTopic(LPMERGEPARAMS lpmp, DWORD dwTopicId);
|
|
|
|
/*************************************************************************
|
|
*
|
|
* INTERNAL PUBLIC FUNCTIONS
|
|
*
|
|
* All of them should be declared far, unless we know they belong to
|
|
* the same segment. They should be included in some include file
|
|
*
|
|
*************************************************************************/
|
|
PUBLIC HRESULT FAR PASCAL FlushTree(_LPIPB lpipb);
|
|
PUBLIC HRESULT FAR PASCAL MergeSortTreeFile (_LPIPB, LPMERGEPARAMS);
|
|
HRESULT FAR PASCAL AllocSigmaTable (_LPIPB lpipb);
|
|
|
|
|
|
/*************************************************************************
|
|
*
|
|
* @doc EXTERNAL API INDEX
|
|
*
|
|
* @func BOOL FAR PASCAL | MVIndexBuild |
|
|
* This function will build an index file based on the information
|
|
* collected in the Index parameter block.
|
|
*
|
|
* @parm HFPB | hSysFile |
|
|
* If it is non-null, it is the handle of an already opened system file.
|
|
* In this case the index is a subfile of the opened system file
|
|
* If it is 0, the index file is a regular DOS file
|
|
*
|
|
* @parm LPIPB | lpipb |
|
|
* Pointer to Index Parameter Block. This structure contains all the
|
|
* information necessary to build the index file
|
|
*
|
|
* @parm HFPB | hfpb |
|
|
* Index hfpb if pstrFile is NULL
|
|
*
|
|
* @parm LPSTR | pstrFile |
|
|
* Index filename if hfpb is NULL
|
|
*
|
|
* @rdesc S_OK, or other errors
|
|
*
|
|
* @xref MVIndexInitiate()
|
|
*************************************************************************/
|
|
/*
|
|
* This operates in three main steps:
|
|
*
|
|
* 1. Send finish to first phase to dump the buffer. Then merge-sort
|
|
* that file into a temporary index. Keep statistics on the information
|
|
* written to this temporary index.
|
|
*
|
|
* 2. Analyze the statistics gathered during the temporary index
|
|
* building phase. This analysis results in the choice of
|
|
* compression processes that will be used in the next step.
|
|
*
|
|
* 3. Permanent index building phase. During this phase, the
|
|
* temporary index is read, compressed like crazy, and written
|
|
* to a permanent index file. Unlike the temporary index, the
|
|
* permanent index contains directory nodes as well as leaf
|
|
* nodes.
|
|
*
|
|
*************************************************************************/
|
|
|
|
PUBLIC HRESULT EXPORT_API FAR PASCAL MVIndexBuild (HFPB hSysFile,
|
|
_LPIPB lpipb, HFPB hfpb, LPSTR pstrFile)
|
|
{
|
|
ERRB errb;
|
|
PHRESULT phr = &errb;
|
|
BYTE bKeyIndex = CKEY_OCC_BASE; // Index into cKey array for compression
|
|
HRESULT fRet; // Return value from this function.
|
|
DWORD loop;
|
|
|
|
// Sanity check
|
|
if (lpipb == NULL || (NULL == hfpb && NULL == pstrFile))
|
|
return E_INVALIDARG;
|
|
|
|
// Flush the internal sort
|
|
// Flushes any records in the tree to disk
|
|
fRet = FlushTree(lpipb);
|
|
|
|
// Free all memory blocks
|
|
FreeISI (lpipb);
|
|
|
|
if (fRet != S_OK)
|
|
return(fRet);
|
|
|
|
// lpipb->lcTopics++; // Adjust to base-1 from base-0
|
|
|
|
if (lpipb->esi.cesb == 0)
|
|
// Nothing to process, there will be no index file
|
|
return S_OK;
|
|
|
|
if (lpipb->idxf & KEEP_TEMP_FILE)
|
|
SaveEsiTemp (lpipb, &lpipb->esi);
|
|
|
|
// If we're doing term-weighting, set up a huge array to contain the
|
|
// sigma terms. The size of the array depends on the total # of topics
|
|
// We also create an array of LOG values to save calculations later
|
|
if (lpipb->idxf & IDXF_NORMALIZE)
|
|
{
|
|
if ((fRet = AllocSigmaTable (lpipb)) != S_OK)
|
|
return(fRet);
|
|
}
|
|
|
|
if ((fRet = MergeSortTreeFile (lpipb, NULL)) != S_OK)
|
|
return SetErrCode (phr, fRet);
|
|
if ((lpipb->idxf & KEEP_TEMP_FILE) == 0)
|
|
FileUnlink (NULL, lpipb->isi.aszTempName, REGULAR_FILE);
|
|
|
|
// If we are doing term-weighting we have to square root all sigma values
|
|
if (lpipb->idxf & IDXF_NORMALIZE)
|
|
{
|
|
// ISBU_IR_CHANGE not needed here 'cos computing sqrt is necessary in both cases
|
|
for (loop = 0; loop < lpipb->dwMaxTopicId + 1; ++loop)
|
|
lpipb->wi.hrgsigma[loop] =
|
|
(float)sqrt ((double)lpipb->wi.hrgsigma[loop]);
|
|
}
|
|
|
|
// Analyze data to get the best compression scheme
|
|
// TopicId
|
|
|
|
// Note: We can't use fixed field compression for topic, since they
|
|
// can be modified by update. A fixed field format may become
|
|
// insufficient to store larger values of topic differences
|
|
|
|
VGetBestScheme(&lpipb->cKey[CKEY_TOPIC_ID],
|
|
&lpipb->BitCount[CKEY_TOPIC_ID][0], lcbitBITSTREAM_ILLEGAL, TRUE);
|
|
|
|
// Occurrence Count
|
|
VGetBestScheme(&lpipb->cKey[CKEY_OCC_COUNT],
|
|
&lpipb->BitCount[CKEY_OCC_COUNT][0], lcbitBITSTREAM_ILLEGAL, TRUE);
|
|
|
|
if (lpipb->occf & OCCF_COUNT)
|
|
{
|
|
VGetBestScheme(&lpipb->cKey[bKeyIndex],
|
|
&lpipb->BitCount[bKeyIndex][0], lcbitBITSTREAM_ILLEGAL, TRUE);
|
|
bKeyIndex++;
|
|
}
|
|
|
|
if (lpipb->occf & OCCF_OFFSET)
|
|
{
|
|
VGetBestScheme(&lpipb->cKey[bKeyIndex],
|
|
&lpipb->BitCount[bKeyIndex][0], lcbitBITSTREAM_ILLEGAL, TRUE);
|
|
bKeyIndex++;
|
|
}
|
|
|
|
if (lpipb->idxf & KEEP_TEMP_FILE)
|
|
UpdateEsiTemp (lpipb);
|
|
|
|
// Build the permanent index
|
|
fRet = BuildBTree(hSysFile, lpipb, lpipb->esi.aszTempName, hfpb, pstrFile);
|
|
if (lpipb->idxf & IDXF_NORMALIZE)
|
|
{
|
|
FreeHandle (lpipb->wi.hSigma);
|
|
FreeHandle (lpipb->wi.hLog);
|
|
}
|
|
return fRet;
|
|
}
|
|
|
|
|
|
/*************************************************************************
|
|
*
|
|
* @doc INDEX
|
|
*
|
|
* @func HRESULT NEAR PASCAL | FillInputBuffer |
|
|
* Fills the buffer by reading from the specified file.
|
|
*
|
|
* @parm PESB | pEsb |
|
|
* Pointer to external sort block to fill
|
|
*
|
|
* @parm HFPB | hFile |
|
|
* Handle to the input file
|
|
*
|
|
* @rdesc S_OK, or errors if failed
|
|
*
|
|
*************************************************************************/
|
|
|
|
HRESULT NEAR PASCAL FillInputBuffer(LPESB pEsb, HFPB hFile)
|
|
{
|
|
ERRB errb;
|
|
DWORD dwBytesRead;
|
|
|
|
// Read in data
|
|
if ((dwBytesRead = FileSeekRead (hFile,
|
|
(LPB)pEsb->lrgbMem, pEsb->lfo, pEsb->dwEsbSize, &errb)) == 0)
|
|
return errb;
|
|
|
|
// Update utility variables
|
|
pEsb->lfo = FoAddDw(pEsb->lfo, dwBytesRead);
|
|
pEsb->dwEsbSize = (CB)dwBytesRead;
|
|
pEsb->ibBuf = 0;
|
|
return S_OK;
|
|
}
|
|
|
|
|
|
/*************************************************************************
|
|
*
|
|
* @doc INDEX
|
|
*
|
|
* @func HRESULT NEAR PASCAL | ESFlushBuffer |
|
|
* Flushes the output buffer to disk and resets it.
|
|
*
|
|
* @parm LPESI | pEsi |
|
|
* Pointer to ESI block
|
|
*
|
|
* @rdesc S_OK, or errors if failed
|
|
*
|
|
*************************************************************************/
|
|
|
|
HRESULT NEAR PASCAL ESFlushBuffer(LPESI pEsi)
|
|
{
|
|
ERRB errb;
|
|
DWORD dwLen;
|
|
|
|
dwLen = pEsi->ibBuf;
|
|
if (dwLen != (DWORD)FileWrite (pEsi->hfpb, pEsi->pOutputBuffer,
|
|
dwLen, &errb))
|
|
return errb;
|
|
|
|
pEsi->lfoTempOffset = FoAddDw (pEsi->lfoTempOffset, dwLen);
|
|
pEsi->ibBuf = 0;
|
|
return S_OK;
|
|
}
|
|
|
|
|
|
/*************************************************************************
|
|
*
|
|
* @doc INDEX
|
|
*
|
|
* @func HRESULT NEAR PASCAL | ESFillBuffer |
|
|
* Updates the input buffer with new data from the input file.
|
|
*
|
|
* @parm _LPIPB | lpipb |
|
|
* Pointer to index parameter block
|
|
*
|
|
* @parm LPESB | pEsb |
|
|
* Pointer to ESB block to be filled
|
|
*
|
|
* @rdesc S_OK, or other errors
|
|
*************************************************************************/
|
|
|
|
HRESULT NEAR PASCAL ESFillBuffer(_LPIPB lpipb, LPESB pEsb)
|
|
{
|
|
DWORD dwBytesRead;
|
|
DWORD dwExtra = pEsb->dwEsbSize - pEsb->ibBuf;
|
|
ERRB errb;
|
|
|
|
// Read either the entire buffer size or whatever is left
|
|
dwBytesRead = DwSubFo (pEsb->lfoMax, pEsb->lfo);
|
|
|
|
if (dwBytesRead > pEsb->dwEsbSize - dwExtra)
|
|
dwBytesRead = pEsb->dwEsbSize - dwExtra;
|
|
|
|
// Save unproccessed information to beginning of buffer
|
|
if (dwExtra)
|
|
MEMMOVE ((LPB)pEsb->lrgbMem, pEsb->lrgbMem + pEsb->ibBuf, dwExtra);
|
|
|
|
// Read in the new data
|
|
if ((dwBytesRead = FileSeekRead (lpipb->isi.hfpb, (LPB)(pEsb->lrgbMem +
|
|
dwExtra), pEsb->lfo, dwBytesRead, &errb)) == 0 &&
|
|
errb != S_OK)
|
|
return(errb);
|
|
|
|
pEsb->lfo = FoAddDw(pEsb->lfo, dwBytesRead);
|
|
pEsb->ibBuf = 0;
|
|
pEsb->dwEsbSize = dwBytesRead + dwExtra;
|
|
return(S_OK);
|
|
|
|
}
|
|
|
|
|
|
/*************************************************************************
|
|
*
|
|
* @doc INTERNAL INDEXING
|
|
*
|
|
* @func HRESULT FAR PASCAL | MergeSortTree File |
|
|
* Sorts the file generated from the tree output into one
|
|
* list of sorted elements.
|
|
*
|
|
* @parm _LPIPB | lpipb |
|
|
* Pointer to index parameter block
|
|
*
|
|
*************************************************************************/
|
|
|
|
PUBLIC HRESULT PASCAL FAR MergeSortTreeFile (_LPIPB lpipb, LPMERGEPARAMS lpmp)
|
|
{
|
|
// Local replacement variables
|
|
LPESI pEsi; // Pointer to external sort info
|
|
LPISI pIsi; // Pointer to internal sort info
|
|
HFPB hInputFile; // Handle to input file
|
|
ERRB errb;
|
|
PHRESULT phr = &errb;
|
|
DWORD cesb; // Input buffer count
|
|
LPESB FAR* lrgPriorityQueue; // Pointer to Priority Queue
|
|
WORD uiQueueSize = 0; // Count of entries in Queue
|
|
DWORD dwBufferSize;
|
|
|
|
// Working variables
|
|
HRESULT fRet;
|
|
LPESB pEsb; // Temp pointer to linked list
|
|
|
|
// Sanity check
|
|
if (lpipb == NULL)
|
|
return E_INVALIDARG;
|
|
|
|
// Variables initialization
|
|
pEsi = &lpipb->esi; // Pointer to external sort info
|
|
pIsi = &lpipb->isi; // Pointer to internal sort info
|
|
cesb = pEsi->cesb; // Input buffer count
|
|
|
|
// Open input file
|
|
if ((pIsi->hfpb = FileOpen (NULL, pIsi->aszTempName,
|
|
REGULAR_FILE, READ, phr)) == NULL)
|
|
return *phr;
|
|
|
|
hInputFile = pIsi->hfpb;
|
|
|
|
// Allocate & fill input buffers
|
|
for (pEsb = pEsi->lpesbRoot; pEsb != NULL; pEsb = pEsb->lpesbNext)
|
|
{
|
|
DWORD cbRead;
|
|
|
|
dwBufferSize = (lpipb->dwMemAllowed * 6) / (8 * pEsi->cesb);
|
|
|
|
// Alocate buffer space
|
|
if ((pEsb->hMem = _GLOBALALLOC (DLLGMEM_ZEROINIT,
|
|
dwBufferSize)) == NULL)
|
|
{
|
|
fRet = E_OUTOFMEMORY;
|
|
exit1:
|
|
FreeEsi (lpipb);
|
|
FileClose(hInputFile);
|
|
pIsi->hfpb = NULL;
|
|
return fRet;
|
|
}
|
|
pEsb->lrgbMem = (LRGB)_GLOBALLOCK (pEsb->hMem);
|
|
|
|
if ((cbRead = DwSubFo(pEsb->lfoMax, pEsb->lfo)) > dwBufferSize)
|
|
cbRead = dwBufferSize;
|
|
|
|
// Fill buffer from disk
|
|
if (FileSeekRead (hInputFile, pEsb->lrgbMem, pEsb->lfo,
|
|
cbRead, phr) != (LONG)cbRead)
|
|
{
|
|
fRet = *phr;
|
|
_GLOBALUNLOCK(pEsb->hMem);
|
|
_GLOBALFREE(pEsb->hMem);
|
|
pEsb->hMem = NULL;
|
|
goto exit1;
|
|
}
|
|
|
|
pEsb->dwEsbSize = cbRead;
|
|
pEsb->ibBuf = 0;
|
|
pEsb->lfo = FoAddDw (pEsb->lfo, cbRead);
|
|
}
|
|
|
|
// Allocate a priority queue array. The size of the array
|
|
// is the number of external sort info blocks plus 1, since
|
|
// location 0 is not used.
|
|
if ((pEsi->hPriorityQueue = _GLOBALALLOC (DLLGMEM_ZEROINIT,
|
|
(DWORD)(pEsi->cesb + 1) * sizeof (LPB))) == NULL)
|
|
{
|
|
fRet = E_OUTOFMEMORY;
|
|
goto exit1;
|
|
}
|
|
pEsi->lrgPriorityQueue =
|
|
(LPESB FAR *)_GLOBALLOCK (pEsi->hPriorityQueue);
|
|
lrgPriorityQueue = pEsi->lrgPriorityQueue;
|
|
|
|
// Attach input buffers to Priority Queue
|
|
// Remebering to start at offset 1 NOT 0 (PQ's have a null 0 element)
|
|
for (pEsb = pEsi->lpesbRoot; pEsb != NULL; pEsb = pEsb->lpesbNext)
|
|
{
|
|
lrgPriorityQueue[++uiQueueSize] = pEsb;
|
|
PQueueUp (lpipb, lrgPriorityQueue, uiQueueSize);
|
|
}
|
|
pEsi->uiQueueSize = uiQueueSize;
|
|
|
|
// Clear largest Record Size field
|
|
// lpipb->dwMaxRecordSize = 0;
|
|
|
|
fRet = ProcessFiles(lpipb, lpmp);
|
|
_GLOBALUNLOCK (pEsi->hPriorityQueue);
|
|
_GLOBALFREE (pEsi->hPriorityQueue);
|
|
pEsi->hPriorityQueue = NULL;
|
|
goto exit1;
|
|
}
|
|
|
|
|
|
/*************************************************************************
|
|
*
|
|
* @doc INDEX
|
|
*
|
|
* @func HRESULT NEAR PASCAL | ESMemory2Disk |
|
|
* Copies temp record to output buffer.
|
|
*
|
|
* @parm _LPIPB | lpipb |
|
|
* Pointer to index parameter block
|
|
*
|
|
* @parm PMERGEHEADER | pHeader |
|
|
* Pointer to header to flush
|
|
*
|
|
* @parm int | flag |
|
|
* - if FLUSH_NEW_RECORD, the flush is due to new record, we flush
|
|
* everything, else we may do a partial flush only
|
|
* - if FLUSH_EXCEPT_LAST, we don't flush the last topic
|
|
*
|
|
* @rdesc S_OK, or other errors
|
|
*************************************************************************/
|
|
PRIVATE HRESULT NEAR PASCAL ESMemory2Disk
|
|
(_LPIPB lpipb, PMERGEHEADER pHeader, int flag)
|
|
{
|
|
// Local replacement variables
|
|
LPESI pEsi = &lpipb->esi;
|
|
LPB pMax = pEsi->pOutputBuffer + ESOUTPUT_BUFFER - 2 * sizeof(DWORD);
|
|
DWORD dwOccCount;
|
|
LPB pOutputBuffer = pEsi->pOutputBuffer;
|
|
ERRB errb;
|
|
PHRESULT phr = &errb;
|
|
HRESULT fRet;
|
|
BYTE cNumOcc;
|
|
OCCF occf;
|
|
|
|
// Working variables
|
|
PTOPICDATA pTopic; // Temp var to traverse the topic linked list
|
|
DWORD loop, sub; // Various loop counters
|
|
DWORD dwTopicIdDelta;
|
|
DWORD OccDelta[5]; // Delta base for all occurrence data
|
|
DWORD LastOcc[5];
|
|
FLOAT rLog; // (1/n) - IDXF_NORMALIZE is set
|
|
FLOAT rLogSquared; // (1/n)^2 - IDXF_NORMALIZE is set
|
|
LPB pStart;
|
|
LPB pCurPtr;
|
|
|
|
// Set up pointers
|
|
pStart = pCurPtr = pOutputBuffer + pEsi->ibBuf;
|
|
|
|
// Variable replacement
|
|
occf = lpipb->occf;
|
|
|
|
// Size of string
|
|
loop = pHeader->dwStrLen;
|
|
|
|
// Make sure the string, FileId, Topic Count and Record Size fit
|
|
// We add in and extra DWORD for 5 byte compresssion problems and
|
|
// to cover the Word Length if there is one.
|
|
if ((pStart + loop + sizeof (DWORD) * 5) >= pMax)
|
|
{
|
|
if ((fRet = ESFlushBuffer (pEsi)) != S_OK)
|
|
return(fRet);
|
|
|
|
pStart = pCurPtr = pOutputBuffer;
|
|
}
|
|
|
|
if (pHeader->fEmitRecord == FALSE)
|
|
{
|
|
// If we never emitted the record header then we emitted now
|
|
// Reset the flag
|
|
pHeader->fEmitRecord = TRUE;
|
|
|
|
// Skip record size field
|
|
pCurPtr += sizeof (DWORD);
|
|
|
|
// Pascal string
|
|
MEMCPY (pCurPtr, pHeader->lpbWord, loop);
|
|
pCurPtr += loop;
|
|
|
|
// Word Length
|
|
if (occf & OCCF_LENGTH)
|
|
pCurPtr += CbBytePack (pCurPtr, pHeader->dwWordLength);
|
|
|
|
// FieldId
|
|
if (occf & OCCF_FIELDID)
|
|
pCurPtr += CbBytePack (pCurPtr, pHeader->dwFieldId);
|
|
|
|
// Topic Count
|
|
if (flag & FLUSH_NEW_RECORD)
|
|
{
|
|
// This is the whole record. dwTopicCount value is correct
|
|
SETLONG((LPUL)pCurPtr, pHeader->dwTopicCount);
|
|
}
|
|
else
|
|
{
|
|
// Save the offset for backpatching
|
|
pHeader->foTopicCount = FoAddDw (pEsi->lfoTempOffset,
|
|
(DWORD)(pCurPtr - pOutputBuffer));
|
|
pHeader->pTopicCount = pCurPtr;
|
|
}
|
|
pCurPtr += sizeof(DWORD);
|
|
|
|
// Write Record Length
|
|
*(LPUL)pStart = (DWORD)(pCurPtr - pStart - sizeof (DWORD));
|
|
|
|
}
|
|
else if (flag & FLUSH_NEW_RECORD)
|
|
{
|
|
// We emit the record before, since pheader->fEmitRecord == TRUE
|
|
// We need to backpatch the topic count
|
|
if (FoCompare(pHeader->foTopicCount, pEsi->lfoTempOffset) >= 0)
|
|
{
|
|
// Everything is still in memory, just do local backpatch
|
|
SETLONG((LPUL)(pHeader->pTopicCount), pHeader->dwTopicCount);
|
|
}
|
|
else
|
|
{
|
|
// Do backpatch in the file by seeking back to the right
|
|
// place
|
|
if (FileSeekWrite(pEsi->hfpb, &pHeader->dwTopicCount,
|
|
pHeader->foTopicCount, sizeof(DWORD), phr) != sizeof(DWORD))
|
|
return(*phr);
|
|
|
|
// Restore the current file offset
|
|
FileSeek(pEsi->hfpb, pEsi->lfoTempOffset, 0, phr);
|
|
}
|
|
}
|
|
|
|
// Convert all occ data to delta values & compress them
|
|
pTopic = pHeader->pTopic;
|
|
cNumOcc = lpipb->ucNumOccDataFields;
|
|
|
|
for (; pTopic;)
|
|
{
|
|
POCCDATA pOccData;
|
|
PTOPICDATA pReleased;
|
|
|
|
if ((flag & FLUSH_EXCEPT_LAST) && pTopic->pNext == NULL)
|
|
break;
|
|
|
|
// Set TopicId delta
|
|
dwTopicIdDelta = pTopic->dwTopicId - pHeader->dwLastTopicId;
|
|
pHeader->dwLastTopicId = pTopic->dwTopicId;
|
|
|
|
// Save bit size to the statistics array
|
|
lpipb->BitCount[CKEY_TOPIC_ID][CbitBitsDw (dwTopicIdDelta)] += 1;
|
|
|
|
// Write TopicID Delta
|
|
if (pCurPtr > pMax)
|
|
{
|
|
pEsi->ibBuf = (DWORD)(pCurPtr - pOutputBuffer);
|
|
if ((fRet = ESFlushBuffer (pEsi)) != S_OK)
|
|
return(fRet);
|
|
pCurPtr = pOutputBuffer;
|
|
}
|
|
pCurPtr += CbBytePack (pCurPtr, dwTopicIdDelta);
|
|
|
|
if (cNumOcc == 0)
|
|
{
|
|
pReleased = pTopic;
|
|
pTopic = pTopic->pNext;
|
|
|
|
// Add the released to the freed linked list
|
|
pReleased->pNext = (PTOPICDATA)lpipb->TopicBlock.pFreeList;
|
|
lpipb->TopicBlock.pFreeList = (PLIST)pReleased;
|
|
lpipb->TopicBlock.dwCount--;
|
|
continue;
|
|
}
|
|
|
|
if (dwOccCount = pTopic->dwOccCount)
|
|
{
|
|
|
|
// Reset count occdata delta for every new topic
|
|
MEMSET (OccDelta, 0, 5 * sizeof (DWORD));
|
|
MEMSET (LastOcc, 0, 5 * sizeof (DWORD));
|
|
|
|
// Copy Occurrence Count
|
|
if (pCurPtr > pMax)
|
|
{
|
|
pEsi->ibBuf = (DWORD)(pCurPtr - pOutputBuffer);
|
|
if ((fRet = ESFlushBuffer (pEsi)) != S_OK)
|
|
return(fRet);
|
|
pCurPtr = pOutputBuffer;
|
|
}
|
|
pCurPtr += CbBytePack (pCurPtr, dwOccCount);
|
|
|
|
// Save bit size to the statistics array
|
|
lpipb->BitCount[1][CbitBitsDw (dwOccCount)] += 1;
|
|
|
|
// Repeat for each occurrence block
|
|
for (pOccData = pTopic->pOccData,
|
|
sub = dwOccCount; sub > 0 && pOccData; --sub)
|
|
{
|
|
LPDW lpDw;
|
|
int iIndex;
|
|
POCCDATA pReleased;
|
|
|
|
if (pCurPtr + 5 * sizeof(DWORD) > pMax)
|
|
{
|
|
pEsi->ibBuf = (DWORD)(pCurPtr - pOutputBuffer);
|
|
if ((fRet = ESFlushBuffer (pEsi)) != S_OK)
|
|
return(fRet);
|
|
pStart = pCurPtr = pOutputBuffer;
|
|
}
|
|
|
|
lpDw = &pOccData->OccData[0];
|
|
iIndex = CKEY_OCC_BASE;
|
|
|
|
if (occf & OCCF_COUNT)
|
|
{
|
|
// Convert each value to a delta value
|
|
OccDelta[iIndex] = *lpDw - LastOcc[iIndex];
|
|
LastOcc[iIndex] = *lpDw;
|
|
lpDw++;
|
|
|
|
// Save to bit size to the statistics array
|
|
lpipb->BitCount[iIndex][CbitBitsDw (OccDelta[iIndex])] += 1;
|
|
|
|
// Compress occurrence field to buffer
|
|
pCurPtr += CbBytePack (pCurPtr, OccDelta[iIndex]);
|
|
iIndex++;
|
|
}
|
|
|
|
if (occf & OCCF_OFFSET)
|
|
{
|
|
// Convert each value to a delta value
|
|
OccDelta[iIndex] = *lpDw - LastOcc[iIndex];
|
|
LastOcc[iIndex] = *lpDw;
|
|
lpDw++;
|
|
|
|
// Save to bit size to the statistics array
|
|
lpipb->BitCount[iIndex][CbitBitsDw (OccDelta[iIndex])] += 1;
|
|
|
|
// Compress occurrence field to buffer
|
|
pCurPtr += CbBytePack (pCurPtr, OccDelta[iIndex]);
|
|
iIndex++;
|
|
}
|
|
|
|
pReleased = pOccData;
|
|
pOccData = pOccData->pNext;
|
|
pReleased->pNext = (POCCDATA)lpipb->OccBlock.pFreeList;
|
|
lpipb->OccBlock.pFreeList = (PLIST)pReleased;
|
|
lpipb->OccBlock.dwCount--;
|
|
}
|
|
|
|
// Check for mismatch between count and links
|
|
#ifdef _DEBUG
|
|
if (sub)
|
|
SetErrCode (phr, E_ASSERT);
|
|
|
|
if (pOccData)
|
|
SetErrCode (phr, E_ASSERT);
|
|
#endif
|
|
}
|
|
|
|
// Update the sigma values if we are doing term weighing
|
|
// erinfox: remove test against flag. Sometimes sigma never
|
|
// got calculated for a topic and that caused a divide by zero
|
|
// later on.
|
|
if ((lpipb->idxf & IDXF_NORMALIZE) /* && (flag & FLUSH_NEW_RECORD)*/)
|
|
{
|
|
|
|
if (pTopic->dwTopicId > lpipb->dwMaxTopicId)
|
|
{
|
|
// Incease the size of the sigma table. This can happen when
|
|
// updating with new topics
|
|
_GLOBALUNLOCK (lpipb->wi.hSigma);
|
|
if ((lpipb->wi.hSigma = _GLOBALREALLOC (lpipb->wi.hSigma,
|
|
(pTopic->dwTopicId + 1) * sizeof(float),
|
|
DLLGMEM_ZEROINIT)) == NULL)
|
|
{
|
|
return (SetErrCode(phr, E_OUTOFMEMORY));
|
|
}
|
|
lpipb->wi.hrgsigma = (HRGSIGMA)_GLOBALLOCK(lpipb->wi.hSigma);
|
|
lpipb->dwMaxTopicId = pTopic->dwTopicId ;
|
|
}
|
|
|
|
if (lpipb->bState == INDEXING_STATE)
|
|
{
|
|
#ifndef ISBU_IR_CHANGE
|
|
FLOAT fOcc;
|
|
|
|
if (pHeader->dwTopicCount >= cLOG_MAX)
|
|
{
|
|
// we have to guard against the possibility of the log resulting in
|
|
// a value <= 0.0. Very rare, but possible in the future. This happens
|
|
// if dwTopicCount approaches or exceeds the N we are using (N == 100 million)
|
|
if (pHeader->dwTopicCount >= cNintyFiveMillion)
|
|
rLog = cVerySmallWt; // log10(100 mil/ 95 mil) == 0.02
|
|
else
|
|
//rLog = (float) log10(cHundredMillion/(double)pHeader->dwTopicCount);
|
|
rLog = (float) (8.0 - log10((double)pHeader->dwTopicCount));
|
|
|
|
rLogSquared = rLog*rLog;
|
|
}
|
|
else
|
|
rLogSquared = lpipb->wi.lrgrLog[(WORD)pHeader->dwTopicCount];
|
|
|
|
// Update sigma value
|
|
// NOTE : We are bounding dwOccCount by a value of eTFThreshold
|
|
// The RHS of the equation below has an upperbound of 2 power 30.
|
|
fOcc = (float) min(cTFThreshold, dwOccCount);
|
|
lpipb->wi.hrgsigma[pTopic->dwTopicId] += (SIGMA) fOcc*fOcc*rLogSquared;
|
|
//(SIGMA) (fOcc * fOcc * rLogSquared/(float)0xFFFF);
|
|
#else
|
|
// Failed for update : UNDONE
|
|
if (pHeader->dwTopicCount >= cLOG_MAX)
|
|
{
|
|
rLog = (float)1.0 / (float)pHeader->dwTopicCount;
|
|
rLogSquared = rLog * rLog;
|
|
}
|
|
else
|
|
rLogSquared = lpipb->wi.lrgrLog[(WORD)pHeader->dwTopicCount];
|
|
// Update sigma value
|
|
lpipb->wi.hrgsigma[pTopic->dwTopicId] +=
|
|
(SIGMA)(dwOccCount * dwOccCount) * rLogSquared;
|
|
#endif // ISBU_IR_CHANGE
|
|
}
|
|
}
|
|
pReleased = pTopic;
|
|
pTopic = pTopic->pNext;
|
|
|
|
// Add the released to the freed linked list
|
|
pReleased->pNext = (PTOPICDATA)lpipb->TopicBlock.pFreeList;
|
|
lpipb->TopicBlock.pFreeList = (PLIST)pReleased;
|
|
lpipb->TopicBlock.dwCount--;
|
|
}
|
|
|
|
pHeader->pTopic = pHeader->pLastTopic = pTopic;
|
|
|
|
// Update output offset
|
|
pEsi->ibBuf = (DWORD)(pCurPtr - pOutputBuffer);
|
|
|
|
return(S_OK);
|
|
|
|
}
|
|
|
|
|
|
/*************************************************************************
|
|
*
|
|
* @doc INDEX
|
|
*
|
|
* @func HRESULT NEAR PASCAL | ProcessFiles |
|
|
* Sorts the file generated from the tree output into one
|
|
* list of sorted elements.
|
|
*
|
|
* @parm _LPIPB | lpipb |
|
|
* Pointer to index parameter block
|
|
*
|
|
* @rdesc S_OK, or errors if failed
|
|
*
|
|
* @notes
|
|
* This function processed the input buffers and uses dynamic
|
|
* memory allocation to sort each word as it come in. Once a
|
|
* word stops repeating, it is flush to disk and the memory is
|
|
* reset for the next word.
|
|
*************************************************************************/
|
|
|
|
HRESULT NEAR PASCAL ProcessFiles(_LPIPB lpipb, LPMERGEPARAMS lpmp)
|
|
{
|
|
// Local replacement variables
|
|
LPISI pIsi = &lpipb->isi;
|
|
LPESI pEsi = &lpipb->esi;
|
|
LPESB FAR * lrgPriorityQueue = pEsi->lrgPriorityQueue;
|
|
LONG uiQueueSize = pEsi->uiQueueSize;
|
|
LPB pQueuePtr;
|
|
WORD cNumOcc = lpipb->ucNumOccDataFields;
|
|
WORD OccSize = sizeof(OCCDATA) - sizeof(DWORD) + cNumOcc *
|
|
sizeof(DWORD);
|
|
int occf = lpipb->occf;
|
|
LPB pBufMax;
|
|
HANDLE hWord;
|
|
LPB lpbWord;
|
|
DWORD dwUniqueTerm = 0; // Used for calback function
|
|
#ifdef _DEBUG
|
|
BYTE astWord[300];
|
|
BYTE astLastWord[300];
|
|
#endif
|
|
|
|
// Working variables
|
|
PMERGEHEADER pHeader; // Pointer to merge header
|
|
LPESB pEsb; // Temp ESB pointer
|
|
PTOPICDATA pNewTopic; // Used to create new topic
|
|
DWORD loop; // Temp loop counter
|
|
HANDLE hHeader;
|
|
HFPB hOutputFile; // Handle to output file
|
|
int fRet; // Return value
|
|
USHORT uStringSize; // Size of Psacal String
|
|
ERRB errb;
|
|
PHRESULT phr = &errb;
|
|
|
|
static long Count = 0;
|
|
|
|
// Setup Block Manager
|
|
if ((fRet = ESBBlockAllocate (lpipb, lpipb->dwMemAllowed / 4)) != S_OK)
|
|
return(fRet);
|
|
|
|
// Allocate output buffer
|
|
if ((pEsi->hBuf = _GLOBALALLOC
|
|
(DLLGMEM_ZEROINIT, ESOUTPUT_BUFFER)) == NULL)
|
|
{
|
|
fRet = E_OUTOFMEMORY;
|
|
exit1:
|
|
return fRet;
|
|
}
|
|
|
|
pEsi->pOutputBuffer = (LPB)_GLOBALLOCK (pEsi->hBuf);
|
|
pEsi->ibBuf = 0;
|
|
|
|
// Create output file
|
|
GETTEMPFILENAME ((char)0, "eso", 0, pEsi->aszTempName);
|
|
|
|
if ((pEsi->hfpb = FileOpen(NULL, pEsi->aszTempName,
|
|
REGULAR_FILE, WRITE, &errb)) == NULL)
|
|
{
|
|
fRet = E_FILECREATE;
|
|
exit2:
|
|
FreeHandle (pEsi->hBuf);
|
|
pEsi->hBuf = NULL;
|
|
goto exit1;
|
|
}
|
|
hOutputFile = pEsi->hfpb;
|
|
|
|
// Setup new record in memory
|
|
if ((hHeader = _GLOBALALLOC
|
|
(DLLGMEM_ZEROINIT, sizeof (MERGEHEADER))) == NULL)
|
|
{
|
|
fRet = E_OUTOFMEMORY;
|
|
exit3:
|
|
FileClose (hOutputFile);
|
|
goto exit2;
|
|
}
|
|
pHeader = (PMERGEHEADER)_GLOBALLOCK (hHeader);
|
|
|
|
// Allocate buffer for a word, which include 64K + sizeof(WORD) + slack
|
|
if ((hWord = _GLOBALALLOC(DLLGMEM_ZEROINIT, 0x10004)) == NULL)
|
|
{
|
|
exit4:
|
|
_GLOBALUNLOCK(hHeader);
|
|
_GLOBALFREE (hHeader);
|
|
goto exit3;
|
|
}
|
|
|
|
pHeader->lpbWord = lpbWord = (LPB)_GLOBALLOCK(hWord);
|
|
|
|
#ifdef _DEBUG
|
|
astWord[0] = 0;
|
|
#endif
|
|
|
|
// Process all input buffers
|
|
do
|
|
{
|
|
DWORD dwWordLength;
|
|
DWORD dwFieldId;
|
|
LPB lpStart;
|
|
DWORD dwTopicCount;
|
|
|
|
#ifdef _DEBUG
|
|
Count++;
|
|
#endif
|
|
|
|
// Grab smallest record and send to buffer
|
|
pEsb = lrgPriorityQueue[1];
|
|
|
|
// Set the fill limit
|
|
pBufMax = pEsb->lrgbMem + pEsb->dwEsbSize - 256;
|
|
|
|
if ((pQueuePtr = pEsb->lrgbMem + pEsb->ibBuf) >= pBufMax)
|
|
{
|
|
if ((fRet = ESFillBuffer (lpipb, pEsb)) != S_OK)
|
|
goto exit4;
|
|
|
|
pQueuePtr = pEsb->lrgbMem;
|
|
}
|
|
|
|
// Save the record beginning
|
|
pQueuePtr += sizeof(DWORD);
|
|
lpStart = pQueuePtr;
|
|
|
|
// Get string
|
|
uStringSize = GETWORD ((LPUW)pQueuePtr) + sizeof (SHORT);
|
|
|
|
pQueuePtr += uStringSize;
|
|
#ifdef _DEBUG
|
|
if (pQueuePtr > pEsb->lrgbMem + pEsb->dwEsbSize)
|
|
SetErrCode (phr, E_ASSERT);
|
|
#endif
|
|
|
|
if (occf & OCCF_LENGTH)
|
|
pQueuePtr += CbByteUnpack (&dwWordLength, pQueuePtr);
|
|
else
|
|
dwWordLength = 0;
|
|
|
|
#ifdef _DEBUG
|
|
if (pQueuePtr >= pEsb->lrgbMem + pEsb->dwEsbSize)
|
|
SetErrCode (phr, E_ASSERT);
|
|
#endif
|
|
if (occf & OCCF_FIELDID)
|
|
pQueuePtr += CbByteUnpack (&dwFieldId, pQueuePtr);
|
|
else
|
|
dwFieldId = 0;
|
|
|
|
#ifdef _DEBUG
|
|
if (pQueuePtr > pEsb->lrgbMem + pEsb->dwEsbSize)
|
|
SetErrCode (phr, E_ASSERT);
|
|
#endif
|
|
// Is the word in the buffer equal to the new word?
|
|
// If it is not then flush the old word
|
|
if (*(LPUW)pHeader->lpbWord)
|
|
{
|
|
fRet = (StrCmp2BytePascal (pHeader->lpbWord, lpStart)
|
|
|| dwWordLength > pHeader->dwWordLength);
|
|
if (fRet == 0) // Same word, reduce the unique words count
|
|
lpipb->dwUniqueWord--;
|
|
if (fRet || dwFieldId > pHeader->dwFieldId)
|
|
{
|
|
#if defined(_DEBUG) && !defined(_MAC)
|
|
// Word out of order
|
|
if (StrCmp2BytePascal (pHeader->lpbWord, lpStart) > 0)
|
|
assert(FALSE);
|
|
#endif
|
|
if ((fRet = ESMemory2Disk (lpipb, pHeader, TRUE)) != S_OK)
|
|
return(fRet);
|
|
|
|
// Reset pHeader
|
|
MEMSET (pHeader, 0, sizeof (MERGEHEADER));
|
|
|
|
// Set the word buffer
|
|
pHeader->lpbWord = lpbWord;
|
|
#ifdef _DEBUG
|
|
STRCPY(astLastWord, astWord);
|
|
#endif
|
|
// Call the user callback every once in a while
|
|
if (!(++dwUniqueTerm % 8192L)
|
|
&& (lpipb->CallbackInfo.dwFlags & ERRFLAG_STATUS))
|
|
{
|
|
PFCALLBACK_MSG pCallbackInfo = &lpipb->CallbackInfo;
|
|
CALLBACKINFO Info;
|
|
|
|
Info.dwPhase = 2;
|
|
Info.dwIndex = (DWORD)((float)dwUniqueTerm / lpipb->dwUniqueWord * 100);
|
|
fRet = (*pCallbackInfo->MessageFunc)
|
|
(ERRFLAG_STATUS, pCallbackInfo->pUserData, &Info);
|
|
if (S_OK != fRet)
|
|
goto exit5;
|
|
}
|
|
}
|
|
}
|
|
|
|
// Update the data
|
|
pHeader->dwFieldId = dwFieldId;
|
|
pHeader->dwWordLength = dwWordLength;
|
|
pHeader->dwStrLen = uStringSize;
|
|
|
|
// Copy word and header info
|
|
MEMCPY (pHeader->lpbWord, (LPB)lpStart, uStringSize);
|
|
#ifdef _DEBUG
|
|
if (uStringSize >= 300)
|
|
uStringSize = 300;
|
|
MEMCPY (astWord, lpStart + 2, uStringSize - 2);
|
|
astWord[uStringSize - 2] = 0;
|
|
//if (STRCMP(astWord, "87db") == 0)
|
|
// _asm int 3;
|
|
#endif
|
|
|
|
pQueuePtr += CbByteUnpack (&dwTopicCount, pQueuePtr);
|
|
pHeader->dwTopicCount += dwTopicCount;
|
|
|
|
#ifdef _DEBUG
|
|
if (pQueuePtr > pEsb->lrgbMem + pEsb->dwEsbSize)
|
|
SetErrCode (phr, E_ASSERT);
|
|
#endif
|
|
pNewTopic = NULL;
|
|
|
|
// Copy topic(s) to memory
|
|
for (loop = dwTopicCount; loop > 0; loop--)
|
|
{
|
|
DWORD dwTopicId;
|
|
|
|
// Get the topic id
|
|
pQueuePtr += CbByteUnpack (&dwTopicId, pQueuePtr);
|
|
|
|
// kevynct: if there is a to-delete list, and this topic is on it, skip it
|
|
if (lpmp && FindTopic(lpmp, dwTopicId))
|
|
{
|
|
// Get the occ count
|
|
if (cNumOcc)
|
|
{
|
|
DWORD dwOccCount;
|
|
DWORD dwT;
|
|
|
|
pQueuePtr += CbByteUnpack (&dwOccCount, pQueuePtr);
|
|
#ifdef _DEBUG
|
|
if (pQueuePtr > pEsb->lrgbMem + pEsb->dwEsbSize)
|
|
SetErrCode (phr, E_ASSERT);
|
|
#endif
|
|
for (; dwOccCount > 0; dwOccCount--)
|
|
{
|
|
// Fill up the buffer if run out of data
|
|
if (pQueuePtr >= pBufMax)
|
|
{
|
|
pEsb->ibBuf = (DWORD)(pQueuePtr - pEsb->lrgbMem);
|
|
if ((fRet = ESFillBuffer (lpipb, pEsb)) != S_OK)
|
|
goto exit5;
|
|
pQueuePtr = pEsb->lrgbMem;
|
|
}
|
|
|
|
switch (cNumOcc)
|
|
{
|
|
case 5:
|
|
pQueuePtr += CbByteUnpack (&dwT, pQueuePtr);
|
|
case 4:
|
|
pQueuePtr += CbByteUnpack (&dwT, pQueuePtr);
|
|
case 3:
|
|
pQueuePtr += CbByteUnpack (&dwT, pQueuePtr);
|
|
case 2:
|
|
pQueuePtr += CbByteUnpack (&dwT, pQueuePtr);
|
|
case 1:
|
|
pQueuePtr += CbByteUnpack (&dwT, pQueuePtr);
|
|
}
|
|
|
|
#ifdef _DEBUG
|
|
if (pQueuePtr > pEsb->lrgbMem + pEsb->dwEsbSize)
|
|
SetErrCode (phr, E_ASSERT);
|
|
#endif
|
|
} // end occ loop
|
|
} // end if occ non-zero
|
|
|
|
pHeader->dwTopicCount--;
|
|
continue;
|
|
} // end of to-delete condition
|
|
|
|
// Allocate a topicdata node
|
|
if ((pNewTopic == NULL) &&
|
|
(pNewTopic = GetBlockNode (&lpipb->TopicBlock)) == NULL)
|
|
{
|
|
if ((fRet = ESMemory2Disk(lpipb, pHeader, FLUSH_EXCEPT_LAST)) != S_OK)
|
|
{
|
|
exit5:
|
|
_GLOBALUNLOCK(hWord);
|
|
_GLOBALFREE(hWord);
|
|
goto exit4;
|
|
}
|
|
|
|
if ((pNewTopic = GetBlockNode (&lpipb->TopicBlock)) == NULL)
|
|
{
|
|
// Extremely weird, since we just release a bunch of
|
|
// memory
|
|
fRet = E_ASSERT;
|
|
goto exit5;
|
|
}
|
|
}
|
|
|
|
pNewTopic->dwTopicId = dwTopicId;
|
|
|
|
#ifdef _DEBUG
|
|
if (pQueuePtr > pEsb->lrgbMem + pEsb->dwEsbSize)
|
|
SetErrCode (phr, E_ASSERT);
|
|
#endif
|
|
// Set the other fields
|
|
pNewTopic->pOccData = pNewTopic->pLastOccData = NULL;
|
|
|
|
// Get the occ count
|
|
if (cNumOcc)
|
|
{
|
|
DWORD dwOccCount;
|
|
POCCDATA pOccData;
|
|
LPDW lpDw;
|
|
|
|
pQueuePtr += CbByteUnpack (&pNewTopic->dwOccCount,
|
|
pQueuePtr);
|
|
|
|
#ifdef _DEBUG
|
|
if (pQueuePtr > pEsb->lrgbMem + pEsb->dwEsbSize)
|
|
SetErrCode (phr, E_ASSERT);
|
|
#endif
|
|
for (dwOccCount = pNewTopic->dwOccCount; dwOccCount > 0;
|
|
dwOccCount--)
|
|
{
|
|
// Get all occ fields
|
|
if ((pOccData = (POCCDATA)GetBlockNode
|
|
(&lpipb->OccBlock)) == NULL )
|
|
{
|
|
if ((fRet = ESMemory2Disk(lpipb, pHeader,
|
|
FLUSH_EXCEPT_LAST)) != S_OK)
|
|
goto exit5;
|
|
|
|
if ((pOccData =
|
|
(POCCDATA)GetBlockNode(&lpipb->OccBlock)) == NULL)
|
|
{
|
|
// Extremely weird, since we just release a bunch of
|
|
// memory, unless there are so many duplicates of the same word
|
|
// in the topic
|
|
|
|
fRet = E_TOOMANYDUPS;
|
|
goto exit5;
|
|
}
|
|
}
|
|
|
|
// Fill up the buffer if run out of data
|
|
if (pQueuePtr >= pBufMax)
|
|
{
|
|
pEsb->ibBuf = (DWORD) (pQueuePtr - pEsb->lrgbMem);
|
|
if ((fRet = ESFillBuffer (lpipb, pEsb)) != S_OK)
|
|
goto exit5;
|
|
pQueuePtr = pEsb->lrgbMem;
|
|
}
|
|
|
|
lpDw = (LPDW)&pOccData->OccData;
|
|
switch (cNumOcc)
|
|
{
|
|
case 5:
|
|
pQueuePtr += CbByteUnpack (lpDw++, pQueuePtr);
|
|
case 4:
|
|
pQueuePtr += CbByteUnpack (lpDw++, pQueuePtr);
|
|
case 3:
|
|
pQueuePtr += CbByteUnpack (lpDw++, pQueuePtr);
|
|
case 2:
|
|
pQueuePtr += CbByteUnpack (lpDw++, pQueuePtr);
|
|
case 1:
|
|
pQueuePtr += CbByteUnpack (lpDw++, pQueuePtr);
|
|
}
|
|
|
|
#ifdef _DEBUG
|
|
if (pQueuePtr > pEsb->lrgbMem + pEsb->dwEsbSize)
|
|
SetErrCode (phr, E_ASSERT);
|
|
#endif
|
|
// Attach to the linked list
|
|
// Note that we are assumimg that the occurrences are
|
|
// already sorted, so no checking is done here
|
|
if (pNewTopic->pOccData == NULL)
|
|
{
|
|
pNewTopic->pLastOccData = pNewTopic->pOccData
|
|
= pOccData;
|
|
}
|
|
else
|
|
{
|
|
// Add to the end of the linked list
|
|
pNewTopic->pLastOccData->pNext = pOccData;
|
|
pNewTopic->pLastOccData = pOccData;
|
|
}
|
|
pOccData->pNext = NULL;
|
|
}
|
|
}
|
|
|
|
if (pNewTopic = MergeTopicNode (pHeader, pNewTopic, cNumOcc))
|
|
pHeader->dwTopicCount --;
|
|
}
|
|
|
|
// Update the offset
|
|
pEsb->ibBuf = (DWORD) (pQueuePtr - pEsb->lrgbMem);
|
|
|
|
// If next record doesn't fit in buffer
|
|
// Then reset to beginning and load data
|
|
if (pEsb->dwEsbSize - pEsb->ibBuf <= sizeof(DWORD) ||
|
|
pEsb->dwEsbSize - pEsb->ibBuf <= GETLONG((LPUL)pQueuePtr) +
|
|
2 * sizeof(DWORD))
|
|
{
|
|
if ((fRet = ESFillBuffer (lpipb, pEsb)) != S_OK)
|
|
goto exit4;
|
|
}
|
|
|
|
// Adjust priority queue
|
|
if (uiQueueSize > 1)
|
|
{
|
|
if (DwSubFo (pEsb->lfo, pEsb->lfoMax) != 0 &&
|
|
pEsb->ibBuf >= pEsb->dwEsbSize)
|
|
{
|
|
// Replace first record with last
|
|
lrgPriorityQueue[1] = lrgPriorityQueue[uiQueueSize];
|
|
lrgPriorityQueue[uiQueueSize] = NULL;
|
|
uiQueueSize--;
|
|
pEsi->uiQueueSize = uiQueueSize;
|
|
}
|
|
#if 0
|
|
else
|
|
{ // If the stream still has input add it back into the Queue
|
|
lrgPriorityQueue[uiQueueSize] = pEsb;
|
|
PQueueUp(lpipb, lrgPriorityQueue, uiQueueSize);
|
|
}
|
|
#endif
|
|
PQueueDown(lpipb); // Maintain sort order
|
|
}
|
|
else if (DwSubFo (pEsb->lfo, pEsb->lfoMax) != 0 &&
|
|
pEsb->ibBuf >= pEsb->dwEsbSize)
|
|
{
|
|
uiQueueSize--;
|
|
pEsi->uiQueueSize = uiQueueSize;
|
|
if ((fRet = ESMemory2Disk (lpipb, pHeader, FLUSH_NEW_RECORD)) != S_OK)
|
|
return(fRet);
|
|
}
|
|
} while (uiQueueSize);
|
|
|
|
fRet = ESFlushBuffer(pEsi);
|
|
goto exit5;
|
|
}
|
|
|
|
|
|
BOOL PASCAL NEAR FindTopic(LPMERGEPARAMS lpmp, DWORD dwTopicId)
|
|
{
|
|
register LPDW lpdw;
|
|
LPDW lpdwMac;
|
|
|
|
Assert(lpmp->dwCount > 0);
|
|
Assert(lpmp->lpTopicIdLast >= lpmp->rgTopicId);
|
|
Assert(lpmp->lpTopicIdLast < lpmp->rgTopicId + lpmp->dwCount);
|
|
|
|
if (lpmp->rgTopicId[0] > dwTopicId
|
|
||
|
|
*(lpdwMac = lpmp->rgTopicId + lpmp->dwCount - 1) < dwTopicId)
|
|
return FALSE;
|
|
|
|
if (*lpmp->lpTopicIdLast == dwTopicId)
|
|
return TRUE;
|
|
|
|
if (*lpmp->lpTopicIdLast > dwTopicId)
|
|
{
|
|
// re-start at the beginning
|
|
lpmp->lpTopicIdLast = lpmp->rgTopicId;
|
|
}
|
|
|
|
for (lpdw = lpmp->lpTopicIdLast; lpdw < lpdwMac + 1; lpdw++)
|
|
if (*lpdw == dwTopicId)
|
|
{
|
|
lpmp->lpTopicIdLast = lpdw;
|
|
return TRUE;
|
|
}
|
|
|
|
return FALSE;
|
|
}
|
|
|
|
/*************************************************************************
|
|
*
|
|
* @doc INTERNAL INDEXING
|
|
*
|
|
* @func int | CompareRecordBuffers |
|
|
* Called from PQueueUp/Down to sort the input buffers based first
|
|
* upon the string's, then TopicID's, then word length's, etc.
|
|
*
|
|
* @parm _LPIPB | lpipb |
|
|
* Pointer to the index parameter block
|
|
*
|
|
* @parm LPB | pBuffer A |
|
|
* Pointer to the first input buffer
|
|
*
|
|
* @parm LPB | pBuffer B |
|
|
* Pointer to the second input buffer
|
|
*
|
|
* @rdesc
|
|
* If pBufferA < pBufferB return < 0
|
|
* If pBufferA == pBufferB return = 0
|
|
* If pBufferA > pBufferB return > 0
|
|
*************************************************************************/
|
|
|
|
int PASCAL NEAR CompareRecordBuffers (_LPIPB lpipb, LPB pBufferA, LPB pBufferB)
|
|
{
|
|
// Local Replacement Variables
|
|
int occf = lpipb->occf;
|
|
int cNumOcc = lpipb->ucNumOccDataFields;
|
|
DWORD dwOccMin;
|
|
|
|
// Working Variables
|
|
int fRet;
|
|
int Len;
|
|
DWORD dwDataA;
|
|
DWORD dwDataB;
|
|
|
|
pBufferA += sizeof (DWORD); // Skip record length
|
|
pBufferB += sizeof (DWORD); // Skip record length
|
|
|
|
// Compare Pascal strings
|
|
if ((fRet = StrCmp2BytePascal(pBufferA, pBufferB)) != 0)
|
|
return fRet;
|
|
|
|
pBufferA += (Len = GETWORD ((LPUW)pBufferA) + sizeof (SHORT));
|
|
pBufferB += Len;
|
|
|
|
// Strings equal - compare FieldIds
|
|
// Compare Word Lengths
|
|
if (occf & OCCF_LENGTH)
|
|
{
|
|
pBufferA += CbByteUnpack (&dwDataA, pBufferA);
|
|
pBufferB += CbByteUnpack (&dwDataB, pBufferB);
|
|
if ((fRet = (int)(dwDataA - dwDataB)) != 0)
|
|
return fRet;
|
|
}
|
|
|
|
if (occf & OCCF_FIELDID)
|
|
{
|
|
pBufferA += CbByteUnpack (&dwDataA, pBufferA);
|
|
pBufferB += CbByteUnpack (&dwDataB, pBufferB);
|
|
if ((fRet = (int)(dwDataA - dwDataB)) != 0)
|
|
return fRet;
|
|
}
|
|
|
|
|
|
// Skip topic count
|
|
pBufferA += CbByteUnpack (&dwDataA, pBufferA);
|
|
pBufferB += CbByteUnpack (&dwDataB, pBufferB);
|
|
|
|
// Compare 1st topic Id
|
|
pBufferA += CbByteUnpack (&dwDataA, pBufferA);
|
|
pBufferB += CbByteUnpack (&dwDataB, pBufferB);
|
|
if ((fRet = (int)(dwDataA - dwDataB)) != 0)
|
|
return fRet;
|
|
|
|
// Get the occurrence count
|
|
pBufferA += CbByteUnpack (&dwDataA, pBufferA);
|
|
pBufferB += CbByteUnpack (&dwDataB, pBufferB);
|
|
|
|
if ((fRet = (int)(dwDataA - dwDataB)) < 0)
|
|
dwOccMin = dwDataA;
|
|
else
|
|
dwOccMin = dwDataB;
|
|
for (; dwOccMin; dwOccMin--)
|
|
{
|
|
switch (cNumOcc)
|
|
{
|
|
case 5:
|
|
pBufferA += CbByteUnpack (&dwDataA, pBufferA);
|
|
pBufferB += CbByteUnpack (&dwDataB, pBufferB);
|
|
if ((fRet = (int)(dwDataA - dwDataB)) != 0)
|
|
return fRet;
|
|
break;
|
|
case 4:
|
|
pBufferA += CbByteUnpack (&dwDataA, pBufferA);
|
|
pBufferB += CbByteUnpack (&dwDataB, pBufferB);
|
|
if ((fRet = (int)(dwDataA - dwDataB)) != 0)
|
|
return fRet;
|
|
break;
|
|
case 3:
|
|
pBufferA += CbByteUnpack (&dwDataA, pBufferA);
|
|
pBufferB += CbByteUnpack (&dwDataB, pBufferB);
|
|
if ((fRet = (int)(dwDataA - dwDataB)) != 0)
|
|
return fRet;
|
|
break;
|
|
case 2:
|
|
pBufferA += CbByteUnpack (&dwDataA, pBufferA);
|
|
pBufferB += CbByteUnpack (&dwDataB, pBufferB);
|
|
if ((fRet = (int)(dwDataA - dwDataB)) != 0)
|
|
return fRet;
|
|
break;
|
|
case 1:
|
|
pBufferA += CbByteUnpack (&dwDataA, pBufferA);
|
|
pBufferB += CbByteUnpack (&dwDataB, pBufferB);
|
|
if ((fRet = (int)(dwDataA - dwDataB)) != 0)
|
|
return fRet;
|
|
break;
|
|
}
|
|
}
|
|
return fRet;
|
|
}
|
|
|
|
|
|
/*************************************************************************
|
|
*
|
|
* @doc INTERNAL INDEXING
|
|
*
|
|
* @func VOID | PQueueUp |
|
|
* The function restores the heap condition of a PQ, ie. the parent
|
|
* node must be less than the children. When the top node is inserted
|
|
* the heap condition may be violated if the resulting node
|
|
* is smaller than its parent. In this case the nodes have to
|
|
* be switched.
|
|
*
|
|
* @parm LPESI | lpesi |
|
|
* Pointer to external sort info, which contains all info
|
|
*
|
|
* @parm LONG | index |
|
|
* Index of the inserted node
|
|
*
|
|
*************************************************************************/
|
|
|
|
VOID PASCAL NEAR PQueueUp
|
|
(_LPIPB lpipb, LPESB FAR *lrgPriorityQueue, LONG index)
|
|
{
|
|
LPESB lpesbTemp; // Pointer to the inserted node
|
|
LPESB lpesbHalf; // Pointer to the parent node
|
|
WORD uiHalf; // Index of the parent's node
|
|
|
|
lpesbTemp = lrgPriorityQueue [index];
|
|
|
|
if ((uiHalf = (WORD) (index/2)) == 0)
|
|
return;
|
|
lpesbHalf = lrgPriorityQueue [uiHalf];
|
|
|
|
/* If the parent node is greated than the child, then exchange the
|
|
* nodes, The condition uiHalf != index makes sure that we stop
|
|
* at node 0 (top node)
|
|
*/
|
|
while (uiHalf && CompareRecordBuffers (lpipb, (LPB)lpesbHalf->lrgbMem +
|
|
lpesbHalf->ibBuf, (LPB)lpesbTemp->lrgbMem + lpesbTemp->ibBuf) > 0)
|
|
{
|
|
lrgPriorityQueue [index] = lpesbHalf;
|
|
index = uiHalf;
|
|
uiHalf = (WORD)(index/2);
|
|
lpesbHalf = lrgPriorityQueue [uiHalf];
|
|
}
|
|
lrgPriorityQueue[index] = lpesbTemp;
|
|
#if BINHN
|
|
SetQueue (&lpipb->esi);
|
|
#endif
|
|
}
|
|
|
|
|
|
/*************************************************************************
|
|
*
|
|
* @doc INTERNAL INDEXING
|
|
*
|
|
* @func VOID | PQueueDown |
|
|
* The function restores the heap condition of a PQ, ie. the parent
|
|
* node must be less than the children. When the top node is removed
|
|
* the heap condition may be violated if the resulting node
|
|
* is greater than its children. In this case the nodes have to
|
|
* be switched.
|
|
*
|
|
* @parm LPESI | lpesi |
|
|
* Pointer to external sort info, which contains all info
|
|
*
|
|
*************************************************************************/
|
|
|
|
PRIVATE VOID PASCAL NEAR PQueueDown (_LPIPB lpipb)
|
|
{
|
|
LPESI lpesi = &lpipb->esi;
|
|
LPESB FAR *lrgPriorityQueue;
|
|
int CurIndex;
|
|
int ChildIndex;
|
|
int MaxCurIndex;
|
|
int MaxChildIndex;
|
|
LPESB lpesbSaved;
|
|
LPESB lpesbTemp;
|
|
LPESB lpesbChild;
|
|
|
|
lrgPriorityQueue = lpesi->lrgPriorityQueue;
|
|
lpesbSaved = lrgPriorityQueue[1];
|
|
MaxCurIndex = (MaxChildIndex = lpesi->uiQueueSize) / 2;
|
|
|
|
for (CurIndex = 1; CurIndex <= MaxCurIndex; CurIndex = ChildIndex)
|
|
{
|
|
// Get child index
|
|
ChildIndex = CurIndex * 2;
|
|
// Find the minimum of the two children
|
|
if (ChildIndex < MaxChildIndex)
|
|
{
|
|
if ((lpesbTemp = lrgPriorityQueue[ChildIndex + 1]) != NULL)
|
|
{
|
|
lpesbChild = lrgPriorityQueue[ChildIndex];
|
|
|
|
// The two children exist. Take the smallest
|
|
if (CompareRecordBuffers
|
|
(lpipb, (LPB)lpesbChild->lrgbMem + lpesbChild->ibBuf,
|
|
(LPB)lpesbTemp->lrgbMem + lpesbTemp->ibBuf) >= 0)
|
|
ChildIndex++;
|
|
}
|
|
}
|
|
|
|
// If the parent's node is less than the child, then break
|
|
// (heap condition met)
|
|
if (ChildIndex > MaxChildIndex)
|
|
break;
|
|
|
|
lpesbTemp = lrgPriorityQueue [ChildIndex];
|
|
|
|
if (CompareRecordBuffers (lpipb, (LPB)lpesbSaved->lrgbMem +
|
|
lpesbSaved->ibBuf, (LPB)lpesbTemp->lrgbMem+lpesbTemp->ibBuf) < 0)
|
|
break;
|
|
|
|
// Replace the node
|
|
lrgPriorityQueue [CurIndex] = lpesbTemp;
|
|
}
|
|
lrgPriorityQueue [CurIndex] = lpesbSaved;
|
|
#if _BINHN
|
|
SetQueue (lpesi);
|
|
#endif
|
|
}
|
|
|
|
|
|
PRIVATE PTOPICDATA PASCAL NEAR MergeTopicNode (PMERGEHEADER pHeader,
|
|
PTOPICDATA pNewTopic, int cNumOcc)
|
|
{
|
|
// PTOPICDATA pLastTopic;
|
|
PTOPICDATA pTopic, pPrevTopic;
|
|
int fResult;
|
|
|
|
if ((pTopic = pHeader->pLastTopic) == NULL)
|
|
{
|
|
// The list is empty
|
|
pHeader->pTopic = pHeader->pLastTopic = pNewTopic;
|
|
pNewTopic->pNext = NULL;
|
|
return(NULL);
|
|
|
|
}
|
|
|
|
fResult = pTopic->dwTopicId - pNewTopic->dwTopicId;
|
|
|
|
if (fResult < 0)
|
|
{
|
|
// New node. Add to the end
|
|
pNewTopic->pNext = NULL;
|
|
pHeader->pLastTopic->pNext = pNewTopic;
|
|
pHeader->pLastTopic = pNewTopic;
|
|
|
|
// Reset pNewTopic for next node allocation
|
|
return NULL;
|
|
}
|
|
|
|
if (fResult == 0)
|
|
{
|
|
// Same topic. Return pNewTopic for reuse
|
|
if (cNumOcc)
|
|
MergeOccurrence (pTopic, pNewTopic, cNumOcc);
|
|
return(pNewTopic);
|
|
}
|
|
|
|
// If we get to this point, the list is out of order
|
|
// Try to find the insertion point
|
|
pTopic = pHeader->pTopic;
|
|
pPrevTopic = NULL;
|
|
|
|
for (; pTopic->pNext; pTopic = pTopic->pNext)
|
|
{
|
|
if (pTopic->dwTopicId >= pNewTopic->dwTopicId)
|
|
{
|
|
/* We pass the inserted point */
|
|
break;
|
|
}
|
|
pPrevTopic = pTopic;
|
|
}
|
|
|
|
if (pTopic->dwTopicId == pNewTopic->dwTopicId)
|
|
{
|
|
// Same topic. Return pNewTopic for reuse
|
|
if (cNumOcc)
|
|
MergeOccurrence (pTopic, pNewTopic, cNumOcc);
|
|
return(pNewTopic);
|
|
}
|
|
|
|
// Handle empty case
|
|
if (pPrevTopic == NULL)
|
|
{
|
|
/* Insert at the beginning */
|
|
pNewTopic->pNext = pHeader->pTopic;
|
|
pHeader->pTopic = pNewTopic;
|
|
}
|
|
|
|
else
|
|
{
|
|
/* Inserted at the middle or the end */
|
|
pNewTopic->pNext = pPrevTopic->pNext;
|
|
pPrevTopic->pNext = pNewTopic;
|
|
}
|
|
|
|
// Update the last topic
|
|
while (pTopic->pNext)
|
|
{
|
|
pTopic = pTopic->pNext;
|
|
}
|
|
pHeader->pLastTopic = pTopic;
|
|
return(NULL);
|
|
}
|
|
|
|
/*************************************************************************
|
|
* @doc PRIVATE
|
|
* @func void | MergeOccurrence |
|
|
* Merge the occurrence by adding them in order
|
|
*************************************************************************/
|
|
PRIVATE VOID NEAR MergeOccurrence (PTOPICDATA pOldTopic,
|
|
PTOPICDATA pNewTopic, int cOccNum)
|
|
{
|
|
ERRB errb;
|
|
|
|
if (CompareOccurrence (&pOldTopic->pLastOccData->OccData[0],
|
|
&pNewTopic->pOccData->OccData[0], cOccNum) <= 0)
|
|
{
|
|
// The whole last list is less than the current list. This is
|
|
// what I expect
|
|
// We just linked the 2 lists together
|
|
pOldTopic->pLastOccData->pNext = pNewTopic->pOccData;
|
|
pOldTopic->pLastOccData = pNewTopic->pLastOccData;
|
|
pOldTopic->dwOccCount += pNewTopic->dwOccCount;
|
|
return;
|
|
}
|
|
|
|
// The current list is less than the old list.
|
|
// This is weird, but still we can handle it
|
|
if (CompareOccurrence (&pNewTopic->pOccData->OccData[0],
|
|
&pOldTopic->pOccData->OccData[0], cOccNum) <= 0)
|
|
{
|
|
pNewTopic->pLastOccData->pNext = pOldTopic->pOccData;
|
|
pOldTopic->pOccData = pNewTopic->pOccData;
|
|
pOldTopic->dwOccCount += pNewTopic->dwOccCount;
|
|
return;
|
|
}
|
|
|
|
SetErrCode (&errb, E_ASSERT);
|
|
}
|
|
|
|
/*====================================================================*/
|
|
#ifdef BINHN
|
|
PRIVATE VOID PASCAL NEAR SetQueue (LPESI pEsi)
|
|
{
|
|
unsigned int i = 0;
|
|
LPESB FAR *lrgPriorityQueue;
|
|
|
|
lrgPriorityQueue = pEsi->lrgPriorityQueue;
|
|
for (i = 0; i < 20 && i < pEsi->cesb ; i++)
|
|
{
|
|
if (lrgPriorityQueue[i])
|
|
pEsi->lpbQueueStr[i] = lrgPriorityQueue[i]->lrgbMem +
|
|
lrgPriorityQueue[i]->ibBuf + 6;
|
|
}
|
|
|
|
}
|
|
#endif
|
|
|
|
/************************************************************************
|
|
* @doc PRIVATE
|
|
* @func HRESULT PASCAL NEAR | ESBBlockAllocate |
|
|
* Set the memory allocation based on the memory of the machine
|
|
* @parm DWORD | lMemSize |
|
|
* Memory allocated for the indexer
|
|
* @rdesc S_OK, or E_OUTOFMEMORY
|
|
************************************************************************/
|
|
|
|
PRIVATE HRESULT PASCAL NEAR ESBBlockAllocate (_LPIPB lpipb, DWORD lMemSize)
|
|
{
|
|
DWORD dwTopicSize;
|
|
DWORD dwOccSize;
|
|
WORD OccNodeSize = sizeof (OCCDATA) - 1 + sizeof(DWORD) *
|
|
lpipb->ucNumOccDataFields; // About 24bytes
|
|
|
|
OccNodeSize = (OccNodeSize + 3) & ~3;
|
|
|
|
/* The memory is for topic block and occurrence blocks, which
|
|
* should be in the ratio 1:1.5
|
|
*/
|
|
dwTopicSize = (lMemSize * 2) / 5;
|
|
dwOccSize = lMemSize - dwTopicSize;
|
|
|
|
#if 0
|
|
/* Don't do anything if things are too small */
|
|
if (dwTopicSize < MAX_BLOCK_SIZE || dwOccSize < MAX_BLOCK_SIZE)
|
|
return(E_OUTOFMEMORY);
|
|
#endif
|
|
|
|
// Allocate a block manager for topic node
|
|
|
|
if ((lpipb->TopicBlock.pBlockMgr =
|
|
BlockInitiate ((MAX_BLOCK_SIZE * sizeof(TOPICDATA)/sizeof(TOPICDATA)),
|
|
sizeof(TOPICDATA),
|
|
(WORD)(dwTopicSize/MAX_BLOCK_SIZE),
|
|
USE_VIRTUAL_MEMORY | THREADED_ELEMENT)) == NULL)
|
|
{
|
|
exit2:
|
|
return SetErrCode (NULL, E_OUTOFMEMORY);
|
|
}
|
|
lpipb->TopicBlock.pFreeList =
|
|
(PLIST)BlockGetLinkedList(lpipb->TopicBlock.pBlockMgr);
|
|
|
|
// Allocate a block manager for occ node
|
|
if ((lpipb->OccBlock.pBlockMgr =
|
|
BlockInitiate((MAX_BLOCK_SIZE * OccNodeSize)/OccNodeSize,
|
|
OccNodeSize, (WORD)(lMemSize / MAX_BLOCK_SIZE),
|
|
USE_VIRTUAL_MEMORY | THREADED_ELEMENT)) == NULL)
|
|
{
|
|
BlockFree(lpipb->BTNodeBlock.pBlockMgr);
|
|
lpipb->BTNodeBlock.pBlockMgr = NULL;
|
|
goto exit2;
|
|
}
|
|
lpipb->OccBlock.pFreeList = (PLIST)BlockGetLinkedList(lpipb->OccBlock.pBlockMgr);
|
|
|
|
return (S_OK);
|
|
}
|
|
|
|
PRIVATE LPV PASCAL NEAR GetBlockNode (PBLKCOMBO pBlockCombo)
|
|
{
|
|
PLIST pList;
|
|
|
|
if (pBlockCombo->pFreeList == NULL)
|
|
{
|
|
if ((BlockGrowth (pBlockCombo->pBlockMgr) != S_OK))
|
|
return (NULL);
|
|
pBlockCombo->pFreeList =
|
|
(PLIST)BlockGetLinkedList(pBlockCombo->pBlockMgr);
|
|
}
|
|
pList = pBlockCombo->pFreeList;
|
|
pBlockCombo->pFreeList = pList->pNext;
|
|
pBlockCombo->dwCount ++;
|
|
// pList->pNext = NULL;
|
|
return (pList);
|
|
}
|
|
|
|
/*************************************************************************
|
|
*
|
|
* @doc INTERNAL
|
|
*
|
|
* @func BOOL FAR PASCAL | BuildIndexFile |
|
|
* This function is for debugging purpose only. In normal indexing,
|
|
* it will never be called. Since collecting words and indexing can
|
|
* take a long time, debugging the index phase can become a hassle that
|
|
* take several hours per shot. To minimize the index time for debugging,
|
|
* all the intermediate files are saved, which are:
|
|
* - the internal sorted result file, which contains all words and
|
|
* their occurrences sorted
|
|
* - the external sorted result file, which is a snap shot of the
|
|
* ESI structures and its ESB blocks
|
|
* The only steps left will be processing the occurrence list and doing
|
|
* permanent index
|
|
*
|
|
* To use the function, add the following lines in the app:
|
|
*
|
|
* extern HRESULT PASCAL FAR BuildIndexFile (LPSTR, LPSTR, LPSTR, WORD, WORD,
|
|
* WORD, INTERRUPT_FUNC, VOID FAR *, STATUS_FUNC, VOID FAR*, PHRESULT);
|
|
*
|
|
* int fDotest;
|
|
*
|
|
* if (fDotest) {
|
|
* return BuildIndexFile ((LPSTR)"c:/tmp/test.mvb!MVINDEX",
|
|
* (LPSTR)"c:/tmp/esi.tmp", (LPSTR)"c:/tmp/iso.tmp",
|
|
* OCCF_TOPICID, IDXF_NORMALIZE, 0, (INTERRUPT_FUNC)lpfnInterruptFunc,
|
|
* (LPV)NULL,
|
|
* (STATUS_FUNC)lpfnStatusFunc, (LPV)hwndGlobal,
|
|
* NULL);
|
|
* }
|
|
*
|
|
* @parm HFPB | hfpb |
|
|
* HFPB for index file if pstrIndexFile is NULL
|
|
*
|
|
* @parm LPB | pstrIndexFile |
|
|
* The .MVB + index file, usually with the format TEST.MVB!MVINDEX
|
|
*
|
|
* @parm LPB | lpbEsiFile |
|
|
* The external sort info file
|
|
*
|
|
* @parm LPB | lpbIsiFile |
|
|
* The internal sorted info filename
|
|
*
|
|
* @parm PINDEXINFO | pIndexInfo |
|
|
* IndexInfo
|
|
*
|
|
* @rdesc S_OK if succeeded, else other non-zero error codes
|
|
*************************************************************************/
|
|
|
|
PUBLIC HRESULT PASCAL EXPORT_API FAR BuildIndexFile
|
|
(HFPB hfpb, LPSTR pstrIndexFile,
|
|
LPB lpbEsiFile, LPB lpbIsiFile, PINDEXINFO pIndexInfo)
|
|
{
|
|
_LPIPB lpipb;
|
|
LPESI lpesi;
|
|
BOOL fRet;
|
|
ERRB errb;
|
|
DWORD loop;
|
|
FLOAT rLog;
|
|
BYTE bKeyIndex = 0;
|
|
|
|
if ((lpipb = MVIndexInitiate(pIndexInfo, NULL)) == NULL)
|
|
return E_FAIL;
|
|
|
|
lpesi = &lpipb->esi;
|
|
|
|
if (LoadEsiTemp (lpipb, lpesi, lpbEsiFile, lpbIsiFile, NULL) != S_OK)
|
|
{
|
|
fRet = E_FAIL;
|
|
exit0:
|
|
MVIndexDispose (lpipb);
|
|
return fRet;
|
|
}
|
|
|
|
if (lpipb->idxf & IDXF_NORMALIZE)
|
|
{
|
|
// Allocate a huge buffer to contain all the sigma terms
|
|
if ((lpipb->wi.hSigma = _GLOBALALLOC (DLLGMEM_ZEROINIT,
|
|
(LCB)((lpipb->dwMaxTopicId + 1) * sizeof (SIGMA)))) == NULL)
|
|
return SetErrCode (&errb, E_OUTOFMEMORY);
|
|
lpipb->wi.hrgsigma = (HRGSIGMA)_GLOBALLOCK (lpipb->wi.hSigma);
|
|
|
|
// Small buffer containing pre-calculated values
|
|
if ((lpipb->wi.hLog = _GLOBALALLOC (DLLGMEM_ZEROINIT,
|
|
(CB)(cLOG_MAX * sizeof (FLOAT)))) == NULL)
|
|
{
|
|
SetErrCode (&errb, (HRESULT)(fRet = E_OUTOFMEMORY));
|
|
FreeHandle (lpipb->wi.hSigma);
|
|
goto exit0;
|
|
}
|
|
lpipb->wi.lrgrLog = (FLOAT FAR *)_GLOBALLOCK (lpipb->wi.hLog);
|
|
// Initialize the array
|
|
for (loop = cLOG_MAX - 1; loop > 0; --loop)
|
|
{
|
|
#ifndef ISBU_IR_CHANGE
|
|
rLog = (float) log10(cHundredMillion/(double)loop);
|
|
#else
|
|
rLog = (float)1.0 / (float)loop;
|
|
#endif // ISBU_IR_CHANGE
|
|
lpipb->wi.lrgrLog[loop] = rLog * rLog;
|
|
}
|
|
}
|
|
|
|
if ((fRet = MergeSortTreeFile (lpipb, NULL)) != S_OK)
|
|
return SetErrCode (&errb, (HRESULT)fRet);
|
|
if ((lpipb->idxf & KEEP_TEMP_FILE) == 0)
|
|
FileUnlink (NULL, lpipb->isi.aszTempName, REGULAR_FILE);
|
|
|
|
// If we are doing term-weighting we have to square root all sigma values
|
|
if (lpipb->idxf & IDXF_NORMALIZE)
|
|
{
|
|
// ISBU_IR_CHANGE not necessary 'cos sqrt computation is necessary in both cases
|
|
for (loop = 0; loop < lpipb->dwMaxTopicId + 1; ++loop)
|
|
lpipb->wi.hrgsigma[loop] =
|
|
(float)sqrt ((double)lpipb->wi.hrgsigma[loop]);
|
|
}
|
|
|
|
// Analyze data to get the best compression scheme
|
|
// TopicId
|
|
VGetBestScheme(&lpipb->cKey[CKEY_TOPIC_ID],
|
|
&lpipb->BitCount[CKEY_TOPIC_ID][0], lcbitBITSTREAM_ILLEGAL, TRUE);
|
|
// Occurrence Count
|
|
VGetBestScheme(&lpipb->cKey[CKEY_OCC_COUNT],
|
|
&lpipb->BitCount[CKEY_OCC_COUNT][0], lcbitBITSTREAM_ILLEGAL, TRUE);
|
|
|
|
if (lpipb->occf & OCCF_COUNT)
|
|
{
|
|
VGetBestScheme(&lpipb->cKey[bKeyIndex],
|
|
&lpipb->BitCount[bKeyIndex][0], lcbitBITSTREAM_ILLEGAL, TRUE);
|
|
bKeyIndex++;
|
|
}
|
|
|
|
if (lpipb->occf & OCCF_OFFSET)
|
|
{
|
|
VGetBestScheme(&lpipb->cKey[bKeyIndex],
|
|
&lpipb->BitCount[bKeyIndex][0], lcbitBITSTREAM_ILLEGAL, TRUE);
|
|
bKeyIndex++;
|
|
}
|
|
|
|
// Call the user callback every once in a while
|
|
if (lpipb->CallbackInfo.dwFlags & ERRFLAG_STATUS)
|
|
{
|
|
PFCALLBACK_MSG pCallbackInfo = &lpipb->CallbackInfo;
|
|
CALLBACKINFO Info;
|
|
|
|
Info.dwPhase = 2;
|
|
Info.dwIndex = 100;
|
|
fRet = (*pCallbackInfo->MessageFunc)
|
|
(ERRFLAG_STATUS, pCallbackInfo->pUserData, &Info);
|
|
if (S_OK != fRet)
|
|
goto exit0;
|
|
}
|
|
|
|
// Build the permanent index
|
|
fRet = BuildBTree(NULL, lpipb, lpipb->esi.aszTempName, hfpb, pstrIndexFile);
|
|
if (lpipb->idxf & IDXF_NORMALIZE)
|
|
{
|
|
FreeHandle (lpipb->wi.hLog);
|
|
FreeHandle (lpipb->wi.hSigma);
|
|
}
|
|
goto exit0;
|
|
}
|
|
|
|
PRIVATE VOID PASCAL NEAR SaveEsiTemp (_LPIPB lpipb, LPESI lpesi)
|
|
{
|
|
GHANDLE hfpb;
|
|
LPESB lpesb;
|
|
char szEsi[100];
|
|
|
|
GETTEMPFILENAME ((char)0, "foo", 0, szEsi);
|
|
if ((hfpb = FileOpen(NULL, szEsi, REGULAR_FILE, READ_WRITE, NULL)) == NULL)
|
|
return;
|
|
|
|
FileWrite(hfpb, lpipb, sizeof(IPB), NULL);
|
|
|
|
for (lpesb = lpesi->lpesbRoot; lpesb; lpesb = lpesb->lpesbNext)
|
|
{
|
|
if (FileWrite(hfpb, lpesb, sizeof(ESB), NULL) != sizeof(ESB))
|
|
{
|
|
FileClose (hfpb);
|
|
FileUnlink (NULL, szEsi, REGULAR_FILE);
|
|
return;
|
|
}
|
|
}
|
|
FileClose (hfpb);
|
|
MEMCPY (lpipb->szEsiTemp, szEsi, 100);
|
|
}
|
|
|
|
PRIVATE VOID PASCAL NEAR UpdateEsiTemp (_LPIPB lpipb)
|
|
{
|
|
GHANDLE hfpb;
|
|
|
|
if ((hfpb = FileOpen(NULL, lpipb->szEsiTemp, REGULAR_FILE,
|
|
READ_WRITE, NULL)) == NULL)
|
|
return;
|
|
|
|
FileWrite(hfpb, lpipb, sizeof(IPB), NULL);
|
|
FileClose (hfpb);
|
|
}
|
|
|
|
PRIVATE BOOL PASCAL LoadEsiTemp (_LPIPB lpipb, LPESI lpesi, LPB lpbEsiFile,
|
|
LPB lpbIsiFile, PHRESULT phr)
|
|
{
|
|
LPESB lpesb;
|
|
HFILE hFile;
|
|
ESB esb;
|
|
HANDLE hesb;
|
|
HRESULT fRet;
|
|
IPB ipb;
|
|
LPISI pIsi = &lpipb->isi; // Pointer to internal sort info
|
|
|
|
/* Copy the internal sort info filename */
|
|
MEMCPY (pIsi->aszTempName, lpbIsiFile, lstrlen(lpbIsiFile));
|
|
|
|
/* Read in the external sort buffer info */
|
|
|
|
if ((hFile = _lopen (lpbEsiFile, READ)) == HFILE_ERROR)
|
|
return E_NOTEXIST;
|
|
|
|
/* Read old IPB info */
|
|
_lread (hFile, &ipb, sizeof(IPB));
|
|
|
|
/* Transfer meaningful data */
|
|
|
|
lpipb->dwIndexedWord = ipb.dwIndexedWord;
|
|
lpipb->dwUniqueWord = ipb.dwUniqueWord;
|
|
lpipb->dwByteCount = ipb.dwByteCount;
|
|
lpipb->dwOccOffbits = ipb.dwOccOffbits;
|
|
lpipb->dwOccExtbits = ipb.dwOccExtbits;
|
|
lpipb->dwMaxFieldId = ipb.dwMaxFieldId;
|
|
lpipb->dwMaxWCount = ipb.dwMaxWCount;
|
|
lpipb->dwMaxOffset = ipb.dwMaxOffset;
|
|
lpipb->dwTotal3bWordLen = ipb.dwTotal3bWordLen;
|
|
lpipb->dwTotal2bWordLen = ipb.dwTotal2bWordLen;
|
|
lpipb->dwTotalUniqueWordLen = ipb.dwTotalUniqueWordLen;
|
|
lpipb->lcTopics = ipb.lcTopics;
|
|
lpipb->dwMaxTopicId = ipb.dwMaxTopicId;
|
|
// lpipb->dwMemAllowed = ipb.dwMemAllowed;
|
|
lpipb->dwMaxRecordSize = ipb.dwMaxRecordSize;
|
|
lpipb->dwMaxEsbRecSize = ipb.dwMaxEsbRecSize;
|
|
lpipb->dwMaxWLen = ipb.dwMaxWLen;
|
|
lpipb->idxf = ipb.idxf;
|
|
|
|
while ((_lread (hFile, &esb, sizeof(ESB))) == sizeof(ESB))
|
|
{
|
|
if ((hesb = _GLOBALALLOC(GMEM_MOVEABLE | GMEM_ZEROINIT,
|
|
sizeof(ESB))) == NULL) {
|
|
|
|
fRet = SetErrCode (phr,E_OUTOFMEMORY);
|
|
exit0:
|
|
_lclose (hFile);
|
|
return fRet;
|
|
}
|
|
|
|
lpesb = (LPESB)_GLOBALLOCK (hesb);
|
|
|
|
/* Copy the ESB information */
|
|
*lpesb = esb;
|
|
|
|
/* Update the structure */
|
|
lpesb->hStruct = hesb;
|
|
|
|
lpesb->lpesbNext = lpesi->lpesbRoot;
|
|
lpesi->lpesbRoot= lpesb;
|
|
lpesi->cesb ++;
|
|
}
|
|
_lclose (hFile);
|
|
|
|
fRet = S_OK;
|
|
goto exit0;
|
|
|
|
}
|
|
|
|
HRESULT FAR PASCAL AllocSigmaTable (_LPIPB lpipb)
|
|
{
|
|
ERRB errb;
|
|
DWORD loop;
|
|
float rLog;
|
|
|
|
|
|
if ((lpipb->wi.hSigma = _GLOBALALLOC (DLLGMEM_ZEROINIT,
|
|
(LCB)((lpipb->dwMaxTopicId + 1) * sizeof (SIGMA)))) == NULL)
|
|
return SetErrCode (&errb, E_OUTOFMEMORY);
|
|
lpipb->wi.hrgsigma = (HRGSIGMA)_GLOBALLOCK (lpipb->wi.hSigma);
|
|
|
|
if ((lpipb->wi.hLog = _GLOBALALLOC (DLLGMEM_ZEROINIT,
|
|
(CB)(cLOG_MAX * sizeof (FLOAT)))) == NULL)
|
|
{
|
|
FreeHandle (lpipb->wi.hSigma);
|
|
return SetErrCode (&errb, E_OUTOFMEMORY);
|
|
}
|
|
lpipb->wi.lrgrLog = (FLOAT FAR *)_GLOBALLOCK (lpipb->wi.hLog);
|
|
// Initialize the array
|
|
for (loop = cLOG_MAX - 1; loop > 0; --loop)
|
|
{
|
|
#ifndef ISBU_IR_CHANGE
|
|
rLog = (float) log10(cHundredMillion/(double)loop);
|
|
#else
|
|
rLog = (float)1.0 / (float)loop;
|
|
#endif // ISBU_IR_CHANGE
|
|
lpipb->wi.lrgrLog[loop] = rLog * rLog;
|
|
}
|
|
return(S_OK);
|
|
}
|