|
|
//#define _DUMPALL
/*************************************************************************
* * * SIMILAR.C * * * * Copyright (C) Microsoft Corporation 1990-1996 * * All Rights reserved. * * * ************************************************************************** * * * Module Intent: * * * * Search Core Engine: Find Similar functionality * * * ************************************************************************** * * Revision History: * * 09/24/96 kevynct Started from algorithm notes (4 hrs) * 09/25/96 kevynct Implemented skeleton of ProcessSimilarityTerm (1 hr) * 09/26/96 kevynct More work on inner loop and relevant list (5 hrs) * 09/27/96 kevynct Query parsing, weighting, and sorting (6 hrs) * 10/01/96 kevynct Incorporate into MV2.0b (10 min) * 10/02/96 kevynct Clean-up query code, start resolve query code (4 hrs) * 10/03/96 kevynct Resolve query code (2 hrs) * 10/11/96 kevynct Start bucket routines (2 hrs) * 10/13/96 kevynct Finish bucket routines, write node processor, cleanup (6 hrs) * 10/14/96 kevynct Clean-up, remove compilation errors, debugging (6 hrs) * 10/24/96 kevynct Convert to two-phase query resolution (3 hrs) * 10/25/96 kevynct Fix sort by cTopics, debug new query resolution, try new weighting (2 hrs) * 11/26/96 kevynct Testing, fix and improve weighting and accumulation: aliases, digits (8 hrs) * 12/2/96 kevynct More weighting tests (8 hrs) * Work remaining: * * Investigate field and stemming support * * Use probabilistic upperbounds for pruning. Remove single-term nodes after each term process * Test current bucket method vs. exact scores w/ heap * ************************************************************************** * * Current Owner: KevynCT * **************************************************************************/
#include <mvopsys.h>
#include <mem.h>
#include <memory.h>
#include <orkin.h>
#include <mvsearch.h>
#include <math.h>
#include <groups.h>
#include "common.h"
#include "search.h"
#ifdef _DEBUG
static BYTE NEAR s_aszModule[] = __FILE__; // Used by error return functions.
#endif
// Decode one value from the bit stream `a` into `*c`, dispatching through
// DecodeTable on the compression scheme stored in key `b` (b.cschScheme).
// NOTE(review): `b` is expanded without parentheses, so pass a simple lvalue.
#define FGetDword(a,b,c) (*DecodeTable[b.cschScheme])(a, b, c)
// True when character p is an ASCII decimal digit.
#define IS_DIGIT(p) ((p) >= '0' && (p) <= '9')
// Document scores are kept in buckets, so a score read back from the list
// is only approximate. These flags tell the accessors which direction to
// err on: the low edge (ROUND_DOWN) or the high edge (ROUND_UP) of a bucket.
#define ROUND_DOWN 0
#define ROUND_UP 1
// Width, in score units, of one bucket of the document score list.
#define SCORE_BLOCK_SIZE 32
// Index of the highest bucket; the bucket array holds NUM_SCORE_BLOCKS + 1
// entries so that a score of MAX_WEIGHT still has a slot.
#define NUM_SCORE_BLOCKS (MAX_WEIGHT/SCORE_BLOCK_SIZE)
// Document score list: a bucketed histogram of the scores of the best
// documents found so far.
//   hMem           - not referenced in this file's visible code; presumably
//                    owned by GlobalLockedStructMemAlloc/Free (TODO confirm)
//   cScoresLeft    - remaining capacity; the list is "full" when <= 0
//   iBucketLowest  - index of the lowest occupied bucket, -1 while empty
//   iHighestScore  - highest raw (un-bucketed) score recorded so far
//   rgiScores[i]   - number of tracked documents whose score / SCORE_BLOCK_SIZE == i
typedef struct tagDocScoreList { HANDLE hMem; int cScoresLeft; int iBucketLowest; int iHighestScore; int rgiScores[NUM_SCORE_BLOCKS + 1]; } DSL, FAR *_LPDSL;
// Routines implemented in other modules (the defining file is noted on each line).
PUBLIC HRESULT PASCAL FAR SkipOccList(_LPQT lpqt, PNODEINFO pNodeInfo, DWORD dwOccs); // ftsearch.c
PUBLIC int PASCAL FAR CompareTerm(_LPQTNODE lpQtNode, LST lstTermWord, LST lstBtreeWord, DWORD dwBtreeFieldId, char []); // ftsearch.c
PUBLIC STRING_TOKEN FAR *PASCAL AllocWord(_LPQT lpQueryTree, LST lstWord); // qtparse.c
// Document score list helpers defined later in this file. DumpDocScoreList
// is declared and defined for _DEBUG builds only.
__inline LPVOID InitDocScoreList(int cScores); __inline void FreeDocScoreList(LPV lpDocScores); __inline int GetMaxDocScore(_LPDSL lpDocScores); __inline int GetMinDocScore(_LPDSL lpDocScores, BOOL fRoundUp); BOOL UpdateDocScoreList(_LPDSL lpDocScores, int iOldScore, int i); __inline BOOL IsDocScoreListFull(_LPDSL lpdsl); __inline WORD AddWeights(DWORD w1, DWORD w2); int GetSortedDocScore(_LPDSL lpDocScores, int iThis, BOOL fRoundUp); #if defined(_DEBUG)
BOOL DumpDocScoreList(_LPDSL lpdsl, PSRCHINFO pSrchInfo); #endif
// Adds the topic count of one WORDINFO into another.
__inline void MergeWordInfoCounts(WORDINFO FAR *lpwiDest, WORDINFO FAR *lpwiSrc);
// Query tokenizing, word lookup, weighting and resolution routines used by
// MVIndexFindSimilar.
PRIVATE LPQT TokenizeFlatQuery(LPPARSE_PARMS lpParms, PSRCHINFO pSrchInfo, PHRESULT phr); PRIVATE HRESULT PASCAL NEAR ResolveFlatQuery(_LPQT lpqt, _LPQTNODE lpCurQtNode, LPRETV lpRetV); PRIVATE HRESULT GetWordInfoList(_LPQT lpqt, STRING_TOKEN FAR *lpStrToken, _LPQTNODE lpCurQtNode, LPRETV lpRetV); PRIVATE VOID PASCAL SortStringWeights(_LPQT lpQueryTree); PRIVATE VOID PASCAL SetStringWeights (LPQI lpQueryInfo); PUBLIC HRESULT PASCAL FAR EXPORT_API FFlatCallBack (LST lstRawWord, LST lstNormWord, LFO lfoWordOffset, LPQI lpqi);
__inline LPVOID InitDocScoreList(int cScores) { _LPDSL lpdsl;
if ((lpdsl = (_LPDSL)GlobalLockedStructMemAlloc(sizeof(DSL))) == NULL) return NULL;
lpdsl->cScoresLeft = cScores; lpdsl->iHighestScore = 0; lpdsl->iBucketLowest = -1; return (LPV)lpdsl; }
/*
 * FreeDocScoreList
 *   Releases a list obtained from InitDocScoreList. A NULL pointer is
 *   accepted and ignored.
 */
__inline void FreeDocScoreList(LPV lpDocScores)
{
    _LPDSL lpList = (_LPDSL)lpDocScores;

    if (lpList == NULL)
        return;

    GlobalLockedStructMemFree(lpList);
}
/*
 * GetMaxDocScore
 *   Returns the highest raw score recorded in the list so far.
 */
__inline int GetMaxDocScore(_LPDSL lpDocScores)
{
    int iHighest = lpDocScores->iHighestScore;

    return iHighest;
}

/*
 * GetMinDocScore
 *   Returns the approximate lowest score tracked by the list: the lower
 *   edge of the lowest occupied bucket, or its upper edge when fRoundUp is
 *   non-zero. Returns 0 while no bucket is occupied.
 */
__inline int GetMinDocScore(_LPDSL lpDocScores, BOOL fRoundUp)
{
    int iBucket = lpDocScores->iBucketLowest;

    if (iBucket < 0)
        return 0;

    if (fRoundUp)
        iBucket++;

    return iBucket * SCORE_BLOCK_SIZE;
}
/*
 * GetSortedDocScore
 *   Returns the approximate score of the cThis-th best document in the
 *   list. Buckets are scanned from the one holding the highest score down
 *   to bucket 0, subtracting each bucket's population from cThis until the
 *   requested rank falls inside a bucket.
 *
 *   The result is that bucket's lower edge, or its upper edge when
 *   fRoundUp is non-zero. If the list holds fewer than cThis documents the
 *   result is the edge of bucket 0.
 */
int GetSortedDocScore(_LPDSL lpdsl, int cThis, BOOL fRoundUp)
{
    int iBucket;
    int cRemaining = cThis;
    int iRound = fRoundUp ? 1 : 0;

    if (lpdsl->iHighestScore < 0)
        return 0;

    for (iBucket = lpdsl->iHighestScore / SCORE_BLOCK_SIZE; iBucket >= 0; iBucket--)
    {
        int cInBucket = lpdsl->rgiScores[iBucket];

        if (cRemaining <= cInBucket)
            return (iBucket + iRound) * SCORE_BLOCK_SIZE;

        cRemaining -= cInBucket;
    }

    return iRound * SCORE_BLOCK_SIZE;
}
#if defined(_DEBUG)
/*
 * DumpDocScoreList (debug builds only)
 *   Prints every non-empty bucket of the score list (bucket index and
 *   document count) followed by the total number of documents counted.
 *
 *   All NUM_SCORE_BLOCKS + 1 buckets are visited. The top bucket, index
 *   NUM_SCORE_BLOCKS, is where scores at or near MAX_WEIGHT land, so it
 *   must not be left out of the dump or the total.
 *
 *   pSrchInfo is accepted for interface compatibility and is not used.
 *   Always returns TRUE.
 */
BOOL DumpDocScoreList(_LPDSL lpdsl, PSRCHINFO pSrchInfo)
{
    int iBucket;
    int cTotal = 0;

    (void)pSrchInfo;

    for (iBucket = 0; iBucket <= NUM_SCORE_BLOCKS; iBucket++)
    {
        int cInBucket = lpdsl->rgiScores[iBucket];

        if (cInBucket)
        {
            _DPF2("Score %d (count %d)\n", iBucket, cInBucket);
        }
        cTotal += cInBucket;
    }
    _DPF1("%d topics in scorelist\n", cTotal);

    return TRUE;
}
#endif
/*
 * UpdateDocScoreList
 *   Records that a document now has score iScore.
 *
 *   iOldScore is the score the document previously had, or a negative
 *   value (callers pass -1) when the document is new. A previous score only
 *   counts as "already tracked" when it is non-negative and its bucket is
 *   not below the lowest occupied bucket.
 *
 *   While the list has spare capacity the document is always recorded: a
 *   tracked document moves from its old bucket to the new one, anything
 *   else consumes one unit of capacity.
 *
 *   Once the list is full:
 *     - a score above the lowest bucket is recorded; it replaces the
 *       document's own old entry if tracked, otherwise one entry of the
 *       lowest bucket, and the lowest bucket is advanced if that emptied it;
 *     - a score inside the lowest bucket changes no counts but is still
 *       reported as accepted;
 *     - a score below the lowest bucket is rejected.
 *
 *   Returns TRUE if the score was accepted, FALSE if it was rejected.
 *
 *   Notes on the fixes relative to the earlier implementation:
 *     - -1 / SCORE_BLOCK_SIZE truncates to 0, so a new document used to be
 *       mistaken for an update of bucket 0 whenever the lowest bucket was
 *       0 or still unset: bucket 0 was decremented and no capacity was
 *       consumed. The sign of iOldScore is now tested explicitly.
 *     - A list created with zero capacity is "full" while iBucketLowest is
 *       still -1; the old code then indexed rgiScores[-1]. Such a list now
 *       only accumulates counts and never evicts.
 */
BOOL UpdateDocScoreList(_LPDSL lpdsl, int iOldScore, int iScore)
{
    int iThis = iScore / SCORE_BLOCK_SIZE;
    int iOld = iOldScore / SCORE_BLOCK_SIZE;
    BOOL fOldTracked = (iOldScore >= 0 && iOld >= lpdsl->iBucketLowest);
    BOOL fAccepted;

    if (lpdsl->cScoresLeft <= 0)
    {
        // already full, figure out which buckets need updating
        if (iThis > lpdsl->iBucketLowest)
        {
            // if we're updating an existing entry, remove that
            // otherwise remove the lowest one (if there is one)
            if (fOldTracked)
                lpdsl->rgiScores[iOld]--;
            else if (lpdsl->iBucketLowest >= 0)
                lpdsl->rgiScores[lpdsl->iBucketLowest]--;

            // then make sure lowest one is still non-empty; if not,
            // revise upwards, but never past the bucket being added
            if (lpdsl->iBucketLowest >= 0
                && lpdsl->rgiScores[lpdsl->iBucketLowest] <= 0)
            {
                for (lpdsl->iBucketLowest++; lpdsl->iBucketLowest <= iThis;
                     lpdsl->iBucketLowest++)
                {
                    if (lpdsl->rgiScores[lpdsl->iBucketLowest])
                        break;
                }
                if (lpdsl->iBucketLowest > iThis)
                    lpdsl->iBucketLowest = iThis;
            }

            // then add the new entry
            lpdsl->rgiScores[iThis]++;
            fAccepted = TRUE;
        }
        else
        {
            // equal to the lowest bucket: counts unchanged but accepted;
            // below it: rejected
            fAccepted = (iThis == lpdsl->iBucketLowest);
        }
    }
    else
    {
        // doc score list is not yet full, so automatically add if new,
        // remove old if update
        if (fOldTracked)
            lpdsl->rgiScores[iOld]--;
        else
            lpdsl->cScoresLeft--;

        if (lpdsl->iBucketLowest < 0 || iThis < lpdsl->iBucketLowest)
            lpdsl->iBucketLowest = iThis;

        lpdsl->rgiScores[iThis]++;
        fAccepted = TRUE;
    }

    if (fAccepted && iScore > lpdsl->iHighestScore)
        lpdsl->iHighestScore = iScore;

#if defined(_DEBUG) && defined(_DUMPALL)
    //DumpDocScoreList(lpdsl, NULL);
#endif
    Assert(lpdsl->rgiScores[lpdsl->iHighestScore/SCORE_BLOCK_SIZE] >= 0);
    return fAccepted;
}
/*
 * IsDocScoreListFull
 *   Returns non-zero when the list has no remaining capacity.
 */
__inline BOOL IsDocScoreListFull(_LPDSL lpdsl)
{
    if (lpdsl->cScoresLeft > 0)
        return 0;

    return 1;
}
/*
 * AddWeights
 *   Saturating addition of two weights: returns w1 + w2, clamped to
 *   MAX_WEIGHT.
 */
__inline WORD AddWeights(DWORD w1, DWORD w2)
{
    DWORD dwSum = w1 + w2;

    if (MAX_WEIGHT < dwSum)
        return (WORD)MAX_WEIGHT;

    return (WORD)dwSum;
}
/*************************************************************************
 *  @doc    EXTERNAL API RETRIEVAL
 *
 *  @func   LPHL FAR PASCAL | MVIndexFindSimilar |
 *      Given a query which probably represents a document text stream,
 *      returns a hitlist containing the topics determined to be similar
 *      to the query, using nearest-neighbour searching.
 *
 *  @parm   _LPIDX | lpidx |
 *      Pointer to index information
 *
 *  @parm   LPPARSE_PARMS | lpParms |
 *      Parse parameters: query buffer and length, breaker table and
 *      optional group. The query is tokenized internally.
 *
 *  @parm   PSRCHINFO | pSrchInfo |
 *      Pointer to search information data
 *
 *  @parm   _LPGROUP | lpResGroup |
 *      Group that receives one bit per returned topic when
 *      QUERYRESULT_GROUPCREATE is set (may be NULL)
 *
 *  @parm   LPVOID | pCallback |
 *      Pointer to callback struct FCALLBACK_MSG (optional)
 *
 *  @parm   PHRESULT | phr |
 *      Pointer to error buffer
 *
 *  @rdesc  Pointer to a hitlist structure on success, even if there are
 *      no hits (use MVHitListEntries() to find out how many hits were
 *      returned). NULL on failure, with the cause stored in the error
 *      buffer. There is one special case in which a non-NULL pointer is
 *      returned although an error was recorded: the result could not be
 *      written to disk and everything is still in memory.
 *
 *  bugbug: handle wildcards
 *************************************************************************/
PUBLIC LPHL EXPORT_API FAR PASCAL MVIndexFindSimilar (_LPIDX lpidx,
    LPPARSE_PARMS lpParms, PSRCHINFO pSrchInfo, _LPGROUP lpResGroup,
    LPVOID pCallback, PHRESULT phr)
{
    HRESULT   hrStatus;         // Outcome of this function
    LPRETV    lpRetV;           // Retrieval memory/files
    GHANDLE   hRetv;            // Handle backing lpRetV
    _LPHL     lphl;             // Hitlist being built
    _LPQTNODE lpTreeTop;        // Dummy node collecting all term results
    HANDLE    hTreeTop = NULL;  // Handle backing lpTreeTop
    _LPQT     lpqt;             // Tokenized query

    /* Reject bad arguments */
    if (lpidx == NULL || lpParms == NULL || pSrchInfo == NULL)
    {
        SetErrCode (phr, E_INVALIDARG);
        return NULL;
    }

    /* Tokenize the query; on failure the error buffer is already set */
    lpqt = TokenizeFlatQuery(lpParms, pSrchInfo, phr);
    if (lpqt == NULL)
        return NULL;

    hrStatus = E_FAIL;          // Assume things will go wrong

    /* Transfer all the information about the index to the query tree */
    lpqt->foIdxRoot = lpidx->ih.foIdxRoot;      /* Top node offset */
    lpqt->dwBlockSize = lpidx->ih.dwBlockSize;  /* Index block size */
    lpqt->cIdxLevels = lpidx->ih.cIdxLevels;    /* Index's depth */
    lpqt->occf = lpidx->ih.occf;
    lpqt->idxf = lpidx->ih.idxf;
    lpqt->foIdxRoot = lpidx->ih.foIdxRoot;
    lpqt->ckeyTopicId = lpidx->ih.ckeyTopicId;
    lpqt->ckeyOccCount = lpidx->ih.ckeyOccCount;
    lpqt->ckeyWordCount = lpidx->ih.ckeyWordCount;
    lpqt->ckeyOffset = lpidx->ih.ckeyOffset;

    if (pSrchInfo->dwMemAllowed)
    {
        /* Size the document result list from the memory budget; no
         * occurrence info is returned for a similarity query */
        SetBlockCount (lpqt->lpTopicMemBlock, (WORD)(pSrchInfo->dwMemAllowed /
            (sizeof(TOPIC_LIST) * cTOPIC_PER_BLOCK)));
        SetBlockCount (lpqt->lpOccMemBlock, 1);
    }

    if (pCallback)
        MVSearchSetCallback(lpqt, pCallback);

    /* Allocate the hitlist */
    lphl = (_LPHL)GlobalLockedStructMemAlloc(sizeof (HL));
    if (lphl == NULL)
    {
        hrStatus = E_OUTOFMEMORY;
        SetErrCode(phr, hrStatus);
        goto FreeQuery;
    }
    lphl->lLastTopicId = 0xffffffff;
    lphl->lcMaxTopic = lpidx->ih.lcTopics;

    /* Allocate the return value structure */
    hRetv = _GLOBALALLOC(GMEM_MOVEABLE | GMEM_ZEROINIT, sizeof(RETV));
    if (hRetv == NULL)
    {
        SetErrCode(phr, E_OUTOFMEMORY);
        goto FreeQuery;
    }
    lpRetV = (LPRETV)_GLOBALLOCK(hRetv);
    lpRetV->lpqt = lpqt;

    hrStatus = TopNodeRead(lpidx);
    if (hrStatus != S_OK)
    {
        SetErrCode (phr, hrStatus);
        goto FreeRetv;
    }

    /* No occurrence records are kept for a similarity query, and the
     * results are always ranked */
    lpqt->dwOccSize = lpRetV->dwOccSize = 0;
    lpRetV->fRank = TRUE;

    /* Point the leaf and data readers at the shared buffers and the
     * index file to read from */
    lpRetV->LeafInfo.pTopNode = lpidx->lrgbTopNode;
    lpRetV->LeafInfo.pStemNode = lpRetV->pNodeBuf;
    lpRetV->LeafInfo.pLeafNode = lpRetV->pNodeBuf;
    lpRetV->LeafInfo.pDataNode = lpRetV->pDataBuf;
    lpRetV->LeafInfo.hfpbIdx = lpidx->hfpbIdxSubFile;

    lpRetV->DataInfo.pTopNode = lpidx->lrgbTopNode;
    lpRetV->DataInfo.pStemNode = lpRetV->pNodeBuf;
    lpRetV->DataInfo.pLeafNode = lpRetV->pNodeBuf;
    lpRetV->DataInfo.pDataNode = lpRetV->pDataBuf;
    lpRetV->DataInfo.hfpbIdx = lpidx->hfpbIdxSubFile;

    lpRetV->lcid = lpidx->ih.lcid;

    /* Save the search information; dwValue becomes a topic count */
    lpRetV->SrchInfo = *pSrchInfo;
    if (pSrchInfo->dwValue == 0)
        lpRetV->SrchInfo.dwValue = (DWORD)(-1);
    else
        lpRetV->SrchInfo.dwValue = lpidx->ih.lcTopics/pSrchInfo->dwValue;

    /* This is a dummy node that we pass in to hold all term results */
    hTreeTop = _GLOBALALLOC(GHND, sizeof (QTNODE));
    lpTreeTop = (_LPQTNODE)_GLOBALLOCK(hTreeTop);
    if (lpTreeTop == NULL)
    {
        SetErrCode(phr, hrStatus = E_OUTOFMEMORY);
        goto FreeRetv;
    }
    QTN_FLAG(lpTreeTop) = EXACT_MATCH;
    lpTreeTop->pNext = NULL;
    lpTreeTop->pPrev = NULL;
    lpTreeTop->lpTopicList = NULL;

    hrStatus = ResolveFlatQuery(lpqt, lpTreeTop, lpRetV);
    if (hrStatus != S_OK)
    {
        SetErrCode (phr, hrStatus);

        /* Free the Topic and Occurrence memory blocks since they are
         * not freed by QueryTreeFree(), or MVHitListDispose() at this
         * point. E_TOOMANYTOPICS is not fatal: the partial result is
         * still returned. */
        if (hrStatus != E_TOOMANYTOPICS)
        {
            BlockFree ((LPV)lpqt->lpTopicMemBlock);
            BlockFree ((LPV)lpqt->lpOccMemBlock);
            lpqt->lpTopicMemBlock = NULL;
            lpqt->lpOccMemBlock = NULL;
            goto FreeTreeTop;
        }
    }

    /* Create a group if requested */
    if ((pSrchInfo->Flag & QUERYRESULT_GROUPCREATE) && lpResGroup)
    {
        LPITOPIC lpCurTopic;    /* Topic's current pointer */
        LPB lpbGrpBitVect = lpResGroup->lpbGrpBitVect;
        DWORD maxTopicId = lpResGroup->dwSize * 8;

        lpCurTopic = QTN_TOPICLIST(lpTreeTop);
        while (lpCurTopic)
        {
            /* Set the bit, ignoring topics beyond the group's range */
            if (lpCurTopic->dwTopicId < maxTopicId)
            {
                lpbGrpBitVect[(DWORD)(lpCurTopic->dwTopicId / 8)] |=
                    1 << (lpCurTopic->dwTopicId % 8);
            }
            lpCurTopic = lpCurTopic->pNext;
        }
    }

    /* Sort the result depending on ranking or not */
    if ((pSrchInfo->Flag & QUERYRESULT_UIDSORT) == 0)
    {
        if (lpRetV->fRank)
            SortResult ((LPQT)lpqt, lpTreeTop, WEIGHT_BASED);
        else
            SortResult ((LPQT)lpqt, lpTreeTop, HIT_COUNT_BASED);
    }

    /* Update HitList info structure, cut off the unwanted list */
    lphl->lpTopicList = lpTreeTop->lpTopicList;
    if (lphl->lpTopicList)
    {
        lphl->lcReturnedTopics = lphl->lcTotalNumOfTopics = lpTreeTop->cTopic;
    }

    /* Only return the number of topics that the user requested;
     * dwTopicCount == 0 means that the user wants all of them */
    if (pSrchInfo->dwTopicCount != 0 &&
        pSrchInfo->dwTopicCount < lphl->lcReturnedTopics)
    {
        lphl->lcReturnedTopics = pSrchInfo->dwTopicCount;
    }

    lphl->lpOccMemBlock = lpqt->lpOccMemBlock;
    lphl->lpTopicMemBlock = lpqt->lpTopicMemBlock;

#if 1
    /* WARNING: The following code should be commented out for
     * diskless devices. A flush failure is only recorded in the error
     * buffer, since if disk writes fail, everything is still in memory */
    if ((pSrchInfo->Flag & QUERYRESULT_IN_MEM) == 0)
    {
        hrStatus = MVHitListFlush (lphl, lphl->lcReturnedTopics);
        if (hrStatus != S_OK)
            SetErrCode (phr, hrStatus);
    }
#endif

    hrStatus = S_OK;

FreeTreeTop:
    if (hTreeTop)
    {
        _GLOBALUNLOCK(hTreeTop);
        _GLOBALFREE(hTreeTop);
    }

FreeRetv:
    FreeHandle(hRetv);

FreeQuery:
    if (lpqt)
    {
        FreeDocScoreList(lpqt->lpDocScores);
        MVQueryFree(lpqt);
    }

    /* The hitlist only survives a successful (or truncated) search */
    if (lphl && hrStatus != S_OK && hrStatus != E_TOOMANYTOPICS)
    {
        MVHitListDispose(lphl);
        lphl = NULL;
    }
    return (LPHL)lphl;
}
/*************************************************************************
 *  @doc    INTERNAL
 *
 *  @func   LPQT | TokenizeFlatQuery |
 *      Breaks the query text into words with the breaker supplied in
 *      lpParms and returns a newly allocated query tree. Each word is
 *      reported to FFlatCallBack. A document score list sized by
 *      pSrchInfo->dwTopicCount is attached to the tree.
 *
 *  @parm   LPPARSE_PARMS | lpParms |
 *      Query buffer and length, breaker function table, optional group
 *
 *  @parm   PSRCHINFO | pSrchInfo |
 *      Search information (only dwTopicCount is read here)
 *
 *  @parm   PHRESULT | phr |
 *      Pointer to error buffer
 *
 *  @rdesc  The query tree on success. NULL on failure, with the error
 *      buffer set; every temporary and the partially built tree are
 *      released on all failure paths.
 *************************************************************************/
PRIVATE LPQT TokenizeFlatQuery(LPPARSE_PARMS lpParms, PSRCHINFO pSrchInfo,
    PHRESULT phr)
{
    HRESULT   fRet = E_FAIL;        // Return value
    HANDLE    hqi = NULL;           // Handle to "lpQueryInfo"
    HANDLE    hibi = NULL;          // Handle to internal breaker info
    HANDLE    hQuery = NULL;        // Handle to secondary query buffer
    LPQI      lpQueryInfo = NULL;   // Query information
    LPIBI     lpibi;                // Pointer to internal breaker info
    LPB       lpbQueryBuf;          // Copy of query's buffer
    _LPQT     lpQueryTree = NULL;   // Query tree pointer
    BRK_PARMS brkParms;             // Breaker info parms
    LPCHARTAB lpCharTabInfo;        // Pointer to character table's info

    /* LPPARSE_PARMS structure break-out variables */
    BYTE FAR CONST *lpbQuery;       // Query buffer
    DWORD     cbQuery;              // Query length
    LPBRKLIST lpfnTable;            // DType function table
    LPGROUP   lpGroup;              // Group

    lpbQuery = lpParms->lpbQuery;
    cbQuery = lpParms->cbQuery;
    lpfnTable = lpParms->lpfnTable;
    lpGroup = lpParms->lpGroup;

    if (lpfnTable == NULL)
    {
        SetErrCode(phr, E_BADBREAKER);
        return NULL;
    }

    if (cbQuery == 0 || lpbQuery == NULL)
    {
        SetErrCode(phr, E_NULLQUERY);
        return NULL;
    }

    if ((hqi = (GHANDLE)_GLOBALALLOC(GMEM_MOVEABLE | GMEM_ZEROINIT,
        (LCB)sizeof(QUERY_INFO))) == NULL)
    {
        fRet = SetErrCode(phr, E_OUTOFMEMORY);
        goto ErrFreeAll;
    }
    lpQueryInfo = (LPQI)_GLOBALLOCK(hqi);
    lpQueryInfo->lperrb = phr;
    lpQueryInfo->lpOpSymTab = NULL;     // not used for similarity
    lpQueryInfo->cOpEntry = 0;

    /* Allocate a breaker info block used by different breakers */
    if ((hibi = (GHANDLE)_GLOBALALLOC(GMEM_MOVEABLE | GMEM_ZEROINIT,
        (LCB)sizeof(IBI))) == NULL)
    {
        fRet = SetErrCode(phr, E_OUTOFMEMORY);
        goto ErrFreeAll;
    }
    lpibi = (LPBRKI)_GLOBALLOCK(hibi);

    /* Set the default breaker function, and stop list */
#ifndef CW
    lpQueryInfo->lpfnBreakFunc = lpfnTable[0].lpfnBreakFunc;
#endif
    lpQueryInfo->lpStopListInfo = lpfnTable[0].lpStopListInfo;

    if ((lpCharTabInfo = lpQueryInfo->lpCharTab =
        lpfnTable[0].lpCharTab) == NULL)
    {
        /* Default character and ligature tables */
        lpCharTabInfo = lpQueryInfo->lpCharTab = MVCharTableGetDefault (phr);
        if (lpCharTabInfo == NULL)
        {
            fRet = SetErrCode(phr, E_NOHANDLE);
            goto ErrFreeAll;
        }
        /* We own the default table and must dispose of it on exit */
        lpQueryInfo->fFlag |= FREE_CHARTAB;
    }

    /* Classify '*' and '?' as wildcard characters */
    ((LPCMAP)lpCharTabInfo->lpCMapTab)['*'].Class = CLASS_WILDCARD;
    ((LPCMAP)lpCharTabInfo->lpCMapTab)['?'].Class = CLASS_WILDCARD;

    switch (lpCharTabInfo->fFlag)
    {
        case USE_DEF_LIGATURE:
            lpCharTabInfo->wcLigature = DEF_LIGATURE_COUNT;
            lpCharTabInfo->lpLigature = LigatureTable;
            break;

        case NO_LIGATURE:
            lpCharTabInfo->wcLigature = 0;
            lpCharTabInfo->lpLigature = NULL;
    }

    // not used for similarity
    lpQueryInfo->lpStack = NULL;

    /* Allocate a query tree */
    if ((lpQueryTree = (_LPQT)QueryTreeAlloc()) == NULL)
    {
        fRet = SetErrCode(phr, E_OUTOFMEMORY);
        goto ErrFreeAll;
    }

    /* Associate the query tree with the query. In the future, this will
     * ensure the capability to have several queries and query trees
     * at once */
    lpQueryInfo->lpQueryTree = (LPQT)lpQueryTree;

    /* Default arguments */
    lpQueryTree->iDefaultOp = (BYTE)OR_OP;
    lpQueryTree->lpGroup = lpGroup;         // Use default Group
    lpQueryTree->dwFieldId = 0;//DW_NIL_FIELD; // No fieldid search
    lpQueryTree->cStruct.dwKey = CALLBACKKEY;
    lpQueryTree->fFlag = 0;
    lpQueryTree->wProxDist = 0;

    if (NULL == (lpQueryTree->lpDocScores =
        InitDocScoreList(pSrchInfo->dwTopicCount)))
    {
        fRet = SetErrCode(phr, E_OUTOFMEMORY);
        goto ErrFreeAll;
    }

    /* Copy the query into a temporary buffer since we are going to make
     * changes to it. On failure go through the common cleanup so that the
     * breaker info, char table and query tree are released as well. */
    if ((hQuery = _GLOBALALLOC(DLLGMEM_ZEROINIT, (LCB)cbQuery + 2)) == NULL)
    {
        fRet = SetErrCode(phr, E_OUTOFMEMORY);
        goto ErrFreeAll;
    }
    lpbQueryBuf = lpQueryInfo->lpbQuery = (LPB)_GLOBALLOCK(hQuery);
    lpbQueryBuf[cbQuery] = ' ';     // Add a space to help LowLeveltransformation
    lpbQueryBuf[cbQuery + 1] = 0;   // Zero-terminated string (safety bytes)
    MEMCPY(lpbQueryBuf, lpbQuery, cbQuery);

    //
    // Word-break between here and there.
    //
    brkParms.lpInternalBreakInfo = lpibi;
    brkParms.lpbBuf = lpbQueryBuf;
    brkParms.cbBufCount = cbQuery;
    brkParms.lcbBufOffset = 0;
    brkParms.lpvUser = lpQueryInfo;
    brkParms.lpfnOutWord = (FWORDCB)FFlatCallBack;
    brkParms.lpStopInfoBlock = lpQueryInfo->lpStopListInfo;
    brkParms.lpCharTab = lpQueryInfo->lpCharTab;
    brkParms.fFlags = ACCEPT_WILDCARD;

    /* Report the breaker's error code unmodified, as the flush call
     * below does */
    if ((fRet = (*lpQueryInfo->lpfnBreakFunc)((LPBRK_PARMS)&brkParms))
        != S_OK)
    {
        fRet = SetErrCode(phr, fRet);
        goto ErrFreeAll;
    }

    /* Flush the word breaker */
    brkParms.lpbBuf = NULL;
    brkParms.cbBufCount = 0;

    if ((fRet = (*lpQueryInfo->lpfnBreakFunc)((LPBRK_PARMS)&brkParms))
        != S_OK)
    {
        fRet = SetErrCode(phr, fRet);
        goto ErrFreeAll;
    }

    /* Set the position of pointer to report missing term at
     * the end of the query. -1 since the offset starts at 0 */
    lpQueryInfo->dwOffset = cbQuery - 1;

    fRet = S_OK;

ErrFreeAll:
    /* Free the charmap table. lpQueryInfo is still NULL when the very
     * first allocation failed. */
    if (lpQueryInfo && (lpQueryInfo->fFlag & FREE_CHARTAB))
        MVCharTableDispose (lpQueryInfo->lpCharTab);

    /* Free query info */
    if (hqi)
    {
        FreeHandle(hqi);
    }

    /* Free internal breaker info */
    if (hibi)
    {
        FreeHandle(hibi);
    }

    /* Free internal query buffer info */
    if (hQuery)
    {
        FreeHandle(hQuery);
    }

    if (fRet == S_OK)
        return lpQueryTree;

    /* Failure: tear down whatever part of the query tree was built */
    if (lpQueryTree)
    {
        BlockFree(lpQueryTree->lpStringBlock);
        BlockFree(lpQueryTree->lpWordInfoBlock);
        BlockFree(lpQueryTree->lpOccMemBlock);
        BlockFree(lpQueryTree->lpTopicMemBlock);
        BlockFree(lpQueryTree->lpNodeBlock);

        FreeDocScoreList(lpQueryTree->lpDocScores);

        /* Free Query tree block */
        FreeHandle ((HANDLE)lpQueryTree->cStruct.dwReserved);
    }
    return NULL;
}
/*************************************************************************
 *  @doc    INTERNAL
 *
 *  @func   HRESULT FAR PASCAL | ProcessTerm |
 *      Reads the posting data of one term from the index and folds it
 *      into the running per-topic scores. For every topic in the posting
 *      the term weight is added to the topic's accumulated weight; topics
 *      whose best possible final score cannot reach the current minimum
 *      of a full document score list are dropped (or never allocated).
 *
 *  @parm   _LPQT | lpqt |
 *      Query tree; supplies index parameters, the interrupt state and
 *      the document score list
 *
 *  @parm   LPRETV | lpRetV |
 *      Pointer to "globals" (data buffers, query tree)
 *
 *  @parm   _LPQTNODE | lpResQuery |
 *      Node that accumulates the resulting topic list. If NULL,
 *      lpQtNode is used.
 *
 *  @parm   _LPQTNODE | lpQtNode |
 *      Current node in the query tree containing important data
 *          - The number of topics (reset to 0 on entry)
 *          - The location of the data
 *          - The size of the data
 *          - The term token (importance and remaining weight)
 *
 *  @parm   STRING_TOKEN FAR * | lpToken |
 *      Token whose dwTopicCount is used to derive the default term
 *      weight. If NULL, the node's own topic count is used.
 *
 *  @rdesc  S_OK, E_INTERRUPT, E_TOOMANYTOPICS, or an error returned by
 *      the data readers or the callback
 *************************************************************************/
PUBLIC HRESULT EXPORT_API FAR PASCAL ProcessTerm(_LPQT lpqt, LPRETV lpRetV,
    _LPQTNODE lpResQuery, _LPQTNODE lpQtNode, STRING_TOKEN FAR *lpToken)
{
    DWORD dwTopicIDDelta;   // Topic-ID delta from previous sub-list.
    DWORD dwOccs;           // Number of occurences in this sub-list.
    DWORD dwTmp;            // Scratch variable.
    DWORD dwDivisor;        // Topic count used for the default weight.
    WORD  wWeight;          // Term-weight associated with this sub-list.
    WORD  wWeightMax;       // Upper bound on the topic's final score.
    DWORD dwTopicID;        // TopicId
    WORD  wImportance;      // Importance of the term within the query.
    TOPIC_LIST FAR *lpResTopicList; // Result TopicList
    HRESULT fRet;           // Returned value
    PNODEINFO pDataInfo;
    DWORD dwTopicCount;
    _LPQT lpQueryTree;      // Query tree
    OCCF occf;
    _LPDSL lpDocScores = (_LPDSL)(lpqt->lpDocScores);

    pDataInfo = &lpRetV->DataInfo;
    if ((pDataInfo->dwDataSizeLeft = lpQtNode->cbData) == 0)
        return(S_OK);       // There is nothing to process

    // Initialize variables
    occf = lpqt->occf;
    wImportance = QTN_TOKEN(lpQtNode)->wWeight;
    lpResTopicList = NULL;
    lpQueryTree = lpRetV->lpqt;
    dwTopicCount = lpQtNode->cTopic;

    // Default weight: the rarer the term, the heavier it is. Guard the
    // division: fall back to the node's own count when the token's count
    // is zero, and to weight 0 when both are zero (in which case the
    // posting loop below does not run anyway).
    dwDivisor = (lpToken && lpToken->dwTopicCount) ?
        lpToken->dwTopicCount : dwTopicCount;
    wWeight = (WORD)(dwDivisor ? 65535L / dwDivisor : 0);

    // Reset the topic count for lpQtNode so that it will not affect the
    // result in case that lpResQuery == NULL
    lpQtNode->cTopic = 0;
    if (lpResQuery == NULL)
        lpResQuery = lpQtNode;

    // Initialize the data buffer node values
    pDataInfo->pBuffer = pDataInfo->pDataNode;
    pDataInfo->nodeOffset = lpQtNode->foData;

    // Read the data block
    if ((fRet = ReadNewData(pDataInfo)) != S_OK)
        return(fRet);

    dwTopicID = 0L;

    // for each document in posting
    for (; dwTopicCount; dwTopicCount--)
    {
        /* Check for interrupt now and then */
        if ((++lpqt->cInterruptCount) == 0)
        {
            if (lpqt->fInterrupt == E_INTERRUPT)
                return E_INTERRUPT;
            if (*lpqt->cStruct.Callback.MessageFunc &&
                (fRet = (*lpqt->cStruct.Callback.MessageFunc)(
                    lpqt->cStruct.Callback.dwFlags,
                    lpqt->cStruct.Callback.pUserData, NULL)) != S_OK)
                return(fRet);
        }

        // Byte align
        if (pDataInfo->ibit != cbitBYTE - 1)
        {
            pDataInfo->ibit = cbitBYTE - 1;
            pDataInfo->pCurPtr ++;
        }

        // Get value from which I will calculate current doc-ID.
        if ((fRet = FGetDword(pDataInfo, lpqt->ckeyTopicId,
            &dwTopicIDDelta)) != S_OK)
            return fRet;

        dwTopicID += dwTopicIDDelta;

        //
        // Get term-weight if present. I'm going to get this
        // even if I'm not doing ranking, because it's in the
        // index, and I have to get around it somehow.
        //
        if (lpqt->idxf & IDXF_NORMALIZE)
        {
            if ((fRet = FGetBits(pDataInfo, &dwTmp,
                sizeof (USHORT) * cbitBYTE)) != S_OK)
                return fRet;

            if (wImportance != MAX_WEIGHT)
                dwTmp = (dwTmp * wImportance) / 65535;

            // BUGBUG: we actually want the weights for all aliased terms
            // to be considered at once.
            wWeight = (WORD)dwTmp;
        }

        // always skip any occurrence info
        if (occf & (OCCF_OFFSET | OCCF_COUNT))
        {
            // Figure out how many occurences there are in this
            // sub-list.
            if ((fRet = FGetDword(pDataInfo, lpqt->ckeyOccCount,
                &dwOccs)) != S_OK)
                return fRet;

            if ((fRet = SkipOccList (lpqt, pDataInfo, dwOccs)) != S_OK)
                return fRet;
        }

        // If this search includes a group, and the doc is not in the
        // group then ignore it
        if (lpQueryTree->lpGroup
            && FGroupLookup(lpQueryTree->lpGroup, dwTopicID) == FALSE)
            continue;

        // calculate relevance upper bound Dr = Ds + sum(Qi) for this document
        if (lpResTopicList = TopicNodeSearch(lpQueryTree, lpResQuery, dwTopicID))
            wWeightMax = lpResTopicList->wWeight;
        else
            wWeightMax = 0;

        wWeightMax = AddWeights(wWeightMax, wWeight);
        wWeightMax = AddWeights(wWeightMax, QTN_TOKEN(lpQtNode)->wWeightRemain);

        if (wWeightMax < GetMinDocScore(lpDocScores, ROUND_DOWN)
            && IsDocScoreListFull(lpDocScores))
        {
            // do not alloc/ or remove D from result list if present
            if (lpResTopicList)
            {
                register LPITOPIC lpPrev, lpTmp;

                // find lpPrev in the list the node actually lives in: it
                // was found in, and is freed from, lpResQuery's topic list
                // UNDONE: look into removing necessity for this loop
                for (lpPrev = NULL, lpTmp = (LPITOPIC)lpResQuery->lpTopicList;
                     lpTmp; lpTmp = lpTmp->pNext)
                {
                    if (lpTmp == (LPITOPIC)lpResTopicList)
                        break;
                    lpPrev = lpTmp;
                }

                TopicNodeFree(lpQueryTree, lpResQuery, lpPrev, lpResTopicList);
#if defined(_DEBUG) && defined(_DUMPALL)
                _DPF3("Remove topic %lu, wWeightMax = %lu, MinDocScore = %u\n", dwTopicID,
                    wWeightMax, GetMinDocScore(lpDocScores, ROUND_DOWN));
#endif
            }
            // no need to update top-N docs since this wasn't one of them
            continue;
        }

        if (lpResTopicList)
        {
            WORD wOldWeight = lpResTopicList->wWeight;

            // Calc new Ds for this doc and if good enough for the club, ensure that
            // club invariant is maintained, else leave it since it could still become
            // a club member in the future
            lpResTopicList->wWeight = AddWeights(lpResTopicList->wWeight, wWeight);
            if (lpResTopicList->wWeight > GetMinDocScore(lpDocScores, ROUND_DOWN))
                UpdateDocScoreList(lpDocScores, wOldWeight, lpResTopicList->wWeight);

#if defined(_DEBUG) && defined(_DUMPALL)
            _DPF3("Update topic %lu, wWeightMax = %lu, wWeight = %u\n", dwTopicID,
                wWeightMax, lpResTopicList->wWeight);
#endif
            continue;
        }

        // a new document counter: possible club member, or not enough
        // total documents yet
        if ((lpResTopicList = TopicNodeAllocate(lpQueryTree)) == NULL)
            return E_TOOMANYTOPICS;

        lpResTopicList->dwTopicId = dwTopicID;
        lpResTopicList->lpOccur = NULL;
        lpResTopicList->lcOccur = 0;
        lpResTopicList->wWeight = wWeight;

        /* Add the new TopicID node into TopicList */
        TopicNodeInsert (lpQueryTree, lpResQuery, lpResTopicList);
        UpdateDocScoreList(lpDocScores, -1, lpResTopicList->wWeight);

#if defined(_DEBUG) && defined(_DUMPALL)
        _DPF3("New topic %lu, wWeightMax = %lu, wWeight = %u\n", dwTopicID,
            wWeightMax, lpResTopicList->wWeight);
#endif
    } // end for each topic in posting

    return S_OK;
}
/*
 * ResolveFlatQuery
 *   Resolves a tokenized similarity query in two phases.
 *
 *   Phase 1 looks up the word info of every term (GetWordInfoList),
 *   unlinks terms that have none, and stores in each remaining term the
 *   total topic count over all of its word-info entries (e.g. aliases).
 *
 *   Phase 2 sorts the terms (SortStringWeights) and feeds each word-info
 *   entry of each term to ProcessTerm, accumulating results in
 *   lpCurQtNode. Processing stops early once the remaining terms can no
 *   longer change which documents make up the fully calculated top of the
 *   list.
 *
 *   On return lpCurQtNode->cTopic holds the accumulated topic count.
 *   Returns S_OK, or the first error from GetWordInfoList / ProcessTerm.
 */
PRIVATE HRESULT PASCAL NEAR ResolveFlatQuery(_LPQT lpqt, _LPQTNODE lpCurQtNode,
    LPRETV lpRetV)
{
    HRESULT hr;
    ERRB errb;
    DWORD cTopicsTotal = 0;
    STRING_TOKEN FAR *lpTerm;           /* Term being examined */
    STRING_TOKEN FAR *lpKept = NULL;    /* Last term left in the list */
    _LPDSL lpScores = (_LPDSL)(lpqt->lpDocScores);
    LPWORDINFO lpwi;

    // first collect the word info for each token
    lpTerm = lpqt->lpStrList;
    while (lpTerm)
    {
        // accumulate the list of terms to have data read
        hr = GetWordInfoList(lpqt, lpTerm, lpCurQtNode, lpRetV);
        if (hr != S_OK)
            return SetErrCode (&errb, hr);

        if (lpTerm->lpwi)
        {
            // add up the topic counts of all the instances of this term's
            // lookalikes (e.g. multiple aliases), since we don't want to
            // treat aliases as rare, even though they may be.
            lpTerm->dwTopicCount = lpTerm->lpwi->cTopic;
            lpwi = lpTerm->lpwi->pNext;
            while (lpwi)
            {
                lpTerm->dwTopicCount += lpwi->cTopic;
                lpwi = lpwi->pNext;
            }
            lpKept = lpTerm;
        }
        else
        {
            // no word info was available: unlink the token. It won't get
            // freed until end of query, but it makes the rest of the
            // processing faster. lpKept stays unchanged when deleting.
            if (lpKept)
                lpKept->pNext = lpTerm->pNext;
            else
                lpqt->lpStrList = lpTerm->pNext;
        }

        lpTerm = lpTerm->pNext;
    }

    // sort string list by descending term rarity
    SortStringWeights(lpqt);

    for (lpTerm = lpqt->lpStrList; lpTerm; lpTerm = lpTerm->pNext)
    {
        WORD wUpperBound;
        int iCutoff;

        if (lpTerm->lpwi == NULL)
            continue;

#if defined(_DEBUG) && defined(_DUMPALL)
        {
            char szTemp[256];
            STRNCPY(szTemp, lpTerm->lpString + 2, *(LPWORD)lpTerm->lpString);
            szTemp[*(LPWORD)lpTerm->lpString] = 0;
            _DPF1("Term: '%s'\n", szTemp);
        }
#endif

        // We can terminate the query processing if the upper bound on the
        // smallest current doc score is lteq the current score of the R-th
        // biggest doc score, since any further computation will at most
        // result in a re-ordering of the bottom (N - R) documents.
        // However, this leaves the remaining documents only partially
        // sorted by relevancy, which may or may not be acceptable.
        wUpperBound = AddWeights(GetMinDocScore(lpScores, ROUND_UP),
            lpTerm->wWeightRemain);
        iCutoff = GetSortedDocScore(lpScores,
            (int)lpRetV->SrchInfo.dwTopicFullCalc, ROUND_DOWN);
        if (wUpperBound <= iCutoff)
            break;

        lpqt->lpTopicStartSearch = NULL;
        lpqt->lpOccStartSearch = NULL;

        QTN_TOKEN(lpCurQtNode) = lpTerm;

        lpwi = lpTerm->lpwi;
        while (lpwi)
        {
            // TO DO: replace with WORDINFO in curqt node
            lpCurQtNode->cTopic = lpwi->cTopic;
            lpCurQtNode->foData = lpwi->foData;
            lpCurQtNode->cbData = lpwi->cbData;
            lpCurQtNode->wRealLength = lpwi->wRealLength;

            hr = ProcessTerm(lpqt, lpRetV, NULL, lpCurQtNode, lpTerm);
            if (hr != S_OK)
            {
                // kevynct: no need to overwrite count on error since
                // we may be attempting to continue
                lpCurQtNode->cTopic += cTopicsTotal;
                return(hr);
            }

            // Accumulate the topic count, since cTopic will be destroyed
            // if there is more searches for this node (such as wildcard)
            cTopicsTotal += lpCurQtNode->cTopic;

            lpwi = lpwi->pNext;
        }
    }

    lpCurQtNode->cTopic = cTopicsTotal;
    return S_OK;
}
/*
 * MergeWordInfoCounts
 *   Adds the topic count of lpwiSrc into lpwiDest. lpwiSrc is not modified.
 */
__inline void MergeWordInfoCounts(WORDINFO FAR *lpwiDest, WORDINFO FAR *lpwiSrc)
{
    lpwiDest->cTopic = lpwiDest->cTopic + lpwiSrc->cTopic;
}
// adds zero or more WORDINFO nodes for the passed-in string
/*************************************************************************
 *  @doc    INTERNAL
 *
 *  @func   HRESULT | GetWordInfoList |
 *      Looks up one query token in the index b-tree. Each leaf entry
 *      that CompareTerm() reports as STRING_MATCH, that has a non-zero
 *      topic count, and that passes the large-query frequency cut-off
 *      is copied into a freshly allocated WORDINFO element which is
 *      pushed on the front of lpStrToken->lpwi.
 *
 *  @parm   _LPQT | lpqt |
 *      Query tree. Supplies the index root offset, the number of index
 *      levels, the block size, the occurrence flags, the WORDINFO
 *      allocation block and the interrupt / message-callback state.
 *
 *  @parm   STRING_TOKEN FAR * | lpStrToken |
 *      Token to look up. lpString is read as a length word followed by
 *      the characters starting at offset 2. Matches are linked into
 *      lpwi.
 *
 *  @parm   _LPQTNODE | lpCurQtNode |
 *      Query node handed to CompareTerm(). Its dwFieldId decides
 *      whether the field id stored with a leaf entry takes part in the
 *      comparison.
 *
 *  @parm   LPRETV | lpRetV |
 *      Retrieval work area: leaf node info, the b-tree word buffer, the
 *      modified-word buffer, the lead-byte table and the search flags.
 *
 *  @rdesc  S_OK when the scan ends, whether or not anything matched.
 *      E_OUTOFMEMORY if a WORDINFO element cannot be allocated,
 *      E_INTERRUPT if lpqt->fInterrupt is set to E_INTERRUPT, the
 *      message callback's code if it returns something other than
 *      S_OK, or the failure code of a node read.
 *************************************************************************/
PRIVATE HRESULT GetWordInfoList(_LPQT lpqt, STRING_TOKEN FAR *lpStrToken,
    _LPQTNODE lpCurQtNode, LPRETV lpRetV)
{
    PNODEINFO   pLeafInfo    = &lpRetV->LeafInfo;
    LPB         astBTreeWord = lpRetV->pBTreeWord;
    LPB         lstModified  = lpRetV->pModifiedWord;
    LPB         pBTreeWord;
    LST         lstSearchStr;
    LPB         pbCur;              // read cursor inside the current node
    LPB         pbEnd;              // first byte past the node's data
    int         cLevel;
    int         cMaxLevel;
    int         fCheckFieldId;
    int         f1stIsWild;
    int         iCh;                // byte index into a Pascal string
    int         nCmp;               // CompareTerm() verdict
    HRESULT     hr;
    DWORD       dwSkip;             // sink for values that are only skipped
    DWORD       dwFieldID;
    WORD        wLen;
    WORD        cByteMatched = 0;
    BYTE        fStemmed;
    ERRB        errb;
    WORDINFO    wi;
    LPWORDINFO  lpwi;

    // BUGBUG: we don't handle stemming for now, so this flag is never
    // raised. The stemming branches below are kept for when it is.
    fStemmed = 0;

    lstSearchStr = lpStrToken->lpString;
    f1stIsWild = (lstSearchStr[2] == WILDCARD_CHAR ||
                  lstSearchStr[2] == WILDCARD_STAR);

    // Stemming must be off whenever the token holds a wildcard character
    for (iCh = *((LPW)lstSearchStr) + 1; iCh >= 2; iCh--)
    {
        if (lstSearchStr[iCh] == '*' || lstSearchStr[iCh] == '?')
        {
            fStemmed = FALSE;
            break;
        }
    }

    // ... and for short words
    if (*(LPW)lstSearchStr < 3)
        fStemmed = FALSE;

    // Start the descent at the root of the index
    pLeafInfo->nodeOffset = lpqt->foIdxRoot;
    pLeafInfo->iLeafLevel = lpqt->cIdxLevels - 1;
    pLeafInfo->dwBlockSize = lpqt->dwBlockSize;

    // Working copy of the token, zero terminated for the wildcard search
    MEMCPY (lstModified, lstSearchStr, *((LPW)lstSearchStr) + sizeof (SHORT));
    lstModified [*((LPW)lstModified) + 2] = 0;
    pBTreeWord = lpRetV->pBTreeWord;

    /* Replace every '*' and '?' by 0 and shorten the stored length to
     * the text in front of it, so that the word compares correctly
     * against the entries of the directory nodes. The scan runs from
     * the end, so the length ends up at the first wildcard.
     */
    for (iCh = *((LPW)lstModified) + 1; iCh >= 2; iCh--)
    {
        if (lstModified[iCh] == '*' || lstModified[iCh] == '?')
        {
            lstModified[iCh] = 0;
            *(LPW)lstModified = iCh - 2;
        }
    }

    /* Field ids are compared only when the index stores them and the
     * query node names a field.
     */
    fCheckFieldId = (lpqt->occf & OCCF_FIELDID) &&
                    (lpCurQtNode->dwFieldId != DW_NIL_FIELD);

    astBTreeWord[0] = 0;
    cMaxLevel = lpqt->cIdxLevels - 1;

    /* Step 1: walk down the directory levels. A directory node is a
     * sequence of
     *    - Word: PASCAL string
     *    - Offset of the child node in the index file
     * At most one node is read per level.
     */
    for (cLevel = 0; cLevel < cMaxLevel ; cLevel++)
    {
        if ((hr = ReadStemNode ((PNODEINFO)pLeafInfo, cLevel)) != S_OK)
        {
            return SetErrCode (&errb, hr);
        }
        pbEnd = pLeafInfo->pMaxAddress;
        pbCur = pLeafInfo->pCurPtr;

        /* Advance until the b-tree word is >= the word looked for
         * (the first entry is taken at once when the token starts with
         * a wildcard). Running off the end of the node means there can
         * be no match for this term.
         */
        do
        {
            if (pbCur >= pbEnd)
                return S_OK;

            pbCur = ExtractWord(astBTreeWord, pbCur, &wLen);

            if (fStemmed &&
                (hr = FStem (pBTreeWord, astBTreeWord)) != S_OK)
                return(S_OK);

            /* Read in NodeId record */
            pbCur += ReadFileOffset (&pLeafInfo->nodeOffset, pbCur);
        }
        while (!f1stIsWild && StrCmpPascal2(lstModified, pBTreeWord) > 0);
    }

    /* pLeafInfo->nodeOffset now names the leaf that is supposed to
     * contain the searched word. Read it in.
     */
    if ((hr = ReadLeafNode ((PNODEINFO)pLeafInfo, cLevel)) != S_OK)
    {
        return hr;
    }
    pbCur = pLeafInfo->pCurPtr;
    pbEnd = pLeafInfo->pMaxAddress;

    /* Step 2: scan the leaf node(s) and capture the matching entries.
     * Some non-matching entries may have to be skipped first.
     */

    // Reset the word: the leaf comparison uses the unmodified token
    if (fStemmed)
    {
        MEMCPY (lstModified, lpRetV->pStemmedQueryWord,
            *(LPW)lpRetV->pStemmedQueryWord + sizeof(WORD));
    }
    else
    {
        MEMCPY (lstModified, lstSearchStr,
            *((LPW)lstSearchStr) + sizeof (SHORT));
    }

    for (;;)
    {
        // Out of data in this leaf: move on to the next one, if any
        if (pbCur >= pbEnd)
        {
            // The offset of the next node sits at the start of the buffer
            ReadFileOffset (&pLeafInfo->nodeOffset, pLeafInfo->pBuffer);
            if (FoIsNil (pLeafInfo->nodeOffset))
            {
                return S_OK;
            }

            if ((hr = ReadLeafNode ((PNODEINFO)pLeafInfo, cLevel)) != S_OK)
            {
                return SetErrCode (&errb, hr);
            }
            pbCur = pLeafInfo->pBuffer + FOFFSET_SIZE + sizeof (SHORT);
            pbEnd = pLeafInfo->pMaxAddress;
        }

        /* Check for interrupt now and then (each time the counter
         * wraps around to 0).
         */
        if ((++lpqt->cInterruptCount) == 0)
        {
            if (lpqt->fInterrupt == E_INTERRUPT)
                return E_INTERRUPT;
            if (*lpqt->cStruct.Callback.MessageFunc &&
                (hr = (*lpqt->cStruct.Callback.MessageFunc)(
                    lpqt->cStruct.Callback.dwFlags,
                    lpqt->cStruct.Callback.pUserData, NULL)) != S_OK)
                return(hr);
        }

        // Extract the word of the current entry
        pbCur = ExtractWord(astBTreeWord, pbCur, &wLen);

        if (fStemmed &&
            (hr = FStem (pBTreeWord, astBTreeWord)) != S_OK)
            return(hr);

        // The entry carries a field id only when the index stores them
        if (lpqt->occf & OCCF_FIELDID)
            pbCur += CbByteUnpack (&dwFieldID, pbCur);

        nCmp = CompareTerm (lpCurQtNode, lstModified, pBTreeWord,
            fCheckFieldId ? dwFieldID : lpCurQtNode->dwFieldId,
            lpRetV->pLeadByteTable);

        switch (nCmp)
        {
            case KEEP_SEARCHING:
                // Step over topic count, data offset and data size
                pbCur += CbByteUnpack (&dwSkip, pbCur);
                pbCur += FOFFSET_SIZE;
                pbCur += CbByteUnpack (&dwSkip, pbCur);
                break;

            case STRING_MATCH:
                pbCur += CbByteUnpack (&wi.cTopic, pbCur);
                pbCur += ReadFileOffset (&wi.foData, pbCur);
                pbCur += CbByteUnpack (&wi.cbData, pbCur);
                wi.wRealLength = wLen;  // BUGBUG doublecheck this

                // The topic count can be 0 if the word has been
                // deleted from the index
                if (wi.cTopic == 0)
                    break;

                // Long search optimization: clip noise words.
                // Johnms- eliminate frequent words; typically you
                // eliminate if in more than 1/7 of documents.
                if ((lpRetV->SrchInfo.Flag & LARGEQUERY_SEARCH) &&
                    lpRetV->SrchInfo.dwValue < wi.cTopic)
                {
                    break;
                }

                // Allocate a WORDINFO node and push it on the token's list
                if ((lpwi = BlockGetElement(lpqt->lpWordInfoBlock)) == NULL)
                    return E_OUTOFMEMORY;

                *lpwi = wi;
                lpwi->pNext = lpStrToken->lpwi;
                lpStrToken->lpwi = lpwi;

                // Save the scan position
                pLeafInfo->pCurPtr = pbCur;
                break;

            case NOT_FOUND:
                if (!fStemmed ||
                    strncmp (lstSearchStr+ 2, pBTreeWord + 2, cByteMatched) != 0)
                    return S_OK;

                // Continue searching in case stemming is messed up
                // by non-alphabetic word, such as the sequence:
                //    subtopic subtopic2 subtopics
                // Step over topic count, data offset and data size
                pbCur += CbByteUnpack (&dwSkip, pbCur);
                pbCur += FOFFSET_SIZE;
                pbCur += CbByteUnpack (&dwSkip, pbCur);
                break;

            default:
                // Any other verdict: go round again without consuming
                // the rest of the entry
                break;
        }
    }
}
/*************************************************************************
 *  @doc    INTERNAL
 *
 *  @func   HRESULT PASCAL FAR | FFlatCallBack |
 *      Callback invoked with a token. The normalized form of the token
 *      is zero terminated and then added to the query tree's string
 *      list through AllocWord().
 *
 *  @parm   LST | lstRawWord |
 *      Pointer to unnormalized string. Not used by this function.
 *
 *  @parm   LST | lstNormWord |
 *      Pointer to normalized string. One byte is written right after
 *      the string's characters, so the buffer must hold at least
 *      *lstNormWord + 3 bytes (length word, characters, terminator).
 *
 *  @parm   LFO | lfoWordOffset |
 *      Offset into the query buffer. Not used by this function.
 *
 *  @parm   LPQI | lpqi |
 *      Pointer to query info structure; its lpQueryTree receives the
 *      token.
 *
 *  @rdesc  S_OK if succeeded, E_OUTOFMEMORY if AllocWord() returns NULL.
 *************************************************************************/
PUBLIC HRESULT PASCAL FAR EXPORT_API FFlatCallBack (LST lstRawWord,
    LST lstNormWord, LFO lfoWordOffset, LPQI lpqi)
{
    LPW pcbWord = (LPW)(lstNormWord);

    /* The characters start at offset 2, so offset length + 2 is the
     * first byte past them. AllocWord() needs a 0 there for
     * WildCardCompare().
     */
    lstNormWord[*pcbWord + 2] = 0;

    // Add the token to the string list
    return (AllocWord(lpqi->lpQueryTree, lstNormWord) == NULL)
        ? E_OUTOFMEMORY
        : S_OK;
}
/*************************************************************************
 *  @doc    INTERNAL
 *
 *  @func   VOID PASCAL | SortStringWeights |
 *      Gives every token of the query tree's string list a weight,
 *      sorts the list by decreasing weight, and stores in each token
 *      the share of the total weight carried by the tokens that follow
 *      it.
 *
 *      The weight is 16383 + 49152 / dwTopicCount, so a smaller topic
 *      count gives a larger weight and the rarest terms end up first.
 *      Tokens made only of digits and at most four characters long
 *      have their weight divided by 256.
 *
 *      For now a simple insertion sort is used.
 *      BUGBUG: use heapsort or a faster method for long lists.
 *
 *  @parm   _LPQT | lpQueryTree |
 *      Query tree whose lpStrList is weighted and reordered in place.
 *      lpStrList is updated to the new head of the list.
 *************************************************************************/
PRIVATE VOID PASCAL SortStringWeights(_LPQT lpQueryTree)
{
    STRING_TOKEN FAR *pStr, *pStrNext, *pT, *pTPrev;
    STRING_TOKEN FAR *pStrHead = lpQueryTree->lpStrList;
    DWORD   dwSum, dwT;
    DWORD   dwTopics;               // topic count, never less than 1
    WORD    wWeightT;
    int     nCmp;
    FLOAT   rLog;
    FLOAT   rLogSquared;
    FLOAT   rSigma = (float)0.0;
    FLOAT   rTerm;
    BOOL    fNormalize = FALSE;     // Normalize was for testing only.

    if (fNormalize)
    {
        rSigma = (float)0.0;

        // for each term:
        for (pStr = pStrHead; pStr; pStr = pStr->pNext)
        {
            FLOAT fOcc;

            // We have to guard against the possibility of the log
            // resulting in a value <= 0.0. Very rare, but possible in
            // the future. This happens if dwTopicCount approaches or
            // exceeds the N we are using (N == 100 million)
            if (pStr->dwTopicCount >= cNintyFiveMillion)
                rLog = cVerySmallWt;    // log10(100 mil/ 95 mil) == 0.02
            else
            {
                // log10(0) is not finite; treat "no topics" as one topic
                dwTopics = pStr->dwTopicCount ? pStr->dwTopicCount : 1;
                rLog = (float) (8.0 - log10((double)dwTopics));
            }

            rLogSquared = rLog*rLog;

            // Update sigma value
            // NOTE : the occurrence count is bounded by cTFThreshold
            fOcc = (float) min(cTFThreshold, pStr->cUsed);
            rSigma += fOcc*fOcc*rLogSquared;
        }

        rSigma = (float)sqrt(rSigma);
    }

    // calculate final weights and corrections
    dwSum = 0L;
    for (pStr = pStrHead; pStr; pStr = pStr->pNext)
    {
        BOOL fNumber;

        // A topic count of 0 would make the divisions below divide by
        // zero. Such a term is weighted like a term found in one topic.
        dwTopics = pStr->dwTopicCount ? pStr->dwTopicCount : 1;

        // once sigma is known, each term's proper weight can be calculated
        if (fNormalize)
        {
            FLOAT rWeight;

            // log10(x/y) == log10 (x) - log10 (y). Since x in our case
            // is a known constant, 100,000,000, it is replaced by its
            // log10 value of 8.0 and log10(y) is subtracted from it.
            rTerm = (float) (8.0 - log10((double) dwTopics));

            // In extreme cases, rTerm could be 0 or even -ve (when
            // dwTopicCount approaches or exceeds 100,000,000)
            if (rTerm <= (float) 0.0)
                rTerm = cVerySmallWt;   // very small value. == log(100 mil/ 95 mil)

            // NOTE : rWeight for the doc term would be as follows:
            //   rWeight = float(min(4096, dwBlockSize)) * rTerm / lpipb->wi.hrgsigma[dwTopicId]
            //
            // Since rTerm needs to be recomputed again for the query
            // term weight computation, and since rTerm will be the
            // same value for the current term ('cos N and n of
            // log(N/n) are the same), the second rTerm is factored in
            // at index time. This way rTerm does not have to be dealt
            // with at search time.
            //
            // MV 2.0 initially did the same thing. However, BinhN
            // removed the second rTerm because he decided to remove
            // the rTerm altogether from the query term weight. He did
            // that to keep the scores reasonably high.
            rWeight = ((float) min(cTFThreshold, pStr->cUsed)) * rTerm * rTerm / rSigma;

            // without the additional rTerm, we would probably be
            // between 0.0 and 1.0
            if (rWeight > rTerm)
                wWeightT = 0xFFFF;
            else
                wWeightT = (WORD) ((float)0xFFFF * rWeight / rTerm);
        }
        else
            wWeightT = 65535;

        // NOTE: wWeightT is not consumed at present; it was meant for
        //   pStr->wTermWeight = (WORD)(pStr->wWeight * wWeightT / 65535L);

        pStr->wWeight = (WORD)(16383 + 49152 / dwTopics);

        // perform any special weight adjustments here
        // BUGBUG: use NextChar here, and use charmap here

        // Numbers four digits or less get downgraded. The scan stops
        // at the first non-digit, or at once when the token is longer
        // than four characters (index > 5).
        // NOTE(review): a zero-length token never enters the loop and
        // is therefore downgraded as well.
        fNumber = TRUE;
        for (nCmp = *((LPWORD)pStr->lpString) + 1; nCmp >= 2; nCmp--)
        {
            if (nCmp > 5 || !IS_DIGIT(pStr->lpString[nCmp]))
            {
                fNumber = FALSE;
                break;
            }
        }

        if (fNumber)
            pStr->wWeight = pStr->wWeight / 256;

        dwSum += pStr->wWeight;
    }

    // Now sort 'em. Everything from pStrHead up to and including pStr
    // is already in decreasing order.
    for (pStr = pStrHead; pStr;)
    {
        if (NULL == (pStrNext = pStr->pNext))
            break;

        // The next element is already in place: extend the sorted section
        if (pStrNext->wWeight <= pStr->wWeight)
        {
            pStr = pStr->pNext;
            continue;
        }

        // Otherwise unlink it and put it in front of the first sorted
        // element that does not outweigh it. pStr itself qualifies, so
        // the search always ends inside the sorted section. pStr stays
        // where it is and gets a new successor to examine.
        for (pT = pStrHead, pTPrev = NULL; pT; pTPrev = pT, pT = pT->pNext)
        {
            if (pT->wWeight <= pStrNext->wWeight)
            {
                pStr->pNext = pStrNext->pNext;
                pStrNext->pNext = pT;

                if (pTPrev)
                    pTPrev->pNext = pStrNext;
                else
                    pStrHead = pStrNext;

                break;
            }
        }
    }

    // For each term, record how much of the total weight is left in
    // the terms after it, scaled to 0..65535 and passed through
    // AddWeights(). When nothing is left the value is 1.
    dwT = 0;
    for (pStr = pStrHead; pStr; pStr = pStr->pNext)
    {
        dwT += pStr->wWeight;
        if (dwSum > dwT)
            pStr->wWeightRemain = AddWeights(0, (WORD)((dwSum - dwT) * 65535.0 / dwSum));
        else
            pStr->wWeightRemain = 1;
    }

    lpQueryTree->lpStrList = pStrHead;
}
|