Source code of Windows XP (NT5)
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

1570 lines
47 KiB

  1. //#define _DUMPALL
  2. /*************************************************************************
  3. * *
  4. * SIMILAR.C *
  5. * *
  6. * Copyright (C) Microsoft Corporation 1990-1996 *
  7. * All Rights reserved. *
  8. * *
  9. **************************************************************************
  10. * *
  11. * Module Intent: *
  12. * *
  13. * Search Core Engine: Find Similar functionality *
  14. * *
  15. **************************************************************************
  16. *
  17. * Revision History:
  18. *
  19. * 09/24/96 kevynct Started from algorithm notes (4 hrs)
  20. * 09/25/96 kevynct Implemented skeleton of ProcessSimilarityTerm (1 hr)
  21. * 09/26/96 kevynct More work on inner loop and relevant list (5 hrs)
  22. * 09/27/96 kevynct Query parsing, weighting, and sorting (6 hrs)
  23. * 10/01/96 kevynct Incorporate into MV2.0b (10 min)
  24. * 10/02/96 kevynct Clean-up query code, start resolve query code (4 hrs)
  25. * 10/03/96 kevynct Resolve query code (2 hrs)
  26. * 10/11/96 kevynct Start bucket routines (2 hrs)
  27. * 10/13/96 kevynct Finish bucket routines, write node processor, cleanup (6 hrs)
  28. * 10/14/96 kevynct Clean-up, remove compilation errors, debugging (6 hrs)
  29. * 10/24/96 kevynct Convert to two-phase query resolution (3 hrs)
  30. * 10/25/96 kevynct Fix sort by cTopics, debug new query resolution, try new weighting (2 hrs)
  31. * 11/26/96 kevynct Testing, fix and improve weighting and accumulation: aliases, digits (8 hrs)
  32. * 12/2/96 kevynct More weighting tests (8 hrs)
  33. * Work remaining:
  34. *
  35. * Investigate field and stemming support
  36. *
  37. * Use probabilistic upperbounds for pruning. Remove single-term nodes after each term process
  38. * Test current bucket method vs. exact scores w/ heap
  39. *
  40. **************************************************************************
  41. *
  42. * Current Owner: KevynCT
  43. *
  44. **************************************************************************/
  45. #include <mvopsys.h>
  46. #include <mem.h>
  47. #include <memory.h>
  48. #include <orkin.h>
  49. #include <mvsearch.h>
  50. #include <math.h>
  51. #include <groups.h>
  52. #include "common.h"
  53. #include "search.h"
  54. #ifdef _DEBUG
  55. static BYTE NEAR s_aszModule[] = __FILE__; // Used by error return functions.
  56. #endif
  57. #define FGetDword(a,b,c) (*DecodeTable[b.cschScheme])(a, b, c)
  58. #define IS_DIGIT(p) ((p) >= '0' && (p) <= '9')
  59. // these are in case the doc scoring is approximate: they tell which
  60. // direction to err on the side of.
  61. #define ROUND_DOWN 0
  62. #define ROUND_UP 1
  63. #define SCORE_BLOCK_SIZE 32
  64. #define NUM_SCORE_BLOCKS (MAX_WEIGHT/SCORE_BLOCK_SIZE)
// Bucketed document-score list used to track the top-N topic scores without
// a heap: scores are grouped into NUM_SCORE_BLOCKS buckets of
// SCORE_BLOCK_SIZE points each, so the current cut-off score is only known
// to bucket granularity (callers round up or down via ROUND_UP/ROUND_DOWN).
typedef struct tagDocScoreList {
HANDLE hMem;                          // memory handle; not referenced in this file's visible code — presumably owned by the allocator, TODO confirm
int cScoresLeft;                      // slots remaining before the list counts as "full"
int iBucketLowest;                    // index of lowest occupied bucket, -1 while list is empty
int iHighestScore;                    // highest exact score recorded so far
int rgiScores[NUM_SCORE_BLOCKS + 1];  // per-bucket count of topics whose score falls in that bucket
} DSL, FAR *_LPDSL;
  72. PUBLIC HRESULT PASCAL FAR SkipOccList(_LPQT lpqt, PNODEINFO pNodeInfo, DWORD dwOccs); // ftsearch.c
  73. PUBLIC int PASCAL FAR CompareTerm(_LPQTNODE lpQtNode,
  74. LST lstTermWord, LST lstBtreeWord, DWORD dwBtreeFieldId, char []); // ftsearch.c
  75. PUBLIC STRING_TOKEN FAR *PASCAL AllocWord(_LPQT lpQueryTree, LST lstWord); // qtparse.c
  76. __inline LPVOID InitDocScoreList(int cScores);
  77. __inline void FreeDocScoreList(LPV lpDocScores);
  78. __inline int GetMaxDocScore(_LPDSL lpDocScores);
  79. __inline int GetMinDocScore(_LPDSL lpDocScores, BOOL fRoundUp);
  80. BOOL UpdateDocScoreList(_LPDSL lpDocScores, int iOldScore, int i);
  81. __inline BOOL IsDocScoreListFull(_LPDSL lpdsl);
  82. __inline WORD AddWeights(DWORD w1, DWORD w2);
  83. int GetSortedDocScore(_LPDSL lpDocScores, int iThis, BOOL fRoundUp);
  84. #if defined(_DEBUG)
  85. BOOL DumpDocScoreList(_LPDSL lpdsl, PSRCHINFO pSrchInfo);
  86. #endif
  87. __inline void MergeWordInfoCounts(WORDINFO FAR *lpwiDest, WORDINFO FAR *lpwiSrc);
  88. PRIVATE LPQT TokenizeFlatQuery(LPPARSE_PARMS lpParms, PSRCHINFO pSrchInfo, PHRESULT phr);
  89. PRIVATE HRESULT PASCAL NEAR ResolveFlatQuery(_LPQT lpqt, _LPQTNODE lpCurQtNode, LPRETV lpRetV);
  90. PRIVATE HRESULT GetWordInfoList(_LPQT lpqt, STRING_TOKEN FAR *lpStrToken, _LPQTNODE lpCurQtNode, LPRETV lpRetV);
  91. PRIVATE VOID PASCAL SortStringWeights(_LPQT lpQueryTree);
  92. PRIVATE VOID PASCAL SetStringWeights (LPQI lpQueryInfo);
  93. PUBLIC HRESULT PASCAL FAR EXPORT_API FFlatCallBack (LST lstRawWord, LST lstNormWord,
  94. LFO lfoWordOffset, LPQI lpqi);
  95. __inline LPVOID InitDocScoreList(int cScores)
  96. {
  97. _LPDSL lpdsl;
  98. if ((lpdsl = (_LPDSL)GlobalLockedStructMemAlloc(sizeof(DSL))) == NULL)
  99. return NULL;
  100. lpdsl->cScoresLeft = cScores;
  101. lpdsl->iHighestScore = 0;
  102. lpdsl->iBucketLowest = -1;
  103. return (LPV)lpdsl;
  104. }
  105. __inline void FreeDocScoreList(LPV lpDocScores)
  106. {
  107. if ((_LPDSL)lpDocScores)
  108. GlobalLockedStructMemFree((_LPDSL)lpDocScores);
  109. }
  110. __inline int GetMaxDocScore(_LPDSL lpDocScores)
  111. {
  112. return lpDocScores->iHighestScore;
  113. }
  114. __inline int GetMinDocScore(_LPDSL lpDocScores, BOOL fRoundUp)
  115. {
  116. if (lpDocScores->iBucketLowest >= 0)
  117. return (lpDocScores->iBucketLowest + !!fRoundUp) * SCORE_BLOCK_SIZE;
  118. return 0;
  119. }
  120. int GetSortedDocScore(_LPDSL lpdsl, int cThis, BOOL fRoundUp)
  121. {
  122. LPINT lpi, lpiFirst;
  123. if (lpdsl->iHighestScore < 0)
  124. return 0;
  125. lpiFirst= &lpdsl->rgiScores[0];
  126. for (lpi = &lpdsl->rgiScores[lpdsl->iHighestScore/SCORE_BLOCK_SIZE];
  127. lpi >= lpiFirst; cThis -= *lpi, lpi--)
  128. {
  129. if (cThis <= *lpi)
  130. return ((lpi - lpiFirst) + !!fRoundUp) * SCORE_BLOCK_SIZE;
  131. }
  132. return (!!fRoundUp * SCORE_BLOCK_SIZE);
  133. }
  134. #if defined(_DEBUG)
  135. BOOL DumpDocScoreList(_LPDSL lpdsl, PSRCHINFO pSrchInfo)
  136. {
  137. LPINT lpi, lpiMax;
  138. int iT = 0;
  139. int i;
  140. lpi = &lpdsl->rgiScores[0];
  141. lpiMax = lpi + NUM_SCORE_BLOCKS;
  142. for (i = 0;lpi < lpiMax;lpi++, i++)
  143. {
  144. if (*lpi)
  145. {
  146. _DPF2("Score %d (count %d)\n", i, *lpi);
  147. }
  148. iT += *lpi;
  149. }
  150. _DPF1("%d topics in scorelist\n", iT);
  151. return TRUE;
  152. }
  153. #endif
/* Record a document's (new) score in the bucketed top-N list, replacing its
 * previous score if it was already present.
 *
 * Parameters:
 *   iOldScore - the document's previous score, or negative (caller passes -1)
 *               when the document is new to the list.
 *   iScore    - the document's new score.
 * Returns TRUE if the score was admitted into the top-N set, FALSE if the
 * list was full and the score fell below the lowest occupied bucket.
 *
 * CONTROL-FLOW WARNING: `goto add_new_doc` at the bottom jumps INTO the
 * middle of the `if` body above (legal C, but fragile) so the not-yet-full
 * path shares the bucket-admission and highest-score bookkeeping with the
 * full path.  Preserve label placement when editing.
 */
BOOL UpdateDocScoreList(_LPDSL lpdsl, int iOldScore, int iScore)
{
    int iThis = iScore/SCORE_BLOCK_SIZE;    /* destination bucket */
    int iOld = iOldScore/SCORE_BLOCK_SIZE;  /* previous bucket (negative if new) */

    if (lpdsl->cScoresLeft <= 0)
    {
        // already full, figure out which buckets need updating
        if (iThis > lpdsl->iBucketLowest)
        {
            // if we're updating an existing entry, remove that
            // otherwise remove the lowest one
            if (iOld >= lpdsl->iBucketLowest)
                lpdsl->rgiScores[iOld]--;
            else
                lpdsl->rgiScores[lpdsl->iBucketLowest]--;
            // then make sure lowest one is still non-empty; if not,
            // revise upwards
            if (lpdsl->rgiScores[lpdsl->iBucketLowest] <= 0)
            {
                /* scan upward for the next occupied bucket, stopping at iThis */
                for (lpdsl->iBucketLowest++; lpdsl->iBucketLowest <= iThis; lpdsl->iBucketLowest++)
                    if (lpdsl->rgiScores[lpdsl->iBucketLowest])
                        break;
add_new_doc:    /* also entered from the not-yet-full path at the bottom */
                if (lpdsl->iBucketLowest >= 0)
                    lpdsl->iBucketLowest = min(lpdsl->iBucketLowest, iThis);
                else
                    lpdsl->iBucketLowest = iThis;
            }
            // then add the new entry
            lpdsl->rgiScores[iThis]++;
update_highest_score:
            if (iScore > lpdsl->iHighestScore)
                lpdsl->iHighestScore = iScore;
#if defined(_DEBUG) && defined(_DUMPALL)
            //DumpDocScoreList(lpdsl, NULL);
#endif
            Assert(lpdsl->rgiScores[lpdsl->iHighestScore/SCORE_BLOCK_SIZE] >= 0);
            return TRUE;
        }
        else
            /* same bucket as the current lowest: counts as in, but only the
             * exact highest-score tracking may need a refresh */
            if (iThis == lpdsl->iBucketLowest)
                goto update_highest_score;
        Assert(lpdsl->rgiScores[lpdsl->iHighestScore/SCORE_BLOCK_SIZE] >= 0);
        return FALSE;
    }
    // doc score list is not yet full, so automatically add if new,
    // remove old if update
    if (iOld >= lpdsl->iBucketLowest)
        lpdsl->rgiScores[iOld]--;
    else
        lpdsl->cScoresLeft--;   /* a genuinely new document consumes a slot */
    goto add_new_doc;
}
  207. __inline BOOL IsDocScoreListFull(_LPDSL lpdsl)
  208. {
  209. return (lpdsl->cScoresLeft <= 0);
  210. }
  211. __inline WORD AddWeights(DWORD w1, DWORD w2)
  212. {
  213. return (WORD)min(MAX_WEIGHT, w1 + w2);
  214. }
  215. /*************************************************************************
  216. * @doc EXTERNAL API RETRIEVAL
  217. *
  218. * @func LPHL FAR PASCAL | MVIndexFindSimilar |
  219. * Given a query which probably represents a document text stream, returns
  220. * a hitlist containing topics which are determined to be similar to the query
  221. * using nearest-neighbour searching.
  222. *
  223. * @parm LPIDX | lpidx |
  224. * Pointer to index information.
  225. *
  226. * @parm LPQT | lpqt |
  227. * Pointer to query tree (returned by MVQueryParse())
  228. *
  229. * @parm PSRCHINFO | pSrchInfo |
  230. * Pointer to search information data
  231. *
  232. * @parm _LPGROUP | lpResGroup |
  233. * Pointer to resulting group
  234. *
  235. * @parm LPVOID | pCallback |
  236. * Pointer to callback struct FCALLBACK_MSG (optional)
  237. *
  238. * @parm PHRESULT | phr |
  239. * Pointer to error buffer
  240. *
  241. * @rdesc Pointer to hitlist structure if succeeded, even there is
  242. * no hits (use MVHitListEntries() to find out how many hits have been
  243. * returned). It will return NULL if failed. The error buffer
  244. * (see IndexOpen()) will contain descriptions about the cause of
  245. * the failure. There is one special case when the function returns
  246. * a non-null pointer, even there is error, that is when it can't
  247. * write the result to the disk, and everything is still in memory.
  248. *
  249. *************************************************************************/
  250. // bugbug: handle wildcards
PUBLIC LPHL EXPORT_API FAR PASCAL MVIndexFindSimilar (_LPIDX lpidx,
    LPPARSE_PARMS lpParms, PSRCHINFO pSrchInfo, _LPGROUP lpResGroup,
    LPVOID pCallback, PHRESULT phr)
{
    /* Cleanup is a goto chain: exit03 frees the dummy tree-top node,
     * exit02 frees the retrieval struct, exit00 frees the query tree and
     * (on failure) the hitlist.  Later labels fall through to earlier ones,
     * so resources must keep being acquired in this order. */
    HRESULT fRet;           // Return from this function.
    LPRETV lpRetV;          // Retrieval memory/files.
    GHANDLE hRetv;
    //OCCF occf;            // Index occurence flags temporary variable.
    _LPHL lphl;             // Pointer to hitlist
    _LPQTNODE lpTreeTop;
    HANDLE hTreeTop = NULL;
    _LPQT lpqt;

    if (lpidx == NULL || lpParms == NULL || pSrchInfo == NULL)
    {
        /* We get some bad arguments!! */
        SetErrCode (phr, E_INVALIDARG);
        return NULL;
    }

    /* Break the query text into weighted term tokens (no operators). */
    if (NULL == (lpqt = TokenizeFlatQuery(lpParms, pSrchInfo, phr)))
    {
        // errb was set
        return NULL;
    }

    fRet = E_FAIL; // Assume thing will go wrong

    // Transfer all the information about the index to the query tree
    lpqt->foIdxRoot = lpidx->ih.foIdxRoot;     /* Top node offset */
    lpqt->dwBlockSize = lpidx->ih.dwBlockSize; /* Index block size */
    lpqt->cIdxLevels = lpidx->ih.cIdxLevels;   /* Index's depth */
    lpqt->occf = lpidx->ih.occf;
    lpqt->idxf = lpidx->ih.idxf;
    lpqt->foIdxRoot = lpidx->ih.foIdxRoot;
    lpqt->ckeyTopicId = lpidx->ih.ckeyTopicId;
    lpqt->ckeyOccCount = lpidx->ih.ckeyOccCount;
    lpqt->ckeyWordCount = lpidx->ih.ckeyWordCount;
    lpqt->ckeyOffset = lpidx->ih.ckeyOffset;

    if (pSrchInfo->dwMemAllowed)
    {
        // allocate document result list
        // no occurrence info is returned for similarity query
        SetBlockCount (lpqt->lpTopicMemBlock, (WORD)(pSrchInfo->dwMemAllowed /
            (sizeof(TOPIC_LIST) * cTOPIC_PER_BLOCK)));
        SetBlockCount (lpqt->lpOccMemBlock, 1);
    }
    if (pCallback)
        MVSearchSetCallback(lpqt, pCallback);

    /* Allocate hitlist */
    if ((lphl = (_LPHL)GlobalLockedStructMemAlloc(sizeof (HL))) == NULL)
    {
        fRet = E_OUTOFMEMORY;
        SetErrCode(phr, fRet);
exit00: /* common tail: free query tree; drop the hitlist only on failure */
        if (lpqt)
        {
            FreeDocScoreList(lpqt->lpDocScores);
            MVQueryFree(lpqt);
        }
        if (lphl && fRet != S_OK && fRet != E_TOOMANYTOPICS)
        {
            MVHitListDispose(lphl);
            lphl = NULL;
        }
        return (LPHL)lphl;
    }
    lphl->lLastTopicId = 0xffffffff;
    lphl->lcMaxTopic = lpidx->ih.lcTopics;

    /* Allocate a return value structure */
    if ((hRetv = _GLOBALALLOC(GMEM_MOVEABLE | GMEM_ZEROINIT,
        sizeof(RETV))) == NULL)
    {
        SetErrCode(phr, E_OUTOFMEMORY);
        goto exit00;
    }
    lpRetV = (LPRETV)_GLOBALLOCK(hRetv);
    lpRetV->lpqt = lpqt;

    if ((fRet = TopNodeRead(lpidx)) != S_OK)
    {
        SetErrCode (phr, fRet);
exit02: /* free retrieval struct, then fall into exit00 */
        FreeHandle(hRetv);
        goto exit00;
    }

    //
    // Count the number of occurence fields present. My retrieval
    // occurence record is going to cost 4 bytes per field.
    //
    //occf = lpqt->occf;
    //for (lpRetV->cOccFields = 0; occf; lpRetV->cOccFields++)
    // occf &= occf - 1;
    lpqt->dwOccSize = lpRetV->dwOccSize = 0;
    //sizeof(OCCURENCE) + lpRetV->cOccFields * sizeof (DWORD);
    lpRetV->fRank = TRUE; //((pSrchInfo->Flag &
    //(QUERYRESULT_RANK | QUERYRESULT_NORMALIZE)) != 0);

    // Set pointer to various buffer
    lpRetV->LeafInfo.pTopNode = lpidx->lrgbTopNode;
    lpRetV->LeafInfo.pStemNode = lpRetV->pNodeBuf;
    lpRetV->LeafInfo.pLeafNode = lpRetV->pNodeBuf;
    lpRetV->LeafInfo.pDataNode = lpRetV->pDataBuf;
    lpRetV->LeafInfo.hfpbIdx = lpidx->hfpbIdxSubFile; // Index file to read from
    lpRetV->DataInfo.pTopNode = lpidx->lrgbTopNode;
    lpRetV->DataInfo.pStemNode = lpRetV->pNodeBuf;
    lpRetV->DataInfo.pLeafNode = lpRetV->pNodeBuf;
    lpRetV->DataInfo.pDataNode = lpRetV->pDataBuf;
    lpRetV->DataInfo.hfpbIdx = lpidx->hfpbIdxSubFile; // Index file to read from
    lpRetV->lcid = lpidx->ih.lcid;

    // Save search information
    lpRetV->SrchInfo = *pSrchInfo;
    /* dwValue == 0 means "no threshold"; otherwise it becomes a topic-count
     * ratio — assumed to be a rarity cut-off, TODO confirm against callers */
    if (pSrchInfo->dwValue == 0)
        lpRetV->SrchInfo.dwValue = (DWORD)(-1);
    else
        lpRetV->SrchInfo.dwValue = lpidx->ih.lcTopics/pSrchInfo->dwValue;

    // this is a dummy node that we pass in to hold all term results
    if ((lpTreeTop = (_LPQTNODE)_GLOBALLOCK( \
        hTreeTop = _GLOBALALLOC(GHND, sizeof (QTNODE)))) == NULL)
    {
        SetErrCode(phr, fRet = E_OUTOFMEMORY);
        goto exit02;
    }
    QTN_FLAG(lpTreeTop) = EXACT_MATCH;
    lpTreeTop->pNext = NULL;
    lpTreeTop->pPrev = NULL;
    lpTreeTop->lpTopicList = NULL;

    /* Run the actual search: accumulates scored topics under lpTreeTop. */
    if ( (fRet = ResolveFlatQuery(lpqt, lpTreeTop, lpRetV)) != S_OK)
    {
        SetErrCode (phr, fRet);
        /* Free the Topic and Occurrence memory blocks since they are
         * not freed by QueryTreeFree(), or MVHitListDispose() at this
         * point
         */
        if (fRet != E_TOOMANYTOPICS)
        {
            BlockFree ((LPV)lpqt->lpTopicMemBlock);
            BlockFree ((LPV)lpqt->lpOccMemBlock);
            lpqt->lpTopicMemBlock = NULL;
            lpqt->lpOccMemBlock = NULL;
exit03:     /* free the dummy tree-top node, then fall into exit02 */
            if (hTreeTop)
            {
                _GLOBALUNLOCK(hTreeTop);
                _GLOBALFREE(hTreeTop);
            }
            goto exit02;
        }
    }

    /* Create a group if requested */
    if ((pSrchInfo->Flag & QUERYRESULT_GROUPCREATE) && lpResGroup)
    {
        LPITOPIC lpCurTopic; /* Topic's current pointer */
        LPB lpbGrpBitVect;
        DWORD maxTopicId;
        /* Initialize the pointer */
        lpbGrpBitVect = lpResGroup->lpbGrpBitVect;
        maxTopicId = lpResGroup->dwSize * 8;
        for (lpCurTopic = QTN_TOPICLIST(lpTreeTop); lpCurTopic;
            lpCurTopic = lpCurTopic->pNext)
        {
            /* Set the bit */
            if (lpCurTopic->dwTopicId < maxTopicId)
            {
                lpbGrpBitVect[(DWORD)(lpCurTopic->dwTopicId / 8)] |= 1 <<
                    (lpCurTopic->dwTopicId % 8);
            }
        }
    }

    if ((pSrchInfo->Flag & QUERYRESULT_UIDSORT) == 0)
    {
        /* Sort the result depending on ranking or not */
        if (lpRetV->fRank)
            SortResult ((LPQT)lpqt, lpTreeTop, WEIGHT_BASED);
        else
            SortResult ((LPQT)lpqt, lpTreeTop, HIT_COUNT_BASED);
    }

    /* Update HitList info structure, cut off the unwanted list */
    if (lphl->lpTopicList = lpTreeTop->lpTopicList)
        lphl->lcReturnedTopics = lphl->lcTotalNumOfTopics = lpTreeTop->cTopic;

    // Only return the number of topics that the user requested
    // if dwTopicCount == 0, it means that the user wants to return all
    if (pSrchInfo->dwTopicCount != 0 &&
        pSrchInfo->dwTopicCount < lphl->lcReturnedTopics)
        lphl->lcReturnedTopics = pSrchInfo->dwTopicCount;
    lphl->lpOccMemBlock = lpqt->lpOccMemBlock;
    lphl->lpTopicMemBlock = lpqt->lpTopicMemBlock;
#if 1
    /* WARNING: The following code should be commented out for
     * diskless devices. No returned error is checked, since
     * if disk writes fail, everything is still in memory
     */
    if ((pSrchInfo->Flag & QUERYRESULT_IN_MEM) == 0)
    {
        if ((fRet = MVHitListFlush (lphl, lphl->lcReturnedTopics)) != S_OK)
            SetErrCode (phr, fRet);
    }
#endif
    fRet = S_OK;
    goto exit03;
}
  446. PRIVATE LPQT TokenizeFlatQuery(LPPARSE_PARMS lpParms, PSRCHINFO pSrchInfo, PHRESULT phr)
  447. {
  448. HRESULT fRet; // Return value.
  449. HANDLE hqi; // Handle to "lpqi".
  450. HANDLE hibi; // Handle to internal breaker info
  451. HANDLE hQuery; // Handle to secondary query buffer
  452. LPQI lpQueryInfo; // Query information.
  453. LPIBI lpibi; // Pointer to internal breaker info
  454. LPB lpbQueryBuf; // Copy of query's buffer
  455. _LPQT lpQueryTree; // Query tree pointer
  456. BRK_PARMS brkParms; // Breaker info parms
  457. LPCHARTAB lpCharTabInfo;// Pointer to character table's info
  458. /* LPPARSE_PARMS structure break-out variables */
  459. BYTE FAR CONST *lpbQuery; // Query buffer
  460. DWORD cbQuery; // Query length
  461. LPBRKLIST lpfnTable; // DType function table
  462. LPGROUP lpGroup; // Group
  463. lpbQuery = lpParms->lpbQuery;
  464. cbQuery = lpParms->cbQuery;
  465. lpfnTable = lpParms->lpfnTable;
  466. lpGroup = lpParms->lpGroup;
  467. if (lpfnTable == NULL)
  468. {
  469. SetErrCode(phr, E_BADBREAKER);
  470. return NULL;
  471. }
  472. if (cbQuery == 0 || lpbQuery == NULL) {
  473. SetErrCode(phr, E_NULLQUERY);
  474. return NULL;
  475. }
  476. lpQueryTree = NULL;
  477. hqi = hibi = hQuery = NULL;
  478. fRet = E_FAIL;
  479. if ((hqi = (GHANDLE)_GLOBALALLOC(GMEM_MOVEABLE | GMEM_ZEROINIT,
  480. (LCB)sizeof(QUERY_INFO))) == NULL)
  481. {
  482. fRet = SetErrCode(phr, E_OUTOFMEMORY);
  483. goto ErrFreeAll;
  484. }
  485. lpQueryInfo = (LPQI)_GLOBALLOCK(hqi);
  486. lpQueryInfo->lperrb = phr;
  487. lpQueryInfo->lpOpSymTab = NULL; // not used for similarity
  488. lpQueryInfo->cOpEntry = 0;
  489. /* Allocate a breaker info block used by different breakers */
  490. if ((hibi = (GHANDLE)_GLOBALALLOC(GMEM_MOVEABLE | GMEM_ZEROINIT,
  491. (LCB)sizeof(IBI))) == NULL)
  492. {
  493. fRet = SetErrCode(phr, E_OUTOFMEMORY);
  494. goto ErrFreeAll;
  495. }
  496. lpibi = (LPBRKI)_GLOBALLOCK(hibi);
  497. /* Set the default breaker function, and stop list */
  498. #ifndef CW
  499. lpQueryInfo->lpfnBreakFunc = lpfnTable[0].lpfnBreakFunc;
  500. #endif
  501. lpQueryInfo->lpStopListInfo = lpfnTable[0].lpStopListInfo;
  502. if ((lpCharTabInfo = lpQueryInfo->lpCharTab =
  503. lpfnTable[0].lpCharTab) == NULL)
  504. {
  505. /* Default character and ligature tables */
  506. lpCharTabInfo = lpQueryInfo->lpCharTab = MVCharTableGetDefault (phr);
  507. if (lpCharTabInfo == NULL)
  508. {
  509. fRet = SetErrCode(phr, E_NOHANDLE);
  510. goto ErrFreeAll;
  511. }
  512. lpQueryInfo->fFlag |= FREE_CHARTAB;
  513. }
  514. /* Change the property of '*' and '?' to character */
  515. ((LPCMAP)lpCharTabInfo->lpCMapTab)['*'].Class = CLASS_WILDCARD;
  516. ((LPCMAP)lpCharTabInfo->lpCMapTab)['?'].Class = CLASS_WILDCARD;
  517. switch (lpCharTabInfo->fFlag)
  518. {
  519. case USE_DEF_LIGATURE:
  520. lpCharTabInfo->wcLigature = DEF_LIGATURE_COUNT;
  521. lpCharTabInfo->lpLigature = LigatureTable;
  522. break;
  523. case NO_LIGATURE:
  524. lpCharTabInfo->wcLigature = 0;
  525. lpCharTabInfo->lpLigature = NULL;
  526. }
  527. // not used for similarity
  528. lpQueryInfo->lpStack = NULL;
  529. /* Allocate a query tree */
  530. if ((lpQueryTree = (_LPQT)QueryTreeAlloc()) == NULL)
  531. {
  532. fRet = SetErrCode(phr, E_OUTOFMEMORY);
  533. goto ErrFreeAll;
  534. }
  535. /* Associate the query tree with the query. In the future, this will
  536. * ensure the capability to have several queries and query trees
  537. * at once
  538. */
  539. lpQueryInfo->lpQueryTree = (LPQT)lpQueryTree;
  540. /* Default arguments */
  541. lpQueryTree->iDefaultOp = (BYTE)OR_OP;
  542. lpQueryTree->lpGroup = lpGroup; // Use default Group
  543. lpQueryTree->dwFieldId = 0;//DW_NIL_FIELD; // No fieldid search
  544. lpQueryTree->cStruct.dwKey = CALLBACKKEY;
  545. lpQueryTree->fFlag = 0;
  546. lpQueryTree->wProxDist = 0;
  547. if (NULL == (lpQueryTree->lpDocScores = InitDocScoreList(pSrchInfo->dwTopicCount)))
  548. {
  549. fRet = SetErrCode(phr, E_OUTOFMEMORY);
  550. goto ErrFreeAll;
  551. }
  552. /* Copy the query into a temporary buffer since we are going to make
  553. change to it
  554. */
  555. if ((hQuery = _GLOBALALLOC(DLLGMEM_ZEROINIT, (LCB)cbQuery + 2)) == NULL)
  556. {
  557. SetErrCode(phr, E_OUTOFMEMORY);
  558. FreeHandle(hqi);
  559. return NULL;
  560. }
  561. lpbQueryBuf = lpQueryInfo->lpbQuery = (LPB)_GLOBALLOCK(hQuery);
  562. lpbQueryBuf[cbQuery] = ' '; // Add a space to help LowLeveltransformation
  563. lpbQueryBuf[cbQuery + 1] = 0; // Zero-terminated string (safety bytes)
  564. MEMCPY(lpbQueryBuf, lpbQuery, cbQuery);
  565. //
  566. // Word-break between here and there.
  567. //
  568. brkParms.lpInternalBreakInfo = lpibi;
  569. brkParms.lpbBuf = lpbQueryBuf;
  570. brkParms.cbBufCount = cbQuery;
  571. brkParms.lcbBufOffset = 0;
  572. brkParms.lpvUser = lpQueryInfo;
  573. brkParms.lpfnOutWord = (FWORDCB)FFlatCallBack;
  574. brkParms.lpStopInfoBlock = lpQueryInfo->lpStopListInfo;
  575. brkParms.lpCharTab = lpQueryInfo->lpCharTab;
  576. brkParms.fFlags = ACCEPT_WILDCARD;
  577. if ((fRet = (*lpQueryInfo->lpfnBreakFunc)((LPBRK_PARMS)&brkParms))
  578. != S_OK)
  579. {
  580. fRet = SetErrCode(phr, (WORD)fRet);
  581. goto ErrFreeAll;
  582. }
  583. /* Flush the word breaker */
  584. brkParms.lpbBuf = NULL;
  585. brkParms.cbBufCount = 0;
  586. if ((fRet = (*lpQueryInfo->lpfnBreakFunc)((LPBRK_PARMS)&brkParms))
  587. != S_OK)
  588. {
  589. fRet = SetErrCode(phr, fRet);
  590. goto ErrFreeAll;
  591. }
  592. /* Set the position of pointer to report missing term at
  593. the end of the query. -1 since the offset starts at 0
  594. */
  595. lpQueryInfo->dwOffset = cbQuery - 1;
  596. fRet = S_OK;
  597. ErrFreeAll:
  598. /* Free the charmap table */
  599. if (lpQueryInfo->fFlag & FREE_CHARTAB)
  600. MVCharTableDispose (lpQueryInfo->lpCharTab);
  601. /* Free query info */
  602. if (hqi)
  603. {
  604. FreeHandle(hqi);
  605. };
  606. /* Free internal breaker info */
  607. if (hibi)
  608. {
  609. FreeHandle(hibi);
  610. };
  611. /* Free internal query buffer info */
  612. if (hQuery)
  613. {
  614. FreeHandle(hQuery);
  615. };
  616. if (fRet == S_OK)
  617. return lpQueryTree;
  618. if (lpQueryTree)
  619. {
  620. BlockFree(lpQueryTree->lpStringBlock);
  621. BlockFree(lpQueryTree->lpWordInfoBlock);
  622. BlockFree(lpQueryTree->lpOccMemBlock);
  623. BlockFree(lpQueryTree->lpTopicMemBlock);
  624. BlockFree(lpQueryTree->lpNodeBlock);
  625. FreeDocScoreList(lpQueryTree->lpDocScores);
  626. /* Free Query tree block */
  627. FreeHandle ((HANDLE)lpQueryTree->cStruct.dwReserved);
  628. }
  629. return NULL;
  630. }
  631. /*************************************************************************
  632. * @doc INTERNAL
  633. *
  634. * @func HRESULT FAR PASCAL | ProcessTerm |
  635. * This function will search the index for the given word' data.
  636. * @parm _LPQT | lpqt |
  637. * Pointer to index structure
  638. * @parm LPRETV | lpRetV |
  639. * Pointer to "globals"
  640. * @parm _LPQTNODE | lpCurQtNode |
  641. * Current node in the query tree containing important data
  642. * - The number of topics
  643. * - The location of the data
  644. * - The size of the data
  645. * - Pointer to the next word (for wildcard search)
  646. * @rdesc S_OK or other errors
  647. *************************************************************************/
PUBLIC HRESULT EXPORT_API FAR PASCAL ProcessTerm(_LPQT lpqt, LPRETV lpRetV,
    _LPQTNODE lpResQuery, _LPQTNODE lpQtNode, STRING_TOKEN FAR *lpToken)
{
    /* Walks one term's posting list, accumulating per-topic weights into
     * lpResQuery's topic list and maintaining the bucketed top-N score list
     * (lpqt->lpDocScores).  Occurrence data is decoded but discarded —
     * similarity queries only need per-topic scores. */
    DWORD dwTopicIDDelta;   // Topic-ID delta from previous sub-list.
    DWORD dwOccs;           // Number of occurences in this sub-list.
    DWORD dwTmp;            // Scratch variable.
    WORD wWeight;           // Term-weight associated with this sub-list.
    WORD wWeightMax;        // Upper bound on this doc's final score.
    DWORD dwTopicID;        // TopicId
    WORD wImportance;       // Query-side weight of this term.
    DWORD dwLength;         // Length of the word (set but unused below).
    TOPIC_LIST FAR *lpResTopicList; // Result TopicList
    HRESULT fRet;           // Returned value
    PNODEINFO pDataInfo;
    DWORD dwTopicCount;
    _LPQT lpQueryTree;      // Query tree
    OCCF occf;
    BYTE fSkipOccList = FALSE;  // NOTE(review): never read — candidate for removal
    _LPDSL lpDocScores = (_LPDSL)(lpqt->lpDocScores);

    pDataInfo = &lpRetV->DataInfo;
    if ((pDataInfo->dwDataSizeLeft = lpQtNode->cbData) == 0)
        return(S_OK); // There is nothing to process

    // Initialize variables
    occf = lpqt->occf;
    wImportance = QTN_TOKEN(lpQtNode)->wWeight;
    lpResTopicList = NULL;
    lpQueryTree = lpRetV->lpqt;
    dwTopicCount = lpQtNode->cTopic;
    /* Default weight is inversely proportional to document frequency
     * (rarer terms score higher); overridden per-posting below when the
     * index carries normalized weights. */
    wWeight = (WORD)(65535L/(lpToken ? lpToken->dwTopicCount : dwTopicCount));

    // Reset the topic count for lpQtNode so that is will not affect the
    // result in case that lpResQuery == NULL
    lpQtNode->cTopic = 0;
    if (lpResQuery == NULL)
        lpResQuery = lpQtNode;

    // Initialize the data buffer node values
    pDataInfo->pBuffer = pDataInfo->pDataNode;
    pDataInfo->nodeOffset = lpQtNode->foData;

    // Read the data block
    if ((fRet = ReadNewData(pDataInfo)) != S_OK)
        return(fRet);

    dwTopicID = 0L; // Init occurence record
    dwLength = 0;

    // for each document in posting
    for (; dwTopicCount; dwTopicCount--)
    {
        /* Check for interrupt now and then */
        /* cInterruptCount wraps; the check fires once per wrap-around */
        if ((++lpqt->cInterruptCount) == 0)
        {
            if (lpqt->fInterrupt == E_INTERRUPT)
                return E_INTERRUPT;
            if (*lpqt->cStruct.Callback.MessageFunc &&
                (fRet = (*lpqt->cStruct.Callback.MessageFunc)(
                lpqt->cStruct.Callback.dwFlags,
                lpqt->cStruct.Callback.pUserData, NULL)) != S_OK)
                return(fRet);
        }

        // Byte align
        if (pDataInfo->ibit != cbitBYTE - 1)
        {
            pDataInfo->ibit = cbitBYTE - 1;
            pDataInfo->pCurPtr ++;
        }

        // Get value from which I will calculate current doc-ID.
        if ((fRet = FGetDword(pDataInfo, lpqt->ckeyTopicId,
            &dwTopicIDDelta)) != S_OK)
        {
exit0:      /* shared error exit for all decode failures below */
            return fRet;
        }
        dwTopicID += dwTopicIDDelta;  /* posting stores deltas, not IDs */

        //
        // Get term-weight if present. I'm going to get this
        // even if I'm not doing ranking, because it's in the
        // index, and I have to get around it somehow.
        //
        if (lpqt->idxf & IDXF_NORMALIZE)
        {
            if ((fRet = FGetBits(pDataInfo, &dwTmp, sizeof (USHORT) * cbitBYTE))
                != S_OK)
                goto exit0;
            if (wImportance != MAX_WEIGHT)
                dwTmp = (dwTmp * wImportance) / 65535;
            // BUGBUG: we actually want the weights for all aliased terms
            // to be considered at once.
            wWeight = (WORD)dwTmp;
        }

        // always skip any occurrence info
        if (occf & (OCCF_OFFSET | OCCF_COUNT))
        {
            // Figure out how many occurences there are in this
            // sub-list.
            //
            if ((fRet = FGetDword(pDataInfo, lpqt->ckeyOccCount,
                &dwOccs)) != S_OK)
                goto exit0;
            if ((fRet = SkipOccList (lpqt, pDataInfo, dwOccs)) != S_OK)
                goto exit0;
        }

        // If this search includes a group, and the doc is not in the
        // group then ignore it
        if (lpQueryTree->lpGroup
            && FGroupLookup(lpQueryTree->lpGroup, dwTopicID) == FALSE)
            continue;

        // calculate relevance upper bound Dr = Ds + sum(Qi) for this document
        if (lpResTopicList = TopicNodeSearch(lpQueryTree, lpResQuery, dwTopicID))
            wWeightMax = lpResTopicList->wWeight;
        else
            wWeightMax = 0;
        wWeightMax = AddWeights(wWeightMax, wWeight);
        wWeightMax = AddWeights(wWeightMax, QTN_TOKEN(lpQtNode)->wWeightRemain);

        /* Prune: even in the best case this doc can't reach the current
         * top-N cut-off, so drop it (and evict it if already listed). */
        if (wWeightMax < GetMinDocScore(lpDocScores, ROUND_DOWN)
            &&
            IsDocScoreListFull(lpDocScores))
        {
            // do not alloc/ or remove D from result list if present
            if (lpResTopicList)
            {
                register LPITOPIC lpPrev, lpTmp;
                // find lpPrev
                // UNDONE: look into removing necessity for this loop
                // NOTE(review): walks lpQtNode->lpTopicList but frees from
                // lpResQuery — assumed equivalent when lpResQuery aliases
                // lpQtNode; verify for the lpResQuery != lpQtNode case.
                for (lpPrev = NULL, lpTmp = (LPITOPIC)lpQtNode->lpTopicList; lpTmp;
                    lpTmp = lpTmp->pNext) {
                    if (lpTmp == (LPITOPIC)lpResTopicList)
                        break;
                    lpPrev = lpTmp;
                }
                TopicNodeFree(lpQueryTree, lpResQuery, lpPrev, lpResTopicList);
#if defined(_DEBUG) && defined(_DUMPALL)
                _DPF3("Remove topic %lu, wWeightMax = %lu, MinDocScore = %u\n", dwTopicID, \
                    wWeightMax, GetMinDocScore(lpDocScores, ROUND_DOWN));
#endif
            }
            // no need to update top-N docs since this wasn't one of them
            continue;
        }

        if (lpResTopicList)
        {
            WORD wOldWeight = lpResTopicList->wWeight;
            // Calc new Ds for this doc and if good enough for the club, ensure that
            // club invariant is maintained, else leave it since it could still become
            // a club member in the future
            lpResTopicList->wWeight = AddWeights(lpResTopicList->wWeight, wWeight);
            if (lpResTopicList->wWeight > GetMinDocScore(lpDocScores, ROUND_DOWN))
                UpdateDocScoreList(lpDocScores, wOldWeight, lpResTopicList->wWeight);
#if defined(_DEBUG) && defined(_DUMPALL)
            _DPF3("Update topic %lu, wWeightMax = %lu, wWeight = %u\n", dwTopicID, \
                wWeightMax, lpResTopicList->wWeight);
#endif
            continue;
        }

        // a new document counter: possible club member, or not enough
        // total documents yet
        if ((lpResTopicList = TopicNodeAllocate(lpQueryTree)) == NULL)
        {
            fRet = E_TOOMANYTOPICS;
            goto exit0;
        }
        lpResTopicList->dwTopicId = dwTopicID;
        lpResTopicList->lpOccur = NULL;
        lpResTopicList->lcOccur = 0;
        lpResTopicList->wWeight = wWeight;
        /* Add the new TopicID node into TopicList */
        TopicNodeInsert (lpQueryTree, lpResQuery, lpResTopicList);
        UpdateDocScoreList(lpDocScores, -1, lpResTopicList->wWeight);
#if defined(_DEBUG) && defined(_DUMPALL)
        _DPF3("New topic %lu, wWeightMax = %lu, wWeight = %u\n", dwTopicID, \
            wWeightMax, lpResTopicList->wWeight);
#endif
    } // end for each topic in posting
    fRet = S_OK;
    return fRet;
}
  820. PRIVATE HRESULT PASCAL NEAR ResolveFlatQuery(_LPQT lpqt, _LPQTNODE lpCurQtNode, LPRETV lpRetV)
  821. {
  822. HRESULT fRet;
  823. PNODEINFO pLeafInfo = &lpRetV->LeafInfo;
  824. LPB astBTreeWord = lpRetV->pBTreeWord;
  825. DWORD dwTotalTopic;
  826. LPB lstModified = lpRetV->pModifiedWord;
  827. ERRB errb;
  828. WORD cByteMatched = 0;
  829. STRING_TOKEN FAR *lpStrList; /* Pointer to strings table */
  830. STRING_TOKEN FAR *lpPrev; /* Pointer to strings table */
  831. _LPDSL lpDocScores = (_LPDSL)(lpqt->lpDocScores);
  832. LPWORDINFO lpwiT;
  833. LPWORDINFO lpwiPrev;
  834. // first collect the word info for each token
  835. for (lpStrList = lpqt->lpStrList, lpPrev = NULL;
  836. lpStrList; lpStrList = lpStrList->pNext)
  837. {
  838. BOOL fNumber = TRUE;
  839. // accumulate the list of terms to have data read
  840. if ((fRet = GetWordInfoList(lpqt, lpStrList, lpCurQtNode, lpRetV)) != S_OK)
  841. {
  842. return SetErrCode (&errb, fRet);
  843. }
  844. // if no word info was available, remove the token from the list
  845. // it won't get freed until end of query, but who cares - it makes
  846. // the rest of the processing faster
  847. if (!lpStrList->lpwi)
  848. {
  849. if (lpPrev)
  850. lpPrev->pNext = lpStrList->pNext;
  851. else
  852. lpqt->lpStrList = lpStrList->pNext;
  853. // NOTE: lpPrev must remain unchanged when deleting!
  854. continue;
  855. }
  856. // cycle through all the instances of this term's lookalikes
  857. // (e.g. multiple aliases) and add up the total topic count
  858. // since we don't want to treat aliases as rare, even though
  859. // they may be.
  860. lpStrList->dwTopicCount = lpStrList->lpwi->cTopic;
  861. for (lpwiT = lpStrList->lpwi->pNext, lpwiPrev = NULL; lpwiT;
  862. lpwiPrev = lpwiT, lpwiT = lpwiT->pNext)
  863. lpStrList->dwTopicCount += lpwiT->cTopic;
  864. lpPrev = lpStrList;
  865. } // for next term
  866. // sort string list by descending term rarity
  867. SortStringWeights(lpqt);
  868. dwTotalTopic = 0;
  869. for (lpStrList = lpqt->lpStrList;
  870. lpStrList; lpStrList = lpStrList->pNext)
  871. {
  872. LPWORDINFO lpwiT;
  873. if (lpStrList->lpwi == NULL)
  874. continue;
  875. #if defined(_DEBUG) && defined(_DUMPALL)
  876. {
  877. char szTemp[256];
  878. STRNCPY(szTemp, lpStrList->lpString + 2, *(LPWORD)lpStrList->lpString);
  879. szTemp[*(LPWORD)lpStrList->lpString] = 0;
  880. _DPF1("Term: '%s'\n", szTemp);
  881. }
  882. #endif
  883. // We can terminate the query processing if the upper bound on the
  884. // smallest current doc score is lteq the current score of the R-th
  885. // biggest doc score, since any further computation will at most
  886. // result in a re-ordering of the bottom (N - R) documents.
  887. // However, this leaves the remaining documents only partially
  888. // sorted by relevancy, which may or may not be acceptable.
  889. if (AddWeights(GetMinDocScore(lpDocScores, ROUND_UP),
  890. lpStrList->wWeightRemain) <= GetSortedDocScore(lpDocScores,
  891. (int)lpRetV->SrchInfo.dwTopicFullCalc, ROUND_DOWN))
  892. break;
  893. lpqt->lpTopicStartSearch = NULL;
  894. lpqt->lpOccStartSearch = NULL;
  895. QTN_TOKEN(lpCurQtNode) = lpStrList;
  896. for (lpwiT = lpStrList->lpwi; lpwiT; lpwiT = lpwiT->pNext)
  897. {
  898. // TO DO: replace with WORDINFO in curqt node
  899. lpCurQtNode->cTopic = lpwiT->cTopic;
  900. lpCurQtNode->foData = lpwiT->foData;
  901. lpCurQtNode->cbData = lpwiT->cbData;
  902. lpCurQtNode->wRealLength = lpwiT->wRealLength;
  903. if ((fRet = ProcessTerm(lpqt, lpRetV,
  904. NULL, lpCurQtNode, lpStrList)) != S_OK)
  905. {
  906. // kevynct: no need to overwrite count on error since
  907. // we may be attempting to continue
  908. lpCurQtNode->cTopic += dwTotalTopic;
  909. return(fRet);
  910. }
  911. // Accumulate the topic count, since cTopic will be destroyed
  912. // if there is more searches for this node (such as wildcard)
  913. dwTotalTopic += lpCurQtNode->cTopic;
  914. }
  915. }
  916. lpCurQtNode->cTopic = dwTotalTopic;
  917. return S_OK;
  918. }
  919. __inline void MergeWordInfoCounts(WORDINFO FAR *lpwiDest, WORDINFO FAR *lpwiSrc)
  920. {
  921. lpwiDest->cTopic += lpwiSrc->cTopic;
  922. }
// Adds zero or more WORDINFO nodes for the passed-in string.
//
// Walks the index b-tree for lpStrToken's string: first descends the
// directory (stem) levels to locate the leaf that should contain the
// word, then scans leaf entries, pushing a WORDINFO (topic count, data
// offset, data size) onto lpStrToken->lpwi for every matching entry.
// Multiple nodes can result (e.g. per-field entries, aliases).
// Returns S_OK when the scan completes - even if nothing matched - so
// callers must check lpStrToken->lpwi to see whether data was found.
PRIVATE HRESULT GetWordInfoList(_LPQT lpqt, STRING_TOKEN FAR *lpStrToken, _LPQTNODE lpCurQtNode, LPRETV lpRetV)
{
    int cLevel;                              // current b-tree level
    int cMaxLevel;                           // number of directory levels
    int fCheckFieldId;                       // non-zero => match on field id too
    LST lstSearchStr;                        // token's Pascal search string
    LPB lpCurPtr;                            // cursor into current node buffer
    int nCmp;
    HRESULT fRet;
    int f1stIsWild;                          // first char is a wildcard
    LPB lpMaxAddress;                        // end of valid data in node buffer
    PNODEINFO pLeafInfo = &lpRetV->LeafInfo;
    DWORD dwTemp;                            // scratch for skipped fields
    LPB astBTreeWord = lpRetV->pBTreeWord;   // word extracted from the b-tree
    WORD wLen;                               // real length of extracted word
    DWORD dwFieldID;                         // field id read from leaf entry
    LPB lstModified = lpRetV->pModifiedWord; // search string, wildcards stripped
    BYTE fStemmed;                           // stemming flag (never enabled here)
    LPB pBTreeWord;                          // NOTE(review): aliases astBTreeWord
                                             // (same lpRetV buffer) - confirm
                                             // FStem stems in place
    ERRB errb;
    WORD cByteMatched = 0;                   // bytes matched (stemmed-retry path)
    WORDINFO wi;                             // word info built from leaf entry
    LPWORDINFO lpwi;                         // newly allocated list node
    fStemmed = 0;
    lstSearchStr = lpStrToken->lpString;
    f1stIsWild = (lstSearchStr[2] == WILDCARD_CHAR ||
        lstSearchStr[2] == WILDCARD_STAR);
    // Make sure to turn off stemming if there are any wildcard characters
    for (nCmp = *((LPW)lstSearchStr) + 1; nCmp >= 2; nCmp--)
    {
        if (lstSearchStr[nCmp] == '*' || lstSearchStr[nCmp] == '?')
        {
            fStemmed = FALSE;
            break;
        }
    }
    // Turn off stemming for short words (fewer than 3 chars)
    if (*(LPW)lstSearchStr < 3)
        fStemmed = FALSE;
    // Start the descent at the index root
    pLeafInfo->nodeOffset = lpqt->foIdxRoot;
    pLeafInfo->iLeafLevel = lpqt->cIdxLevels - 1;
    pLeafInfo->dwBlockSize = lpqt->dwBlockSize;
    // BUGBUG: we don't handle stemming for now.
    MEMCPY (lstModified, lstSearchStr,
        *((LPW)lstSearchStr) + sizeof (SHORT));
    // Zero terminated for wildcard search
    lstModified [*((LPW)lstModified) + 2] = 0;
    pBTreeWord = lpRetV->pBTreeWord;
    /* Change all '*' and '?' to 0. This will
     * ensure that things gets compared correctly with
     * the top node's entries
     */
    for (nCmp = *((LPW)lstModified) + 1; nCmp >= 2; nCmp--)
    {
        if (lstModified[nCmp] == '*' || lstModified[nCmp] == '?')
        {
            lstModified[nCmp] = 0;
            // shrink the Pascal length to exclude the wildcard tail
            *(LPW)lstModified = nCmp - 2;
        }
    }
    /*
     * Point node-resolution variables at the right things. This
     * sets these up to read b-tree nodes. Fields not set here are
     * set as appropriate elsewhere.
     */
    /* Set the flag: only check field ids when the index has them and the
     * query node restricts to a specific field */
    fCheckFieldId = (lpqt->occf & OCCF_FIELDID) && (lpCurQtNode->dwFieldId != DW_NIL_FIELD);
    astBTreeWord[0] = 0;
    cMaxLevel = lpqt->cIdxLevels - 1;
    /*
    First we have to find which tree level the word is in. The number of
    searches is equal to the number of tree levels at most. The
    structure of the directory node is a sequence of:
    - Words: PASCAL strings
    - Data offset: will tell us where is the
    offset of the record in the index file
    */
    for (cLevel = 0; cLevel < cMaxLevel ; cLevel++)
    {
        //
        // Get a node.
        //
        if ((fRet = ReadStemNode ((PNODEINFO)pLeafInfo, cLevel)) != S_OK)
        {
            return SetErrCode (&errb, fRet);
        }
        lpMaxAddress = pLeafInfo->pMaxAddress;
        lpCurPtr = pLeafInfo->pCurPtr;
        //
        // Loop through it. This compares the word I'm
        // looking for against the word in the b-tree.
        // If the word in the b-tree is >= the word I'm
        // looking for, I'm done.
        //
        // If I run off the end of the node, there can be
        // no match for this term, so I skip the entire
        // process.
        //
        for (;;)
        {
            if (lpCurPtr >= lpMaxAddress)
                return S_OK;    // ran off the node: no match possible
            lpCurPtr = ExtractWord(astBTreeWord, lpCurPtr, &wLen);
            if (fStemmed)
            {
                // NOTE(review): returns S_OK (not fRet) on FStem failure,
                // silently treating it as "no match" - confirm intentional
                if ((fRet = FStem (pBTreeWord, astBTreeWord)) !=
                    S_OK)
                    return(S_OK);
            }
            /* Read in NodeId record */
            lpCurPtr += ReadFileOffset (&pLeafInfo->nodeOffset, lpCurPtr);
            // A leading wildcard matches everything: take the first branch
            if (f1stIsWild)
                break;
            if (StrCmpPascal2(lstModified, pBTreeWord) <= 0)
                break;
        }
    }
    /* At this point, pLeafInfo->nodeOffset is the node id of the leaf that
    is supposed to contain the searched word. Read in the leaf node
    */
    if ((fRet = ReadLeafNode ((PNODEINFO)pLeafInfo, cLevel)) != S_OK)
    {
        return fRet;
    }
    lpCurPtr = pLeafInfo->pCurPtr;
    lpMaxAddress = pLeafInfo->pMaxAddress;
    //
    // Second step is to deal with the leaf node(s). I'm going to
    // find and capture some occurence lists. I'll probably have to
    // ignore some bogus ones first.
    //
    // Reset the word
    if (fStemmed)
    {
        MEMCPY (lstModified, lpRetV->pStemmedQueryWord,
            *(LPW)lpRetV->pStemmedQueryWord + sizeof(WORD));
    }
    else
    {
        MEMCPY (lstModified, lstSearchStr,
            *((LPW)lstSearchStr) + sizeof (SHORT));
    }
    for (;;)
    {
        // Check for out of data
        if (lpCurPtr >= lpMaxAddress)
        {
            // Get the offset of the next node (stored at start of buffer)
            ReadFileOffset (&pLeafInfo->nodeOffset, pLeafInfo->pBuffer);
            if (FoIsNil (pLeafInfo->nodeOffset))
            {
                return S_OK;    // no more leaf nodes: scan complete
            }
            // Read the next node
            if ((fRet = ReadLeafNode ((PNODEINFO)pLeafInfo, cLevel))
                != S_OK)
            {
                return SetErrCode (&errb, fRet);
            }
            // Skip the next-node offset and size prefix
            lpCurPtr =
                pLeafInfo->pBuffer + FOFFSET_SIZE + sizeof (SHORT);
            lpMaxAddress = pLeafInfo->pMaxAddress;
        }
        /* Check for interrupt now and then (every 256 iterations, when the
         * byte-sized counter wraps) */
        if ((++lpqt->cInterruptCount) == 0)
        {
            if (lpqt->fInterrupt == E_INTERRUPT)
                return E_INTERRUPT;
            if (*lpqt->cStruct.Callback.MessageFunc &&
                (fRet = (*lpqt->cStruct.Callback.MessageFunc)(
                lpqt->cStruct.Callback.dwFlags,
                lpqt->cStruct.Callback.pUserData, NULL)) != S_OK)
                return(fRet);
        }
        // Extract the word
        lpCurPtr = ExtractWord(astBTreeWord, lpCurPtr, &wLen);
        if (fStemmed)
        {
            if ((fRet = FStem (pBTreeWord, astBTreeWord)) != S_OK)
                return(fRet);
        }
        if (lpqt->occf & OCCF_FIELDID)
            lpCurPtr += CbByteUnpack (&dwFieldID, lpCurPtr);
        nCmp = CompareTerm (lpCurQtNode, lstModified, pBTreeWord, fCheckFieldId ?
            dwFieldID : lpCurQtNode->dwFieldId, lpRetV->pLeadByteTable);
        switch (nCmp)
        {
            case KEEP_SEARCHING:
                // Not there yet: skip this entry's payload.
                // Skip TopicCount
                lpCurPtr += CbByteUnpack (&dwTemp, lpCurPtr);
                // Skip data offset
                lpCurPtr += FOFFSET_SIZE;
                // Skip DataSize
                lpCurPtr += CbByteUnpack (&dwTemp, lpCurPtr);
                break;
            case STRING_MATCH:
                // Capture this entry's payload into wi
                lpCurPtr += CbByteUnpack (&wi.cTopic, lpCurPtr);
                lpCurPtr += ReadFileOffset (&wi.foData, lpCurPtr);
                lpCurPtr += CbByteUnpack (&wi.cbData, lpCurPtr);
                wi.wRealLength = wLen;// BUGBUG doublecheck this
                // Check for Topic count. This can be 0 if the word has been deleted
                // from the index
                if (wi.cTopic == 0)
                    break;
                // long search optimization: clip noise words.
                // Johnms- eliminate frequent words.
                // typically, you eliminate if in more than 1/7 of documents.
                if ((lpRetV->SrchInfo.Flag & LARGEQUERY_SEARCH)
                    &&
                    lpRetV->SrchInfo.dwValue < wi.cTopic
                    )
                {
                    break;
                }
                // allocate WORDINFO node and push it on the token's list
                if ((lpwi = BlockGetElement(lpqt->lpWordInfoBlock)) == NULL)
                    return E_OUTOFMEMORY;
                *lpwi = wi;
                lpwi->pNext = lpStrToken->lpwi;
                lpStrToken->lpwi = lpwi;
                // Save the info
                pLeafInfo->pCurPtr = lpCurPtr;
                break;
            case NOT_FOUND: // No unconditional "break" above.
                if (fStemmed && (strncmp (lstSearchStr+ 2, pBTreeWord + 2,
                    cByteMatched) == 0))
                {
                    // Continue searching in case stemming is messed up
                    // by non-alphabetic word, such as the sequence:
                    // subtopic subtopic2 subtopics
                    lpCurPtr += CbByteUnpack (&dwTemp, lpCurPtr);
                    // Skip data offset
                    lpCurPtr += FOFFSET_SIZE;
                    // Skip DataSize
                    lpCurPtr += CbByteUnpack (&dwTemp, lpCurPtr);
                    break;
                }
                return S_OK;    // passed the word's slot: done with this token
        }
    }
}
  1165. /*************************************************************************
  1166. * @doc INTERNAL
  1167. *
  1168. * @func HRESULT PASCAL FAR | FFlatCallBack |
  1169. * This call back function is called by various breakers after
  1170. * fetching a token. The token is checked for wild char presence
  1171. *
  1172. * @parm LST | lstRawWord |
  1173. * Pointer to unnormalized string
  1174. *
  1175. * @parm LST | lstNormWord |
  1176. * Pointer to normalized string. This pascal string's size should be
  1177. * at least *lstNormWord+2
  1178. *
  1179. * @parm LFO | lfoWordOffset |
  1180. * Offset into the query buffer. It is used to mark the location
  1181. * where an parsing error has occurred
  1182. *
  1183. * @parm LPQI | lpqi |
  1184. * Pointer to query info structure. This has all "global" variables
  1185. *
  1186. * @rdesc S_OK if succeeded, else various errors.
  1187. *************************************************************************/
  1188. PUBLIC HRESULT PASCAL FAR EXPORT_API FFlatCallBack (LST lstRawWord, LST lstNormWord,
  1189. LFO lfoWordOffset, LPQI lpqi)
  1190. {
  1191. /* Add extra 0 to make sure that AllocWord() gets the needed 0
  1192. * for WildCardCompare()
  1193. */
  1194. lstNormWord[*(LPW)(lstNormWord) + 2] = 0;
  1195. // add the token to the string list
  1196. if (AllocWord(lpqi->lpQueryTree, lstNormWord) == NULL)
  1197. return E_OUTOFMEMORY;
  1198. return S_OK;
  1199. }
  1200. // for now, perform simple insertion sort on the string list
  1201. // bugbug: use heapsort or faster method for long lists
  1202. // for now, we sort by total topic count decreasing (rare terms first)
  1203. PRIVATE VOID PASCAL SortStringWeights(_LPQT lpQueryTree)
  1204. {
  1205. STRING_TOKEN FAR *pStr, *pStrNext, *pT, *pTPrev;
  1206. STRING_TOKEN FAR *pStrHead = lpQueryTree->lpStrList;
  1207. DWORD dwSum, dwT;
  1208. DWORD dwMaxWeight;
  1209. WORD wWeightT;
  1210. int nCmp;
  1211. FLOAT rLog;
  1212. FLOAT rLogSquared;
  1213. FLOAT rSigma;
  1214. FLOAT rTerm;
  1215. BOOL fNormalize = FALSE; // Normalize was for testing only.
  1216. if (fNormalize)
  1217. {
  1218. rSigma = (float)0.0;
  1219. // for each term:
  1220. for (pStr = pStrHead; pStr; pStr = pStr->pNext)
  1221. {
  1222. FLOAT fOcc;
  1223. // we have to guard against the possibility of the log resulting in
  1224. // a value <= 0.0. Very rare, but possible in the future. This happens
  1225. // if dwTopicCount approaches or exceeds the N we are using (N == 100 million)
  1226. if (pStr->dwTopicCount >= cNintyFiveMillion)
  1227. rLog = cVerySmallWt; // log10(100 mil/ 95 mil) == 0.02
  1228. else
  1229. //rLog = (float) log10(cHundredMillion/(double)pHeader->dwTopicCount);
  1230. rLog = (float) (8.0 - log10((double)pStr->dwTopicCount));
  1231. rLogSquared = rLog*rLog;
  1232. // Update sigma value
  1233. // NOTE : We are bounding dwOccCount by a value of eTFThreshold
  1234. // The RHS of the equation below has an upperbound of 2 power 30.
  1235. fOcc = (float) min(cTFThreshold, pStr->cUsed);
  1236. rSigma += fOcc*fOcc*rLogSquared;
  1237. }
  1238. rSigma = (float)sqrt(rSigma);
  1239. }
  1240. // calculate final weights and corrections
  1241. dwSum = dwMaxWeight = 0L;
  1242. for (pStr = pStrHead; pStr; pStr = pStr->pNext, nCmp++)
  1243. {
  1244. BOOL fNumber;
  1245. // once sigma is known, each term's proper weight can be calculated
  1246. if (fNormalize)
  1247. {
  1248. FLOAT rWeight;
  1249. // log10(x/y) == log10 (x) - log10 (y). Since x in our case is a known constant,
  1250. // 100,000,000, I'm replacing that with its equivalent log10 value of 8.0 and subtracting
  1251. // the log10(y) from it
  1252. rTerm = (float) (8.0 - log10((double) pStr->dwTopicCount));
  1253. // In extreme cases, rTerm could be 0 or even -ve (when dwTopicCount approaches or
  1254. // exceeds 100,000,000)
  1255. if (rTerm <= (float) 0.0)
  1256. rTerm = cVerySmallWt; // very small value. == log(100 mil/ 95 mil)
  1257. // NOTE : rWeight for the doc term would be as follows:
  1258. // rWeight = float(min(4096, dwBlockSize)) * rTerm / lpipb->wi.hrgsigma[dwTopicId]
  1259. //
  1260. // Since rTerm needs to be recomputed again for the query term weight computation,
  1261. // and since rTerm will be the same value for the current term ('cos N and n of log(N/n)
  1262. // are the same (N = 100 million and n is whatever the doc term freq is for the term),
  1263. // we will factor in the second rTerm at index time. This way, we don't have to deal
  1264. // with rTerm at search time (reduces computation and query time shortens)
  1265. //
  1266. // MV 2.0 initially did the same thing. However, BinhN removed the second rTerm
  1267. // because he decided to remove the rTerm altogether from the query term weight. He
  1268. // did that to keep the scores reasonably high.
  1269. rWeight = ((float) min(cTFThreshold, pStr->cUsed))
  1270. * rTerm * rTerm / rSigma;
  1271. // without the additional rTerm, we would probably be between 0.0 and 1.0
  1272. if (rWeight > rTerm)
  1273. wWeightT = 0xFFFF;
  1274. else
  1275. wWeightT = (WORD) ((float)0xFFFF * rWeight / rTerm);
  1276. }
  1277. else
  1278. wWeightT = 65535;
  1279. pStr->wWeight = (WORD)(16383 + 49152 / pStr->dwTopicCount);
  1280. // perform any special weight adjustments here
  1281. // BUGBUG: use NextChar here, and use charmap here
  1282. // numbers four digits or less get downgraded
  1283. fNumber = TRUE;
  1284. for (nCmp = *((LPWORD)pStr->lpString) + 1; nCmp >= 2; nCmp--)
  1285. if (nCmp > 5 || !IS_DIGIT(pStr->lpString[nCmp]))
  1286. {
  1287. fNumber = FALSE;
  1288. break;
  1289. }
  1290. if (fNumber)
  1291. pStr->wWeight = pStr->wWeight / 256;
  1292. //pStr->wTermWeight = (WORD)(pStr->wWeight * wWeightT / 65535L);
  1293. dwMaxWeight = max(dwMaxWeight, pStr->wWeight);
  1294. dwSum += pStr->wWeight;
  1295. }
  1296. // now sort 'em
  1297. for (pStr = pStrHead; pStr;)
  1298. {
  1299. if (NULL == (pStrNext = pStr->pNext))
  1300. break;
  1301. if (pStrNext->wWeight <= pStr->wWeight)
  1302. {
  1303. pStr = pStr->pNext;
  1304. continue;
  1305. }
  1306. // find element in already-sorted section
  1307. for (pT = pStrHead, pTPrev = NULL; pT; pTPrev = pT, pT = pT->pNext)
  1308. {
  1309. if (pT->wWeight <= pStrNext->wWeight)
  1310. {
  1311. pStr->pNext = pStrNext->pNext;
  1312. pStrNext->pNext = pT;
  1313. if (pTPrev)
  1314. pTPrev->pNext = pStrNext;
  1315. else
  1316. pStrHead = pStrNext;
  1317. break;
  1318. }
  1319. }
  1320. }
  1321. dwT = 0;
  1322. for (pStr = pStrHead; pStr; pStr = pStr->pNext)
  1323. {
  1324. dwT += pStr->wWeight;
  1325. if (dwSum > dwT)
  1326. pStr->wWeightRemain = AddWeights(0, (WORD)((dwSum - dwT) * 65535.0 / dwSum));
  1327. else
  1328. pStr->wWeightRemain = 1;
  1329. }
  1330. lpQueryTree->lpStrList = pStrHead;
  1331. }