Source code of Windows XP (NT5)
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

2389 lines
77 KiB

  1. /*************************************************************************
  2. * *
  3. * SEARCH.C *
  4. * *
  5. * Copyright (C) Microsoft Corporation 1990-1994 *
  6. * All Rights reserved. *
  7. * *
  8. **************************************************************************
  9. * *
  10. * Module Intent *
  11. * Search Core Engine
  12. * *
  13. **************************************************************************
  14. * *
  15. * Current Owner: BinhN *
  16. * *
  17. **************************************************************************/
  18. #include <verstamp.h>
  19. SETVERSIONSTAMP(MVSR);
  20. #include <mvopsys.h>
  21. #include <mem.h>
  22. #include <memory.h>
  23. #ifdef DOS_ONLY
  24. #include <stdio.h>
  25. #include <assert.h>
  26. #endif
  27. #include <mvsearch.h>
  28. #include <groups.h>
  29. #include "common.h"
  30. #include "search.h"
  /* Source file name handed to the error-return helpers; debug builds only. */
  #if defined(_DEBUG)
  static BYTE NEAR s_aszModule[] = __FILE__;
  #endif

  /* Comparison result codes; currently compiled out. */
  #if 0
  #define KEEP_SEARCHING  ((int)-1)
  #define STRING_MATCH    0
  #define NOT_FOUND       1
  #endif

  /* KEEP_OCC is the occurrence-keeping argument given to
   * RemoveUnmarkedTopicList() (pass !KEEP_OCC for the opposite choice). */
  #define KEEP_OCC        TRUE
  #define RESET_OCC_FLAG  TRUE
  /*
   * TWOBYTE overlays the first two bytes of an object so that each byte can
   * be addressed on its own.
   */
  typedef struct
  {
      unsigned char b1;   /* Byte at the lower address */
      unsigned char b2;   /* Byte at the next address */
  } TWOBYTE;

  /*
   * BYTE1(p) / BYTE2(p) select one of the first two bytes of the lvalue p.
   * With _BIG_E defined BYTE1 is the byte at the lower address; otherwise
   * BYTE1 is the byte at the higher address, and BYTE2 is always the other
   * one.  NOTE(review): presumably _BIG_E marks big-endian targets, which
   * would make BYTE1 the most significant byte of a 16-bit value on either
   * kind of machine -- confirm against the build configuration.
   *
   * The argument is parenthesized so that '&' applies to the whole argument
   * expression, whatever the caller passes.
   */
  #ifdef _BIG_E
  #define BYTE1(p) (((TWOBYTE FAR *)&(p))->b1)
  #define BYTE2(p) (((TWOBYTE FAR *)&(p))->b2)
  #else
  #define BYTE1(p) (((TWOBYTE FAR *)&(p))->b2)
  #define BYTE2(p) (((TWOBYTE FAR *)&(p))->b1)
  #endif
  53. typedef HRESULT (PASCAL FAR *FDECODE) (PNODEINFO, CKEY, LPDW);
  54. /*************************************************************************
  55. * EXTERNAL VARIABLES
  56. * All those variables must be read only
  57. *************************************************************************/
  58. extern OPSYM OperatorArray[];
  59. extern FNHANDLER HandlerFuncTable[];
  60. extern FDECODE DecodeTable[];
  61. /*************************************************************************
  62. *
  63. * API FUNCTIONS
  64. * Those functions should be exported in a .DEF file
  65. *************************************************************************/
  66. PUBLIC LPIDX EXPORT_API FAR PASCAL MVIndexOpen (HFPB, LSZ, PHRESULT);
  67. PUBLIC void EXPORT_API FAR PASCAL MVIndexClose (LPIDX);
  68. PUBLIC LPHL EXPORT_API FAR PASCAL MVIndexSearch (LPIDX, LPQT,
  69. PSRCHINFO, LPGROUP, PHRESULT);
  70. /*************************************************************************
  71. *
  72. * INTERNAL GLOBAL FUNCTIONS
  73. * All of them should be declared far, unless they are known to be called
  74. * in the same segment
  75. *************************************************************************/
  76. VOID PASCAL FAR CleanMarkedOccList (LPITOPIC);
  77. VOID PASCAL FAR TopicWeightCalc(LPITOPIC);
  78. BOOL NEAR PASCAL FGroupLookup(LPGROUP, DWORD);
  79. LPB PASCAL FAR NextChar (LPB pStr, BYTE prgbLeadByteTable[]);
  80. __inline BOOL PASCAL FAR CompareChar (LPB pStr1, LPB pStr2, BYTE prgbLeadByteTable[]);
  81. /*************************************************************************
  82. *
  83. * INTERNAL PRIVATE FUNCTIONS
  84. * All of them should be declared near
  85. *************************************************************************/
  86. #ifndef SIMILARITY
  87. PUBLIC int PASCAL FAR CompareTerm(_LPQTNODE lpQtNode, LST lstTermWord,
  88. LST lstBtreeWord, DWORD dwBtreeFieldId, BYTE prgbLeadByteTable[]);
  89. #else
  90. PRIVATE int PASCAL NEAR CompareTerm(_LPQTNODE lpQtNode, LST lstTermWord,
  91. LST lstBtreeWord, DWORD dwBtreeFieldId, BYTE prgbLeadByteTable[]);
  92. #endif
  93. #ifndef SIMILARITY
  94. PUBLIC HRESULT PASCAL FAR SkipOccList(_LPQT lpqt, PNODEINFO pNodeInfo, DWORD dwOccs);
  95. #else
  96. PRIVATE HRESULT PASCAL NEAR SkipOccList(_LPQT lpqt, PNODEINFO pNodeInfo, DWORD dwOccs);
  97. #endif
  98. PRIVATE HRESULT PASCAL NEAR FCaptureOccList(_LPIDX, LPRETV, PNODEINFO, DWORD, int,
  99. _LPQTNODE, int);
  100. PRIVATE HRESULT PASCAL NEAR LoadNode (_LPQT, int, _LPQTNODE, _LPQTNODE,
  101. LPRETV, int, int);
  102. PRIVATE int PASCAL NEAR WildCardCompare (LPB, LPB, BYTE []);
  103. PRIVATE HRESULT PASCAL NEAR GetWordDataLocation (_LPQT, LPRETV,
  104. _LPQTNODE);
  105. PRIVATE HRESULT PASCAL NEAR GetWordData (_LPQT, LPRETV,
  106. int, _LPQTNODE, _LPQTNODE, int, int);
  /*
   * FGetDword(a, b, c): decode one value by calling the DecodeTable entry
   * selected by key b's cschScheme member with the arguments (a, b, c), and
   * yield that decoder's HRESULT.
   *
   * Each argument and the expansion as a whole are parenthesized so the macro
   * behaves like a function call inside larger expressions.  Note that b is
   * evaluated twice, so it must not have side effects.
   */
  #define FGetDword(a,b,c) ((*DecodeTable[(b).cschScheme])((a), (b), (c)))
  108. /*************************************************************************
  109. * @doc EXTERNAL API RETRIEVAL
  110. *
  111. * @func LPIDX FAR PASCAL | MVIndexOpen |
  112. * Open an index file
  113. *
  114. * @parm HANDLE | hfpbSysFile |
  115. * If non-zero, this is the handle of an already opened system file
  116. *
  117. * @parm LSZ | lszFilename |
  118. * If hpfbSysFile is non-zero, this is the index subfile filename.
  119. * If it is 0, it is the filename of a regular DOS file
  120. *
  121. * @parm PHRESULT | phr |
  122. * Pointer to error buffer. This error buffer will be used for all
  123. * subsequential index retrieval related calls
  124. *
  125. * @rdesc If succeeded, the function will return a pointer to index structure.
  126. * If failed, it will return NULL, and the error buffer will contain the
  127. * description of the error
  128. *************************************************************************/
  129. PUBLIC LPIDX EXPORT_API FAR PASCAL MVIndexOpen (HFPB hfpbSysFile,
  130. LSZ lszFilename, PHRESULT phr)
  131. {
  132. _LPIDX lpidx; // Index information.
  133. HIDX hidx; // Handle to "lpidx".
  134. HRESULT fRet;
  135. HANDLE handle;
  136. LANGID langidFull;
  137. LANGID langidPrimary;
  138. /* Allocate an IDX structure */
  139. if ((hidx = _GLOBALALLOC(GMEM_MOVEABLE | GMEM_ZEROINIT,
  140. sizeof(IDX))) == NULL)
  141. {
  142. SetErrCode(phr, E_OUTOFMEMORY);
  143. return NULL;
  144. }
  145. lpidx = (_LPIDX)_GLOBALLOCK(hidx);
  146. lpidx->hStruct = hidx;
  147. #if 0
  148. lpidx->lpfnfInterCb = lpfnfInterCb;
  149. lpidx->lpvCbParams = lpvCbParams;
  150. #endif
  151. lpidx->lperrb = phr;
  152. /* Regular DOS file */
  153. if ((lpidx->hfpbIdxSubFile = (HFPB)FileOpen (hfpbSysFile, lszFilename,
  154. hfpbSysFile ? FS_SUBFILE : REGULAR_FILE, READ, phr)) == 0)
  155. {
  156. exit0:
  157. FreeHandle(hidx);
  158. return NULL;
  159. }
  160. if ((fRet = ReadIndexHeader(lpidx->hfpbIdxSubFile, &lpidx->ih)) != S_OK)
  161. {
  162. exit01:
  163. SetErrCode (phr, fRet);
  164. IndexCloseFile(lpidx);
  165. goto exit0;
  166. }
  167. if (lpidx->ih.version != VERCURRENT || lpidx->ih.FileStamp != INDEX_STAMP)
  168. {
  169. fRet = E_BADVERSION;
  170. goto exit01;
  171. }
  172. /* Set the slack size */
  173. lpidx->wSlackSize = LEAF_SLACK;
  174. langidPrimary = PRIMARYLANGID(langidFull = LANGIDFROMLCID(lpidx->ih.lcid));
  175. /* Build the Lead-Byte Table */
  176. if (langidPrimary == LANG_JAPANESE
  177. || langidPrimary == LANG_CHINESE
  178. || langidPrimary == LANG_KOREAN)
  179. {
  180. if (NULL == (handle = _GLOBALALLOC
  181. (GMEM_MOVEABLE | GMEM_ZEROINIT, 256)))
  182. {
  183. SetErrCode (phr, E_OUTOFMEMORY);
  184. fRet = E_OUTOFMEMORY;
  185. goto exit01;
  186. }
  187. lpidx->pLeadByteTable = (LPBYTE)_GLOBALLOCK (handle);
  188. lpidx->hLeadByteTable = handle;
  189. switch (langidPrimary)
  190. {
  191. case LANG_JAPANESE:
  192. MEMSET (lpidx->pLeadByteTable + 0x81, '\1', 0x1F);
  193. MEMSET (lpidx->pLeadByteTable + 0xE0, '\1', 0x1D);
  194. break;
  195. case LANG_CHINESE:
  196. switch (SUBLANGID(langidFull))
  197. {
  198. case SUBLANG_CHINESE_TRADITIONAL:
  199. MEMSET (lpidx->pLeadByteTable + 0x81, '\1', 0x7E);
  200. break;
  201. case SUBLANG_CHINESE_SIMPLIFIED:
  202. default:
  203. // Simplified Chinese and Korean have the same lead-bytes
  204. MEMSET (lpidx->pLeadByteTable + 0xA1, '\1', 0x5E);
  205. break;
  206. }
  207. break;
  208. case LANG_KOREAN:
  209. // Simplified Chinese and Korean have the same lead-bytes
  210. MEMSET (lpidx->pLeadByteTable + 0xA1, '\1', 0x5E);
  211. break;
  212. }
  213. }
  214. if ((fRet = TopNodeRead(lpidx)) != S_OK)
  215. {
  216. if (lpidx->pLeadByteTable)
  217. {
  218. _GLOBALUNLOCK (lpidx->hLeadByteTable);
  219. _GLOBALFREE (lpidx->hLeadByteTable);
  220. }
  221. goto exit01;
  222. }
  223. /* The the callback key */
  224. lpidx->dwKey = CALLBACKKEY;
  225. return (LPIDX)lpidx;
  226. }
  227. /*************************************************************************
  228. * @doc EXTERNAL API RETRIEVAL
  229. *
  230. * @func void FAR PASCAL | MVIndexClose |
  231. * Close an index file, and release all allocated memory associated with
  232. * the index
  233. *
  234. * @parm LPIDX | lpidx |
  235. * Pointer to index information structure (got from IndexOpen())
  236. *************************************************************************/
  237. // Shuts down an index.
  238. PUBLIC void EXPORT_API FAR PASCAL MVIndexClose(_LPIDX lpidx)
  239. {
  240. if (lpidx == NULL)
  241. return;
  242. TopNodePurge(lpidx);
  243. IndexCloseFile(lpidx);
  244. if (lpidx->pLeadByteTable)
  245. {
  246. _GLOBALUNLOCK (lpidx->hLeadByteTable);
  247. _GLOBALFREE (lpidx->hLeadByteTable);
  248. }
  249. FreeHandle(lpidx->hStruct);
  250. }
  251. /*************************************************************************
  252. * @doc EXTERNAL API RETRIEVAL
  253. *
  254. * @func void FAR PASCAL | MVGetIndexInfoLpidx |
  255. * Fills in an INDEXINFO struct given an LPIDX. All members of the
  256. * INDEXINFO struct are filled in except for dwMemSize.
  257. *
  258. * @parm LPIDX | lpidx |
  259. * Pointer to index information structure (got from IndexOpen())
  260. * @parm INDEXINFO* | lpindexinfo |
  261. * Pointer to public index information structure.
  262. *************************************************************************/
  263. PUBLIC void EXPORT_API PASCAL FAR MVGetIndexInfoLpidx(LPIDX lpidx,
  264. INDEXINFO *lpindexinfo)
  265. {
  266. _LPIDX _lpidx;
  267. if (lpidx == NULL || lpindexinfo == NULL)
  268. return;
  269. _lpidx = (_LPIDX) lpidx;
  270. lpindexinfo->dwBlockSize = _lpidx->ih.dwBlockSize;
  271. lpindexinfo->Occf = _lpidx->ih.occf;
  272. lpindexinfo->Idxf = _lpidx->ih.idxf;
  273. lpindexinfo->dwCodePageID = _lpidx->ih.dwCodePageID;
  274. lpindexinfo->lcid = _lpidx->ih.lcid;
  275. lpindexinfo->dwBreakerInstID = _lpidx->ih.dwBreakerInstID;
  276. }
  277. /*************************************************************************
  278. * @doc EXTERNAL API RETRIEVAL
  279. *
  280. * @func void FAR PASCAL | MVStopSearch |
  281. * This function will stop the search process. Typically it can be
  282. * only used in a multithreaded environment, where another thread
  283. * will use the query structure, which is currently accessed by the
  284. * the current search, to tell the search process to stop.
  285. *
  286. * @parm LPQT | lpqt |
  287. * Pointer to the query structure used by MVIndexSearch()
  288. *************************************************************************/
  289. PUBLIC VOID EXPORT_API FAR PASCAL MVStopSearch (_LPQT lpqt)
  290. {
  291. lpqt->fInterrupt = (BYTE)E_INTERRUPT;
  292. }
  293. /*************************************************************************
  294. * @doc EXTERNAL API RETRIEVAL
  295. *
  296. * @func void FAR PASCAL | MVSearchSetCallback |
  297. * Set appropriate user's call back function to be called during the search.
  298. * The user's function will be polled at interval. It should return
  299. * S_OK if there is nothing to process, E_INTERRUPT to abort the
  300. * search and dispose the search result, or ERR_TOOMANYDOCS to abort the
  301. * search, but keep the partial result
  302. * @parm LPQT | lpqt |
  303. * Pointer to query structure returned by MVQueryParse().
  304. * @parm PFCALLBACK_MSG | pfCallBackMsg |
  305. * Pointer to call back structure
  306. * @rdesc Return S_OK if successful, or E_INVALIDARG if any parameter
  307. * is NULL
  308. *************************************************************************/
  309. PUBLIC HRESULT EXPORT_API FAR PASCAL MVSearchSetCallback (_LPQT lpqt,
  310. PFCALLBACK_MSG pfCallBackMsg)
  311. {
  312. if (lpqt == NULL || pfCallBackMsg == NULL)
  313. return(E_INVALIDARG);
  314. lpqt->cStruct.Callback = *pfCallBackMsg;
  315. return(S_OK);
  316. }
  317. /*************************************************************************
  318. * @doc EXTERNAL API RETRIEVAL
  319. *
  320. * @func LPHL FAR PASCAL | MVIndexSearch |
  321. * Carry the search
  322. *
  323. * @parm LPIDX | lpidx |
  324. * Pointer to index information.
  325. *
  326. * @parm LPQT | lpqt |
  327. * Pointer to query tree (returned by MVQueryParse())
  328. *
  329. * @parm PSRCHINFO | pSrchInfo |
  330. * Pointer to search information data
  331. *
  332. * @parm _LPGROUP | lpResGroup |
  333. * Pointer to resulting group
  334. *
  335. * @parm PHRESULT | phr |
  336. * Pointer to error buffer
  337. *
  338. * @rdesc Pointer to hitlist structure if succeeded, even there is
  339. * no hits (use MVHitListEntries() to find out how many hits have been
  340. * returned). It will return NULL if failed. The error buffer
  341. * (see IndexOpen()) will contain descriptions about the cause of
  342. * the failure. There is one special case when the function returns
  343. * a non-null pointer, even there is error, that is when it can't
  344. * write the result to the disk, and everything is still in memory.
  345. *
  346. *************************************************************************/
  347. PUBLIC LPHL EXPORT_API FAR PASCAL MVIndexSearch (_LPIDX lpidx,
  348. _LPQT lpqt, PSRCHINFO pSrchInfo, _LPGROUP lpResGroup, PHRESULT phr)
  349. {
  350. HRESULT fRet; // Return from this function.
  351. LPRETV lpRetV; // Retrieval memory/files.
  352. GHANDLE hRetv;
  353. OCCF occf; // Index occurence flags temporary variable.
  354. _LPHL lphl; // Pointer to hitlist
  355. _LPQTNODE lpTreeTop;
  356. if (lpidx == NULL || lpqt == NULL || pSrchInfo == NULL)
  357. {
  358. /* We get some bad arguments!! */
  359. SetErrCode (phr, E_INVALIDARG);
  360. return NULL;
  361. }
  362. fRet = E_FAIL; // Assume thing will go wrong
  363. // Transfer all the information about the index to the query tree
  364. lpqt->foIdxRoot = lpidx->ih.foIdxRoot; /* Top node offset */
  365. lpqt->dwBlockSize = lpidx->ih.dwBlockSize; /* Index block size */
  366. lpqt->cIdxLevels = lpidx->ih.cIdxLevels; /* Index's depth */
  367. lpqt->occf = lpidx->ih.occf;
  368. lpqt->idxf = lpidx->ih.idxf;
  369. lpqt->foIdxRoot = lpidx->ih.foIdxRoot;
  370. lpqt->ckeyTopicId = lpidx->ih.ckeyTopicId;
  371. lpqt->ckeyOccCount = lpidx->ih.ckeyOccCount;
  372. lpqt->ckeyWordCount = lpidx->ih.ckeyWordCount;
  373. lpqt->ckeyOffset = lpidx->ih.ckeyOffset;
  374. if (lpqt->cQuery == 1)
  375. lpqt->fFlag |= ALL_ANDORNOT;
  376. #if 1
  377. if (pSrchInfo->dwMemAllowed)
  378. {
  379. if (DO_FAST_MERGE(pSrchInfo, lpqt))
  380. {
  381. SetBlockCount (lpqt->lpTopicMemBlock, (WORD)(pSrchInfo->dwMemAllowed /
  382. (sizeof(TOPIC_LIST) * cTOPIC_PER_BLOCK)));
  383. SetBlockCount (lpqt->lpOccMemBlock, 1);
  384. }
  385. else
  386. {
  387. SetBlockCount (lpqt->lpTopicMemBlock, (WORD)(pSrchInfo->dwMemAllowed * 2 /
  388. (5 * sizeof(TOPIC_LIST) * cTOPIC_PER_BLOCK)));
  389. SetBlockCount (lpqt->lpOccMemBlock, (WORD)(pSrchInfo->dwMemAllowed * 3 /
  390. (5 * sizeof(OCCURENCE) * cOCC_PER_BLOCK)));
  391. }
  392. }
  393. #endif
  394. /* Allocate hitlist */
  395. if ((lphl = (_LPHL)GLOBALLOCKEDSTRUCTMEMALLOC(sizeof (HL))) == NULL)
  396. {
  397. SetErrCode(phr, E_OUTOFMEMORY);
  398. return NULL;
  399. }
  400. lphl->lLastTopicId = 0xffffffff;
  401. lphl->lcMaxTopic = lpidx->ih.lcTopics;
  402. /* Allocate a return value structure */
  403. if ((hRetv = _GLOBALALLOC(GMEM_MOVEABLE | GMEM_ZEROINIT,
  404. sizeof(RETV))) == NULL)
  405. {
  406. SetErrCode(phr, E_OUTOFMEMORY);
  407. exit0:
  408. if (fRet != S_OK && fRet != E_TOOMANYTOPICS)
  409. {
  410. MVHitListDispose(lphl);
  411. lphl = NULL;
  412. }
  413. return (LPHL)lphl;
  414. }
  415. lpRetV = (LPRETV)_GLOBALLOCK(hRetv);
  416. lpRetV->lpqt = lpqt;
  417. if ((fRet = TopNodeRead(lpidx)) != S_OK)
  418. {
  419. SetErrCode (phr, fRet);
  420. exit02:
  421. FreeHandle(hRetv);
  422. goto exit0;
  423. }
  424. //
  425. // Count the number of occurence fields present. My retrieval
  426. // occurence record is going to cost 4 bytes per field.
  427. //
  428. occf = lpqt->occf;
  429. for (lpRetV->cOccFields = 0; occf; lpRetV->cOccFields++)
  430. occf &= occf - 1;
  431. lpqt->dwOccSize = lpRetV->dwOccSize =
  432. sizeof(OCCURENCE) + lpRetV->cOccFields * sizeof (DWORD);
  433. lpRetV->fRank = ((pSrchInfo->Flag &
  434. (QUERYRESULT_RANK | QUERYRESULT_NORMALIZE)) != 0);
  435. // Set pointer to various buffer
  436. lpRetV->LeafInfo.pTopNode = lpidx->lrgbTopNode;
  437. lpRetV->LeafInfo.pStemNode = lpRetV->pNodeBuf;
  438. lpRetV->LeafInfo.pLeafNode = lpRetV->pNodeBuf;
  439. lpRetV->LeafInfo.pDataNode = lpRetV->pDataBuf;
  440. lpRetV->LeafInfo.hfpbIdx = lpidx->hfpbIdxSubFile; // Index file to read from
  441. lpRetV->DataInfo.pTopNode = lpidx->lrgbTopNode;
  442. lpRetV->DataInfo.pStemNode = lpRetV->pNodeBuf;
  443. lpRetV->DataInfo.pLeafNode = lpRetV->pNodeBuf;
  444. lpRetV->DataInfo.pDataNode = lpRetV->pDataBuf;
  445. lpRetV->DataInfo.hfpbIdx = lpidx->hfpbIdxSubFile; // Index file to read from
  446. lpRetV->lcid = lpidx->ih.lcid;
  447. lpRetV->pLeadByteTable = lpidx->pLeadByteTable;
  448. // Save search information
  449. lpRetV->SrchInfo = *pSrchInfo;
  450. if (pSrchInfo->dwValue == 0)
  451. lpRetV->SrchInfo.dwValue = (DWORD)(-1);
  452. else
  453. lpRetV->SrchInfo.dwValue = lpidx->ih.lcTopics/pSrchInfo->dwValue;
  454. if ( (fRet = ResolveTree(lpqt, lpTreeTop = lpqt->lpTopNode,
  455. lpRetV, E_FAIL)) != S_OK)
  456. {
  457. SetErrCode (phr, fRet);
  458. /* Free the Topic and Occurrence memory blocks since they are
  459. * not freed by QueryTreeFree(), or MVHitListDispose() at this
  460. * point
  461. */
  462. if (fRet != E_TOOMANYTOPICS)
  463. {
  464. BlockFree ((LPV)lpqt->lpTopicMemBlock);
  465. BlockFree ((LPV)lpqt->lpOccMemBlock);
  466. lpqt->lpTopicMemBlock = NULL;
  467. lpqt->lpOccMemBlock = NULL;
  468. goto exit02;
  469. }
  470. }
  471. if (lpqt->fFlag & HAS_NEAR_RESULT)
  472. {
  473. NearHandlerCleanUp (lpqt, lpTreeTop);
  474. }
  475. /* Create a group if requested */
  476. if ((pSrchInfo->Flag & QUERYRESULT_GROUPCREATE) && lpResGroup)
  477. {
  478. LPITOPIC lpCurTopic; /* Topic's current pointer */
  479. LPB lpbGrpBitVect;
  480. DWORD maxTopicId;
  481. /* Initialize the pointer */
  482. lpbGrpBitVect = lpResGroup->lpbGrpBitVect;
  483. maxTopicId = lpResGroup->dwSize * 8;
  484. for (lpCurTopic = QTN_TOPICLIST(lpTreeTop); lpCurTopic;
  485. lpCurTopic = lpCurTopic->pNext)
  486. {
  487. /* Set the bit */
  488. if (lpCurTopic->dwTopicId < maxTopicId)
  489. {
  490. lpbGrpBitVect[(DWORD)(lpCurTopic->dwTopicId / 8)] |= 1 <<
  491. (lpCurTopic->dwTopicId % 8);
  492. }
  493. }
  494. lpResGroup->lcItem = lpTreeTop->cTopic; // erinfox: this wasn't getting set!
  495. }
  496. if ((pSrchInfo->Flag & QUERYRESULT_UIDSORT) == 0)
  497. {
  498. // if we are skipping occurrence info, topic weights
  499. // will have already been calculated directly
  500. if (lpRetV->fRank && !DO_FAST_MERGE(pSrchInfo, lpqt))
  501. TopicWeightCalc(QTN_TOPICLIST(lpTreeTop));
  502. if (lpqt->fFlag & (HAS_NEAR_RESULT | ORDERED_BASED))
  503. {
  504. SortResult (lpqt, lpTreeTop, ORDERED_BASED);
  505. lpqt->fFlag &= ~(HAS_NEAR_RESULT | TO_BE_SORTED);
  506. }
  507. /* Sort the result depending on ranking or not */
  508. if (lpRetV->fRank)
  509. SortResult ((LPQT)lpqt, lpTreeTop, WEIGHT_BASED);
  510. else
  511. SortResult ((LPQT)lpqt, lpTreeTop, HIT_COUNT_BASED);
  512. }
  513. /* Update HitList info structure, cut off the unwanted list */
  514. if (lphl->lpTopicList = lpTreeTop->lpTopicList)
  515. lphl->lcReturnedTopics = lphl->lcTotalNumOfTopics = lpTreeTop->cTopic;
  516. // Only return the number of topics that the user requested
  517. // if dwTopicCount == 0, it means that the user wants to return all
  518. if (pSrchInfo->dwTopicCount != 0 &&
  519. pSrchInfo->dwTopicCount < lphl->lcReturnedTopics)
  520. lphl->lcReturnedTopics = pSrchInfo->dwTopicCount;
  521. lphl->lpOccMemBlock = lpqt->lpOccMemBlock;
  522. lphl->lpTopicMemBlock = lpqt->lpTopicMemBlock;
  523. #if 1
  524. /* WARNING: The following code should be commented out for
  525. * diskless devices. No returned error is checked, since
  526. * if disk writes fail, everything is still in memory
  527. */
  528. if ((pSrchInfo->Flag & QUERYRESULT_IN_MEM) == 0)
  529. {
  530. if ((fRet = MVHitListFlush (lphl, lphl->lcReturnedTopics)) != S_OK)
  531. SetErrCode (phr, fRet);
  532. }
  533. #endif
  534. goto exit02;
  535. }
  536. /*************************************************************************
  537. * @doc INTERNAL
  538. *
  539. * @func HRESULT PASCAL NEAR | ResolveTree |
  540. * This function will read in the data from the index file for
  541. * each word, and combine them according to the operators.
  542. *
  543. * @func _LPQT | lpqt |
  544. * Index information
  545. *
  546. * @parm _LPQTNODE | lpQtNode |
  547. * Query tree top node to be resolved
  548. *
  549. * @parm LPRETV | lpRetV |
  550. * Returned values
  551. *
  552. * @parm int | fDivide |
  553. * Divide the weight between occurences
  554. *
  555. * @rdesc S_OK, or other errors
  556. *************************************************************************/
  557. PUBLIC HRESULT PASCAL NEAR ResolveTree(_LPQT lpqt, _LPQTNODE lpQtNode,
  558. LPRETV lpRetV, int fDivide)
  559. {
  560. _LPQTNODE lpLeft; /* Left node */
  561. _LPQTNODE lpRight; /* Right node */
  562. WORD OpVal; /* Operator value */
  563. WORD NodeType; /* type of node */
  564. HRESULT fRet = S_OK; /* Return value */
  565. HRESULT fOutOfMemory = S_OK;
  566. _LPQT lpQueryTree = lpRetV->lpqt;
  567. _LPQTNODE FAR *rgStack;
  568. HANDLE hStack;
  569. int StackTop = -1;
  570. /* Allocate a stack large enough to handle the tree's "recursion" */
  571. if ((hStack = _GLOBALALLOC(DLLGMEM_ZEROINIT, (LCB)lpQueryTree->TreeDepth *
  572. sizeof(_LPQTNODE))) == NULL)
  573. return E_OUTOFMEMORY;
  574. rgStack = (_LPQTNODE FAR *)_GLOBALLOCK(hStack);
  575. /* Traverse the tree */
  576. for (; lpQtNode;)
  577. {
  578. if (QTN_FLAG(lpQtNode) & PROCESSED)
  579. {
  580. /* This node has already been processed, just move up one
  581. * level, and continue the process
  582. */
  583. goto PopStack;
  584. }
  585. /* Handle TERM_NODE */
  586. if ((NodeType = QTN_NODETYPE(lpQtNode)) == TERM_NODE)
  587. {
  588. lpQueryTree->lpTopicStartSearch = NULL;
  589. lpQueryTree->lpOccStartSearch = NULL;
  590. if ((fRet = LoadNode (lpqt, OR_OP, NULL, lpQtNode,
  591. lpRetV, fDivide, fOutOfMemory)) != S_OK)
  592. {
  593. if (fRet != E_TOOMANYTOPICS)
  594. goto Exit;
  595. fOutOfMemory = E_TOOMANYTOPICS;
  596. // kevynct: delay abort until processing of operator node
  597. // goto TooManyHits;
  598. }
  599. if (QTN_TOPICLIST(lpQtNode))
  600. QTN_NODETYPE(lpQtNode) = EXPRESSION_NODE;
  601. else
  602. QTN_NODETYPE(lpQtNode) = NULL_NODE;
  603. /* Mark that the node has been processed */
  604. QTN_FLAG(lpQtNode) |= PROCESSED;
  605. goto PopStack;
  606. }
  607. OpVal = lpQtNode->OpVal;
  608. if (NodeType == OPERATOR_NODE)
  609. {
  610. if ((QTN_FLAG(lpLeft = QTN_LEFT(lpQtNode)) & PROCESSED) == 0)
  611. {
  612. /* Resolve left tree if we have not resolve it yet
  613. * Push the current node onto the stack, and process the
  614. * left node
  615. */
  616. rgStack[++StackTop] = lpQtNode;
  617. lpQtNode = lpLeft;
  618. continue;
  619. }
  620. /* Assertion for correctness */
  621. RET_ASSERT (QTN_NODETYPE(lpLeft) == EXPRESSION_NODE ||
  622. QTN_NODETYPE(lpLeft) == NULL_NODE);
  623. /* Binary operator. */
  624. /* Special cases */
  625. if (QTN_NODETYPE(lpLeft) == NULL_NODE)
  626. {
  627. switch (OpVal)
  628. {
  629. case AND_OP: // NULL & a = NULL
  630. case NEAR_OP: // NULL NEAR a = NULL
  631. case PHRASE_OP: // NULL PHRASE a = NULL ??
  632. case NOT_OP: // NULL not a = NULL
  633. /*
  634. * Change the sub-tree to a node and forget about
  635. * the right sub-tree that is not processed yet
  636. */
  637. *lpQtNode = *lpLeft;
  638. QTN_RIGHT(lpQtNode) = QTN_LEFT(lpQtNode) = NULL;
  639. goto PopStack;
  640. }
  641. }
  642. // kevynct: Handle partial hit list:
  643. //
  644. // In case we run out of memory for the left tree, we can sometimes still
  645. // partially handle the right tree. For example, we keep going if AND-like op with
  646. // right term node since this will likely at least increase chance of a smaller, more
  647. // meaningful result. For OR-like operators, we ignore right sub-tree altogether if
  648. // we haven't already traversed it.
  649. //
  650. // In any case, if there was a partial hitlist this function will still return
  651. // with E_TOOMANYTOPICS.
  652. if (fOutOfMemory)
  653. {
  654. switch (OpVal)
  655. {
  656. case OR_OP:
  657. // if right subtree already processed, keep it, since all memory
  658. // has already been allocated by this point and the handler will merely
  659. // combine.
  660. if (QTN_FLAG(QTN_RIGHT(lpQtNode)) & PROCESSED)
  661. break;
  662. /*
  663. * Change the sub-tree to a node and forget about
  664. * the right sub-tree that is not processed yet
  665. */
  666. *lpQtNode = *lpLeft;
  667. QTN_RIGHT(lpQtNode) = QTN_LEFT(lpQtNode) = NULL;
  668. goto PopStack;
  669. case AND_OP:
  670. case NEAR_OP:
  671. case PHRASE_OP:
  672. case NOT_OP:
  673. // continue processing if right node is a single term OR we've already
  674. // processed it. otherwise, another left node will get loaded later and we know we are
  675. // already oom.
  676. if ((QTN_FLAG(QTN_RIGHT(lpQtNode)) & PROCESSED)
  677. ||
  678. QTN_NODETYPE(QTN_RIGHT(lpQtNode)) == TERM_NODE)
  679. break;
  680. // warning: fallthru
  681. default:
  682. goto TooManyHits;
  683. }
  684. }
  685. /* Make some preparations before resolving the right tree */
  686. lpQueryTree->lpTopicStartSearch = NULL;
  687. lpQueryTree->lpOccStartSearch = NULL;
  688. /* Do some preparations for NOT operator */
  689. if (OpVal == NOT_OP)
  690. {
  691. MarkTopicList(lpLeft);
  692. }
  693. if (OpVal != PHRASE_OP && OpVal != NEAR_OP &&
  694. (lpQueryTree->fFlag & TO_BE_SORTED))
  695. {
  696. if (lpQueryTree->fFlag & HAS_NEAR_RESULT)
  697. NearHandlerCleanUp (lpQueryTree, lpLeft);
  698. /* We have to sort the left tree, which is the result of PHRASE,
  699. * to remove redundancies. This step should only be done after
  700. * we finishes processing ALL PHRASE terms. Same for NEAR
  701. */
  702. lpQueryTree->fFlag &= ~TO_BE_SORTED;
  703. SortResult (lpQueryTree, lpLeft, ORDERED_BASED);
  704. }
  705. /* Resolve the right tree */
  706. if (QTN_NODETYPE(lpRight = QTN_RIGHT(lpQtNode)) == TERM_NODE)
  707. {
  708. /* Handle EXPRESSION_TERM */
  709. if ((fRet = LoadNode (lpqt, OpVal, lpLeft, lpRight,
  710. lpRetV, fDivide, fOutOfMemory)) != S_OK)
  711. {
  712. if (fRet != E_TOOMANYTOPICS)
  713. goto Exit;
  714. fOutOfMemory = E_TOOMANYTOPICS;
  715. // kevynct: delay abort until processing of operator node
  716. // goto TooManyHits;
  717. }
  718. switch (OpVal)
  719. {
  720. case NEAR_OP:
  721. RemoveUnmarkedNearTopicList(lpQueryTree, lpLeft);
  722. lpQueryTree->fFlag |= TO_BE_SORTED | HAS_NEAR_RESULT;
  723. break;
  724. case PHRASE_OP:
  725. RemoveUnmarkedTopicList(lpQueryTree, lpLeft, !KEEP_OCC);
  726. lpQueryTree->fFlag |= TO_BE_SORTED;
  727. break;
  728. case AND_OP:
  729. RemoveUnmarkedTopicList(lpQueryTree, lpLeft, KEEP_OCC);
  730. CleanMarkedOccList (lpLeft->lpTopicList);
  731. break;
  732. case NOT_OP:
  733. RemoveUnmarkedTopicList(lpQueryTree, lpLeft, KEEP_OCC);
  734. break;
  735. }
  736. if (QTN_TOPICLIST(lpLeft))
  737. QTN_NODETYPE(lpLeft) = EXPRESSION_NODE;
  738. else
  739. QTN_NODETYPE(lpLeft) = NULL_NODE;
  740. }
  741. else
  742. {
  743. if ((QTN_FLAG(lpRight = QTN_RIGHT(lpQtNode)) &
  744. PROCESSED) == 0)
  745. {
  746. /* Resolve right tree if we have not resolved it yet
  747. * Push the current node onto the stack, and process the
  748. * left node
  749. */
  750. rgStack[++StackTop] = lpQtNode;
  751. lpQtNode = lpRight;
  752. continue;
  753. }
  754. /* Apply the operator */
  755. if ((fRet = (*HandlerFuncTable[OpVal])(lpQueryTree,
  756. lpLeft, NULL, (BYTE FAR *)lpRight,
  757. EXPRESSION_EXPRESSION)) != S_OK)
  758. {
  759. /* Copy the result, and release the nodes */
  760. if (fRet != E_TOOMANYTOPICS)
  761. goto Exit;
  762. // kevynct: we check for out of memory below
  763. }
  764. switch (OpVal)
  765. {
  766. case NEAR_OP:
  767. lpQueryTree->fFlag |= HAS_NEAR_RESULT;
  768. RemoveUnmarkedNearTopicList(lpQueryTree, lpLeft);
  769. break;
  770. case PHRASE_OP:
  771. RemoveUnmarkedTopicList(lpQueryTree, lpLeft, !KEEP_OCC);
  772. break;
  773. case NOT_OP:
  774. RemoveUnmarkedTopicList(lpQueryTree, lpLeft, KEEP_OCC);
  775. break;
  776. }
  777. }
  778. *lpQtNode = *lpLeft; // Change the sub-tree to a node
  779. QTN_FLAG(lpQtNode) |= PROCESSED;
  780. #if 0
  781. FreeHandle (lpLeft->hStruct);
  782. FreeHandle (lpRight->hStruct);
  783. #endif
  784. QTN_LEFT(lpQtNode) = QTN_RIGHT(lpQtNode) = NULL;
  785. // kevynct: only quit if this error comes from processing a real operator node
  786. // since fOutOfMemory is not set in that case above, whereas it IS set
  787. // when processing term node. Just a hack.
  788. if (fRet == E_TOOMANYTOPICS && !fOutOfMemory)
  789. goto TooManyHits;
  790. }
  791. PopStack:
  792. if (StackTop >= 0)
  793. {
  794. lpQtNode = rgStack[StackTop];
  795. StackTop--;
  796. }
  797. else
  798. break;
  799. }
  800. // kevynct: if we got this far, the tree was completed, but we may have only
  801. // been processing a partial hitlist (e.g. multiple "and") so we need
  802. // to still notify of possible oom even though all cleanup has been done
  803. fRet = fOutOfMemory;
  804. Exit:
  805. /* Release the stack */
  806. FreeHandle(hStack);
  807. return fRet;
  808. TooManyHits:
  809. /* If we hit that label, it means that we have too many hits
  810. * lpQtNode is the left node, the right node has been
  811. * processed. What we have to do now is to keep the partial
  812. * result, and release all nodes
  813. */
  814. if (StackTop >= 0)
  815. {
  816. /* The root node is saved on the stack */
  817. lpLeft = QTN_LEFT(*rgStack);
  818. lpRight = QTN_RIGHT(*rgStack);
  819. QTN_LEFT(*rgStack) = QTN_RIGHT(*rgStack) = NULL;
  820. *rgStack[0] = *lpQtNode;
  821. }
  822. FreeHandle(hStack);
  823. return E_TOOMANYTOPICS;
  824. }
  825. VOID PASCAL FAR TopicWeightCalc(LPITOPIC lpCurTopic)
  826. {
  827. LPIOCC lpCurOcc;
  828. WORD wWeight;
  829. for (; lpCurTopic; lpCurTopic = lpCurTopic->pNext)
  830. {
  831. wWeight = 0;
  832. for (lpCurOcc = lpCurTopic->lpOccur; lpCurOcc;
  833. lpCurOcc = lpCurOcc->pNext)
  834. {
  835. if (wWeight > (WORD)(wWeight + lpCurOcc->wWeight))
  836. {
  837. wWeight = MAX_WEIGHT;
  838. break;
  839. }
  840. else
  841. wWeight += lpCurOcc->wWeight;
  842. }
  843. lpCurTopic->wWeight = wWeight;
  844. }
  845. }
  846. #if 0
  /* NOTE: everything down to the matching #endif is compiled out (#if 0).
   * As written the body would not build: it tests fStemmed, which is not
   * among this function's locals, and it calls CompareTerm with four
   * arguments while the CompareTerm defined in this file takes five.
   * The live LoadNode routine in this file performs the same stem-node
   * descent and leaf scan.
   */
  847. /*************************************************************************
  848. * @doc INTERNAL
  849. *
  850. * @func HRESULT FAR PASCAL | GetWordDataLocation |
  851. * This function will search the index for the given word. It will
  852. * return back information about:
  853. * - The number of topics
  854. * - The location of the data
  855. * - The size of the data
  856. * - Pointer to the next word (for wildcard search)
  857. * @parm _LPQT | lpqt |
  858. * Pointer to index structure
  859. * @parm LPRETV | lpRetV |
  860. * Pointer to "globals"
  861. * @parm _LPQTNODE | lpCurQtNode |
  862. * Current node in the query tree
  863. * @rdesc S_OK or other errors
   * @comm On a match the node's cTopic, foData, cbData and dwFieldId are
   * filled in, and iCurOff is set (only if it was 0) to the offset in the
   * leaf buffer just past the matched entry. S_OK is also returned when
   * no match is found.
  864. *************************************************************************/
  865. PRIVATE HRESULT NEAR PASCAL GetWordDataLocation (_LPQT lpqt,
  866. LPRETV lpRetV, _LPQTNODE lpCurQtNode)
  867. {
  868. int cLevel;
  869. int cMaxLevel;
  870. int fCheckFieldId;
  871. LST lstSearchStr;
  872. LPB lpCurPtr;
  873. int nCmp;
  874. HRESULT fRet;
  875. int f1stIsWild;
  876. LPB lpMaxAddress;
  877. PNODEINFO pLeafInfo = &lpRetV->LeafInfo;
  878. DWORD dwTemp;
  879. LPB astBTreeWord = lpRetV->pBTreeWord;
  880. WORD wLen;
  881. DWORD dwFieldID;
  882. ERRB errb;
  883. BYTE lstModified[CB_MAX_WORD_LEN + sizeof (SHORT)];
  884. lstSearchStr = QTN_TOKEN(lpCurQtNode)->lpString;
  // The first character follows the 2-byte length word of the string
  885. f1stIsWild = (lstSearchStr[2] == WILDCARD_CHAR ||
  886. lstSearchStr[2] == WILDCARD_STAR);
  // Start the descent at the index root
  887. pLeafInfo->nodeOffset = lpqt->foIdxRoot;
  888. pLeafInfo->iLeafLevel = lpqt->cIdxLevels - 1;
  889. pLeafInfo->dwBlockSize = lpqt->dwBlockSize;
  890. /* Copy and change all '*' and '?' to 0. This will
  891. * ensure that things gets compared correctly with
  892. * the top node's entries
  893. */
  894. MEMCPY (lstModified, lstSearchStr,
  895. *((LPW)lstSearchStr) + sizeof (SHORT));
  896. for (nCmp = *((LPW)lstModified) + 1; nCmp > 2; nCmp--)
  897. {
  898. if (lstModified[nCmp] == '*' || lstModified[nCmp] == '?')
  899. {
  900. lstModified[nCmp] = 0;
  // NOTE(review): this stores a single byte at index 0, whereas LoadNode
  // writes the truncated length through *(LPW)lstModified.
  901. lstModified[0] = nCmp - 2;
  902. }
  903. }
  904. /*
  905. * Point node-resolution variables at the right things. This
  906. * sets these up to read b-tree nodes. Fields not set here are
  907. * set as appropriate elsewhere.
  908. */
  909. /* Set the flag */
  910. fCheckFieldId = ((lpqt->occf & OCCF_FIELDID) &&
  911. (lpCurQtNode->dwFieldId != DW_NIL_FIELD));
  912. astBTreeWord[0] = 0;
  913. cMaxLevel = lpqt->cIdxLevels - 1;
  914. /*
  915. First we have to find which tree level the word is in. The number of
  916. searches is equal to the number of tree levels at most. The
  917. structure of the directory node is a sequence of:
  918. - Words: PASCAL strings
  919. - Data offset: will tell us where is the
  920. offset of the record in the index file
  921. */
  922. for (cLevel = 0; cLevel < cMaxLevel ; cLevel++)
  923. {
  924. //
  925. // Get a node.
  926. //
  927. if ((fRet = ReadStemNode ((PNODEINFO)pLeafInfo, cLevel)) != S_OK)
  928. {
  929. return SetErrCode (&errb, fRet);
  930. }
  931. lpMaxAddress = pLeafInfo->pMaxAddress;
  932. lpCurPtr = pLeafInfo->pCurPtr;
  933. //
  934. // Loop through it. This compares the word I'm
  935. // looking for against the word in the b-tree.
  936. // If the word in the b-tree is >= the word I'm
  937. // looking for, I'm done.
  938. //
  939. // If I run off the end of the node, there can be
  940. // no match for this term, so I skip the entire
  941. // process.
  942. //
  943. for (;;)
  944. {
  945. if (lpCurPtr >= lpMaxAddress)
  946. return S_OK;
  947. lpCurPtr = ExtractWord(astBTreeWord, lpCurPtr, &wLen);
  948. /* Read in NodeId record */
  949. lpCurPtr += ReadFileOffset (&pLeafInfo->nodeOffset, lpCurPtr);
  // A leading wildcard always takes the first entry of each stem node
  950. if (f1stIsWild)
  951. break;
  952. if (StrCmpPascal2(lstModified, astBTreeWord) <= 0)
  953. break;
  954. // erinfox:
  955. // if stemming is turned on, there could be a case in which the stemmed
  956. // word is less than the search term, but the unstemmed word is greater.
  957. // if we don't check the unstemmed, we'll skip this node erroneously.
  // NOTE(review): fStemmed is not declared in this function, and this test
  // repeats the comparison made just above with the same arguments.
  958. if (fStemmed && StrCmpPascal2(lstModified, astBTreeWord) <= 0)
  959. break;
  960. }
  961. }
  962. /* At this point, pLeafInfo->nodeOffset is the node id of the leaf that
  963. is supposed to contain the searched word. Read in the leaf node
  964. */
  965. if ((fRet = ReadLeafNode ((PNODEINFO)pLeafInfo, cLevel)) != S_OK)
  966. {
  967. return fRet;
  968. }
  969. lpCurPtr = pLeafInfo->pCurPtr;
  970. lpMaxAddress = pLeafInfo->pMaxAddress;
  971. //
  972. // Second step is to deal with the leaf node(s). I'm going to
  973. // find and capture some occurence lists. I'll probably have to
  974. // ignore some bogus ones first.
  975. //
  976. for (;;)
  977. {
  978. // Check for out of data
  979. if (lpCurPtr >= lpMaxAddress)
  980. {
  981. // Get the offset of the next node
  982. ReadFileOffset (&pLeafInfo->nodeOffset, pLeafInfo->pBuffer);
  983. if (FoIsNil (pLeafInfo->nodeOffset))
  984. return S_OK;
  985. // Read the next node
  // NOTE(review): LoadNode calls ReadLeafNode at the equivalent point.
  986. if ((fRet = ReadStemNode ((PNODEINFO)pLeafInfo, cLevel))
  987. != S_OK)
  988. {
  989. return SetErrCode (&errb, fRet);
  990. }
  991. lpCurPtr =
  992. pLeafInfo->pBuffer + FOFFSET_SIZE + sizeof (SHORT);
  993. lpMaxAddress = pLeafInfo->pMaxAddress;
  994. }
  995. // Extract the word
  996. lpCurPtr = ExtractWord(astBTreeWord, lpCurPtr, &wLen);
  997. // Save the word length
  998. lpCurQtNode->wRealLength = wLen;
  999. if (lpqt->occf & OCCF_FIELDID)
  1000. lpCurPtr += CbByteUnpack (&dwFieldID, lpCurPtr);
  // When field ids are not being checked, the node's own id is passed instead
  // NOTE(review): four arguments here; CompareTerm in this file takes five.
  1001. nCmp = CompareTerm (lpCurQtNode, astBTreeWord, fCheckFieldId ?
  1002. dwFieldID : lpCurQtNode->dwFieldId, lpRetV->pLeadByteTable);
  1003. switch (nCmp)
  1004. {
  1005. case KEEP_SEARCHING:
  1006. // Skip TopicCount
  1007. lpCurPtr += CbByteUnpack (&dwTemp, lpCurPtr);
  1008. // Skip data offset
  1009. lpCurPtr += FOFFSET_SIZE;
  1010. // Skip DataSize
  1011. lpCurPtr += CbByteUnpack (&dwTemp, lpCurPtr);
  1012. break;
  1013. case STRING_MATCH:
  // Entry layout read here: topic count, data offset, data size
  1014. lpCurPtr += CbByteUnpack (&lpCurQtNode->cTopic, lpCurPtr);
  1015. lpCurPtr += ReadFileOffset (&lpCurQtNode->foData, lpCurPtr);
  1016. lpCurPtr += CbByteUnpack (&lpCurQtNode->cbData, lpCurPtr);
  1017. // Set FieldId to give back the field id
  1018. lpCurQtNode->dwFieldId = dwFieldID;
  1019. // Set return pointer to beginning of next node
  1020. if (lpCurQtNode->iCurOff == 0)
  1021. lpCurQtNode->iCurOff = lpCurPtr - pLeafInfo->pBuffer;
  1022. return S_OK;
  1023. case NOT_FOUND: // No unconditional "break" above.
  1024. return S_OK;
  1025. }
  1026. }
  1027. }
  1028. #endif
  1029. /*************************************************************************
  1030. * @doc INTERNAL
  1031. *
  1032. * @func HRESULT FAR PASCAL | GetWordData |
  1033. * This function will search the index for the given word' data.
  1034. * @parm _LPQT | lpqt |
  1035. * Pointer to index structure
  1036. * @parm LPRETV | lpRetV |
  1037. * Pointer to "globals"
  1038. * @parm _LPQTNODE | lpCurQtNode |
  1039. * Current node in the query tree containing important data
  1040. * - The number of topics
  1041. * - The location of the data
  1042. * - The size of the data
  1043. * - Pointer to the next word (for wildcard search)
  1044. * @rdesc S_OK or other errors
  1045. *************************************************************************/
  1046. PUBLIC HRESULT EXPORT_API FAR PASCAL GetWordData (_LPQT lpqt, LPRETV lpRetV,
  1047. int Operator, _LPQTNODE lpResQuery, _LPQTNODE lpQtNode, int fDivide, int fOutOfMemory)
  1048. {
  1049. LPIOCC lpOccur; // The current occurence is collected into
  1050. // here.
  1051. DWORD dwTopicIDDelta; // Topic-ID delta from previous sub-list.
  1052. DWORD dwOccs; // Number of occurences in this sub-list.
  1053. DWORD dwTmp; // Scratch variable.
  1054. WORD wWeight; // Term-weight associated with this sub-list.
  1055. DWORD dwTopicID; // TopicId
  1056. WORD wImportance;
  1057. DWORD dwCount; // Word count
  1058. DWORD dwOffset; // Offset of the word
  1059. DWORD dwLength; // Length of the word
  1060. TOPIC_LIST FAR *lpResTopicList; // Result TopicList
  1061. HRESULT fRet; // Returned value
  1062. PNODEINFO pDataInfo;
  1063. DWORD dwTopicCount;
  1064. _LPQT lpQueryTree; // Query tree
  1065. OCCF occf;
  1066. BYTE fSkipOccList = FALSE;
  1067. pDataInfo = &lpRetV->DataInfo;
  1068. if ((pDataInfo->dwDataSizeLeft = lpQtNode->cbData) == 0)
  1069. return(S_OK); // There is nothing to process
  1070. // Initialize variables
  1071. occf = lpqt->occf;
  1072. wImportance = QTN_TOKEN(lpQtNode)->wWeight;
  1073. lpResTopicList = NULL;
  1074. lpQueryTree = lpRetV->lpqt;
  1075. dwTopicCount = lpQtNode->cTopic;
  1076. wWeight = (WORD)(65535L/dwTopicCount);
  1077. // Reset the topic count for lpQtNode so that is will not affect the
  1078. // result in case that lpResQuery == NULL
  1079. lpQtNode->cTopic = 0;
  1080. if (lpResQuery == NULL)
  1081. lpResQuery = lpQtNode;
  1082. // Initialize the data buffer node values
  1083. pDataInfo->pBuffer = pDataInfo->pDataNode;
  1084. pDataInfo->nodeOffset = lpQtNode->foData;
  1085. // Read the data block
  1086. if ((fRet = ReadNewData(pDataInfo)) != S_OK)
  1087. return(fRet);
  1088. dwTopicID = 0L; // Init occurence record
  1089. dwLength = 0;
  1090. // One pass through here for each sublist in the Topiclist.
  1091. for (; dwTopicCount; dwTopicCount--)
  1092. {
  1093. /* Check for interrupt now and then */
  1094. if ((++lpqt->cInterruptCount) == 0)
  1095. {
  1096. if (lpqt->fInterrupt == E_INTERRUPT)
  1097. return E_INTERRUPT;
  1098. if (*lpqt->cStruct.Callback.MessageFunc &&
  1099. (fRet = (*lpqt->cStruct.Callback.MessageFunc)(
  1100. lpqt->cStruct.Callback.dwFlags,
  1101. lpqt->cStruct.Callback.pUserData, NULL)) != S_OK)
  1102. return(fRet);
  1103. }
  1104. // Byte align
  1105. if (pDataInfo->ibit != cbitBYTE - 1)
  1106. {
  1107. pDataInfo->ibit = cbitBYTE - 1;
  1108. pDataInfo->pCurPtr ++;
  1109. }
  1110. // Get value from which I will calculate current doc-ID.
  1111. if ((fRet = FGetDword(pDataInfo, lpqt->ckeyTopicId,
  1112. &dwTopicIDDelta)) != S_OK)
  1113. {
  1114. exit0:
  1115. return fRet;
  1116. }
  1117. dwTopicID += dwTopicIDDelta;
  1118. //
  1119. // Get term-weight if present. I'm going to get this
  1120. // even if I'm not doing ranking, because it's in the
  1121. // index, and I have to get around it somehow.
  1122. //
  1123. if (lpqt->idxf & IDXF_NORMALIZE)
  1124. {
  1125. if ((fRet = FGetBits(pDataInfo, &dwTmp, sizeof (USHORT) * cbitBYTE))
  1126. != S_OK)
  1127. goto exit0;
  1128. if (wImportance != MAX_WEIGHT)
  1129. dwTmp = (dwTmp * wImportance) / 65535;
  1130. wWeight = (WORD)dwTmp;
  1131. }
  1132. //
  1133. // If this search includes a group, and the doc is not in the
  1134. // group then ignore it
  1135. fSkipOccList = (lpQueryTree->lpGroup &&
  1136. FGroupLookup(lpQueryTree->lpGroup, dwTopicID) == FALSE);
  1137. // erinfox: move test agains fSkipOccList outside
  1138. if (!fSkipOccList)
  1139. {
  1140. if (/*!fSkipOccList && */((lpResTopicList = TopicNodeSearch (lpQueryTree,
  1141. lpResQuery, dwTopicID)) == NULL))
  1142. {
  1143. /* Adding an new occurrence to a non-existing TopicList. */
  1144. /* Allocate the new TopicList only if it is an OR
  1145. operator. This record should be skipped for all other
  1146. operator
  1147. */
  1148. if (Operator == OR_OP && !fOutOfMemory)
  1149. {
  1150. if ((lpResTopicList = TopicNodeAllocate(lpQueryTree)) == NULL)
  1151. {
  1152. fRet = E_TOOMANYTOPICS;
  1153. goto exit0;
  1154. }
  1155. lpResTopicList->dwTopicId = dwTopicID;
  1156. lpResTopicList->lpOccur = NULL;
  1157. lpResTopicList->lcOccur = 0;
  1158. lpResTopicList->wWeight = 0;
  1159. /* Add the new TopicID node into TopicList */
  1160. TopicNodeInsert (lpQueryTree, lpResQuery, lpResTopicList);
  1161. }
  1162. else
  1163. {
  1164. /* There is no corresponding Topic list. Consequently, we
  1165. don't need to read in the right node's data for
  1166. the following operators: AND, PHRASE, NEAR, NOT
  1167. */
  1168. fSkipOccList = TRUE;
  1169. }
  1170. }
  1171. else
  1172. {
  1173. if (Operator == NOT_OP)
  1174. {
  1175. /* Don't skip this Topic list since it also contains
  1176. * the right node's docId
  1177. */
  1178. if (lpResTopicList)
  1179. lpResTopicList->fFlag &= ~TO_BE_KEPT;
  1180. fSkipOccList = TRUE;
  1181. }
  1182. else if (Operator == AND_OP && lpQueryTree->lpTopicStartSearch)
  1183. lpQueryTree->lpTopicStartSearch->fFlag |= TO_BE_KEPT;
  1184. }
  1185. }
  1186. lpQueryTree->lpOccStartSearch = NULL;
  1187. if ((occf & (OCCF_OFFSET | OCCF_COUNT)) == 0)
  1188. continue;
  1189. // Figure out how many occurences there are in this
  1190. // sub-list.
  1191. //
  1192. if ((fRet = FGetDword(pDataInfo, lpqt->ckeyOccCount,
  1193. &dwOccs)) != S_OK)
  1194. goto exit0;
  1195. if (fSkipOccList || fOutOfMemory)
  1196. {
  1197. skip_occ_list:
  1198. if ((fRet = SkipOccList (lpqt, pDataInfo, dwOccs)) != S_OK)
  1199. goto exit0;
  1200. continue;
  1201. }
  1202. if ((lpqt->idxf & IDXF_NORMALIZE) == FALSE && lpRetV->fRank)
  1203. {
  1204. wWeight = (WORD)(wWeight * dwOccs);
  1205. }
  1206. //
  1207. // If I'm doing ranking, divide the weight for
  1208. // this topic amongst all the occurences in
  1209. // the topic if I need to.
  1210. //
  1211. if (lpRetV->fRank && fDivide)
  1212. {
  1213. if (dwOccs > 65535L)
  1214. wWeight = 0;
  1215. else if ((WORD)dwOccs > 1)
  1216. wWeight /= (WORD)dwOccs;
  1217. }
  1218. // optimization for ISBU/IR:
  1219. // if no highlighting info is needed, and this is not near-type query
  1220. // then store the term weights in the topic list directly, and skip the occurrence
  1221. // list completely. If this is an AND or OR operator, then increment the existing
  1222. // weight since the occurrences are undergoing union. NOT operator leaves
  1223. // current weight unchanged.
  1224. if (DO_FAST_MERGE(&lpRetV->SrchInfo, lpqt))
  1225. {
  1226. if (lpResTopicList && (Operator == OR_OP || Operator == AND_OP) && lpRetV->fRank)
  1227. lpResTopicList->wWeight = (WORD) min(MAX_WEIGHT, lpResTopicList->wWeight + wWeight * dwOccs);
  1228. goto skip_occ_list;
  1229. }
  1230. //
  1231. // One pass through here for each occurence in
  1232. // this sub-list. If this index doesn't really
  1233. // have sub-lists it will still make one pass
  1234. // through here anyway, at which time it will
  1235. // write the doc-ID and possibly the term-weight
  1236. // and field-ID, then drop out.
  1237. //
  1238. dwCount = 0L;
  1239. dwOffset = 0L;
  1240. for (; dwOccs; dwOccs--)
  1241. {
  1242. // interrupt about every 4096
  1243. if ((dwOccs & 0x0FFF) == 0)
  1244. {
  1245. if (lpqt->fInterrupt == E_INTERRUPT)
  1246. {
  1247. fRet = E_INTERRUPT;
  1248. goto exit;
  1249. }
  1250. if (*lpqt->cStruct.Callback.MessageFunc &&
  1251. (fRet = (*lpqt->cStruct.Callback.MessageFunc)(
  1252. lpqt->cStruct.Callback.dwFlags,
  1253. lpqt->cStruct.Callback.pUserData, NULL)) != S_OK)
  1254. goto exit;
  1255. }
  1256. // Get word-count, if present.
  1257. //
  1258. if ((lpOccur = OccNodeAllocate(lpQueryTree)) == NULL)
  1259. {
  1260. fRet = E_TOOMANYTOPICS;
  1261. goto exit;
  1262. }
  1263. lpOccur->dwFieldId = lpQtNode->dwFieldId;
  1264. lpOccur->cLength = lpQtNode->wRealLength;
  1265. // If the caller requested term strings, put in the occurrence
  1266. // record a pointer to the term that currently matches the query
  1267. // we're gathering occurrence data for.
  1268. if ((lpRetV->SrchInfo.Flag & QUERY_GETTERMS) != 0)
  1269. lpOccur->lpvTerm = lpQtNode->lpvIndexedTerm;
  1270. if (occf & OCCF_COUNT)
  1271. {
  1272. if ((fRet = FGetDword(pDataInfo, lpqt->ckeyWordCount,
  1273. &dwTmp)) != S_OK)
  1274. {
  1275. exit1:
  1276. /* Just release the occurence node */
  1277. lpOccur->pNext = (LPIOCC)lpQueryTree->lpOccFreeList;
  1278. lpQueryTree->lpOccFreeList = (LPSLINK)lpOccur;
  1279. goto exit0;
  1280. }
  1281. dwCount += dwTmp;
  1282. lpOccur->dwCount = dwCount; // Needed for phrase and near
  1283. }
  1284. // Get byte-offset, if present.
  1285. //
  1286. if (occf & OCCF_OFFSET)
  1287. {
  1288. if ((fRet = FGetDword(pDataInfo, lpqt->ckeyOffset, &dwTmp))
  1289. != S_OK)
  1290. {
  1291. goto exit1;
  1292. }
  1293. dwOffset += dwTmp;
  1294. lpOccur->dwOffset = dwOffset;
  1295. }
  1296. // Get term-weight, if present.
  1297. //
  1298. if (lpRetV->fRank)
  1299. {
  1300. if (!fDivide)
  1301. wWeight = 0;
  1302. lpOccur->wWeight = wWeight;
  1303. }
  1304. #ifndef CW
  1305. if ((fRet = (*HandlerFuncTable[Operator])(lpQueryTree,
  1306. lpQtNode, lpResTopicList, (BYTE FAR *)lpOccur,
  1307. EXPRESSION_TERM)) != S_OK)
  1308. {
  1309. goto exit;
  1310. }
  1311. #else
  1312. switch (Operator)
  1313. {
  1314. case NEAR_OP:
  1315. if ((fRet = NearHandler(lpQueryTree,
  1316. lpQtNode, lpResTopicList, (BYTE FAR *)lpOccur,
  1317. EXPRESSION_TERM)) != S_OK)
  1318. {
  1319. goto exit;
  1320. }
  1321. break;
  1322. case PHRASE_OP:
  1323. if ((fRet = PhraseHandler(lpQueryTree,
  1324. lpQtNode, lpResTopicList, (BYTE FAR *)lpOccur,
  1325. EXPRESSION_TERM)) != S_OK)
  1326. {
  1327. goto exit;
  1328. }
  1329. break;
  1330. case AND_OP:
  1331. if ((fRet = AndHandler(lpQueryTree,
  1332. lpQtNode, lpResTopicList, (BYTE FAR *)lpOccur,
  1333. EXPRESSION_TERM)) != S_OK)
  1334. {
  1335. goto exit;
  1336. }
  1337. break;
  1338. case NOT_OP:
  1339. if ((fRet = NotHandler(lpQueryTree,
  1340. lpQtNode, lpResTopicList, (BYTE FAR *)lpOccur,
  1341. EXPRESSION_TERM)) != S_OK)
  1342. {
  1343. goto exit;
  1344. }
  1345. break;
  1346. case OR_OP:
  1347. if ((fRet = OrHandler(lpQueryTree,
  1348. lpQtNode, lpResTopicList, (BYTE FAR *)lpOccur,
  1349. EXPRESSION_TERM)) != S_OK)
  1350. {
  1351. goto exit;
  1352. }
  1353. break;
  1354. default:
  1355. fRet = E_FAIL;
  1356. goto exit;
  1357. }
  1358. #endif
  1359. }
  1360. }
  1361. fRet = S_OK;
  1362. exit:
  1363. /* Check to make sure that there are occurrences associcated with the
  1364. * TopicList. The main reason for no occurrence is that the user hits
  1365. * cancel when occurrences are being read in. Cancel will cause the
  1366. * read to fail, and there is no occurrence associated with the Topic
  1367. * List, which in turn, will cause hili code to fail. So, if there is
  1368. * no occurrence, just remove the list
  1369. */
  1370. if (lpResTopicList && lpResTopicList->lcOccur == 0
  1371. &&
  1372. !DO_FAST_MERGE(&lpRetV->SrchInfo, lpqt)
  1373. &&
  1374. (lpqt->occf & (OCCF_OFFSET | OCCF_COUNT)))
  1375. RemoveNode(lpQueryTree, (LPV)lpResQuery, NULL,
  1376. (LPSLINK)lpResTopicList, TOPICLIST_NODE);
  1377. goto exit0;
  1378. }
  1379. /*************************************************************************
  1380. * @doc INTERNAL
  1381. *
  1382. * @func HRESULT PASCAL NEAR | LoadNode |
  1383. * Load all the data related to a word from the index file,
  1384. * and apply the operator to them and the resulting data
  1385. *
  1386. * @parm _LPQT | lpqt |
  1387. * Index information
  1388. *
  1389. * @parm int | Operator |
  1390. * What operator we are dealing with
  1391. *
  1392. * @parm _LPQTNODE | lpResQuery |
  1393. * Resulting query node
  1394. *
  1395. * @parm _LPQTNODE | lpCurQtNode |
  1396. * Current query node
  1397. *
  1398. * @parm LPRETV | lpRetV |
  1399. * Returned result
  1400. *
  1401. * @parm int | fDivide |
  1402. * Divide the weight between occurences
  1403. *
  1404. * @rdesc S_OK if succeeded, errors otherwise
  1405. *************************************************************************/
  1406. PRIVATE HRESULT PASCAL NEAR LoadNode (_LPQT lpqt, int Operator,
  1407. _LPQTNODE lpResQuery, _LPQTNODE lpCurQtNode, LPRETV lpRetV, int fDivide, int fOutOfMemory)
  1408. {
  1409. int cLevel;
  1410. int cMaxLevel;
  1411. int fCheckFieldId;
  1412. LST lstSearchStr;
  1413. LPB lpCurPtr;
  1414. int nCmp;
  1415. HRESULT fRet;
  1416. int f1stIsWild;
  1417. LPB lpMaxAddress;
  1418. PNODEINFO pLeafInfo = &lpRetV->LeafInfo;
  1419. DWORD dwTemp;
  1420. LPB astBTreeWord = lpRetV->pBTreeWord;
  1421. WORD wLen;
  1422. DWORD dwFieldID;
  1423. DWORD dwTotalTopic;
  1424. LPB lstModified = lpRetV->pModifiedWord;
  1425. BYTE fStemmed;
  1426. LPB pBTreeWord;
  1427. ERRB errb;
  1428. WORD cByteMatched = 0;
  1429. fStemmed = ((lpRetV->SrchInfo.Flag & STEMMED_SEARCH) != 0) &&
  1430. (PRIMARYLANGID(LANGIDFROMLCID(lpRetV->lcid)) == LANG_ENGLISH);
  1431. lstSearchStr = QTN_TOKEN(lpCurQtNode)->lpString;
  1432. f1stIsWild = (lstSearchStr[2] == WILDCARD_CHAR ||
  1433. lstSearchStr[2] == WILDCARD_STAR);
  1434. // Make sure to turn of stemming if there is any wildcard characters
  1435. for (nCmp = *((LPW)lstSearchStr) + 1; nCmp >= 2; nCmp--)
  1436. {
  1437. if (lstSearchStr[nCmp] == '*' || lstSearchStr[nCmp] == '?')
  1438. {
  1439. fStemmed = FALSE;
  1440. break;
  1441. }
  1442. }
  1443. // Turned off stemming for short words
  1444. if (*(LPW)lstSearchStr < 3)
  1445. fStemmed = FALSE;
  1446. pLeafInfo->nodeOffset = lpqt->foIdxRoot;
  1447. pLeafInfo->iLeafLevel = lpqt->cIdxLevels - 1;
  1448. pLeafInfo->dwBlockSize = lpqt->dwBlockSize;
  1449. if (fStemmed)
  1450. {
  1451. if ((fRet = ExtStemWord(lpRetV->SrchInfo.lpvIndexObjBridge,
  1452. &lpRetV->pStemmedQueryWord[0], lstSearchStr)) != S_OK)
  1453. {
  1454. return(fRet);
  1455. }
  1456. MEMCPY (lstModified, lpRetV->pStemmedQueryWord,
  1457. *(LPW)lpRetV->pStemmedQueryWord + sizeof(WORD));
  1458. pBTreeWord = lpRetV->pStemmedBTreeWord;
  1459. for (nCmp = 2; nCmp <= *(LPW)lstModified+1; nCmp++)
  1460. {
  1461. if (lstModified[nCmp] == lstSearchStr[nCmp])
  1462. cByteMatched++;
  1463. else
  1464. break;
  1465. }
  1466. }
  1467. else
  1468. {
  1469. // Restore the original word
  1470. MEMCPY (lstModified, lstSearchStr,
  1471. *((LPW)lstSearchStr) + sizeof (SHORT));
  1472. // Zero terminated for wildcard search
  1473. lstModified [*((LPW)lstModified) + 2] = 0;
  1474. pBTreeWord = lpRetV->pBTreeWord;
  1475. }
  1476. /* Change all '*' and '?' to 0. This will
  1477. * ensure that things gets compared correctly with
  1478. * the top node's entries
  1479. */
  1480. for (nCmp = *((LPW)lstModified) + 1; nCmp >= 2; nCmp--)
  1481. {
  1482. if (lpRetV->pLeadByteTable
  1483. && lpRetV->pLeadByteTable[lstModified[nCmp - 1]])
  1484. {
  1485. nCmp--;
  1486. }
  1487. else if (lstModified[nCmp] == '*' || lstModified[nCmp] == '?')
  1488. {
  1489. lstModified[nCmp] = 0;
  1490. *(LPW)lstModified = nCmp - 2;
  1491. }
  1492. }
  1493. /*
  1494. * Point node-resolution variables at the right things. This
  1495. * sets these up to read b-tree nodes. Fields not set here are
  1496. * set as appropriate elsewhere.
  1497. */
  1498. /* Set the flag */
  1499. fCheckFieldId = ((lpqt->occf & OCCF_FIELDID) &&
  1500. (lpCurQtNode->dwFieldId != DW_NIL_FIELD));
  1501. astBTreeWord[0] = 0;
  1502. cMaxLevel = lpqt->cIdxLevels - 1;
  1503. /*
  1504. First we have to find which tree level the word is in. The number of
  1505. searches is equal to the number of tree levels at most. The
  1506. structure of the directory node is a sequence of:
  1507. - Words: PASCAL strings
  1508. - Data offset: will tell us where is the
  1509. offset of the record in the index file
  1510. */
  1511. for (cLevel = 0; cLevel < cMaxLevel ; cLevel++)
  1512. {
  1513. //
  1514. // Get a node.
  1515. //
  1516. if ((fRet = ReadStemNode ((PNODEINFO)pLeafInfo, cLevel)) != S_OK)
  1517. {
  1518. return SetErrCode (&errb, fRet);
  1519. }
  1520. lpMaxAddress = pLeafInfo->pMaxAddress;
  1521. lpCurPtr = pLeafInfo->pCurPtr;
  1522. //
  1523. // Loop through it. This compares the word I'm
  1524. // looking for against the word in the b-tree.
  1525. // If the word in the b-tree is >= the word I'm
  1526. // looking for, I'm done.
  1527. //
  1528. // If I run off the end of the node, there can be
  1529. // no match for this term, so I skip the entire
  1530. // process.
  1531. //
  1532. for (;;)
  1533. {
  1534. if (lpCurPtr >= lpMaxAddress)
  1535. return S_OK;
  1536. lpCurPtr = ExtractWord(astBTreeWord, lpCurPtr, &wLen);
  1537. if (fStemmed)
  1538. {
  1539. if ((fRet = ExtStemWord(lpRetV->SrchInfo.lpvIndexObjBridge,
  1540. pBTreeWord, astBTreeWord)) != S_OK)
  1541. return(fRet);
  1542. }
  1543. /* Read in NodeId record */
  1544. lpCurPtr += ReadFileOffset (&pLeafInfo->nodeOffset, lpCurPtr);
  1545. if (f1stIsWild)
  1546. break;
  1547. if (StrCmpPascal2(lstModified, pBTreeWord) <= 0)
  1548. break;
  1549. // erinfox:
  1550. // if stemming is turned on, there could be a case in which the stemmed
  1551. // word is less than the search term, but the unstemmed word is greater.
  1552. // if we don't check the unstemmed, we'll skip this node erroneously.
  1553. if (fStemmed && StrCmpPascal2(lstModified, astBTreeWord) <= 0)
  1554. break;
  1555. }
  1556. }
  1557. /* At this point, pLeafInfo->nodeOffset is the node id of the leaf that
  1558. is supposed to contain the searched word. Read in the leaf node
  1559. */
  1560. if ((fRet = ReadLeafNode ((PNODEINFO)pLeafInfo, cLevel)) != S_OK)
  1561. {
  1562. return fRet;
  1563. }
  1564. lpCurPtr = pLeafInfo->pCurPtr;
  1565. lpMaxAddress = pLeafInfo->pMaxAddress;
  1566. dwTotalTopic = 0;
  1567. //
  1568. // Second step is to deal with the leaf node(s). I'm going to
  1569. // find and capture some occurence lists. I'll probably have to
  1570. // ignore some bogus ones first.
  1571. //
  1572. // Reset the word
  1573. if (fStemmed)
  1574. {
  1575. MEMCPY (lstModified, lpRetV->pStemmedQueryWord,
  1576. *(LPW)lpRetV->pStemmedQueryWord + sizeof(WORD));
  1577. }
  1578. else
  1579. {
  1580. MEMCPY (lstModified, lstSearchStr,
  1581. *((LPW)lstSearchStr) + sizeof (SHORT));
  1582. }
  1583. for (;;)
  1584. {
  1585. // Check for out of data
  1586. if (lpCurPtr >= lpMaxAddress)
  1587. {
  1588. // Get the offset of the next node
  1589. ReadFileOffset (&pLeafInfo->nodeOffset, pLeafInfo->pBuffer);
  1590. if (FoIsNil (pLeafInfo->nodeOffset))
  1591. {
  1592. lpCurQtNode->cTopic = dwTotalTopic;
  1593. return S_OK;
  1594. }
  1595. // Read the next node
  1596. if ((fRet = ReadLeafNode ((PNODEINFO)pLeafInfo, cLevel))
  1597. != S_OK)
  1598. {
  1599. return SetErrCode (&errb, fRet);
  1600. }
  1601. lpCurPtr =
  1602. pLeafInfo->pBuffer + FOFFSET_SIZE + sizeof (SHORT);
  1603. lpMaxAddress = pLeafInfo->pMaxAddress;
  1604. }
  1605. /* Check for interrupt now and then */
  1606. if ((++lpqt->cInterruptCount) == 0)
  1607. {
  1608. if (lpqt->fInterrupt == E_INTERRUPT)
  1609. return E_INTERRUPT;
  1610. if (*lpqt->cStruct.Callback.MessageFunc &&
  1611. (fRet = (*lpqt->cStruct.Callback.MessageFunc)(
  1612. lpqt->cStruct.Callback.dwFlags,
  1613. lpqt->cStruct.Callback.pUserData, NULL)) != S_OK)
  1614. return(fRet);
  1615. }
  1616. // Extract the word
  1617. lpCurPtr = ExtractWord(astBTreeWord, lpCurPtr, &wLen);
  1618. if (fStemmed)
  1619. {
  1620. if ((fRet = ExtStemWord(lpRetV->SrchInfo.lpvIndexObjBridge,
  1621. pBTreeWord, astBTreeWord)) != S_OK)
  1622. return(fRet);
  1623. }
  1624. // Save the word length
  1625. lpCurQtNode->wRealLength = wLen;
  1626. if (lpqt->occf & OCCF_FIELDID)
  1627. lpCurPtr += CbByteUnpack (&dwFieldID, lpCurPtr);
  1628. nCmp = CompareTerm (lpCurQtNode, lstModified, pBTreeWord, fCheckFieldId ?
  1629. dwFieldID : lpCurQtNode->dwFieldId, lpRetV->pLeadByteTable);
  1630. switch (nCmp)
  1631. {
  1632. case KEEP_SEARCHING:
  1633. // Skip TopicCount
  1634. lpCurPtr += CbByteUnpack (&dwTemp, lpCurPtr);
  1635. // Skip data offset
  1636. lpCurPtr += FOFFSET_SIZE;
  1637. // Skip DataSize
  1638. lpCurPtr += CbByteUnpack (&dwTemp, lpCurPtr);
  1639. break;
  1640. case STRING_MATCH:
  1641. lpCurPtr += CbByteUnpack (&lpCurQtNode->cTopic, lpCurPtr);
  1642. lpCurPtr += ReadFileOffset (&lpCurQtNode->foData, lpCurPtr);
  1643. lpCurPtr += CbByteUnpack (&lpCurQtNode->cbData, lpCurPtr);
  1644. // Check for Topic count. This can be 0 if the word has been deleted
  1645. // from the index
  1646. if (lpCurQtNode->cTopic == 0)
  1647. break;
  1648. if (lpRetV->SrchInfo.Flag & LARGEQUERY_SEARCH)
  1649. {
  1650. // long search optimization: clip noise words.
  1651. // Johnms- eliminate frequent words.
  1652. // typically, you eliminate if in more than 1/7 of documents.
  1653. if (lpRetV->SrchInfo.dwValue < lpCurQtNode->cTopic)
  1654. break;
  1655. }
  1656. // Add the raw (i.e. unstemmed) term from the index that currently
  1657. // matches the query term for this node to the query result term
  1658. // dictionary, and pass a pointer to the term in the dictionary
  1659. // to GetWordData so that it can add it to the occurrence records.
  1660. if ((lpRetV->SrchInfo.Flag & QUERY_GETTERMS) != 0 &&
  1661. (fRet = ExtAddQueryResultTerm(
  1662. lpRetV->SrchInfo.lpvIndexObjBridge,
  1663. astBTreeWord,
  1664. &lpCurQtNode->lpvIndexedTerm)) != S_OK)
  1665. {
  1666. return (fRet);
  1667. }
  1668. // Save the info
  1669. pLeafInfo->pCurPtr = lpCurPtr;
  1670. if ((fRet = GetWordData (lpqt, lpRetV,
  1671. Operator, lpResQuery, lpCurQtNode, fDivide,
  1672. fOutOfMemory)) != S_OK)
  1673. {
  1674. // kevynct: no need to overwrite count on error since
  1675. // we may be attempting to continue
  1676. lpCurQtNode->cTopic += dwTotalTopic;
  1677. return(fRet);
  1678. }
  1679. // Accumulate the topic count, since cTopic will be destroyed
  1680. // if there is more searches for this node (such as wildcard)
  1681. dwTotalTopic += lpCurQtNode->cTopic;
  1682. break;
  1683. case NOT_FOUND: // No unconditional "break" above.
  1684. if (fStemmed && (strncmp (lstSearchStr+ 2, pBTreeWord + 2,
  1685. cByteMatched) == 0))
  1686. {
  1687. // Continue searching in case stemming is messed up
  1688. // by non-alphabetic word, such as the sequence:
  1689. // subtopic subtopic2 subtopics
  1690. lpCurPtr += CbByteUnpack (&dwTemp, lpCurPtr);
  1691. // Skip data offset
  1692. lpCurPtr += FOFFSET_SIZE;
  1693. // Skip DataSize
  1694. lpCurPtr += CbByteUnpack (&dwTemp, lpCurPtr);
  1695. break;
  1696. }
  1697. lpCurQtNode->cTopic = dwTotalTopic;
  1698. return S_OK;
  1699. }
  1700. }
  1701. }
  1702. /*************************************************************************
  1703. * @doc INTERNAL
  1704. *
  1705. * @func int PASCAL NEAR | CompareTerm |
  1706. * This function compares two Pascal strings
  1707. *
  1708. * @parm _LPQTNODE FAR* | lpQtNode |
  1709. * Query tree node
  1710. *
  1711. * @parm LST | lstSrchStr |
  1712. * String to be searched
  1713. *
  1714. * @parm LST | lstBtreeWord |
  1715. * The word from the b-tree.
  1716. *
  1717. * @parm DWORD | dwBtreeFieldId |
  1718. * The field-ID from the index b-tree. if it is DW_NIL_FIELD,
  1719. * then there is no need to check
  1720. *
  1721. * @parm DWORD | dwLanguage |
  1722. * The language of the index that we are searching
  1723. *
  1724. * @rdesc
  1725. * The returned values are:
  1726. * @flag NOT_FOUND |
  1727. * The words do not match, and we have passed the interested point
  1728. * @flag KEEP_SEARCHING |
  1729. * The words do not match, but we should continue the search for
  1730. * the match may be ahead
  1731. * @flag STRING_MATCH |
  1732. * The words match
  1733. *************************************************************************/
  1734. #ifndef SIMILARITY
  1735. PUBLIC int PASCAL FAR CompareTerm(_LPQTNODE lpQtNode,LST lstTermWord,
  1736. LST lstBtreeWord, DWORD dwBtreeFieldId, BYTE prgbLeadByteTable[])
  1737. #else
  1738. PRIVATE int PASCAL NEAR CompareTerm(_LPQTNODE lpQtNode, LST lstTermWord,
  1739. LST lstBtreeWord, DWORD dwBtreeFieldId, BYTE prgbLeadByteTable[])
  1740. #endif
  1741. {
  1742. int nCmp; // The result of compare
  1743. BYTE FAR *lstTermHiWord;// Pointer to the hi term string
  1744. DWORD dwTermFieldId;
  1745. /* Get the variables */
  1746. dwTermFieldId = lpQtNode->dwFieldId;
  1747. switch (QTN_FLAG(lpQtNode))
  1748. {
  1749. case EXACT_MATCH:
  1750. /*
  1751. * This is very straight, it just compares the two words.
  1752. */
  1753. if ((nCmp = StrCmpPascal2(lstTermWord, lstBtreeWord)) < 0)
  1754. {
  1755. /* lstTermWord > lstBtreeWord */
  1756. return NOT_FOUND;
  1757. }
  1758. if (nCmp)
  1759. return KEEP_SEARCHING;
  1760. if (dwBtreeFieldId < dwTermFieldId)
  1761. return KEEP_SEARCHING;
  1762. if (dwBtreeFieldId == dwTermFieldId)
  1763. return STRING_MATCH;
  1764. if (dwBtreeFieldId > dwTermFieldId)
  1765. return NOT_FOUND;
  1766. break;
  1767. case TERM_RANGE_MATCH:
  1768. /*
  1769. * This makes sure that the b-tree word is between the
  1770. * two term words provided, and that the field-ID's
  1771. * match up.
  1772. */
  1773. lstTermHiWord = lpQtNode->lpHiString;
  1774. if ((nCmp = StrCmpPascal2(lstTermWord, lstBtreeWord)) > 0)
  1775. {
  1776. /* lstTermWord < lstBtreeWord */
  1777. return KEEP_SEARCHING;
  1778. }
  1779. if ((nCmp = StrCmpPascal2(lstTermHiWord, lstBtreeWord)) < 0)
  1780. {
  1781. /* lstTermHiWord > lstBtreeWord */
  1782. return NOT_FOUND;
  1783. }
  1784. if (dwTermFieldId != dwBtreeFieldId)
  1785. return KEEP_SEARCHING;
  1786. break;
  1787. case WILDCARD_MATCH:
  1788. /* Zero-terminated lstBtreeWord */
  1789. lstBtreeWord[*((LPW)lstBtreeWord) + sizeof (SHORT)] = 0;
  1790. if ((nCmp = WildCardCompare
  1791. (lstTermWord, lstBtreeWord, prgbLeadByteTable)) != STRING_MATCH)
  1792. return nCmp;
  1793. if (dwTermFieldId != dwBtreeFieldId)
  1794. return KEEP_SEARCHING;
  1795. break;
  1796. }
  1797. return STRING_MATCH;
  1798. }
  1799. /*************************************************************************
  1800. * @doc INTERNAL
  1801. *
  1802. * @func HRESULT PASCAL NEAR | SkipOccList |
  1803. * This function will skip on occurence list in the index.
  1804. * @parm _LPQT | lpqt |
  1805. * Pointer to Index information.
  1806. * @parm PNODEINFO | pNodeInfo |
  1807. * Current leaf info.
  1808. * @parm DWORD | dwOccs |
  1809. * Number of occurrences to be skipped
  1810. * @rdesc S_OK if successfully skip the occurence list
  1811. *************************************************************************/
  1812. #ifndef SIMILARITY
  1813. PUBLIC HRESULT PASCAL FAR SkipOccList(_LPQT lpqt, PNODEINFO pNodeInfo, DWORD dwOccs)
  1814. #else
  1815. PRIVATE HRESULT PASCAL NEAR SkipOccList(_LPQT lpqt, PNODEINFO pNodeInfo, DWORD dwOccs)
  1816. #endif
  1817. {
  1818. DWORD dwTmp; // Trash variable.
  1819. HRESULT fRet; // Returned value
  1820. //
  1821. // One pass through here for each occurence in the
  1822. // current sub-list.
  1823. //
  1824. for (; dwOccs; dwOccs--)
  1825. {
  1826. //
  1827. // Keeping word-counts? If so, get it.
  1828. //
  1829. if (lpqt->occf & OCCF_COUNT)
  1830. {
  1831. if ((fRet = FGetDword(pNodeInfo, lpqt->ckeyWordCount,
  1832. &dwTmp)) != S_OK)
  1833. {
  1834. return fRet;
  1835. }
  1836. }
  1837. //
  1838. // Keeping byte-offsets? If so, get it.
  1839. //
  1840. if (lpqt->occf & OCCF_OFFSET)
  1841. {
  1842. if ((fRet = FGetDword(pNodeInfo, lpqt->ckeyOffset,
  1843. &dwTmp)) != S_OK)
  1844. {
  1845. return fRet;
  1846. }
  1847. }
  1848. }
  1849. return S_OK;
  1850. }
  1851. /*************************************************************************
  1852. * @doc INTERNAL
  1853. *
  1854. * @func BOOL FAR PASCAL | FGroupLookup |
  1855. * Given a item number, this function will check to see if the item
  1856. * belongs to a group or not.
  1857. *
  1858. * @parm LPGROUP | lpGroup |
  1859. * Pointer to the group to be checked
  1860. *
  1861. * @parm DWORD | dwTopicId |
  1862. * Item number to be checked
  1863. *
  1864. * @rdesc The function will return 0 if the item is not in the group,
  1865. * non-zero otherwise
  1866. *************************************************************************/
  1867. BOOL NEAR PASCAL FGroupLookup(_LPGROUP lpGroup, DWORD dwTopicId)
  1868. {
  1869. /* Check for empty group */
  1870. if (lpGroup->lcItem == 0)
  1871. return 0;
  1872. if (dwTopicId < lpGroup->minItem || dwTopicId > lpGroup->maxItem)
  1873. return 0;
  1874. #if 0
  1875. // Currently the group always starts at 0., so there is no need
  1876. // to recalculate dwTopicId as below
  1877. dwTopicId -= (lpGroup->minItem / 8) * 8;
  1878. #endif
  1879. return (lpGroup->lpbGrpBitVect[(DWORD)(dwTopicId / 8)] &
  1880. (1 << (dwTopicId % 8)));
  1881. }
  1882. PRIVATE int PASCAL NEAR WildCardCompare
  1883. (LPB pWildString, LPB pString, BYTE prgbLeadByteTable[])
  1884. {
  1885. LPB pBack;
  1886. unsigned int wMinLength = 0;
  1887. int f1stIsWild;
  1888. int fRet = KEEP_SEARCHING;
  1889. int fGotWild = FALSE;
  1890. pWildString += sizeof (SHORT); /* Skip the length */
  1891. f1stIsWild = (*pWildString == WILDCARD_CHAR ||
  1892. *pWildString == WILDCARD_STAR);
  1893. /* Calculate the minimum length of the string */
  1894. // pback is used as temp here
  1895. for (pBack = pWildString; *pBack; pBack++)
  1896. {
  1897. if (prgbLeadByteTable && prgbLeadByteTable[*pBack])
  1898. {
  1899. wMinLength += 2;
  1900. pBack++;
  1901. }
  1902. else if (*pBack != '*')
  1903. wMinLength ++;
  1904. }
  1905. if (wMinLength > *((LPW)pString))
  1906. {
  1907. if (f1stIsWild)
  1908. return KEEP_SEARCHING;
  1909. }
  1910. pString += sizeof (SHORT); /* Skip the length */
  1911. pBack = NULL; /* Reset pBack */
  1912. for (;;)
  1913. {
  1914. switch (*pWildString)
  1915. {
  1916. case '?':
  1917. if (*pString == 0)
  1918. return fRet;
  1919. pWildString++;
  1920. pString = NextChar (pString, prgbLeadByteTable);
  1921. fGotWild = TRUE;
  1922. break;
  1923. case '*':
  1924. fGotWild = TRUE;
  1925. /* Optimization: *???? == * */
  1926. for (; *pWildString; pWildString++)
  1927. {
  1928. switch (*pWildString)
  1929. {
  1930. case '*':
  1931. pBack = pWildString;
  1932. case '?':
  1933. continue;
  1934. }
  1935. break;
  1936. }
  1937. if (*pWildString == 0)
  1938. {
  1939. /* Terminated by '*'. Match all */
  1940. return STRING_MATCH;
  1941. }
  1942. /* Skip the chars until we get a 1st match */
  1943. while (*pString)
  1944. {
  1945. if (!CompareChar (pString, pWildString, prgbLeadByteTable))
  1946. break;
  1947. pString = NextChar (pString, prgbLeadByteTable);
  1948. }
  1949. // This is inteded to fall through to continue processing
  1950. default:
  1951. if (!CompareChar (pString, pWildString, prgbLeadByteTable))
  1952. {
  1953. if (*pString == 0) /* We finish both strings */
  1954. return STRING_MATCH;
  1955. pString = NextChar (pString, prgbLeadByteTable);
  1956. pWildString = NextChar (pWildString, prgbLeadByteTable);
  1957. break;
  1958. }
  1959. else if (f1stIsWild || // *pWildString == 0 ||
  1960. // *pString < *pWildString
  1961. CompareChar (pString, pWildString, prgbLeadByteTable) < 0)
  1962. {
  1963. fRet = KEEP_SEARCHING;
  1964. }
  1965. else if (fGotWild == FALSE)
  1966. fRet = NOT_FOUND;
  1967. /* The chars do not match. Check to see for back up */
  1968. if (!pBack || *pString == 0)
  1969. {
  1970. return fRet;
  1971. }
  1972. /* Back up the string */
  1973. pWildString = pBack;
  1974. break;
  1975. }
  1976. }
  1977. }
  1978. /*************************************************************************
  1979. * @doc INTERNAL
  1980. *
  1981. * @func HRESULT FAR PASCAL | TopNodeRead |
  1982. * Makes sure the index b-tree top node is in memory. Reads it if
  1983. * necessary. The index file must be open and the index header must
  1984. * be in memory or this call will break.
  1985. *
  1986. * @parm _LPQT | lpidx |
  1987. * Index information.
  1988. *
  1989. * @rdesc S_OK, if succeeded, otherwise error values
  1990. *************************************************************************/
  1991. PUBLIC HRESULT PASCAL FAR TopNodeRead( _LPIDX lpidx)
  1992. {
  1993. DWORD dwBlockSize = lpidx->ih.dwBlockSize;
  1994. if (lpidx->hTopNode != NULL)
  1995. return S_OK;
  1996. if ((lpidx->hTopNode = _GLOBALALLOC(GMEM_MOVEABLE, dwBlockSize)) == NULL)
  1997. {
  1998. return E_OUTOFMEMORY;
  1999. }
  2000. lpidx->lrgbTopNode = (LRGB)_GLOBALLOCK(lpidx->hTopNode);
  2001. if (FileSeekRead
  2002. (lpidx->hfpbIdxSubFile, lpidx->lrgbTopNode, lpidx->ih.foIdxRoot,
  2003. dwBlockSize, lpidx->lperrb) != (long)dwBlockSize)
  2004. {
  2005. TopNodePurge(lpidx);
  2006. return E_ASSERT;
  2007. }
  2008. return S_OK;
  2009. }
  2010. /*************************************************************************
  2011. * @doc INTERNAL
  2012. *
  2013. * @func void PASCAL FAR | TopNodePurge |
  2014. * Get rid of the index b-tree top node if it's in memory.
  2015. *
  2016. * @parm _LPIDX | lpidx |
  2017. * Pointer to index structure
  2018. *************************************************************************/
  2019. PUBLIC void FAR PASCAL TopNodePurge(_LPIDX lpidx)
  2020. {
  2021. if (lpidx->hTopNode == NULL) // Already gone.
  2022. return;
  2023. FreeHandle(lpidx->hTopNode);
  2024. lpidx->hTopNode = NULL;
  2025. }
  2026. /*************************************************************************
  2027. * @doc INTERNAL
  2028. *
  2029. * @func void FAR PASCAL | IndexCloseFile |
  2030. * Close the index file. Error not checked since it is opened
  2031. * for read only
  2032. *
  2033. * @parm _LPIDX | lpidx |
  2034. * Pointer to index structure
  2035. *************************************************************************/
  2036. PUBLIC void PASCAL FAR IndexCloseFile(_LPIDX lpidx)
  2037. {
  2038. if (lpidx->hfpbIdxSubFile != NULL)
  2039. {
  2040. FileClose(lpidx->hfpbIdxSubFile);
  2041. lpidx->hfpbIdxSubFile = NULL;
  2042. }
  2043. }
  2044. /*************************************************************************
  2045. * @doc INTERNAL
  2046. *
  2047. * @func LPB FAR PASCAL | NextChar |
  2048. * Get the next character in a string based on a DBCS lead-byte table
  2049. *
  2050. * @parm LPB | pStr |
  2051. * Pointer to character in a string to skip
  2052. *
  2053. * @parm BYTE * | prgbLeadByteTable |
  2054. * Array of DBCS lead bytes (assumed to have 256 elements)
  2055. * Each element should be set to 1 or 0 to indeicate if that index
  2056. * is considered a lead-byte.
  2057. *
  2058. * @rdesc Returns a pointer to the next character in pStr
  2059. *************************************************************************/
  2060. LPB FAR PASCAL NextChar (LPB pStr, BYTE prgbLeadByteTable[])
  2061. {
  2062. if (!prgbLeadByteTable)
  2063. return (pStr + 1);
  2064. if (prgbLeadByteTable[*pStr])
  2065. return (pStr + 2);
  2066. return (pStr + 1);
  2067. }
  2068. /*************************************************************************
  2069. * @doc INTERNAL
  2070. *
  2071. * @func BOOL FAR PASCAL | CompareChar |
  2072. * Compares the first character in pStr1 to the first
  2073. * character in pStr2, using the supplied DBCS lead-byte table.
  2074. *
  2075. * @parm LPB | pStr1 |
  2076. * Pointer to character in a string to compare
  2077. *
  2078. * @parm LPB | pStr2 |
  2079. * Pointer to character in a string to compare
  2080. *
  2081. * @parm BYTE * | prgbLeadByteTable |
  2082. * Array of DBCS lead bytes (assumed to have 256 elements).
  2083. * Each element should be set to 1 or 0 to indeicate if that index
  2084. * is considered a lead-byte.
  2085. *
  2086. * @rdesc The difference between the first bytes of pStr1 and pStr2.
  2087. * If the first bytes are equal and are lead bytes then the
  2088. * difference between the second bytes is returned.
  2089. *************************************************************************/
  2090. __inline BOOL FAR PASCAL CompareChar
  2091. (LPB pStr1, LPB pStr2, BYTE prgbLeadByteTable[])
  2092. {
  2093. // Get rid of obvious mismatches
  2094. if (*pStr1 != *pStr2)
  2095. return (*pStr1 - *pStr2);
  2096. // We now know the first bytes are equal.
  2097. // If there is no lead byte table we have a match
  2098. if (!prgbLeadByteTable)
  2099. return (0);
  2100. // If lead bytes, check the trail bytes
  2101. if (prgbLeadByteTable[*pStr1])
  2102. return (*(pStr1 + 1) - *(pStr2 + 1));
  2103. // Not lead bytes then they must be equal
  2104. return (0);
  2105. }