Source code of Windows XP (NT5)
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1194 lines
36 KiB

  1. /************************************************************************y
  2. * *
  3. * QTPARSE.C *
  4. * *
  5. * Copyright (C) Microsoft Corporation 1990-1994 *
  6. * All Rights reserved. *
  7. * *
  8. **************************************************************************
  9. * *
  10. * Module Intent *
  11. * This module contains the functions needed to build a binary *
  12. * query tree used in search. It is specific to parsing only. All *
  13. * functions' purpose is to build and optimize the query tree before *
  14. * it is used for search. *
  15. * *
  16. **************************************************************************
  17. * *
  18. * Current Owner: BinhN *
  19. * *
  20. *************************************************************************/
  21. #include <mvopsys.h>
  22. #include <mem.h>
  23. #include <memory.h>
  24. #ifdef DOS_ONLY
  25. #include <stdio.h>
  26. #include <assert.h>
  27. #endif
  28. #include <mvsearch.h>
  29. #include "common.h"
  30. #include "search.h"
  31. int Debug = 1;
  32. #ifdef _DEBUG
  33. static BYTE NEAR s_aszModule[] = __FILE__; /* Used by error return functions.*/
  34. #endif
  35. /*************************************************************************
  36. * EXTERNAL VARIABLES
  37. * All those variables must be read only
  38. *************************************************************************/
  39. extern FNHANDLER HandlerFuncTable[];// Pointer to operator handlers
  40. extern WORD OperatorAttributeTable[];
  41. extern OPSYM OperatorSymbolTable[];
  42. /*************************************************************************
  43. *
  44. * INTERNAL GLOBAL FUNCTIONS
  45. *
  46. *************************************************************************/
  47. PUBLIC LPQT PASCAL NEAR QueryTreeAlloc(void);
  48. PUBLIC HRESULT PASCAL NEAR QueryTreeAddToken (_LPQT, int, LST, DWORD, BOOL);
  49. PUBLIC LPQT PASCAL NEAR QueryTreeBuild (LPQI);
  50. #if defined(_DEBUG) && DOS_ONLY
  51. PUBLIC HRESULT PASCAL NEAR PrintTree (_LPQTNODE ,
  52. HRESULT (PASCAL NEAR *)(BYTE FAR *));
  53. PUBLIC VOID PASCAL NEAR PrintStr (char FAR *);
  54. PUBLIC HRESULT PASCAL FAR PrintList(LPQT);
  55. #endif // DOS_ONLY && _DEBUG
  56. /*************************************************************************
  57. *
  58. * INTERNAL PRIVATE FUNCTIONS
  59. * All of them should be declared near
  60. *************************************************************************/
  61. #ifndef SIMILARITY
  62. PUBLIC STRING_TOKEN FAR *PASCAL AllocWord(_LPQT lpQueryTree, LST lstWord);
  63. #else
  64. PRIVATE STRING_TOKEN FAR *PASCAL NEAR AllocWord(_LPQT lpQueryTree, LST lstWord);
  65. #endif
  66. PRIVATE HRESULT PASCAL NEAR CheckTree (_LPQTNODE, PHRESULT);
  67. PRIVATE HRESULT PASCAL NEAR DoNullTermOpt (_LPQTNODE );
  68. PRIVATE HRESULT NEAR DoTermTermOpt (_LPQTNODE);
  69. PRIVATE HRESULT NEAR PASCAL DoAssociativeOpt (_LPQTNODE );
  70. PRIVATE HRESULT PASCAL NEAR TreeBuild(_LPQT);
  71. PRIVATE HRESULT PASCAL NEAR QueryTreeOptim (_LPQTNODE );
  72. PRIVATE HRESULT PASCAL NEAR PrintQueryNode (BYTE FAR *);
  73. PRIVATE HRESULT PASCAL NEAR PrintTopicListNode (LPQT, BYTE FAR *);
  74. PRIVATE HRESULT PASCAL NEAR PrintOccurNode(LPQT, OCCURENCE FAR *);
  75. PRIVATE VOID PASCAL NEAR PrintOperator (int);
  76. PRIVATE HRESULT PASCAL NEAR RemoveRedundancy (_LPQTNODE, int);
  77. PRIVATE VOID PASCAL NEAR GetTreeDepth (_LPQTNODE lpQtNode, int FAR * MaxLevel,
  78. int Level);
  79. /*************************************************************************
  80. * @doc INTERNAL
  81. *
  82. * @func LPQT PASCAL FAR | QueryTreeAlloc |
  83. * The function allocates a query tree structure and initializes
  84. * different variables needed to build the query tree.
  85. *
  86. * @rdesc Pointer to query tree structure if succeeded, NULL if
  87. * out-of-memory
  88. *************************************************************************/
  89. PUBLIC LPQT PASCAL NEAR QueryTreeAlloc()
  90. {
  91. _LPQT lpQueryTree;
  92. /* Allocate query buffer */
  93. if ((lpQueryTree =
  94. (_LPQT)GLOBALLOCKEDSTRUCTMEMALLOC(sizeof(QTREE))) == NULL)
  95. return NULL;
  96. /* Allocate room for string block */
  97. if ((lpQueryTree->lpStringBlock = BlockInitiate (STRING_BLOCK_SIZE,
  98. 0, 0, 0)) == NULL)
  99. {
  100. exit0:
  101. GlobalLockedStructMemFree((LPV)lpQueryTree);
  102. return NULL;
  103. }
  104. /* Allocate room for query tree nodes */
  105. if ((lpQueryTree->lpNodeBlock = BlockInitiate (QUERY_BLOCK_SIZE,
  106. sizeof(QTNODE), 0, 0)) == NULL)
  107. {
  108. exit1:
  109. BlockFree((LPV)lpQueryTree->lpStringBlock);
  110. goto exit0;
  111. }
  112. /* Allocate room for Topic nodes */
  113. if ((lpQueryTree->lpTopicMemBlock = BlockInitiate(sizeof(TOPIC_LIST)*
  114. cTOPIC_PER_BLOCK, sizeof(TOPIC_LIST), 0, 1)) == NULL)
  115. {
  116. exit2:
  117. BlockFree((LPV)lpQueryTree->lpNodeBlock);
  118. goto exit1;
  119. }
  120. /* Allocate room for Occurrence nodes */
  121. if ((lpQueryTree->lpOccMemBlock = BlockInitiate(sizeof(OCCURENCE)*
  122. cOCC_PER_BLOCK, sizeof(OCCURENCE), 0, 1)) == NULL)
  123. {
  124. #ifndef SIMILARITY
  125. exit3:
  126. #endif
  127. BlockFree((LPV)lpQueryTree->lpTopicMemBlock);
  128. goto exit2;
  129. }
  130. #ifndef SIMILARITY
  131. /* Allocate room for word info block */
  132. if ((lpQueryTree->lpWordInfoBlock = BlockInitiate (sizeof(WORDINFO)
  133. * cWordsPerToken, sizeof(WORDINFO), 0, 0)) == NULL)
  134. {
  135. BlockFree((LPV)lpQueryTree->lpOccMemBlock);
  136. goto exit3;
  137. }
  138. #endif
  139. /* Initialize various fields */
  140. lpQueryTree->cQuery = 0;
  141. lpQueryTree->lpOccFreeList =
  142. (LPSLINK)BlockGetLinkedList(lpQueryTree->lpOccMemBlock);
  143. lpQueryTree->lpTopicFreeList =
  144. (LPSLINK)BlockGetLinkedList(lpQueryTree->lpTopicMemBlock);
  145. lpQueryTree->lpStrList = NULL;
  146. return (LPQT)lpQueryTree;
  147. }
  148. /*************************************************************************
  149. * @doc INTERNAL
  150. *
  151. * @func HRESULT FAR PASCAL | QueryTreeAddToken |
  152. * This will create a query node from the information, and add it to
  153. * the query tree. The input data must be in postfix format
  154. *
  155. * @parm _LPQT | lpQueryTree |
  156. * Pointer to QueryTree
  157. *
  158. * @parm int | TokenType |
  159. * either TERM_TOKEN, or operator value (OR_OP, AND_OP, etc)
  160. *
  161. * @parm LST | lpWord |
  162. * pointer to Pascal string (for TERM_TOKEN)
  163. *
  164. * @parm DWORD | dwOffset |
  165. * Offset from the beginning of the query
  166. *
  167. * @parm BOOL | fWildChar |
  168. * TRUE if the string is terminated with '*'
  169. *
  170. *
  171. * @rdesc S_OK if succeeded, else error codes
  172. *************************************************************************/
  173. PUBLIC HRESULT PASCAL NEAR QueryTreeAddToken (_LPQT lpQueryTree,
  174. int TokenType, LST pWord, DWORD dwOffset, BOOL fWildChar)
  175. {
  176. _LPQTNODE lpQtNode;
  177. if (++lpQueryTree->cQuery == MAX_QUERY_NODE)
  178. return E_TREETOOBIG;
  179. /* Allocate the node */
  180. if ((lpQtNode = BlockGetElement(lpQueryTree->lpNodeBlock)) == NULL)
  181. return E_OUTOFMEMORY;
  182. // For future uses
  183. lpQtNode->dwMaxTopicId = 0;
  184. lpQtNode->dwMinTopicId = (DWORD)-1;
  185. if ((QTN_OPVAL(lpQtNode) = (WORD)TokenType) == TERM_TOKEN)
  186. {
  187. if ((QTN_TOKEN(lpQtNode) = AllocWord(lpQueryTree, pWord)) == NULL)
  188. return E_OUTOFMEMORY;
  189. QTN_NODETYPE(lpQtNode) = TERM_NODE;
  190. QTN_OPVAL(lpQtNode) = OR_OP; // For OrHandler()
  191. QTN_FIELDID(lpQtNode) = lpQueryTree->dwFieldId;
  192. QTN_GROUP(lpQtNode) = lpQueryTree->lpGroup;
  193. QTN_DTYPE(lpQtNode) = lpQueryTree->wBrkDtype;
  194. }
  195. else if ((QTN_OPVAL(lpQtNode) = (WORD)TokenType) == STOP_OP)
  196. {
  197. // erinfox: mark STOPWORD as a STOP_NODE
  198. QTN_NODETYPE(lpQtNode) = STOP_NODE;
  199. }
  200. else
  201. {
  202. QTN_NODETYPE(lpQtNode) = OPERATOR_NODE;
  203. QTN_PARMS(lpQtNode) = (LPV)pWord;
  204. }
  205. QTN_OFFSET(lpQtNode) = (WORD)dwOffset;
  206. QTN_FLAG(lpQtNode)= fWildChar ? WILDCARD_MATCH: EXACT_MATCH;
  207. /* Link the node into the linked list */
  208. QTN_PREV(lpQtNode) = lpQueryTree->lpTopNode;
  209. if (lpQueryTree->lpTopNode)
  210. QTN_NEXT(lpQueryTree->lpTopNode) = lpQtNode;
  211. lpQueryTree->lpTopNode = lpQtNode;
  212. return S_OK;
  213. }
  214. /*************************************************************************
  215. * @doc INTERNAL
  216. *
  217. * @func VOID PASCAL NEAR | SetQueryWeight |
  218. * The function sets the weights of invidual words in the query
  219. * The formula used in the computation is:
  220. * weight = 0.5 + 0.5 * wc / maxwc (0.5 < w <= 1)
  221. * or with the normalization
  222. * weight = 32767 (1 + wc / maxwc) (32767 < w <= 65534)
  223. * where:
  224. * wc: the word occurrence's count
  225. * maxwc: maximum word occurrence's count
  226. *
  227. * @parm LPQT | lpQueryInfo |
  228. * Pointer to QueryInfo struct where globals are
  229. *
  230. *************************************************************************/
  231. VOID PASCAL NEAR SetQueryWeight (LPQI lpQueryInfo)
  232. {
  233. STRING_TOKEN FAR *lpStrList; /* Pointer to strings table */
  234. WORD MaxWc; /* Maximum word count */
  235. lpStrList = ((_LPQT)lpQueryInfo->lpQueryTree)->lpStrList;
  236. /* Calculate maxwc */
  237. for (MaxWc = 1; lpStrList; lpStrList = lpStrList->pNext)
  238. {
  239. if (MaxWc < lpStrList->cUsed)
  240. MaxWc = lpStrList->cUsed;
  241. }
  242. /* Calculate invidual word weights */
  243. for (lpStrList = ((_LPQT)lpQueryInfo->lpQueryTree)->lpStrList;
  244. lpStrList; lpStrList = lpStrList->pNext) {
  245. lpStrList->wWeight = 32767 + (32767/MaxWc)*lpStrList->cUsed;
  246. }
  247. }
  248. /*************************************************************************
  249. * @doc INTERNAL
  250. *
  251. * @func LPQT FAR PASCAL | QueryTreeBuild |
  252. * The function build a query tree to be used later for the retrieval
  253. * process. The input data for it comes from calls of QueryTreeAddToken()
  254. *
  255. * @parm LPQT | lpQueryInfo |
  256. * Pointer to QueryInfo struct where globals are
  257. *
  258. * @rdesc NULL if failed, a pointer to the top of the query tree
  259. *************************************************************************/
  260. PUBLIC LPQT PASCAL NEAR QueryTreeBuild(LPQI lpQueryInfo)
  261. {
  262. _LPQT lpQueryTree = lpQueryInfo->lpQueryTree;
  263. PHRESULT phr = lpQueryInfo->lperrb;
  264. _LPQTNODE lpTreeTop;
  265. if (lpQueryTree->cQuery == 0 ||
  266. ( (lpQueryTree->cQuery == 1) && (lpQueryTree->lpTopNode->OpVal == STOP_OP)) )
  267. {
  268. SetErrCode(phr, E_NULLQUERY);
  269. return NULL;
  270. }
  271. /* Set the query words weights */
  272. SetQueryWeight (lpQueryInfo);
  273. /* Build the tree */
  274. TreeBuild (lpQueryTree);
  275. lpTreeTop = lpQueryTree->lpTopNode;
  276. if (lpQueryTree->cQuery > 1 && S_OK != CheckTree(lpTreeTop, phr))
  277. {
  278. return (NULL);
  279. }
  280. #if defined(_DEBUG) && DOS_ONLY
  281. if (Debug)
  282. {
  283. printf ("\n*** Tree infix form ***\n");
  284. PrintTree (lpTreeTop, PrintQueryNode);
  285. printf ("\n");
  286. }
  287. #endif // DOS_ONLY && _DEBUG
  288. /* Remove all redundant words */
  289. if (lpQueryTree->fFlag & (ALL_AND | ALL_OR))
  290. while (RemoveRedundancy (lpTreeTop, 0) == S_OK);
  291. /* Keep doing optimization until nothing can't be done anymore */
  292. while (QueryTreeOptim (lpTreeTop) == S_OK);
  293. #if defined(_DEBUG) && DOS_ONLY
  294. if (Debug)
  295. {
  296. printf ("\n*** Tree infix form after optimization ***\n");
  297. PrintTree (lpTreeTop, PrintQueryNode);
  298. printf ("\n");
  299. }
  300. #endif // DOS_ONLY && _DEBUG
  301. return (lpQueryTree);
  302. }
  303. /*************************************************************************
  304. * @doc INTERNAL
  305. *
  306. * @func HRESULT PASCAL NEAR | RemoveRedundancy |
  307. * This function will remove any repeated term in the query for all
  308. * OR or all AND queries. This will speed up the search since there is
  309. * no need to load the same term again only to throw it away.
  310. *
  311. * @parm _LPQTNODE | lpTreeTop |
  312. * Top node of the binary tree.
  313. *
  314. * @rdesc S_OK if some terms get removed, E_FAIL otherwise
  315. *************************************************************************/
  316. #define MAX_CHECKED_LEVEL 50 // Maximum traversed tree levels
  317. PRIVATE HRESULT PASCAL NEAR RemoveRedundancy (_LPQTNODE lpTreeTop, int level)
  318. {
  319. _LPQTNODE lpQtNode;
  320. if (level >= MAX_CHECKED_LEVEL || /* Stack overflow */
  321. QTN_NODETYPE(lpTreeTop) != OPERATOR_NODE) /* Term node */
  322. return E_FAIL;
  323. /* Handle the right tree */
  324. if (QTN_NODETYPE (lpQtNode = QTN_RIGHT(lpTreeTop)) == OPERATOR_NODE)
  325. {
  326. RemoveRedundancy (lpQtNode, level + 1);
  327. }
  328. else if ( (QTN_NODETYPE(lpQtNode) == TERM_NODE) && QTN_TOKEN(lpQtNode)->cUsed > 1)
  329. {
  330. /* We don't have to retrieve this node, so we
  331. * can ignore it. By doing that, we can ignore
  332. * the operator that accoampanies the node
  333. */
  334. /* Update the word count */
  335. QTN_TOKEN(lpQtNode)->cUsed--;
  336. *lpTreeTop = *QTN_LEFT(lpTreeTop);
  337. return S_OK;
  338. }
  339. /* Handle the left tree */
  340. if (QTN_NODETYPE (lpQtNode = QTN_LEFT(lpTreeTop)) == OPERATOR_NODE)
  341. {
  342. RemoveRedundancy (lpQtNode, level + 1);
  343. }
  344. else if ((QTN_NODETYPE(lpQtNode) == TERM_NODE) && QTN_TOKEN(lpQtNode)->cUsed > 1)
  345. {
  346. /* We don't have to retrieve this node, so we
  347. * can ignore it. By doing that, we can ignore
  348. * the operator that accoampanies the node
  349. */
  350. /* Update the word count */
  351. QTN_TOKEN(lpQtNode)->cUsed--;
  352. *lpTreeTop = *QTN_RIGHT(lpTreeTop);
  353. return S_OK;
  354. }
  355. return E_FAIL;
  356. }
  357. /*************************************************************************
  358. * @doc INTERNAL
  359. *
  360. * @func STRING_TOKEN FAR *PASCAL NEAR | AllocWord |
  361. * Allocate a word memory and structure
  362. *
  363. * @parm _LPQT | lpQueryTree |
  364. * Pointer to query tree structure (for globals)
  365. *
  366. * @parm LST | lstWord |
  367. * Word to be copied
  368. *
  369. * @rdesc Pointer to structure if succeeded
  370. *************************************************************************/
  371. #ifndef SIMILARITY
  372. PUBLIC STRING_TOKEN FAR *PASCAL AllocWord(_LPQT lpQueryTree, LST lstWord)
  373. #else
  374. PRIVATE STRING_TOKEN FAR *PASCAL NEAR AllocWord(_LPQT lpQueryTree, LST lstWord)
  375. #endif
  376. {
  377. STRING_TOKEN FAR *pTmp;
  378. pTmp = lpQueryTree->lpStrList;
  379. while (pTmp)
  380. {
  381. if (!StringDiff2 (lstWord, pTmp->lpString))
  382. {
  383. pTmp->cUsed ++;
  384. return (pTmp);
  385. }
  386. pTmp = pTmp->pNext;
  387. }
  388. /* The word doesn't exist yet, so create it. Add an extra byte
  389. * to make it 0's teminated to help WildCardCompare()
  390. */
  391. if ((pTmp = (STRING_TOKEN FAR *)BlockCopy (lpQueryTree->lpStringBlock,
  392. lstWord, *((LPW)lstWord) + 3, sizeof (STRING_TOKEN))) == NULL)
  393. return NULL;
  394. /* Set all the fields */
  395. pTmp->cUsed = 1;
  396. pTmp->lpString = (char FAR *)pTmp + sizeof(STRING_TOKEN);
  397. #ifndef SIMILARITY
  398. pTmp->lpwi = NULL; // List of word data for this token
  399. pTmp->dwTopicCount = 0;
  400. #endif
  401. /* Add the word to the string list */
  402. pTmp->pNext = lpQueryTree->lpStrList;
  403. lpQueryTree->lpStrList = pTmp;
  404. return pTmp;
  405. }
  406. /*************************************************************************
  407. * @doc INTERNAL
  408. *
  409. * @func PRIVATE PASCAL NEAR | TreeBuild |
  410. * Build the infix form tree from the postfix form.
  411. *
  412. * @parm _LPQT | lpQueryTree |
  413. * Pointer to query tree
  414. *************************************************************************/
  415. PRIVATE HRESULT PASCAL NEAR TreeBuild (_LPQT lpQueryTree)
  416. {
  417. _LPQTNODE rgNodeStack[STACK_SIZE]; /* Stack to help the conversion */
  418. _LPQTNODE lpStackTop = NULL; /* Pointer to stop of stack */
  419. int StackTop = -1; /* Current stack top index */
  420. _LPQTNODE lpPrevNode;
  421. _LPQTNODE lpQtNode;
  422. int TreeDepth;
  423. #if 0
  424. /* First optimzation step: Remove all duplicate words for all AND
  425. * or all OR querys
  426. */
  427. if (lpQueryTree->fFlag & (ALL_AND | ALL_OR))
  428. {
  429. for (lpQtNode = lpQueryTree->lpTopNode; lpQtNode;
  430. lpQtNode = lpPrevNode)
  431. {
  432. lpPrevNode = QTN_PREV(lpQtNode);
  433. /* Ignore operator node */
  434. if (QTN_NODETYPE(lpQtNode) == OPERATOR_NODE)
  435. continue;
  436. /* Check to see if we have to retrieve this node */
  437. if (QTN_TOKEN(lpQtNode)->cUsed > 1)
  438. {
  439. /* We don't have to retrieve this node, so we
  440. * can ignore it. By doing that, we can ignore
  441. * the operator that accoampanies the node
  442. */
  443. /* Update the word count */
  444. QTN_TOKEN(lpQtNode)->cUsed--;
  445. /* Remove the two nodes */
  446. if ((lpStartQtNode = QTN_NEXT(QTN_NEXT(lpQtNode))) == NULL)
  447. {
  448. /* Remove the two beginning nodes by just
  449. * resetting the starting node
  450. */
  451. lpQueryTree->lpTopNode = lpPrevNode;
  452. QTN_NEXT(lpPrevNode) = NULL;
  453. }
  454. else
  455. {
  456. QTN_PREV(lpStartQtNode) = lpPrevNode;
  457. QTN_NEXT(lpPrevNode) = lpStartQtNode;
  458. }
  459. /* Update number of query nodes */
  460. lpQueryTree->cQuery -= 2;
  461. }
  462. }
  463. }
  464. #endif
  465. for (lpQtNode = lpQueryTree->lpTopNode; lpQtNode;
  466. lpQtNode = lpPrevNode)
  467. {
  468. lpPrevNode = QTN_PREV(lpQtNode);
  469. QTN_RIGHT(lpQtNode) = QTN_LEFT(lpQtNode) = NULL;
  470. if (QTN_NODETYPE(lpQtNode) == OPERATOR_NODE)
  471. {
  472. /* Push the operator onto the stack */
  473. if (lpStackTop)
  474. {
  475. if (QTN_RIGHT(lpStackTop) == NULL)
  476. QTN_RIGHT(lpStackTop) = lpQtNode;
  477. else
  478. {
  479. QTN_LEFT(lpStackTop) = lpQtNode;
  480. StackTop--;
  481. }
  482. }
  483. lpStackTop = rgNodeStack[++StackTop] = lpQtNode;
  484. }
  485. else
  486. {
  487. /* Handle term node. lpStackTop points to the operator node */
  488. if (lpStackTop)
  489. {
  490. if (QTN_RIGHT(lpStackTop) == NULL)
  491. QTN_RIGHT(lpStackTop) = lpQtNode;
  492. else
  493. {
  494. QTN_LEFT(lpStackTop) = lpQtNode;
  495. if (--StackTop < 0)
  496. lpStackTop = NULL;
  497. else
  498. lpStackTop = rgNodeStack[StackTop];
  499. }
  500. }
  501. }
  502. }
  503. /* Calculate the tree depth. This is helpful when we resolve the
  504. * tree by avoiding a too deep recursion level.
  505. */
  506. TreeDepth = 0;
  507. #if 0
  508. for (lpQtNode = lpQueryTree->lpTopNode; lpQtNode;
  509. lpQtNode = QTN_LEFT(lpQtNode))
  510. {
  511. TreeDepth ++;
  512. }
  513. #else
  514. GetTreeDepth (lpQueryTree->lpTopNode, &TreeDepth, 1);
  515. #endif
  516. lpQueryTree->TreeDepth = (TreeDepth < STACK_SIZE) ? STACK_SIZE : TreeDepth;
  517. return S_OK;
  518. }
  519. PRIVATE VOID PASCAL NEAR GetTreeDepth (_LPQTNODE lpQtNode, int FAR * pMaxLevel,
  520. int Level)
  521. {
  522. if (QTN_LEFT(lpQtNode))
  523. GetTreeDepth (QTN_LEFT(lpQtNode), pMaxLevel, Level + 1);
  524. if (Level > *pMaxLevel)
  525. *pMaxLevel = Level;
  526. if (QTN_RIGHT(lpQtNode))
  527. GetTreeDepth (QTN_RIGHT(lpQtNode), pMaxLevel, Level + 1);
  528. }
  529. /*************************************************************************
  530. * @doc INTERNAL
  531. *
  532. * @func HRESULT PASCAL NEAR | QueryTreeOptim |
  533. * Do optimizations to the query tree. The optimization is done
  534. * based on the characteristics of the operators, such as
  535. * COMMUTATIVITY, etc
  536. *
  537. * @parm _LPQTNODE | lpQtNode |
  538. * Pointer to the top of the binary query tree
  539. *
  540. * @rdesc S_OK, if some optimization has been performed, E_FAIL
  541. * otherwise
  542. *************************************************************************/
  543. PRIVATE HRESULT PASCAL NEAR QueryTreeOptim (_LPQTNODE lpQtNode)
  544. {
  545. register _LPQTNODE lpLeft;
  546. register _LPQTNODE lpRight;
  547. int OpVal;
  548. HRESULT fRet = E_FAIL;
  549. if (QTN_NODETYPE(lpQtNode) == NULL_NODE ||
  550. QTN_NODETYPE(lpQtNode) == TERM_NODE)
  551. return E_FAIL;
  552. /* Handle unary operator */
  553. if (OperatorAttributeTable[OpVal = QTN_OPVAL(lpQtNode)] & UNARY_OP)
  554. return E_FAIL;
  555. lpLeft = QTN_LEFT(lpQtNode);
  556. lpRight = QTN_RIGHT(lpQtNode);
  557. // erinfox:
  558. // Look for STOP word nodes. If the operator is AND or NEAR,
  559. // change it to OR. Set node to NULL for all operators.
  560. // (X AND STOPWORD) becomes (X OR NULL) so it will evaluate to X.
  561. //
  562. if (QTN_NODETYPE(lpLeft) == STOP_NODE)
  563. {
  564. if (AND_OP == OpVal || NEAR_OP == OpVal)
  565. lpQtNode->OpVal = OR_OP;
  566. QTN_NODETYPE(lpLeft) = NULL_NODE;
  567. }
  568. else if (QTN_NODETYPE(lpRight) == STOP_NODE)
  569. {
  570. if (AND_OP == OpVal || NEAR_OP == OpVal)
  571. lpQtNode->OpVal = OR_OP;
  572. QTN_NODETYPE(lpRight) = NULL_NODE;
  573. }
  574. /* Handle leaf-leaf case */
  575. if (QTN_NODETYPE(lpLeft) == TERM_NODE &&
  576. QTN_NODETYPE(lpRight) == TERM_NODE)
  577. {
  578. return DoTermTermOpt (lpQtNode);
  579. }
  580. /* Handle NULL_NODE leaf */
  581. if (QTN_NODETYPE(lpLeft) == NULL_NODE ||
  582. QTN_NODETYPE(lpRight) == NULL_NODE)
  583. {
  584. return DoNullTermOpt (lpQtNode);
  585. }
  586. if ((OperatorAttributeTable[OpVal] & ASSOCIATIVE) &&
  587. (QTN_NODETYPE(lpLeft) == TERM_NODE ||
  588. QTN_NODETYPE(lpRight) == TERM_NODE))
  589. {
  590. /* One TERM_NODE and one OPERATOR_NODE */
  591. if (DoAssociativeOpt (lpQtNode) == S_OK)
  592. return S_OK;
  593. }
  594. if (QTN_NODETYPE(lpLeft) == OPERATOR_NODE)
  595. {
  596. if (QueryTreeOptim (lpLeft) == S_OK)
  597. fRet = S_OK;
  598. }
  599. if (QTN_NODETYPE(lpRight) == OPERATOR_NODE)
  600. {
  601. if (QueryTreeOptim (lpRight) == S_OK)
  602. fRet = S_OK;
  603. }
  604. /* Handle leaf-op case */
  605. if (OperatorAttributeTable[QTN_OPVAL(lpQtNode)] & COMMUTATIVE)
  606. {
  607. if (QTN_NODETYPE(lpLeft) == TERM_NODE)
  608. {
  609. /* Exchange the branches so that we do the sub-tree first */
  610. QTN_LEFT(lpQtNode) = lpRight;
  611. QTN_RIGHT(lpQtNode) = lpLeft;
  612. return S_OK;
  613. }
  614. }
  615. return fRet;
  616. }
  617. /*************************************************************************
  618. * HRESULT NEAR PASCAL DoAssociativeOpt (_LPQTNODE lpQtNode)
  619. *
  620. * Description:
  621. * This function will try to reduce the number of node processed
  622. * by applying the law of associativity of the operator.
  623. * a * (a * b) = (a * a) * b = a * b
  624. * The process is simplified by the following observations:
  625. * 1/ Only OR and AND are associative. So the top node must be
  626. * AND or OR
  627. * 2/ They are also commutative, ie. we can switch the right for
  628. * the left sub-tree without causing any error
  629. * (a or b) = (b or a)
  630. * (a and b) = (b and a)
  631. * This helps simplify different possible scenario
  632. * 3/ The OPERATOR_NODE Q4 may be any binary operator. Depending on
  633. * its value, we may have different result. The notation is based
  634. * on the picture below
  635. *
  636. * We have the following cases:
  637. * 1/ a and (T * a) = (T * a)
  638. * 2/ a or (T or a) = (T or a)
  639. * 3/ a or (T * a) : Unchanged (*: Non OR operator)
  640. * We can argue that a or (T * a) = a, since (T * a) is a subset of
  641. * a, but considering the following scenario:
  642. * b + a : b is highlited
  643. * a or (b + a) : b should be still highlited, ie. all the info
  644. * about b should not be thrown away
  645. *
  646. * The following picture describes of what is happening
  647. *
  648. * or and
  649. * / \ / \
  650. * Q1: a Q4: and ----> Q3: Tree a : Q2
  651. * / \
  652. * Q3: Tree a Q2
  653. *
  654. * Q1: the first level term node
  655. * Q2: the second level term node that match Q1
  656. * Q3: the second level sub-tree
  657. * Q4: the first level sub-tree
  658. *
  659. * Parameter:
  660. * _LPQTNODE lpQtNode: Pointer to a query node. We are sure to have
  661. * one TERM_NODE and one OPERATOR_NODE
  662. * Returned Values:
  663. * S_OK: If some optimization has been done
  664. * E_FAIL: otherwise
  665. *************************************************************************/
  666. PRIVATE HRESULT NEAR PASCAL DoAssociativeOpt (_LPQTNODE lpQtNode)
  667. {
  668. _LPQTNODE Q1;
  669. _LPQTNODE Q3 = NULL;
  670. _LPQTNODE Q4;
  671. int SubTreeOpVal;
  672. /* TERM_NODE is Q1, OPERATOR_NODE Q4 (see above picture) */
  673. if (QTN_NODETYPE(QTN_LEFT(lpQtNode)) == TERM_NODE)
  674. {
  675. Q1 = QTN_LEFT(lpQtNode);
  676. Q4 = QTN_RIGHT(lpQtNode);
  677. }
  678. else
  679. {
  680. Q1 = QTN_RIGHT(lpQtNode);
  681. Q4 = QTN_LEFT(lpQtNode);
  682. }
  683. SubTreeOpVal = QTN_OPVAL(Q4);
  684. /* UNDONE: UNARY_OP not supported */
  685. if (OperatorAttributeTable[SubTreeOpVal] & UNARY_OP)
  686. return E_FAIL;
  687. /* Find the common TERM_NODE */
  688. if (QTN_NODETYPE(QTN_LEFT(Q4)) == TERM_NODE &&
  689. QTN_TOKEN(Q1) == QTN_LEFT(Q4)->u.pToken)
  690. {
  691. Q3 = QTN_RIGHT(Q4);
  692. }
  693. else if (QTN_NODETYPE(QTN_RIGHT(Q4)) == TERM_NODE &&
  694. QTN_TOKEN(Q1) == QTN_RIGHT(Q4)->u.pToken)
  695. {
  696. Q3 = QTN_LEFT(Q4);
  697. }
  698. if (Q3 != NULL)
  699. {
  700. /* We got a match, just do the optimization */
  701. if (QTN_OPVAL(lpQtNode) == OR_OP && SubTreeOpVal != OR_OP)
  702. {
  703. /* case 3/ a or (T * a) : Unchanged (*: Non OR operator) */
  704. return E_FAIL; /* Unchanged */
  705. }
  706. /* Other cases */
  707. *lpQtNode = *Q4; /* Move up the sub-tree */
  708. return S_OK;
  709. }
  710. return E_FAIL;
  711. }
  712. /*************************************************************************
  713. * @doc INTERNAL
  714. *
  715. * @func HRESULT NEAR PASCAL | DoNullTermOpt |
  716. * Optimize node that has a NULL child
  717. *
  718. * @parm _LPQTNODE | lpQtNode |
  719. * Pointer to query tree node to be optimized
  720. *
  721. * @rdesc S_OK if some optimization has been done.
  722. *************************************************************************/
  723. PRIVATE HRESULT NEAR PASCAL DoNullTermOpt (register _LPQTNODE lpQtNode)
  724. {
  725. register _LPQTNODE lpChild;
  726. _LPQTNODE lpLeft;
  727. _LPQTNODE lpRight;
  728. HRESULT fOptimize = E_FAIL;
  729. lpLeft = QTN_LEFT(lpQtNode);
  730. lpRight = QTN_RIGHT(lpQtNode);
  731. if (QTN_OPVAL(lpQtNode) == NOT_OP)
  732. {
  733. if (QTN_NODETYPE(lpLeft) == NULL_NODE)
  734. {
  735. /* NULL ! a = NULL */
  736. *lpQtNode = *lpLeft;
  737. fOptimize = S_OK;
  738. }
  739. else if (QTN_NODETYPE(lpRight) == NULL_NODE)
  740. {
  741. /* a ! NULL = a */
  742. *lpQtNode = *lpLeft;
  743. fOptimize = S_OK;
  744. }
  745. if (fOptimize)
  746. QTN_LEFT(lpQtNode) = QTN_RIGHT(lpQtNode) = NULL;
  747. return fOptimize;
  748. }
  749. lpChild = QTN_NODETYPE(lpLeft = QTN_LEFT(lpQtNode)) == NULL_NODE ?
  750. (lpRight = QTN_RIGHT(lpQtNode)) : lpLeft;
  751. switch (QTN_OPVAL(lpQtNode))
  752. {
  753. case AND_OP: // a & NULL = NULL
  754. case NEAR_OP: // a # NULL = NULL
  755. case PHRASE_OP: // a + NULL = NULL ??
  756. QTN_NODETYPE(lpQtNode) = NULL_NODE;
  757. fOptimize = S_OK;
  758. break;
  759. case OR_OP: // a | NULL = a
  760. *lpQtNode = *lpChild;
  761. fOptimize = S_OK;
  762. break;
  763. }
  764. if (fOptimize)
  765. {
  766. QTN_LEFT(lpQtNode) = QTN_RIGHT(lpQtNode) = NULL;
  767. }
  768. return fOptimize;
  769. }
  770. /*************************************************************************
  771. * @doc INTERNAL
  772. *
  773. * @func HRESULT NEAR | DoTermTermOpt |
  774. * This function optimizes a node that has two TERM nodes for
  775. * children
  776. *
  777. * @parm _LPQTNODE | lpQtNode |
  778. * Node to be optimized
  779. *
  780. * @rdesc S_OK if some optimization is done
  781. *************************************************************************/
  782. PRIVATE HRESULT NEAR DoTermTermOpt (register _LPQTNODE lpQtNode)
  783. {
  784. register _LPQTNODE lpLeft = QTN_LEFT(lpQtNode);
  785. register _LPQTNODE lpRight = QTN_RIGHT(lpQtNode);
  786. HRESULT fOptimize = E_FAIL;
  787. if (lpRight->u.pToken == lpLeft->u.pToken &&
  788. lpRight->dwFieldId == lpLeft->dwFieldId)
  789. {
  790. /* Same strings */
  791. switch (QTN_OPVAL(lpQtNode))
  792. {
  793. case OR_OP: /* a | a = a */
  794. case AND_OP: /* a & a */
  795. *lpQtNode = *lpRight;
  796. fOptimize = S_OK;
  797. break;
  798. case NOT_OP: /* a ! a = 0 */
  799. QTN_NODETYPE(lpQtNode) = NULL_NODE;
  800. fOptimize = S_OK;
  801. break;
  802. }
  803. if (fOptimize == S_OK)
  804. {
  805. QTN_LEFT(lpQtNode) = QTN_RIGHT(lpQtNode) = NULL;
  806. }
  807. }
  808. else
  809. {
  810. /* Different strings. The least we can do is to change their order of
  811. retrieval. The string that has wild char should be fetched last (ie. be
  812. the right leaf to have minimal impact on memory
  813. */
  814. if (OperatorAttributeTable[QTN_OPVAL(lpQtNode)] & COMMUTATIVE)
  815. {
  816. if (QTN_FLAG(lpLeft) == WILDCARD_MATCH)
  817. {
  818. QTN_LEFT(lpQtNode) = lpRight;
  819. QTN_RIGHT(lpQtNode) = lpLeft;
  820. }
  821. }
  822. }
  823. return fOptimize;
  824. }
  825. PRIVATE HRESULT PASCAL NEAR CheckTree (_LPQTNODE lpQtNode, PHRESULT phr)
  826. {
  827. _LPQTNODE lpLeft;
  828. _LPQTNODE lpRight;
  829. HRESULT fRet;
  830. BOOL fAllStopWords = TRUE;
  831. if (QTN_NODETYPE(lpQtNode) != OPERATOR_NODE)
  832. return SetErrCode(phr, E_ASSERT);
  833. lpLeft = QTN_LEFT(lpQtNode);
  834. lpRight = QTN_RIGHT(lpQtNode);
  835. /* Handle THRU operator */
  836. if (QTN_OPVAL(lpQtNode) == RANGE_OP)
  837. {
  838. // erinfox: Better error message if STOP_NODE
  839. if (QTN_NODETYPE(lpLeft) == STOP_NODE)
  840. return VSetUserErr(phr, E_STOPWORD, QTN_OFFSET(lpLeft));
  841. if (QTN_NODETYPE(lpRight) == STOP_NODE)
  842. return VSetUserErr(phr, E_STOPWORD, QTN_OFFSET(lpRight));
  843. // Otherwise report bad range operator
  844. if (QTN_NODETYPE(lpLeft) != TERM_NODE ||
  845. QTN_FLAG(lpLeft) == WILDCARD_MATCH)
  846. return VSetUserErr(phr, E_BADRANGEOP, QTN_OFFSET(lpLeft));
  847. if (QTN_NODETYPE(lpRight) != TERM_NODE ||
  848. QTN_FLAG(lpRight) == WILDCARD_MATCH)
  849. return VSetUserErr(phr, E_BADRANGEOP, QTN_OFFSET(lpRight));
  850. /* The dtypes must match when using THRU */
  851. if (QTN_DTYPE(lpLeft) != QTN_DTYPE(lpRight))
  852. return VSetUserErr(phr, E_UNMATCHEDTYPE, QTN_OFFSET(lpLeft));
  853. /* Switch the order of the nodes if necessary */
  854. if (NCmpS(QTN_TOKEN(lpRight)->lpString,
  855. QTN_TOKEN(lpLeft)->lpString) < 0)
  856. {
  857. /* Left string > Right string, exchange the two */
  858. _LPQTNODE lpTmp = lpLeft;
  859. lpLeft = lpRight;
  860. lpRight = lpTmp;
  861. }
  862. /* Copy the left node into the operator */
  863. *lpQtNode = *lpLeft;
  864. /* Change the type of term matching */
  865. QTN_FLAG(lpQtNode) = TERM_RANGE_MATCH;
  866. QTN_HITERM(lpQtNode) = QTN_TOKEN(lpRight)->lpString;
  867. return S_OK;
  868. }
  869. // erinfox: used to report and error for (SW and term), but BS wants it to
  870. // be just "term". We delay handling this until tree optimization time
  871. #if 0
  872. if (QTN_OPVAL(lpQtNode) == AND_OP)
  873. {
  874. // erinfox: Better error message if NULL_NODE
  875. if (QTN_NODETYPE(lpLeft) == NULL_NODE)
  876. return VSetUserErr(phr, E_STOPWORD, QTN_OFFSET(lpLeft));
  877. if (QTN_NODETYPE(lpRight) == NULL_NODE)
  878. return VSetUserErr(phr, E_STOPWORD, QTN_OFFSET(lpRight));
  879. }
  880. #endif
  881. if (lpLeft && QTN_NODETYPE(lpLeft) != TERM_NODE)
  882. {
  883. // erinfox: add check against STOP_NODE
  884. if (QTN_NODETYPE(lpLeft) != STOP_NODE) // Neither term nor SW
  885. {
  886. fRet = CheckTree(lpLeft, phr);
  887. if (fRet != S_OK)
  888. {
  889. if (fRet != E_NULLQUERY) // E_NULLQUERY is legit
  890. return VSetUserErr(phr, fRet, QTN_OFFSET(lpLeft));
  891. }
  892. else
  893. fAllStopWords = FALSE; // We found a term when recursing
  894. }
  895. }
  896. else
  897. fAllStopWords = FALSE; // It's a term node so it can't be a stopword
  898. if (lpRight && QTN_NODETYPE(lpRight) != TERM_NODE)
  899. {
  900. if (QTN_NODETYPE(lpRight) != STOP_NODE)
  901. {
  902. fRet = CheckTree(lpRight, phr);
  903. if (fRet != S_OK)
  904. {
  905. if (fRet != E_NULLQUERY)
  906. return VSetUserErr(phr, fRet, QTN_OFFSET(lpRight));
  907. }
  908. else
  909. fAllStopWords = FALSE;
  910. }
  911. }
  912. else
  913. fAllStopWords = FALSE;
  914. if (fAllStopWords)
  915. return VSetUserErr(phr, E_NULLQUERY, 0);
  916. return S_OK;
  917. }
  918. #if defined(_DEBUG) && DOS_ONLY
  919. PUBLIC HRESULT PASCAL NEAR PrintTree (_LPQTNODE lpQtNode,
  920. HRESULT (PASCAL NEAR *fpFunction)(BYTE FAR *))
  921. {
  922. if (QTN_NODETYPE(lpQtNode) != TERM_NODE)
  923. printf ("(");
  924. if (QTN_LEFT(lpQtNode))
  925. PrintTree (QTN_LEFT(lpQtNode), fpFunction);
  926. (*fpFunction)((BYTE FAR *)lpQtNode);
  927. if (QTN_RIGHT(lpQtNode))
  928. PrintTree (QTN_RIGHT(lpQtNode), fpFunction);
  929. if (QTN_NODETYPE(lpQtNode) != TERM_NODE)
  930. printf (")");
  931. return S_OK;
  932. }
  933. PRIVATE HRESULT PASCAL NEAR PrintTopicListNode (_LPQT lpQueryTree,
  934. BYTE FAR *lpVanilla)
  935. {
  936. LPITOPIC lpTopicList;
  937. OCCURENCE FAR *lpOccur;
  938. DWORD i;
  939. for (lpTopicList = QTN_TOPICLIST(lpVanilla); lpTopicList;
  940. lpTopicList=lpTopicList->pNext)
  941. {
  942. lpOccur = lpTopicList->lpOccur;
  943. for (i = 1; lpOccur; i++, lpOccur = lpOccur->pNext)
  944. {
  945. printf ("D:%4ld", lpTopicList->dwTopicId);
  946. PrintOccurNode(lpQueryTree, lpOccur);
  947. printf ("\n");
  948. }
  949. }
  950. return S_OK;
  951. }
  952. PRIVATE HRESULT PASCAL NEAR PrintOccurNode(_LPQT lpQueryTree,
  953. OCCURENCE FAR *lpOccur)
  954. {
  955. printf (",%5ld", lpOccur->dwCount);
  956. printf (",%5ld", lpOccur->dwOffset);
  957. printf (",%5d", lpOccur->wWeight);
  958. return S_OK;
  959. }
  960. PRIVATE HRESULT PASCAL NEAR PrintQueryNode (BYTE FAR *lpVanilla)
  961. {
  962. _LPQTNODE lpQtNode = (_LPQTNODE )lpVanilla;
  963. if (QTN_NODETYPE(lpQtNode) == TERM_NODE)
  964. PrintStr(QTN_TOKEN(lpQtNode)->lpString);
  965. else if (QTN_NODETYPE(lpQtNode) == OPERATOR_NODE)
  966. {
  967. putchar(' ');
  968. PrintOperator(QTN_OPVAL(lpQtNode));
  969. putchar(' ');
  970. }
  971. else if (QTN_NODETYPE(lpQtNode) == NULL_NODE)
  972. printf ("NULL");
  973. return S_OK;
  974. }
  /*
   *  PrintStr (debug only)
   *
   *  Writes a length-prefixed string to stdout: the first byte of lstWord
   *  is the character count, and that many bytes following it are printed.
   */
  PUBLIC VOID PASCAL NEAR PrintStr (char FAR *lstWord)
  {
      /* Read the length byte as unsigned: with a signed plain char, a
       * length above 127 would turn negative and nothing would be printed.
       */
      int nLength = (unsigned char)*lstWord++;

      for (; nLength > 0; nLength--, lstWord++)
          putchar (*lstWord);
  }
  981. /*
  982. Debugging routines
  983. */
  984. PUBLIC HRESULT PASCAL FAR PrintList(_LPQT lpQueryTree)
  985. {
  986. STRING_TOKEN FAR *pStr;
  987. _LPQTNODE lpQtNode = lpQueryTree->lpTopNode;
  988. if (Debug == 0)
  989. return S_OK;
  990. printf ("*** STRING LIST ***\n");
  991. if (pStr = lpQueryTree->lpStrList)
  992. {
  993. while (pStr)
  994. {
  995. PrintStr (pStr->lpString);
  996. printf ("\n");
  997. pStr = pStr->pNext;
  998. }
  999. }
  1000. printf ("\n** Expression postfix form\n");
  1001. while (QTN_PREV(lpQtNode))
  1002. lpQtNode = QTN_PREV(lpQtNode);
  1003. while (lpQtNode)
  1004. {
  1005. if (QTN_NODETYPE(lpQtNode) == TERM_NODE)
  1006. {
  1007. PrintStr(QTN_TOKEN(lpQtNode)->lpString);
  1008. putchar(' ');
  1009. }
  1010. else
  1011. {
  1012. PrintOperator(QTN_OPVAL(lpQtNode));
  1013. putchar(' ');
  1014. }
  1015. lpQtNode = QTN_NEXT(lpQtNode);
  1016. }
  1017. printf ("\n");
  1018. return S_OK;
  1019. }
  1020. PRIVATE VOID PASCAL NEAR PrintOperator (int OpVal)
  1021. {
  1022. switch (OpVal)
  1023. {
  1024. case AND_OP:
  1025. printf ("AND");
  1026. break;
  1027. case OR_OP:
  1028. printf ("OR");
  1029. break;
  1030. case NEAR_OP:
  1031. printf ("NEAR");
  1032. break;
  1033. case PHRASE_OP:
  1034. printf ("PHRASE");
  1035. break;
  1036. case GROUP_OP:
  1037. printf ("GROUP");
  1038. break;
  1039. case FIELD_OP:
  1040. printf ("VFLD");
  1041. break;
  1042. case BRKR_OP:
  1043. printf ("DTYPE");
  1044. break;
  1045. case NOT_OP:
  1046. printf ("NOT");
  1047. break;
  1048. case RANGE_OP:
  1049. printf ("THRU");
  1050. break;
  1051. }
  1052. }
  1053. #endif // DOS_ONLY && _DEBUG