Windows NT 4.0 source code leak
#include "stdafx.h"
#include <math.h>
#include "vmbuffer.h"
#include "memex.h"
#include "saveload.h"
#include "TXDBase.h"
#include "bytemaps.h"
#include "textset.h"
#include "dict.h"
#include "vector.h"
#include "query.h"
// This file contains the definition of class CQuery

// Constructors
/*************************************************************************
 * HISTORY :
 *   KrishnaN   4/23/94   Creation.
 *************************************************************************/
CQuery::CQuery()
{
    m_cConWts = 0;
#if 0
    m_cOverFlows = 0;
#endif
    m_pszQueryText = NULL;
}
// Destructor
/*************************************************************************
 * HISTORY :
 *   KrishnaN   4/23/94   Creation.
 *************************************************************************/
CQuery::~CQuery()
{
    if (m_vbVectorConcept.Base)
        FreeVirtualBuffer(&m_vbVectorConcept);
    if (m_vbVectorTermFreq.Base)
        FreeVirtualBuffer(&m_vbVectorTermFreq);
    if (m_vbVectorWt.Base)
        FreeVirtualBuffer(&m_vbVectorWt);
#if 0
    if (m_vbTFOverFlow.Base)
        FreeVirtualBuffer(&m_vbTFOverFlow);
#endif
}
CQuery *CQuery::NewQuery(CTextSet *pts)
{
    CQuery *pQuery = NULL;

    __try
    {
        pQuery = New CQuery;
        pQuery->Initialize(pts, 100, 100000);
    }
    __finally
    {
        if (_abnormal_termination() && pQuery)
        {
            delete pQuery;
            pQuery = NULL;
        }
    }
    return pQuery;
}
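
// A minimal usage sketch of the factory above (illustrative only: pts stands
// for an already-opened CTextSet, and the sketch assumes that an allocation
// failure in Initialize propagates to the caller as a structured exception
// after the __finally cleanup runs). Kept under #if 0 in the style of the
// other disabled blocks in this file.
#if 0
    CQuery *pQuery = CQuery::NewQuery(pts);
    if (pQuery)
    {
        // ... record concepts, weight the vector, rank documents ...
        delete pQuery;
    }
#endif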
// Access Functions:
/*************************************************************************
 * HISTORY :
 *   KrishnaN   4/23/94   Creation.
 *************************************************************************/
void CQuery::Initialize(CTextSet *textsetIn, DWORD cInEstConWtPairs, DWORD cInMaxConWtPairs)
{
    ASSERT(textsetIn != NULL);
    m_ptdb = textsetIn;

    m_cDocuments = m_ptdb->PColl()->m_cDocuments;
    ASSERT(m_cDocuments != 0);

    m_vbConcepts = m_ptdb->PColl()->m_vbConcepts;
    ASSERT(m_vbConcepts.Base);

    m_aWtInvIndex = m_ptdb->PColl()->m_aWtInvIndex;
    ASSERT(m_aWtInvIndex);

    CreateVirtualBuffer(&m_vbVectorConcept , cInEstConWtPairs * sizeof(DWORD), cInMaxConWtPairs * sizeof(DWORD));
    CreateVirtualBuffer(&m_vbVectorTermFreq, cInEstConWtPairs * sizeof(DWORD), cInMaxConWtPairs * sizeof(DWORD));
    CreateVirtualBuffer(&m_vbVectorWt      , 0                               , cInMaxConWtPairs * sizeof(float));
#if 0
    CreateVirtualBuffer(&m_vbTFOverFlow    , 0                               , 0x4000 * sizeof(TFOverFlowStruct));
#endif

    // No explicit zeroing of the allocated memory is needed: VirtualAlloc
    // zeroes all memory it commits, so the virtual buffers start out zero-filled.
}
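
// A note on the virtual-buffer pattern used throughout this class (a reading
// of the usual vmbuffer.h semantics, not a guarantee): CreateVirtualBuffer
// reserves address space for the maximum size but commits only the initial
// size. Touching a page past the committed region faults, and
// VirtualBufferExceptionFilter commits more pages and resumes execution, so
// the __try/__except blocks below grow the vectors on demand and only fall
// into the handler when the buffer cannot grow any further, e.g.:
#if 0
    __try { Concept(m_cConWts) = ConceptId; }   // may touch an uncommitted page
    __except (VirtualBufferExceptionFilter(GetExceptionCode(),
                                           GetExceptionInformation(),
                                           &m_vbVectorConcept))
    { /* reached only when the buffer is exhausted */ }
#endif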
/*************************************************************************
 * HISTORY :
 *   KrishnaN   4/23/94   Creation.
 *************************************************************************/
void CQuery::RecordConcept(DWORD ConceptId)
{
    // Search for this concept id in the current document. If you find it,
    // simply increment its frequency and that will take care of everything.
    // If you don't find it, then enter the concept for the document.
    DWORD i; // index of the con,wt pair being considered for a match

    for (i = 0; i < m_cConWts && Concept(i) != ConceptId; i++);

    if (i == m_cConWts)
    {
        // This concept doesn't exist in the query. Record it.
        __try
        {
            Concept(m_cConWts) = ConceptId;
        }
        __except (VirtualBufferExceptionFilter(GetExceptionCode(), GetExceptionInformation(), &m_vbVectorConcept))
        {
            RaiseException(STATUS_NO_MEMORY, EXCEPTION_NONCONTINUABLE, 0, NULL);
        }

        __try
        {
            TermFreq(m_cConWts) = 1; // this is the first time this concept occurred for this document
        }
        __except (VirtualBufferExceptionFilter(GetExceptionCode(), GetExceptionInformation(), &m_vbVectorTermFreq))
        {
            RaiseException(STATUS_NO_MEMORY, EXCEPTION_NONCONTINUABLE, 0, NULL);
        }

        m_cConWts++;
    }
    else
    {
        // Term already exists in this document. Increase the occurrence frequency.
        // Since the term already exists in the document, it has a frequency of at least 1.
#if 0
        // The only time the value can be 0 is when the frequency has exceeded 0xFFFF. In
        // that case, the overflowing value is stored in the overflow area.
        if (TermFreq(i) == 0)
        {
            // go to the overflow area and update the value that tracks this term frequency
        }
        else
#endif
        if (TermFreq(i) == 0xFFFF)
        {
            // we reached the upper bound on this value.
        }
        else // normal case. No overflow is involved. This is what happens MOST of the time.
            (TermFreq(i))++;
    }
}
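
// Worked example (hypothetical concept ids, for illustration only): feeding
// the token stream "apple apple banana" through RecordConcept as the ids
// 7, 7, 12 leaves the parallel vectors as
//
//     Concept:  7  12       TermFreq:  2  1       m_cConWts == 2
//
// i.e. this routine accumulates raw term frequencies; WeightVector below
// turns them into weights.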
/*************************************************************************
 * HISTORY :
 *   KrishnaN   4/23/94   Creation.
 *************************************************************************/
// ASSUMPTION : We are only weighting one query vector. This will hold true all the time.
BOOL CQuery::WeightVector(BYTE TFModType, BYTE WeightType, BYTE NormType)
{
    DWORD i;

    // Copy the term frequencies into an array of floating-point values. All operations
    // are computed on these floating-point weights; the final results can then be
    // converted back to fixed point.
    // IMPORTANT : ALL WEIGHTS SHOULD BE NORMALIZED TO ENSURE THAT EACH WEIGHT IS LESS THAN ONE.
    // THE FIXED-POINT VALUE ONLY REPRESENTS VALUES BETWEEN 0.0 AND 1.0.
    for (i = 0; i < m_cConWts; i++)
    {
        __try
        {
            TermWt(i) = (float)GetRealTermFreq(i);
        }
        __except (VirtualBufferExceptionFilter(GetExceptionCode(), GetExceptionInformation(), &m_vbVectorWt))
        {
            RaiseException(STATUS_NO_MEMORY, EXCEPTION_NONCONTINUABLE, 0, NULL);
        }
    }

    ApplyWeightingScheme(TFModType, WeightType, NormType, 0, m_cConWts);

    // Plug the weighted values back into the term frequency array.
    // ASSUMPTION : Each weight in TermWt is between 0.0 and 1.0.
    // Multiplying by WT_ONE turns each TermWt weight into a fixed-point
    // number ranging between 0 and WT_ONE.
    for (i = 0; i < m_cConWts; i++)
        TermFreq(i) = (WORD)((double)TermWt(i) * (double)WT_ONE);

    return TRUE;
}
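
// Worked fixed-point example (assuming, purely for illustration, that
// WT_ONE == 0xFFFF; the actual constant is defined elsewhere in this
// codebase): a normalized weight of 0.25 converts to
// (WORD)(0.25 * 65535) == 16383, and the TermFreq(i) * WtFromInvList(...)
// products in the ranking code below are then 32-bit fixed-point
// similarities. Since TermFreq is a WORD, WT_ONE cannot exceed 0xFFFF.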
/*************************************************************************
 * HISTORY :
 *   KrishnaN   4/23/94   Creation.
 *************************************************************************/
void CQuery::ApplyWeightingScheme(BYTE TFModType, BYTE WeightType, BYTE NormType, DWORD iFirstConWt, DWORD cConWts)
{
    register DWORD i;
    double Wt; // holds different kinds of cumulative values at various points in the computations

    // First modify the weight based on the term frequency component
    switch (TFModType)
    {
    case NEWTF_NONE: // do nothing
        break;

    case NEWTF_BINARY: // Since all the terms are in, turn them on
        for (i = 0; i < cConWts; i++)
            TermWt(i) = (float)1.0;
        break;

    case NEWTF_MAXNORM:
        Wt = 0.0;
        for (i = 0; i < cConWts; i++)
            if (TermWt(i) > Wt)
                Wt = TermWt(i);
        // increase Max by 0.00001 to place all normalized TFs between 0.0 and 1.0
        Wt += 0.00001;
        for (i = 0; i < cConWts; i++)
            TermWt(i) = (float)((double)TermWt(i) / Wt);
        break;

    case NEWTF_AUGNORM:
        Wt = 0.0;
        for (i = 0; i < cConWts; i++)
            if (TermWt(i) > Wt)
                Wt = TermWt(i);
        // increase Max by 0.00001 to place all normalized TFs between 0.0 and 1.0
        Wt += 0.00001;
        for (i = 0; i < cConWts; i++)
            TermWt(i) = (float)(0.5 + 0.5 * (double)TermWt(i) / Wt);
        break;

    default:
        // Assertion failure.
        break;
    }

    // Then modify the weight based on the collection frequency component
    switch (WeightType)
    {
    case WT_NONE: // do nothing
        break;

    // if a concept occurs in all docs, assign it a small value instead of 0.0
    case WT_TFIDF:
        for (i = 0; i < cConWts; i++)
            if (m_cDocuments == DocFromCumFreq(Concept(i + iFirstConWt)))
                TermWt(i) = (float)0.005;
            else
                TermWt(i) = (float)((double)TermWt(i) * log((double)m_cDocuments / (double)DocFromCumFreq(Concept(i + iFirstConWt))));
        break;

    case WT_PROB:
        for (i = 0; i < cConWts; i++)
            if (m_cDocuments == DocFromCumFreq(Concept(i + iFirstConWt)))
                TermWt(i) = (float)0.005;
            else
                TermWt(i) = (float)((double)TermWt(i) * log((double)(m_cDocuments - DocFromCumFreq(Concept(i + iFirstConWt))) / (double)DocFromCumFreq(Concept(i + iFirstConWt))));
        break;

    default:
        // Assertion failure.
        break;
    }

    // Finally, normalize the vector
    switch (NormType)
    {
    case NORM_NONE:
        break;

    case NORM_SUM:
        Wt = 0.0;
        for (i = 0; i < cConWts; i++)
            Wt += (double)TermWt(i);
        for (i = 0; i < cConWts; i++)
            TermWt(i) = (float)((double)TermWt(i) / Wt);
        break;

    case NORM_COSINE:
        Wt = 0.0;
        // compute the sum of squares of the weights in the vector
        for (i = 0; i < cConWts; i++)
            Wt += TermWt(i) * TermWt(i);
        Wt = sqrt(Wt);
        // normalize each weight by the vector length computed above
        for (i = 0; i < cConWts; i++)
            TermWt(i) = (float)((double)TermWt(i) / Wt);
        break;

    case NORM_MAX:
        break;
    }
}
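
// Worked example of the WT_TFIDF + NORM_COSINE path (illustrative numbers):
// with m_cDocuments == 1000 and a term that appears in 10 documents, tf-idf
// multiplies its weight by log(1000/10) = log(100) ~= 4.605 (natural log,
// from <math.h>). Cosine normalization then divides every weight by
// sqrt(sum of squared weights), so a two-term vector (3.0, 4.0) becomes
// (0.6, 0.8) - a unit-length vector, which keeps every weight below 1.0 as
// WeightVector's fixed-point conversion requires.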
/*************************************************************************
 * HISTORY :
 *   KrishnaN   4/23/94   Creation.
 *************************************************************************/
BOOL CQuery::RankDocuments(SimStruct *aInSimilarity, DWORD cInHits)
{
    register DWORD i, j;
    DWORD ConceptId, DocId;
    DWORD cDocs;
    DWORD DocPos; // tracks the position of a document in the similarity structure
    DWORD startDocPos;

    if (cInHits == 0)
    {
        SetLastError(QUERYERROR_NOHITS);
        return FALSE;
    }
    if (aInSimilarity == NULL)
    {
        SetLastError(QUERYERROR_EMPTYSIMARRAY);
        return FALSE;
    }

    // ASSUME THAT THE SIMILARITY STRUCTURE ARRAY HAS ENOUGH ENTRIES TO SUPPORT cInHits

    // Zero out any existing similarity values
    for (i = 0; i < cInHits; i++)
        aInSimilarity[i].Similarity = 0;

    // Compute similarity. Walk the doc,wt list for each concept
    for (i = 0; i < m_cConWts; i++)
    {
        // Ignore concepts that have a zero weight. Later, we may want to extend this idea
        // to suppress weights below a small value.
        if (TermFreq(i) == 0)
            continue;

        ConceptId = Concept(i);
        cDocs = DocFromCumFreq(ConceptId);

        // Consider each doc in the (Doc, Wt) list for this concept and score docs that
        // are in the predetermined hit list.
        startDocPos = DocList(ConceptId); // get the starting point of the inverted list.
        for (j = 0; j < cDocs; j++)
        {
            if (j == 0)
                DocId = m_ptdb->PColl()->GetDocumentGap(&startDocPos) - 1;
            else
                DocId += m_ptdb->PColl()->GetDocumentGap(&startDocPos);

            DocPos = GetDocPosInList(aInSimilarity, cInHits, DocId);
            if (DocPos != DOESNOTEXIST)
                aInSimilarity[DocPos].Similarity += TermFreq(i) * WtFromInvList(ConceptId, j);
            /* IF WE LIMIT SIMILARITY TO 24 BITS, USE THE FOLLOWING LINE. IF WE LIMIT TO ANY OTHER NUMBER
               OF BITS n, n < 32, RIGHT SHIFT THE RHS BY 32 - n.
               aInSimilarity[DocPos].Similarity += (TermFreq(i) * WtFromInvList(ConceptId, j)) >> 8;
            */
        }
    }

    /* MOVE SORTING TO THE CALLER!
       // sort the scored documents.
       qsort(aInSimilarity, cInHits, sizeof(SimStruct), CompareSimStruct);
    */

    // Return the number of hits. (The return type is BOOL, but the count doubles
    // as a success flag: the zero-hit case already returned FALSE above.)
    return cInHits;
}
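
// Gap-encoding example (illustrative document ids): an inverted list for the
// document ids 5, 9, 12 is stored as the gaps 6, 4, 3 - the first entry is
// encoded as docid + 1, each later entry as the distance from its
// predecessor - so the decoding loop above recovers 6-1 = 5, 5+4 = 9,
// 9+3 = 12 with one GetDocumentGap call per posting.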
/*************************************************************************
 * HISTORY :
 *   KrishnaN   4/23/94   Creation.
 *************************************************************************/
// cInMaxHits is the maximum number of documents to retrieve; the local
// cHits below counts how many documents have been entered so far.
DWORD CQuery::RetrieveWithFeedback(SimStruct *aInSimilarity, DWORD cInMaxHits,
                                   PWCHAR pwRelDocText, int cwRelDocText,
                                   PWCHAR pwNonRelDocText, int cwNonRelDocText)
{
    DWORD i, j, k;
    DWORD DocId;
    DWORD cHits = 0;
    DWORD cDocs, DocPos;
    DWORD LastDocInPos = 0;          // document position with the least partial similarity match
    DWORD UBCurrentDoc;              // upper bound of the current document
    DWORD CCurrentDoc;               // C (CurrentDoc)
    DWORD CFirstDocOut = 0;          // C (FirstDocOut)
    DWORD UBFirstDocOut;             // upper bound of the first document outside the RSet
    LPDWORD aQTermSummation = NULL;  // summation of query terms
    DWORD startDocPos;

    ASSERT(aInSimilarity);
    ASSERT(pwRelDocText && cwRelDocText);
    __try
    {
        // Add terms from the query. We will either have to reindex the initial query
        // or store the term frequencies the first time they were computed.
        // Assume that each term occurs only once in the query.
        // This assumption usually holds for queries typed in by the user,
        // and it matters little when it doesn't, because the document
        // text overwhelms the original query.
        for (i = 0; i < m_cConWts; i++) // enforce the above assumption
            TermFreq(i) = 1;

        // Add terms from the relevant documents to the query
        IndexDocumentText(pwRelDocText, cwRelDocText, TRUE);

        // For the non-relevant document text, decrease the term frequencies of concepts
        // it has in common with the newly formed query.
        // NOTE : The caller should pass in the document text of only the highest ranked
        // non-relevant document to get the best results (Dec-Hi relevance feedback method).
        if (pwNonRelDocText && cwNonRelDocText)
        {
            IndexDocumentText(pwNonRelDocText, cwNonRelDocText, FALSE);

            // At this point, we may have some zero weighted concepts in the
            // query. Remove any such concept, weight pairs.
            for (i = j = 0; i < m_cConWts;)
            {
                // search for the next zero weighted concept
                for (; j < m_cConWts && TermFreq(j) > 0; j++);
                i = j; // update i so that the outer loop terminates appropriately
                if (j < m_cConWts) // we found a zero weighted concept
                {
                    // search for the next non-zero weighted concept
                    for (k = j + 1; k < m_cConWts && TermFreq(k) == 0; k++);
                    if (k < m_cConWts) // we found a non-zero weighted concept
                    {
                        // copy the con,wt pair
                        Concept(j) = Concept(k);
                        TermFreq(j) = TermFreq(k);
                        // erase the copied pair
                        TermFreq(k) = 0;
                        j++; // update j so that the for loop advances
                    }
                    else // no more non-zero weighted concepts. we are done.
                        i = k;
                }
            }
            ASSERT(i <= m_cConWts);

            // Count the new number of ConWt pairs
            for (m_cConWts = 0; TermFreq(m_cConWts) > 0; m_cConWts++);
        }
        if (!m_cConWts) __leave;

        // Now weight the query vector
        WeightVector(NEWTF_NONE, WT_TFIDF, NORM_COSINE);
        SortQuery();

        aQTermSummation = (LPDWORD)VAlloc(FALSE, m_cConWts * sizeof(DWORD));

        // Compute the summation of query terms. This summation will be used to compute the upper bounds.
        // aQTermSummation[j] gives the sum of the weights of query terms j through the last term in the
        // query. The << 16 left shift takes care of multiplication by 1, i.e. it makes each weight a
        // true 32-bit value. (Note the parentheses around the shift: << binds more loosely than +,
        // and i must be set before TermFreq(i) is read.)
        for (i = m_cConWts - 1, aQTermSummation[i] = TermFreq(i) << 16; i > 0; i--)
            aQTermSummation[i - 1] = aQTermSummation[i] + (TermFreq(i - 1) << 16);
        /*
        // scale the values to 24 bit
        for (i = 0; i < m_cConWts; i++)
            aQTermSummation[i] = aQTermSummation[i] >> 8;
        */
        /* IMPORTANT ASSUMPTION : The aInSimilarity array is properly initialized.
           Proper initialization includes resetting all docid and sim values to 0
           and the CollId field to the appropriate collection id.
           If aInSimilarity is not properly initialized, the docid, sim values will
           still be correct, but the caller will have no way of finding the collection
           id of the docid, sim values set here.
        */

        // Compute similarity. Walk the doc,wt list for each concept.
        // Compute until all terms are exhausted or the stopping conditions are met.
        // Skip terms that occur too frequently (how frequent is too frequent?).
        i = 0;
        do
        {
            // CODE TO SKIP TERMS THAT ARE TOO FREQUENT CAN APPEAR HERE
            // if (term is too frequent)
            // {
            //     i++;
            //     continue;
            // }

            cDocs = DocFromCumFreq(Concept(i));
            DocId = 0;
            startDocPos = DocList(Concept(i));

            // Consider each doc in the (Doc, Wt) list for this concept and score docs that
            // are in the predetermined hit list.
            for (j = 0; j < cDocs; j++)
            {
                // The first doc in an inverted list for a concept is encoded as docid + 1.
                // The subsequent gaps are encoded as they are.
                if (j == 0)
                    DocId = m_ptdb->PColl()->GetDocumentGap(&startDocPos) - 1;
                else
                    DocId += m_ptdb->PColl()->GetDocumentGap(&startDocPos);

                DocPos = GetDocPosInList2(aInSimilarity, cHits, DocId);

                // ALG : If RsetNotFull then
                if (cHits < cInMaxHits)
                {
                    // ALG : Compute C(Document);
                    // ALG : Enter Document into the RSet
                    if (DocPos == DOESNOTEXIST)
                    {
                        // Add this new document
                        DocPos = cHits;
                        aInSimilarity[DocPos].DocId = DocId;
                        cHits++;
                        aInSimilarity[DocPos].Similarity += TermFreq(i) * WtFromInvList(Concept(i), j);
                        /* If we scale similarity to 24 bits, use this line instead of the above
                        aInSimilarity[DocPos].Similarity += (TermFreq(i) * WtFromInvList(Concept(i), j)) >> 8;
                        */
                        if (aInSimilarity[DocPos].Similarity < aInSimilarity[LastDocInPos].Similarity)
                            LastDocInPos = DocPos;
                    }
                    else
                    {
                        // recompute the LastDocIn document if this document was LastDocIn before this cumulation
                        aInSimilarity[DocPos].Similarity += TermFreq(i) * WtFromInvList(Concept(i), j);
                        /* If we scale similarity to 24 bits, use this line instead of the above
                        aInSimilarity[DocPos].Similarity += (TermFreq(i) * WtFromInvList(Concept(i), j)) >> 8;
                        */
                        if (DocPos == LastDocInPos)
                            for (k = 0; k < cHits; k++)
                                if (aInSimilarity[k].Similarity < aInSimilarity[LastDocInPos].Similarity)
                                    LastDocInPos = k;
                    }
                }
                // ALG : else
                else
                {
                    // ALG : Compute Upperbound (Document)
                    // At this point we will also compute the partial similarity for this document
                    if (DocPos == DOESNOTEXIST)
                    {
                        CCurrentDoc = TermFreq(i) * WtFromInvList(Concept(i), j);
                        /* If we scale similarity to 24 bits, use this line instead of the above
                        CCurrentDoc = (TermFreq(i) * WtFromInvList(Concept(i), j)) >> 8;
                        */
                        UBCurrentDoc = aQTermSummation[i];
                    }
                    else
                    {
                        CCurrentDoc = aInSimilarity[DocPos].Similarity + (TermFreq(i) * WtFromInvList(Concept(i), j));
                        /* If we scale similarity to 24 bits, use this line instead of the above
                        CCurrentDoc = aInSimilarity[DocPos].Similarity + ((TermFreq(i) * WtFromInvList(Concept(i), j)) >> 8);
                        */
                        // The upper bound could exceed the maximum possible similarity value. We should protect
                        // against that by bounding the upper bound.
                        if ((MAXSIM - aInSimilarity[DocPos].Similarity) < aQTermSummation[i])
                            UBCurrentDoc = MAXSIM;
                        else
                            UBCurrentDoc = aInSimilarity[DocPos].Similarity + aQTermSummation[i];
                    }

                    // ALG : If U(Document) <= C(LastDoc) then
                    // ALG :     DoNotAllocate / Remove Document
                    // If the U <= C condition is met and the doc is already in, remove it
                    if (UBCurrentDoc <= aInSimilarity[LastDocInPos].Similarity)
                    {
                        // This document is a loser. Check to see if it is at least better than
                        // the first document outside the RSet.
                        if (CCurrentDoc > CFirstDocOut)
                            CFirstDocOut = CCurrentDoc;

                        // Remove this loser if it was already entered
                        if (DocPos != DOESNOTEXIST)
                        {
                            // remove the current document from the list
                            // remove by copying the document at the end into this document's position
                            aInSimilarity[DocPos].Similarity = aInSimilarity[cHits - 1].Similarity;
                            aInSimilarity[DocPos].DocId = aInSimilarity[cHits - 1].DocId;
                            cHits--;
                            ASSERT(cHits);

                            // Now that we changed the document set, recompute the last doc position
                            for (k = 0; k < cHits; k++)
                                if (aInSimilarity[k].Similarity < aInSimilarity[LastDocInPos].Similarity)
                                    LastDocInPos = k;
                        }
                    }
                    // ALG : else
                    // ALG :     Compute C (Document)
                    // ALG :     if C (Document) > C (LastDoc) then
                    // ALG :         Enter Document into the RSet
                    else
                    {
                        if (CCurrentDoc > aInSimilarity[LastDocInPos].Similarity)
                        {
                            if (DocPos == DOESNOTEXIST)
                            {
                                // Since the RSet is already full, the only way to enter the current document
                                // is by replacing the document at LastDocInPos - i.e. replacing the doc with
                                // the least partial match.
                                // Before replacing the LastDocIn, save it as the FirstDocOut.
                                CFirstDocOut = aInSimilarity[LastDocInPos].Similarity;
                                // Replace
                                aInSimilarity[LastDocInPos].DocId = DocId;
                                aInSimilarity[LastDocInPos].Similarity = CCurrentDoc;
                                // Now that we changed the document set, recompute the last doc position
                                for (k = 0; k < cHits; k++)
                                    if (aInSimilarity[k].Similarity < aInSimilarity[LastDocInPos].Similarity)
                                        LastDocInPos = k;
                            }
                            else
                            {
                                aInSimilarity[DocPos].Similarity = CCurrentDoc;
                                // recompute the LastDocIn document if this document was LastDocIn before this cumulation
                                if (DocPos == LastDocInPos)
                                    for (k = 0; k < cHits; k++)
                                        if (aInSimilarity[k].Similarity < aInSimilarity[LastDocInPos].Similarity)
                                            LastDocInPos = k;
                            }
                        }
                    }
                }
            }
            /* BEGIN : fix for BUG 18016 */
            if (cHits < cInMaxHits)
                UBFirstDocOut = 0xFFFFFFFF; // no doc outside the RSet, so the first doc out potentially has an infinite upper bound
            else
            /* END : fix for BUG 18016 */
                // Compute the upper bound of FirstDocOut
                UBFirstDocOut = CFirstDocOut + aQTermSummation[i];
            i++;

        // ALG : until LastQueryTerm or U(FirstDocOut) <= C(LastDocIn)
        // NOTE : We converted a repeat-until into a do-while, so the loop termination conditions differ
        // between the algorithm and the implementation.
        } while (i < m_cConWts && UBFirstDocOut > aInSimilarity[LastDocInPos].Similarity && TermFreq(i) > 0); // INTRODUCE MORE STOPPING CONDITIONS HERE

#if 0 // statistics
        if (i < m_cConWts)
        {
            char szBuffer[200];
            DWORD cDocsExamined = 0, cDocsNotExamined = 0;
            for (k = 0; k < i; k++)
                cDocsExamined += DocFromCumFreq(Concept(k));
            for (k = i; k < m_cConWts; k++)
                cDocsNotExamined += DocFromCumFreq(Concept(k));
            wsprintf(szBuffer, "Examined only %u lists out of %u lists and only %u docs out of %u docs", i, m_cConWts, cDocsExamined, cDocsExamined + cDocsNotExamined);
            MessageBox(GetFocus(), szBuffer, "Query Optimization", MB_OK);
        }
#endif // 0, statistics

        /* MOVE SORTING TO THE CALLER. THIS IS DONE TO ENABLE MULTIPLE FILE SEARCHES. THE CALLER WILL
           GET ALL THE RESULTS INTO A HUGE SIMSTRUCT ARRAY AND SORT IT.
           // sort the scored documents.
           qsort(aInSimilarity, cHits, sizeof(SimStruct), CompareSimStruct);
        */
    }
    __finally
    {
        if (aQTermSummation) VFree(aQTermSummation);
    }

    return cHits;
}
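
// A worked illustration of the upper-bound pruning above (hypothetical
// numbers; the scheme follows the ALG comments, a Buckley/Lewit-style
// stopping condition for inverted vector searches). Suppose the RSet is
// full, its weakest member (LastDocIn) has similarity 900, and we are on
// query term i with aQTermSummation[i] == 250, i.e. no document can gain
// more than 250 from this and the remaining (weight-sorted) terms. A
// document outside the RSet with partial score C == 600 has upper bound
// 600 + 250 = 850 <= 900, so it can never displace LastDocIn and is dropped
// without further scoring. Once even U(FirstDocOut) <= C(LastDocIn), the
// do-while exits early and the remaining inverted lists are never read.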
void CQuery::SortQuery()
{
    TempConWtStruct *aConWts = NULL;
    register DWORD i;

    __try
    {
        // Sort the query concept, wt pairs based on the weight of the concepts. This will be used
        // when we employ stop conditions to reduce the number of documents considered.
        // Since the concepts and weights are not in the same structure, we need to
        // copy them to a temporary buffer and then copy the sorted values back.
        aConWts = (TempConWtStruct *)VAlloc(FALSE, sizeof(TempConWtStruct) * m_cConWts);
        for (i = 0; i < m_cConWts; i++)
        {
            aConWts[i].ConceptId = Concept(i);
            aConWts[i].Weight = TermFreq(i);
        }
        qsort(aConWts, m_cConWts, sizeof(TempConWtStruct), CompareTempConWtStruct);
        for (i = 0; i < m_cConWts; i++)
        {
            Concept(i) = aConWts[i].ConceptId;
            TermFreq(i) = (WORD)aConWts[i].Weight;
        }
    }
    __finally
    {
        if (aConWts) VFree(aConWts);
    }
}
// Compare the weights of two TempConWtStructs, returning < 0, 0, or > 0 to make qsort sort in decreasing order
int _cdecl CompareTempConWtStruct(const void *arg1, const void *arg2)
{
    if (((TempConWtStruct *)arg2)->Weight > ((TempConWtStruct *)arg1)->Weight)
        return 1;
    else if (((TempConWtStruct *)arg2)->Weight < ((TempConWtStruct *)arg1)->Weight)
        return -1;
    else
        return 0;
}

// Compare the similarities of two SimStructs, returning < 0, 0, or > 0 to make qsort sort in decreasing order
int _cdecl CompareSimStruct(const void *arg1, const void *arg2)
{
    if (((SimStruct *)arg2)->Similarity > ((SimStruct *)arg1)->Similarity)
        return 1;
    else if (((SimStruct *)arg2)->Similarity < ((SimStruct *)arg1)->Similarity)
        return -1;
    else
        return 0;
}
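
// Note the deliberately swapped comparator arguments: testing arg2 against
// arg1 (instead of arg1 against arg2) makes qsort produce descending order.
// For example, weights {1, 3, 2} sort to {3, 2, 1}, putting the heaviest
// query terms first - the ordering the stopping conditions in
// RetrieveWithFeedback rely on.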
// ASSUMPTION : There are at least two elements in the list
__inline DWORD CQuery::GetDocPosInList(SimStruct *aInSimilarity, DWORD cInHits, DWORD DocId)
{
    register DWORD high = cInHits, low = 0, mid;

    while (low < high)
    {
        mid = low + (high - low) / 2;
        if (DocId < aInSimilarity[mid].DocId)
            high = mid;
        else if (DocId > aInSimilarity[mid].DocId)
            low = mid + 1;
        else
            return mid;
    }
    return DOESNOTEXIST;
}
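
// The binary search above also implicitly assumes the SimStruct array is
// sorted by ascending DocId (an assumption read off the comparisons, not
// stated in the original comment); RetrieveWithFeedback, whose hit list
// grows in arbitrary order, uses the linear GetDocPosInList2 below instead.
// E.g. with DocIds {3, 7, 9, 15}: probing for 9 hits it on the first
// midpoint (position 2), while probing for 8 narrows to an empty range and
// returns DOESNOTEXIST.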
__inline DWORD CQuery::GetDocPosInList2(SimStruct *aInSimilarity, DWORD cInHits, DWORD DocId)
{
    register DWORD i;

    for (i = 0; i < cInHits; i++)
        if (aInSimilarity[i].DocId == DocId)
            return i;

    // the doc has not been found
    return DOESNOTEXIST;
}
void CQuery::IndexDocumentText(PWCHAR pwDocText, int cwText, BOOL fRelevant)
{
    int n, nTokens, nMore;
    PUINT pwHash = NULL;
    PBYTE pbType = NULL;
    PWCHAR *paStart = NULL,
           *paEnd = NULL;
    PWCHAR pwText = pwDocText; // we will leave pwDocText untouched so that
                               // the caller can delete that memory buffer.
    DWORD ConId;

    nMore = cwText;
    ASSERT(pwText && cwText);

    __try
    {
        // cwText is probably a lot more than we need, but it guarantees that we
        // won't run out of memory for tokens
        pwHash = New UINT[cwText];
        pbType = New BYTE[cwText];
        paStart = New PWCHAR[cwText];
        paEnd = New PWCHAR[cwText];
        if (pwText && pwHash && paStart && pbType && paEnd)
        {
            nTokens = WordBreakW(&pwText, &nMore, paStart, paEnd, pbType, pwHash, cwText, REMOVE_SPACE_CHARS);
            for (n = 0; n < nTokens; n++)
            {
                // EnterWord with the last param set to TRUE only looks a word up; it does not enter it
                ConId = m_ptdb->PDict()->EnterWord(paStart[n], paEnd[n] - paStart[n], TRUE, TRUE);
                if (ConId != EOL && ConId != STOPWORD)
                    if (fRelevant)
                        RecordConcept(ConId);
                    else // not relevant
                    {
                        DWORD i;
                        // For each concept in the document, check to see if it exists
                        // in the query. If it does, subtract it from the query's term frequency.
                        for (i = 0; i < m_cConWts && Concept(i) != ConId; i++);
                        if (i < m_cConWts)
                            // This concept exists in the query. Subtract this term from the query.
                            if (TermFreq(i) > 0)
                                TermFreq(i) -= 1;
                    }
            }
        }
    }
    __finally
    {
        // these buffers were allocated as arrays, so release them with the array form of delete
        if (paEnd)   { delete [] paEnd;   paEnd = NULL; }
        if (paStart) { delete [] paStart; paStart = NULL; }
        if (pbType)  { delete [] pbType;  pbType = NULL; }
        if (pwHash)  { delete [] pwHash;  pwHash = NULL; }
    }
}
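
// End-to-end sketch of how the pieces above compose (illustrative only: the
// tokenize-and-RecordConcept step for the initial query text lives elsewhere
// in this codebase, and aSim/cHits are hypothetical caller-side names).
// Kept under #if 0 in the style of the other disabled blocks in this file.
#if 0
    CQuery *pQuery = CQuery::NewQuery(pts);                   // pts: an open CTextSet
    // ... RecordConcept(ConId) for each indexable token of the query text ...
    pQuery->WeightVector(NEWTF_NONE, WT_TFIDF, NORM_COSINE);  // tf-idf weights, cosine normalized
    pQuery->RankDocuments(aSim, cHits);                       // score a prebuilt, DocId-sorted hit list
    qsort(aSim, cHits, sizeof(SimStruct), CompareSimStruct);  // sorting is the caller's job
    delete pQuery;
#endif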