Windows NT 4.0 source code leak
#include "stdafx.h"
#include <math.h>
#include "vmbuffer.h"
#include "memex.h"
#include "saveload.h"
#include "TXDBase.h"
#include "bytemaps.h"
#include "textset.h"
#include "dict.h"
#include "vector.h"
#include "query.h"
// This file contains the definition of class CQuery
// Constructor
/*************************************************************************
 * FUNCTION     : CQuery::CQuery
 *
 * RETURNS      : Nothing (constructor).
 *
 * PURPOSE      : Construct an empty query with no con,wt pairs and no
 *                query text.
 *
 * PARAMETERS   : None.
 *
 * SIDE EFFECTS : None.
 *
 * HISTORY      :
 *
 *   Author      Date      Action
 *   ------      ----      ------
 *   KrishnaN    4/23/94   Creation.
 *
 *************************************************************************/
CQuery::CQuery()
{
m_cConWts = 0;
#if 0
m_cOverFlows = 0;
#endif
m_pszQueryText = NULL;
}
// Destructor
/*************************************************************************
 * FUNCTION     : CQuery::~CQuery
 *
 * RETURNS      : Nothing (destructor).
 *
 * PURPOSE      : Free the virtual buffers backing the concept, term
 *                frequency, and weight vectors, if they were allocated.
 *
 * PARAMETERS   : None.
 *
 * SIDE EFFECTS : None.
 *
 * HISTORY      :
 *
 *   Author      Date      Action
 *   ------      ----      ------
 *   KrishnaN    4/23/94   Creation.
 *
 *************************************************************************/
CQuery::~CQuery()
{
if (m_vbVectorConcept.Base)
FreeVirtualBuffer(&m_vbVectorConcept);
if (m_vbVectorTermFreq.Base)
FreeVirtualBuffer(&m_vbVectorTermFreq);
if (m_vbVectorWt.Base)
FreeVirtualBuffer(&m_vbVectorWt);
#if 0
if (m_vbTFOverFlow.Base)
FreeVirtualBuffer(&m_vbTFOverFlow);
#endif
}
CQuery *CQuery::NewQuery(CTextSet *pts)
{
CQuery *pQuery= NULL;
__try
{
pQuery= New CQuery;
pQuery->Initialize(pts, 100, 100000);
}
__finally
{
if (_abnormal_termination() && pQuery)
{
delete pQuery; pQuery= NULL;
}
}
return pQuery;
}
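// Illustrative usage sketch (editorial, not part of the original source; pts
// stands for an already-built CTextSet). The __finally block above releases
// the half-built object if Initialize raises, but the exception then keeps
// unwinding, so a caller that wants a NULL result has to catch it:
//
//     CQuery *pq = NULL;
//     __try    { pq = CQuery::NewQuery(pts); }
//     __except (EXCEPTION_EXECUTE_HANDLER) { pq = NULL; }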
// Access Functions:
/*************************************************************************
 * FUNCTION     : CQuery::Initialize
 *
 * RETURNS      : Nothing.
 *
 * PURPOSE      : Bind the query to a text set and create the virtual
 *                buffers that back the concept, term frequency, and
 *                weight vectors.
 *
 * PARAMETERS   : textsetIn        - text set to run queries against.
 *                cInEstConWtPairs - estimated con,wt pair count
 *                                   (committed up front).
 *                cInMaxConWtPairs - maximum con,wt pair count (reserved).
 *
 * SIDE EFFECTS : Caches the collection's document count, concept buffer,
 *                and weighted inverted index.
 *
 * HISTORY      :
 *
 *   Author      Date      Action
 *   ------      ----      ------
 *   KrishnaN    4/23/94   Creation.
 *
 *************************************************************************/
void CQuery::Initialize(CTextSet *textsetIn, DWORD cInEstConWtPairs, DWORD cInMaxConWtPairs)
{
ASSERT(textsetIn != NULL);
m_ptdb = textsetIn;
m_cDocuments = m_ptdb->PColl()->m_cDocuments;
ASSERT(m_cDocuments != 0);
m_vbConcepts = m_ptdb->PColl()->m_vbConcepts;
ASSERT(m_vbConcepts.Base);
m_aWtInvIndex = m_ptdb->PColl()->m_aWtInvIndex;
ASSERT(m_aWtInvIndex);
CreateVirtualBuffer(&m_vbVectorConcept , cInEstConWtPairs * sizeof(DWORD), cInMaxConWtPairs * sizeof(DWORD));
CreateVirtualBuffer(&m_vbVectorTermFreq, cInEstConWtPairs * sizeof(DWORD), cInMaxConWtPairs * sizeof(DWORD));
CreateVirtualBuffer(&m_vbVectorWt , 0 , cInMaxConWtPairs * sizeof(float));
#if 0
CreateVirtualBuffer(&m_vbTFOverFlow , 0 , 0x4000 * sizeof(TFOverFlowStruct));
#endif
// Initialize allocated memory
// VirtualAlloc zeroes all memory it commits, so we don't have to worry about zeroing the virtual buffers
}
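// Sizing note (editorial; the authoritative contract is in vmbuffer.h): the
// second CreateVirtualBuffer argument appears to be the initially committed
// size and the third the reserved ceiling, so the first call above
//
//     CreateVirtualBuffer(&m_vbVectorConcept,
//                         cInEstConWtPairs * sizeof(DWORD),   // committed now
//                         cInMaxConWtPairs * sizeof(DWORD));  // reserved max
//
// commits room for the estimated pair count up front while only reserving
// address space for the maximum; VirtualBufferExceptionFilter (used in
// RecordConcept below) commits further pages on first touch.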
/*************************************************************************
 * FUNCTION     : CQuery::RecordConcept
 *
 * RETURNS      : Nothing.
 *
 * PURPOSE      : Enter a concept into the query, or increment its term
 *                frequency if it is already present (saturating at
 *                0xFFFF).
 *
 * PARAMETERS   : ConceptId - id of the concept to record.
 *
 * SIDE EFFECTS : May grow m_cConWts; raises STATUS_NO_MEMORY if a vector
 *                cannot be extended.
 *
 * HISTORY      :
 *
 *   Author      Date      Action
 *   ------      ----      ------
 *   KrishnaN    4/23/94   Creation.
 *
 *************************************************************************/
void CQuery::RecordConcept(DWORD ConceptId)
{
// Search for this concept id in the current document. If you find it,
// simply increment its frequency and that will take care of everything.
// If you don't find it, then enter the concept for the document.
DWORD i; // index of the con,wt pair being considered for match
for (i = 0; i < m_cConWts && Concept(i) != ConceptId; i++);
if (i == m_cConWts)
{
// This concept doesn't exist in the query. Record it.
__try
{
Concept(m_cConWts) = ConceptId;
}
__except (VirtualBufferExceptionFilter(GetExceptionCode(), GetExceptionInformation(), &m_vbVectorConcept))
{
RaiseException(STATUS_NO_MEMORY, EXCEPTION_NONCONTINUABLE, 0, NULL);
}
__try
{
TermFreq(m_cConWts) = 1; // this is the first time this concept occurred for this document
}
__except (VirtualBufferExceptionFilter(GetExceptionCode(), GetExceptionInformation(), &m_vbVectorTermFreq))
{
RaiseException(STATUS_NO_MEMORY, EXCEPTION_NONCONTINUABLE, 0, NULL);
}
m_cConWts++;
}
else
{
// Term already exists in this document. Increase the occurrence frequency.
// Since the term already exists in the document, it has a frequency of at least 1
#if 0
// The only time when the value can be 0 is when the frequency has exceeded 0xFFFF. In
// that case, the overflowing value is stored in the over flow area
if (TermFreq(i) == 0)
{
// go to the over flow area and update the value that tracks this term frequency
}
else
#endif
if (TermFreq(i) == 0xFFFF)
{
// we reached the upperbound on this value.
}
else // normal case. No overflow is involved. This is what happens MOST of the time.
(TermFreq(i))++;
}
}
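// Worked trace (editorial example): starting from an empty query,
//
//     RecordConcept(42);   // new concept   -> Concept(0) = 42, TermFreq(0) = 1
//     RecordConcept(42);   // already known -> TermFreq(0) = 2
//     RecordConcept(7);    // new concept   -> Concept(1) = 7,  TermFreq(1) = 1
//
// and a frequency that has already reached 0xFFFF simply stays there.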
/*************************************************************************
 * FUNCTION     : CQuery::WeightVector
 *
 * RETURNS      : TRUE.
 *
 * PURPOSE      : Convert the raw term frequencies to floating point
 *                weights, apply the requested weighting scheme, and
 *                store the results back as fixed point values between
 *                0 and WT_ONE.
 *
 * PARAMETERS   : TFModType, WeightType, NormType - weighting scheme
 *                selectors passed through to ApplyWeightingScheme.
 *
 * SIDE EFFECTS : Overwrites the term frequency vector with fixed point
 *                weights.
 *
 * HISTORY      :
 *
 *   Author      Date      Action
 *   ------      ----      ------
 *   KrishnaN    4/23/94   Creation.
 *
 *************************************************************************/
// ASSUMPTION : We are only weighting one query vector. This will hold true all the time.
BOOL CQuery::WeightVector(BYTE TFModType, BYTE WeightType, BYTE NormType)
{
DWORD i;
// Copy the Term Frequencies into an array of floating points. All operations will be computed
// on these floating point weights. The final results can then be converted to a fixed point.
// IMPORTANT : ALL WEIGHTS SHOULD BE NORMALIZED TO ENSURE THAT EACH WEIGHT IS LESS THAN ONE.
// THE FIXED POINT VALUE ONLY REPRESENTS VALUES BETWEEN 0.0 AND 1.0
for (i = 0; i < m_cConWts; i++)
{
__try
{
TermWt(i) = (float)GetRealTermFreq(i);
}
__except (VirtualBufferExceptionFilter(GetExceptionCode(), GetExceptionInformation(), &m_vbVectorWt))
{
RaiseException(STATUS_NO_MEMORY, EXCEPTION_NONCONTINUABLE, 0, NULL);
}
}
ApplyWeightingScheme(TFModType, WeightType, NormType, 0, m_cConWts);
// Plug back the weighted values into the term frequency array
// ASSUMPTION : Each weight in TermWt is between 0.0 and 1.0
// Multiplying this with WT_ONE forces each TermWt weight to be a
// fixed point number ranging between 0 and WT_ONE.
for (i = 0; i < m_cConWts; i++)
TermFreq(i) = (WORD)((double)TermWt(i) * (double)WT_ONE);
return TRUE;
}
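// Fixed-point example (editorial; WT_ONE is the scale constant defined
// elsewhere): after cosine normalization every TermWt(i) lies in [0.0, 1.0],
// so a weight of 0.25 is stored as
//
//     TermFreq(i) = (WORD)(0.25 * (double)WT_ONE);
//
// i.e. a 16-bit fixed-point fraction of WT_ONE.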
/*************************************************************************
 * FUNCTION     : CQuery::ApplyWeightingScheme
 *
 * RETURNS      : Nothing.
 *
 * PURPOSE      : Apply the term frequency modification, collection
 *                frequency weighting, and normalization components of
 *                a weighting scheme to a run of term weights.
 *
 * PARAMETERS   : TFModType   - NEWTF_* term frequency component.
 *                WeightType  - WT_* collection frequency component.
 *                NormType    - NORM_* normalization component.
 *                iFirstConWt - index of the first con,wt pair to weight.
 *                cConWts     - number of con,wt pairs to weight.
 *
 * SIDE EFFECTS : None beyond rewriting the term weights.
 *
 * HISTORY      :
 *
 *   Author      Date      Action
 *   ------      ----      ------
 *   KrishnaN    4/23/94   Creation.
 *
 *************************************************************************/
void CQuery::ApplyWeightingScheme(BYTE TFModType, BYTE WeightType, BYTE NormType, DWORD iFirstConWt, DWORD cConWts)
{
register DWORD i;
double Wt; // used to hold different types of cumulative values at various points in the computations
// First modify weight based on the term frequency component
switch (TFModType)
{
case NEWTF_NONE: // do nothing
break;
case NEWTF_BINARY: // Since all the terms are in, turn them on
for (i = 0; i < cConWts; i++)
TermWt(i) = (float)1.0;
break;
case NEWTF_MAXNORM:
Wt = 0.0;
for (i = 0; i < cConWts; i++)
if (TermWt(i) > Wt)
Wt = TermWt(i);
// increase Max by 0.00001 to place all normalized TFs between 0.0 and 1.0
Wt += 0.00001;
for (i = 0; i < cConWts; i++)
TermWt(i) = (float) ((double)TermWt(i) / Wt);
break;
case NEWTF_AUGNORM:
Wt = 0.0;
for (i = 0; i < cConWts; i++)
if (TermWt(i) > Wt)
Wt = TermWt(i);
// increase Max by 0.00001 to place all normalized TFs between 0.0 and 1.0
Wt += 0.00001;
for (i = 0; i < cConWts; i++)
TermWt(i) = (float) (0.5 + 0.5 * (double)TermWt(i) / Wt);
break;
default:
// Assertion failure: unknown TFModType.
break;
}
// Then modify the weight based on the collection frequency component
switch (WeightType)
{
case WT_NONE: // do nothing
break;
// if a concept occurs in all docs, let's assign it a small value instead of assigning it a 0.0
case WT_TFIDF:
for (i = 0; i < cConWts; i++)
if (m_cDocuments == DocFromCumFreq(Concept(i + iFirstConWt)))
TermWt(i) = (float) 0.005;
else
TermWt(i) = (float) ((double)TermWt(i) * log((double)m_cDocuments / (double)DocFromCumFreq(Concept(i + iFirstConWt))));
break;
case WT_PROB:
for (i = 0; i < cConWts; i++)
if (m_cDocuments == DocFromCumFreq(Concept(i + iFirstConWt)))
TermWt(i) = (float) 0.005;
else
TermWt(i) = (float) ((double)TermWt(i) * log((double)(m_cDocuments - DocFromCumFreq(Concept(i + iFirstConWt))) / (double)DocFromCumFreq(Concept(i + iFirstConWt))));
break;
default:
// Assertion failure: unknown WeightType.
break;
}
switch (NormType)
{
case NORM_NONE:
break;
case NORM_SUM:
Wt = 0.0;
for (i = 0; i < cConWts; i++)
Wt += (double) TermWt(i);
for (i = 0; i < cConWts; i++)
TermWt(i) = (float) ((double)TermWt(i) / Wt);
break;
case NORM_COSINE:
Wt = 0.0;
// compute sum of squares of weights in the vector
for (i = 0; i < cConWts; i++)
Wt += TermWt(i) * TermWt(i);
Wt = sqrt(Wt);
// normalize each weight by the sum of squares computed above
for (i = 0; i < cConWts; i++)
TermWt(i) = (float) ((double)TermWt(i) / Wt);
break;
case NORM_MAX: // not implemented; weights are left unchanged
break;
}
}
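// The schemes above in formula form (editorial summary; N = m_cDocuments,
// df = DocFromCumFreq(concept), tf = raw term frequency, log is natural):
//
//     NEWTF_MAXNORM : w = tf / (max_tf + 0.00001)
//     NEWTF_AUGNORM : w = 0.5 + 0.5 * tf / (max_tf + 0.00001)
//     WT_TFIDF      : w = w * log(N / df)           (0.005 when df == N)
//     WT_PROB       : w = w * log((N - df) / df)    (0.005 when df == N)
//     NORM_SUM      : w = w / (sum of all w)
//     NORM_COSINE   : w = w / sqrt(sum of all w squared)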
/*************************************************************************
 * FUNCTION     : CQuery::RankDocuments
 *
 * RETURNS      : FALSE (with the last error set) on bad input;
 *                otherwise the nonzero hit count.
 *
 * PURPOSE      : Accumulate query-document similarities by walking the
 *                inverted doc,wt list of every nonzero-weighted query
 *                concept, scoring only documents already present in the
 *                hit list.
 *
 * PARAMETERS   : aInSimilarity - predetermined hit list to score.
 *                cInHits       - number of entries in aInSimilarity.
 *
 * SIDE EFFECTS : Resets and then accumulates the Similarity fields of
 *                aInSimilarity.
 *
 * HISTORY      :
 *
 *   Author      Date      Action
 *   ------      ----      ------
 *   KrishnaN    4/23/94   Creation.
 *
 *************************************************************************/
BOOL CQuery::RankDocuments(SimStruct *aInSimilarity, DWORD cInHits)
{
register DWORD i, j;
DWORD ConceptId, DocId;
DWORD cDocs;
DWORD DocPos; // tracks the position of a document in the similarity structure
DWORD startDocPos;
if (cInHits == 0)
{
SetLastError(QUERYERROR_NOHITS);
return FALSE;
}
if (aInSimilarity == NULL)
{
SetLastError(QUERYERROR_EMPTYSIMARRAY);
return FALSE;
}
// ASSUME THAT THE SIMILARITY STRUCTURE ARRAY HAS ENOUGH ENTRIES TO SUPPORT cInHits
// Zero out any existing similarity values
for (i = 0; i < cInHits; i++)
aInSimilarity[i].Similarity = 0;
// Compute similarity. Walk the doc,wt list for each concept
for (i = 0; i < m_cConWts; i++)
{
// Ignore concepts that have a zero weight. Later, we may want to extend this idea
// to suppress weights below a small value.
if (TermFreq(i) == 0)
continue;
ConceptId = Concept(i);
cDocs = DocFromCumFreq(ConceptId);
// Consider each doc in the (Doc, Wt) list for this concept and score docs that
// are in the predetermined hit list.
startDocPos = DocList(ConceptId); // get the starting point of the inverted list.
for (j = 0; j < cDocs; j++)
{
if (j == 0)
DocId = m_ptdb->PColl()->GetDocumentGap(&startDocPos) - 1;
else
DocId += m_ptdb->PColl()->GetDocumentGap(&startDocPos);
DocPos = GetDocPosInList(aInSimilarity, cInHits, DocId);
if (DocPos != DOESNOTEXIST)
aInSimilarity[DocPos].Similarity += TermFreq(i) * WtFromInvList(ConceptId, j);
/* IF WE LIMIT SIMILARITY TO 24 BITS, USE THE FOLLOWING LINE. IF WE LIMIT TO ANY OTHER NUMBER
OF BITS n, n < 32, RIGHT SHIFT THE RHS BY 32 - n.
aInSimilarity[DocPos].Similarity += (TermFreq(i) * WtFromInvList(ConceptId, j)) >> 8;
*/
}
}
/* MOVE SORTING TO THE CALLER!
// sort the scored documents.
qsort(aInSimilarity, cInHits, sizeof(SimStruct), CompareSimStruct);
*/
// return the number of hits (nonzero here, so callers testing the BOOL result see success)
return cInHits;
}
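// Gap-decoding example (editorial): an inverted list whose stored gap values
// are 5, 3, 7 decodes to doc ids
//
//     DocId = 5 - 1 = 4      // first entry is encoded as docid + 1
//     DocId = 4 + 3 = 7      // subsequent entries are plain gaps
//     DocId = 7 + 7 = 14
//
// which is why only the j == 0 case above subtracts one.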
/*************************************************************************
 * FUNCTION     : CQuery::RetrieveWithFeedback
 *
 * RETURNS      : The number of hits stored in aInSimilarity.
 *
 * PURPOSE      : Expand the query with terms from relevant document
 *                text, optionally subtract terms of the top non-relevant
 *                document, reweight, and retrieve the best cInMaxHits
 *                documents, pruning inverted lists with upper bounds.
 *
 * PARAMETERS   : aInSimilarity    - output array of docid,sim pairs.
 *                cInMaxHits       - maximum documents to retrieve.
 *                pwRelDocText,
 *                cwRelDocText     - relevant document text and length.
 *                pwNonRelDocText,
 *                cwNonRelDocText  - non-relevant document text and
 *                                   length (may be NULL/0).
 *
 * HISTORY      :
 *
 *   Author      Date      Action
 *   ------      ----      ------
 *   KrishnaN    4/23/94   Creation.
 *
 *************************************************************************/
// cInMaxHits is the maximum number of documents to retrieve; cHits, below, counts the hits accumulated so far
DWORD CQuery::RetrieveWithFeedback( SimStruct *aInSimilarity, DWORD cInMaxHits,
PWCHAR pwRelDocText, int cwRelDocText,
PWCHAR pwNonRelDocText, int cwNonRelDocText
)
{
DWORD i, j, k;
DWORD DocId;
DWORD cHits = 0;
DWORD cDocs, DocPos;
DWORD LastDocInPos = 0; // Document position which has the least partial similarity match
DWORD UBCurrentDoc; // Upper bound of the current document
DWORD CCurrentDoc; // C (CurrentDoc)
DWORD CFirstDocOut = 0; // C (FirstDocOut)
DWORD UBFirstDocOut; // Upper bound of the first document outside the RSet
LPDWORD aQTermSummation= NULL; // summation of query terms
DWORD startDocPos;
ASSERT(aInSimilarity);
ASSERT(pwRelDocText && cwRelDocText);
__try
{
// Add terms from the query. We will either have to reindex the initial query
// or store the term frequencies the first time they were computed.
// Assume that each term only occurs once in the query.
// This assumption usually holds good for queries typed in by the user.
// It doesn't matter much even if it doesn't hold good because the document
// text overwhelms the original query.
for (i = 0; i < m_cConWts; i++) // Enforce the above assumption.
TermFreq(i) = 1;
// Add terms from the relevant documents to the query
IndexDocumentText(pwRelDocText, cwRelDocText, TRUE);
// For the non-relevant document text, decrease termfreqs of concepts it has in common with
// the newly formed query.
// NOTE : The caller should pass in the document text of only the highest ranked
// non-relevant document to get the best results (Dec-Hi relevance feedback method).
if (pwNonRelDocText && cwNonRelDocText)
{
IndexDocumentText(pwNonRelDocText, cwNonRelDocText, FALSE);
// At this point, we may have some zero weighted concepts in the
// query. Remove any such concept, weight pairs.
for (i = j = 0; i < m_cConWts;)
{
// search for the next zero weighted concept
for (; j < m_cConWts && TermFreq(j) > 0; j++);
i = j; // update i so that outer loop terminates appropriately
if (j < m_cConWts) // we found a zero weighted concept
{
// search for the next non-zero weighted concept
for (k = j + 1; k < m_cConWts && TermFreq(k) == 0; k++);
if (k < m_cConWts) // we found a non-zero weighted concept
{
// copy the con,wt pair
Concept(j) = Concept(k);
TermFreq(j) = TermFreq(k);
// erase the copied pair
TermFreq(k) = 0;
j++; // update j so that the for loop advances
}
else // no more non-zero weighted concepts. we are done.
i = k;
}
}
ASSERT(i <= m_cConWts);
// Count the new number of ConWt pairs
for (m_cConWts = 0; TermFreq(m_cConWts) > 0; m_cConWts++);
}
if (!m_cConWts) __leave;
// Now weight the query vector
WeightVector(NEWTF_NONE, WT_TFIDF, NORM_COSINE);
SortQuery();
aQTermSummation = (LPDWORD)VAlloc(FALSE, m_cConWts * sizeof(DWORD));
// Compute summation of query terms. This summation will be used to compute the upperbound.
// aQTermSummation[j] is the sum of weights of query terms j through the last term in the query.
// The << 16 left shift amounts to multiplying by a maximal (1.0) inverted list weight,
// making each entry a true 32-bit value comparable to the similarity products.
for (aQTermSummation[m_cConWts - 1] = TermFreq(m_cConWts - 1) << 16, i = m_cConWts - 1; i > 0; i--)
aQTermSummation[i - 1] = aQTermSummation[i] + (TermFreq(i - 1) << 16);
/*
// scale the values to 24 bit
for (i = 0; i < m_cConWts; i++)
aQTermSummation[i] = aQTermSummation[i] >> 8;
*/
/* IMPORTANT ASSUMPTION : The aInSimilarity array is properly initialized.
Proper initialization includes resetting all docid and sim values to 0
and the CollId field to the appropriate collection id.
If aInSimilarity is not properly initialized, the docid, sim values will
still be correct, but the caller will have no way of finding the collection
id of the docid, sim values set here.
*/
// Compute similarity. Walk the doc,wt list for each concept
// Compute until all terms are exhausted or the stopping conditions are met
// Skip terms that occur too frequently (how frequent is too frequent ?)
i = 0;
do
{
// CODE TO SKIP TERMS THAT ARE TOO FREQUENT CAN APPEAR HERE
// if (term is too frequent)
// {
// i++;
// continue;
// }
cDocs = DocFromCumFreq(Concept(i));
DocId = 0;
startDocPos = DocList(Concept(i));
// Consider each doc in the (Doc, Wt) list for this concept and score docs that
// are in the predetermined hit list.
for (j = 0; j < cDocs; j++)
{
// The first doc in an inverted list for a concept is encoded as docid + 1. The subsequent
// gaps are encoded as they are.
if (j == 0)
DocId = m_ptdb->PColl()->GetDocumentGap(&startDocPos) - 1;
else
DocId += m_ptdb->PColl()->GetDocumentGap(&startDocPos);
DocPos = GetDocPosInList2(aInSimilarity, cHits, DocId);
// ALG : If RsetNotFull then
if (cHits < cInMaxHits)
{
// ALG : Compute C(Document);
// ALG : Enter Document into the RSet
if (DocPos == DOESNOTEXIST)
{
// Add this new document
DocPos = cHits;
aInSimilarity[DocPos].DocId = DocId;
cHits++;
aInSimilarity[DocPos].Similarity += TermFreq(i) * WtFromInvList(Concept(i), j);
/* If we scale similarity to 24 bits, use this line instead of the above
aInSimilarity[DocPos].Similarity += (TermFreq(i) * WtFromInvList(Concept(i), j)) >> 8;
*/
if (aInSimilarity[DocPos].Similarity < aInSimilarity[LastDocInPos].Similarity)
LastDocInPos = DocPos;
}
else
{
// recompute the LastDocIn document if this document was LastDocIn before this cumulation
aInSimilarity[DocPos].Similarity += TermFreq(i) * WtFromInvList(Concept(i), j);
/* If we scale similarity to 24 bits, use this line instead of the above
aInSimilarity[DocPos].Similarity += (TermFreq(i) * WtFromInvList(Concept(i), j)) >> 8;
*/
if (DocPos == LastDocInPos)
for (k = 0; k < cHits; k++)
if (aInSimilarity[k].Similarity < aInSimilarity[LastDocInPos].Similarity)
LastDocInPos = k;
}
}
// ALG : else
else
{
// ALG : Compute Upperbound (Document)
// At this point we will also compute the partial similarity for this document
if (DocPos == DOESNOTEXIST)
{
CCurrentDoc = TermFreq(i) * WtFromInvList(Concept(i), j);
/* If we scale similarity to 24 bits, use this line instead of the above
CCurrentDoc = (TermFreq(i) * WtFromInvList(Concept(i), j)) >> 8;
*/
UBCurrentDoc = aQTermSummation[i];
}
else
{
CCurrentDoc = aInSimilarity[DocPos].Similarity + (TermFreq(i) * WtFromInvList(Concept(i), j));
/* If we scale similarity to 24 bits, use this line instead of the above
CCurrentDoc = aInSimilarity[DocPos].Similarity + ((TermFreq(i) * WtFromInvList(Concept(i), j)) >> 8);
*/
// The upper bound could exceed the maximum possible similarity value. We should protect
// against that by bounding the upper bound.
if ((MAXSIM - aInSimilarity[DocPos].Similarity) < aQTermSummation[i])
UBCurrentDoc = MAXSIM;
else
UBCurrentDoc = aInSimilarity[DocPos].Similarity + aQTermSummation[i];
}
// ALG : If U(Document) <= C(LastDoc) then
// ALG : DoNotAllocate / Remove Document
// If the U <= C(LastDocIn) condition is met and the doc is already in, remove it
if (UBCurrentDoc <= aInSimilarity[LastDocInPos].Similarity)
{
// This document is a loser. Check to see if it is at least better than
// the first document outside the RSet.
if (CCurrentDoc > CFirstDocOut)
CFirstDocOut = CCurrentDoc;
// Remove this loser if it was already entered
if (DocPos != DOESNOTEXIST)
{
// remove current document from the list
// remove by copying the document at the end into this document's position
aInSimilarity[DocPos].Similarity = aInSimilarity[cHits - 1].Similarity;
aInSimilarity[DocPos].DocId = aInSimilarity[cHits - 1].DocId;
cHits--;
ASSERT (cHits);
// Now that we changed the document set, recompute the last doc position
for (k = 0; k < cHits; k++)
if (aInSimilarity[k].Similarity < aInSimilarity[LastDocInPos].Similarity)
LastDocInPos = k;
}
}
// ALG : else
// ALG : Compute C (Document)
// ALG : if (C (Document) > C (LastDoc) then
// ALG : Enter Document into the RSet
else
{
if (CCurrentDoc > aInSimilarity[LastDocInPos].Similarity)
{
if (DocPos == DOESNOTEXIST)
{
// Since the RSet is already full, the only way to enter the current document
// is by replacing the document at the LastDocInPos - i.e replacing the doc with
// the least partial match
// Before replacing the LastDocIn, let us save it as the FirstDocOut
CFirstDocOut = aInSimilarity[LastDocInPos].Similarity;
// Replace
aInSimilarity[LastDocInPos].DocId = DocId;
aInSimilarity[LastDocInPos].Similarity = CCurrentDoc;
// Now that we changed the document set, recompute the last doc position
for (k = 0; k < cHits; k++)
if (aInSimilarity[k].Similarity < aInSimilarity[LastDocInPos].Similarity)
LastDocInPos = k;
}
else
{
aInSimilarity[DocPos].Similarity = CCurrentDoc;
// recompute the LastDocIn document if this document was LastDocIn before this cumulation
if (DocPos == LastDocInPos)
for (k = 0; k < cHits; k++)
if (aInSimilarity[k].Similarity < aInSimilarity[LastDocInPos].Similarity)
LastDocInPos = k;
}
}
}
}
}
/* BEGIN : Fix FOR BUG 18016 */
if (cHits < cInMaxHits)
UBFirstDocOut = 0xFFFFFFFF; // No doc outside the RSet yet, so the first doc out potentially has an infinite upperbound
else
/* END : fix for BUG 18016 */
// Compute upper bound of FirstDocOut
UBFirstDocOut = CFirstDocOut + aQTermSummation[i];
i++;
// ALG : until LastQueryTerm or U(FirstDocOut) <= C(LastDocIn)
// NOTE : We converted a repeat - until into a do - while, so the loop termination conditions are different
// between algorithm and the implementation.
} while (i < m_cConWts && UBFirstDocOut > aInSimilarity[LastDocInPos].Similarity && TermFreq(i) > 0 ); // INTRODUCE MORE STOPPING CONDITIONS HERE
#if 0 // statistics
if ( i < m_cConWts )
{
char szBuffer[200];
DWORD cDocsExamined = 0, cDocsNotExamined = 0;
for (k = 0; k < i; k++)
cDocsExamined += DocFromCumFreq(Concept(k));
for (k = i; k < m_cConWts; k++)
cDocsNotExamined += DocFromCumFreq(Concept(k));
wsprintf(szBuffer, "Examined only %u lists out of %u lists and only %u docs out of %u docs", i, m_cConWts, cDocsExamined, cDocsExamined+cDocsNotExamined);
MessageBox(GetFocus(), szBuffer, "Query Optimization", MB_OK);
}
#endif // 0, statistics
/* MOVE SORTING TO THE CALLER. THIS IS DONE TO ENABLE MULTIPLE FILE SEARCHES. THE CALLER WILL
GET ALL THE RESULTS INTO A HUGE SIMSTRUCT ARRAY AND SORT IT
// sort the scored documents.
qsort(aInSimilarity, cHits, sizeof(SimStruct), CompareSimStruct);
*/
}
__finally
{
if (aQTermSummation) VFree(aQTermSummation);
}
return cHits;
}
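// Pruning sketch (editorial, with made-up numbers): SortQuery orders the
// query terms by decreasing weight, and aQTermSummation[i] bounds what any
// document can still gain from terms i onward. Suppose the RSet is full with
// C(LastDocIn) = 900, the best rejected score so far is CFirstDocOut = 300,
// and the remaining terms sum to aQTermSummation[i] = 500. Then
// UBFirstDocOut = 300 + 500 = 800 <= 900, so the do-while above exits early:
// no document outside the RSet can overtake the weakest document inside it.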
void CQuery::SortQuery()
{
TempConWtStruct * aConWts = NULL;
register DWORD i;
__try
{
// Sort the query concept, wt pairs based on the weight of the concepts. This will be used
// when we employ stop conditions to reduce the number of documents considered.
// Since the concepts and weights are not in the same structure, we need to
// copy them to a temporary buffer and then copy the sorted values back
aConWts = (TempConWtStruct *) VAlloc(FALSE, sizeof(TempConWtStruct) * m_cConWts);
for (i = 0; i < m_cConWts; i++)
{
aConWts[i].ConceptId = Concept(i);
aConWts[i].Weight = TermFreq(i);
}
qsort(aConWts, m_cConWts, sizeof(TempConWtStruct), CompareTempConWtStruct);
for (i = 0; i < m_cConWts; i++)
{
Concept(i) = aConWts[i].ConceptId;
TermFreq(i) = (WORD)aConWts[i].Weight;
}
}
__finally
{
if (aConWts) VFree(aConWts);
}
}
// Compare two TempConWtStructs and return 0, < 0, or > 0 to help qsort sort in decreasing order of weight
int _cdecl CompareTempConWtStruct(const void *arg1, const void *arg2)
{
if (((TempConWtStruct *)arg2)->Weight > ((TempConWtStruct *)arg1)->Weight)
return 1;
else if (((TempConWtStruct *)arg2)->Weight < ((TempConWtStruct *)arg1)->Weight)
return -1;
else
return 0;
}
// Compare two SimStructs and return 0, < 0, or > 0 to help qsort sort in decreasing order of similarity
int _cdecl CompareSimStruct(const void *arg1, const void *arg2)
{
if (((SimStruct *)arg2)->Similarity > ((SimStruct *)arg1)->Similarity)
return 1;
else if (((SimStruct *)arg2)->Similarity < ((SimStruct *)arg1)->Similarity)
return -1;
else
return 0;
}
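// Usage note (editorial): because both comparators return a positive value
// when arg2 outranks arg1, qsort produces a decreasing order, e.g.
//
//     qsort(aSim, cHits, sizeof(SimStruct), CompareSimStruct);
//     // aSim[0] now holds the largest Similarity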
// ASSUMPTION : The list is sorted by ascending DocId and there are at least two elements in it
__inline DWORD CQuery::GetDocPosInList(SimStruct *aInSimilarity, DWORD cInHits, DWORD DocId)
{
register DWORD high = cInHits, low = 0, mid;
while (low < high)
{
mid = low + (high - low)/2;
if (DocId < aInSimilarity[mid].DocId)
high = mid;
else if (DocId > aInSimilarity[mid].DocId)
low = mid + 1;
else
return mid;
}
return DOESNOTEXIST;
}
__inline DWORD CQuery::GetDocPosInList2(SimStruct *aInSimilarity, DWORD cInHits, DWORD DocId)
{
register DWORD i;
for (i = 0; i < cInHits; i++)
if (aInSimilarity[i].DocId == DocId)
return i;
// the doc has not been found
return DOESNOTEXIST;
}
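// Editorial note on the two lookups above: GetDocPosInList is a binary search
// and relies on the sorted-DocId assumption stated above, which is why
// RetrieveWithFeedback, whose RSet grows and mutates unsorted, uses the
// linear-scan GetDocPosInList2 instead.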
void CQuery::IndexDocumentText(PWCHAR pwDocText, int cwText, BOOL fRelevant)
{
int n, nTokens, nMore;
PUINT pwHash = NULL;
PBYTE pbType = NULL;
PWCHAR *paStart = NULL,
*paEnd = NULL;
PWCHAR pwText = pwDocText; // we will leave the pwDocText untouched so that
// the caller can delete that memory buffer.
DWORD ConId;
nMore = cwText;
ASSERT(pwText && cwText);
__try
{
// cwText is probably a lot more than we need, but it guarantees us that we won't run out of memory
// for tokens
pwHash = New UINT[cwText];
pbType = New BYTE[cwText];
paStart = New PWCHAR[cwText];
paEnd = New PWCHAR[cwText];
if (pwText && pwHash && paStart && pbType && paEnd)
{
nTokens = WordBreakW(&pwText, &nMore, paStart, paEnd, pbType, pwHash, cwText, REMOVE_SPACE_CHARS);
for (n = 0; n < nTokens; n++)
{
// EnterWord with last param set to TRUE is only looking up, not entering, a word
ConId = m_ptdb->PDict()->EnterWord(paStart[n], paEnd[n] - paStart[n], TRUE, TRUE);
if (ConId != EOL && ConId != STOPWORD)
if (fRelevant)
RecordConcept(ConId);
else // not relevant
{
DWORD i;
// For each concept in the document, check to see if it exists
// in the query. If it does, subtract it from the query's term frequency
for (i = 0; i < m_cConWts && Concept(i) != ConId; i++);
if (i < m_cConWts)
// This concept exists in the query. Subtract this term from the query.
if (TermFreq(i) > 0)
TermFreq(i) -= 1;
}
}
}
}
__finally
{
// These were allocated with New ...[]; a scalar delete on an array is undefined, so use delete []
if (paEnd) { delete [] paEnd; paEnd = NULL; }
if (paStart) { delete [] paStart; paStart = NULL; }
if (pbType) { delete [] pbType; pbType = NULL; }
if (pwHash) { delete [] pwHash; pwHash = NULL; }
}
}