//+--------------------------------------------------------------------------- // // Microsoft Windows // Copyright (C) Microsoft Corporation, 1991 - 2000. // // File: PROXCUR.CXX // // Contents: Proximity Cursor. Computes intersection of multiple // cursors with rank computed based on word occurrance // proximity. // // Classes: CProxCursor // // History: 14-Apr-92 AmyA Created. // //---------------------------------------------------------------------------- #include #pragma hdrstop #include #include #include "proxcur.hxx" //+--------------------------------------------------------------------------- // // Member: CProxCursor::CProxCursor, public // // Synopsis: Create a cursor that merges a number of cursors. // // Arguments: [cCursor] -- count of cursors // [curArray] -- pointers to cursors (aquired to an array) // [maxDist] -- the maximum distance between occurrences // // Notes: All cursors must come from the same index // and the same property // // History: 15-Apr-92 AmyA Created // //---------------------------------------------------------------------------- CProxCursor::CProxCursor( unsigned cCursor, COccCurStack& curStack, LONG maxDist ) : _cCur ( cCursor ), _maxDist ( maxDist ), _rank ( rankInvalid ) { COccCursor *pCur = curStack.Get(0); _occHeap.MakeHeap ( _cCur, curStack.AcqStack() ); Win4Assert ( pCur != 0 ); _iid = pCur->IndexId(); _pid = pCur->Pid(); // NTRAID#DB-NTBUG9-84004-2000/07/31-dlee Indexing Service internal cursors aren't optimized to use shortest cursors first _wid = pCur->WorkId(); _logWidMax = Log2(pCur->MaxWorkId()); FindConjunction(); } //+--------------------------------------------------------------------------- // // Member: CProxCursor::WorkId, public // // Synopsis: Get current work id. // // History: 17-Apr-92 AmyA Created // //---------------------------------------------------------------------------- WORKID CProxCursor::WorkId() { return _wid; } //+--------------------------------------------------------------------------- // // Member: CProxCursor::NextWorkID, public // // Synopsis: Move to next work id // // Returns: Target work id or widInvalid if no more wid's for current key // // History: 17-Apr-92 AmyA Created // //---------------------------------------------------------------------------- WORKID CProxCursor::NextWorkId() { _rank = rankInvalid; // NTRAID#DB-NTBUG9-84004-2000/07/31-dlee Indexing Service internal cursors aren't optimized to use shortest cursors first _wid = _occHeap.Top()->NextWorkId(); FindConjunction(); return _wid; } //+--------------------------------------------------------------------------- // // Member: CProxCursor::HitCount, public // // Synopsis: Returns smallest HitCount of all keys in current wid. // // Requires: _wid set to any of the current wid's // // Returns: smallest occurrence count of all keys in wid. // // History: 17-Apr-92 AmyA Created // // Notes: If there is no conjunction in current wid, returns 0. // //---------------------------------------------------------------------------- ULONG CProxCursor::HitCount() { if ( _rank == rankInvalid ) _rank = CalculateRank(); // This needs to be called before HitCount // so taht the occurrence information in // the children cursors will be valid when // its called. COccCursor **aCur = _occHeap.GetVector(); ULONG count = aCur[0]->HitCount(); for ( unsigned i = 1; i < _cCur; i++ ) { ULONG newcount = aCur[i]->HitCount(); if ( newcount < count ) count = newcount; } return count; } void CProxCursor::RatioFinished (ULONG& denom, ULONG& num) { COccCursor **vector = _occHeap.GetVector(); denom = 1; num = 0; for (unsigned i=0; i < _cCur; i++) { ULONG d, n; vector[i]->RatioFinished(d, n); if (d == n) { // done if any cursor is done denom = d; num = n; Win4Assert( denom > 0 ); break; } else if (d > denom) { // the one with largest denom // is the most meaningful denom = d; num = n; } else if (d == denom && n < num ) { num = n; // be pessimistic } } } //+--------------------------------------------------------------------------- // // Member: CProxCursor::Rank, public // // Synopsis: Checks to see if CalculateRank has been called. If not, calls // it. // // Requires: _wid set to any of the current wid's // // Returns: _rank // // History: 20-Apr-92 AmyA Created // //---------------------------------------------------------------------------- LONG CProxCursor::Rank() { if ( _rank == rankInvalid ) _rank = CalculateRank(); return _rank; } //+--------------------------------------------------------------------------- // // Member: CProxCursor::FindConjunction, private // // Synopsis: Find nearest conjunction of all the same work id's // // Requires: _wid set to any of the current wid's // // Modifies: [_wid] to point to conjunction or to widInvalid // // History: 15-Apr-92 AmyA Copied from CAndCursor. // // Notes: If cursors are in conjunction, no change results // //---------------------------------------------------------------------------- void CProxCursor::FindConjunction () { BOOL change; COccCursor **aCur = _occHeap.GetVector(); do { change = FALSE; // NTRAID#DB-NTBUG9-84004-2000/07/31-dlee Indexing Service internal cursors aren't optimized to use shortest cursors first // for all cursors in turn try to align them on _wid for ( unsigned i = 0; i < _cCur; i++ ) { // increment cursor to or past current _wid // or exit when exhausted while ( aCur[i]->WorkId() < _wid ) { if ( aCur[i]->NextWorkId() == widInvalid ) { _wid = widInvalid; return; } } // if overshot, try again with new _wid if ( aCur[i]->WorkId() > _wid ) { _wid = aCur[i]->WorkId(); change = TRUE; break; } } } while ( change ); } //+--------------------------------------------------------------------------- // // Member: CProxCursor::CalculateRank, private // // Synopsis: Assigns a rank based on the shortest distance between an // occurrence of each child. // // Requires: _wid set to any of the current wid's, at least two child // cursors. // // Returns: calculated rank // // History: 17-Apr-92 AmyA Created // // Notes: If there is no conjunction in current wid, returns 0. // // New Rank computation: // rank = cOcc*Log2(_widMax)*normalizedProximity(distMin) // where, // cOcc = hits_with_dist(distMin) // where normalizedProximity(i) = ProxDefault[i]/MAX_QUERY_RANK // //---------------------------------------------------------------------------- // The idea is that we are looking for the combination of occurrences (one // for each child cursor) that is closest together for the current wid. To // do this, we only need to look at two of the child cursors from a set: the // one with the smallest occurrence and the one furthest from it. We look // at these sets in a loop, getting the next occurrence on the cursor with // the smallest occurrence, then reheaping to find the new smallest // occurrence, and then finding the occurrence furthest from it. By getting // the next occurrence on the cursor with the smallest occurrence, we are // guaranteeing that we will not skip over a set of cursors that are closer // together. If you need proof of this, draw a picture with the cursors // represented as parallel lines and the occurrences as hash marks on those // lines and step through the algorithm. Remember that we start this // function while all the child cursors are at thier smallest occurrence // within the current wid, since this function needs to be called before any // work with occurrences is done within a wid. LONG CProxCursor::CalculateRank() { Win4Assert ( _cCur >= 2 ); ULONG distMin = _maxDist + 1; unsigned cOcc = 0; // #hits at distMin // loop through occurrence combinations to find the set of occurrences // for different cursors that are the closest together do { // Get smallest occurrence _occHeap.Reheap(); OCCURRENCE smallOcc = _occHeap.Top()->Occurrence(); COccCursor **aCur = _occHeap.GetVector(); OCCURRENCE largeOcc = aCur[1]->Occurrence(); // loop through all occurrences (except the first, which is the // smallest and the second) to find the occurrence furthest from the // smallest. for ( unsigned count = 2; count < _cCur; count++ ) { OCCURRENCE newOcc = aCur[count]->Occurrence(); if ( newOcc > largeOcc ) largeOcc = newOcc; } if (largeOcc - smallOcc < PROX_MAX) { if (largeOcc - smallOcc < distMin) { distMin = largeOcc - smallOcc; cOcc = 1; // reset # hits } else if (largeOcc - smallOcc == distMin) { cOcc++; } } // else children are too far apart to affect rank // get the next occurrence on the cursor with the smallest occurrence } while ( _occHeap.Top()->NextOccurrence() != OCC_INVALID ); if (distMin >= PROX_MAX) { return(0); } LONG rank = cOcc * _logWidMax * ProxDefault[distMin] / MAX_QUERY_RANK; if (rank > MAX_QUERY_RANK) { rank = MAX_QUERY_RANK; } return rank; } //+--------------------------------------------------------------------------- // // Member: CProxCursor::Hit, public // // Synopsis: Hits current child (indexed by _iCur) // // History: 07-Sep-92 MikeHew Created // // Notes: Hit() should not be called more than once, except by // NextHit() // // The occurrence heap is assumed valid upon entry, and remains // valid on exit. // //---------------------------------------------------------------------------- LONG CProxCursor::Hit() { Win4Assert ( _cCur >= 2 ); COccCursor **aCur = _occHeap.GetVector(); // Make sure none of the cursors are empty for ( unsigned i=0; i<_cCur; ++i ) { if ( aCur[i]->IsEmpty() ) return rankInvalid; } // Starting with smallest occurrence, loop through all cursors, // Hitting() each one and searching for the largest occurrence. OCCURRENCE largeOcc = _occHeap.Top()->Occurrence(); OCCURRENCE smallOcc = largeOcc; for ( i=0; i<_cCur; ++i ) { aCur[i]->Hit(); OCCURRENCE thisOcc = aCur[i]->Occurrence(); if ( thisOcc > largeOcc ) { largeOcc = thisOcc; } // get the next occurrence on the cursor with the smallest occurrence } unsigned dist = largeOcc - smallOcc; if (dist >= PROX_MAX) return(0); return ProxDefault[dist]; } //+--------------------------------------------------------------------------- // // Member: CProxCursor::NextHit, public // // Synopsis: calls NextOccurrence() on smallest child, then // returns Hit() if NextOccurrence() is valid // // History: 07-Sep-92 MikeHew Created // // Notes: NextHit() should not be called after returning rankInvalid // //---------------------------------------------------------------------------- LONG CProxCursor::NextHit() { if ( _occHeap.Top()->NextOccurrence() == OCC_INVALID ) { return rankInvalid; } _occHeap.Reheap(); return Hit(); }