|
|
//+---------------------------------------------------------------------------
//
// Microsoft Windows
// Copyright (C) Microsoft Corporation, 1991 - 1999.
//
// File: VECCURS.CXX
//
// Contents: Vector-or Cursor. Computes union of multiple cursors with
// weighted rank computation.
//
// Classes: CVectorCursor
//
// History: 23-Jul-92 KyleP Created
// 01-Mar-93 KyleP Use 64-bit arithmetic
//
//----------------------------------------------------------------------------
#include <pch.cxx>
#pragma hdrstop
#include <curstk.hxx>
#include "veccurs.hxx"
//+---------------------------------------------------------------------------
//
// Member: CVectorCursor::CVectorCursor, public
//
// Synopsis: Creates a vector cursor.
//
// Arguments: [cCursor] -- count of cursors
// [curStack] -- cursors to be merged
// [RankMethod] -- Indicates formula used to compute rank.
//
// History: 23-Jul-92 KyleP Created
//
// Notes: The cursors and the array will be deleted by destructor.
// The cursors have to come from one index
//
//----------------------------------------------------------------------------
CVectorCursor::CVectorCursor( int cCursor,
                              CCurStack& curStack,
                              ULONG RankMethod )
        : _cChild( cCursor ),
          _RankMethod( RankMethod ),
          _lMaxWeight( 0 ),
          _lSumWeight( 0 ),
          _ulSumSquaredWeight( 0 ),
          _widRank( widInvalid ),
          _iCur( -1 ),
          _aChildCursor( cCursor ),
          _aChildRank( cCursor ),
          _aChildWeight( cCursor )
{
    //
    // The heap is built in two steps: first every cursor is checked for
    // a valid work id, then the survivors are handed to the wid heap.
    //
    // aCursor is compacted in place so that it holds only the valid
    // cursors; _aChildCursor keeps the original slot positions, with a
    // null entry wherever a cursor was discarded.
    //

    CCursor ** aCursor = curStack.AcqStack();

    RtlCopyMemory( _aChildCursor.GetPointer(),
                   aCursor,
                   cCursor * sizeof( CCursor * ) );

    int cValid = 0;

    for ( int iChild = 0; iChild < cCursor; iChild++ )
    {
        CCursor * pChild = aCursor[iChild];

        if ( 0 == pChild || widInvalid == pChild->WorkId() )
        {
            //
            // Missing or exhausted cursor: free it and zero its slot.
            //

            delete pChild;

            _aChildCursor[iChild] = 0;
            _aChildRank[iChild]   = 0;
            _aChildWeight[iChild] = 0;
            continue;
        }

        //
        // Usable cursor: fold its weight into the running totals.
        //

        _aChildWeight[iChild] = _aChildCursor[iChild]->GetWeight();

        _lMaxWeight = max( _lMaxWeight, _aChildWeight[iChild] );
        _lSumWeight += _aChildWeight[iChild];
        _ulSumSquaredWeight += _aChildWeight[iChild] * _aChildWeight[iChild];

        //
        // Slide the cursor down over any gaps left by discarded ones.
        // When nothing has been discarded yet this stores the pointer
        // back into its own slot.
        //

        aCursor[cValid] = pChild;
        cValid++;
    }

    //
    // Avoid divide-by-zero in rank computation
    //

    if ( 0 == _lMaxWeight )
        _lMaxWeight = 1;

    if ( 0 == _lSumWeight )
        _lSumWeight = 1;

    if ( 0 == _ulSumSquaredWeight )
        _ulSumSquaredWeight = 1;

    _widHeap.MakeHeap ( cValid, aCursor );

    if ( _widHeap.IsEmpty() )
        return;

    //
    // Take the index id and property id from the top cursor, then prime
    // the rank cache for the first work id.
    //

    _iid = _widHeap.Top()->IndexId();
    _pid = _widHeap.Top()->Pid();

    _RefreshRanks();
} //CVectorCursor
//+---------------------------------------------------------------------------
//
// Member: CVectorCursor::WorkId, public
//
// Synopsis: Get current work id.
//
// History: 23-Jul-92 KyleP Lifted from COrCursor
//
//----------------------------------------------------------------------------
WORKID CVectorCursor::WorkId()
{
    //
    // The current work id is that of the cursor on top of the heap;
    // with nothing in the heap there is no current work id.
    //

    return _widHeap.IsEmpty() ? widInvalid : _widHeap.Top()->WorkId();
}
//+---------------------------------------------------------------------------
//
// Member: CVectorCursor::NextWorkId, public
//
// Synopsis: Move to next work id
//
// Returns: Target work id or widInvalid if no more wid's for current key
//
// History: 23-Jul-92 KyleP Created from COrCursor.
//
//----------------------------------------------------------------------------
WORKID CVectorCursor::NextWorkId()
{
    WORKID const widStart = WorkId();

    if ( widInvalid == widStart )
        return widInvalid;

    //
    // Several children may be positioned on the same work id.  Advance
    // the top cursor and reheap until the top of the heap reports a
    // work id different from the one we started on.
    //

    for (;;)
    {
        _widHeap.Top()->NextWorkId();
        _widHeap.Reheap();

        WORKID const widNow = _widHeap.Top()->WorkId();

        if ( widNow != widStart )
            return widNow;
    }
}
//+---------------------------------------------------------------------------
//
// Member: CVectorCursor::RatioFinished, public
//
// Synopsis: return approximate ratio of documents processed to total
// documents.
//
// Notes: The ratio, while approximate, should not return 1/1 until
// all cursors are exhausted.
//
//----------------------------------------------------------------------------
void CVectorCursor::RatioFinished (ULONG& denom, ULONG& num)
{
    WORKID const widHead = WorkId();

    //
    // No current work id: report 1/1.
    //

    if ( widInvalid == widHead )
    {
        num = 1;
        denom = 1;
        return;
    }

    num = 0;
    denom = 0;

    //
    // Starts at one; incremented for each child that reports n == d
    // while still positioned on a valid work id other than the current
    // one.
    //

    unsigned cPending = 1;

    for ( unsigned iChild = 0; iChild < _cChild; iChild++ )
    {
        if ( 0 == _aChildCursor[iChild] )
            continue;

        ULONG d, n;

        _aChildCursor[iChild]->RatioFinished(d, n);
        Win4Assert( n <= d && d > 0 );

        //
        // The overall ratio is the sum of the children's numerators
        // over the sum of their denominators.
        //

        denom += d;
        num += n;
        Win4Assert( d <= denom ); // overflow?

        if ( n != d )
            continue;

        WORKID const widChild = _aChildCursor[iChild]->WorkId();

        if ( widChild != widInvalid && widChild != widHead )
            cPending++;
    }

    Win4Assert ( denom > 0 );

    //
    // Do not report num == denom while some child still has work ids
    // beyond the current one.
    //

    if ( num == denom && cPending > 1 )
        denom++;
}
//+---------------------------------------------------------------------------
//
// Member: CVectorCursor::WorkIdCount, public
//
// Synopsis: return wid count
//
// History: 23-Jul-92 KyleP Lifted from COrCursor
//
//----------------------------------------------------------------------------
ULONG CVectorCursor::WorkIdCount()
{
    //
    // This method is not expected to be called on a vector cursor:
    // trip the assertion, then report a count of zero.
    //

    Win4Assert (( FALSE && "CVectorCursor::WorkIdCount called" ));

    return 0;
}
//+---------------------------------------------------------------------------
//
// Member: CVectorCursor::HitCount, public
//
// Synopsis: Return occurrence count
//
// History: 23-Jul-92 KyleP Lifted from COrCursor
//
//----------------------------------------------------------------------------
ULONG CVectorCursor::HitCount()
{
    //
    // Use WorkId() rather than _widHeap.Top() directly: WorkId() checks
    // for an empty heap before touching the top element and yields
    // widInvalid in that case.
    //

    WORKID wid = WorkId();

    if ( wid == widInvalid )
        return 0;

    //
    // The occurrence count for the current work id is the sum of the
    // counts of every child positioned on that same work id.
    //

    ULONG hitCnt = 0;

    for ( UINT i = 0; i < _cChild; i++ )
    {
        if ( _aChildCursor[i] && _aChildCursor[i]->WorkId() == wid )
            hitCnt += _aChildCursor[i]->HitCount();
    }

    return hitCnt;
}
//+---------------------------------------------------------------------------
//
// Member: CVectorCursor::Rank, public
//
// Returns: Rank.
//
// History: 23-Jul-92 KyleP Created
// 29-Jan-93 KyleP Fixed rounding error in Jaccard
//
// Notes: Uses algorithm specified by user from a small, precomputed
// set.
//
// See "Automatic Text Processing", G. Salton, 10.1.1 and
// 10.4.2 for a discussion of the weight formulas.
//
//----------------------------------------------------------------------------
//
// Child-count limits enforced by Rank().  Each is 0xFFFFFFFF divided by
// a multiple of MAX_QUERY_RANK squared; Rank() throws when _cChild is
// larger.  NOTE(review): presumably these keep the 32-bit sums in
// Rank() from overflowing, assuming child ranks and weights never
// exceed MAX_QUERY_RANK -- confirm.
//

static int const cMaxChildrenInner = ( 0xFFFFFFFF / ( MAX_QUERY_RANK * MAX_QUERY_RANK ) );
static int const cMaxChildrenDice = ( 0xFFFFFFFF / ( MAX_QUERY_RANK * MAX_QUERY_RANK * 2 ) );
static int const cMaxChildrenJaccard = ( 0xFFFFFFFF / ( MAX_QUERY_RANK * MAX_QUERY_RANK ) );

LONG CVectorCursor::Rank()
{
    //
    // Computes the rank of the current work id from the child ranks
    // (ri) and child weights (wi), using the formula selected by
    // _RankMethod.
    //
    // Returns 0 if there is no current work id or the method is
    // unknown.  Throws CException( STATUS_INVALID_PARAMETER ) from the
    // INNER, DICE and JACCARD methods when there are more children than
    // the corresponding cMaxChildren* limit.
    //

    LONG lRank;

    //
    // Use WorkId() rather than _widHeap.Top() directly: WorkId() checks
    // for an empty heap before touching the top element, so the
    // empty-heap branch below is reached safely.
    //

    WORKID wid = WorkId();

    //
    // An empty heap is a minimum rank.
    //

    if (wid == widInvalid)
    {
        Win4Assert( FALSE && "Rank called on empty heap!" );
        return 0;
    }

    //
    // Get ranks for this wid.
    //

    _RefreshRanks();

    //
    // Otherwise, compute rank based on selected method.
    //

    switch ( _RankMethod )
    {
    case VECTOR_RANK_MIN:
    {
        //
        //                              MAX[ wi * ( MaxRank - ri ) ]
        // VECTOR_RANK_MIN    MaxRank - ----------------------------
        //                                       MAX[wi]
        //

        lRank = (MAX_QUERY_RANK - _aChildRank[0]) * _aChildWeight[0];

        for ( UINT i = 1; i < _cChild; i++ )
        {
            LONG lNew = (MAX_QUERY_RANK - _aChildRank[i]) * _aChildWeight[i];
            lRank = max( lRank, lNew );
        }

        lRank = MAX_QUERY_RANK - (lRank / _lMaxWeight);

        break;
    }

    case VECTOR_RANK_MAX:
    {
        //
        //                    MAX[ wi * ri ]
        // VECTOR_RANK_MAX    --------------
        //                       MAX[wi]
        //

        lRank = _aChildRank[0] * _aChildWeight[0];

        for ( UINT i = 1; i < _cChild; i++ )
        {
            LONG lNew = _aChildRank[i] * _aChildWeight[i];
            lRank = max( lRank, lNew );
        }

        lRank = lRank / _lMaxWeight;

        break;
    }

    case VECTOR_RANK_INNER:
    {
        //
        //                       n
        //                      SUM ri * wi
        //                      i=1
        // VECTOR_RANK_INNER    -----------
        //                         n
        //                        SUM wi
        //                        i=1
        //

        if ( _cChild > cMaxChildrenInner )
        {
            THROW( CException( STATUS_INVALID_PARAMETER ) );
        }

        lRank = 0;

        for ( UINT i = 0; i < _cChild; i++ )
        {
            lRank += _aChildRank[i] * _aChildWeight[i];
        }

        lRank /= _lSumWeight;

        break;
    }

    case VECTOR_RANK_DICE:
    {
        //
        //                           n
        //                      2 * SUM ri * wi
        //                          i=1
        // VECTOR_RANK_DICE     -------------------
        //                       n    2    n    2
        //                      SUM ri  + SUM wi
        //                      i=1       i=1
        //

        if ( _cChild > cMaxChildrenDice )
        {
            THROW( CException( STATUS_INVALID_PARAMETER ) );
        }

        ULONG ulWeightSum = 0;

        lRank = 0;

        for ( UINT i = 0; i < _cChild; i++ )
        {
            lRank += _aChildRank[i] * _aChildWeight[i];
            ulWeightSum += _aChildRank[i] * _aChildRank[i];
        }

        //
        // The sum of squared weights was fixed at construction time.
        //

        ulWeightSum += _ulSumSquaredWeight;

        //
        // Avoid nasty rounding errors: scale the numerator by
        // 2 * MAX_QUERY_RANK in a LONGLONG before dividing.
        //

        LONGLONG liTop;

        liTop = UInt32x32To64( lRank, 2 * MAX_QUERY_RANK );
        liTop /= ulWeightSum;

        lRank = lltoul(liTop);

        break;
    }

    case VECTOR_RANK_JACCARD:
    {
        //
        //                                  n
        //                                 SUM ri * wi
        //                                 i=1
        // VECTOR_RANK_JACCARD  ---------------------------------
        //                       n    2    n    2    n
        //                      SUM ri  + SUM wi  - SUM ri * wi
        //                      i=1       i=1       i=1
        //

        if ( _cChild > cMaxChildrenJaccard )
        {
            THROW( CException( STATUS_INVALID_PARAMETER ) );
        }

        ULONG ulWeightSum = 0;

        lRank = 0;

        for ( UINT i = 0; i < _cChild; i++ )
        {
            lRank += _aChildRank[i] * _aChildWeight[i];
            ulWeightSum += _aChildRank[i] * _aChildRank[i];
        }

        ulWeightSum += _ulSumSquaredWeight;
        ulWeightSum -= lRank;

        //
        // Avoid nasty rounding errors: scale the numerator by
        // MAX_QUERY_RANK in a LONGLONG before dividing.
        //

        LONGLONG liTop;

        liTop = UInt32x32To64( lRank, MAX_QUERY_RANK );
        liTop /= ulWeightSum;

        lRank = lltoul(liTop);

        break;
    }

    default:
        Win4Assert( FALSE && "Invalid rank calculation method." );
        lRank = 0;
    }

    Win4Assert( lRank <= MAX_QUERY_RANK );

    return ( lRank );
}
//+-------------------------------------------------------------------------
//
// Member: CVectorCursor::GetRankVector, public
//
// Synopsis: Fetches the rank vector for the cursor.
//
// Arguments: [plVector] -- The vector is copied here.
// [cElements] -- Number of elements [plVector] can hold.
//
// Requires: Nothing is copied unless [cElements] is at least the
// number of child cursors.
//
// Returns: The number of child cursors (the element count needed),
// whether or not the vector was copied.
//
// History: 24-Jul-92 KyleP Created
//
//--------------------------------------------------------------------------
ULONG CVectorCursor::GetRankVector( LONG * plVector, ULONG cElements )
{
    //
    // Copies the per-child ranks into [plVector] when [cElements] is at
    // least the number of children; otherwise nothing is copied.  The
    // number of children is returned in both cases.
    //

    //
    // Get ranks for this wid.
    //

    _RefreshRanks();

    //
    // sizeof applied to a type name needs parentheses in standard C++.
    //

    if ( cElements >= _cChild )
        RtlCopyMemory( plVector,
                       _aChildRank.GetPointer(),
                       _cChild * sizeof( LONG ) );

    return _cChild;
}
//+---------------------------------------------------------------------------
//
// Member: CVectorCursor::Hit, public
//
// Returns: Current hit.
//
// History: 07-Sep-92 MikeHew Created
// 12-Dec-92 KyleP Modified for Vector Cursor
//
// Notes: A hit for the vector cursor is identical to a hit
// for an or cursor -- 1 hilite at a time.
//
//----------------------------------------------------------------------------
LONG CVectorCursor::Hit()
{
    CCursor ** const apCursor = _widHeap.GetVector();

    //
    // _iCur is -1 until a hit has been positioned.  On the first call,
    // let NextHit() look for a cursor that has one.
    //

    if ( -1 == _iCur )
        NextHit();

    //
    // Still unpositioned: NextHit() did not move onto any cursor.
    //

    return ( -1 == _iCur ) ? rankInvalid : apCursor[_iCur]->Hit();
}
//+---------------------------------------------------------------------------
//
// Member: CVectorCursor::NextHit, public
//
// Returns: Next hit.
//
// History: 07-Sep-92 MikeHew Created
// 12-Dec-92 KyleP Modified for Vector Cursor
//
//----------------------------------------------------------------------------
LONG CVectorCursor::NextHit()
{
    CCursor ** const apCursor = _widHeap.GetVector();

    //
    // Advance the cursor currently supplying hits, if there is one.
    //

    LONG rank = ( -1 == _iCur ) ? rankInvalid : apCursor[_iCur]->NextHit();

    //
    // If that cursor has run out of hits (rank == rankInvalid), step
    // through the remaining cursors in the heap vector until one
    // reports a hit or there are none left.
    //

    while ( rankInvalid == rank && _iCur < _widHeap.Count() - 1 )
    {
        _iCur++;
        rank = apCursor[_iCur]->Hit();
    }

    return rank;
}
//+-------------------------------------------------------------------------
//
// Member: CVectorCursor::_RefreshRanks, private
//
// Synopsis: Fetches ranks from children with matching workids.
//
// History: 24-Jul-92 KyleP Created
//
//--------------------------------------------------------------------------
void CVectorCursor::_RefreshRanks()
{
    //
    // Use WorkId() rather than _widHeap.Top() directly: WorkId() checks
    // for an empty heap before touching the top element and yields
    // widInvalid in that case.
    //

    WORKID wid = WorkId();

    //
    // If the cache is up-to-date, do nothing.
    //

    if ( _widRank == wid )
        return;

    //
    // A child contributes its own rank only when it is positioned on
    // the current work id; every other slot is set to zero.
    //

    for ( UINT i = 0; i < _cChild; i++ )
    {
        WORKID widCurrent = ( _aChildCursor[i] ) ? _aChildCursor[i]->WorkId()
                                                 : widInvalid;

        if ( widCurrent == widInvalid || widCurrent != wid )
        {
            _aChildRank[i] = 0;
        }
        else
        {
            _aChildRank[i] = _aChildCursor[i]->Rank();
        }
    }

    //
    // Remember which work id the cached ranks belong to.
    //

    _widRank = wid;
}
|