|
|
//+---------------------------------------------------------------------------
//
// Microsoft Windows
// Copyright (C) Microsoft Corporation, 1991 - 2000.
//
// File: NORM.CXX
//
// Contents: Normalizer
//
// Classes: CNormalizer
//
// History: 28-May-91 t-WadeR added CNormalizer
// 31-Jan-92 BartoszM Created from lang.cxx
// 07-Oct-93 DwightKr Added new methods to normalize
// different data types
//
// Notes: The filtering pipeline is hidden in the Data Repository
// object which serves as a sink for the filter.
// The sink for the Data Repository is the Key Repository.
// The language dependent part of the pipeline
// is obtained from the Language List object and is called
// Key Maker. It consists of:
//
// Word Breaker
// Stemmer (optional)
// Normalizer
// Noise List
//
// Each object serves as a sink for its predecessor,
// Key Repository is the final sink.
//
//----------------------------------------------------------------------------
#include <pch.cxx>
#pragma hdrstop
#include <plang.hxx>
#include <misc.hxx>
#include <norm.hxx>
//+---------------------------------------------------------------------------
//
// Function GetExpAndSign
//
// Synopsis: Finds the exponent and sign of a number
//
// Arguments: [d] -- the input number to examine
// [fPositive] -- returns TRUE if positive, FALSE if negative
//
// Returns: The exponent
//
// History: 21-Nov-94 KyleP Created.
//
//----------------------------------------------------------------------------
int GetExpAndSign( double d, BOOL & fPositive ) { //
// bit 63 = sign
// bits 52 - 62 = exponent
// bits 0 - 51 = mantissa
//
Win4Assert( sizeof(LARGE_INTEGER) == sizeof(double) );
LARGE_INTEGER * pli = (LARGE_INTEGER *)&d;
fPositive = (pli->HighPart & 0x80000000) == 0;
int const bias = 0x3ff;
return ( ( pli->HighPart & 0x7ff00000 ) >> 20 ) - bias; } //GetExpAndSign
//+---------------------------------------------------------------------------
//
// Function NormDouble
//
// Synopsis: Normalizes doubles by taking log2 of the number
//
// Notes: This func converts doubles into one of 5 different categories
//
// x <= -1x2**33 is in bin 0
// -1x2**33 < x <= -1x2**-32 are in bins 1 to 65
// -1x2**-32 < x < 1x2**-32 is in bin 66
// 1x2**-32 <= x < 1x2**33 are in bins 67 to 131
// x >= 1x2**33 is in bin 132
//
// History: 21-Nov-94 KyleP Created.
//
//----------------------------------------------------------------------------
static unsigned NormDouble(double dValue)
{
    //
    // Exponents in [-maxExp, +maxExp] get one bin each per sign; anything
    // outside that window collapses into a single catch-all bin.
    //
    const int maxExp   = 32;
    const int cExpSpan = maxExp * 2;

    const unsigned binBelowRange    = 0;                                  // 0
    const unsigned binFirstNegative = binBelowRange + 1;                  // 1
    const unsigned binNearZero      = binFirstNegative + cExpSpan + 1;    // 66
    const unsigned binFirstPositive = binNearZero + 1;                    // 67
    const unsigned binAboveRange    = binFirstPositive + cExpSpan + 1;    // 132

    BOOL fPositive;
    int const exp = GetExpAndSign( dValue, fPositive );

    //
    // All numbers close to zero share the middle bin, whatever their sign.
    //
    if ( exp < -maxExp )
        return binNearZero;

    //
    // Very large magnitudes: top bin if positive, bottom bin if negative.
    //
    if ( exp > maxExp )
        return fPositive ? binAboveRange : binBelowRange;

    //
    // Medium size numbers.  Positive bins grow with the exponent, negative
    // bins shrink with it so that more negative values sort first.
    //
    if ( fPositive )
        return binFirstPositive + exp + maxExp;

    return binFirstNegative - exp + maxExp;
}
#ifdef TEST_NORM
//
// Debug-only check of NormDouble: walks away from +1 and -1 in both
// directions and prints every sample whose bin breaks the expected ordering.
//
void TestNormDouble()
{
    float fZero     = 0.;
    float fOne      = 1.;
    float fMinusOne = -1.;

    unsigned binZero = NormDouble( fZero );
    unsigned binOne  = NormDouble( fOne );

    printf(" Value:Bin %f : 0x%4X (%d)\n", fZero, binZero, binZero );
    printf(" Value:Bin %f : 0x%4X (%d)\n", fOne, binOne, binOne );

    BOOL     fSignIgnored;
    float    fProbe;
    unsigned binPrev;

    //
    // +1 shrinking towards 0: bins must not increase and must stay
    // within [binZero, binOne].
    //
    for ( fProbe = fOne, binPrev = binOne; fProbe > fZero; fProbe = fProbe/3 )
    {
        unsigned binCur = NormDouble( fProbe );

        if ( binCur > binPrev || binCur < binZero || binCur > binOne )
        {
            printf(" Value:Bin %f : 0x%4X (%d)\tExp %d\n",
                   fProbe, binCur, binCur, GetExpAndSign(fProbe, fSignIgnored) );
        }

        binPrev = binCur;
    }

    //
    // +1 growing: bins must not decrease.
    //
    for ( fProbe = fOne, binPrev = binOne; fProbe < 1e+32; fProbe = fProbe * (float)1.5 )
    {
        unsigned binCur = NormDouble( fProbe );

        if ( binCur < binPrev )
            printf(" Value:Bin %f : 0x%4X (%d)\n", fProbe, binCur, binCur );

        binPrev = binCur;
    }

    unsigned binMinusOne = NormDouble( fMinusOne );

    printf(" Value:Bin %f : 0x%4X (%d)\n", fMinusOne, binMinusOne, binMinusOne );

    //
    // -1 shrinking towards 0: bins must not decrease and must stay
    // within [binMinusOne, binZero].
    //
    for ( fProbe = fMinusOne, binPrev = binMinusOne; fProbe < fZero; fProbe = fProbe/3 )
    {
        unsigned binCur = NormDouble( fProbe );

        if ( binCur < binPrev || binCur > binZero || binCur < binMinusOne )
            printf(" Value:Bin %f : 0x%4X (%d)\tExp %d\n",
                   fProbe, binCur, binCur, GetExpAndSign(fProbe, fSignIgnored) );

        binPrev = binCur;
    }

    //
    // -1 growing in magnitude: bins must not increase.
    //
    for ( fProbe = fMinusOne, binPrev = binMinusOne; fProbe > -1e+32; fProbe = fProbe * (float)1.5 )
    {
        unsigned binCur = NormDouble( fProbe );

        if ( binCur > binPrev )
            printf(" Value:Bin %f : 0x%4X (%d)\n", fProbe, binCur, binCur );

        binPrev = binCur;
    }
}
#endif // TEST_NORM
// ------------------------------------------------------------------------
// | Upper Limit | Divisor (2^x) | # of Bins | (in hex) |
// ------------------------------------------------------------------------
// | 2^10 - 1 | 2^0 | 2^10 - 0 | 0400 - 0000 |
// | 2^16 - 1 | 2^3 | 2^13 - 2^7 | 2000 - 0080 |
// | 2^20 - 1 | 2^6 | 2^14 - 2^10 | 4000 - 0400 |
// | 2^26 - 1 | 2^13 | 2^13 - 2^7 | 2000 - 0080 |
// | 2^30 - 1 | 2^23 | 2^7 - 2^3 | 0080 - 0008 |
// | 2^31 - 1 | 2^25 | 2^6 - 2^5 | 0040 - 0020 |
// ------------------------------------------------------------------------
// | Total | | | 84C0 - 0528 |
// | | | | 7F98 |
// ------------------------------------------------------------------------
//
// Tier parameters used by MapLong.  For tier N:
//   limitN -- exclusive upper bound of the tier (MapLong tests value < limitN)
//   shiftN -- right shift applied to a value that falls in the tier
//   cbinsN -- number of bins the tier contributes
//   cSkipN -- limitN >> shift(N+1): the shifted value of the first number in
//             tier N+1.  MapLong subtracts it so that tier N+1 starts
//             directly after the bins of tiers 1..N.
//
// Tier 1: values below 0x400 map to themselves (shift1 is 0).
const long limit1 = 0x400; const long shift1 = 0; const long cbins1 = 0x400;
// Tier 2
const long limit2 = 0x10000; // 2^16
const long shift2 = 3; const long cSkip1 = limit1 >> shift2; const long cbins2 = (limit2 >> shift2)-cSkip1;
// Tier 3
const long limit3 = 0x100000; // 2^20
const long shift3 = 6; const long cSkip2 = limit2 >> shift3; const long cbins3 = (limit3 >> shift3) - cSkip2;
// Tier 4
const long limit4 = 0x4000000; // 2^26
const long shift4 = 13; const long cSkip3 = limit3 >> shift4; const long cbins4 = (limit4 >> shift4) - cSkip3;
// Tier 5
const long limit5 = 0x40000000; // 2^30
const long shift5 = 23; const long cSkip4 = limit4 >> shift5; const long cbins5 = (limit5 >> shift5) - cSkip4;
// Tier 6: limit6 is MINLONG, so it is cast to unsigned before shifting to
// compute the bin count.
const long limit6 = MINLONG; // 2^31
const long shift6 = 25; const long cSkip5 = limit5 >> shift6; const long cbins6 = ((long) ((unsigned) limit6 >> shift6)) - cSkip5;
static unsigned MapLong( LONG lValue )
{
    //
    // Maps a LONG onto a bin number using the tier constants above: each
    // tier shifts the value right by its own amount and is offset so that
    // it begins right after the bins of the preceding tiers.
    //
    // Callers pass non-negative values; the assert additionally admits
    // MINLONG itself.
    //
    Win4Assert( !(lValue & MINLONG) || ( MINLONG == lValue ) );

#if CIDBG==1
    const long cTotal = cbins1 + cbins2 + cbins3 + cbins4 + cbins5 + cbins6;
    Win4Assert( cTotal <= MINSHORT );
#endif // CIDBG == 1

    unsigned const uValue = (unsigned) lValue;

    //
    // Tier 1: small values are their own bin number.
    //
    if ( uValue < limit1 )
        return uValue;

    //
    // cBinsBelow accumulates the number of bins used by all lower tiers.
    //
    long cBinsBelow = cbins1;

    if ( uValue < limit2 )
        return cBinsBelow - cSkip1 + (uValue >> shift2);

    cBinsBelow += cbins2;

    if ( uValue < limit3 )
        return cBinsBelow - cSkip2 + (uValue >> shift3);

    cBinsBelow += cbins3;

    if ( uValue < limit4 )
        return cBinsBelow - cSkip3 + (uValue >> shift4);

    cBinsBelow += cbins4;

    if ( uValue < limit5 )
        return cBinsBelow - cSkip4 + (uValue >> shift5);

    cBinsBelow += cbins5;

    return cBinsBelow - cSkip5 + (uValue >> shift6);
}
//+---------------------------------------------------------------------------
//
// Function: NormLong
//
// Synopsis: Normalizes the given "signed" long value to a value between
// 0x0000 - 0xFFFF. The negative numbers occupy 0x0000-0x8000.
// Positive numbers occupy 0x8000-0xFFFF
//
// Arguments: [lValue] - The value to be normalized.
//
// History: 10-03-95 srikants Created
//
// Notes:
//
//----------------------------------------------------------------------------
static unsigned NormLong(LONG lValue)
{
    //
    // Non-negative values land at or above MINSHORT, growing with the value.
    //
    if ( lValue >= 0 )
        return MapLong( lValue ) + MINSHORT;

    //
    // Negative values land at or below MINSHORT, shrinking as the magnitude
    // grows.  The magnitude is computed in unsigned arithmetic because
    // "-lValue" overflows (undefined behaviour) for the most negative LONG.
    // For that input the unsigned negation yields the same bit pattern,
    // i.e. MINLONG, which MapLong explicitly accepts.
    //
    LONG const lMagnitude = (LONG) ( 0u - (unsigned) lValue );

    return MINSHORT - MapLong( lMagnitude );
}
//+---------------------------------------------------------------------------
//
// Function: NormULong
//
// Synopsis: Normalizes an "unsigned" long value to a value between
// 0x0000-0xFFFF. Numbers from 0-2^31 - 1 are mapped in the
// range 0x0000-0x7FFF. Numbers 2^31 to 2^32 - 1 are mapped
// in the range 0x8000 - 0xFFFF
//
// Arguments: [lValue] - The value to be mapped.
//
// History: 10-03-95 srikants Created
//
// Notes:
//
//----------------------------------------------------------------------------
static unsigned NormULong( ULONG lValue )
{
    //
    // Map the low 31 bits; the result must leave the MINSHORT bit free.
    //
    unsigned const uLowBits = MapLong( lValue & ~MINLONG );

    Win4Assert( !(uLowBits & MINSHORT) );

    //
    // The high bit of the input becomes the MINSHORT bit of the result.
    //
    return ( lValue & MINLONG ) ? ( uLowBits | MINSHORT ) : uLowBits;
}
//+---------------------------------------------------------------------------
//
// Function: MapLargeInteger
//
// Synopsis: Maps a LargeInteger to a number between 0x0000-0x7FFF.
//
// Numbers with the "HighPart" = 0 are mapped in the range
// 0x0000-0x3FFF. When the HighPart !=0, the values are
// mapped to 0x4000 - 0x7FFF
//
// Arguments: [liValue] - The value to be mapped.
//
// History: 10-03-95 srikants Created
//
// Notes:
//
//----------------------------------------------------------------------------
static unsigned MapLargeInteger( LARGE_INTEGER & liValue )
{
    //
    // HighPart must be non-negative, or exactly MINLONG.
    //
    Win4Assert( !(liValue.HighPart & MINLONG) || ( MINLONG == liValue.HighPart ) );

    //
    // HighPart == 0: the normalized LowPart is scaled down by 4.
    // HighPart != 0: the mapped HighPart is halved and tagged with 0x4000.
    //
    unsigned const normVal =
        ( 0 == liValue.HighPart )
            ? ( NormULong( liValue.LowPart ) >> 2 )
            : ( ( MapLong( liValue.HighPart ) >> 1 ) | 0x4000 );

    Win4Assert( normVal < 0x8000 );

    return normVal;
}
//+---------------------------------------------------------------------------
//
// Function: NormULargeInteger
//
// Synopsis: Normalizes an unsigned LargeInteger to a number between
// 0x0000-0xFFFF.
//
// Numbers with the "HighPart" = 0 are mapped in the range
// 0x0000-0x7FFF. When the HighPart !=0, the values are
// mapped to 0x8000 - 0xFFFF.
//
// Arguments: [uliValue] - The value to be mapped.
//
// History: 02-09-96 Alanw Created
//
// Notes:
//
//----------------------------------------------------------------------------
static unsigned NormULargeInteger( ULARGE_INTEGER & uliValue )
{
    unsigned normVal;

    if ( 0 == uliValue.HighPart )
    {
        //
        // Value fits in 32 bits: halve the 16-bit NormULong result so it
        // occupies 0x0000-0x7FFF.
        //
        normVal = NormULong( uliValue.LowPart );
        normVal >>= 1;
    }
    else
    {
        //
        // NormULong uses its top bit for inputs with the high bit set, so
        // the result must be halved before it is tagged with 0x8000.
        // OR-ing the tag onto the unshifted value discards that top bit and
        // maps e.g. HighPart 0x80000000 below HighPart 2, breaking the
        // ordering of the normalized keys.
        //
        // NOTE(review): this changes the key produced for values >= 2^32;
        // data indexed with the previous mapping would need re-indexing.
        //
        normVal = NormULong( uliValue.HighPart );
        normVal >>= 1;          // 0x0000-0x7FFF
        normVal |= 0x8000;
    }

    Win4Assert( normVal < 0x10000 );

    return normVal;
}
//+---------------------------------------------------------------------------
//
// Function: NormLargeInteger
//
// Synopsis: Normalizes a large integer to a value between 0x0000-0xFFFF.
//
// -ve Numbers are mapped in the range 0x0000-0x8000.
// +ve numbers are mapped in the range 0x8000-0xFFFF.
//
// Arguments: [liValue] - The value to be normalized. Note that the
// argument is NOT passed by reference. The value is changed
// in this method and so should not be passed by reference.
//
// History: 10-03-95 srikants Created
//
// Notes:
//
//----------------------------------------------------------------------------
static unsigned NormLargeInteger( LARGE_INTEGER liValue )
{
    unsigned normVal;

    if ( liValue.QuadPart < 0 )
    {
        //
        // Negative values are mapped by magnitude and placed at or below
        // MINSHORT.
        //
        // The most negative 64-bit value (HighPart 0x80000000, LowPart 0)
        // has no positive counterpart, so negating it is signed overflow.
        // Its two's-complement negation is the same bit pattern, and
        // MapLargeInteger accepts HighPart == MINLONG, so that one value is
        // passed through unchanged instead of being negated.
        //
        if ( ( 0x80000000u != (unsigned) liValue.HighPart ) || ( 0 != liValue.LowPart ) )
        {
            liValue.QuadPart = -liValue.QuadPart;
        }

        normVal = MINSHORT - MapLargeInteger( liValue );
    }
    else
    {
        //
        // Non-negative values are placed at or above MINSHORT.
        //
        normVal = MINSHORT + MapLargeInteger( liValue );
    }

    Win4Assert( normVal < 0x10000 );

    return normVal;
}
#ifdef TEST_NORM
//
// Debug-only check of NormLong: for each power of two, the bin of 2^k must
// be exactly one step away from the bin of 2^k - 1 (one above for positive
// values, one below for their negations).
//
void TestNormLong()
{
    long     lPow2     = 0;
    unsigned binPow2   = NormLong( lPow2 );

    printf(" Value:Bin 0x%8X : 0x%4X \t(%10d : %10d)\n", lPow2, binPow2, lPow2, binPow2 );

    long     lBelowNext   = 0;
    unsigned binBelowNext = NormLong(1);

    //
    // Positive side: 2, 4, 8, ... until the sign bit is reached.
    //
    for ( lPow2 = 2; !(lPow2 & 0x80000000); lPow2 <<= 1 )
    {
        binPow2 = NormLong( lPow2 );

        Win4Assert( binPow2 == binBelowNext+1 );

        // Largest value below the next power of two.
        lBelowNext   = lPow2 + lPow2-1;
        binBelowNext = NormLong( lBelowNext );
    }

    binBelowNext = NormLong(-1);
    printf(" Value:Bin 0x%8X : 0x%4X \t(%10d : %10d)\n", -1, binBelowNext, -1, binBelowNext );

    //
    // Negative side: -2, -4, -8, ...
    //
    for ( lPow2 = 2; !(lPow2 & 0x80000000); lPow2 <<= 1 )
    {
        binPow2 = NormLong( -lPow2 );

        Win4Assert( binPow2 == binBelowNext-1 );

        lBelowNext   = lPow2 + lPow2-1;
        lBelowNext   = -lBelowNext;
        binBelowNext = NormLong( lBelowNext );
    }
}
#endif // TEST_NORM
//+---------------------------------------------------------------------------
//
// Member: CNormalizer::CNormalizer
//
// Synopsis: constructor for normalizer
//
// Effects: gets buffers from noiselist
//
// Arguments: [nl] -- Noise list object to pass data on to.
//
// History: 05-June-91 t-WadeR Created.
//
// Notes:
//
//----------------------------------------------------------------------------
CNormalizer::CNormalizer( PNoiseList& nl )
        : _noiseList(nl)
{
    SetWordBuffer();

    //
    // The output buffer must be able to hold the longest key: every input
    // character as a WCHAR plus the key prefix.
    //
    Win4Assert( *_pcbOutBuf >= cwcMaxKey * sizeof( WCHAR ) + cbKeyPrefix );
}
//+---------------------------------------------------------------------------
//
// Member: CNormalizer::GetFlags
//
// Synopsis: Returns address of ranking and range flags
//
// Arguments: [ppRange] -- range flag
// [ppRank] -- rank flag
//
// History: 11-Feb-92 BartoszM Created.
//
//----------------------------------------------------------------------------
void CNormalizer::GetFlags ( BOOL** ppRange, CI_RANK** ppRank )
{
    //
    // The request is forwarded unchanged to the noise list.
    //
    _noiseList.GetFlags ( ppRange, ppRank );
}
//+---------------------------------------------------------------------------
//
// Member: CNormalizer::ProcessAltWord, public
//
// Synopsis: Normalizes a UniCode string, passes it to NoiseList.
//
// Effects: Deposits a normalized version [pwcInBuf] in [_pbOutBuf]
//
// Arguments: [pwcInBuf] -- input buffer
// [cwc] -- count of chars in pwcInBuf
//
// History: 03-May-95 SitaramR Created.
//
//----------------------------------------------------------------------------
void CNormalizer::ProcessAltWord( WCHAR const *pwcInBuf, ULONG cwc )
{
    //
    // Move to the next alternate buffer, normalize into it, and record the
    // resulting hash with that buffer.
    //
    SetNextAltBuffer();

    SetAltHash( NormalizeWord( pwcInBuf, cwc ) );
}
//+---------------------------------------------------------------------------
//
// Member: CNormalizer::ProcessWord, public
//
// Synopsis: Normalizes a UniCode string, passes it to NoiseList.
//
// Effects: Deposits a normalized version of [pwcInBuf] in [_pbOutBuf].
//
// Arguments: [pwcInBuf] -- input buffer
// [cwc] -- count of chars in pwcInBuf
//
// History: 05-June-91 t-WadeR Created.
// 13-Oct-92 AmyA Added unicode support
//
//----------------------------------------------------------------------------
void CNormalizer::ProcessWord( WCHAR const *pwcInBuf, ULONG cwc )
{
    if ( UsingAltBuffers() )
    {
        SetNextAltBuffer();
    }

    unsigned const hash = NormalizeWord( pwcInBuf, cwc );

    //
    // Without alternate buffers the word goes straight to the noise list.
    //
    if ( !UsingAltBuffers() )
    {
        _noiseList.PutWord( hash );
        return;
    }

    //
    // With alternate buffers this word closes the group: record its hash
    // and emit every collected form.
    //
    SetAltHash( hash );
    ProcessAllWords();
}
//+---------------------------------------------------------------------------
//
// Member: CNormalizer::ProcessAllWords, private
//
// Synopsis: Removes duplicate alternate words and emits remainder.
//
// History: 17-Sep-1999 KyleP Created.
//
//----------------------------------------------------------------------------
void CNormalizer::ProcessAllWords()
{
    //
    // The index is declared once at function scope.  Both loops below use
    // it; declaring it in the first loop's for-init and reusing it in the
    // second only compiles under pre-standard for-loop scoping.
    //
    unsigned i;

    //
    // Check for duplicate keys.  Since the number of alternate forms will
    // always be quite small it's ok to use a O(n^2) algorithm here.
    // iFinal tracks the last key that survives.
    //
    unsigned iFinal = 0;

    for ( i = 0; i < _cAltKey; i++ )
    {
        //
        // Already marked duplicate?
        //
        if ( 0 == _aAltKey[i].Count() )
            continue;

        iFinal = i;

        for ( unsigned j = i+1; j < _cAltKey; j++ )
        {
            //
            // Remember, Pid is really the hash here.  A later key is a
            // duplicate when hash, length and bytes all match; it is marked
            // by zeroing its count.
            //
            if ( _aAltKey[i].Pid() == _aAltKey[j].Pid() &&
                 _aAltKey[i].Count() == _aAltKey[j].Count() &&
                 RtlEqualMemory( _aAltKey[i].GetBuf(),
                                 _aAltKey[j].GetBuf(),
                                 _aAltKey[j].Count() ) )
            {
                ciDebugOut(( DEB_TRACE, "Duplicate keys: %u and %u\n", i, j ));
                _aAltKey[j].SetCount( 0 );
            }
        }
    }

    //
    // Now transfer any remaining key(s).
    //
    SetWordBuffer();

    // Initialized so the final PutWord never reads an indeterminate value.
    unsigned hash = 0;

    for ( i = 0; i <= iFinal; i++ )
    {
        //
        // Ignore duplicates
        //
        if ( 0 == _aAltKey[i].Count() )
            continue;

        //
        // Copy to the transfer buffer.
        //
        *_pcbOutBuf = _aAltKey[i].Count();
        RtlCopyMemory( _pbOutBuf, _aAltKey[i].GetBuf(), *_pcbOutBuf );
        hash = _aAltKey[i].Pid();

        //
        // If this is not the final "PutWord" call, send the data along.
        //
        if ( i != iFinal )
            _noiseList.PutAltWord( hash );
    }

    //
    // Put the final word
    //
    _noiseList.PutWord( hash );
} //ProcessAllWords
//+---------------------------------------------------------------------------
//
// Member: CNormalizer::NormalizeWord
//
// Synopsis: Normalizes a UniCode string
// Calculates the hash function for normalized string.
//
// Arguments: [pwcInBuf] -- input buffer
// [cwc] -- count of chars in pwcInBuf
//
// Returns: unsigned hash value of string
//
// History: 03-May-95 SitaramR Created.
//
//----------------------------------------------------------------------------
unsigned CNormalizer::NormalizeWord( WCHAR const *pwcInBuf, ULONG cwc )
{
    //
    // Normalize into the currently selected member output buffer.
    //
    unsigned const hash = NormalizeWord( pwcInBuf, cwc, _pbOutBuf, _pcbOutBuf );

    return hash;
}
//+---------------------------------------------------------------------------
//
// Member: CNormalizer::NormalizeWord
//
// Synopsis: Normalizes a UniCode string
// Calculates the hash function for normalized string. This
// function is identical to the other NormalizeWord function,
// except that it puts the outputs in the output parameters
//
// Arguments: [pwcInBuf] -- input buffer
// [cwc] -- count of chars in pwcInBuf
// [pbOutBuf] -- output buffer.
// [pcbOutBuf] - pointer to output count of bytes.
//
// Returns: unsigned hash value of string
//
// History: 03-May-1995 SitaramR Created.
// 03-Oct-2000 KitmanH Added output parameters
//
//----------------------------------------------------------------------------
unsigned CNormalizer::NormalizeWord( WCHAR const *pwcInBuf,
                                     ULONG cwc,
                                     BYTE *pbOutBuf,
                                     unsigned *pcbOutBuf )
{
    //
    // Output layout: the STRING_KEY identifier followed by every upcased
    // character, high byte first.  The byte count covers both.
    //
    *pcbOutBuf = cwc * sizeof(WCHAR) + cbKeyPrefix;

    *pbOutBuf++ = STRING_KEY;

    Win4Assert ( cwc != 0 && cwc <= cwcMaxKey );

    unsigned hash = 0;

    WCHAR const * const pwcEnd = pwcInBuf + cwc;

    while ( pwcInBuf != pwcEnd )
    {
        WCHAR wc = *pwcInBuf++;

        //
        // Upcase.  Characters below 'a' are kept, 'a'..'z' are shifted to
        // 'A'..'Z' directly, and everything else goes through
        // RtlUpcaseUnicodeChar.
        //
        if ( wc >= 'a' )
        {
            if ( wc <= 'z' )
                wc = (WCHAR) ( wc - ('a' - 'A') );
            else
                wc = RtlUpcaseUnicodeChar( wc );
        }

        //
        // Store one byte at a time because the normalized string must be
        // byte compared.
        //
        pbOutBuf[0] = (BYTE)(wc >> 8);
        pbOutBuf[1] = (BYTE)wc;
        pbOutBuf += 2;

        // Fold the character into the hash.
        hash = ( hash << 2 ) + wc;
    }

    return hash;
}
//+---------------------------------------------------------------------------
//
// Member: CNormalizer::NormalizeWstr - Public
//
// Synopsis: Normalizes a UniCode string
//
// Arguments: [pwcInBuf] -- input buffer
// [cwcInBuf] -- count of chars in pwcInBuf
// [pbOutBuf] -- output buffer.
// [pcbOutBuf] - pointer to output count of bytes.
//
// History: 10-Feb-2000 KitmanH Created
//
//----------------------------------------------------------------------------
void CNormalizer::NormalizeWStr( WCHAR const *pwcInBuf,
                                 ULONG cwcInBuf,
                                 BYTE *pbOutBuf,
                                 unsigned *pcbOutBuf )
{
    //
    // Same work as NormalizeWord into caller-supplied buffers; the hash it
    // returns is not needed here.
    //
    (void) NormalizeWord( pwcInBuf, cwcInBuf, pbOutBuf, pcbOutBuf );
}
//+---------------------------------------------------------------------------
//
// Member: CValueNormalizer::CValueNormalizer
//
// Synopsis: Constructor
//
// Arguments: [krep] -- key repository sink for keys
//
// History: 21-Sep-92 BartoszM Created.
//
//----------------------------------------------------------------------------
CValueNormalizer::CValueNormalizer( PKeyRepository& krep )
        : _krep(krep)
{
    //
    // Obtain the size, buffer and occurrence pointers from the key
    // repository.
    //
    _krep.GetBuffers( &_pcbOutBuf, &_pbOutBuf, &_pOcc );

    //
    // Remember the initial size value and start the occurrence at zero.
    //
    _cbMaxOutBuf = *_pcbOutBuf;
    *_pOcc       = 0;
}
//+---------------------------------------------------------------------------
//
// Member: CValueNormalizer::PutValue, public
//
// Synopsis: Store a variant
//
// Arguments: [pid] -- property id
// [occ] -- On input: starting occurrence.
// On output: next starting occurrence.
// [var] -- value
//
// History: 04-Nov-94 KyleP Created.
//
//----------------------------------------------------------------------------
void CValueNormalizer::PutValue( PROPID pid,
                                 OCCURRENCE & occ,
                                 CStorageVariant const & var )
{
    //
    // Seed the shared occurrence counter from the caller's value; it is
    // copied back once the value has been dispatched.
    //
    *_pOcc = occ;

    //
    // Dispatch on the variant type to the matching typed overload.
    //
    switch ( var.Type() )
    {
    case VT_EMPTY:
    case VT_NULL:
        // Nothing to store.
        break;

    case VT_UI1:
        PutValue( pid, var.GetUI1() );
        break;

    case VT_I1:
        PutValue( pid, var.GetI1() );
        break;

    case VT_UI2:
        PutValue( pid, (USHORT) var.GetUI2() );
        break;

    case VT_I2:
        PutValue( pid, var.GetI2() );
        break;

    case VT_I4:
    case VT_INT:
        PutValue( pid, var.GetI4() );
        break;

    case VT_R4:
        PutValue( pid, var.GetR4() );
        break;

    case VT_R8:
        PutValue( pid, var.GetR8() );
        break;

    case VT_UI4:
    case VT_UINT:
        PutValue( pid, var.GetUI4() );
        break;

    case VT_I8:
        PutValue( pid, var.GetI8() );
        break;

    case VT_UI8:
        PutValue( pid, var.GetUI8() );
        break;

    case VT_BOOL:
        // Collapse the boolean to 0 or 1 and store it as a byte.
        PutValue( pid, (BYTE) (FALSE != var.GetBOOL()) );
        break;

    case VT_ERROR:
        PutValue( pid, var.GetERROR() );
        break;

    case VT_CY:
        PutValue( pid, var.GetCY() );
        break;

    case VT_DATE:
        PutDate( pid, var.GetDATE() );
        break;

    case VT_FILETIME:
        PutValue( pid, var.GetFILETIME() );
        break;

    case VT_CLSID:
        PutValue( pid, *var.GetCLSID() );
        break;

    // NTRAID#DB-NTBUG9-84589-2000/07/31-dlee Indexing Service data type normalization doesn't handle VT_DECIMAL, VT_VECTOR, or VT_ARRAY.

    default:
        ciDebugOut(( DEB_IWARN,
                     "Unhandled type %d (%x) sent to normalization\n",
                     var.Type(),
                     var.Type() ));
        break;
    }

    occ = *_pOcc;
}
//+---------------------------------------------------------------------------
//
// Member: CValueNormalizer::PutValue private
//
// Synopsis: Store a unsigned 2 byte value without altering it
//
// Arguments: [pid] -- property id
// [uValue] -- value
// [bType] -- value type
//
// History: 07-Oct-93 DwightKr Created.
//
// Notes: This is the principal PutValue method that other PutValue()s
// will call. Each of the OTHER PutValue()'s sole purpose is
// to normalize their input data into a 2-byte unsigned value.
// This version of PutValue() will store the value together
// with its WID, PID, size, etc. in the CDataRepository object.
//
//----------------------------------------------------------------------------
void CValueNormalizer::PutValue( PROPID pid, unsigned uValue, BYTE bType )
{
    //
    // Entry layout: one byte of key type, the property id in four bytes,
    // then the 16-bit key; multi-byte fields are written most significant
    // byte first.
    //
    *_pcbOutBuf = sizeof(USHORT) + sizeof(PROPID) + 1;

    BYTE * pbCursor = _pbOutBuf;

    // Key type
    *pbCursor++ = bType;

    // Property id
    for ( int cShift = 24; cShift >= 0; cShift -= 8 )
    {
        *pbCursor++ = (BYTE)(pid >> cShift);
    }

    // Key: must already be normalized to 16 bits.
    Win4Assert( uValue < 0x10000 );
    *pbCursor++ = BYTE (uValue >> 8);
    *pbCursor++ = BYTE (uValue);

#if CIDBG == 1
    // Dump the finished entry as hex bytes.
    for (unsigned i = 0; i < *_pcbOutBuf; i++ )
    {
        ciDebugOut (( DEB_USER1 | DEB_NOCOMPNAME, "%02x ", _pbOutBuf[i] ));
    }
    ciDebugOut (( DEB_USER1 | DEB_NOCOMPNAME, "\n" ));
#endif

    //
    // Hand the entry to the key repository and advance the occurrence.
    //
    _krep.PutPropId(pid);
    _krep.PutKey();
    ++(*_pOcc);
}
void CValueNormalizer::PutMinValue( PROPID pid, OCCURRENCE & occ, VARENUM Type )
{
    // Smallest normalized key.
    unsigned const uLowestKey = 0;

    *_pOcc = occ;
    PutValue( pid, uLowestKey, Type );
    occ = *_pOcc;
}
void CValueNormalizer::PutMaxValue( PROPID pid, OCCURRENCE & occ, VARENUM Type )
{
    // Largest normalized key.
    unsigned const uHighestKey = 0xFFFF;

    *_pOcc = occ;
    PutValue( pid, uHighestKey, Type );
    occ = *_pOcc;
}
//+---------------------------------------------------------------------------
//
// Member: CValueNormalizer::PutValue public
//
// Synopsis: Store a 1 byte value without altering it
//
// Arguments: [pid] -- property id
// [byte] -- value
//
// History: 25-Oct-93 DwightKr Created.
//
// Notes: One byte values are NOT normalized, they are stored as is.
//
//----------------------------------------------------------------------------
void CValueNormalizer::PutValue( PROPID pid, BYTE byte )
{
    // The byte is used as the key as is.
    unsigned const uKey = byte;

    PutValue( pid, uKey, VT_UI1 );
}
//+---------------------------------------------------------------------------
//
// Member: CValueNormalizer::PutValue public
//
// Synopsis: Store a 1 byte signed value, biased by 0x80 so that -128 maps
// to 0x00 and 127 maps to 0xFF
//
// Arguments: [pid] -- property id
// [ch] -- value
//
// History: 25-Oct-1993 DwightKr Created.
// 29-Sep-2000 KitmanH Normalize VT_I1 values
//
//----------------------------------------------------------------------------
void CValueNormalizer::PutValue( PROPID pid, CHAR ch )
{
    //
    // Bias by 0x80 (modulo 256) so that -128 becomes 0x00 and 127 becomes
    // 0xFF.
    //
    unsigned const uBiased = ( ((BYTE) ch) + 0x80 ) & 0xFF;

    PutValue( pid, uBiased, VT_I1 );
}
//+---------------------------------------------------------------------------
//
// Member: CValueNormalizer::PutValue
//
// Synopsis: Store the high byte of an unsigned 2 byte value
//
// Arguments: [pid] -- property id
// [usValue] -- value
//
// History: 07-Oct-93 DwightKr Created.
//
//----------------------------------------------------------------------------
void CValueNormalizer::PutValue( PROPID pid, USHORT usValue )
{
    // Only the high byte is kept as the key.
    unsigned const uHighByte = (usValue >> 8) & 0xFF;

    PutValue( pid, uHighByte, VT_UI2 );
}
//+---------------------------------------------------------------------------
//
// Member: CValueNormalizer::PutValue public
//
// Synopsis: Store the high byte of a signed 2 byte value.
//
// Arguments: [pid] -- property id
// [sValue] -- value
//
// Notes: Add the smallest BYTE to this so that we translate numbers
// into the range above 0. i.e. -32768 maps into 0x00, and 32767
// maps into 0xFF.
//
// History: 07-Oct-93 DwightKr Created.
//
//----------------------------------------------------------------------------
void CValueNormalizer::PutValue( PROPID pid, SHORT sValue )
{
    //
    // Take the high byte and bias it by 0x80 so that -32768 maps to 0x00
    // and 32767 maps to 0xFF.
    //
    unsigned const uBiasedHigh = ((sValue >> 8) + 0x80) & 0xFF;

    PutValue( pid, uBiasedHigh, VT_I2 );
}
//+---------------------------------------------------------------------------
//
// Member: CValueNormalizer::PutValue public
//
// Synopsis: Store the 16-bit normalized form of the ULONG value.
//
// Arguments: [pid] -- property id
// [ulValue] -- value
//
// Notes: NormULong compresses the ULONG into the range 0x0000 - 0xFFFF;
// the top bit of the result mirrors the top bit of the value.
//
// History: 07-Oct-93 DwightKr Created.
//
//----------------------------------------------------------------------------
void CValueNormalizer::PutValue( PROPID pid, ULONG ulValue )
{
    // Compress the value to a 16-bit key.
    unsigned const uBin = NormULong ( ulValue );

    PutValue( pid, uBin, VT_UI4 );
}
//+---------------------------------------------------------------------------
//
// Member: CValueNormalizer::PutValue
//
// Synopsis: Store the 16-bit normalized form of the signed LONG value.
//
// Arguments: [pid] -- property id
// [lValue] -- value
//
// Notes: NormLong compresses the LONG into the range 0x0000 - 0xFFFF.
// Negative numbers end up at or below 0x8000, numbers >= 0 at or
// above 0x8000.
//
// History: 07-Oct-93 DwightKr Created.
//
//----------------------------------------------------------------------------
void CValueNormalizer::PutValue( PROPID pid, LONG lValue )
{
    // Compress the signed value to a 16-bit key.
    unsigned const uBin = NormLong( lValue );

    PutValue( pid, uBin, VT_I4 );
}
//+---------------------------------------------------------------------------
//
// Member: CValueNormalizer::PutValue
//
// Synopsis: Store the NormDouble bin of the FLOAT value.
//
// Arguments: [pid] -- property id
// [rValue] -- value
//
// Notes: The float is widened to a double and binned by sign and base-2
// exponent; NormDouble uses bins 0 - 132.
//
// History: 07-Oct-93 DwightKr Created.
//
//----------------------------------------------------------------------------
void CValueNormalizer::PutValue( PROPID pid, float rValue )
{
    // The float is widened to double and binned like any other double.
    double const dWidened = rValue;
    unsigned const uBin   = NormDouble( dWidened );

    PutValue( pid, uBin, VT_R4 );
}
//+---------------------------------------------------------------------------
//
// Member: CValueNormalizer::PutValue
//
// Synopsis: Store the NormDouble bin of the DOUBLE value.
//
// Arguments: [pid] -- property id
// [dValue] -- value
//
// Notes: Doubles are binned by sign and base-2 exponent; NormDouble uses
// bins 0 - 132.
//
// History: 07-Oct-93 DwightKr Created.
//
//----------------------------------------------------------------------------
void CValueNormalizer::PutValue( PROPID pid, double dValue )
{
    // Bin the double by sign and exponent.
    unsigned const uBin = NormDouble( dValue );

    PutValue( pid, uBin, VT_R8 );
}
//+---------------------------------------------------------------------------
//
// Member: CValueNormalizer::PutValue
//
// Synopsis: Store the 16-bit normalized form of a signed large integer
//
// Arguments: [pid] -- property id
// [li] -- value
//
// History: 21-Sep-92 BartoszM Created.
// 04-Feb-93 KyleP Use LARGE_INTEGER
// 25-Oct-92 DwightKr Copied here & removed extra code &
// accounted for negative numbers
//
//----------------------------------------------------------------------------
void CValueNormalizer::PutValue( PROPID pid, LARGE_INTEGER liValue )
{
    //
    // liValue is a by-value copy, which NormLargeInteger may modify while
    // compressing it to a 16-bit key.
    //
    PutValue( pid, NormLargeInteger( liValue ), VT_I8 );
}
//+---------------------------------------------------------------------------
//
// Member: CValueNormalizer::PutValue
//
// Synopsis: Store a compressed large integer
//
// Arguments: [pid] -- property id
// [uli] -- value
//
// History: 09 Feb 96 AlanW Created.
//
//----------------------------------------------------------------------------
void CValueNormalizer::PutValue( PROPID pid, ULARGE_INTEGER uliValue )
{
    // Compress the unsigned 64-bit value to a 16-bit key.
    PutValue( pid, NormULargeInteger( uliValue ), VT_UI8 );
}
//+---------------------------------------------------------------------------
//
// Member: CValueNormalizer::PutValue
//
// Synopsis: Store the low 16 bits of the first DWORD (Data1) of a GUID
//
// Arguments: [pid] -- property id
// [guid] -- value
//
// Notes: The GUID generators are guaranteed to modify the TOP DWORD
// of the 16-byte GUID each time a new GUID is generated.
// The lower bytes of the GUID is the network address of the
// card which generated the UUID.
//
// We would like to cluster together objects of a single
// class (all MS-Word objects together for example). Since it
// is possible that someone could generate UUIDs for more than
// one application on a single machine, the lower portion of
// the UUID will perhaps remain constant between class IDs. The
// only part of the UUID which is guaranteed to be unique between
// multiple objects is the field which represents time. It is
// unlikely that two classes were generated the same second on
// two different machines.
//
// History: 25-Oct-93 DwightKr Created.
//
//----------------------------------------------------------------------------
void CValueNormalizer::PutValue( PROPID pid, GUID const & Guid )
{
    // Only the low 16 bits of Data1 are used as the key.
    unsigned const uLowWord = Guid.Data1 & 0xFFFF;

    PutValue( pid, uLowWord, VT_CLSID );
}
long CastToLong( double d ) { //
// bit 63 = sign
// bits 52 - 62 = exponent
// bits 0 - 51 = mantissa
//
LARGE_INTEGER * pli = (LARGE_INTEGER *)&d;
int exp = (pli->HighPart & 0x7ff00000) >> 20;
if ( exp == 0 ) { //
// Special case: Zero, NaNs, etc.
//
return( 0 ); }
//
// Subtract off bias
//
exp -= 0x3ff;
if ( exp < 0 ) { // Cast of very small number to unsigned long. Loss of precision
return( 0 ); } else if ( exp > 30 ) { // Cast of very large number to unsigned long. Overflow
if ( pli->HighPart & 0x80000000 ) return( LONG_MIN ); else return( LONG_MAX ); } else { //
// We need to get the top 32 bits of the mantissa
// into a dword.
//
unsigned long temp = pli->LowPart >> (32 - 12); temp |= pli->HighPart << (32 - 20);
//
// Add the 'hidden' bit of the mantissa. (Since all doubles
// are normalized to 1.?????? the highest 1 bit isn't stored)
//
temp = temp >> 1; temp |= 0x80000000;
//
// Thow away digits to the right of decimal
//
temp = temp >> (31 - exp);
//
// Adjust for sign
//
Win4Assert( (temp & 0x80000000) == 0 ); long temp2;
if ( pli->HighPart & 0x80000000 ) temp2 = temp * -1; else temp2 = temp;
return( temp2 ); } } //CastToLong
//+---------------------------------------------------------------------------
//
// Member: CValueNormalizer::PutDate
//
// Synopsis: Dates are passed in as the number of days (and fractional days)
// since Jan. 1, 1900. We'll crunch this down to the number of
// weeks. Dates are passed in a doubles. We'll assume that
// negative numbers represent dates before Jan. 1, 1900.
//
// Arguments: [pid] -- property id
// [DATE] -- value (double)
//
// Notes: Since dates before Jan 1, 1900 are passed as negative numbers
// we'll need to normalize them to something >= 0.
//
// time period resolution # bins
// =========================== =============== ======
// year < 10Bil BC -- bin = 0 1
// 10Bil BC <= year <= 1 BC -- log10 (year) 11
// 1 BC < year <= 1900 -- year 1902
// 1901 AD <= year <= 2050 AD -- daily 54787
// 2051 AD <= year <= 10Bil AD -- log10 (year) 8
// year > 10Bil AD -- bin = 0xFFFF 1
//
//
// I choose the daily range from 1901 - 2050 since there are a lot
// of events in the 20th century (WW I, WW II, landing on the
// moon, my wife's birthday, etc.) that are interesting, and
// important. It is likely that dates outside of this range will
// be rounded to the nearest year (1492, 1776, 1812, 1867, etc).
//
// Also by breaking the log10(year) at 1 BC rather than some other
// date (such as 0000 AD, or 1 AD) we avoid values in the range
// 1 BC < year < 1 AD, calculating log10(year) resulting in
// large negative numbers. Everything in this range should be in
// bin #12. It also avoids taking log10(0).
//
//
// History: 25-Oct-93 DwightKr Created.
// 07-Dec-94 KyleP Remove use of floating point
//
//----------------------------------------------------------------------------
void CValueNormalizer::PutDate( PROPID pid, DATE const & Date )
{
    //
    //  Range limits.  Those named Min*/Max* (except MaxDaily) are binary
    //  exponents and are compared against the unbiased exponent of Date
    //  as returned by GetExpAndSign.
    //
    const int MinDate     = 42;  // 2^42 --> ~4.4E12 days --> ~12E9 years --> 12 billion B.C.
    const int MinByYear   = 20;  // 2^20 --> ~1.0E6 days --> ~2.9E3 years --> 970 B.C.
    const int cMinByYear  = (1 << MinByYear) / 365 + 1;   // 2873
    const int MaxDaily    = (2051 - 1900) * 365;          // 55115
    const int MinByYearAD = 15;  // 2^15 = 32768 <= MaxDaily < 2^16
    const int MaxDate     = 42;  // 2^42 --> ~4.4E12 days --> ~12E9 years --> 12 billion A.D.

    //
    //  Bin layout, in increasing date order.
    //
    const unsigned FirstBC     = 0;
    const unsigned FirstLogBC  = FirstBC + 1;                           // 1
    const unsigned LastLogBC   = FirstLogBC + MinDate - MinByYear;      // 23
    const unsigned FirstYearBC = LastLogBC + 1;                         // 24
    const unsigned LastYearBC  = FirstYearBC + cMinByYear;              // 2897
    const unsigned FirstDaily  = LastYearBC + 1;                        // 2898
    const unsigned LastDaily   = FirstDaily + MaxDaily;                 // 58013
    const unsigned FirstLogAD  = LastDaily + 1;                         // 58014
    const unsigned LastLogAD   = FirstLogAD + MaxDate - MinByYearAD;    // 58041
    const unsigned LastAD      = 0xFFFF;

    Win4Assert( LastLogAD < 0xFFFF );

    unsigned bin;
    BOOL fPositive;
    int exp = GetExpAndSign( Date, fPositive );

    if ( !fPositive )
    {
        //
        //  Very large negative dates go in first bin
        //
        if ( exp >= MinDate )
        {
            bin = FirstBC;
        }

        //
        //  Medium size negative dates get 1 bin / power of 2.  A larger
        //  exponent means a date further in the past, so it must map to a
        //  smaller bin.  The range is anchored at LastLogBC (mirroring the
        //  A.D. case, which is anchored at FirstLogAD) so that exponents
        //  MinByYear .. MinDate-1 land on LastLogBC .. FirstLogBC+1.
        //  Anchoring at FirstLogBC would make the unsigned result wrap
        //  below zero for every exponent above MinByYear + 1.
        //
        else if ( exp >= MinByYear )
        {
            bin = LastLogBC - (unsigned)(exp - MinByYear);

            Win4Assert( bin >= FirstLogBC && bin <= LastLogBC );
        }

        //
        //  All other dates before 1900 get 1 bucket per 365 days.
        //
        else
        {
            long cYears = CastToLong( Date ) / 365;

            Win4Assert( cYears >= -cMinByYear && cYears <= 0 );

            bin = FirstYearBC + cYears + cMinByYear;
        }
    }
    else
    {
        //
        //  Very large positive dates go in last bin
        //
        if ( exp >= MaxDate )
        {
            bin = LastAD;
        }
        else
        {
            long cDays = CastToLong( Date );

            //
            //  Dates rather far in the future get 1 bucket / power of 2
            //
            if ( cDays >= MaxDaily )
                bin = FirstLogAD + exp - MinByYearAD;

            //
            //  Days close to today get 1 bucket per day
            //
            else
                bin = FirstDaily + cDays;
        }
    }

    PutValue( pid, bin, VT_DATE );
} //PutDate
//+---------------------------------------------------------------------------
//
// Member: CValueNormalizer::PutValue
//
// Synopsis: Store the hashed value of an 8-byte currency.
//
// Arguments: [pid] -- property id
// [cyValue] -- value
//
// Notes: Currency values are stored as a ULONG cents, and a LONG $.
// We'll ignore the cents portion and store the $ part using
// the standard LONG storage method.
//
// History: 26-Oct-93 DwightKr Created.
//
//----------------------------------------------------------------------------
void CValueNormalizer::PutValue( PROPID pid, CURRENCY const & cyValue )
{
    //
    //  Only the Hi member of the currency takes part in the key: it is
    //  run through NormLong and the result is stored tagged as VT_CY.
    //
    PutValue( pid,
              NormLong( cyValue.Hi ),
              VT_CY );
}
//+---------------------------------------------------------------------------
//
// Member: CValueNormalizer::PutValue
//
// Synopsis: Store the number of days since Jan 1, 1980;
//
// Arguments: [pid] -- property id
// [ulValue] -- value
//
// History: 07-Oct-93 DwightKr Created.
//
// Notes: This algorithm calculates the number of days since Jan 1,
// 1980; and stores it into an unsigned. FileTimes are divided
// into the following ranges:
//
// FileTime < 1980 => bin 0
// 1980 <= FileTime <= 1993 week granularity => bins 1 - 729
// 1994 <= FileTime <= 2160 day granularity => bins 730+
// FileTime > 2160 => bin 0xFFFF
//
//----------------------------------------------------------------------------
void CValueNormalizer::PutValue( PROPID pid, FILETIME const & ftValue )
{
    //
    //  The file time is first reduced to a whole number of days since
    //  Jan 1, 1601 by dividing the 64-bit tick count by the number of
    //  100 nanosecond intervals in a day.  The quotient fits in a ULONG.
    //
    //  The day count is then mapped to a bin by comparing it against
    //  precomputed day numbers for the range breakpoints.
    //
    const ULONGLONG cTicksPerDay = (ULONGLONG)10000000 * 60 * 60 * 24;

    const ULONG dayStart  = 138426;     // number of days from 1601 to 1980
    const ULONG dayMiddle = 143542;     // number of days from 1601 to 1/2/1994
    const ULONG dayEnd    = 204535;     // number of days from 1601 to 2161

    ULARGE_INTEGER ticks;
    ticks.LowPart  = ftValue.dwLowDateTime;
    ticks.HighPart = ftValue.dwHighDateTime;

    const ULONG cDays = (ULONG) (ticks.QuadPart / cTicksPerDay);

    //
    //  Before the start of the range: everything shares bin 0.
    //
    if ( cDays < dayStart )
    {
        PutValue( pid, 0, VT_FILETIME );
        return;
    }

    //
    //  dayStart .. dayMiddle: one bin per 7 days.  With the constants
    //  above this produces bins 0 through 731.
    //
    if ( cDays <= dayMiddle )
    {
        PutValue( pid, (cDays + 1 - dayStart) / 7, VT_FILETIME );
        return;
    }

    //
    //  dayMiddle+1 .. dayEnd: one bin per day, offset by the number of
    //  whole weeks in the weekly range.  With the constants above the
    //  first daily bin is 732.
    //
    if ( cDays <= dayEnd )
    {
        PutValue( pid,
                  (cDays + 1 - dayMiddle) + ((dayMiddle - dayStart) / 7),
                  VT_FILETIME );
        return;
    }

    //
    //  Past the end of the range: everything shares bin 0xFFFF.
    //
    PutValue( pid, 0xFFFF, VT_FILETIME );
}
|