//+--------------------------------------------------------------------------- // // Microsoft Windows // Copyright (C) Microsoft Corporation, 1991 - 2000. // // File: NORM.CXX // // Contents: Normalizer // // Classes: CNormalizer // // History: 28-May-91 t-WadeR added CNormalizer // 31-Jan-92 BartoszM Created from lang.cxx // 07-Oct-93 DwightKr Added new methods to normalize // different data types // // Notes: The filtering pipeline is hidden in the Data Repository // object which serves as a sink for the filter. // The sink for the Data Repository is the Key Repository. // The language dependent part of the pipeline // is obtained from the Language List object and is called // Key Maker. It consists of: // // Word Breaker // Stemmer (optional) // Normalizer // Noise List // // Each object serves as a sink for its predecessor, // Key Repository is the final sink. // //---------------------------------------------------------------------------- #include #pragma hdrstop #include #include #include //+--------------------------------------------------------------------------- // // Function GetExpAndSign // // Synopsis: Finds the exponent and sign of a number // // Arguments: [d] -- the input number to examine // [fPositive] -- returns TRUE if positive, FALSE if negative // // Returns: The exponent // // History: 21-Nov-94 KyleP Created. 
// //---------------------------------------------------------------------------- int GetExpAndSign( double d, BOOL & fPositive ) { // // bit 63 = sign // bits 52 - 62 = exponent // bits 0 - 51 = mantissa // Win4Assert( sizeof(LARGE_INTEGER) == sizeof(double) ); LARGE_INTEGER * pli = (LARGE_INTEGER *)&d; fPositive = (pli->HighPart & 0x80000000) == 0; int const bias = 0x3ff; return ( ( pli->HighPart & 0x7ff00000 ) >> 20 ) - bias; } //GetExpAndSign //+--------------------------------------------------------------------------- // // Function NormDouble // // Synopsis: Normalizes doubles by taking log2 of the number // // Notes: This func converts doubles into one of 5 different categories // // x < -1x2**32 is in bin 0 // -1x2**32 <= x <= -1x2**-32 are in bins 1 to 65 // -1x2**-32 <= x <= 1x2**-32 is in bin 66 // 1x2**-32 <= x <= 1x2**32 are in bins 67 to 131 // x > 1x2**32 is bin bin 132 // // History: 21-Nov-94 KyleP Created. // //---------------------------------------------------------------------------- static unsigned NormDouble(double dValue) { const int SignificantExponent = 32; const int SignificantRange = SignificantExponent * 2; const unsigned LowestBin = 0; // 0 const unsigned LowerBin = LowestBin + 1; // 1 const unsigned MiddleBin = LowerBin + SignificantRange + 1; // 66 const unsigned UpperBin = MiddleBin + 1; // 67 const unsigned HighestBin = UpperBin+ SignificantRange + 1; // 132 BOOL fPositive; int exp = GetExpAndSign( dValue, fPositive ); unsigned bin; if ( exp < -SignificantExponent ) { // // All numbers close to zero in middle bin // bin = MiddleBin; } else if ( exp > SignificantExponent ) { if ( fPositive ) { // // Very large positive numbers in top bin // bin = HighestBin; } else { // // Very large negative numbers in bottom bin // bin = LowestBin; } } else { if ( fPositive ) { // // medium size positive numbers // bin = UpperBin + exp + SignificantExponent; } else { // // medium size negative numbers // bin = LowerBin - exp + 
SignificantExponent; } } return bin; } #ifdef TEST_NORM // // a test to verify the validity of the NormDouble function. // void TestNormDouble() { float fVal0 = 0.; float fVal1 = 1.; unsigned nZero = NormDouble( fVal0 ); unsigned nOne = NormDouble( fVal1 ); printf(" Value:Bin %f : 0x%4X (%d)\n", fVal0, nZero, nZero ); printf(" Value:Bin %f : 0x%4X (%d)\n", fVal1, nOne, nOne ); BOOL fPos; float f = fVal1; unsigned nPrev = nOne; while ( f > fVal0 ) { unsigned nVal = NormDouble( f ); if (nVal > nPrev || nVal < nZero || nVal > nOne) { printf(" Value:Bin %f : 0x%4X (%d)\tExp %d\n", f, nVal, nVal, GetExpAndSign(f, fPos) ); } nPrev = nVal; f = f/3; } f = fVal1; nPrev = nOne; while ( f < 1e+32 ) { unsigned nVal = NormDouble( f ); if (nVal < nPrev) printf(" Value:Bin %f : 0x%4X (%d)\n", f, nVal, nVal ); nPrev = nVal; f = f * (float)1.5; } float fValm1 = -1.; unsigned nMinusOne = NormDouble( fValm1 ); printf(" Value:Bin %f : 0x%4X (%d)\n", fValm1, nMinusOne, nMinusOne ); f = fValm1; nPrev = nMinusOne; while ( f < fVal0 ) { unsigned nVal = NormDouble( f ); if (nVal < nPrev || nVal > nZero || nVal < nMinusOne) printf(" Value:Bin %f : 0x%4X (%d)\tExp %d\n", f, nVal, nVal, GetExpAndSign(f, fPos) ); nPrev = nVal; f = f/3; } f = fValm1; nPrev = nMinusOne; while ( f > -1e+32 ) { unsigned nVal = NormDouble( f ); if (nVal > nPrev) printf(" Value:Bin %f : 0x%4X (%d)\n", f, nVal, nVal ); nPrev = nVal; f = f * (float)1.5; } } #endif // 0 // ------------------------------------------------------------------------ // | Upper Limit | Divisor (2^x) | # of Bins | (in hex) | // ------------------------------------------------------------------------ // | 2^10 - 1 | 2^0 | 2^10 - 0 | 0400 - 0000 | // | 2^16 - 1 | 2^3 | 2^12 - 2^7 | 2000 - 0080 | // | 2^20 - 1 | 2^6 | 2^14 - 2^10 | 4000 - 0400 | // | 2^26 - 1 | 2^13 | 2^13 - 2^7 | 2000 - 0080 | // | 2^30 - 1 | 2^23 | 2^7 - 2^3 | 0080 - 0008 | // | 2^31 - 1 | 2^25 | 2^6 - 2^5 | 0040 - 0020 | // 
------------------------------------------------------------------------ // | Total | | | 84C0 - 04D8 | // | | | | 7FE8 | // ------------------------------------------------------------------------ const long limit1 = 0x400; const long shift1 = 0; const long cbins1 = 0x400; const long limit2 = 0x10000; // 2^16 const long shift2 = 3; const long cSkip1 = limit1 >> shift2; const long cbins2 = (limit2 >> shift2)-cSkip1; const long limit3 = 0x100000; // 2^20 const long shift3 = 6; const long cSkip2 = limit2 >> shift3; const long cbins3 = (limit3 >> shift3) - cSkip2; const long limit4 = 0x4000000; // 2^26 const long shift4 = 13; const long cSkip3 = limit3 >> shift4; const long cbins4 = (limit4 >> shift4) - cSkip3; const long limit5 = 0x40000000; // 2^30 const long shift5 = 23; const long cSkip4 = limit4 >> shift5; const long cbins5 = (limit5 >> shift5) - cSkip4; const long limit6 = MINLONG; // 2^31 const long shift6 = 25; const long cSkip5 = limit5 >> shift6; const long cbins6 = ((long) ((unsigned) limit6 >> shift6)) - cSkip5; static unsigned MapLong( LONG lValue ) { Win4Assert( !(lValue & MINLONG) || ( MINLONG == lValue ) ); #if CIDBG==1 const long cTotal = cbins1 + cbins2 + cbins3 + cbins4 + cbins5 + cbins6; Win4Assert( cTotal <= MINSHORT ); #endif // CIDBG == 1 unsigned ulValue = (unsigned) lValue; unsigned binNum = (unsigned) lValue;; if ( ulValue < limit1 ) { // // Nothing to do. 
// } else if ( ulValue < limit2 ) { binNum = cbins1 - cSkip1 + (ulValue >> shift2); } else if ( ulValue < limit3 ) { binNum = cbins1 + cbins2 - cSkip2 + (binNum >> shift3); } else if ( ulValue < limit4 ) { binNum = cbins1 + cbins2 + cbins3 - cSkip3 + (binNum >> shift4); } else if ( ulValue < limit5 ) { binNum = cbins1 + cbins2 + cbins3 + cbins4 - cSkip4 + (binNum >> shift5); } else { binNum = cbins1 + cbins2 + cbins3 + cbins4 + cbins5 - cSkip5 + (binNum >> shift6); } return binNum; } //+--------------------------------------------------------------------------- // // Function: NormLong // // Synopsis: Normalizes the given "signed" long value to a value between // 0x0000 - 0xFFFF. The negative numbers occupy 0x0000-0x8000. // Positive numbers occupy 0x8000-0xFFFF // // Arguments: [lValue] - The value to be normalized. // // History: 10-03-95 srikants Created // // Notes: // //---------------------------------------------------------------------------- static unsigned NormLong(LONG lValue) { if (lValue >= 0) { return MapLong(lValue) + MINSHORT; } else { return MINSHORT - MapLong(-lValue); } } //+--------------------------------------------------------------------------- // // Function: NormULong // // Synopsis: Normalizes an "unsigned" long value to a value between // 0x0000-0xFFFF. Numbers from 0-2^31 - 1 are mapped in the // range 0x0000-0x7FFF. Numbers 2^31 to 2^32 - 1 are mapped // in the range 0x8000 - 0xFFFF // // Arguments: [lValue] - The value to be mapped. 
// // History: 10-03-95 srikants Created // // Notes: // //---------------------------------------------------------------------------- static unsigned NormULong( ULONG lValue ) { unsigned val = MapLong( lValue & ~MINLONG ); // turn off the high bit Win4Assert( !(val & MINSHORT) ); if ( lValue & MINLONG ) val |= MINSHORT; return val; } //+--------------------------------------------------------------------------- // // Function: MapLargeInteger // // Synopsis: Maps a LargeInteger to a number between 0x0000-0x7FFF. // // Numbers with the "HighPart" = 0 are mapped in the range // 0x0000-0x3FFF. When the HighPart !=0, the values are // mapped to 0x4000 - 0x7FFF // // Arguments: [liValue] - The value to be mapped. // // History: 10-03-95 srikants Created // // Notes: // //---------------------------------------------------------------------------- static unsigned MapLargeInteger( LARGE_INTEGER & liValue ) { Win4Assert( !(liValue.HighPart & MINLONG) || ( MINLONG == liValue.HighPart ) ); unsigned normVal; if ( 0 == liValue.HighPart ) { normVal = NormULong( liValue.LowPart ); normVal >>= 2; } else { normVal = MapLong( liValue.HighPart ); // 0x0000-0x7FFF normVal >>= 1; normVal |= 0x4000; } Win4Assert( normVal < 0x8000 ); return normVal; } //+--------------------------------------------------------------------------- // // Function: NormULargeInteger // // Synopsis: Normalizes an unsigned LargeInteger to a number between // 0x0000-0xFFFF. // // Numbers with the "HighPart" = 0 are mapped in the range // 0x0000-0x7FFF. When the HighPart !=0, the values are // mapped to 0x8000 - 0xFFFF. // // Arguments: [uliValue] - The value to be mapped. 
// // History: 02-09-96 Alanw Created // // Notes: // //---------------------------------------------------------------------------- static unsigned NormULargeInteger( ULARGE_INTEGER & uliValue ) { unsigned normVal; if ( 0 == uliValue.HighPart ) { normVal = NormULong( uliValue.LowPart ); normVal >>= 1; } else { normVal = NormULong( uliValue.HighPart ); // 0x0000-0x7FFF normVal |= 0x8000; } Win4Assert( normVal < 0x10000 ); return normVal; } //+--------------------------------------------------------------------------- // // Function: NormLargeInteger // // Synopsis: Normalizes a large integer to a value between 0x0000-0xFFFF. // // -ve Numbers are mapped in the range 0x0000-0x8000. // +ve numbers are mapped in the range 0x8000-0xFFFF. // // Arguments: [liValue] - The value to be normalized. Note that the // argument is NOT passed by reference. The value is changed // in this method and so should not be passed by reference. // // History: 10-03-95 srikants Created // // Notes: // //---------------------------------------------------------------------------- static unsigned NormLargeInteger( LARGE_INTEGER liValue ) { unsigned normVal; if ( liValue.QuadPart < 0 ) { liValue.QuadPart = -liValue.QuadPart; normVal = MINSHORT - MapLargeInteger( liValue ); } else { normVal = MINSHORT + MapLargeInteger( liValue ); } Win4Assert( normVal < 0x10000 ); return normVal; } #ifdef TEST_NORM // // a test to verify the validity of the NormLong function. 
// void TestNormLong() { long lVal1 = 0; unsigned nVal1 = NormLong( lVal1 ); printf(" Value:Bin 0x%8X : 0x%4X \t(%10d : %10d)\n", lVal1, nVal1, lVal1, nVal1 ); lVal1 = 2; long lVal2 = 0; unsigned nVal2 = NormLong(1); while ( !(lVal1 & 0x80000000) ) { nVal1 = NormLong( lVal1 ); //printf(" Value:Bin 0x%8X : 0x%4X \t(%10d : %10d)\n", lVal1, nVal1, lVal1, nVal1 ); Win4Assert( nVal1 == nVal2+1 ); lVal2 = lVal1 + lVal1-1; nVal2 = NormLong( lVal2 ); //printf(" Value:Bin 0x%8X : 0x%4X \t(%10d : %10d)\n", lVal2, nVal2, lVal2, nVal2 ); lVal1 <<= 1; } lVal1 = 2; nVal2 = NormLong(-1); printf(" Value:Bin 0x%8X : 0x%4X \t(%10d : %10d)\n", -1, nVal2, -1, nVal2 ); while ( !(lVal1 & 0x80000000) ) { nVal1 = NormLong( -lVal1 ); //printf(" Value:Bin 0x%8X : 0x%4X \t(%10d : %10d)\n", -lVal1, nVal1, -lVal1, nVal1 ); Win4Assert( nVal1 == nVal2-1 ); lVal2 = lVal1 + lVal1-1; lVal2 = -lVal2; nVal2 = NormLong( lVal2 ); //printf(" Value:Bin 0x%8X : 0x%4X \t(%10d : %10d)\n", lVal2, nVal2, lVal2, nVal2 ); lVal1 <<= 1; } } #endif // 0 //+--------------------------------------------------------------------------- // // Member: CNormalizer::CNormalizer // // Synopsis: constructor for normalizer // // Effects: gets buffers from noiselist // // Arguments: [nl] -- Noise list object to pass data on to. // // History: 05-June-91 t-WadeR Created. // // Notes: // //---------------------------------------------------------------------------- CNormalizer::CNormalizer( PNoiseList& nl ) : _noiseList(nl) { SetWordBuffer(); // check that input size + prefix fits in the output buffer Win4Assert( cwcMaxKey * sizeof( WCHAR ) + cbKeyPrefix <= *_pcbOutBuf ); } //+--------------------------------------------------------------------------- // // Member: CNormalizer::GetFlags // // Synopsis: Returns address of ranking and range flags // // Arguments: [ppRange] -- range flag // [ppRank] -- rank flag // // History: 11-Fab-92 BartoszM Created. 
// //---------------------------------------------------------------------------- void CNormalizer::GetFlags ( BOOL** ppRange, CI_RANK** ppRank ) { _noiseList.GetFlags ( ppRange, ppRank ); } //+--------------------------------------------------------------------------- // // Member: CNormalizer::ProcessAltWord, public // // Synopsis: Normalizes a UniCode string, passes it to NoiseList. // // Effects: Deposits a normalized version [pwcInBuf] in [_pbOutBuf] // // Arguments: [pwcInBuf] -- input buffer // [cwc] -- count of chars in pwcInBuf // // History: 03-May-95 SitaramR Created. // //---------------------------------------------------------------------------- void CNormalizer::ProcessAltWord( WCHAR const *pwcInBuf, ULONG cwc ) { SetNextAltBuffer(); unsigned hash = NormalizeWord( pwcInBuf, cwc ); SetAltHash( hash ); } //+--------------------------------------------------------------------------- // // Member: CNormalizer::ProcessWord, public // // Synopsis: Normalizes a UniCode string, passes it to NoiseList. // // Effects: Deposits a normalized version of [pwcInBuf] in [_pbOutBuf]. // // Arguments: [pwcInBuf] -- input buffer // [cwc] -- count of chars in pwcInBuf // // History: 05-June-91 t-WadeR Created. // 13-Oct-92 AmyA Added unicode support // //---------------------------------------------------------------------------- void CNormalizer::ProcessWord( WCHAR const *pwcInBuf, ULONG cwc ) { if ( UsingAltBuffers() ) SetNextAltBuffer(); unsigned hash = NormalizeWord( pwcInBuf, cwc ); if ( UsingAltBuffers() ) { SetAltHash( hash ); ProcessAllWords(); } else _noiseList.PutWord( hash ); } //+--------------------------------------------------------------------------- // // Member: CNormalizer::ProcessAllWords, private // // Synopsis: Removes duplicate alternate words and emits remainder. // // History: 17-Sep-1999 KyleP Created. 
// //---------------------------------------------------------------------------- void CNormalizer::ProcessAllWords() { // // Check for duplicate keys. Since the number of alternate forms will always be // quite small it's ok to use a O(n^2) algorithm here. // unsigned iFinal = 0; for ( unsigned i = 0; i < _cAltKey; i++ ) { // // Already marked duplicate? // if ( 0 == _aAltKey[i].Count() ) continue; iFinal = i; for ( unsigned j = i+1; j < _cAltKey; j++ ) { // // Remember, Pid is really the hash here. // if ( _aAltKey[i].Pid() == _aAltKey[j].Pid() && _aAltKey[i].Count() == _aAltKey[j].Count() && RtlEqualMemory( _aAltKey[i].GetBuf(), _aAltKey[j].GetBuf(), _aAltKey[j].Count() ) ) { ciDebugOut(( DEB_TRACE, "Duplicate keys: %u and %u\n", i, j )); _aAltKey[j].SetCount( 0 ); } } } // // Now transfer any remaining key(s). // SetWordBuffer(); unsigned hash; for ( i = 0; i <= iFinal; i++ ) { // // Ignore duplicates // if ( 0 == _aAltKey[i].Count() ) continue; // // Copy to the transfer buffer. // *_pcbOutBuf = _aAltKey[i].Count(); RtlCopyMemory( _pbOutBuf, _aAltKey[i].GetBuf(), *_pcbOutBuf ); hash = _aAltKey[i].Pid(); // // If this is not the final "PutWord" call, send the data along. // if ( i != iFinal ) _noiseList.PutAltWord( hash ); } // // Put the final word // _noiseList.PutWord( hash ); } //ProcessAllWords //+--------------------------------------------------------------------------- // // Member: CNormalizer::NormalizeWord // // Synopsis: Normalizes a UniCode string // Calculates the hash function for normalized string. // // Arguments: [pwcInBuf] -- input buffer // [cwc] -- count of chars in pwcInBuf // // Returns: unsigned hash value of string // // History: 03-May-95 SitaramR Created. 
// //---------------------------------------------------------------------------- unsigned CNormalizer::NormalizeWord( WCHAR const *pwcInBuf, ULONG cwc ) { return NormalizeWord( pwcInBuf, cwc, _pbOutBuf, _pcbOutBuf ); } //+--------------------------------------------------------------------------- // // Member: CNormalizer::NormalizeWord // // Synopsis: Normalizes a UniCode string // Calculates the hash function for normalized string. This // function is identical to the other NormalizeWord funtion, // except that it puts the outputs int he output parameters // // Arguments: [pwcInBuf] -- input buffer // [cwc] -- count of chars in pwcInBuf // [pbOutBuf] -- output buffer. // [pcbOutBuf] - pointer to output count of bytes. // // Returns: unsigned hash value of string // // History: 03-May-1995 SitaramR Created. // 03-Oct-2000 KitmanH Added output parameters // //---------------------------------------------------------------------------- unsigned CNormalizer::NormalizeWord( WCHAR const *pwcInBuf, ULONG cwc, BYTE *pbOutBuf, unsigned *pcbOutBuf ) { // count of bytes needs to take into account STRING_KEY *pcbOutBuf = cwc * sizeof(WCHAR) + cbKeyPrefix; // prefix with the string key identifier *pbOutBuf++ = STRING_KEY; unsigned hash = 0; Win4Assert ( cwc != 0 && cwc <= cwcMaxKey ); for ( unsigned i = 0; i < cwc; i++ ) { WCHAR c = *pwcInBuf++; // normalize the character to upcase. c = ( c < 'a' ) ? c : ( c <= 'z' ) ? ( c - ('a' - 'A') ) : RtlUpcaseUnicodeChar( c ); // // Store. Do it one byte at a time because the normalized string // must be byte compared. // *pbOutBuf++ = (BYTE)(c >> 8); *pbOutBuf++ = (BYTE)c; // hash hash = ( hash << 2 ) + c; } return hash; } //+--------------------------------------------------------------------------- // // Member: CNormalizer::NormalizeWstr - Public // // Synopsis: Normalizes a UniCode string // // Arguments: [pwcInBuf] -- input buffer // [cwcInBuf] -- count of chars in pwcInBuf // [pbOutBuf] -- output buffer. 
// [pcbOutBuf] - pointer to output count of bytes. // // History: 10-Feb-2000 KitmanH Created // //---------------------------------------------------------------------------- void CNormalizer::NormalizeWStr( WCHAR const *pwcInBuf, ULONG cwcInBuf, BYTE *pbOutBuf, unsigned *pcbOutBuf ) { NormalizeWord( pwcInBuf, cwcInBuf, pbOutBuf, pcbOutBuf ); } //+--------------------------------------------------------------------------- // // Member: CValueNormalizer::CValueNormalizer // // Synopsis: Constructor // // Arguments: [krep] -- key repository sink for keys // // History: 21-Sep-92 BartoszM Created. // //---------------------------------------------------------------------------- CValueNormalizer::CValueNormalizer( PKeyRepository& krep ) : _krep(krep) { _krep.GetBuffers( &_pcbOutBuf, &_pbOutBuf, &_pOcc ); _cbMaxOutBuf = *_pcbOutBuf; *_pOcc = 0; } //+--------------------------------------------------------------------------- // // Member: CValueNormalizer::PutValue, public // // Synopsis: Store a variant // // Arguments: [pid] -- property id // [occ] -- On input: starting occurrence. // On output: next starting occurrence. // [var] -- value // // History: 04-Nov-94 KyleP Created. 
// //---------------------------------------------------------------------------- void CValueNormalizer::PutValue( PROPID pid, OCCURRENCE & occ, CStorageVariant const & var ) { *_pOcc = occ; switch ( var.Type() ) { case VT_EMPTY: case VT_NULL: break; case VT_UI1: PutValue( pid, var.GetUI1() ); break; case VT_I1: PutValue( pid, var.GetI1() ); break; case VT_UI2: PutValue( pid, (USHORT) var.GetUI2() ); break; case VT_I2: PutValue( pid, var.GetI2() ); break; case VT_I4: case VT_INT: PutValue( pid, var.GetI4() ); break; case VT_R4: PutValue( pid, var.GetR4() ); break; case VT_R8: PutValue( pid, var.GetR8() ); break; case VT_UI4: case VT_UINT: PutValue( pid, var.GetUI4() ); break; case VT_I8: PutValue( pid, var.GetI8() ); break; case VT_UI8: PutValue( pid, var.GetUI8() ); break; case VT_BOOL: PutValue( pid, (BYTE) (FALSE != var.GetBOOL()) ); break; case VT_ERROR: PutValue( pid, var.GetERROR() ); break; case VT_CY: PutValue( pid, var.GetCY() ); break; case VT_DATE: PutDate( pid, var.GetDATE() ); break; case VT_FILETIME: PutValue( pid, var.GetFILETIME() ); break; case VT_CLSID: PutValue( pid, *var.GetCLSID() ); break; // NTRAID#DB-NTBUG9-84589-2000/07/31-dlee Indexing Service data type normalization doesn't handle VT_DECIMAL, VT_VECTOR, or VT_ARRAY. default: ciDebugOut(( DEB_IWARN, "Unhandled type %d (%x) sent to normalization\n", var.Type(), var.Type() )); break; } occ = *_pOcc; } //+--------------------------------------------------------------------------- // // Member: CValueNormalizer::PutValue private // // Synopsis: Store a unsigned 2 byte value without altering it // // Arguments: [pid] -- property id // [uValue] -- value // [bType] -- value type // // History: 07-Oct-93 DwightKr Created. // // Notes: This is the principal PutValue method that other PutValue()s // will call. Each of the OTHER PutValue()'s sole purpose is // to normalize their input data into a 2-byte unsigned value. 
// This version of PutValue() will store the value together // with its WID, PID, size, etc. in the CDataRepository object. // //---------------------------------------------------------------------------- void CValueNormalizer::PutValue( PROPID pid, unsigned uValue, BYTE bType ) { BYTE* pb = _pbOutBuf; // Store size of entry *_pcbOutBuf = sizeof(USHORT) + sizeof(PROPID) + 1; // Store key type *pb++ = bType; // store property id *pb++ = (BYTE)(pid >> 24); *pb++ = (BYTE)(pid >> 16); *pb++ = (BYTE)(pid >> 8); *pb++ = (BYTE) pid; // Store key Win4Assert( uValue < 0x10000 ); *pb++ = BYTE (uValue >> 8); *pb++ = BYTE (uValue); #if CIDBG == 1 for (unsigned i = 0; i < *_pcbOutBuf; i++ ) { ciDebugOut (( DEB_USER1 | DEB_NOCOMPNAME, "%02x ", _pbOutBuf[i] )); } ciDebugOut (( DEB_USER1 | DEB_NOCOMPNAME, "\n" )); #endif _krep.PutPropId(pid); _krep.PutKey(); (*_pOcc)++; } void CValueNormalizer::PutMinValue( PROPID pid, OCCURRENCE & occ, VARENUM Type ) { *_pOcc = occ; PutValue( pid, 0, Type ); occ = *_pOcc; } void CValueNormalizer::PutMaxValue( PROPID pid, OCCURRENCE & occ, VARENUM Type ) { *_pOcc = occ; PutValue( pid, 0xFFFF, Type ); occ = *_pOcc; } //+--------------------------------------------------------------------------- // // Member: CValueNormalizer::PutValue public // // Synopsis: Store a 1 byte value without altering it // // Arguments: [pid] -- property id // [byte] -- value // // History: 25-Oct-93 DwightKr Created. // // Notes: One byte values are NOT normalized, they are stored as is. // //---------------------------------------------------------------------------- void CValueNormalizer::PutValue( PROPID pid, BYTE byte ) { PutValue(pid, (unsigned) byte, VT_UI1); } //+--------------------------------------------------------------------------- // // Member: CValueNormalizer::PutValue public // // Synopsis: Store a 1 byte signed value without altering it // // Arguments: [pid] -- property id // [ch] -- value // // History: 25-Oct-1993 DwightKr Created. 
// 29-Sep-2000 KitmanH Normalize VT_I1 values // //---------------------------------------------------------------------------- void CValueNormalizer::PutValue( PROPID pid, CHAR ch ) { PutValue(pid, ( ((BYTE) ch) + 0x80 ) & 0xFF, VT_I1); } //+--------------------------------------------------------------------------- // // Member: CValueNormalizer::PutValue // // Synopsis: Store the high byte of an unsigned 2 byte value // // Arguments: [pid] -- property id // [usValue] -- value // // History: 07-Oct-93 DwightKr Created. // //---------------------------------------------------------------------------- void CValueNormalizer::PutValue( PROPID pid, USHORT usValue ) { PutValue(pid, (usValue >> 8) & 0xFF, VT_UI2); } //+--------------------------------------------------------------------------- // // Member: CValueNormalizer::PutValue public // // Synopsis: Store the high byte of a signed 2 byte value. // // Arguments: [pid] -- property id // [sValue] -- value // // Notes: Add the smallest BYTE to this so that we translate numbers // into the range above 0. i.e. -32768 maps into 0x00, and 32767 // maps into 0xFF. // // History: 07-Oct-93 DwightKr Created. // //---------------------------------------------------------------------------- void CValueNormalizer::PutValue( PROPID pid, SHORT sValue ) { PutValue(pid, ((sValue >> 8) + 0x80) & 0xFF, VT_I2); } //+--------------------------------------------------------------------------- // // Member: CValueNormalizer::PutValue public // // Synopsis: Store the base-2 log of the ULONG value. // // Arguments: [pid] -- property id // [ulValue] -- value // // Notes: This convert ULONGs into the range 0 - 31 by taking the Log2 // of the number. // // History: 07-Oct-93 DwightKr Created. 
// //---------------------------------------------------------------------------- void CValueNormalizer::PutValue( PROPID pid, ULONG ulValue ) { PutValue(pid, NormULong ( ulValue ), VT_UI4); } //+--------------------------------------------------------------------------- // // Member: CValueNormalizer::PutValue // // Synopsis: Store the base-2 log of the signed LONG value. // // Arguments: [pid] -- property id // [lValue] -- value // // Notes: This converts LONGs into numbers larger than 0. This // translates into 64 bins; 32 bins for #'s < 0 & 32 bins for // #'s >= 0. // // History: 07-Oct-93 DwightKr Created. // //---------------------------------------------------------------------------- void CValueNormalizer::PutValue( PROPID pid, LONG lValue ) { PutValue(pid, NormLong(lValue), VT_I4); } //+--------------------------------------------------------------------------- // // Member: CValueNormalizer::PutValue // // Synopsis: Store the base-10 log of the FLOAT value. // // Arguments: [pid] -- property id // [rValue] -- value // // Notes: floats fit into a total of 41 bins. // // History: 07-Oct-93 DwightKr Created. // //---------------------------------------------------------------------------- void CValueNormalizer::PutValue( PROPID pid, float rValue ) { PutValue(pid, NormDouble(rValue), VT_R4); } //+--------------------------------------------------------------------------- // // Member: CValueNormalizer::PutValue // // Synopsis: Store the base-10 log of the DOUBLE value. // // Arguments: [pid] -- property id // [dValue] -- value // // Notes: doubles fit into a total of 41 bins. // // History: 07-Oct-93 DwightKr Created. 
//
//----------------------------------------------------------------------------

void CValueNormalizer::PutValue( PROPID pid, double dValue )
{
    PutValue(pid, NormDouble(dValue), VT_R8);
}

//+---------------------------------------------------------------------------
//
//  Member:     CValueNormalizer::PutValue
//
//  Synopsis:   Store a large integer compressed to 16 bits by
//              NormLargeInteger
//
//  Arguments:  [pid] -- property id
//              [li]  -- value
//
//  History:    21-Sep-92   BartoszM    Created.
//              04-Feb-93   KyleP       Use LARGE_INTEGER
//              25-Oct-92   DwightKr    Copied here & removed extra code &
//                                      accounted for negative numbers
//
//----------------------------------------------------------------------------

void CValueNormalizer::PutValue( PROPID pid, LARGE_INTEGER liValue )
{
    // "uExponent" is a historical name; this is the NormLargeInteger bin
    unsigned uExponent = NormLargeInteger(liValue);

    PutValue( pid, uExponent, VT_I8);
}

//+---------------------------------------------------------------------------
//
//  Member:     CValueNormalizer::PutValue
//
//  Synopsis:   Store a compressed large integer
//
//  Arguments:  [pid] -- property id
//              [uli] -- value
//
//  History:    09 Feb 96   AlanW    Created.
//
//----------------------------------------------------------------------------

void CValueNormalizer::PutValue( PROPID pid, ULARGE_INTEGER uliValue )
{
    // "uExponent" is a historical name; this is the NormULargeInteger bin
    unsigned uExponent = NormULargeInteger(uliValue);

    PutValue( pid, uExponent, VT_UI8);
}

//+---------------------------------------------------------------------------
//
//  Member:     CValueNormalizer::PutValue
//
//  Synopsis:   Store the low 16 bits of the Data1 field of a GUID
//
//  Arguments:  [pid]   -- property id
//              [guid]  -- value
//
//  Notes:      The GUID generators are guaranteed to modify the TOP DWORD
//              of the GUID each time a new GUID is generated.
//              The lower bytes of the GUID is the network address of the
//              card which generated the UUID.
//
//              We would like to cluster together objects of a single
//              class (all MS-Word objects together for example).  Since it
//              is possible that someone could generate UUIDs for more than
//              one application on a single machine, the lower portion of
//              the UUID will perhaps remain constant between class IDs.  The
//              only part of the UUID which is guaranteed to be unique between
//              multiple objects is the field which represents time.  It is
//              unlikely that two classes were generated the same second on
//              two different machines.
//
//  History:    25-Oct-93   DwightKr    Created.
//
//----------------------------------------------------------------------------

void CValueNormalizer::PutValue( PROPID pid, GUID const & Guid )
{
    PutValue(pid, Guid.Data1 & 0xFFFF, VT_CLSID);
}

//
// Truncates a double toward zero and returns it as a long, without using
// floating point instructions.  Magnitudes below 1 (including zero and
// denormals) give 0.  Values whose exponent is above 30 - which includes
// infinities and NaNs, since their exponent field is all ones - saturate to
// LONG_MIN or LONG_MAX according to the sign bit.
//
long CastToLong( double d )
{
    //
    // bit 63 = sign
    // bits 52 - 62 = exponent
    // bits 0 - 51 = mantissa
    //

    LARGE_INTEGER * pli = (LARGE_INTEGER *)&d;

    // biased exponent field
    int exp = (pli->HighPart & 0x7ff00000) >> 20;

    if ( exp == 0 )
    {
        //
        // Special case: a zero exponent field means zero or a denormal;
        // both truncate to 0.
        //

        return( 0 );
    }

    //
    // Subtract off bias
    //

    exp -= 0x3ff;

    if ( exp < 0 )
    {
        // Magnitude is below 1.  Loss of precision

        return( 0 );
    }
    else if ( exp > 30 )
    {
        // Magnitude does not fit in a long.  Overflow, so saturate

        if ( pli->HighPart & 0x80000000 )
            return( LONG_MIN );
        else
            return( LONG_MAX );
    }
    else
    {
        //
        // We need to get the top 32 bits of the mantissa
        // into a dword: the top 12 bits of LowPart go at the bottom, the 20
        // mantissa bits of HighPart above them (the sign and exponent bits
        // are shifted out of the 32-bit result).
        //

        unsigned long temp = pli->LowPart >> (32 - 12);
        temp |= pli->HighPart << (32 - 20);

        //
        // Add the 'hidden' bit of the mantissa. (Since all doubles
        // are normalized to 1.?????? the highest 1 bit isn't stored)
        //

        temp = temp >> 1;
        temp |= 0x80000000;

        //
        // Throw away digits to the right of decimal.  exp is 0..30 here,
        // so the shift is 31..1.
        //

        temp = temp >> (31 - exp);

        //
        // Adjust for sign
        //

        Win4Assert( (temp & 0x80000000) == 0 );

        long temp2;

        if ( pli->HighPart & 0x80000000 )
            temp2 = temp * -1;
        else
            temp2 = temp;

        return( temp2 );
    }
} //CastToLong

//+---------------------------------------------------------------------------
//
//  Member:     CValueNormalizer::PutDate
//
//  Synopsis:   Dates are passed in as the number of days (and fractional days)
//              since Jan. 1, 1900.  We'll crunch this down to the number of
//              weeks.  Dates are passed in a doubles.  We'll assume that
//              negative numbers represent dates before Jan. 1, 1900.
//
//  Arguments:  [pid]  -- property id
//              [DATE] -- value (double)
//
//  Notes:      Since dates before Jan 1, 1900 are passed as negative numbers
//              we'll need to normalize them to something >= 0.
//
//              time period                     resolution        # bins
//              ===========================     ===============   ======
//              year < 10Bil BC              -- bin = 0               1
//              10Bil BC <= year <= 1 BC     -- log10 (year)         11
//              1 BC < year <= 1900          -- year               1902
//              1901 AD <= year <= 2050 AD   -- daily             54787
//              2051 AD <= year <= 10Bil AD  -- log10 (year)          8
//              year > 10Bil AD              -- bin = 0xFFFF          1
//
//              NOTE(review): the table above appears to describe the
//              original floating point design.  The current code bins the
//              distant ranges by the binary exponent of the day count
//              (one bin per power of two), the intermediate past by 365-day
//              steps and the near future by day; the authoritative limits
//              and bin counts are the constants at the top of PutDate.
//
//              I choose the daily range from 1901 - 2050 since there is a lot
//              of events in the 20th century (WW I, WW II, landing on the
//              moon, my wife's birthday, etc.) that are interesting, and
//              important.  It is likely that dates outside of this range will
//              be rounded to the nearest year (1492, 1776, 1812, 1867, etc).
//
//              Also by breaking the log10(year) at 1 BC rather than some other
//              date (such as 0000 AD, or 1 AD) we avoid values in the range
//              1 BC < year < 1 AD, calculating log10(year) resulting in
//              large negative numbers.  Everything in this range should be in
//              bin #12.  It also avoids taking log10(0).
//
//
//  History:    25-Oct-93   DwightKr    Created.
// 07-Dec-94 KyleP Remove use of floating point // //---------------------------------------------------------------------------- void CValueNormalizer::PutDate( PROPID pid, DATE const & Date ) { const int MinDate = 42; // 2^42 --> ~4.4E12 days --> ~12E9 years --> 12 billion B.C. const int MinByYear = 20; // 2^20 --> ~1.0E6 days --> ~2.9E3 years --> 970 B.C. const int cMinByYear = (1 << MinByYear) / 365 + 1; // 2873 const int MaxDaily = (2051 - 1900) * 365; // 55115 const int MinByYearAD = 15; // 2^15 --> ~32768 days --> ... const int MaxDate = 42; // 2^42 --> ~4.4E12 days --> ~12E9 years --> 12 billion A.D. const unsigned FirstBC = 0; const unsigned FirstLogBC = FirstBC + 1; const unsigned LastLogBC = FirstLogBC + MinDate - MinByYear; const unsigned FirstYearBC = LastLogBC + 1; const unsigned LastYearBC = FirstYearBC + cMinByYear; const unsigned FirstDaily = LastYearBC + 1; const unsigned LastDaily = FirstDaily + MaxDaily; const unsigned FirstLogAD = LastDaily + 1; const unsigned LastLogAD = FirstLogAD + MaxDate - MinByYearAD; const unsigned LastAD = 0xFFFF; Win4Assert( LastLogAD < 0xFFFF ); unsigned bin; BOOL fPositive; int exp = GetExpAndSign( Date, fPositive ); if ( !fPositive ) { // // Very large negative dates go in first bin // if ( exp >= MinDate ) bin = FirstBC; // // Medium size negative dates get 1 bin / power of 2 // else if ( exp >= MinByYear ) bin = FirstLogBC - exp + MinByYear; // // All other dates before 1900 get 1 bucket per 365 days. 
// else { long cYears = CastToLong( Date ) / 365; Win4Assert( cYears >= -cMinByYear && cYears <= 0 ); bin = FirstYearBC + cYears + cMinByYear; } } else { // // Very large positive dates go in last bin // if ( exp >= MaxDate ) bin = LastAD; else { long cDays = CastToLong( Date ); // // Dates rather far in the future get 1 bucket / power of 2 // if ( cDays >= MaxDaily ) bin = FirstLogAD + exp - MinByYearAD; // // Days close to today get 1 bucket per day // else bin = FirstDaily + cDays; } } PutValue(pid, bin, VT_DATE); } //PutDate //+--------------------------------------------------------------------------- // // Member: CValueNormalizer::PutValue // // Synopsis: Store the hashed value of an 8-byte currency. // // Arguments: [pid] -- property id // [cyValue] -- value // // Notes: Currency values are stored as a ULONG cents, and a LONG $. // We'll ignore the cents portion and store the $ part using // the standard LONG storage method. // // History: 26-Oct-93 DwightKr Created. // //---------------------------------------------------------------------------- void CValueNormalizer::PutValue( PROPID pid, CURRENCY const & cyValue) { PutValue(pid, NormLong(cyValue.Hi), VT_CY); } //+--------------------------------------------------------------------------- // // Member: CValueNormalizer::PutValue // // Synopsis: Store the number of days since Jan 1, 1980; // // Arguments: [pid] -- property id // [ulValue] -- value // // History: 07-Oct-93 DwightKr Created. // // Notes: This algorithym calculates the number of days since Jan 1, // 1980; and stores it into a unsigned. 
FileTimes are divided // into the following ranges: // // FileTime < 1980 => bin 0 // 1980 <= FileTime <= 1993 week granularity => bins 1 - 729 // 1994 <= FileTime <= 2160 day granularity => bins 730+ // FileTime > 2160 => bin 0xFFFF // //---------------------------------------------------------------------------- void CValueNormalizer::PutValue( PROPID pid, FILETIME const & ftValue ) { // // Determine the number of days since Jan 1, 1601 by dividing by // the number of 100 nanosecond intervals in a day. The result // will fit into a ULONG. // // Then map the result into one of the ranges: before 1980, between // 1980 and 1994, between 1994 and 2160, and after 2160. To make // the computation easier, we use precomputed values of the number // of days from 1601 and the breakpoints of our range. // // 100s of nanosecs per day const ULONGLONG uliTicsPerDay = 24 * 60 * 60 * (ULONGLONG)10000000; const ULONG ulStart = 138426; // number of days from 1601 to 1980 const ULONG ulMiddle= 143542; // number of days from 1601 to 1/2/1994 const ULONG ulEnd = 204535; // number of days from 1601 to 2161 ULARGE_INTEGER liValue = {ftValue.dwLowDateTime, ftValue.dwHighDateTime}; ULONG ulDays = (ULONG) (liValue.QuadPart / uliTicsPerDay); // // We now have the number of days since Jan. 01, 1601 in ulDays. // Map into buckets. // if (ulDays < ulStart) // Store in bin 0 { PutValue(pid, 0, VT_FILETIME); } else if (ulDays <= ulMiddle) // Store week granularity { PutValue(pid, (ulDays + 1 - ulStart) / 7, VT_FILETIME); } else if (ulDays <= ulEnd) // Store day granularity { // // Bins 0 - 730 are used by the two clauses above. It doesn't // really matter if we reuse bin 730 for the start of the next // range (this might happen because of the division we do). // PutValue(pid, (ulDays + 1 - ulMiddle) + ((ulMiddle - ulStart) / 7), VT_FILETIME); } else // FileTime > 2160 { PutValue(pid, 0xFFFF, VT_FILETIME); } }