windows-server-2003/inetsrv/query/restrict/norm.cxx

//+---------------------------------------------------------------------------
//
//  Microsoft Windows
//  Copyright (C) Microsoft Corporation, 1991 - 2000.
//
//  File:       NORM.CXX
//
//  Contents:   Normalizer
//
//  Classes:    CNormalizer
//
//  History:    28-May-91   t-WadeR     added CNormalizer
//              31-Jan-92   BartoszM    Created from lang.cxx
//              07-Oct-93   DwightKr    Added new methods to normalize
//                                      different data types
//
//  Notes:      The filtering pipeline is hidden in the Data Repository
//              object which serves as a sink for the filter.
//              The sink for the Data Repository is the Key Repository.
//              The language dependent part of the pipeline
//              is obtained from the Language List object and is called
//              Key Maker. It consists of:
//
//                  Word Breaker
//                  Stemmer (optional)
//                  Normalizer
//                  Noise List
//
//              Each object serves as a sink for its predecessor,
//              Key Repository is the final sink.
//
//----------------------------------------------------------------------------

#include <pch.cxx>
#pragma hdrstop

#include <plang.hxx>
#include <misc.hxx>
#include <norm.hxx>

//+---------------------------------------------------------------------------
//
//  Function    GetExpAndSign
//
//  Synopsis:   Finds the exponent and sign of a number
//
//  Arguments:  [d]          -- the input number to examine
//              [fPositive]  -- returns TRUE if positive, FALSE if negative
//
//  Returns:    The exponent
//
//  History:    21-Nov-94   KyleP        Created.
//
//----------------------------------------------------------------------------

int GetExpAndSign( double d, BOOL & fPositive )
{
    //
    // bit 63       = sign
    // bits 52 - 62 = exponent
    // bits 0 - 51  = mantissa
    //

    Win4Assert( sizeof(LARGE_INTEGER) == sizeof(double) );

    LARGE_INTEGER * pli = (LARGE_INTEGER *)&d;

    fPositive = (pli->HighPart & 0x80000000) == 0;

    int const bias = 0x3ff;

    return ( ( pli->HighPart & 0x7ff00000 ) >> 20 ) - bias;
} //GetExpAndSign

//+---------------------------------------------------------------------------
//
//  Function    NormDouble
//
//  Synopsis:   Normalizes doubles by taking log2 of the number
//
//  Notes:      This func converts doubles into one of 5 different categories
//
//              x < -1x2**32                 is in bin 0
//              -1x2**32  <= x <= -1x2**-32  are in bins 1 to 65
//              -1x2**-32 <= x <=  1x2**-32  is in bin 66
//               1x2**-32 <= x <=  1x2**32   are in bins 67 to 131
//               x > 1x2**32                 is in bin 132
//
//  History:    21-Nov-94   KyleP        Created.
//
//----------------------------------------------------------------------------

static unsigned NormDouble(double dValue)
{
    const int SignificantExponent = 32;
    const int SignificantRange = SignificantExponent * 2;

    const unsigned LowestBin  = 0;                                // 0
    const unsigned LowerBin   = LowestBin + 1;                    // 1
    const unsigned MiddleBin  = LowerBin + SignificantRange + 1;  // 66
    const unsigned UpperBin   = MiddleBin + 1;                    // 67
    const unsigned HighestBin = UpperBin+ SignificantRange + 1;   // 132


    BOOL fPositive;

    int exp = GetExpAndSign( dValue, fPositive );

    unsigned bin;

    if ( exp < -SignificantExponent )
    {
        //
        // All numbers close to zero in middle bin
        //

        bin = MiddleBin;
    }
    else if ( exp > SignificantExponent )
    {
        if ( fPositive )
        {
            //
            // Very large positive numbers in top bin
            //

            bin = HighestBin;
        }
        else
        {
            //
            // Very large negative numbers in bottom bin
            //

            bin = LowestBin;
        }
    }
    else
    {
        if ( fPositive )
        {
            //
            // medium size positive numbers
            //

            bin = UpperBin + exp + SignificantExponent;
        }
        else
        {
            //
            // medium size negative numbers
            //

            bin = LowerBin - exp + SignificantExponent;
        }
    }
    return bin;
}

#ifdef  TEST_NORM
//
// Sanity test for NormDouble.  Walks geometric sequences of floats away
// from +1 and -1 and prints every value whose bin breaks the expected
// ordering.  Silence (apart from the three reference lines) means success.
//
void TestNormDouble()
{
    float const fZero = 0.;
    float const fOne = 1.;
    unsigned const binZero = NormDouble( fZero );
    unsigned const binOne = NormDouble( fOne );

    printf(" Value:Bin %f : 0x%4X (%d)\n", fZero, binZero, binZero );
    printf(" Value:Bin %f : 0x%4X (%d)\n", fOne, binOne, binOne );

    BOOL fPos;
    float f;
    unsigned binPrev;

    // 1 -> 0 from above: bins must not increase and must stay within
    // [bin(0), bin(1)].
    binPrev = binOne;
    for ( f = fOne; f > fZero; f = f/3 )
    {
        unsigned const bin = NormDouble( f );
        if ( bin > binPrev || bin < binZero || bin > binOne )
        {
            printf(" Value:Bin %f : 0x%4X (%d)\tExp %d\n", f, bin, bin, GetExpAndSign(f, fPos) );
        }

        binPrev = bin;
    }

    // 1 -> 1e+32: bins must not decrease.
    binPrev = binOne;
    for ( f = fOne; f < 1e+32; f = f * (float)1.5 )
    {
        unsigned const bin = NormDouble( f );
        if ( bin < binPrev )
            printf(" Value:Bin %f : 0x%4X (%d)\n", f, bin, bin );

        binPrev = bin;
    }

    float const fMinusOne = -1.;
    unsigned const binMinusOne = NormDouble( fMinusOne );

    printf(" Value:Bin %f : 0x%4X (%d)\n", fMinusOne, binMinusOne, binMinusOne );

    // -1 -> 0 from below: bins must not decrease and must stay within
    // [bin(-1), bin(0)].
    binPrev = binMinusOne;
    for ( f = fMinusOne; f < fZero; f = f/3 )
    {
        unsigned const bin = NormDouble( f );
        if ( bin < binPrev || bin > binZero || bin < binMinusOne )
            printf(" Value:Bin %f : 0x%4X (%d)\tExp %d\n", f, bin, bin, GetExpAndSign(f, fPos) );

        binPrev = bin;
    }

    // -1 -> -1e+32: bins must not increase.
    binPrev = binMinusOne;
    for ( f = fMinusOne; f > -1e+32; f = f * (float)1.5 )
    {
        unsigned const bin = NormDouble( f );
        if ( bin > binPrev )
            printf(" Value:Bin %f : 0x%4X (%d)\n", f, bin, bin );

        binPrev = bin;
    }
}
#endif  // TEST_NORM


// ------------------------------------------------------------------------
// | Upper Limit  |  Divisor (2^x) | # of Bins        | (in hex)          |
// ------------------------------------------------------------------------
// | 2^10 - 1     |   2^0          | 2^10  - 0        | 0400 - 0000       |
// | 2^16 - 1     |   2^3          | 2^12  - 2^7      | 2000 - 0080       |
// | 2^20 - 1     |   2^6          | 2^14  - 2^10     | 4000 - 0400       |
// | 2^26 - 1     |   2^13         | 2^13  - 2^7      | 2000 - 0080       |
// | 2^30 - 1     |   2^23         | 2^7   - 2^3      | 0080 - 0008       |
// | 2^31 - 1     |   2^25         | 2^6   - 2^5      | 0040 - 0020       |
// ------------------------------------------------------------------------
// | Total        |                |                  | 84C0 - 0528       |
// |              |                |                  |     7F98          |
// ------------------------------------------------------------------------

//
// Each range N is described by:
//   limitN - exclusive upper bound of the values in the range
//   shiftN - values in the range are divided by 2^shiftN to form bins
//   cSkipN - limitN expressed at the granularity of range N+1, i.e. the
//            number of range-(N+1)-sized bins lying below range N+1
//   cbinsN - number of bins the range contributes
//

// Range 1: [0, 2^10), one value per bin.
const long limit1 = 1L << 10;
const long shift1 = 0;
const long cbins1 = limit1 >> shift1;

// Range 2: [2^10, 2^16), 2^3 values per bin.
const long limit2 = 1L << 16;
const long shift2 = 3;
const long cSkip1 = limit1 >> shift2;
const long cbins2 = (limit2 >> shift2) - cSkip1;

// Range 3: [2^16, 2^20), 2^6 values per bin.
const long limit3 = 1L << 20;
const long shift3 = 6;
const long cSkip2 = limit2 >> shift3;
const long cbins3 = (limit3 >> shift3) - cSkip2;

// Range 4: [2^20, 2^26), 2^13 values per bin.
const long limit4 = 1L << 26;
const long shift4 = 13;
const long cSkip3 = limit3 >> shift4;
const long cbins4 = (limit4 >> shift4) - cSkip3;

// Range 5: [2^26, 2^30), 2^23 values per bin.
const long limit5 = 1L << 30;
const long shift5 = 23;
const long cSkip4 = limit4 >> shift5;
const long cbins5 = (limit5 >> shift5) - cSkip4;

// Range 6: [2^30, 2^31], 2^25 values per bin.  The limit is shifted as an
// unsigned quantity when the bin count is computed.
const long limit6 = MINLONG;     // 2^31
const long shift6 = 25;
const long cSkip5 = limit5 >> shift6;
const long cbins6 = ((long) ((unsigned) limit6 >> shift6)) - cSkip5;

static unsigned MapLong( LONG lValue )
{
    //
    // Maps a non-negative LONG onto a bin number.  The single value with
    // the high bit set that is tolerated is MINLONG itself.
    //

    Win4Assert( !(lValue & MINLONG) || ( MINLONG == lValue ) );

#if CIDBG==1
    // All ranges together must fit in 15 bits.
    const long cTotal = cbins1 + cbins2 + cbins3 + cbins4 + cbins5 + cbins6;
    Win4Assert( cTotal <= MINSHORT );
#endif  // CIDBG == 1

    unsigned const uRaw = (unsigned) lValue;

    //
    // Find the range the value falls in.  Within a range the bin is
    //   (bins used by all lower ranges) - (skip count) + (value >> shift)
    // so that bin numbers continue without gaps across range boundaries.
    //

    if ( uRaw < limit1 )
        return uRaw;        // smallest values map to themselves

    if ( uRaw < limit2 )
        return cbins1 - cSkip1 + (uRaw >> shift2);

    if ( uRaw < limit3 )
        return cbins1 + cbins2 - cSkip2 + (uRaw >> shift3);

    if ( uRaw < limit4 )
        return cbins1 + cbins2 + cbins3 - cSkip3 + (uRaw >> shift4);

    if ( uRaw < limit5 )
        return cbins1 + cbins2 + cbins3 + cbins4 - cSkip4 + (uRaw >> shift5);

    return cbins1 + cbins2 + cbins3 + cbins4 + cbins5 - cSkip5 + (uRaw >> shift6);
}

//+---------------------------------------------------------------------------
//
//  Function:   NormLong
//
//  Synopsis:   Normalizes the given "signed" long value to a value between
//              0x0000 - 0xFFFF. The negative numbers occupy 0x0000-0x8000.
//              Positive numbers occupy 0x8000-0xFFFF
//
//  Arguments:  [lValue] - The value to be normalized.
//
//  History:    10-03-95   srikants   Created
//
//  Notes:
//
//----------------------------------------------------------------------------

static unsigned NormLong(LONG lValue)
{
    //
    // Non-negative values are placed at and above the midpoint (MINSHORT).
    //

    if ( lValue >= 0 )
        return MapLong( lValue ) + MINSHORT;

    //
    // Negative values are mirrored below the midpoint using the bin of
    // their magnitude.  The negation is done in unsigned arithmetic because
    // -lValue overflows (undefined behaviour) when lValue is the most
    // negative LONG.  For that input the magnitude converts back to
    // MINLONG, which MapLong explicitly accepts.
    //

    ULONG const magnitude = 0 - (ULONG) lValue;

    return MINSHORT - MapLong( (LONG) magnitude );
}

//+---------------------------------------------------------------------------
//
//  Function:   NormULong
//
//  Synopsis:   Normalizes an "unsigned" long value to a value between
//              0x0000-0xFFFF.  Numbers from 0-2^31 - 1 are mapped in the
//              range 0x0000-0x7FFF.  Numbers 2^31 to 2^32 - 1 are mapped
//              in the range 0x8000 - 0xFFFF
//
//  Arguments:  [lValue] -  The value to be mapped.
//
//  History:    10-03-95   srikants   Created
//
//  Notes:
//
//----------------------------------------------------------------------------

static unsigned NormULong( ULONG lValue )
{
    // Remember the top bit, then bin the remaining 31 bits.
    ULONG const topBit = lValue & MINLONG;
    unsigned const lowBin = MapLong( lValue & ~MINLONG );

    // MapLong must leave the top bit of the 16-bit result free.
    Win4Assert( !(lowBin & MINSHORT) );

    // Values with the top bit set land in the upper half of the range.
    return topBit ? ( lowBin | MINSHORT ) : lowBin;
}

//+---------------------------------------------------------------------------
//
//  Function:   MapLargeInteger
//
//  Synopsis:   Maps a LargeInteger to a number between 0x0000-0x7FFF.
//
//              Numbers with the "HighPart" = 0 are mapped in the range
//              0x0000-0x3FFF.  When the HighPart !=0, the values are
//              mapped to 0x4000 - 0x7FFF
//
//  Arguments:  [liValue] - The value to be mapped.
//
//  History:    10-03-95   srikants   Created
//
//  Notes:
//
//----------------------------------------------------------------------------

static unsigned MapLargeInteger( LARGE_INTEGER & liValue )
{
    // The value must be non-negative; the only high part allowed to carry
    // the sign bit is MINLONG itself.
    Win4Assert( !(liValue.HighPart & MINLONG) || ( MINLONG == liValue.HighPart ) );

    unsigned bin;

    if ( 0 != liValue.HighPart )
    {
        // Bin on the high part alone, halve the resolution, and move the
        // result into the upper half of the output range.
        bin = ( MapLong( liValue.HighPart ) >> 1 ) | 0x4000;
    }
    else
    {
        // Only the low part is significant: quarter the 16-bit bin so it
        // fits in the lower half of the output range.
        bin = NormULong( liValue.LowPart ) >> 2;
    }

    Win4Assert( bin < 0x8000 );

    return bin;
}

//+---------------------------------------------------------------------------
//
//  Function:   NormULargeInteger
//
//  Synopsis:   Normalizes an unsigned LargeInteger to a number between
//              0x0000-0xFFFF.
//
//              Numbers with the "HighPart" = 0 are mapped in the range
//              0x0000-0x7FFF.  When the HighPart !=0, the values are
//              mapped to 0x8000 - 0xFFFF.
//
//  Arguments:  [uliValue] - The value to be mapped.
//
//  History:    02-09-96   Alanw   Created
//
//  Notes:
//
//----------------------------------------------------------------------------

static unsigned NormULargeInteger( ULARGE_INTEGER & uliValue )
{
    unsigned normVal;

    if ( 0 == uliValue.HighPart )
    {
        //
        // Only the low part is significant.  NormULong yields a 16-bit
        // bin; halve it so it fits in 0x0000-0x7FFF.
        //

        normVal = NormULong( uliValue.LowPart ) >> 1;
    }
    else
    {
        //
        // Bin on the high part.  NormULong returns a full 16-bit value
        // (its top bit is set whenever HighPart >= 2^31), so it must be
        // halved to 0x0000-0x7FFF before the range marker is OR-ed in.
        // Without the shift, HighPart values with the top bit set collapse
        // onto the same bins as small HighPart values and bin order no
        // longer follows numeric order.
        //
        // NOTE(review): this changes the key produced for values >= 2^32
        // relative to the previous code -- confirm whether previously
        // persisted keys for such values need to be regenerated.
        //

        normVal = NormULong( uliValue.HighPart ) >> 1;   // 0x0000-0x7FFF
        normVal |= 0x8000;
    }

    Win4Assert( normVal < 0x10000 );

    return normVal;
}


//+---------------------------------------------------------------------------
//
//  Function:   NormLargeInteger
//
//  Synopsis:   Normalizes a large integer to a value between 0x0000-0xFFFF.
//
//              -ve Numbers are mapped in the range 0x0000-0x8000.
//              +ve numbers are mapped in the range 0x8000-0xFFFF.
//
//  Arguments:  [liValue] -  The value to be normalized. Note that the
//              argument is NOT passed by reference. The value is changed
//              in this method and so should not be passed by reference.
//
//  History:    10-03-95   srikants   Created
//
//  Notes:
//
//----------------------------------------------------------------------------

static unsigned NormLargeInteger( LARGE_INTEGER liValue )
{
    unsigned normVal;

    if ( liValue.QuadPart < 0 )
    {
        //
        // Replace the (by-value) argument with its magnitude and mirror the
        // resulting bin below the midpoint.  The negation is done in
        // unsigned arithmetic because -QuadPart overflows (undefined
        // behaviour) for the most negative 64-bit value.  For that input
        // the result keeps HighPart == MINLONG, which MapLargeInteger
        // explicitly accepts.
        //

        liValue.QuadPart = (LONGLONG) ( 0 - (ULONGLONG) liValue.QuadPart );
        normVal = MINSHORT - MapLargeInteger( liValue );
    }
    else
    {
        // Non-negative values sit at and above the midpoint.
        normVal = MINSHORT + MapLargeInteger( liValue );
    }

    Win4Assert( normVal < 0x10000 );

    return normVal;
}

#ifdef  TEST_NORM
//
// Sanity test for NormLong.  For every power of two P that fits in a
// positive long, asserts that the bin of P is exactly one past the bin of
// the largest value checked in the previous step (P - 1), and the mirror
// image of that for negative numbers.
//
void TestNormLong()
{
    long lPow = 0;
    unsigned binPow = NormLong( lPow );

    printf(" Value:Bin 0x%8X : 0x%4X  \t(%10d : %10d)\n", lPow, binPow, lPow, binPow );

    long lBelow = 0;
    unsigned binBelow = NormLong(1);

    // Positive side: bin(P) must equal bin(P - 1) + 1.
    for ( lPow = 2; !(lPow & 0x80000000); lPow <<= 1 )
    {
        binPow = NormLong( lPow );

        Win4Assert( binPow == binBelow+1 );

        // Largest value below the next power of two.
        lBelow = lPow + lPow-1;
        binBelow = NormLong( lBelow );
    }

    binBelow = NormLong(-1);
    printf(" Value:Bin 0x%8X : 0x%4X  \t(%10d : %10d)\n", -1, binBelow, -1, binBelow );

    // Negative side: bin(-P) must equal bin(-(P - 1)) - 1.
    for ( lPow = 2; !(lPow & 0x80000000); lPow <<= 1 )
    {
        binPow = NormLong( -lPow );

        Win4Assert( binPow == binBelow-1 );

        lBelow = lPow + lPow-1;
        lBelow = -lBelow;

        binBelow = NormLong( lBelow );
    }
}
#endif  // TEST_NORM

//+---------------------------------------------------------------------------
//
//  Member:     CNormalizer::CNormalizer
//
//  Synopsis:   constructor for normalizer
//
//  Effects:    gets buffers from noiselist
//
//  Arguments:  [nl] -- Noise list object to pass data on to.
//
//  History:    05-June-91   t-WadeR     Created.
//
//  Notes:
//
//----------------------------------------------------------------------------
CNormalizer::CNormalizer( PNoiseList& nl )
    : _noiseList(nl)
{
    // Select the regular word buffer as the current output target.
    SetWordBuffer();

    // The output buffer must be able to hold the key prefix plus a
    // maximum-length key of wide characters.
    Win4Assert( *_pcbOutBuf >= cbKeyPrefix + cwcMaxKey * sizeof( WCHAR ) );
}


//+---------------------------------------------------------------------------
//
//  Member:     CNormalizer::GetFlags
//
//  Synopsis:   Returns address of ranking and range flags
//
//  Arguments:  [ppRange] -- range flag
//              [ppRank] -- rank flag
//
//  History:    11-Feb-92   BartoszM     Created.
//
//----------------------------------------------------------------------------
void CNormalizer::GetFlags ( BOOL** ppRange, CI_RANK** ppRank )
{
    // The normalizer keeps no flags of its own; the request is forwarded
    // unchanged to the noise list.
    _noiseList.GetFlags( ppRange, ppRank );
}


//+---------------------------------------------------------------------------
//
//  Member:     CNormalizer::ProcessAltWord, public
//
//  Synopsis:   Normalizes a UniCode string, passes it to NoiseList.
//
//  Effects:    Deposits a normalized version [pwcInBuf] in [_pbOutBuf]
//
//  Arguments:  [pwcInBuf] -- input buffer
//              [cwc] -- count of chars in pwcInBuf
//
//  History:    03-May-95     SitaramR     Created.
//
//----------------------------------------------------------------------------

void CNormalizer::ProcessAltWord( WCHAR const *pwcInBuf,  ULONG cwc )
{
    // Switch to the next alternate buffer before normalizing, so the
    // normalized key is written there.
    SetNextAltBuffer();

    // Normalize, then record the resulting hash for this alternate word.
    SetAltHash( NormalizeWord( pwcInBuf, cwc ) );
}

//+---------------------------------------------------------------------------
//
//  Member:     CNormalizer::ProcessWord, public
//
//  Synopsis:   Normalizes a UniCode string, passes it to NoiseList.
//
//  Effects:    Deposits a normalized version of [pwcInBuf] in [_pbOutBuf].
//
//  Arguments:  [pwcInBuf] -- input buffer
//              [cwc] -- count of chars in pwcInBuf
//
//  History:    05-June-91  t-WadeR     Created.
//              13-Oct-92   AmyA        Added unicode support
//
//----------------------------------------------------------------------------

void CNormalizer::ProcessWord( WCHAR const *pwcInBuf,  ULONG cwc )
{
    // When alternate words are pending, this word joins them in the next
    // alternate buffer.
    if ( UsingAltBuffers() )
    {
        SetNextAltBuffer();
    }

    unsigned const hash = NormalizeWord( pwcInBuf, cwc );

    // Simple case: no alternates, hand the word straight to the noise list.
    if ( !UsingAltBuffers() )
    {
        _noiseList.PutWord( hash );
        return;
    }

    // Otherwise record the hash and emit the whole group of alternates.
    SetAltHash( hash );
    ProcessAllWords();
}

//+---------------------------------------------------------------------------
//
//  Member:     CNormalizer::ProcessAllWords, private
//
//  Synopsis:   Removes duplicate alternate words and emits remainder.
//
//  History:    17-Sep-1999    KyleP     Created.
//
//----------------------------------------------------------------------------

void CNormalizer::ProcessAllWords()
{
    //
    // Check for duplicate keys.  Since the number of alternate forms will always be
    // quite small it's ok to use a O(n^2) algorithm here.
    //
    // A key is marked as a duplicate by setting its count to zero.  iFinal
    // tracks the index of the last key that survives.
    //

    unsigned iFinal = 0;

    // Declared outside the loops: both loops below use it, and a variable
    // declared in a for-init statement is not visible after that loop under
    // standard scoping rules.
    unsigned i;

    for ( i = 0; i < _cAltKey; i++ )
    {
        //
        // Already marked duplicate?
        //

        if ( 0 == _aAltKey[i].Count() )
            continue;

        iFinal = i;

        for ( unsigned j = i+1; j < _cAltKey; j++ )
        {
            //
            // Remember, Pid is really the hash here.  Compare the cheap
            // fields first, the key bytes last.
            //

            if ( _aAltKey[i].Pid() == _aAltKey[j].Pid() &&
                 _aAltKey[i].Count() == _aAltKey[j].Count() &&
                 RtlEqualMemory( _aAltKey[i].GetBuf(), _aAltKey[j].GetBuf(), _aAltKey[j].Count() ) )
            {
                ciDebugOut(( DEB_TRACE, "Duplicate keys: %u and %u\n", i, j ));
                _aAltKey[j].SetCount( 0 );
            }
        }
    }

    //
    // Now transfer any remaining key(s).
    //

    SetWordBuffer();

    // Initialized so the final PutWord never reads an indeterminate value,
    // even if no key survived the loop below.
    unsigned hash = 0;

    for ( i = 0; i <= iFinal; i++ )
    {
        //
        // Ignore duplicates
        //

        if ( 0 == _aAltKey[i].Count() )
            continue;

        //
        // Copy to the transfer buffer.
        //

        *_pcbOutBuf = _aAltKey[i].Count();
        RtlCopyMemory( _pbOutBuf, _aAltKey[i].GetBuf(), *_pcbOutBuf );
        hash = _aAltKey[i].Pid();

        //
        // If this is not the final "PutWord" call, send the data along.
        //

        if ( i != iFinal )
            _noiseList.PutAltWord( hash );
    }

    //
    // Put the final word.  Its key was left in the transfer buffer by the
    // last iteration of the loop above.
    //

    _noiseList.PutWord( hash );
} //ProcessAllWords

//+---------------------------------------------------------------------------
//
//  Member:     CNormalizer::NormalizeWord
//
//  Synopsis:   Normalizes a UniCode string
//              Calculates the hash function for normalized string.
//
//  Arguments:  [pwcInBuf] -- input buffer
//              [cwc] -- count of chars in pwcInBuf
//
//  Returns:    unsigned hash value of string
//
//  History:    03-May-95    SitaramR     Created.
//
//----------------------------------------------------------------------------

unsigned CNormalizer::NormalizeWord( WCHAR const *pwcInBuf, ULONG cwc )
{
    // Normalize into whichever output buffer is currently selected.
    unsigned const hash = NormalizeWord( pwcInBuf, cwc, _pbOutBuf, _pcbOutBuf );

    return hash;
}

//+---------------------------------------------------------------------------
//
//  Member:     CNormalizer::NormalizeWord
//
//  Synopsis:   Normalizes a UniCode string
//              Calculates the hash function for normalized string. This
//              function is identical to the other NormalizeWord function,
//              except that it puts the outputs in the output parameters.
//
//  Arguments:  [pwcInBuf] -- input buffer
//              [cwc] -- count of chars in pwcInBuf
//              [pbOutBuf] -- output buffer.
//              [pcbOutBuf] - pointer to output count of bytes.
//
//  Returns:    unsigned hash value of string
//
//  History:    03-May-1995    SitaramR     Created.
//              03-Oct-2000    KitmanH      Added output parameters              
//
//----------------------------------------------------------------------------

unsigned CNormalizer::NormalizeWord( WCHAR const *pwcInBuf, 
                                     ULONG cwc, 
                                     BYTE *pbOutBuf, 
                                     unsigned *pcbOutBuf )
{
    // The key is the one-byte prefix followed by two bytes per character.
    *pcbOutBuf = cwc * sizeof(WCHAR) + cbKeyPrefix;

    // The prefix marks this as a string key.
    *pbOutBuf++ = STRING_KEY;

    unsigned hash = 0;

    Win4Assert ( cwc != 0 && cwc <= cwcMaxKey );

    WCHAR const * const pwcEnd = pwcInBuf + cwc;

    for ( WCHAR const * pwc = pwcInBuf; pwc != pwcEnd; ++pwc )
    {
        WCHAR wc = *pwc;

        //
        // Upcase.  Anything below 'a' is left alone, 'a'..'z' is handled
        // inline, and everything above 'z' goes through the runtime.
        //

        if ( wc >= 'a' )
        {
            if ( wc <= 'z' )
                wc = (WCHAR) ( wc - ('a' - 'A') );
            else
                wc = RtlUpcaseUnicodeChar( wc );
        }

        //
        // Emit the high byte first.  The normalized string is compared
        // byte by byte, so it is written one byte at a time.
        //

        pbOutBuf[0] = (BYTE)(wc >> 8);
        pbOutBuf[1] = (BYTE)wc;
        pbOutBuf += 2;

        // Fold the upcased character into the running hash.
        hash = ( hash << 2 ) + wc;
    }

    return hash;
}

//+---------------------------------------------------------------------------
//
//  Member:     CNormalizer::NormalizeWStr - Public
//
//  Synopsis:   Normalizes a UniCode string
//
//  Arguments:  [pwcInBuf] -- input buffer
//              [cwcInBuf] -- count of chars in pwcInBuf
//              [pbOutBuf] -- output buffer.
//              [pcbOutBuf] - pointer to output count of bytes.
//
//  History:    10-Feb-2000     KitmanH    Created
//
//----------------------------------------------------------------------------

void CNormalizer::NormalizeWStr( WCHAR const *pwcInBuf, 
                                 ULONG cwcInBuf,
                                 BYTE *pbOutBuf, 
                                 unsigned *pcbOutBuf )
{
    // Same work as the four-argument NormalizeWord; the hash it returns
    // is not needed here and is dropped.
    NormalizeWord( pwcInBuf, cwcInBuf, pbOutBuf, pcbOutBuf );
}

//+---------------------------------------------------------------------------
//
//  Member:     CValueNormalizer::CValueNormalizer
//
//  Synopsis:   Constructor
//
//  Arguments:  [krep] -- key repository sink for keys
//
//  History:    21-Sep-92   BartoszM     Created.
//
//----------------------------------------------------------------------------

CValueNormalizer::CValueNormalizer( PKeyRepository& krep )
  : _krep(krep)
{
    // Fetch the key repository's size, buffer and occurrence pointers.
    _krep.GetBuffers( &_pcbOutBuf, &_pbOutBuf, &_pOcc );

    // Occurrences start from zero.
    *_pOcc = 0;

    // Remember the count reported by GetBuffers; PutValue overwrites
    // *_pcbOutBuf with the size of each key.
    _cbMaxOutBuf = *_pcbOutBuf;
}

//+---------------------------------------------------------------------------
//
//  Member:     CValueNormalizer::PutValue, public
//
//  Synopsis:   Store a variant
//
//  Arguments:  [pid] -- property id
//              [occ] -- On input:  starting occurrence.
//                       On output: next starting occurrence.
//              [var] -- value
//
//  History:    04-Nov-94   KyleP        Created.
//
//----------------------------------------------------------------------------

void CValueNormalizer::PutValue( PROPID pid,
                                 OCCURRENCE & occ,
                                 CStorageVariant const & var )
{
    // Seed the shared occurrence counter with the caller's starting value.
    // The three-argument PutValue increments it once per stored key.
    *_pOcc = occ;

    // Dispatch on the variant type.  Each case hands the extracted value to
    // the PutValue overload selected by the argument's static type, so the
    // casts below are significant.
    switch ( var.Type() )
    {
    // Empty and null values produce no key.
    case VT_EMPTY:
    case VT_NULL:
        break;

    case VT_UI1:
        PutValue( pid, var.GetUI1() );
        break;

    case VT_I1:
        PutValue( pid, var.GetI1() );
        break;

    case VT_UI2:
        PutValue( pid, (USHORT) var.GetUI2() );
        break;

    case VT_I2:
        PutValue( pid, var.GetI2() );
        break;

    case VT_I4:
    case VT_INT:
        PutValue( pid, var.GetI4() );
        break;

    case VT_R4:
        PutValue( pid, var.GetR4() );
        break;

    case VT_R8:
        PutValue( pid, var.GetR8() );
        break;

    case VT_UI4:
    case VT_UINT:
        PutValue( pid, var.GetUI4() );
        break;

    case VT_I8:
        PutValue( pid, var.GetI8() );
        break;

    case VT_UI8:
        PutValue( pid, var.GetUI8() );
        break;

    // Booleans are collapsed to a BYTE holding 0 or 1 and stored through
    // the BYTE overload.
    case VT_BOOL:
        PutValue( pid, (BYTE) (FALSE != var.GetBOOL()) );
        break;

    case VT_ERROR:
        PutValue( pid, var.GetERROR() );
        break;

    // NOTE(review): the overloads taking the CY, FILETIME and CLSID values,
    // and PutDate, are not defined in the visible part of this file --
    // confirm their behavior there.
    case VT_CY:
        PutValue( pid, var.GetCY() );
        break;

    case VT_DATE:
        PutDate( pid, var.GetDATE() );
        break;

    case VT_FILETIME:
        PutValue( pid, var.GetFILETIME() );
        break;

    case VT_CLSID:
        PutValue( pid, *var.GetCLSID() );
        break;

    // NTRAID#DB-NTBUG9-84589-2000/07/31-dlee Indexing Service data type normalization doesn't handle VT_DECIMAL, VT_VECTOR, or VT_ARRAY.

    // Any other type is reported in the debug output and otherwise ignored.
    default:
        ciDebugOut(( DEB_IWARN, "Unhandled type %d (%x) sent to normalization\n",
                     var.Type(), var.Type() ));
        break;
    }

    // Hand the (possibly advanced) occurrence back to the caller.
    occ = *_pOcc;
}

//+---------------------------------------------------------------------------
//
//  Member:     CValueNormalizer::PutValue  private
//
//  Synopsis:   Store a unsigned 2 byte value without altering it
//
//  Arguments:  [pid]    -- property id
//              [uValue] -- value
//              [bType]  -- value type
//
//  History:    07-Oct-93   DwightKr     Created.
//
//  Notes:      This is the principal PutValue method that other PutValue()s
//              will call.  Each of the OTHER PutValue()'s sole purpose is
//              to normalize their input data into a 2-byte unsigned value.
//              This version of PutValue() will store the value together
//              with its WID, PID, size, etc. in the CDataRepository object.
//
//----------------------------------------------------------------------------
void CValueNormalizer::PutValue( PROPID pid, unsigned uValue, BYTE bType )
{
    //
    // Key layout (7 bytes):
    //   [0]     key type
    //   [1..4]  property id, most significant byte first
    //   [5..6]  16-bit value, most significant byte first
    //

    BYTE * const pbKey = _pbOutBuf;

    // Size of the entry: value + property id + type byte
    *_pcbOutBuf = sizeof(USHORT) + sizeof(PROPID) + 1;

    pbKey[0] = bType;

    pbKey[1] = (BYTE)(pid >> 24);
    pbKey[2] = (BYTE)(pid >> 16);
    pbKey[3] = (BYTE)(pid >> 8);
    pbKey[4] = (BYTE) pid;

    // Only 16 bits of the value are stored.
    Win4Assert( uValue < 0x10000 );
    pbKey[5] = (BYTE)(uValue >> 8);
    pbKey[6] = (BYTE) uValue;

#if CIDBG == 1
    // Dump the finished key, one hex byte at a time.
    for ( unsigned ib = 0; ib < *_pcbOutBuf; ib++ )
    {
        ciDebugOut (( DEB_USER1 | DEB_NOCOMPNAME, "%02x ", _pbOutBuf[ib] ));
    }
    ciDebugOut (( DEB_USER1 | DEB_NOCOMPNAME, "\n" ));
#endif

    // Hand the key to the repository, then advance to the next occurrence.
    _krep.PutPropId(pid);
    _krep.PutKey();
    (*_pOcc)++;
}

void CValueNormalizer::PutMinValue( PROPID pid, OCCURRENCE & occ, VARENUM Type )
{
    // Stores the smallest normalized value (0) for the given type.
    // [occ] is the starting occurrence on input and receives the next
    // occurrence on output.

    *_pOcc = occ;

    PutValue( pid, 0, Type );

    occ = *_pOcc;
}

void CValueNormalizer::PutMaxValue( PROPID pid, OCCURRENCE & occ, VARENUM Type )
{
    // Stores the largest normalized value (0xFFFF) for the given type.
    // [occ] is the starting occurrence on input and receives the next
    // occurrence on output.

    *_pOcc = occ;

    PutValue( pid, 0xFFFF, Type );

    occ = *_pOcc;
}

//+---------------------------------------------------------------------------
//
//  Member:     CValueNormalizer::PutValue  public
//
//  Synopsis:   Store a 1 byte value without altering it
//
//  Arguments:  [pid]  -- property id
//              [byte] -- value
//
//  History:    25-Oct-93   DwightKr     Created.
//
//  Notes:      One byte values are NOT normalized, they are stored as is.
//
//----------------------------------------------------------------------------
void CValueNormalizer::PutValue( PROPID pid, BYTE byte )
{
    // The byte is widened unchanged and stored as the key value.
    unsigned const uKey = byte;

    PutValue( pid, uKey, VT_UI1 );
}

//+---------------------------------------------------------------------------
//
//  Member:     CValueNormalizer::PutValue  public
//
//  Synopsis:   Store a 1 byte signed value without altering it
//
//  Arguments:  [pid]  -- property id
//              [ch]   -- value
//
//  History:    25-Oct-1993   DwightKr     Created.
//              29-Sep-2000   KitmanH      Normalize VT_I1 values
//
//----------------------------------------------------------------------------
void CValueNormalizer::PutValue( PROPID pid, CHAR ch )
{
    // Adding 0x80 modulo 256 is the same as flipping the top bit of the
    // byte: -128 maps to 0x00 and 127 maps to 0xFF, so unsigned key order
    // matches signed value order.
    unsigned const uKey = ((BYTE) ch) ^ 0x80;

    PutValue( pid, uKey, VT_I1 );
}

//+---------------------------------------------------------------------------
//
//  Member:     CValueNormalizer::PutValue
//
//  Synopsis:   Store the high byte of an unsigned 2 byte value
//
//  Arguments:  [pid]     -- property id
//              [usValue] -- value
//
//  History:    07-Oct-93   DwightKr     Created.
//
//----------------------------------------------------------------------------
void CValueNormalizer::PutValue( PROPID pid, USHORT usValue )
{
    // Only the most significant byte of the value is kept.
    unsigned const uKey = (usValue >> 8) & 0xFF;

    PutValue( pid, uKey, VT_UI2 );
}

//+---------------------------------------------------------------------------
//
//  Member:     CValueNormalizer::PutValue  public
//
//  Synopsis:   Store the high byte of a signed 2 byte value.
//
//  Arguments:  [pid]     -- property id
//              [sValue]  -- value
//
//  Notes:      Add the smallest BYTE to this so that we translate numbers
//              into the range above 0.  i.e. -32768 maps into 0x00, and 32767
//              maps into 0xFF.
//
//  History:    07-Oct-93   DwightKr     Created.
//
//----------------------------------------------------------------------------
void CValueNormalizer::PutValue( PROPID pid, SHORT sValue )
{
    // Take the high byte, then add 0x80 and keep the low eight bits of the
    // sum.
    int const highByte = sValue >> 8;
    unsigned const uKey = (highByte + 0x80) & 0xFF;

    PutValue( pid, uKey, VT_I2 );
}

//+---------------------------------------------------------------------------
//
//  Member:     CValueNormalizer::PutValue  public
//
//  Synopsis:   Store the ULONG value compressed into a 16-bit bin number.
//
//  Arguments:  [pid]     -- property id
//              [ulValue] -- value
//
//  Notes:      This converts ULONGs into the range 0x0000 - 0xFFFF using
//              NormULong.
//
//  History:    07-Oct-93   DwightKr     Created.
//
//----------------------------------------------------------------------------
void CValueNormalizer::PutValue( PROPID pid, ULONG  ulValue )
{
    // Compress the 32-bit value to its 16-bit bin number.
    unsigned const uKey = NormULong( ulValue );

    PutValue( pid, uKey, VT_UI4 );
}

//+---------------------------------------------------------------------------
//
//  Member:     CValueNormalizer::PutValue
//
//  Synopsis:   Store the signed LONG value compressed into a 16-bit bin
//              number.
//
//  Arguments:  [pid]     -- property id
//              [lValue]  -- value
//
//  Notes:      This converts LONGs into the range 0x0000 - 0xFFFF using
//              NormLong.  Negative numbers fall below 0x8000; numbers
//              >= 0 fall at or above 0x8000.
//
//  History:    07-Oct-93   DwightKr     Created.
//
//----------------------------------------------------------------------------
void CValueNormalizer::PutValue( PROPID pid, LONG lValue )
{
    // Compress the signed 32-bit value to its 16-bit bin number.
    unsigned const uKey = NormLong( lValue );

    PutValue( pid, uKey, VT_I4 );
}

//+---------------------------------------------------------------------------
//
//  Member:     CValueNormalizer::PutValue
//
//  Synopsis:   Store the bin number derived from the sign and base-2
//              exponent of the FLOAT value.
//
//  Arguments:  [pid]     -- property id
//              [rValue]  -- value
//
//  Notes:      floats fit into a total of 133 bins (0 - 132); see NormDouble.
//
//  History:    07-Oct-93   DwightKr     Created.
//
//----------------------------------------------------------------------------
void CValueNormalizer::PutValue( PROPID pid, float rValue )
{
    // Floats share the binning routine used for doubles (NormDouble); only
    // the variant type tag (VT_R4) differs from the double overload.
    // NOTE(review): rValue is presumably widened to double at the call --
    // NormDouble's signature is not visible here.
    PutValue(pid, NormDouble(rValue), VT_R4);
}

//+---------------------------------------------------------------------------
//
//  Member:     CValueNormalizer::PutValue
//
//  Synopsis:   Store the base-10 log of the DOUBLE value.
//
//  Arguments:  [pid]     -- property id
//              [dValue]  -- value
//
//  Notes:      doubles fit into a total of 41 bins.
//
//  History:    07-Oct-93   DwightKr     Created.
//
//----------------------------------------------------------------------------
void CValueNormalizer::PutValue( PROPID pid, double dValue )
{
    // Reduce the value to a bin with NormDouble, then forward that bin to
    // the three-argument PutValue tagged with the variant type VT_R8.
    PutValue(pid, NormDouble(dValue), VT_R8);
}

//+---------------------------------------------------------------------------
//
//  Member:     CValueNormalizer::PutValue
//
//  Synopsis:   Store the exponent of a large integer
//
//  Arguments:  [pid] -- property id
//              [li]  -- value
//
//  History:    21-Sep-92   BartoszM    Created.
//              04-Feb-93   KyleP       Use LARGE_INTEGER
//              25-Oct-92   DwightKr    Copied here & removed extra code &
//                                      accounted for negative numbers
//
//----------------------------------------------------------------------------
void CValueNormalizer::PutValue( PROPID pid, LARGE_INTEGER liValue )
{
    //
    // NormLargeInteger reduces the 64-bit signed value to a bin number;
    // that bin is what gets stored, tagged as VT_I8.
    //

    unsigned const bin = NormLargeInteger( liValue );

    PutValue( pid, bin, VT_I8 );
}

//+---------------------------------------------------------------------------
//
//  Member:     CValueNormalizer::PutValue
//
//  Synopsis:   Store a compressed large integer
//
//  Arguments:  [pid] -- property id
//              [uli] -- value
//
//  History:    09 Feb 96   AlanW      Created.
//
//----------------------------------------------------------------------------
void CValueNormalizer::PutValue( PROPID pid, ULARGE_INTEGER uliValue )
{
    //
    // NormULargeInteger reduces the 64-bit unsigned value to a bin number;
    // that bin is what gets stored, tagged as VT_UI8.
    //

    unsigned const bin = NormULargeInteger( uliValue );

    PutValue( pid, bin, VT_UI8 );
}

//+---------------------------------------------------------------------------
//
//  Member:     CValueNormalizer::PutValue
//
//  Synopsis:   Store the low 16 bits of the first DWORD (Data1) of a GUID
//
//  Arguments:  [pid]   -- property id
//              [guid]  -- value
//
//  Notes:      The GUID generators are guaranteed to modify the TOP DWORD
//              of the 16-byte GUID each time a new GUID is generated.
//              The lower bytes of the GUID is the network address of the
//              card which generated the UUID.
//
//              We would like to cluster together objects of a single
//              class (all MS-Word objects together for example).  Since it
//              is possible that someone could generate UUIDs for more than
//              one application on a single machine, the lower portion of
//              the UUID will perhaps remain constant between class IDs.  The
//              only part of the UUID which is guaranteed to be unique between
//              multiple objects is the field which represents time.  It is
//              unlikely that two classes were generated the same second on
//              two different machines.
//
//  History:    25-Oct-93   DwightKr     Created.
//
//----------------------------------------------------------------------------
void CValueNormalizer::PutValue( PROPID pid, GUID const & Guid )
{
    //
    // Only the low 16 bits of the first DWORD of the GUID take part in
    // the stored value; the rest of the GUID is ignored.
    //

    unsigned long const lowWord = Guid.Data1 & 0xFFFF;

    PutValue( pid, lowWord, VT_CLSID );
}

long CastToLong( double d )
{
    //
    // bit 63       = sign
    // bits 52 - 62 = exponent
    // bits 0 - 51  = mantissa
    //

    LARGE_INTEGER * pli = (LARGE_INTEGER *)&d;

    int exp  = (pli->HighPart & 0x7ff00000) >> 20;

    if ( exp == 0 )
    {
        //
        // Special case: Zero, NaNs, etc.
        //

        return( 0 );
    }

    //
    // Subtract off bias
    //

    exp -= 0x3ff;

    if ( exp < 0 )
    {
        // Cast of very small number to unsigned long.  Loss of precision
        return( 0 );
    }
    else if ( exp > 30 )
    {
        // Cast of very large number to unsigned long.  Overflow
        if ( pli->HighPart & 0x80000000 )
            return( LONG_MIN );
        else
            return( LONG_MAX );
    }
    else
    {
        //
        // We need to get the top 32 bits of the mantissa
        // into a dword.
        //

        unsigned long temp = pli->LowPart >> (32 - 12);
        temp |= pli->HighPart << (32 - 20);

        //
        // Add the 'hidden' bit of the mantissa. (Since all doubles
        // are normalized to 1.?????? the highest 1 bit isn't stored)
        //

        temp = temp >> 1;
        temp |= 0x80000000;

        //
        // Thow away digits to the right of decimal
        //

        temp = temp >> (31 - exp);

        //
        // Adjust for sign
        //

        Win4Assert( (temp & 0x80000000) == 0 );
        long temp2;

        if ( pli->HighPart & 0x80000000 )
            temp2 = temp * -1;
        else
            temp2 = temp;

        return( temp2 );
    }
} //CastToLong

//+---------------------------------------------------------------------------
//
//  Member:     CValueNormalizer::PutDate
//
//  Synopsis:   Dates are passed in as the number of days (and fractional days)
//              since Jan. 1, 1900.  We'll crunch this down to the number of
//              weeks.  Dates are passed in as doubles.  We'll assume that
//              negative numbers represent dates before Jan. 1, 1900.
//
//  Arguments:  [pid]     -- property id
//              [DATE]    -- value (double)
//
//  Notes:      Since dates before Jan 1, 1900 are passed as negative numbers
//              we'll need to normalize them to something >= 0.
//
//              time period                    resolution            # bins
//              ===========================    ===============       ======
//              year < 10Bil BC                -- bin = 0               1
//              10Bil BC <= year <=    1 BC    -- log10 (year)         11
//                 1 BC  <  year <= 1900       -- year               1902
//              1901 AD  <= year <= 2050 AD    -- daily             54787
//              2051 AD  <= year <= 10Bil AD   -- log10 (year)          8
//              year > 10Bil AD                -- bin = 0xFFFF          1
//
//
//              I chose the daily range from 1901 - 2050 since there are a lot
//              of events in the 20th century (WW I, WW II, landing on the
//              moon, my wife's birthday, etc.) that are interesting and
//              important.  It is likely that dates outside of this range will
//              be rounded to the nearest year (1492, 1776, 1812, 1867, etc).
//
//              Also by breaking the log10(year) at 1 BC rather than some other
//              date (such as 0000 AD, or 1 AD) we avoid values in the range
//              1 BC < year < 1 AD, calculating log10(year) resulting in
//              large negative numbers.  Everything in this range should be in
//              bin #12.  It also avoids taking log10(0).
//
//
//  History:    25-Oct-93   DwightKr     Created.
//              07-Dec-94   KyleP        Remove use of floating point
//
//----------------------------------------------------------------------------

void CValueNormalizer::PutDate( PROPID pid, DATE const & Date )
{
    //
    // Thresholds.  The exponent values are compared against the exponent
    // reported by GetExpAndSign for the day count (treated as a power of
    // 2, per the 2^N notes on each constant).
    //

    const int MinDate = 42;     // 2^42 --> ~4.4E12 days --> ~12E9 years  --> 12 billion B.C.
    const int MinByYear = 20;   // 2^20 --> ~1.0E6  days --> ~2.9E3 years --> 970 B.C.
    const int cMinByYear = (1 << MinByYear) / 365 + 1;   // 2873
    const int MaxDaily = (2051 - 1900) * 365;            // 55115
    const int MinByYearAD = 15; // 2^15 --> ~32768 days  --> ...
    const int MaxDate = 42;     // 2^42 --> ~4.4E12 days --> ~12E9 years  --> 12 billion A.D.

    //
    // Bin layout, lowest bin = earliest date:
    //
    //   FirstBC                     negative, exponent >= MinDate
    //   FirstLogBC  .. LastLogBC    negative, one bin per power of 2
    //   FirstYearBC .. LastYearBC   negative, one bin per 365 days
    //   FirstDaily  .. LastDaily    positive, one bin per day
    //   FirstLogAD  .. LastLogAD    positive, one bin per power of 2
    //   LastAD                      positive, exponent >= MaxDate
    //

    const unsigned FirstBC            = 0;
    const unsigned FirstLogBC         = FirstBC + 1;
    const unsigned LastLogBC          = FirstLogBC + MinDate - MinByYear;
    const unsigned FirstYearBC        = LastLogBC + 1;
    const unsigned LastYearBC         = FirstYearBC + cMinByYear;
    const unsigned FirstDaily         = LastYearBC + 1;
    const unsigned LastDaily          = FirstDaily + MaxDaily;
    const unsigned FirstLogAD         = LastDaily + 1;
    const unsigned LastLogAD          = FirstLogAD + MaxDate - MinByYearAD;
    const unsigned LastAD             = 0xFFFF;

    Win4Assert( LastLogAD < 0xFFFF );

    unsigned bin;
    BOOL fPositive;

    int exp = GetExpAndSign( Date, fPositive );

    if ( !fPositive )
    {
        //
        // Very large negative dates go in first bin
        //

        if ( exp >= MinDate )
            bin = FirstBC;

        //
        // Medium size negative dates get 1 bin / power of 2.
        //
        // A larger exponent means a date further in the past, so it must
        // map to a smaller bin.  Counting down from LastLogBC keeps the
        // result inside FirstLogBC .. LastLogBC for every exponent in
        // MinByYear .. MinDate - 1.  (The previous expression,
        // FirstLogBC - exp + MinByYear, evaluates to 21 - exp; for
        // exp >= 22 that went below zero and wrapped around as an
        // unsigned value, and for exp == 21 it collided with FirstBC.)
        //
        // NOTE(review): this changes the bin produced for dates in this
        // range; confirm whether existing indexes need to be rebuilt.
        //

        else if ( exp >= MinByYear )
            bin = LastLogBC - (exp - MinByYear);

        //
        // All other dates before 1900 get 1 bucket per 365 days.
        // cYears is in -cMinByYear .. 0 (asserted below), so the bin
        // falls in FirstYearBC .. LastYearBC.
        //

        else
        {
            long cYears = CastToLong( Date ) / 365;

            Win4Assert( cYears >= -cMinByYear && cYears <= 0 );

            bin = FirstYearBC + cYears + cMinByYear;
        }
    }
    else
    {
        //
        // Very large positive dates go in last bin
        //

        if ( exp >= MaxDate )
            bin = LastAD;
        else
        {
            long cDays = CastToLong( Date );

            //
            // Dates rather far in the future get 1 bucket / power of 2.
            // cDays >= MaxDaily implies exp >= MinByYearAD, so the
            // subtraction cannot go below FirstLogAD.
            //

            if ( cDays >= MaxDaily )
                bin = FirstLogAD + exp - MinByYearAD;

            //
            // Days close to today get 1 bucket per day
            //

            else
                bin = FirstDaily + cDays;
        }
    }

    PutValue(pid, bin, VT_DATE);
} //PutDate

//+---------------------------------------------------------------------------
//
//  Member:     CValueNormalizer::PutValue
//
//  Synopsis:   Store the hashed value of an 8-byte currency.
//
//  Arguments:  [pid]     -- property id
//              [cyValue]  -- value
//
//  Notes:      A CURRENCY is a 64-bit fixed-point number (scaled by 10,000)
//              split into a ULONG Lo and a LONG Hi.  We ignore the Lo
//              portion and store the Hi part using the standard LONG
//              storage method.
//
//  History:    26-Oct-93   DwightKr     Created.
//
//----------------------------------------------------------------------------

void CValueNormalizer::PutValue( PROPID pid, CURRENCY const & cyValue)
{
    // Only the Hi member is examined; it is binned with NormLong (the same
    // routine the LONG overload uses) and stored tagged as VT_CY.  The Lo
    // member does not influence the stored value.
    PutValue(pid, NormLong(cyValue.Hi), VT_CY);
}

//+---------------------------------------------------------------------------
//
//  Member:     CValueNormalizer::PutValue
//
//  Synopsis:   Store the number of days since Jan 1, 1980;
//
//  Arguments:  [pid]     -- property id
//              [ulValue] -- value
//
//  History:    07-Oct-93   DwightKr     Created.
//
//  Notes:      This algorithm calculates the number of days since Jan 1,
//              1980, and stores it into an unsigned.  FileTimes are divided
//              into the following ranges:
//
//              FileTime < 1980                              => bin  0
//              1980 <= FileTime <= 1993    week granularity => bins 1 - 729
//              1994 <= FileTime <= 2160    day granularity  => bins 730+
//              FileTime > 2160                              => bin 0xFFFF
//
//----------------------------------------------------------------------------

void CValueNormalizer::PutValue( PROPID pid, FILETIME const & ftValue )
{
    //
    //  Convert the file time to a whole number of days since Jan 1, 1601
    //  by dividing the 64-bit tick count by the number of 100 nanosecond
    //  ticks in a day; the quotient fits in a ULONG.  The day count is
    //  then compared against precomputed breakpoints to select one of
    //  four ranges: before 1980, weekly, daily, and after 2160.
    //

    const ULONGLONG cTicksPerDay = 24 * 60 * 60 * (ULONGLONG)10000000;

    const ULONG cDaysTo1980     = 138426;   // number of days from 1601 to 1980
    const ULONG cDaysToLastWeek = 143542;   // number of days from 1601 to 1/2/1994
    const ULONG cDaysTo2161     = 204535;   // number of days from 1601 to 2161

    ULARGE_INTEGER ticks;
    ticks.LowPart  = ftValue.dwLowDateTime;
    ticks.HighPart = ftValue.dwHighDateTime;

    const ULONG cDays = (ULONG) (ticks.QuadPart / cTicksPerDay);

    unsigned bin;

    if ( cDays < cDaysTo1980 )
    {
        // Anything before 1980 shares bin 0.
        bin = 0;
    }
    else if ( cDays <= cDaysToLastWeek )
    {
        // One bin per 7 days; this range yields bins 0 through 731.
        bin = (cDays + 1 - cDaysTo1980) / 7;
    }
    else if ( cDays <= cDaysTo2161 )
    {
        //
        // One bin per day, offset by the number of whole weeks in the
        // weekly range (730).  The first day here lands on bin 732, so
        // the daily bins continue just above the weekly ones.
        //

        bin = (cDays + 1 - cDaysToLastWeek) +
              ((cDaysToLastWeek - cDaysTo1980) / 7);
    }
    else
    {
        // FileTime > 2160
        bin = 0xFFFF;
    }

    PutValue( pid, bin, VT_FILETIME );
}