windows-server-2003/base/fs/hsm/wsb/wsbhash.cpp

/*++

© 1998 Seagate Software, Inc.  All rights reserved

Module Name:

    Wsbhash.cpp

Abstract:

    Some functions for hashing text strings and creating DB keys from
    file path names.

    NOTE: Since no one needed this code by the time I got it done, it
    hasn't been tested!

Author:

    Ron White   [ronw]   25-Apr-1997

Revision History:

--*/

#include "stdafx.h"

// Pseudorandom permutation table used by the SimpleHash function below.  It is
// taken from the article referenced in the comments for that function.
//
// The table has 256 entries (16 rows of 16) and is indexed by a value formed
// from XORing two byte-sized values, so every possible index is in range.
// It is lookup-only data and is never modified, hence the const qualifier.
static const UCHAR perm_table[] = {
      1,  87,  49,  12, 176, 178, 102, 166, 121, 193,   6,  84, 249, 230,  44, 163,
     14, 197, 213, 181, 161,  85, 218,  80,  64, 239,  24, 226, 236, 142,  38, 200,
    110, 177, 104, 103, 141, 253, 255,  50,  77, 101,  81,  18,  45,  96,  31, 222,
     25, 107, 190,  70,  86, 237, 240,  34,  72, 242,  20, 214, 244, 227, 149, 235,
     97, 234,  57,  22,  60, 250,  82, 175, 208,   5, 127, 199, 111,  62, 135, 248,
    174, 169, 211,  58,  66, 154, 106, 195, 245, 171,  17, 187, 182, 179,   0, 243,
    132,  56, 148,  75, 128, 133, 158, 100, 130, 126,  91,  13, 153, 246, 216, 219,
    119,  68, 223,  78,  83,  88, 201,  99, 122,  11,  92,  32, 136, 114,  52,  10,
    138,  30,  48, 183, 156,  35,  61,  26, 143,  74, 251,  94, 129, 162,  63, 152,
    170,   7, 115, 167, 241, 206,   3, 150,  55,  59, 151, 220,  90,  53,  23, 131,
    125, 173,  15, 238,  79,  95,  89,  16, 105, 137, 225, 224, 217, 160,  37, 123,
    118,  73,   2, 157,  46, 116,   9, 145, 134, 228, 207, 212, 202, 215,  69, 229,
     27, 188,  67, 124, 168, 252,  42,   4,  29, 108,  21, 247,  19, 205,  39, 203,
    233,  40, 186, 147, 198, 192, 155,  33, 164, 191,  98, 204, 165, 180, 117,  76,
    140,  36, 210, 172,  41,  54, 159,   8, 185, 232, 113, 196, 231,  47, 146, 120,
     51,  65,  28, 144, 254, 221,  93, 189, 194, 139, 112,  43,  71, 109, 184, 209
};

//  Local functions
//
//  Forward declarations of the two file-local (static) helpers defined below:
//    ProgressiveHash - hashes a wide-character string into a byte key.
//    SimpleHash      - hashes a run of bytes into a single byte.
static HRESULT ProgressiveHash(WCHAR* pWstring, ULONG nChars, UCHAR* pKey, 
        ULONG keySize, ULONG* pKeyCount);
static UCHAR SimpleHash(UCHAR* pString, ULONG count);


//  ProgressiveHash - hash a wide-character string into a byte key of a given
//  maximum size.
//
//  Arguments:
//    pWstring  - The string to hash.  It is read as raw bytes, two bytes per
//                character; nChars (not a terminator) determines its length.
//    nChars    - Number of characters to hash.  Limited to 32K characters
//                (64K bytes).
//    pKey      - Buffer that receives the key bytes.
//    keySize   - Size of pKey in bytes; must be at least 16.
//    pKeyCount - Optional (may be NULL).  On success receives the number of
//                key bytes actually written.
//
//  Returns:
//    S_OK         - Success.
//    E_POINTER    - pWstring or pKey is NULL.
//    E_INVALIDARG - nChars is larger than 32K or keySize is less than 16.
//    (Failures are reported through the WsbAffirm/WsbCatch macros, which are
//    defined elsewhere; presumably a failed affirm leaves the try block and
//    WsbCatch stores the error in hr.)
//
//  The algorithm starts out merely XORing the two bytes of each character into
//  a single byte in the key.  If it must use the last 15 bytes of the key, it
//  begins using the SimpleHash function to hash progressively larger (doubling)
//  chunks of the string into a single byte.
//
//  This method is used to try to preserve as much information about short
//  strings as possible; to preserve, to some extent, the sort order of strings;
//  and to compress long strings into a reasonably sized key.  It is assumed
//  (perhaps incorrectly) that many of the characters will be ANSI characters
//  and so the XOR of the bytes in the initial part of the string won't lose
//  any information.

static HRESULT ProgressiveHash(WCHAR* pWstring, ULONG nChars, UCHAR* pKey, 
        ULONG keySize, ULONG* pKeyCount)
{
    HRESULT hr = S_OK;

    try {
        ULONG   chunk;           // Current chunk size
        ULONG   headSize;        // Key bytes filled one-per-character
        ULONG   keyIndex = 0;    // Current index into the key
        UCHAR*  pBytes;          // Byte pointer into the string
        ULONG   remains;         // Bytes remaining in the string

        //  Check arguments
        WsbAffirm(NULL != pWstring, E_POINTER);
        WsbAffirm(NULL != pKey, E_POINTER);

        //  Validate the character count BEFORE converting it to a byte count.
        //  Checking the doubled value instead would let a huge nChars wrap
        //  around in the unsigned multiply and slip past the limit, so that
        //  only a small part of the string would be hashed.
        WsbAffirm(32768 >= nChars, E_INVALIDARG);
        WsbAffirm(15 < keySize, E_INVALIDARG);
        remains = nChars * 2;

        //  Do the non-progressive part: one key byte per character
        pBytes = (UCHAR*)pWstring;
        headSize = keySize - 15;
        while (remains > 0 && keyIndex < headSize) {
            pKey[keyIndex++] = (UCHAR) ( *pBytes ^ *(pBytes + 1) );
            pBytes += 2;
            remains -= 2;
        }

        //  Do the progressive part: one key byte per chunk, with the chunk
        //  size doubling each time (4, 8, 16, ... bytes).
        //
        //  The 15 bytes reserved above are always enough: headSize is at least
        //  1, so at most 65534 bytes remain here; 14 doubling chunks cover
        //  65532 bytes and a 15th chunk covers whatever is left.
        chunk = 4;
        while (remains > 0) {
            if (chunk > remains) {
                chunk = remains;     // Final, short chunk
            }
            pKey[keyIndex++] = SimpleHash(pBytes, chunk);
            pBytes += chunk;
            remains -= chunk;
            chunk *= 2;
        }

        if (NULL != pKeyCount) {
            *pKeyCount = keyIndex;
        }
    } WsbCatch(hr);

    return(hr);
}


//  SimpleHash - reduce a run of bytes to a single hash byte.
//
//  Arguments:
//    pString - First byte of the data to hash.
//    count   - Number of bytes to hash.
//
//  Returns the hash byte; for count == 0 no data is read and the result is 0.
//
//  Each input byte is XORed with the running hash value and the result is used
//  as an index into perm_table to obtain the next hash value.
//
//  This algorithm and the permutation table come from the article "Fast Hashing
//  of Variable-Length Text Strings" in the June 1990 (33, 6) issue of
//  Communications of the ACM (CACM).
//  NOTE: For a hash value larger than one byte, the article suggests hashing the
//  original string with this function to get one byte, adding 1 (mod 256) to the
//  first byte of the string and hashing the new string with this function to get
//  the second byte, etc.

static UCHAR SimpleHash(UCHAR* pString, ULONG count)
{
    int          hash = 0;          // Running hash, always in 0..255
    const UCHAR* pByte = pString;   // Next byte to fold into the hash

    for (ULONG left = count; left > 0; left--) {
        hash = perm_table[hash ^ *pByte];
        pByte++;
    }

    return((UCHAR)hash);
}

//  SquashFilepath - compress a file path name into a fixed-size key.
//
//  Arguments:
//    pWstring - NUL-terminated path name.  The text before the last backslash
//               is treated as the path part and the text after it as the file
//               name; with no backslash the whole string is the file name.
//    pKey     - Buffer that receives exactly keySize key bytes.
//    keySize  - Size of pKey in bytes; must be greater than 60.
//
//  Returns:
//    S_OK         - Success.
//    E_POINTER    - pWstring or pKey is NULL.
//    E_INVALIDARG - keySize is 60 or less.
//    Any failure returned by ProgressiveHash for either part is passed on.
//
//  The key is split into a path part (the first (keySize / 4) * 3 bytes) and a
//  file name part (the rest of the key).  Each part is compressed with the
//  ProgressiveHash function and then padded with zeros to its full size.
//
//  This function attempts to preserve enough information in the key that keys
//  will be sorted in approximately the same order as the original path names
//  and that it is unlikely (though not impossible) that two different paths
//  would result in the same key.  Both of these depend on the size of the key.
//  A reasonable size is probably 128 bytes, which gives 96 bytes for the path
//  and 32 bytes for the file name.  A key size of 60 or less is rejected
//  because the file name part would be smaller than the 16 bytes that the
//  ProgressiveHash function requires.

HRESULT SquashFilepath(WCHAR* pWstring, UCHAR* pKey, ULONG keySize)
{
    HRESULT hr = S_OK;

    try {
        ULONG  dirChars;      // Characters in the path part
        ULONG  dirKeySize;    // Key bytes reserved for the path part
        ULONG  nameBytes;     // Key bytes produced for the file name
        ULONG  nameChars;     // Characters in the file name
        WCHAR* pLastSlash;    // Last backslash in the input, if any
        WCHAR* pName;         // Start of the file name
        ULONG  written;       // Key bytes filled in so far

        //  Check arguments
        WsbAffirm(NULL != pWstring, E_POINTER);
        WsbAffirm(NULL != pKey, E_POINTER);
        WsbAffirm(60 < keySize, E_INVALIDARG);

        //  Split the input at the last backslash
        pLastSlash = wcsrchr(pWstring, WCHAR('\\'));
        if (NULL != pLastSlash) {
            dirChars = (ULONG)(pLastSlash - pWstring);
            pName = pLastSlash + 1;
        } else {
            dirChars = 0;
            pName = pWstring;
        }
        dirKeySize = (keySize / 4) * 3;

        //  Compress the path (if there is one) into the front of the key
        written = 0;
        if (dirChars > 0) {
            WsbAffirmHr(ProgressiveHash(pWstring, dirChars, pKey, dirKeySize,
                    &written));
        }

        //  Zero-pad the path part of the key
        while (written < dirKeySize) {
            pKey[written] = 0;
            written++;
        }

        //  Compress the file name into the remainder of the key; written now
        //  equals dirKeySize, so the file name part starts right there.
        nameChars = (ULONG)wcslen(pName);
        if (nameChars > 0) {
            nameBytes = 0;
            WsbAffirmHr(ProgressiveHash(pName, nameChars, &pKey[written],
                    keySize - dirKeySize, &nameBytes));
            written = dirKeySize + nameBytes;
        }

        //  Zero-pad the file name part of the key
        while (written < keySize) {
            pKey[written] = 0;
            written++;
        }
    } WsbCatch(hr);

    return(hr);
}