Leaked source code of windows server 2003
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 
 
 

565 lines
16 KiB

//+-------------------------------------------------------------------------
//
// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF
// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO
// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A
// PARTICULAR PURPOSE.
//
// Copyright 2001 - 2001 Microsoft Corporation. All Rights Reserved.
//
// File: stem.hxx
//
// PURPOSE: Classes to read a binary file of stem expansions
//
// PLATFORM: Windows 2000 and later
//
//--------------------------------------------------------------------------
#pragma once
//
// Suffix codes stored after the root of each stem set.  Every code has its
// high bit set, which is what ends the root scan in the CStemSet constructor.
// CStemSet::GetNth decodes a code into a full word form.
//
// Single-byte codes: the code byte alone describes how the root is altered.
//
#define SFX_ADD_D 0xff
#define SFX_ADD_DED 0xfe
#define SFX_ADD_DING 0xfd
#define SFX_ADD_E 0xfc
#define SFX_ADD_ED 0xfb
#define SFX_ADD_EN 0xfa
#define SFX_ADD_ER 0xf9
#define SFX_ADD_ES 0xf8
#define SFX_ADD_EST 0xf7
#define SFX_ADD_ING 0xf6
#define SFX_ADD_KED 0xf5
#define SFX_ADD_KING 0xf4
#define SFX_ADD_LED 0xf3
#define SFX_ADD_LING 0xf2
#define SFX_ADD_N 0xf1
#define SFX_ADD_NER 0xf0
#define SFX_ADD_R 0xef
#define SFX_ADD_S 0xee
#define SFX_ADD_SES 0xed
#define SFX_ADD_ST 0xec
#define SFX_ADD_T 0xeb
#define SFX_ADD_TA 0xea
#define SFX_DROP_EY_ADD_IER 0xe9
#define SFX_DROP_EY_ADD_IEST 0xe8
#define SFX_DROP_E_ADD_ING 0xe7
#define SFX_DROP_LAST_ADD_T 0xe6
#define SFX_DROP_ON_ADD_A 0xe5
#define SFX_DROP_O_ADD_I 0xe4
#define SFX_DROP_UM_ADD_A 0xe3
#define SFX_DROP_US_ADD_I 0xe2
#define SFX_DROP_Y_ADD_IED 0xe1
#define SFX_DROP_Y_ADD_IER 0xe0
#define SFX_DROP_Y_ADD_IES 0xdf
#define SFX_DROP_Y_ADD_IEST 0xde
#define SFX_REPEATLAST_ADD_ED 0xdd
#define SFX_REPEATLAST_ADD_ER 0xdc
#define SFX_REPEATLAST_ADD_EST 0xdb
#define SFX_REPEATLAST_ADD_ING 0xda
#define SFX_SINGLE_BYTE 0xd0 // codes >= this value occupy exactly 1 byte
//
// Multi-byte codes (all below SFX_SINGLE_BYTE).
//
// Code byte + 1 byte: the extra byte replaces the root's second-to-last char.
#define SFX_SWAP_PENULTIMATE 0xcf
// Code byte + prefix-length byte + text-length byte + text bytes: the text is
// written into the output at offset prefix-length.
#define SFX_PREFIX 0xce
// Code byte + length byte + text bytes: the text is the complete form.
#define SFX_NOPREFIX 0xcd
// Returns TRUE when the top bit (0x80) of b is set, FALSE otherwise.
__inline BOOL IsHighBitSet( BYTE b )
{
    return ( ( b & 0x80 ) != 0 );
}
// Size, in chars including the terminating null, of the buffers this file
// uses to hold a root or a decoded word form.
const unsigned cbMaxStem = 50;
// Sentinel: passed as a bookmark it means "no bookmark yet"; CStem::SkipList
// also returns it when the computed offset is at or past the end of the keys.
const unsigned stemInvalid = 0xffffffff;
//+-------------------------------------------------------------------------
//
// Class:   CDirectoryEntry
//
// Purpose: One directory slot.  Packs two numbers into a single unsigned:
//          the low 24 bits hold an offset and the high 8 bits hold an
//          entry index.
//
//--------------------------------------------------------------------------
class CDirectoryEntry
{
public:
    // Stores off in the low 24 bits and entry in the high 8 bits.
    // off must fit in 24 bits and entry in 8 bits: bits of off above bit 23
    // are OR'ed into the entry byte, and bits of entry above bit 7 are
    // shifted out.
    void Set( unsigned off, unsigned entry )
    {
        value = ( ( entry << 24 ) | off );
    }

    // Returns the 24-bit offset.
    unsigned Offset() const
    {
        return ( value & 0x00ffffff );
    }

    // Returns the 8-bit entry index.
    unsigned Entry() const
    {
        return ( ( value & 0xff000000 ) >> 24 );
    }

private:
    unsigned value;     // entry index << 24 | offset
};
//+-------------------------------------------------------------------------
//
// Class:   CStemSet
//
// Purpose: Decodes one stem set from the key data.  A stem set is a root
//          (bytes that are non-zero and have the high bit clear) followed
//          by a sequence of SFX_* suffix codes and terminated by a 0 byte.
//          Form 0 of the set is the root itself; forms 1..n are produced
//          by applying the suffix codes, in order, to the root.
//
// Notes:   Output buffers are not length-checked here.  Callers in this
//          file pass char[ cbMaxStem ].
//
//--------------------------------------------------------------------------
class CStemSet
{
public:
    // pb   - start of the key data
    // oSet - byte offset of this stem set within pb
    CStemSet( BYTE * pb, unsigned oSet ) : _pb( pb + oSet )
    {
        // Copy the root.  The length bound keeps an over-long (corrupt)
        // root from overrunning _acRoot.
        _ccRoot = 0;
        while ( ( _ccRoot < cbMaxStem - 1 ) &&
                ( 0 != _pb[_ccRoot] ) &&
                ( !IsHighBitSet( _pb[_ccRoot] ) ) )
        {
            _acRoot[_ccRoot] = _pb[_ccRoot];
            _ccRoot++;
        }
        _acRoot[ _ccRoot ] = 0;
    }

    // Returns TRUE if form iEntry of this set sorts after pcKey (strcmp
    // order).  If the set has no form iEntry the comparison is made against
    // an empty string.
    BOOL IsGreaterThan( unsigned iEntry, char const * pcKey )
    {
        char ac[ cbMaxStem ];
        ac[0] = 0;      // GetNth leaves the buffer untouched on failure
        unsigned o = stemInvalid;
        GetNth( ac, iEntry, o );
        return ( strcmp( ac, pcKey ) > 0 );
    }

    // Enumerates the forms of the set.  Start with iBmk == stemInvalid to
    // get the root, then keep calling with the returned bookmark until
    // FALSE is returned.
    BOOL GetForm( char * pcOut, unsigned & iBmk )
    {
        return GetNth( pcOut, 0, iBmk );
    }

    // Decodes one form into pcOut.
    //
    // pcOut  - receives the null-terminated form
    // iEntry - index of the form to decode; only used when iBmk is
    //          stemInvalid
    // iBmk   - in:  stemInvalid, or the offset (from the start of the set)
    //               of the suffix code to decode
    //          out: the offset just past the bytes consumed
    //
    // Returns FALSE (pcOut untouched) when the terminator of the set is
    // reached, TRUE otherwise.
    BOOL GetNth( char * pcOut, unsigned iEntry, unsigned & iBmk )
    {
        BYTE * pbNext = _pb + _ccRoot;
        if ( stemInvalid == iBmk )
        {
            if ( 0 == iEntry )
            {
                strcpy( pcOut, _acRoot );
                iBmk = (unsigned) (ULONG_PTR) ( pbNext - _pb );
                return TRUE;
            }

            // Skip over the suffix codes for forms 1 .. iEntry-1
            unsigned iCurrentEntry = 1;
            while ( iCurrentEntry != iEntry )
            {
                if ( 0 == *pbNext )
                    break;
                if ( *pbNext >= SFX_SINGLE_BYTE )
                    pbNext++;
                else if ( *pbNext == SFX_SWAP_PENULTIMATE )
                    pbNext += 2;
                else if ( *pbNext == SFX_PREFIX )
                {
                    pbNext++;   // code
                    pbNext++;   // prefix length
                    unsigned cb = *pbNext++;
                    pbNext += cb;
                }
                else if ( *pbNext == SFX_NOPREFIX )
                {
                    pbNext++;   // code
                    unsigned cb = *pbNext++;
                    pbNext += cb;
                }
                iCurrentEntry++;
            }
        }
        else
        {
            pbNext = iBmk + _pb;
        }

        // End of the set: step over the terminator and report no form
        if ( 0 == *pbNext )
        {
            pbNext++;
            iBmk = (unsigned) (ULONG_PTR) ( pbNext - _pb );
            return FALSE;
        }

        // Every form starts out as the root and is then edited in place
        strcpy( pcOut, _acRoot );
        BYTE bSuffix = *pbNext++;
        switch ( bSuffix )
        {
            case SFX_ADD_S:              ReplaceTail( pcOut, 0, "s" );    break;
            case SFX_ADD_ED:             ReplaceTail( pcOut, 0, "ed" );   break;
            case SFX_ADD_ING:            ReplaceTail( pcOut, 0, "ing" );  break;
            case SFX_ADD_ES:             ReplaceTail( pcOut, 0, "es" );   break;
            case SFX_ADD_D:              ReplaceTail( pcOut, 0, "d" );    break;
            case SFX_ADD_ER:             ReplaceTail( pcOut, 0, "er" );   break;
            case SFX_ADD_N:              ReplaceTail( pcOut, 0, "n" );    break;
            case SFX_ADD_EST:            ReplaceTail( pcOut, 0, "est" );  break;
            case SFX_ADD_SES:            ReplaceTail( pcOut, 0, "ses" );  break;
            case SFX_ADD_E:              ReplaceTail( pcOut, 0, "e" );    break;
            case SFX_ADD_LED:            ReplaceTail( pcOut, 0, "led" );  break;
            case SFX_ADD_NER:            ReplaceTail( pcOut, 0, "ner" );  break;
            case SFX_ADD_DED:            ReplaceTail( pcOut, 0, "ded" );  break;
            case SFX_ADD_LING:           ReplaceTail( pcOut, 0, "ling" ); break;
            case SFX_ADD_DING:           ReplaceTail( pcOut, 0, "ding" ); break;
            case SFX_ADD_R:              ReplaceTail( pcOut, 0, "r" );    break;
            case SFX_ADD_ST:             ReplaceTail( pcOut, 0, "st" );   break;
            case SFX_ADD_KED:            ReplaceTail( pcOut, 0, "ked" );  break;
            case SFX_ADD_KING:           ReplaceTail( pcOut, 0, "king" ); break;
            case SFX_ADD_TA:             ReplaceTail( pcOut, 0, "ta" );   break;
            case SFX_ADD_T:              ReplaceTail( pcOut, 0, "t" );    break;
            case SFX_ADD_EN:             ReplaceTail( pcOut, 0, "en" );   break;

            case SFX_DROP_E_ADD_ING:     ReplaceTail( pcOut, 1, "ing" );  break;
            case SFX_DROP_Y_ADD_IER:     ReplaceTail( pcOut, 1, "ier" );  break;
            case SFX_DROP_Y_ADD_IES:     ReplaceTail( pcOut, 1, "ies" );  break;
            case SFX_DROP_Y_ADD_IED:     ReplaceTail( pcOut, 1, "ied" );  break;
            case SFX_DROP_Y_ADD_IEST:    ReplaceTail( pcOut, 1, "iest" ); break;
            case SFX_DROP_LAST_ADD_T:    ReplaceTail( pcOut, 1, "t" );    break;

            // Drop the trailing "o" and add "i"; same shape as
            // SFX_DROP_US_ADD_I with a one-character tail.
            case SFX_DROP_O_ADD_I:       ReplaceTail( pcOut, 1, "i" );    break;

            case SFX_DROP_EY_ADD_IER:    ReplaceTail( pcOut, 2, "ier" );  break;
            case SFX_DROP_EY_ADD_IEST:   ReplaceTail( pcOut, 2, "iest" ); break;
            case SFX_DROP_US_ADD_I:      ReplaceTail( pcOut, 2, "i" );    break;
            case SFX_DROP_UM_ADD_A:      ReplaceTail( pcOut, 2, "a" );    break;

            // Drop the trailing "on" and add "a"; same shape as
            // SFX_DROP_UM_ADD_A.
            case SFX_DROP_ON_ADD_A:      ReplaceTail( pcOut, 2, "a" );    break;

            case SFX_REPEATLAST_ADD_ER:  DoubleLastAndAppend( pcOut, "er" );  break;
            case SFX_REPEATLAST_ADD_EST: DoubleLastAndAppend( pcOut, "est" ); break;
            case SFX_REPEATLAST_ADD_ED:  DoubleLastAndAppend( pcOut, "ed" );  break;
            case SFX_REPEATLAST_ADD_ING: DoubleLastAndAppend( pcOut, "ing" ); break;

            case SFX_SWAP_PENULTIMATE:
                // The data byte replaces the second-to-last root character.
                // It is consumed even if the root is too short to patch.
                if ( _ccRoot >= 2 )
                    pcOut[ _ccRoot - 2 ] = *pbNext;
                pbNext++;
                break;

            case SFX_PREFIX:
            {
                // Keep the first ccPrefix characters, then append the text
                unsigned ccPrefix = *pbNext++;
                unsigned ccSuffix = *pbNext++;
                CopyMemory( pcOut + ccPrefix, pbNext, ccSuffix );
                pcOut[ ccPrefix + ccSuffix ] = 0;
                pbNext += ccSuffix;
                break;
            }

            case SFX_NOPREFIX:
            {
                // The text replaces the root entirely
                unsigned cc = *pbNext++;
                CopyMemory( pcOut, pbNext, cc );
                pcOut[ cc ] = 0;
                pbNext += cc;
                break;
            }

            default:
                // Unassigned code: the form is left as the bare root
                break;
        }

        iBmk = (unsigned) (ULONG_PTR) ( pbNext - _pb );
        return TRUE;
    }

private:
    // Overwrites the last cDrop characters of the root held in pcOut with
    // pcSuffix.  cDrop is clamped to the root length so a short root can
    // never make the write start before pcOut.
    void ReplaceTail( char * pcOut, unsigned cDrop, char const * pcSuffix )
    {
        unsigned ccKeep = ( cDrop < _ccRoot ) ? ( _ccRoot - cDrop ) : 0;
        strcpy( pcOut + ccKeep, pcSuffix );
    }

    // Doubles the last character of the root held in pcOut, then appends
    // pcSuffix.  An empty root has no character to double, so only the
    // suffix is written.
    void DoubleLastAndAppend( char * pcOut, char const * pcSuffix )
    {
        unsigned cc = _ccRoot;
        if ( 0 != cc )
        {
            pcOut[ cc ] = pcOut[ cc - 1 ];
            cc++;
        }
        strcpy( pcOut + cc, pcSuffix );
    }

    BYTE * _pb;                     // start of this stem set
    unsigned _ccRoot;               // length of the root, in chars
    char _acRoot[ cbMaxStem ];      // null-terminated copy of the root
};
class CStem
{
public:
CStem( unsigned cDirectory,
CDirectoryEntry * pDirectory,
unsigned cbKeys,
BYTE * pbKeys ) :
_pbKeys( pbKeys ),
_cbKeys( cbKeys ),
_cDirectory( cDirectory ),
_pDirectory( pDirectory )
{
}
~CStem()
{
delete [] _pDirectory;
delete [] _pbKeys;
}
BOOL FindStemSet( char const * pcKey,
unsigned & iBmk,
unsigned & iStemSet )
{
unsigned oNext = stemInvalid;
char ac[ cbMaxStem ];
if ( stemInvalid == iBmk )
{
// Find a match using the directory
iBmk = FirstList( pcKey );
// Backup until the first match is found
while ( iBmk > 0 )
{
unsigned o = _pDirectory[ iBmk-1 ].Offset();
unsigned e = _pDirectory[ iBmk-1 ].Entry();
CStemSet set( _pbKeys, o );
set.GetNth( ac, e, oNext );
if ( !strcmp( ac, pcKey ) )
iBmk--;
else
break;
}
}
else
{
iBmk++;
}
// Return the list if an entry is found that maches
unsigned o = _pDirectory[ iBmk ].Offset();
unsigned e = _pDirectory[ iBmk ].Entry();
CStemSet set( _pbKeys, o );
oNext = stemInvalid;
set.GetNth( ac, e, oNext );
if ( !strcmp( ac, pcKey ) )
{
iStemSet = o;
return TRUE;
}
return FALSE;
}
unsigned SkipList( unsigned oList )
{
CStemSet set( _pbKeys, oList );
char ac[ cbMaxStem ];
unsigned i = 1;
unsigned o = stemInvalid;
while ( set.GetNth( ac, i, o ) )
i++;
o += oList;
if ( o >= _cbKeys )
return stemInvalid;
return o;
}
unsigned GetNth( char * pcOut, unsigned oList, unsigned iEntry )
{
CStemSet set( _pbKeys, oList );
unsigned o = stemInvalid;
return set.GetNth( pcOut, iEntry, o );
}
BYTE * GetStemSetRoot() { return _pbKeys; }
unsigned GetDirectoryCount() { return _cDirectory; }
CDirectoryEntry * GetDirectory() { return _pDirectory; }
private:
unsigned FirstList( char const * pcKey )
{
unsigned iHi = _cDirectory - 1;
unsigned iLo = 0;
unsigned cKeys = _cDirectory;
// do a binary search looking for the key
do
{
unsigned cHalf = cKeys / 2;
if ( 0 != cHalf )
{
unsigned cTmp = cHalf - 1 + ( cKeys & 1 );
unsigned iMid = iLo + cTmp;
CStemSet set( _pbKeys, _pDirectory[ iMid ].Offset() );
if ( set.IsGreaterThan( _pDirectory[ iMid ].Entry(),
pcKey ) )
{
iHi = iMid - 1;
cKeys = cTmp;
}
else
{
CStemSet set( _pbKeys, _pDirectory[ iMid + 1 ].Offset() );
if ( ! set.IsGreaterThan( _pDirectory[ iMid + 1 ].Entry(),
pcKey ) )
{
iLo = iMid + 1;
cKeys = cHalf;
}
else
return iMid;
}
}
else if ( cKeys > 1 )
{
CStemSet set( _pbKeys, _pDirectory[ iLo + 1 ].Offset() );
if ( set.IsGreaterThan( _pDirectory[ iLo + 1 ].Entry(),
pcKey ) )
return iLo;
return iLo + 1;
}
else
return iLo;
}
while ( TRUE );
return 0;
}
unsigned _cDirectory;
unsigned _cbKeys;
CDirectoryEntry * _pDirectory;
BYTE * _pbKeys;
};
//+-------------------------------------------------------------------------
//
// Function: MakeStemObject
//
// Purpose:  Loads "en-stem.dat" from the directory that holds the module
//           hMod and wraps its contents in a CStem.
//
//           File layout, as read here: an unsigned directory count,
//           followed by that many unsigned-sized directory entries,
//           followed by the key data, which runs to the end of the file.
//
// Arguments: hMod - module whose directory contains the data file
//
// Returns:  A new CStem that owns the loaded buffers (the caller deletes
//           it), or 0 if the path cannot be built, the file cannot be
//           opened or fully read, the directory count does not fit the
//           file size, or an allocation returns 0.
//
//--------------------------------------------------------------------------
__inline CStem * MakeStemObject( HMODULE hMod )
{
    const WCHAR awcFile[] = L"en-stem.dat";
    const unsigned cwcFile = sizeof( awcFile ) / sizeof( awcFile[0] ); // with null

    // Get the path of the data file.  A return value equal to the buffer
    // size means the path was truncated and cannot be trusted.
    WCHAR awcPath[ MAX_PATH ];
    DWORD cwcCopied = GetModuleFileName( hMod,
                                         awcPath,
                                         ArraySize( awcPath ) );
    if ( ( 0 == cwcCopied ) || ( cwcCopied >= ArraySize( awcPath ) ) )
        return 0;

    WCHAR *pwcSlash = wcsrchr( awcPath, '\\' );
    if ( 0 == pwcSlash )
        return 0;

    // Make sure the file name fits after the directory
    unsigned cwcDir = (unsigned) ( pwcSlash + 1 - awcPath );
    if ( cwcDir + cwcFile > ArraySize( awcPath ) )
        return 0;

    wcscpy( pwcSlash + 1, awcFile );

    // Open the data file
    FILE *fp = _wfopen( awcPath, L"rb" );
    if ( 0 == fp )
        return 0;

    CDirectoryEntry * aDir = 0;
    BYTE * pbKeys = 0;
    CStem * pStem = 0;

    // Single-pass block: any failure breaks out to the common cleanup
    do
    {
        // Check how big it is
        if ( 0 != fseek( fp, 0, SEEK_END ) )
            break;

        long cbFile = ftell( fp );
        if ( cbFile < (long) sizeof( unsigned ) )
            break;

        if ( 0 != fseek( fp, 0, SEEK_SET ) )
            break;

        unsigned cb = (unsigned) cbFile;

        // Read the directory count and reject one the file cannot hold;
        // this also keeps the key size below from wrapping around
        unsigned cDirectory = 0;
        if ( 1 != fread( &cDirectory, sizeof( unsigned ), 1, fp ) )
            break;

        if ( cDirectory > ( cb - sizeof( unsigned ) ) / sizeof( unsigned ) )
            break;

        // Read the directory
        aDir = new CDirectoryEntry[ cDirectory ];
        if ( 0 == aDir )
            break;

        if ( cDirectory != fread( aDir, sizeof( unsigned ), cDirectory, fp ) )
            break;

        // Read the key data
        unsigned cbKeys = (unsigned)
                          ( cb - ( sizeof( unsigned ) * ( cDirectory + 1 ) ) );
        pbKeys = new BYTE[ cbKeys ];
        if ( 0 == pbKeys )
            break;

        if ( ( 0 != cbKeys ) && ( 1 != fread( pbKeys, cbKeys, 1, fp ) ) )
            break;

        // Make the stemmer object with the buffers
        pStem = new CStem( cDirectory, aDir, cbKeys, pbKeys );
    }
    while ( FALSE );

    fclose( fp );

    // On success the CStem owns the buffers; otherwise free them here
    if ( 0 == pStem )
    {
        delete [] aDir;
        delete [] pbKeys;
    }

    return pStem;
} //MakeStemObject