|
|
//+-------------------------------------------------------------------------
//
// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF
// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO
// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A
// PARTICULAR PURPOSE.
//
// Copyright 2001 - 2001 Microsoft Corporation. All Rights Reserved.
//
// File: stem.hxx
//
// PURPOSE: Classes to read a binary file of stem expansions
//
// PLATFORM: Windows 2000 and later
//
//--------------------------------------------------------------------------
#pragma once
//
// Suffix opcodes found in the stem data.  Each encoded form that follows a
// root word begins with one of these bytes; CStemSet::GetNth is the decoder.
// The names describe the intended edit to the root:
//
//   SFX_ADD_x             append "x" to the root
//   SFX_DROP_x_ADD_y      remove the trailing "x" from the root, append "y"
//   SFX_DROP_LAST_ADD_T   remove the last character of the root, append "t"
//   SFX_REPEATLAST_ADD_x  repeat the root's last character, then append "x"
//

#define SFX_ADD_D 0xff
#define SFX_ADD_DED 0xfe
#define SFX_ADD_DING 0xfd
#define SFX_ADD_E 0xfc
#define SFX_ADD_ED 0xfb
#define SFX_ADD_EN 0xfa
#define SFX_ADD_ER 0xf9
#define SFX_ADD_ES 0xf8
#define SFX_ADD_EST 0xf7
#define SFX_ADD_ING 0xf6
#define SFX_ADD_KED 0xf5
#define SFX_ADD_KING 0xf4
#define SFX_ADD_LED 0xf3
#define SFX_ADD_LING 0xf2
#define SFX_ADD_N 0xf1
#define SFX_ADD_NER 0xf0
#define SFX_ADD_R 0xef
#define SFX_ADD_S 0xee
#define SFX_ADD_SES 0xed
#define SFX_ADD_ST 0xec
#define SFX_ADD_T 0xeb
#define SFX_ADD_TA 0xea
#define SFX_DROP_EY_ADD_IER 0xe9
#define SFX_DROP_EY_ADD_IEST 0xe8
#define SFX_DROP_E_ADD_ING 0xe7
#define SFX_DROP_LAST_ADD_T 0xe6
#define SFX_DROP_ON_ADD_A 0xe5
#define SFX_DROP_O_ADD_I 0xe4
#define SFX_DROP_UM_ADD_A 0xe3
#define SFX_DROP_US_ADD_I 0xe2
#define SFX_DROP_Y_ADD_IED 0xe1
#define SFX_DROP_Y_ADD_IER 0xe0
#define SFX_DROP_Y_ADD_IES 0xdf
#define SFX_DROP_Y_ADD_IEST 0xde
#define SFX_REPEATLAST_ADD_ED 0xdd
#define SFX_REPEATLAST_ADD_ER 0xdc
#define SFX_REPEATLAST_ADD_EST 0xdb
#define SFX_REPEATLAST_ADD_ING 0xda

// Opcodes >= this value occupy exactly 1 byte (no operand bytes follow)
#define SFX_SINGLE_BYTE 0xd0 // values >= this take 1 byte

// Followed by 1 byte: the character that replaces the root's
// second-to-last character
#define SFX_SWAP_PENULTIMATE 0xcf

// Followed by a prefix-length byte, a suffix-length byte, and then that many
// suffix bytes.  The form is the leading part of the root plus the suffix.
#define SFX_PREFIX 0xce

// Followed by a length byte and then that many bytes, which are the whole
// form; nothing is taken from the root.
#define SFX_NOPREFIX 0xcd
//+-------------------------------------------------------------------------
//
//  Function:   IsHighBitSet
//
//  Synopsis:   Tests bit 7 (0x80) of a byte
//
//  Arguments:  [b] -- the byte to test
//
//  Returns:    Non-zero if the bit is set, 0 otherwise
//
//--------------------------------------------------------------------------

__inline BOOL IsHighBitSet( BYTE b )
{
    return ( ( b & 0x80 ) != 0 );
}

// Size, in chars, of the buffers that hold a root or an expanded form

const unsigned cbMaxStem = 50;

// Sentinel for "no bookmark yet" / "no such offset"

const unsigned stemInvalid = 0xffffffff;
//+-------------------------------------------------------------------------
//
//  Class:      CDirectoryEntry
//
//  Purpose:    One directory slot: a 24-bit offset in the low bits and an
//              8-bit entry index in the high bits, packed into a single
//              unsigned.  The class must stay exactly one unsigned wide
//              with no other state because arrays of it are filled
//              directly from the data file.
//
//--------------------------------------------------------------------------

class CDirectoryEntry
{
public:

    // Stores the pair.  The offset is masked to its 24 bits so that an
    // out-of-range offset can no longer spill into the entry index.

    void Set( unsigned off, unsigned entry )
    {
        value = ( ( entry << cOffsetBits ) | ( off & maskOffset ) );
    }

    // The low 24 bits

    unsigned Offset() const
    {
        return ( value & maskOffset );
    }

    // The high 8 bits

    unsigned Entry() const
    {
        return ( ( value >> cOffsetBits ) & maskEntry );
    }

private:

    // Enumerators rather than data members, so the object layout is unchanged

    enum
    {
        cOffsetBits = 24,
        maskOffset  = 0x00ffffff,
        maskEntry   = 0xff
    };

    unsigned value;
};
//+-------------------------------------------------------------------------
//
//  Class:      CStemSet
//
//  Purpose:    Decodes one stem set from the key data.  A stem set is a
//              root word stored as plain bytes (ended by a 0 byte or by
//              the first byte with its high bit set), followed by a list
//              of SFX_* encoded forms, followed by a 0 byte.
//
//  Notes:      Buffers passed to GetForm / GetNth must have room for at
//              least cbMaxStem chars.  Roots and decoded forms are
//              truncated to cbMaxStem - 1 characters instead of writing
//              past that size.
//
//              The class is not told where the key data ends, so reads of
//              the encoded data still rely on the data being well formed.
//
//--------------------------------------------------------------------------

class CStemSet
{
public:

    // [pb] is the start of the key data, [oSet] the offset of this set

    CStemSet( BYTE * pb, unsigned oSet ) :
        _pb( pb + oSet ),
        _ccRoot( 0 ),
        _cbRoot( 0 )
    {
        // Walk the whole root so the first form is located correctly, but
        // only keep as much of it as fits in _acRoot.

        while ( ( 0 != _pb[ _cbRoot ] ) && ( !IsHighBitSet( _pb[ _cbRoot ] ) ) )
        {
            if ( _ccRoot < cbMaxStem - 1 )
                _acRoot[ _ccRoot++ ] = _pb[ _cbRoot ];

            _cbRoot++;
        }

        _acRoot[ _ccRoot ] = 0;
    }

    // Returns TRUE if form [iEntry] of this set sorts after [pcKey].
    // Returns FALSE if the set has no such form.

    BOOL IsGreaterThan( unsigned iEntry, char const * pcKey ) const
    {
        char ac[ cbMaxStem ];
        unsigned o = stemInvalid;

        // ac is only written when GetNth succeeds

        if ( !GetNth( ac, iEntry, o ) )
            return FALSE;

        return ( strcmp( ac, pcKey ) > 0 );
    }

    // Sequential enumeration.  Start with iBmk == stemInvalid to get the
    // root; each later call with the updated bookmark returns the next form.
    // Returns FALSE once the end of the set is reached.

    BOOL GetForm( char * pcOut, unsigned & iBmk ) const
    {
        return GetNth( pcOut, 0, iBmk );
    }

    //
    // Retrieves one form of the set.
    //
    //   pcOut  -- receives the 0-terminated form; at least cbMaxStem chars
    //   iEntry -- form to fetch when iBmk is stemInvalid: 0 is the root,
    //             1 the first encoded form, and so on.  Ignored when iBmk
    //             is a bookmark.
    //   iBmk   -- stemInvalid, or a bookmark from an earlier call on the
    //             same set.  On return it is the offset (relative to the
    //             set) just past what was consumed.
    //
    // Returns TRUE if a form was written to pcOut, FALSE if the terminator
    // of the set was reached (pcOut is not written in that case).
    //

    BOOL GetNth( char * pcOut, unsigned iEntry, unsigned & iBmk ) const
    {
        BYTE * pbNext = _pb + _cbRoot;

        if ( stemInvalid == iBmk )
        {
            if ( 0 == iEntry )
            {
                strcpy( pcOut, _acRoot );
                iBmk = (unsigned) (ULONG_PTR) ( pbNext - _pb );
                return TRUE;
            }

            // Step over the forms that precede the requested one

            for ( unsigned iCurrentEntry = 1;
                  ( iCurrentEntry != iEntry ) && ( 0 != *pbNext );
                  iCurrentEntry++ )
            {
                pbNext = SkipForm( pbNext );
            }
        }
        else
        {
            pbNext = iBmk + _pb;
        }

        // A 0 byte terminates the set

        if ( 0 == *pbNext )
        {
            pbNext++;
            iBmk = (unsigned) (ULONG_PTR) ( pbNext - _pb );
            return FALSE;
        }

        // Every form starts out as the root; the opcode then edits it

        strcpy( pcOut, _acRoot );

        BYTE bSuffix = *pbNext++;

        switch ( bSuffix )
        {
            case SFX_ADD_S:    AppendToRoot( pcOut, "s" );    break;
            case SFX_ADD_ED:   AppendToRoot( pcOut, "ed" );   break;
            case SFX_ADD_ING:  AppendToRoot( pcOut, "ing" );  break;
            case SFX_ADD_ES:   AppendToRoot( pcOut, "es" );   break;
            case SFX_ADD_D:    AppendToRoot( pcOut, "d" );    break;
            case SFX_ADD_ER:   AppendToRoot( pcOut, "er" );   break;
            case SFX_ADD_N:    AppendToRoot( pcOut, "n" );    break;
            case SFX_ADD_EST:  AppendToRoot( pcOut, "est" );  break;
            case SFX_ADD_SES:  AppendToRoot( pcOut, "ses" );  break;
            case SFX_ADD_E:    AppendToRoot( pcOut, "e" );    break;
            case SFX_ADD_LED:  AppendToRoot( pcOut, "led" );  break;
            case SFX_ADD_NER:  AppendToRoot( pcOut, "ner" );  break;
            case SFX_ADD_DED:  AppendToRoot( pcOut, "ded" );  break;
            case SFX_ADD_LING: AppendToRoot( pcOut, "ling" ); break;
            case SFX_ADD_DING: AppendToRoot( pcOut, "ding" ); break;
            case SFX_ADD_R:    AppendToRoot( pcOut, "r" );    break;
            case SFX_ADD_ST:   AppendToRoot( pcOut, "st" );   break;
            case SFX_ADD_KED:  AppendToRoot( pcOut, "ked" );  break;
            case SFX_ADD_KING: AppendToRoot( pcOut, "king" ); break;
            case SFX_ADD_TA:   AppendToRoot( pcOut, "ta" );   break;
            case SFX_ADD_T:    AppendToRoot( pcOut, "t" );    break;
            case SFX_ADD_EN:   AppendToRoot( pcOut, "en" );   break;

            case SFX_DROP_E_ADD_ING:   ReplaceTail( pcOut, 1, "ing" );  break;
            case SFX_DROP_Y_ADD_IER:   ReplaceTail( pcOut, 1, "ier" );  break;
            case SFX_DROP_Y_ADD_IES:   ReplaceTail( pcOut, 1, "ies" );  break;
            case SFX_DROP_Y_ADD_IED:   ReplaceTail( pcOut, 1, "ied" );  break;
            case SFX_DROP_Y_ADD_IEST:  ReplaceTail( pcOut, 1, "iest" ); break;
            case SFX_DROP_LAST_ADD_T:  ReplaceTail( pcOut, 1, "t" );    break;
            case SFX_DROP_EY_ADD_IER:  ReplaceTail( pcOut, 2, "ier" );  break;
            case SFX_DROP_EY_ADD_IEST: ReplaceTail( pcOut, 2, "iest" ); break;
            case SFX_DROP_US_ADD_I:    ReplaceTail( pcOut, 2, "i" );    break;
            case SFX_DROP_UM_ADD_A:    ReplaceTail( pcOut, 2, "a" );    break;

            // These two used to be empty cases that returned the root
            // unchanged.  They now perform the edit their names describe,
            // in line with SFX_DROP_US_ADD_I and SFX_DROP_UM_ADD_A.

            case SFX_DROP_O_ADD_I:     ReplaceTail( pcOut, 1, "i" );    break;
            case SFX_DROP_ON_ADD_A:    ReplaceTail( pcOut, 2, "a" );    break;

            case SFX_REPEATLAST_ADD_ER:  DoubleLastAndAppend( pcOut, "er" );  break;
            case SFX_REPEATLAST_ADD_EST: DoubleLastAndAppend( pcOut, "est" ); break;
            case SFX_REPEATLAST_ADD_ED:  DoubleLastAndAppend( pcOut, "ed" );  break;
            case SFX_REPEATLAST_ADD_ING: DoubleLastAndAppend( pcOut, "ing" ); break;

            case SFX_SWAP_PENULTIMATE:
            {
                // The operand replaces the second-to-last root character.
                // The operand is consumed even if the root is too short.

                if ( _ccRoot >= 2 )
                    pcOut[ _ccRoot - 2 ] = *pbNext;

                pbNext++;
                break;
            }

            case SFX_PREFIX:
            {
                unsigned ccPrefix = *pbNext++;
                unsigned ccSuffix = *pbNext++;

                // Only the root itself is valid in pcOut

                if ( ccPrefix > _ccRoot )
                    ccPrefix = _ccRoot;

                // Copy no more than fits, but always consume all operands

                unsigned ccCopy = ccSuffix;

                if ( ccCopy > cbMaxStem - 1 - ccPrefix )
                    ccCopy = cbMaxStem - 1 - ccPrefix;

                CopyMemory( pcOut + ccPrefix, pbNext, ccCopy );
                pcOut[ ccPrefix + ccCopy ] = 0;
                pbNext += ccSuffix;
                break;
            }

            case SFX_NOPREFIX:
            {
                unsigned cc = *pbNext++;

                // Copy no more than fits, but always consume all operands

                unsigned ccCopy = ( cc < cbMaxStem - 1 ) ? cc : ( cbMaxStem - 1 );

                CopyMemory( pcOut, pbNext, ccCopy );
                pcOut[ ccCopy ] = 0;
                pbNext += cc;
                break;
            }
        }

        iBmk = (unsigned) (ULONG_PTR) ( pbNext - _pb );
        return TRUE;
    }

private:

    // Returns the address just past the encoded form that starts at [pb].
    // An unrecognized opcode is treated as 1 byte long, which is what the
    // decoder in GetNth consumes for it.

    static BYTE * SkipForm( BYTE * pb )
    {
        BYTE bCode = *pb++;

        if ( bCode >= SFX_SINGLE_BYTE )
            return pb;

        if ( SFX_SWAP_PENULTIMATE == bCode )
            return pb + 1;                  // replacement character

        if ( SFX_PREFIX == bCode )
        {
            pb++;                           // prefix length
            unsigned cb = *pb++;            // suffix length
            return pb + cb;
        }

        if ( SFX_NOPREFIX == bCode )
        {
            unsigned cb = *pb++;            // literal length
            return pb + cb;
        }

        return pb;
    }

    // Writes [pcTail] into [pcOut] starting at index [ich] and terminates
    // the result, never touching an index beyond cbMaxStem - 1.
    // Requires ich <= cbMaxStem - 1.

    static void SetTail( char * pcOut, unsigned ich, char const * pcTail )
    {
        while ( ( ich < cbMaxStem - 1 ) && ( 0 != *pcTail ) )
            pcOut[ ich++ ] = *pcTail++;

        pcOut[ ich ] = 0;
    }

    // root + suffix

    void AppendToRoot( char * pcOut, char const * pcSuffix ) const
    {
        SetTail( pcOut, _ccRoot, pcSuffix );
    }

    // Root minus its last [cDrop] characters, plus suffix.  A root shorter
    // than [cDrop] is left unchanged.

    void ReplaceTail( char * pcOut, unsigned cDrop, char const * pcSuffix ) const
    {
        if ( _ccRoot >= cDrop )
            SetTail( pcOut, _ccRoot - cDrop, pcSuffix );
    }

    // Root with its last character repeated, plus suffix.  Nothing is
    // repeated if the root is empty or there is no room for the character.

    void DoubleLastAndAppend( char * pcOut, char const * pcSuffix ) const
    {
        unsigned ich = _ccRoot;

        if ( ( ich > 0 ) && ( ich < cbMaxStem - 1 ) )
        {
            pcOut[ ich ] = pcOut[ ich - 1 ];
            ich++;
        }

        SetTail( pcOut, ich, pcSuffix );
    }

    BYTE *   _pb;                   // start of this stem set
    unsigned _ccRoot;               // characters held in _acRoot
    unsigned _cbRoot;               // bytes the root occupies in the data
    char     _acRoot[ cbMaxStem ];  // 0-terminated root
};
class CStem { public: CStem( unsigned cDirectory, CDirectoryEntry * pDirectory, unsigned cbKeys, BYTE * pbKeys ) : _pbKeys( pbKeys ), _cbKeys( cbKeys ), _cDirectory( cDirectory ), _pDirectory( pDirectory ) { }
~CStem() { delete [] _pDirectory; delete [] _pbKeys; }
BOOL FindStemSet( char const * pcKey, unsigned & iBmk, unsigned & iStemSet ) { unsigned oNext = stemInvalid; char ac[ cbMaxStem ];
if ( stemInvalid == iBmk ) { // Find a match using the directory
iBmk = FirstList( pcKey );
// Backup until the first match is found
while ( iBmk > 0 ) { unsigned o = _pDirectory[ iBmk-1 ].Offset(); unsigned e = _pDirectory[ iBmk-1 ].Entry(); CStemSet set( _pbKeys, o ); set.GetNth( ac, e, oNext );
if ( !strcmp( ac, pcKey ) ) iBmk--; else break; } } else { iBmk++; }
// Return the list if an entry is found that maches
unsigned o = _pDirectory[ iBmk ].Offset(); unsigned e = _pDirectory[ iBmk ].Entry();
CStemSet set( _pbKeys, o ); oNext = stemInvalid; set.GetNth( ac, e, oNext );
if ( !strcmp( ac, pcKey ) ) { iStemSet = o; return TRUE; }
return FALSE; }
unsigned SkipList( unsigned oList ) { CStemSet set( _pbKeys, oList ); char ac[ cbMaxStem ];
unsigned i = 1; unsigned o = stemInvalid;
while ( set.GetNth( ac, i, o ) ) i++;
o += oList;
if ( o >= _cbKeys ) return stemInvalid;
return o; }
unsigned GetNth( char * pcOut, unsigned oList, unsigned iEntry ) { CStemSet set( _pbKeys, oList ); unsigned o = stemInvalid; return set.GetNth( pcOut, iEntry, o ); }
BYTE * GetStemSetRoot() { return _pbKeys; } unsigned GetDirectoryCount() { return _cDirectory; } CDirectoryEntry * GetDirectory() { return _pDirectory; }
private:
unsigned FirstList( char const * pcKey ) { unsigned iHi = _cDirectory - 1; unsigned iLo = 0; unsigned cKeys = _cDirectory;
// do a binary search looking for the key
do { unsigned cHalf = cKeys / 2; if ( 0 != cHalf ) { unsigned cTmp = cHalf - 1 + ( cKeys & 1 ); unsigned iMid = iLo + cTmp;
CStemSet set( _pbKeys, _pDirectory[ iMid ].Offset() );
if ( set.IsGreaterThan( _pDirectory[ iMid ].Entry(), pcKey ) ) { iHi = iMid - 1; cKeys = cTmp; } else { CStemSet set( _pbKeys, _pDirectory[ iMid + 1 ].Offset() );
if ( ! set.IsGreaterThan( _pDirectory[ iMid + 1 ].Entry(), pcKey ) ) { iLo = iMid + 1; cKeys = cHalf; } else return iMid; } } else if ( cKeys > 1 ) { CStemSet set( _pbKeys, _pDirectory[ iLo + 1 ].Offset() );
if ( set.IsGreaterThan( _pDirectory[ iLo + 1 ].Entry(), pcKey ) ) return iLo;
return iLo + 1; } else return iLo; } while ( TRUE ); return 0; }
unsigned _cDirectory; unsigned _cbKeys; CDirectoryEntry * _pDirectory; BYTE * _pbKeys; };
//+-------------------------------------------------------------------------
//
//  Function:   MakeStemObject
//
//  Synopsis:   Loads "en-stem.dat" from the directory that contains the
//              given module and wraps its contents in a CStem.
//
//              File layout, as read here: an unsigned directory count,
//              that many unsigned-sized directory entries, then the key
//              data, which runs to the end of the file.
//
//  Arguments:  [hMod] -- module whose directory holds the data file
//
//  Returns:    A new CStem, to be released by the caller with delete, or
//              0 if the path cannot be built or the file cannot be opened,
//              is too short for its own directory count, has an empty
//              directory or no key data, or cannot be read completely.
//
//--------------------------------------------------------------------------

__inline CStem * MakeStemObject( HMODULE hMod )
{
    // Get the path of the data file

    WCHAR awcPath[ MAX_PATH ];
    DWORD cwcCopied = GetModuleFileName( hMod, awcPath, ArraySize( awcPath ) );

    // 0 is failure; a count that fills the buffer means the path was
    // truncated and may not even be 0-terminated.

    if ( ( 0 == cwcCopied ) || ( cwcCopied >= ArraySize( awcPath ) ) )
        return 0;

    WCHAR *pwcSlash = wcsrchr( awcPath, '\\' );

    if ( 0 == pwcSlash )
        return 0;

    // Make sure the file name (and its terminator) fits after the slash

    const WCHAR awcDataFile[] = L"en-stem.dat";
    unsigned cwcDir = (unsigned) ( pwcSlash + 1 - awcPath );

    if ( cwcDir + ArraySize( awcDataFile ) > ArraySize( awcPath ) )
        return 0;

    wcscpy( pwcSlash + 1, awcDataFile );

    // Open the data file

    FILE *fp = _wfopen( awcPath, L"rb" );

    if ( 0 == fp )
        return 0;

    CStem * pStem = 0;
    CDirectoryEntry * aDir = 0;
    BYTE * pbKeys = 0;

    // Check how big it is

    long cbFile = -1;

    if ( 0 == fseek( fp, 0, SEEK_END ) )
        cbFile = ftell( fp );

    // Read the directory count

    unsigned cDirectory = 0;

    if ( ( cbFile > 0 ) &&
         ( 0 == fseek( fp, 0, SEEK_SET ) ) &&
         ( 1 == fread( &cDirectory, sizeof( unsigned ), 1, fp ) ) )
    {
        unsigned cb = (unsigned) cbFile;
        unsigned cSlots = (unsigned) ( cb / sizeof( unsigned ) );

        // The count comes from the file, so check it against the file size
        // before using it: the count itself plus cDirectory entries must
        // fit.  This also keeps the cbKeys subtraction from wrapping.

        if ( ( 0 != cDirectory ) && ( cDirectory < cSlots ) )
        {
            unsigned cbKeys = (unsigned) ( cb - ( sizeof( unsigned ) * ( cDirectory + 1 ) ) );

            if ( 0 != cbKeys )
            {
                aDir = new CDirectoryEntry[ cDirectory ];
                pbKeys = new BYTE[ cbKeys ];

                // Read the directory and the key data; both must be complete

                if ( ( 0 != aDir ) &&
                     ( 0 != pbKeys ) &&
                     ( cDirectory == fread( aDir, sizeof( unsigned ), cDirectory, fp ) ) &&
                     ( 1 == fread( pbKeys, cbKeys, 1, fp ) ) )
                {
                    // Make the stemmer object with the buffers

                    pStem = new CStem( cDirectory, aDir, cbKeys, pbKeys );
                }
            }
        }
    }

    fclose( fp );

    // Without a CStem to own them, the buffers are released here

    if ( 0 == pStem )
    {
        delete [] aDir;
        delete [] pbKeys;
    }

    return pStem;
} //MakeStemObject
|