You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
565 lines
16 KiB
565 lines
16 KiB
//+-------------------------------------------------------------------------
|
|
//
|
|
// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF
|
|
// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO
|
|
// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A
|
|
// PARTICULAR PURPOSE.
|
|
//
|
|
// Copyright 2001 - 2001 Microsoft Corporation. All Rights Reserved.
|
|
//
|
|
// File: stem.hxx
|
|
//
|
|
// PURPOSE: Classes to read a binary file of stem expansions
|
|
//
|
|
// PLATFORM: Windows 2000 and later
|
|
//
|
|
//--------------------------------------------------------------------------
|
|
|
|
#pragma once
|
|
|
|
//
// Suffix codes stored after the root of each stem set.  Each code tells
// the reader how to build one expanded form from the root.
//

// Append the named letters to the root.

#define SFX_ADD_D                 0xff
#define SFX_ADD_DED               0xfe
#define SFX_ADD_DING              0xfd
#define SFX_ADD_E                 0xfc
#define SFX_ADD_ED                0xfb
#define SFX_ADD_EN                0xfa
#define SFX_ADD_ER                0xf9
#define SFX_ADD_ES                0xf8
#define SFX_ADD_EST               0xf7
#define SFX_ADD_ING               0xf6
#define SFX_ADD_KED               0xf5
#define SFX_ADD_KING              0xf4
#define SFX_ADD_LED               0xf3
#define SFX_ADD_LING              0xf2
#define SFX_ADD_N                 0xf1
#define SFX_ADD_NER               0xf0
#define SFX_ADD_R                 0xef
#define SFX_ADD_S                 0xee
#define SFX_ADD_SES               0xed
#define SFX_ADD_ST                0xec
#define SFX_ADD_T                 0xeb
#define SFX_ADD_TA                0xea

// Drop the named trailing letters of the root, then append.

#define SFX_DROP_EY_ADD_IER       0xe9
#define SFX_DROP_EY_ADD_IEST      0xe8
#define SFX_DROP_E_ADD_ING        0xe7
#define SFX_DROP_LAST_ADD_T       0xe6
#define SFX_DROP_ON_ADD_A         0xe5
#define SFX_DROP_O_ADD_I          0xe4
#define SFX_DROP_UM_ADD_A         0xe3
#define SFX_DROP_US_ADD_I         0xe2
#define SFX_DROP_Y_ADD_IED        0xe1
#define SFX_DROP_Y_ADD_IER        0xe0
#define SFX_DROP_Y_ADD_IES        0xdf
#define SFX_DROP_Y_ADD_IEST       0xde

// Repeat the last letter of the root, then append.

#define SFX_REPEATLAST_ADD_ED     0xdd
#define SFX_REPEATLAST_ADD_ER     0xdc
#define SFX_REPEATLAST_ADD_EST    0xdb
#define SFX_REPEATLAST_ADD_ING    0xda

// Codes greater than or equal to this value occupy exactly one byte.

#define SFX_SINGLE_BYTE           0xd0

// Codes below SFX_SINGLE_BYTE are followed by additional data bytes.

#define SFX_SWAP_PENULTIMATE      0xcf   // 1 data byte: new second-to-last char
#define SFX_PREFIX                0xce   // prefix length, byte count, bytes
#define SFX_NOPREFIX              0xcd   // byte count, bytes
|
|
|
|
// Returns TRUE when the most significant bit (0x80) of the byte is set.
__inline BOOL IsHighBitSet( BYTE b )
{
    return ( ( b & 0x80 ) != 0 );
}
|
|
// Size, in bytes, of the fixed buffers that hold a root or an expanded form.
const unsigned cbMaxStem = 50;

// Sentinel meaning "no bookmark" / "no offset".
const unsigned stemInvalid = 0xffffffffu;
|
|
|
|
//
// One directory slot packed into a single unsigned value: the low 24 bits
// hold a byte offset and the high 8 bits hold an entry index.
//
class CDirectoryEntry
{
public:
    // Packs an offset and an entry index into the slot.  Each field is
    // truncated to its own width, so an oversized offset can no longer
    // spill into (and corrupt) the entry index.
    void Set( unsigned off, unsigned entry )
    {
        value = ( ( ( entry & entryMask ) << offsetBits ) |
                  ( off & offsetMask ) );
    }

    // Returns the 24-bit offset.
    unsigned Offset() const
    {
        return ( value & offsetMask );
    }

    // Returns the 8-bit entry index.
    unsigned Entry() const
    {
        return ( ( value >> offsetBits ) & entryMask );
    }

private:
    enum
    {
        offsetBits = 24,
        offsetMask = 0x00ffffff,
        entryMask  = 0xff
    };

    unsigned value;
};
|
|
|
|
class CStemSet
|
|
{
|
|
public:
|
|
CStemSet( BYTE * pb, unsigned oSet ) : _pb( pb + oSet )
|
|
{
|
|
_ccRoot = 0;
|
|
|
|
while ( ( 0 != _pb[_ccRoot] ) &&
|
|
( !IsHighBitSet( _pb[_ccRoot] ) ) )
|
|
{
|
|
_acRoot[_ccRoot] = _pb[_ccRoot];
|
|
_ccRoot++;
|
|
}
|
|
|
|
_acRoot[ _ccRoot ] = 0;
|
|
}
|
|
|
|
BOOL IsGreaterThan( unsigned iEntry, char const * pcKey )
|
|
{
|
|
char ac[ cbMaxStem ];
|
|
unsigned o = stemInvalid;
|
|
GetNth( ac, iEntry, o );
|
|
return ( strcmp( ac, pcKey ) > 0 );
|
|
}
|
|
|
|
BOOL GetForm( char * pcOut, unsigned & iBmk )
|
|
{
|
|
return GetNth( pcOut, 0, iBmk );
|
|
}
|
|
|
|
BOOL GetNth( char * pcOut, unsigned iEntry, unsigned & iBmk )
|
|
{
|
|
BYTE * pbNext = _pb + _ccRoot;
|
|
|
|
if ( stemInvalid == iBmk )
|
|
{
|
|
if ( 0 == iEntry )
|
|
{
|
|
strcpy( pcOut, _acRoot );
|
|
iBmk = (unsigned) (ULONG_PTR) ( pbNext - _pb );
|
|
return TRUE;
|
|
}
|
|
|
|
unsigned iCurrentEntry = 1;
|
|
|
|
while ( iCurrentEntry != iEntry )
|
|
{
|
|
if ( 0 == *pbNext )
|
|
break;
|
|
|
|
if ( *pbNext >= SFX_SINGLE_BYTE )
|
|
pbNext++;
|
|
else if ( *pbNext == SFX_SWAP_PENULTIMATE )
|
|
pbNext += 2;
|
|
else if ( *pbNext == SFX_PREFIX )
|
|
{
|
|
pbNext++;
|
|
pbNext++; // prefix
|
|
unsigned cb = *pbNext++;
|
|
pbNext += cb;
|
|
}
|
|
else if ( *pbNext == SFX_NOPREFIX )
|
|
{
|
|
pbNext++;
|
|
unsigned cb = *pbNext++;
|
|
pbNext += cb;
|
|
}
|
|
|
|
iCurrentEntry++;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
pbNext = iBmk + _pb;
|
|
}
|
|
|
|
if ( 0 == *pbNext )
|
|
{
|
|
pbNext++;
|
|
iBmk = (unsigned) (ULONG_PTR) ( pbNext - _pb );
|
|
return FALSE;
|
|
}
|
|
|
|
strcpy( pcOut, _acRoot );
|
|
BYTE bSuffix = *pbNext++;
|
|
|
|
switch ( bSuffix )
|
|
{
|
|
case SFX_ADD_S:
|
|
strcpy( pcOut + _ccRoot, "s" );
|
|
break;
|
|
case SFX_ADD_ED:
|
|
strcpy( pcOut + _ccRoot, "ed" );
|
|
break;
|
|
case SFX_ADD_ING:
|
|
strcpy( pcOut + _ccRoot, "ing" );
|
|
break;
|
|
case SFX_ADD_ES:
|
|
strcpy( pcOut + _ccRoot, "es" );
|
|
break;
|
|
case SFX_ADD_D:
|
|
strcpy( pcOut + _ccRoot, "d" );
|
|
break;
|
|
case SFX_ADD_ER:
|
|
strcpy( pcOut + _ccRoot, "er" );
|
|
break;
|
|
case SFX_ADD_N:
|
|
strcpy( pcOut + _ccRoot, "n" );
|
|
break;
|
|
case SFX_ADD_EST:
|
|
strcpy( pcOut + _ccRoot, "est" );
|
|
break;
|
|
case SFX_DROP_E_ADD_ING:
|
|
strcpy( pcOut + _ccRoot - 1, "ing" );
|
|
break;
|
|
case SFX_DROP_Y_ADD_IER:
|
|
strcpy( pcOut + _ccRoot - 1, "ier" );
|
|
break;
|
|
case SFX_DROP_Y_ADD_IES:
|
|
strcpy( pcOut + _ccRoot - 1, "ies" );
|
|
break;
|
|
case SFX_DROP_Y_ADD_IED:
|
|
strcpy( pcOut + _ccRoot - 1, "ied" );
|
|
break;
|
|
case SFX_ADD_SES:
|
|
strcpy( pcOut + _ccRoot, "ses" );
|
|
break;
|
|
case SFX_ADD_E:
|
|
strcpy( pcOut + _ccRoot, "e" );
|
|
break;
|
|
case SFX_ADD_LED:
|
|
strcpy( pcOut + _ccRoot, "led" );
|
|
break;
|
|
case SFX_ADD_NER:
|
|
strcpy( pcOut + _ccRoot, "ner" );
|
|
break;
|
|
case SFX_ADD_DED:
|
|
strcpy( pcOut + _ccRoot, "ded" );
|
|
break;
|
|
case SFX_DROP_Y_ADD_IEST:
|
|
strcpy( pcOut + _ccRoot - 1, "iest" );
|
|
break;
|
|
case SFX_ADD_LING:
|
|
strcpy( pcOut + _ccRoot, "ling" );
|
|
break;
|
|
case SFX_ADD_DING:
|
|
strcpy( pcOut + _ccRoot, "ding" );
|
|
break;
|
|
case SFX_REPEATLAST_ADD_ER:
|
|
pcOut[ _ccRoot ] = pcOut[ _ccRoot - 1 ];
|
|
strcpy( pcOut + _ccRoot + 1, "er" );
|
|
break;
|
|
case SFX_REPEATLAST_ADD_EST:
|
|
pcOut[ _ccRoot ] = pcOut[ _ccRoot - 1 ];
|
|
strcpy( pcOut + _ccRoot + 1, "est" );
|
|
break;
|
|
case SFX_REPEATLAST_ADD_ED:
|
|
pcOut[ _ccRoot ] = pcOut[ _ccRoot - 1 ];
|
|
strcpy( pcOut + _ccRoot + 1, "ed" );
|
|
break;
|
|
case SFX_REPEATLAST_ADD_ING:
|
|
pcOut[ _ccRoot ] = pcOut[ _ccRoot - 1 ];
|
|
strcpy( pcOut + _ccRoot + 1, "ing" );
|
|
break;
|
|
case SFX_ADD_R:
|
|
strcpy( pcOut + _ccRoot, "r" );
|
|
break;
|
|
case SFX_ADD_ST:
|
|
strcpy( pcOut + _ccRoot, "st" );
|
|
break;
|
|
case SFX_DROP_O_ADD_I:
|
|
break;
|
|
case SFX_ADD_KED:
|
|
strcpy( pcOut + _ccRoot, "ked" );
|
|
break;
|
|
case SFX_ADD_KING:
|
|
strcpy( pcOut + _ccRoot, "king" );
|
|
break;
|
|
case SFX_ADD_TA:
|
|
strcpy( pcOut + _ccRoot, "ta" );
|
|
break;
|
|
case SFX_DROP_EY_ADD_IER:
|
|
strcpy( pcOut + _ccRoot - 2, "ier" );
|
|
break;
|
|
case SFX_DROP_EY_ADD_IEST:
|
|
strcpy( pcOut + _ccRoot - 2, "iest" );
|
|
break;
|
|
case SFX_DROP_US_ADD_I:
|
|
strcpy( pcOut + _ccRoot - 2, "i" );
|
|
break;
|
|
case SFX_DROP_UM_ADD_A:
|
|
strcpy( pcOut + _ccRoot - 2, "a" );
|
|
break;
|
|
case SFX_ADD_T:
|
|
strcpy( pcOut + _ccRoot, "t" );
|
|
break;
|
|
case SFX_ADD_EN:
|
|
strcpy( pcOut + _ccRoot, "en" );
|
|
break;
|
|
case SFX_DROP_ON_ADD_A:
|
|
break;
|
|
case SFX_DROP_LAST_ADD_T:
|
|
strcpy( pcOut + _ccRoot - 1, "t" );
|
|
break;
|
|
case SFX_SWAP_PENULTIMATE:
|
|
pcOut[ _ccRoot - 2 ] = *pbNext;
|
|
pbNext++;
|
|
break;
|
|
case SFX_PREFIX:
|
|
{
|
|
unsigned ccPrefix = *pbNext++;
|
|
unsigned ccSuffix = *pbNext++;
|
|
CopyMemory( pcOut + ccPrefix, pbNext, ccSuffix );
|
|
pcOut[ ccPrefix + ccSuffix ] = 0;
|
|
pbNext += ccSuffix;
|
|
break;
|
|
}
|
|
case SFX_NOPREFIX:
|
|
{
|
|
unsigned cc = *pbNext++;
|
|
|
|
for ( unsigned i = 0; i < cc; i++ )
|
|
pcOut[i] = *pbNext++;
|
|
|
|
pcOut[i] = 0;
|
|
break;
|
|
}
|
|
}
|
|
|
|
iBmk = (unsigned) (ULONG_PTR) ( pbNext - _pb );
|
|
return TRUE;
|
|
}
|
|
|
|
private:
|
|
BYTE * _pb;
|
|
unsigned _ccRoot;
|
|
char _acRoot[ cbMaxStem ];
|
|
};
|
|
|
|
class CStem
|
|
{
|
|
public:
|
|
CStem( unsigned cDirectory,
|
|
CDirectoryEntry * pDirectory,
|
|
unsigned cbKeys,
|
|
BYTE * pbKeys ) :
|
|
_pbKeys( pbKeys ),
|
|
_cbKeys( cbKeys ),
|
|
_cDirectory( cDirectory ),
|
|
_pDirectory( pDirectory )
|
|
{
|
|
}
|
|
|
|
~CStem()
|
|
{
|
|
delete [] _pDirectory;
|
|
delete [] _pbKeys;
|
|
}
|
|
|
|
BOOL FindStemSet( char const * pcKey,
|
|
unsigned & iBmk,
|
|
unsigned & iStemSet )
|
|
{
|
|
unsigned oNext = stemInvalid;
|
|
char ac[ cbMaxStem ];
|
|
|
|
if ( stemInvalid == iBmk )
|
|
{
|
|
// Find a match using the directory
|
|
|
|
iBmk = FirstList( pcKey );
|
|
|
|
// Backup until the first match is found
|
|
|
|
while ( iBmk > 0 )
|
|
{
|
|
unsigned o = _pDirectory[ iBmk-1 ].Offset();
|
|
unsigned e = _pDirectory[ iBmk-1 ].Entry();
|
|
CStemSet set( _pbKeys, o );
|
|
set.GetNth( ac, e, oNext );
|
|
|
|
if ( !strcmp( ac, pcKey ) )
|
|
iBmk--;
|
|
else
|
|
break;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
iBmk++;
|
|
}
|
|
|
|
// Return the list if an entry is found that maches
|
|
|
|
unsigned o = _pDirectory[ iBmk ].Offset();
|
|
unsigned e = _pDirectory[ iBmk ].Entry();
|
|
|
|
CStemSet set( _pbKeys, o );
|
|
oNext = stemInvalid;
|
|
set.GetNth( ac, e, oNext );
|
|
|
|
if ( !strcmp( ac, pcKey ) )
|
|
{
|
|
iStemSet = o;
|
|
return TRUE;
|
|
}
|
|
|
|
return FALSE;
|
|
}
|
|
|
|
unsigned SkipList( unsigned oList )
|
|
{
|
|
CStemSet set( _pbKeys, oList );
|
|
char ac[ cbMaxStem ];
|
|
|
|
unsigned i = 1;
|
|
unsigned o = stemInvalid;
|
|
|
|
while ( set.GetNth( ac, i, o ) )
|
|
i++;
|
|
|
|
o += oList;
|
|
|
|
if ( o >= _cbKeys )
|
|
return stemInvalid;
|
|
|
|
return o;
|
|
}
|
|
|
|
unsigned GetNth( char * pcOut, unsigned oList, unsigned iEntry )
|
|
{
|
|
CStemSet set( _pbKeys, oList );
|
|
unsigned o = stemInvalid;
|
|
return set.GetNth( pcOut, iEntry, o );
|
|
}
|
|
|
|
BYTE * GetStemSetRoot() { return _pbKeys; }
|
|
unsigned GetDirectoryCount() { return _cDirectory; }
|
|
CDirectoryEntry * GetDirectory() { return _pDirectory; }
|
|
|
|
private:
|
|
|
|
unsigned FirstList( char const * pcKey )
|
|
{
|
|
unsigned iHi = _cDirectory - 1;
|
|
unsigned iLo = 0;
|
|
unsigned cKeys = _cDirectory;
|
|
|
|
// do a binary search looking for the key
|
|
|
|
do
|
|
{
|
|
unsigned cHalf = cKeys / 2;
|
|
|
|
if ( 0 != cHalf )
|
|
{
|
|
unsigned cTmp = cHalf - 1 + ( cKeys & 1 );
|
|
unsigned iMid = iLo + cTmp;
|
|
|
|
CStemSet set( _pbKeys, _pDirectory[ iMid ].Offset() );
|
|
|
|
if ( set.IsGreaterThan( _pDirectory[ iMid ].Entry(),
|
|
pcKey ) )
|
|
{
|
|
iHi = iMid - 1;
|
|
cKeys = cTmp;
|
|
}
|
|
else
|
|
{
|
|
CStemSet set( _pbKeys, _pDirectory[ iMid + 1 ].Offset() );
|
|
|
|
if ( ! set.IsGreaterThan( _pDirectory[ iMid + 1 ].Entry(),
|
|
pcKey ) )
|
|
{
|
|
iLo = iMid + 1;
|
|
cKeys = cHalf;
|
|
}
|
|
else
|
|
return iMid;
|
|
}
|
|
}
|
|
else if ( cKeys > 1 )
|
|
{
|
|
CStemSet set( _pbKeys, _pDirectory[ iLo + 1 ].Offset() );
|
|
|
|
if ( set.IsGreaterThan( _pDirectory[ iLo + 1 ].Entry(),
|
|
pcKey ) )
|
|
return iLo;
|
|
|
|
return iLo + 1;
|
|
}
|
|
else
|
|
return iLo;
|
|
}
|
|
while ( TRUE );
|
|
|
|
return 0;
|
|
}
|
|
|
|
unsigned _cDirectory;
|
|
unsigned _cbKeys;
|
|
CDirectoryEntry * _pDirectory;
|
|
BYTE * _pbKeys;
|
|
};
|
|
|
|
//
// Loads "en-stem.dat" from the directory that contains module hMod and
// builds a CStem over its contents.
//
// File layout, as read here: an unsigned directory count, that many
// CDirectoryEntry slots, then the key data up to the end of the file.
//
// Returns the new CStem, which owns both buffers, or 0 if the path can't
// be built, the file can't be opened or read in full, or its directory
// count is larger than the file can hold.
//
__inline CStem * MakeStemObject( HMODULE hMod )
{
    static const WCHAR awcDataFile[] = L"en-stem.dat";
    const size_t cwcDataFile = sizeof( awcDataFile ) / sizeof( awcDataFile[ 0 ] );

    // Get the path of the data file

    WCHAR awcPath[ MAX_PATH ];
    DWORD cwcCopied = GetModuleFileName( hMod,
                                         awcPath,
                                         ArraySize( awcPath ) );

    // 0 is failure; a count that fills the buffer means the path was
    // truncated (and may not be null-terminated)

    if ( ( 0 == cwcCopied ) || ( cwcCopied >= ArraySize( awcPath ) ) )
        return 0;

    WCHAR * pwcSlash = wcsrchr( awcPath, L'\\' );
    if ( 0 == pwcSlash )
        return 0;

    // The file name, with its terminator, has to fit after the slash

    size_t cwcDir = (size_t) ( pwcSlash + 1 - awcPath );
    if ( cwcDir + cwcDataFile > ArraySize( awcPath ) )
        return 0;

    wcscpy( pwcSlash + 1, awcDataFile );

    // Open the data file

    FILE * fp = _wfopen( awcPath, L"rb" );
    if ( 0 == fp )
        return 0;

    CDirectoryEntry * aDir = 0;
    BYTE * pbKeys = 0;
    CStem * pStem = 0;

    do
    {
        // Check how big it is

        if ( 0 != fseek( fp, 0, SEEK_END ) )
            break;

        long cbFile = ftell( fp );
        if ( cbFile < (long) sizeof( unsigned ) )
            break;

        if ( 0 != fseek( fp, 0, SEEK_SET ) )
            break;

        unsigned cb = (unsigned) cbFile;

        // Read the directory count and the directory

        unsigned cDirectory = 0;
        if ( 1 != fread( &cDirectory, sizeof( unsigned ), 1, fp ) )
            break;

        // A directory that can't fit in the file means the file is
        // damaged; it would also make the key size below wrap around

        if ( cDirectory > ( cb - sizeof( unsigned ) ) / sizeof( CDirectoryEntry ) )
            break;

        aDir = new CDirectoryEntry[ cDirectory ];
        if ( 0 == aDir )
            break;

        if ( cDirectory != fread( aDir, sizeof( CDirectoryEntry ), cDirectory, fp ) )
            break;

        // Read the key data: everything after the count and the directory

        unsigned cbKeys = cb - (unsigned) sizeof( unsigned )
                             - cDirectory * (unsigned) sizeof( CDirectoryEntry );

        pbKeys = new BYTE[ cbKeys ];
        if ( 0 == pbKeys )
            break;

        if ( ( 0 != cbKeys ) && ( 1 != fread( pbKeys, cbKeys, 1, fp ) ) )
            break;

        // Make the stemmer object with the buffers

        pStem = new CStem( cDirectory, aDir, cbKeys, pbKeys );
    }
    while ( FALSE );

    fclose( fp );

    // On any failure the buffers still belong to this function

    if ( 0 == pStem )
    {
        delete [] aDir;
        delete [] pbKeys;
    }

    return pStem;
} //MakeStemObject
|
|
|