You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
979 lines
29 KiB
979 lines
29 KiB
//+-------------------------------------------------------------------------
|
|
//
|
|
// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF
|
|
// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO
|
|
// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A
|
|
// PARTICULAR PURPOSE.
|
|
//
|
|
// Copyright 1998 - 2001 Microsoft Corporation. All Rights Reserved.
|
|
//
|
|
// PROGRAM: lrsample.cxx
|
|
//
|
|
// PURPOSE: Sample wordbreaker and stemmer.
|
|
//
|
|
// PLATFORM: Windows 2000 and later
|
|
//
|
|
//--------------------------------------------------------------------------
|
|
|
|
#include <stdio.h>
|
|
#include <wchar.h>
|
|
|
|
#include <windows.h>
|
|
#include <objidl.h>
|
|
#include <indexsrv.h>
|
|
#include <cierror.h>
|
|
#include <filterr.h>
|
|
|
|
#include "lrsample.hxx"
|
|
#include "filtreg.hxx"
|
|
#include "langreg.hxx"
|
|
|
|
//#define LEXICON_STEMMER
|
|
//#define PORTER_STEMMER
|
|
#define SIMPLE_LIST_STEMMER
|
|
|
|
// The CLSID for the wordbreaker
|
|
|
|
CLSID CLSID_SampleWordBreaker = /* d225281a-7ca9-4a46-ae7d-c63a9d4815d4 */
|
|
{
|
|
0xd225281a, 0x7ca9, 0x4a46,
|
|
{0xae, 0x7d, 0xc6, 0x3a, 0x9d, 0x48, 0x15, 0xd4}
|
|
};
|
|
|
|
// The CLSID of the stemmer
|
|
|
|
CLSID CLSID_SampleStemmer = /* 0a275611-aa4d-4b39-8290-4baf77703f55 */
|
|
{
|
|
0x0a275611, 0xaa4d, 0x4b39,
|
|
{0x82, 0x90, 0x4b, 0xaf, 0x77, 0x70, 0x3f, 0x55}
|
|
};
|
|
|
|
// Global module refcount
|
|
|
|
long g_cInstances = 0;
|
|
HMODULE g_hModule = 0;
|
|
|
|
#ifdef PORTER_STEMMER
|
|
|
|
#include "porter.hxx"
|
|
|
|
#endif //PORTER_STEMMER
|
|
|
|
#ifdef LEXICON_STEMMER

#include "stem.hxx"

// Stem object shared by the whole module.  It starts out null and is
// created by DllGetClassObject the first time a class factory is requested.

CStem * g_pStem = 0;

#endif //LEXICON_STEMMER
|
|
|
|
#ifdef SIMPLE_LIST_STEMMER
|
|
|
|
// This is just a simple hard-coded list of words and stem forms.
|
|
|
|
struct SStemForm
|
|
{
|
|
USHORT iList; // first index into aStems
|
|
USHORT iForm; // second index into aStems
|
|
};
|
|
|
|
//
// Lookup index over aStems.  Each entry points at one word,
// aStems[ iList ][ iForm ], and the entries are ordered by that word in
// wcscmp order because GenerateWordForms searches this array with
// bsearch() / StemCompare().  Every entry must therefore address a non-null
// slot of aStems, and the trailing comment must match the addressed word.
//

const SStemForm aStemForms[] =
{
    {  0, 0 }, // abide
    {  0, 2 }, // abided
    {  0, 4 }, // abides
    {  0, 3 }, // abiding
    {  0, 1 }, // abode
    {  1, 0 }, // bat
    {  2, 0 }, // batch
    {  2, 2 }, // batched
    {  2, 1 }, // batches
    {  2, 3 }, // batching
    {  1, 1 }, // bats
    {  1, 2 }, // batted
    {  1, 3 }, // batting
    {  3, 0 }, // bear
    {  3, 1 }, // bears
    {  4, 1 }, // began
    {  4, 0 }, // begin
    {  4, 3 }, // beginning
    {  4, 4 }, // begins
    {  4, 2 }, // begun
    {  3, 2 }, // bore
    {  3, 4 }, // born
    {  3, 3 }, // borne
    {  5, 0 }, // dance
    {  5, 1 }, // danced
    {  5, 2 }, // dances
    {  5, 3 }, // dancing
    {  6, 0 }, // heave
    {  6, 1 }, // heaved
    {  6, 3 }, // heaves
    {  6, 4 }, // heaving
    {  7, 0 }, // hero
    {  7, 1 }, // heroes
    {  6, 2 }, // hove
    {  8, 0 }, // keep
    {  8, 3 }, // keeping  (was { 8, 4 }: an empty slot, i.e. a null string)
    {  8, 1 }, // keeps
    {  8, 2 }, // kept
    {  9, 0 }, // misspell
    {  9, 1 }, // misspelled
    {  9, 3 }, // misspelling
    {  9, 4 }, // misspells
    {  9, 2 }, // misspelt
    { 10, 0 }, // plead
    { 10, 1 }, // pleaded
    { 10, 3 }, // pleading
    { 10, 4 }, // pleads
    { 10, 2 }, // pled     (was { 10, 0 }: a second "plead", out of order)
    { 11, 2 }, // ran
    { 11, 0 }, // run
    { 11, 3 }, // running
    { 11, 1 }, // runs
    { 12, 1 }, // swam
    { 12, 0 }, // swim
    { 12, 3 }, // swimming
    { 12, 4 }, // swims
    { 12, 2 }, // swum
    { 13, 2 }, // underlain
    { 13, 1 }, // underlay
    { 13, 0 }, // underlie
    { 13, 4 }, // underlies
    { 13, 3 }, // underlying
};
|
|
|
|
// Dimensions of the stem tables: the column capacity of each aStems row and
// the number of entries in the aStemForms index.

const ULONG cMaxStemForms = 8;
const ULONG cStemForms    = ArraySize( aStemForms );
|
|
|
|
//
// One row per family of related words.  Rows hold fewer than cMaxStemForms
// words; the unused trailing slots are left to zero-initialize, and
// GenerateWordForms treats the first null slot as the end of the row.
// The row numbers in the comments are the iList values used by aStemForms.
//

const WCHAR * aStems[][ cMaxStemForms ] =
{
    /*  0 */ { L"abide", L"abode", L"abided", L"abiding", L"abides" },
    /*  1 */ { L"bat", L"bats", L"batted", L"batting" },
    /*  2 */ { L"batch", L"batches", L"batched", L"batching" },
    /*  3 */ { L"bear", L"bears", L"bore", L"borne", L"born" },
    /*  4 */ { L"begin", L"began", L"begun", L"beginning", L"begins" },
    /*  5 */ { L"dance", L"danced", L"dances", L"dancing" },
    /*  6 */ { L"heave", L"heaved", L"hove", L"heaves", L"heaving" },
    /*  7 */ { L"hero", L"heroes" },
    /*  8 */ { L"keep", L"keeps", L"kept", L"keeping" },
    /*  9 */ { L"misspell", L"misspelled", L"misspelt", L"misspelling",
               L"misspells" },
    /* 10 */ { L"plead", L"pleaded", L"pled", L"pleading", L"pleads" },
    /* 11 */ { L"run", L"runs", L"ran", L"running" },
    /* 12 */ { L"swim", L"swam", L"swum", L"swimming", L"swims" },
    /* 13 */ { L"underlie", L"underlay", L"underlain", L"underlying",
               L"underlies" },
};
|
|
|
|
//+-------------------------------------------------------------------------
//
//  Function:   StemCompare
//
//  Synopsis:   bsearch() comparison callback for the aStemForms index
//
//  Arguments:  [p1] -- The search key: a 0-terminated wide string
//              [p2] -- The aStemForms entry being compared against
//
//  Returns:    The wcscmp() result of the key against the word the entry
//              refers to in aStems
//
//--------------------------------------------------------------------------

int __cdecl StemCompare( const void *p1, const void *p2 )
{
    WCHAR const *     pwcKey   = (WCHAR const *) p1;
    SStemForm const * pEntry   = (SStemForm const *) p2;
    WCHAR const *     pwcEntry = aStems[ pEntry->iList ][ pEntry->iForm ];

    return wcscmp( pwcKey, pwcEntry );
} //StemCompare
|
|
|
|
#endif // SIMPLE_LIST_STEMMER
|
|
|
|
//+-------------------------------------------------------------------------
|
|
//
|
|
// Function: IsWordChar
|
|
//
|
|
// Synopsis: Find whether the i'th character in the buffer _pwcChunk
|
|
// is a word character (rather than word break)
|
|
//
|
|
// Arguments: [pwcChunk] -- Characters whose type information is checked
|
|
// [i] -- Index of character to check
|
|
// [pInfo1] -- Type 1 information
|
|
// [pInfo3] -- Type 3 information
|
|
//
|
|
// Returns: TRUE if the character is a word character
|
|
// FALSE if it's a word breaking character
|
|
//
|
|
//--------------------------------------------------------------------------
|
|
|
|
__forceinline BOOL IsWordChar(
|
|
WCHAR const * pwcChunk,
|
|
int i,
|
|
WORD const * pInfo1,
|
|
WORD const * pInfo3 )
|
|
{
|
|
// Any alphabetic, digit, or non-spacing character is part of a word
|
|
|
|
if ( ( 0 != ( pInfo1[i] & ( C1_ALPHA | C1_DIGIT ) ) ) ||
|
|
( 0 != ( pInfo3[i] & C3_NONSPACING ) ) )
|
|
return TRUE;
|
|
|
|
WCHAR c = pwcChunk[i];
|
|
|
|
// Underscore is part of a word
|
|
|
|
if ( L'_' == c )
|
|
return TRUE;
|
|
|
|
//
|
|
// A non-breaking space followed by a non-spacing character should not
|
|
// be a word breaker.
|
|
//
|
|
|
|
if ( 0xa0 == c ) // non breaking space
|
|
{
|
|
// followed by a non-spacing character (looking ahead is okay)
|
|
|
|
if ( 0 != ( pInfo3[i+1] & C3_NONSPACING ) )
|
|
return TRUE;
|
|
}
|
|
|
|
return FALSE;
|
|
} //IsWordChar
|
|
|
|
//+---------------------------------------------------------------------------
|
|
//
|
|
// Function: ScanChunk
|
|
//
|
|
// Synopsis: For each character find its type information flags
|
|
//
|
|
// Arguments: [pwcChunk] -- Characters whose type information is retrieved
|
|
// [cwc] -- Number of characters to scan
|
|
// [pInfo1] -- Type 1 information is written here
|
|
// [pInfo3] -- Type 3 information is written here
|
|
//
|
|
// Returns: S_OK if successful or an error code
|
|
//
|
|
//----------------------------------------------------------------------------
|
|
|
|
HRESULT ScanChunk(
|
|
WCHAR const * pwcChunk,
|
|
ULONG cwc,
|
|
WORD * pInfo1,
|
|
WORD * pInfo3 )
|
|
{
|
|
if ( !GetStringTypeW( CT_CTYPE1, // POSIX character typing
|
|
pwcChunk, // Source
|
|
cwc, // Size of source
|
|
pInfo1 ) ) // Character info 1
|
|
return HRESULT_FROM_WIN32( GetLastError() );
|
|
|
|
if ( !GetStringTypeW( CT_CTYPE3, // Additional POSIX
|
|
pwcChunk, // Source
|
|
cwc, // Size of source
|
|
pInfo3 ) ) // Character info 3
|
|
return HRESULT_FROM_WIN32( GetLastError() );
|
|
|
|
return S_OK;
|
|
} //ScanChunk
|
|
|
|
//+---------------------------------------------------------------------------
|
|
//
|
|
// Member: CSampleWordBreaker::Tokenize
|
|
//
|
|
// Synopsis: Break a block of text into individual words
|
|
//
|
|
// Arguments: [pTextSource] -- Source of characters to work on
|
|
// [cwc] -- Number of characters to process
|
|
// [pWordSink] -- Where to send the words found
|
|
// [cwcProcessed] -- Returns the # of characters tokenized
|
|
//
|
|
// Returns: S_OK if successful or an error code
|
|
//
|
|
//----------------------------------------------------------------------------
|
|
|
|
HRESULT CSampleWordBreaker::Tokenize(
|
|
TEXT_SOURCE * pTextSource,
|
|
ULONG cwc,
|
|
IWordSink * pWordSink,
|
|
ULONG & cwcProcessed )
|
|
{
|
|
// Leave space for one (unused) lookahead
|
|
|
|
WORD aInfo1[ CSampleWordBreaker::cwcAtATime + 1 ];
|
|
WORD aInfo3[ CSampleWordBreaker::cwcAtATime + 1 ];
|
|
|
|
// Initialize this so we can go 1 beyond in IsWordChar()
|
|
|
|
aInfo3 [ CSampleWordBreaker::cwcAtATime ] = C3_NONSPACING;
|
|
|
|
// Get a pointer to the text we'll be working on
|
|
|
|
const WCHAR * pwcChunk = &pTextSource->awcBuffer[ pTextSource->iCur ];
|
|
|
|
HRESULT hr = ScanChunk( pwcChunk, cwc, aInfo1, aInfo3 );
|
|
if ( FAILED( hr ) )
|
|
return hr;
|
|
|
|
BOOL fWordHasZWS = FALSE; // Does the current word have a 0-width-space?
|
|
ULONG cwcZWS; // Length of word minus embedded 0-width-spaces
|
|
|
|
//
|
|
// iBeginWord is the offset into aInfoX of the beginning character of
|
|
// a word. iCur is the first unprocessed character.
|
|
// They are indexes into the current block (_pwcChunk).
|
|
//
|
|
|
|
ULONG iBeginWord = 0;
|
|
ULONG iCur = 0;
|
|
|
|
// Temp buffer for a word having zero-width space
|
|
|
|
WCHAR awcBufZWS[ CSampleWordBreaker::cwcAtATime ];
|
|
|
|
// Send words from the current block to word sink
|
|
|
|
while ( iCur < cwc )
|
|
{
|
|
// Skip whitespace, punctuation, etc.
|
|
|
|
for (; iCur < cwc; iCur++)
|
|
if ( IsWordChar( pwcChunk, iCur, aInfo1, aInfo3 ) )
|
|
break;
|
|
|
|
// iCur points to a word char or is equal to cwc
|
|
|
|
iBeginWord = iCur;
|
|
if ( iCur < cwc )
|
|
iCur++; // we knew it pointed at word character
|
|
|
|
//
|
|
// Find word break. Filter may output Unicode zero-width-space, which
|
|
// should be ignored by the wordbreaker.
|
|
//
|
|
|
|
fWordHasZWS = FALSE;
|
|
for ( ; iCur < cwc; iCur++ )
|
|
{
|
|
if ( !IsWordChar( pwcChunk, iCur, aInfo1, aInfo3 ) )
|
|
{
|
|
if ( ZERO_WIDTH_SPACE == pwcChunk[iCur] )
|
|
fWordHasZWS = TRUE;
|
|
else
|
|
break;
|
|
}
|
|
}
|
|
|
|
if ( fWordHasZWS )
|
|
{
|
|
// Copy word into awcBufZWS after stripping zero-width-spaces
|
|
|
|
cwcZWS = 0;
|
|
for ( ULONG i = iBeginWord; i < iCur; i++ )
|
|
{
|
|
if ( ZERO_WIDTH_SPACE != pwcChunk[i] )
|
|
awcBufZWS[cwcZWS++] = pwcChunk[i];
|
|
}
|
|
}
|
|
|
|
// iCur points to a non-word char or is equal to cwc
|
|
|
|
if ( iCur < cwc )
|
|
{
|
|
// store the word and its source position
|
|
|
|
if ( fWordHasZWS )
|
|
hr = pWordSink->PutWord( cwcZWS,
|
|
awcBufZWS, // stripped word
|
|
iCur - iBeginWord,
|
|
pTextSource->iCur + iBeginWord );
|
|
else
|
|
hr = pWordSink->PutWord( iCur - iBeginWord,
|
|
pwcChunk + iBeginWord, // the word
|
|
iCur - iBeginWord,
|
|
pTextSource->iCur + iBeginWord );
|
|
|
|
if ( FAILED( hr ) )
|
|
return hr;
|
|
|
|
iCur++; // we knew it pointed at non-word char
|
|
iBeginWord = iCur; // in case we exit the loop now
|
|
}
|
|
} // next word
|
|
|
|
// End of words in chunk.
|
|
// iCur == cwc
|
|
// iBeginWord points at beginning of word or == cwc
|
|
|
|
if ( 0 == iBeginWord )
|
|
{
|
|
// A single word fills from beginning of this chunk
|
|
// to the end. This is either a very long word or
|
|
// a short word in a leftover buffer.
|
|
|
|
// store the word and its source position
|
|
|
|
if ( fWordHasZWS )
|
|
hr = pWordSink->PutWord( cwcZWS,
|
|
awcBufZWS, // stripped word
|
|
iCur,
|
|
pTextSource->iCur ); // its source pos.
|
|
else
|
|
hr = pWordSink->PutWord( iCur,
|
|
pwcChunk, // the word
|
|
iCur,
|
|
pTextSource->iCur ); // its source pos.
|
|
|
|
if ( FAILED( hr ) )
|
|
return hr;
|
|
|
|
// Position it to not add the word twice.
|
|
|
|
iBeginWord = iCur;
|
|
}
|
|
|
|
//
|
|
// If this is the last chunk from text source, then process the
|
|
// last fragment.
|
|
//
|
|
|
|
if ( ( cwc < CSampleWordBreaker::cwcAtATime ) && ( iBeginWord != iCur ) )
|
|
{
|
|
// store the word and its source position
|
|
|
|
if ( fWordHasZWS )
|
|
hr = pWordSink->PutWord( cwcZWS,
|
|
awcBufZWS, // stripped word
|
|
iCur - iBeginWord,
|
|
pTextSource->iCur + iBeginWord );
|
|
else
|
|
hr = pWordSink->PutWord( iCur - iBeginWord,
|
|
pwcChunk + iBeginWord, // the word
|
|
iCur - iBeginWord,
|
|
pTextSource->iCur + iBeginWord );
|
|
|
|
if ( FAILED( hr ) )
|
|
return hr;
|
|
|
|
iBeginWord = iCur;
|
|
}
|
|
|
|
cwcProcessed = iBeginWord;
|
|
return S_OK;
|
|
} //Tokenize
|
|
|
|
//+---------------------------------------------------------------------------
|
|
//
|
|
// Member: CSampleWordBreaker::BreakText
|
|
//
|
|
// Synopsis: Break a block of text into individual words
|
|
//
|
|
// Arguments: [pTextSource] -- Source of characters to work on
|
|
// [pWordSink] -- Where to send the words found
|
|
// [pPhraseSink] -- Where to send the phrases found (not used)
|
|
//
|
|
// Returns: S_OK if successful or an error code
|
|
//
|
|
//----------------------------------------------------------------------------
|
|
|
|
HRESULT STDMETHODCALLTYPE CSampleWordBreaker::BreakText(
|
|
TEXT_SOURCE * pTextSource,
|
|
IWordSink * pWordSink,
|
|
IPhraseSink * pPhraseSink )
|
|
{
|
|
// Validate arguments
|
|
|
|
if ( 0 == pTextSource )
|
|
return E_INVALIDARG;
|
|
|
|
if ( ( 0 == pWordSink ) || ( pTextSource->iCur == pTextSource->iEnd ) )
|
|
return S_OK;
|
|
|
|
if ( pTextSource->iCur > pTextSource->iEnd )
|
|
return E_INVALIDARG;
|
|
|
|
ULONG cwcProcessed; // # chars actually processed by Tokenize()
|
|
HRESULT hr = S_OK;
|
|
|
|
// Pull text from the text source and tokenize it
|
|
|
|
do
|
|
{
|
|
BOOL fFirstTime = TRUE;
|
|
|
|
while ( pTextSource->iCur < pTextSource->iEnd )
|
|
{
|
|
ULONG cwc = pTextSource->iEnd - pTextSource->iCur;
|
|
|
|
// Process in buckets of cwcAtATime only
|
|
|
|
if ( cwc >= CSampleWordBreaker::cwcAtATime )
|
|
cwc = CSampleWordBreaker::cwcAtATime;
|
|
else if ( !fFirstTime )
|
|
break;
|
|
|
|
hr = Tokenize( pTextSource, cwc, pWordSink, cwcProcessed );
|
|
if ( FAILED( hr ) )
|
|
return hr;
|
|
|
|
pTextSource->iCur += cwcProcessed;
|
|
fFirstTime = FALSE;
|
|
}
|
|
|
|
hr = pTextSource->pfnFillTextBuffer( pTextSource );
|
|
} while ( SUCCEEDED( hr ) );
|
|
|
|
//
|
|
// If anything failed except for running out of text, report the error.
|
|
// Otherwise, for cases like out of memory, files will not get retried or
|
|
// reported as failures properly.
|
|
//
|
|
|
|
if ( ( FAILED( hr ) ) &&
|
|
( FILTER_E_NO_MORE_VALUES != hr ) &&
|
|
( FILTER_E_NO_TEXT != hr ) &&
|
|
( FILTER_E_NO_VALUES != hr ) &&
|
|
( FILTER_E_NO_MORE_TEXT != hr ) &&
|
|
( FILTER_E_END_OF_CHUNKS != hr ) &&
|
|
( WBREAK_E_END_OF_TEXT != hr ) )
|
|
return hr;
|
|
|
|
ULONG cwc = pTextSource->iEnd - pTextSource->iCur;
|
|
|
|
if ( 0 == cwc )
|
|
return S_OK;
|
|
|
|
return Tokenize( pTextSource, cwc, pWordSink, cwcProcessed );
|
|
} //BreakText
|
|
|
|
//+---------------------------------------------------------------------------
|
|
//
|
|
// Member: CSampleStemmer::GenerateWordForms
|
|
//
|
|
// Synopsis: From the input word, emit the original and alternate forms
|
|
// of the word.
|
|
//
|
|
// Arguments: [pwcInBuf] -- The original word to stem (not 0-terminated)
|
|
// [cwc] -- Length in characters of the word
|
|
// [pStemSink] -- Where to emit the stems
|
|
//
|
|
// Returns: S_OK if successful or an error code
|
|
//
|
|
//----------------------------------------------------------------------------
|
|
|
|
HRESULT STDMETHODCALLTYPE CSampleStemmer::GenerateWordForms(
|
|
WCHAR const * pwcInBuf,
|
|
ULONG cwc,
|
|
IWordFormSink * pWordFormSink )
|
|
{
|
|
// Validate the arguments
|
|
|
|
if ( ( 0 == pwcInBuf ) || ( 0 == pWordFormSink ) )
|
|
return E_INVALIDARG;
|
|
|
|
HRESULT hr = S_OK;
|
|
|
|
#ifdef PORTER_STEMMER
|
|
|
|
//
|
|
// If the word is small enough, attempt to get the stemmed form of the
|
|
// word. Emit both forms if they are different. The Porter algorithm
|
|
// does the opposite of what's required here, but doing the right thing
|
|
// requires a lexicon.
|
|
//
|
|
|
|
if ( cwc < cwcMaxPorterWord )
|
|
{
|
|
// Make a temporary buffer for the word
|
|
|
|
WCHAR awcPorter[ cwcMaxPorterWord ];
|
|
CopyMemory( awcPorter, pwcInBuf, sizeof(WCHAR) * cwc );
|
|
awcPorter[cwc] = 0;
|
|
|
|
// Convert it to lowercase and save the original in lowercase
|
|
|
|
CharLower( awcPorter );
|
|
WCHAR awcOriginal[ cwcMaxPorterWord ];
|
|
wcscpy( awcOriginal, awcPorter );
|
|
|
|
// Get the stemmed form of the word
|
|
|
|
GetPorterStemForm( awcPorter );
|
|
|
|
// If it's different from the original, emit it.
|
|
|
|
if ( wcscmp( awcOriginal, awcPorter ) )
|
|
{
|
|
hr = pWordFormSink->PutAltWord( awcPorter,
|
|
wcslen( awcPorter ) );
|
|
if ( FAILED( hr ) )
|
|
return hr;
|
|
}
|
|
}
|
|
|
|
#endif //PORTER_STEMMER
|
|
|
|
#ifdef LEXICON_STEMMER
|
|
|
|
//
|
|
// If the word is small enough to work with the stemmer, attempt to get
|
|
// various forms of the word.
|
|
//
|
|
|
|
if ( cwc < cbMaxStem )
|
|
{
|
|
//
|
|
// Convert the original string to 8-bit characters. This is OK since
|
|
// it's is an English stemmer that can safely assume such characters.
|
|
//
|
|
|
|
char acOriginal[ cbMaxStem ];
|
|
for ( unsigned i = 0; i < cwc; i++ )
|
|
acOriginal[ i ] = (char) pwcInBuf[ i ];
|
|
acOriginal[ i ] = 0;
|
|
|
|
// Enumerate all stem-sets that contain the word.
|
|
|
|
unsigned iBmk = stemInvalid;
|
|
unsigned iStemSet = stemInvalid;
|
|
char ac[ cbMaxStem ];
|
|
|
|
while ( g_pStem->FindStemSet( acOriginal, iBmk, iStemSet ) )
|
|
{
|
|
// Enumerate all forms of the stem-set, root first.
|
|
|
|
CStemSet set( g_pStem->GetStemSetRoot(), iStemSet );
|
|
unsigned iStemBmk = stemInvalid;
|
|
|
|
while ( set.GetForm( ac, iStemBmk ) )
|
|
{
|
|
if ( strcmp( ac, acOriginal ) )
|
|
{
|
|
WCHAR awcForm[ cbMaxStem ];
|
|
mbstowcs( awcForm, ac, -1 );
|
|
|
|
hr = pWordFormSink->PutAltWord( awcForm,
|
|
wcslen( awcForm ) );
|
|
if ( FAILED( hr ) )
|
|
return hr;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
#endif //LEXICON_STEMMER
|
|
|
|
#ifdef SIMPLE_LIST_STEMMER
|
|
|
|
// Look up the word in the simple list of stem forms
|
|
|
|
SStemForm const * pStemForm = (SStemForm *) bsearch( pwcInBuf,
|
|
aStemForms,
|
|
cStemForms,
|
|
sizeof SStemForm,
|
|
StemCompare );
|
|
|
|
if ( 0 != pStemForm )
|
|
{
|
|
// Found it, now iterate all the forms
|
|
|
|
ULONG iList = pStemForm->iList;
|
|
ULONG iForm = 0;
|
|
|
|
while ( 0 != aStems[ iList ][ iForm ] )
|
|
{
|
|
WCHAR const * pwc = aStems[ iList ][ iForm ];
|
|
|
|
// Don't emit the original word yet
|
|
|
|
if ( 0 != wcscmp( pwc, pwcInBuf ) )
|
|
{
|
|
hr = pWordFormSink->PutAltWord( pwc,
|
|
wcslen( pwc ) );
|
|
if ( FAILED( hr ) )
|
|
return hr;
|
|
}
|
|
|
|
iForm++;
|
|
}
|
|
}
|
|
|
|
#endif //SIMPLE_LIST_STEMMER
|
|
|
|
// Emit the original word
|
|
|
|
return pWordFormSink->PutWord( pwcInBuf, cwc );
|
|
} //StemWord
|
|
|
|
//+-------------------------------------------------------------------------
|
|
//
|
|
// Method: CLanguageResourceSampleCF::CLanguageResourceSampleCF
|
|
//
|
|
// Synopsis: Language resource class factory constructor
|
|
//
|
|
//--------------------------------------------------------------------------
|
|
|
|
CLanguageResourceSampleCF::CLanguageResourceSampleCF() :
|
|
_lRefs( 1 )
|
|
{
|
|
InterlockedIncrement( &g_cInstances );
|
|
} //CLanguageResourceSampleCF
|
|
|
|
//+-------------------------------------------------------------------------
|
|
//
|
|
// Method: CLanguageResourceSampleCF::~CLanguageResourceSampleCF
|
|
//
|
|
// Synopsis: Language resource class factory destructor
|
|
//
|
|
//--------------------------------------------------------------------------
|
|
|
|
CLanguageResourceSampleCF::~CLanguageResourceSampleCF()
|
|
{
|
|
InterlockedDecrement( &g_cInstances );
|
|
} //~LanguageResourceSampleCF
|
|
|
|
//+-------------------------------------------------------------------------
|
|
//
|
|
// Method: CLanguageResourceSampleCF::QueryInterface
|
|
//
|
|
// Synopsis: Rebind to the requested interface
|
|
//
|
|
// Arguments: [riid] -- IID of new interface
|
|
// [ppvObject] -- New interface * returned here
|
|
//
|
|
// Returns: S_OK if bind succeeded, E_NOINTERFACE if bind failed
|
|
//
|
|
//--------------------------------------------------------------------------
|
|
|
|
HRESULT STDMETHODCALLTYPE CLanguageResourceSampleCF::QueryInterface(
|
|
REFIID riid,
|
|
void ** ppvObject )
|
|
{
|
|
if ( IID_IClassFactory == riid )
|
|
*ppvObject = (IUnknown *) (IClassFactory *) this;
|
|
else if ( IID_IUnknown == riid )
|
|
*ppvObject = (IUnknown *) (IPersist *) this;
|
|
else
|
|
{
|
|
*ppvObject = 0;
|
|
return E_NOINTERFACE;
|
|
}
|
|
|
|
AddRef();
|
|
return S_OK;
|
|
} //QueryInterface
|
|
|
|
//+-------------------------------------------------------------------------
|
|
//
|
|
// Method: CLanguageResourceSampleCF::AddRef
|
|
//
|
|
// Synopsis: Increments the refcount
|
|
//
|
|
// Returns: The new refcount
|
|
//
|
|
//--------------------------------------------------------------------------
|
|
|
|
ULONG STDMETHODCALLTYPE CLanguageResourceSampleCF::AddRef()
|
|
{
|
|
return InterlockedIncrement( &_lRefs );
|
|
} //AddRef
|
|
|
|
//+-------------------------------------------------------------------------
|
|
//
|
|
// Method: CLanguageResourceSampleCF::Release
|
|
//
|
|
// Synopsis: Decrement refcount. Delete self if necessary.
|
|
//
|
|
// Returns: The new refcount
|
|
//
|
|
//--------------------------------------------------------------------------
|
|
|
|
ULONG STDMETHODCALLTYPE CLanguageResourceSampleCF::Release()
|
|
{
|
|
long lTmp = InterlockedDecrement( &_lRefs );
|
|
|
|
if ( 0 == lTmp )
|
|
delete this;
|
|
|
|
return lTmp;
|
|
} //Release
|
|
|
|
//+-------------------------------------------------------------------------
|
|
//
|
|
// Method: CLanguageResourceSampleCF::CreateInstance
|
|
//
|
|
// Synopsis: Creates new Language Resource sample object
|
|
//
|
|
// Arguments: [pUnkOuter] -- 'Outer' IUnknown
|
|
// [riid] -- Interface to bind
|
|
// [ppvObject] -- Interface returned here
|
|
//
|
|
// Returns: S_OK if successful or an appropriate error code
|
|
//
|
|
//--------------------------------------------------------------------------
|
|
|
|
HRESULT STDMETHODCALLTYPE CLanguageResourceSampleCF::CreateInstance(
|
|
IUnknown * pUnkOuter,
|
|
REFIID riid,
|
|
void * * ppvObject )
|
|
{
|
|
*ppvObject = 0;
|
|
|
|
if ( IID_IStemmer == riid )
|
|
*ppvObject = new CSampleStemmer();
|
|
else if ( IID_IWordBreaker == riid )
|
|
*ppvObject = new CSampleWordBreaker();
|
|
else
|
|
return E_NOINTERFACE;
|
|
|
|
if ( 0 == *ppvObject )
|
|
return E_OUTOFMEMORY;
|
|
|
|
return S_OK;
|
|
} //CreateInstance
|
|
|
|
//+-------------------------------------------------------------------------
|
|
//
|
|
// Method: CLanguageResourceSampleCF::LockServer
|
|
//
|
|
// Synopsis: Force class factory to remain loaded
|
|
//
|
|
// Arguments: [fLock] -- TRUE if locking, FALSE if unlocking
|
|
//
|
|
// Returns: S_OK
|
|
//
|
|
//--------------------------------------------------------------------------
|
|
|
|
HRESULT STDMETHODCALLTYPE CLanguageResourceSampleCF::LockServer( BOOL fLock )
|
|
{
|
|
if ( fLock )
|
|
InterlockedIncrement( &g_cInstances );
|
|
else
|
|
InterlockedDecrement( &g_cInstances );
|
|
|
|
return S_OK;
|
|
} //LockServer
|
|
|
|
//+-------------------------------------------------------------------------
|
|
//
|
|
// Function: DllGetClassObject
|
|
//
|
|
// Synopsis: Ole DLL load class routine
|
|
//
|
|
// Arguments: [cid] -- Class to load
|
|
// [iid] -- Interface to bind to on class object
|
|
// [ppvObj] -- Interface pointer returned here
|
|
//
|
|
// Returns: Sample language resource class factory
|
|
//
|
|
//--------------------------------------------------------------------------
|
|
|
|
extern "C" HRESULT STDMETHODCALLTYPE DllGetClassObject(
|
|
REFCLSID cid,
|
|
REFIID iid,
|
|
void ** ppvObj )
|
|
{
|
|
IUnknown * pUnk = 0;
|
|
*ppvObj = 0;
|
|
|
|
if ( CLSID_SampleWordBreaker == cid ||
|
|
CLSID_SampleStemmer == cid )
|
|
{
|
|
pUnk = new CLanguageResourceSampleCF();
|
|
|
|
if ( 0 == pUnk )
|
|
return E_OUTOFMEMORY;
|
|
|
|
#ifdef LEXICON_STEMMER
|
|
|
|
if ( 0 == g_pStem )
|
|
g_pStem = MakeStemObject( g_hModule );
|
|
|
|
if ( 0 == g_pStem )
|
|
{
|
|
pUnk->Release();
|
|
return E_OUTOFMEMORY;
|
|
}
|
|
|
|
#endif //LEXICON_STEMMER
|
|
|
|
}
|
|
else
|
|
{
|
|
*ppvObj = 0;
|
|
return E_NOINTERFACE;
|
|
}
|
|
|
|
HRESULT hr = pUnk->QueryInterface( iid, ppvObj );
|
|
|
|
pUnk->Release();
|
|
|
|
return hr;
|
|
} //DllGetClassObject
|
|
|
|
//+-------------------------------------------------------------------------
|
|
//
|
|
// Function: DllCanUnloadNow
|
|
//
|
|
// Synopsis: Notifies DLL to unload (cleanup global resources)
|
|
//
|
|
// Returns: S_OK if it is acceptable for caller to unload DLL.
|
|
// S_FALSE otherwise.
|
|
//
|
|
//--------------------------------------------------------------------------
|
|
|
|
extern "C" HRESULT STDMETHODCALLTYPE DllCanUnloadNow( void )
|
|
{
|
|
if ( 0 == g_cInstances )
|
|
return S_OK;
|
|
|
|
return S_FALSE;
|
|
} //DllCanUnloadNow
|
|
|
|
//+-------------------------------------------------------------------------
|
|
//
|
|
// Function: DllMain
|
|
//
|
|
// Synopsis: Standard main entry point for the module.
|
|
//
|
|
//--------------------------------------------------------------------------
|
|
|
|
BOOL WINAPI DllMain(
|
|
HANDLE hInstance,
|
|
DWORD dwReason,
|
|
void * lpReserved )
|
|
{
|
|
if ( DLL_PROCESS_ATTACH == dwReason )
|
|
{
|
|
g_hModule = (HMODULE) hInstance;
|
|
DisableThreadLibraryCalls( (HINSTANCE) hInstance );
|
|
}
|
|
|
|
return TRUE;
|
|
} //DllMain
|
|
|
|
//
// Registration data for the sample language resource, consumed by
// DllRegisterServer and DllUnregisterServer.  The class id strings are the
// text forms of CLSID_SampleWordBreaker and CLSID_SampleStemmer.
// NOTE(review): the meaning of each field is defined by SLangRegistry,
// which is declared outside this file -- confirm there.
//

SLangRegistry const English_Sample_LangRes =
{
    L"English_Sample",
    MAKELANGID( LANG_ENGLISH, SUBLANG_ENGLISH_SAMPLE ),

    // Word breaker

    {
        L"{d225281a-7ca9-4a46-ae7d-c63a9d4815d4}",
        L"English_Sample Word Breaker",
        L"lrsample.dll",
        L"both"
    },

    // Stemmer

    {
        L"{0a275611-aa4d-4b39-8290-4baf77703f55}",
        L"English_Sample Stemmer",
        L"lrsample.dll",
        L"both"
    }
};
|
|
|
|
//+-------------------------------------------------------------------------
|
|
//
|
|
// Method: DllRegisterServer
|
|
//
|
|
// Synopsis: Registers the language resources in the registry
|
|
//
|
|
//--------------------------------------------------------------------------
|
|
|
|
STDAPI DllRegisterServer()
|
|
{
|
|
return RegisterALanguageResource( English_Sample_LangRes );
|
|
} //DllRegisterServer
|
|
|
|
//+-------------------------------------------------------------------------
|
|
//
|
|
// Method: DllUnregisterServer
|
|
//
|
|
// Synopsis: Removes the language resources from the registry
|
|
//
|
|
//--------------------------------------------------------------------------
|
|
|
|
STDAPI DllUnregisterServer()
|
|
{
|
|
return UnRegisterALanguageResource( English_Sample_LangRes );
|
|
} //DllUnregisterServer
|
|
|