|
|
//+---------------------------------------------------------------------------
//
// Microsoft Windows
// Copyright (C) Microsoft Corporation, 1991 - 2000
//
// File: KEYMAK.CXX
//
// Contents: Key maker
//
// Classes: CKeyMaker
//
// History: 31-Jan-92 BartoszM Created
// 24-Apr-95 SitaramR Removed US/Fake stemmer and added
// Infosoft stemmer
//
// Notes: The filtering pipeline is hidden in the Data Repository
// object which serves as a sink for the filter.
// The sink for the Data Repository is the Key Repository.
// The language dependent part of the pipeline
// is obtained from the Language List object and is called
// Language Dependent Key Maker. It consists of:
//
// Word Breaker
// Stemmer (optional)
// Normalizer
// Noise List
//
// Each object serves as a sink for its predecessor,
// Key Repository is the final sink.
//
//----------------------------------------------------------------------------
#include <pch.cxx>
#pragma hdrstop
#include <lang.hxx>
#include <keymak.hxx>
#include <noise.hxx>
#include <norm.hxx>
#include <stemsink.hxx>
//+---------------------------------------------------------------------------
//
// Member: CKeyMaker::CKeyMaker
//
// Synopsis: Constructs a language-dependent key maker object
//
// Effects: Creates a noiselist, normalizer and borrows a wordbreaker, stemmer
//
// Arguments: [locale] -- language locale
// [pid] -- property id
// [krep] -- key repository to place completed keys in
// [pPhraseSink] -- sink for collecting phrases
// [fQuery] -- true if this is during querying
// [ulFuzzy] -- fuzzy level of query
// [langList] -- language list from which the language object is obtained
//
// History: 05-June-91 t-WadeR Created.
// 12-Oct-92 AmyA Added Unicode support
//
//----------------------------------------------------------------------------
CKeyMaker::CKeyMaker( LCID locale,
                      PROPID pid,
                      PKeyRepository& krep,
                      IPhraseSink *pPhraseSink,
                      BOOL fQuery,
                      ULONG ulFuzzy,
                      CLangList & langList )
        : _pPhraseSink(pPhraseSink),
          _fQuery( fQuery ),
          _sLang( locale, pid, &langList, fQuery ? LANG_LOAD_ALL : LANG_LOAD_NO_STEMMER ),
          _lcid( locale ),
          _pid( pid )
{
    //
    // Fetch the key repository's source position / length buffers.
    // PutWord and PutAltWord write into them when they are non-null.
    //
    krep.GetSourcePosBuffers( &_pcwcSrcPos, &_pcwcSrcLen );

    //
    // Prefix matching must not remove noise words: what looks like a noise
    // *word* may only be the prefix of a non-noise word. In that case no
    // noise table is requested and the empty noise list is used.
    //
    CStringTable * pNoiseTable = 0;

    if ( GENERATE_METHOD_PREFIX != ulFuzzy )
        pNoiseTable = _sLang->GetNoiseTable();

    if ( 0 == pNoiseTable )
        _xNoiseList.Set( new CNoiseListEmpty( krep, ulFuzzy ) );
    else
        _xNoiseList.Set( new CNoiseList( *pNoiseTable, krep ) );

    //
    // The normalizer feeds the noise list; its maximum buffer length is the
    // limit handed to the stemmer and the word breaker below.
    //
    _xWordRep.Set( new CNormalizer( _xNoiseList.GetReference() ) );
    _cwcMaxNormBuf = _xWordRep->GetMaxBufferLen();

    //
    // Optional stemmer: only for stemmed generation, and only when the
    // language object actually supplies one.
    //
    if ( GENERATE_METHOD_STEMMED == ulFuzzy )
    {
        IStemmer * pStemmer = _sLang->GetStemmer();

        if ( 0 == pStemmer )
        {
            ciDebugOut(( DEB_ERROR, "Fuzzy2 query, but no stemmer available for locale 0x%x\n", locale ));
        }
        else
        {
            BOOL fStemLicense;
            SCODE scStem = pStemmer->Init( _cwcMaxNormBuf, &fStemLicense );

            if ( FAILED(scStem) )
            {
                ciDebugOut(( DEB_ERROR, "IStemmer::Init returned 0x%x\n", scStem ));
                THROW( CException( scStem ) );
            }

            if ( fStemLicense )
            {
                WCHAR const * pwszLicense;
                scStem = pStemmer->GetLicenseToUse( &pwszLicense );

                if ( FAILED(scStem) )
                {
                    ciDebugOut(( DEB_ERROR, "IStemmer::GetLicenseToUse returned 0x%x\n", scStem ));
                    THROW( CException( scStem ) );
                }

                ciDebugOut(( DEB_WORDS, "%ws\n", pwszLicense ));
            }

            //
            // Insert the stemmer sink ahead of the normalizer: the
            // normalizer moves to _xWordRep2 and becomes the stemmer
            // sink's target, while _xWordRep now refers to the stemmer sink.
            //
            _xWordRep2.Set( _xWordRep.Acquire() );
            _xWordRep.Set( new CStemmerSink( pStemmer, _xWordRep2.GetReference() ) );
        }
    }

    //
    // Initialize word breaker
    //
    _pWBreak = _sLang->GetWordBreaker();
    Win4Assert( _pWBreak );

    BOOL fBreakerLicense;
    SCODE scBreak = _pWBreak->Init( fQuery, _cwcMaxNormBuf, &fBreakerLicense );

    if ( FAILED(scBreak) )
    {
        ciDebugOut(( DEB_ERROR, "IWordBreaker::Init returned 0x%x\n", scBreak ));
        THROW( CException( scBreak ) );
    }

    if ( fBreakerLicense )
    {
        WCHAR const * pwszLicense;
        scBreak = _pWBreak->GetLicenseToUse( &pwszLicense );

        if ( FAILED(scBreak) )
        {
            ciDebugOut(( DEB_ERROR, "IWordBreaker::GetLicenseToUse returned 0x%x\n", scBreak ));
            THROW( CException( scBreak ) );
        }

        ciDebugOut(( DEB_WORDS, "%ws\n", pwszLicense ));
    }
} //CKeyMaker
//+---------------------------------------------------------------------------
//
// Member: CKeyMaker::CKeyMaker
//
// Synopsis: Constructs key maker for noise word list initialization.
//
// Arguments: [pWBreak] -- word breaker
// [Noise] -- noise word list
//
// History: 05-June-91 t-WadeR Created.
// 12-Oct-92 AmyA Added Unicode support
//
//----------------------------------------------------------------------------
CKeyMaker::CKeyMaker( IWordBreaker * pWBreak, PNoiseList & Noise )
        : _pWBreak( pWBreak ),
          _pPhraseSink(0),
          _fQuery(FALSE)
{
    // NOTE(review): _lcid and _pid are not set by this constructor, yet
    // Supports() compares against them -- confirm Supports() is never
    // called on a key maker built this way.

    //
    // The caller-supplied noise list is the normalizer's sink. The
    // normalizer's maximum buffer length is the limit passed to the
    // word breaker below.
    //
    _xWordRep.Set( new CNormalizer( Noise ) );
    _cwcMaxNormBuf = _xWordRep->GetMaxBufferLen();

    // Source position tracking is not used on this path.
    _pcwcSrcLen = 0;
    _pcwcSrcPos = 0;

    //
    // Initialize word breaker
    //
    Win4Assert( _pWBreak );

    BOOL fBreakerLicense;
    SCODE scBreak = _pWBreak->Init( FALSE, _cwcMaxNormBuf, &fBreakerLicense );

    if ( FAILED(scBreak) )
    {
        ciDebugOut(( DEB_ERROR, "IWordBreaker::Init returned 0x%x\n", scBreak ));
        THROW( CException( scBreak ) );
    }

    if ( fBreakerLicense )
    {
        WCHAR const * pwszLicense;
        scBreak = _pWBreak->GetLicenseToUse( &pwszLicense );

        if ( FAILED(scBreak) )
        {
            ciDebugOut(( DEB_ERROR, "IWordBreaker::GetLicenseToUse returned 0x%x\n", scBreak ));
            THROW( CException( scBreak ) );
        }

        ciDebugOut(( DEB_WORDS, "%ws\n", pwszLicense ));
    }
} //CKeyMaker
//+---------------------------------------------------------------------------
//
// Member: CKeyMaker::~CKeyMaker
//
// Synopsis: destroys a key maker object
//
// History: 05-June-91 t-WadeR Created.
//
//----------------------------------------------------------------------------
CKeyMaker::~CKeyMaker()
{
    // Intentionally empty: no explicit cleanup is performed here.
}
//
// The following are needed to make midl happy. There are no other interfaces
// to bind to. Inheritance from IUnknown is unnecessary.
//
SCODE STDMETHODCALLTYPE CKeyMaker::QueryInterface(REFIID riid, void * * ppvObject)
{
    //
    // Stub: no interface is ever handed out, so riid is ignored and the
    // call always fails with E_NOTIMPL.
    //
    // The out-parameter is cleared only when the caller supplied one; a
    // NULL ppvObject used to be dereferenced unconditionally.
    //
    if ( 0 != ppvObject )
        *ppvObject = 0;

    return( E_NOTIMPL );
}
ULONG STDMETHODCALLTYPE CKeyMaker::AddRef()
{
    // No reference count is maintained; a constant is reported.
    const ULONG cRefs = 1;
    return cRefs;
}
ULONG STDMETHODCALLTYPE CKeyMaker::Release()
{
    // No reference count is maintained and nothing is freed here;
    // a constant is reported.
    const ULONG cRefs = 1;
    return cRefs;
}
//+-------------------------------------------------------------------------
//
// Method: CKeyMaker::PutWord
//
// Synopsis: Store word in word repository
//
// Arguments: [cwc] -- Count of characters in [pwcInBuf]
// [pwcInBuf] -- Word
// [cwcSrcLen] -- count of characters in pTextSource buffer (see IWordBreaker::BreakText)
// [cwcSrcPos] -- position of word in pTextSource buffer
//
// History: 19-Apr-1994 KyleP Created
//
//--------------------------------------------------------------------------
SCODE STDMETHODCALLTYPE CKeyMaker::PutWord( ULONG cwc,
                                            WCHAR const *pwcInBuf,
                                            ULONG cwcSrcLen,
                                            ULONG cwcSrcPos )
{
    //
    // Returns: S_OK on success (including an empty word, which is ignored),
    //          LANGUAGE_S_LARGE_WORD if the word was truncated to the
    //          normalizer's buffer length,
    //          E_FAIL if the call is not valid in the current state,
    //          or the error code of any CException caught below.
    //
    SCODE sc = S_OK;

    // validate PutWord call
    if ( !_altWordsEnforcer.IsPutWordOk() )
    {
        Win4Assert( !"CKeyMaker::PutWord - invalid state" );
        ciDebugOut(( DEB_ITRACE, "PutWord: %.*ws\n", cwc, pwcInBuf ));

        return E_FAIL;
    }

    // NOTE(review): presumably maps system exceptions to C++ exceptions so
    // the CATCH below sees them -- confirm against its declaration.
    CTranslateSystemExceptions translate;

    TRY
    {
        // Over-long words are truncated, and the caller is told so.
        if ( cwc > _cwcMaxNormBuf )
        {
            sc = LANGUAGE_S_LARGE_WORD;
            cwc = _cwcMaxNormBuf;
        }

        if ( cwc > 0 )
        {
#if CIDBG == 1
            if ( ciInfoLevel & DEB_WORDS )
            {
                //
                // Check for 'printable' characters.
                //
                BOOL fOk = TRUE;

                for ( unsigned i = 0; i < cwc; i++ )
                {
                    if ( pwcInBuf[i] > 0xFF )
                    {
                        fOk = FALSE;
                        break;
                    }
                }

                if ( fOk )
                {
                    ciDebugOut(( DEB_WORDS, "PutWord: \"%.*ws\" Occ = %d cwcSrcLen = %d, cwcSrcPos = %d\n", cwc, pwcInBuf, _xWordRep->GetOccurrence(), cwcSrcLen, cwcSrcPos ));
                }
                else
                {
                    //
                    // Dump the word as hex code units. This loop declares
                    // its own index: the index of the loop above is out of
                    // scope here under standard for-scope rules.
                    //
                    ciDebugOut(( DEB_WORDS, "PutWord:" ));

                    for ( unsigned j = 0; j < cwc; j++ )
                        ciDebugOut(( DEB_WORDS | DEB_NOCOMPNAME, " %04X", pwcInBuf[j] ));

                    ciDebugOut(( DEB_WORDS | DEB_NOCOMPNAME, " Occ = %d cwcSrcLen = %d, cwcSrcPos = %d\n", _xWordRep->GetOccurrence(), cwcSrcLen, cwcSrcPos ));
                }
            }
#endif // CIDBG

            //
            // No internal call to PutAltWord for performance reasons.
            //
            if (0 != _pcwcSrcPos)
            {
                // Record where this word came from in the source text.
                Win4Assert ( 0 != _pcwcSrcLen );
                *_pcwcSrcLen = cwcSrcLen;
                *_pcwcSrcPos = cwcSrcPos;
            }

            _xWordRep->ProcessWord( pwcInBuf, cwc );
        }
    }
    CATCH( CException, e )
    {
        sc = e.GetErrorCode();
    }
    END_CATCH;

    return sc;
} //PutWord
//+-------------------------------------------------------------------------
//
// Method: CKeyMaker::PutAltWord
//
// Synopsis: Store alternate word in word repository.
//
// Effects: Identical to PutWord except occurrence count is not
// incremented.
//
// Arguments: [cwc] -- Count of characters in [pwcInBuf]
// [pwcInBuf] -- Word
// [cwcSrcLen] -- count of characters in pTextSource buffer (see IWordBreaker::BreakText)
// [cwcSrcPos] -- position of word in pTextSource buffer
//
// History: 19-Apr-1994 KyleP Created
//
//--------------------------------------------------------------------------
SCODE STDMETHODCALLTYPE CKeyMaker::PutAltWord( ULONG cwc,
                                               WCHAR const *pwcInBuf,
                                               ULONG cwcSrcLen,
                                               ULONG cwcSrcPos )
{
    SCODE sc = S_OK;

    // validate PutAltWord call
    if ( !_altWordsEnforcer.IsPutAltWordOk() )
    {
        Win4Assert( !"CKeyMaker::PutAltWord - invalid state" );
        ciDebugOut(( DEB_ITRACE, "PutAltWord: %.*ws\n", cwc, pwcInBuf ));

        return E_FAIL;
    }

    CTranslateSystemExceptions translate;

    TRY
    {
        //
        // What is to be done if two large, alternate words end up with the
        // same (truncated) prefix after truncation ?
        // This is fixed in Babylon and isn't a problem here.
        //
        if ( _cwcMaxNormBuf < cwc )
        {
            // Truncate to the normalizer's limit and report it.
            sc = LANGUAGE_S_LARGE_WORD;
            cwc = _cwcMaxNormBuf;
        }

        // Empty words are silently ignored.
        if ( 0 != cwc )
        {
            ciDebugOut(( DEB_WORDS, "PutAltWord: \"%.*ws\" Occ = %d cwcSrcLen = %d, cwcSrcPos = %d\n", cwc, pwcInBuf, _xWordRep->GetOccurrence(), cwcSrcLen, cwcSrcPos ));

            if ( 0 != _pcwcSrcPos )
            {
                // Record where this word came from in the source text.
                Win4Assert ( 0 != _pcwcSrcLen );
                *_pcwcSrcLen = cwcSrcLen;
                *_pcwcSrcPos = cwcSrcPos;
            }

            _xWordRep->ProcessAltWord( pwcInBuf, cwc );
        }
    }
    CATCH( CException, e )
    {
        sc = e.GetErrorCode();
    }
    END_CATCH;

    return sc;
} //PutAltWord
//+-------------------------------------------------------------------------
//
// Method: CKeyMaker::StartAltPhrase
//
// Synopsis: Pass on StartAltPhrase to word repository
//
// History: 24-Apr-1994 KyleP Created
//
//--------------------------------------------------------------------------
SCODE STDMETHODCALLTYPE CKeyMaker::StartAltPhrase()
{
    SCODE sc = S_OK;

    CTranslateSystemExceptions translate;

    TRY
    {
        if ( !_fQuery )
        {
            // Alternate phrases are only accepted while querying.
            sc = WBREAK_E_QUERY_ONLY;
        }
        else
        {
            // validate StartAltPhrase call: both enforcers must agree
            if ( !( _altWordsEnforcer.IsStartAltPhraseOk() &&
                    _altPhrasesEnforcer.IsStartAltPhraseOk() ) )
            {
                Win4Assert( !"CKeyMaker::StartAltPhrase - invalid state" );
                THROW( CException( E_FAIL ) );
            }

            _xWordRep->StartAltPhrase();
        }
    }
    CATCH( CException, e )
    {
        sc = e.GetErrorCode();
    }
    END_CATCH;

    return sc;
} //StartAltPhrase
//+-------------------------------------------------------------------------
//
// Method: CKeyMaker::EndAltPhrase
//
// Synopsis: Pass on EndAltPhrase to word repository
//
// History: 24-Apr-1994 KyleP Created
//
//--------------------------------------------------------------------------
SCODE STDMETHODCALLTYPE CKeyMaker::EndAltPhrase()
{
    SCODE sc = S_OK;

    CTranslateSystemExceptions translate;

    TRY
    {
        if ( !_fQuery )
        {
            // Alternate phrases are only accepted while querying.
            sc = WBREAK_E_QUERY_ONLY;
        }
        else
        {
            // validate EndAltPhrase call: both enforcers must agree
            if ( !( _altWordsEnforcer.IsEndAltPhraseOk() &&
                    _altPhrasesEnforcer.IsEndAltPhraseOk() ) )
            {
                Win4Assert( !"CKeyMaker::EndAltPhrase - invalid state" );
                THROW( CException( E_FAIL ) );
            }

            _xWordRep->EndAltPhrase();
        }
    }
    CATCH( CException, e )
    {
        sc = e.GetErrorCode();
    }
    END_CATCH;

    return sc;
} //EndAltPhrase
//+-------------------------------------------------------------------------
//
// Method: CKeyMaker::PutBreak
//
// Synopsis: Increment the occurrence count appropriately
//
// History: 24-Apr-1994 KyleP Created
//
//--------------------------------------------------------------------------
SCODE STDMETHODCALLTYPE CKeyMaker::PutBreak( WORDREP_BREAK_TYPE breakType )
{
    //
    // A break is modeled as a skip of noise words; each break type maps to
    // a fixed skip count. An unrecognized type is rejected with E_FAIL and
    // nothing is skipped.
    //
    unsigned cSkip;

    switch ( breakType )
    {
    case WORDREP_BREAK_EOW:
        cSkip = 1;
        break;

    case WORDREP_BREAK_EOS:
        cSkip = 8;
        break;

    case WORDREP_BREAK_EOP:
        cSkip = 128;
        break;

    case WORDREP_BREAK_EOC:
        cSkip = 1024;
        break;

    default:
        ciDebugOut(( DEB_ERROR, "CKeyMaker::PutBreak -- Bad break type %d\n", breakType ));
        return( E_FAIL );
    }

    _xWordRep->SkipNoiseWords( cSkip );

    return( S_OK );
} //PutBreak
//+-------------------------------------------------------------------------
//
// Method: CKeyMaker::Supports
//
// Synopsis: Checks if the pid/lang are supported by the language object
//
// Arguments: [pid] -- The property ID
// [lcid] -- The locale
//
// Returns: TRUE if it is supported
//
// History: 24-Apr-1994 KyleP Created
//
//--------------------------------------------------------------------------
BOOL CKeyMaker::Supports( PROPID pid, LCID lcid )
{
    // Anything other than this key maker's own locale/property pair is
    // delegated to the language object.
    if ( (lcid != _lcid) || (pid != _pid) )
        return _sLang.Supports( pid, lcid );

    // Exact match on the stored locale and property id.
    return TRUE;
} //Supports
//+---------------------------------------------------------------------------
//
// Member: CKeyMaker::NormalizeWStr - Public
//
// Synopsis: Normalizes a Unicode string
//
// Arguments: [pwcInBuf] -- input buffer
// [cwcInBuf] -- count of chars in pwcInBuf
// [pbOutBuf] -- output buffer.
// [pcbOutBuf] -- pointer to output count of bytes.
//
// History: 10-Feb-2000 KitmanH Created
//
//----------------------------------------------------------------------------
void CKeyMaker::NormalizeWStr( WCHAR const *pwcInBuf,
                               ULONG cwcInBuf,
                               BYTE *pbOutBuf,
                               unsigned *pcbOutBuf )
{
    // Pure pass-through: the head of the word repository chain does the
    // work; all four arguments are forwarded unchanged.
    _xWordRep->NormalizeWStr( pwcInBuf,
                              cwcInBuf,
                              pbOutBuf,
                              pcbOutBuf );
}
|