//+--------------------------------------------------------------------------- // // Microsoft Windows // Copyright (C) Microsoft Corporation, 1991 - 1994. // // File: DefBreak.cxx // // Contents: Text Word Breaker // // History: 08-May-91 t-WadeR Created stubs, filled in ASCII code. // 06-Jun-91 t-WadeR Changed to use input-based pipeline // 11-Apr-92 KyleP Sync to spec // //---------------------------------------------------------------------------- #include #pragma hdrstop #include //+--------------------------------------------------------------------------- // // Member: CDefWordBreaker::CDefWordBreaker // // Synopsis: Constructor for the CDefWordBreaker class. // // History: 07-June-91 t-WadeR Created // 12-Oct-92 AmyA Added Unicode support // //---------------------------------------------------------------------------- CDefWordBreaker::CDefWordBreaker() : _cRefs(1) { ciDebugOut(( DEB_ITRACE, "Creating default wordbreaker\n" )); // Look at IsWordChar. We don't want the last non-breaking // space in the chunk to be considered a word break. // It will be processed again (correctly) when we move to the next chunk. _aCharInfo3 [CDefWordBreaker::ccCompare] = C3_NONSPACING; } //+--------------------------------------------------------------------------- // // Member: CWordBreaker::~CWordBreaker // // Synopsis: Destructor for the CWordBreaker class. // //---------------------------------------------------------------------------- CDefWordBreaker::~CDefWordBreaker() { } //+------------------------------------------------------------------------- // // Method: CDefWordBreaker::QueryInterface // // Synopsis: Rebind to other interface // // Arguments: [riid] -- IID of new interface // [ppvObject] -- New interface * returned here // // Returns: S_OK if bind succeeded, E_NOINTERFACE if bind failed // // History: 23-Feb-1994 KyleP Created // //-------------------------------------------------------------------------- SCODE STDMETHODCALLTYPE CDefWordBreaker::QueryInterface( REFIID riid, void ** ppvObject) { if ( 0 == ppvObject ) return E_INVALIDARG; if ( IID_IWordBreaker == riid ) *ppvObject = (IUnknown *)(IWordBreaker *)this; else if ( IID_IUnknown == riid ) *ppvObject = (IUnknown *)(IPersist *)(IPersistFile *)this; else { *ppvObject = 0; return E_NOINTERFACE; } AddRef(); return S_OK; } //+------------------------------------------------------------------------- // // Method: CDefWordBreaker::AddRef // // Synopsis: Increments refcount // // History: 23-Feb-1994 KyleP Created // //-------------------------------------------------------------------------- ULONG STDMETHODCALLTYPE CDefWordBreaker::AddRef() { return InterlockedIncrement( &_cRefs ); } //+------------------------------------------------------------------------- // // Method: CDefWordBreaker::Release // // Synopsis: Decrement refcount. Delete if necessary. // // History: 23-Feb-1994 KyleP Created // //-------------------------------------------------------------------------- ULONG STDMETHODCALLTYPE CDefWordBreaker::Release() { unsigned long uTmp = InterlockedDecrement( &_cRefs ); if ( 0 == uTmp ) delete this; return uTmp; } //+------------------------------------------------------------------------- // // Method: CDefWordBreaker::Init // // Synopsis: Initialize word-breaker // // Arguments: [fQuery] -- TRUE if query-time // [ulMaxTokenSize] -- Maximum size token stored by caller // [pfLicense] -- Set to true if use restricted // // Returns: Status code // // History: 11-Apr-1994 KyleP Created // //-------------------------------------------------------------------------- SCODE STDMETHODCALLTYPE CDefWordBreaker::Init( BOOL fQuery, ULONG ulMaxTokenSize, BOOL *pfLicense ) { if ( 0 == pfLicense ) return E_INVALIDARG; *pfLicense = FALSE; return S_OK; } //+------------------------------------------------------------------------- // // Method: CDefWordBreaker::IsWordChar // // Synopsis: Find whether the i'th character in the buffer _awString // is a word character (rather than word break) // // Arguments: [i] -- index into _awString // // History: 22-Jul-1994 BartoszM Created // //-------------------------------------------------------------------------- inline BOOL CDefWordBreaker::IsWordChar (int i) const { if ( (_aCharInfo1[i] & (C1_ALPHA | C1_DIGIT)) || (_aCharInfo3[i] & C3_NONSPACING) ) { return TRUE; } WCHAR c = _pwcChunk[i]; if (c == L'_') return TRUE; if (c == 0xa0) // non breaking space { // followed by a non-spacing character // (looking ahead is okay) if (_aCharInfo3[i+1] & C3_NONSPACING) return TRUE; } return FALSE; } //+--------------------------------------------------------------------------- // // Member: CDefWordBreaker::ScanChunk // // Synopsis: For each character find its type // // // History: 16-Aug-94 BartoszM Created // //---------------------------------------------------------------------------- BOOL CDefWordBreaker::ScanChunk () { // // GetStringTypeW is returning error 87 (ERROR_INVALID_PARAMETER) if // we pass in a null string. // Win4Assert( (0 != _cMapped) && (0 != _pwcChunk) ); if ( !GetStringTypeW( CT_CTYPE1, // POSIX character typing _pwcChunk, // Source _cMapped, // Size of source _aCharInfo1 ) ) // Character info { ciDebugOut(( DEB_ERROR, "GetStringTypeW returned %d\n", GetLastError() )); return FALSE; } if ( !GetStringTypeW( CT_CTYPE3, // Additional POSIX _pwcChunk, _cMapped, // Size of source _aCharInfo3 ) ) // Character info 3 { ciDebugOut(( DEB_ERROR, "GetStringTypeW CTYPE3 returned %d\n", GetLastError() )); return FALSE; } return TRUE; } //+--------------------------------------------------------------------------- // // Member: CDefWordBreaker::BreakText // // Synopsis: Break input stream into words. // // Arguments: [pTextSource] - source of input buffers // [pWordSink] - sink for words // [pPhraseSink] - sink for noun phrases // // History: 07-June-91 t-WadeR Created // 12-Oct-92 AmyA Added Unicode support // 18-Nov-92 AmyA Overloaded // 11-Apr-94 KyleP Sync with spec // 26-Aug-94 BartoszM Fixed Unicode parsing // //---------------------------------------------------------------------------- SCODE STDMETHODCALLTYPE CDefWordBreaker::BreakText( TEXT_SOURCE *pTextSource, IWordSink *pWordSink, IPhraseSink *pPhraseSink ) { if ( 0 == pTextSource ) return E_INVALIDARG; if ( 0 == pWordSink || pTextSource->iCur == pTextSource->iEnd) return S_OK; if (pTextSource->iCur > pTextSource->iEnd) { Win4Assert ( !"BreakText called with bad TEXT_SOURCE" ); return E_FAIL; } SCODE sc = S_OK; ULONG cwc, cwcProcd; // cwcProcd is # chars actually processed by Tokenize() TRY { do { // // Flag for first time thru loop below. This is to fix the case // where the length of the buffer passed in is less than // MAX_II_BUFFER_LEN. In this case iEnd-iCur is <= MAX_II_BUFFER_LEN // and we break out the inner loop and call // pfnFillTextBuffer without having processed any characters, // and so pfnFillTextBuffer returns TRUE without adding any new // characters and this results in an infinite loop. // BOOL fFirstTime = TRUE; while ( pTextSource->iCur < pTextSource->iEnd ) { cwc = pTextSource->iEnd - pTextSource->iCur; // // Process in buckets of MAX_II_BUFER_LEN only // if ( cwc >= CDefWordBreaker::ccCompare ) cwc = CDefWordBreaker::ccCompare; else if ( !fFirstTime ) break; Tokenize( pTextSource, cwc, pWordSink, cwcProcd ); Win4Assert( cwcProcd <= cwc ); pTextSource->iCur += cwcProcd; fFirstTime = FALSE; } } while ( SUCCEEDED(pTextSource->pfnFillTextBuffer(pTextSource)) ); cwc = pTextSource->iEnd - pTextSource->iCur; // we know that the remaining text should be less than ccCompare Win4Assert( cwc < CDefWordBreaker::ccCompare ); if ( 0 != cwc ) { Tokenize( pTextSource, cwc, pWordSink, cwcProcd ); } } CATCH (CException, e) { ciDebugOut(( DEB_ITRACE, "Exception 0x%x caught when breaking text in default wordbreaker\n", e.GetErrorCode() )); sc = GetOleError( e ); } END_CATCH return sc; } //+--------------------------------------------------------------------------- // // Member: CDefWordBreaker::Tokenize // // Synopsis: Tokenize the input buffer into words // // Arguments: [pTextSource] -- input text source // [cwc] -- # chars to process // [pWordSink] -- sink for words // [cwcProd] -- # chars actually processed returned here // // History: 10-Aug-95 SitaramR Created // //---------------------------------------------------------------------------- void CDefWordBreaker::Tokenize( TEXT_SOURCE *pTextSource, ULONG cwc, IWordSink *pWordSink, ULONG& cwcProcd ) { _pwcChunk = &pTextSource->awcBuffer[pTextSource->iCur]; _cMapped = cwc; if ( !ScanChunk() ) THROW( CException( E_FAIL ) ); BOOL fWordHasZWS = FALSE; // Does the current word have a zero-width-space ? unsigned uLenZWS; // Length of a word minus embedded zero-width-spaces // // iBeginWord is the offset into _aCharInfo of the beginning character of // a word. iCur is the first *unprocessed* character. // They are indexes into the mapped chunk. // unsigned iBeginWord = 0; unsigned iCur = 0; SCODE sc = S_OK; // // Pump words from mapped chunk to word sink // while ( iCur < _cMapped ) { // // Skip whitespace, punctuation, etc. // for (; iCur < _cMapped; iCur++) if ( IsWordChar (iCur) ) break; // iCur points to a word char or is equal to _cMapped iBeginWord = iCur; if (iCur < _cMapped) iCur++; // we knew it pointed at word character // // Find word break. Filter may output Unicode zero-width-space, which // should be ignored by the wordbreaker. // fWordHasZWS = FALSE; for (; iCur < _cMapped; iCur++) { if ( !IsWordChar (iCur) ) { if ( _pwcChunk[iCur] == ZERO_WIDTH_SPACE ) fWordHasZWS = TRUE; else break; } } if ( fWordHasZWS ) { // // Copy word into _awcBufZWS after stripping zero-width-spaces // uLenZWS = 0; for ( unsigned i=iBeginWord; iPutWord( uLenZWS, _awcBufZWS, // stripped word iCur - iBeginWord, pTextSource->iCur + iBeginWord ); else sc = pWordSink->PutWord( iCur - iBeginWord, _pwcChunk + iBeginWord, // the word iCur - iBeginWord, pTextSource->iCur + iBeginWord ); if ( FAILED( sc ) ) THROW( CException( sc ) ); iCur++; // we knew it pointed at non-word char iBeginWord = iCur; // in case we exit the loop now } } // next word Win4Assert( iCur == _cMapped ); // End of words in chunk. // iCur == _cMapped // iBeginWord points at beginning of word or == _cMapped if ( 0 == iBeginWord ) { // A single word fills from beginning of this chunk // to the end. This is either a very long word or // a short word in a leftover buffer. // store the word and its source position if ( fWordHasZWS ) sc = pWordSink->PutWord( uLenZWS, _awcBufZWS, // stripped word iCur, pTextSource->iCur ); // its source pos. else sc = pWordSink->PutWord( iCur, _pwcChunk, // the word iCur, pTextSource->iCur ); // its source pos. if ( FAILED( sc ) ) THROW( CException( sc ) ); // // Position it to not add the word twice. // iBeginWord = iCur; } // // If this is the last chunk from text source, then process the // last fragment // if ( cwc < CDefWordBreaker::ccCompare && iBeginWord != iCur ) { // store the word and its source position if ( fWordHasZWS ) sc = pWordSink->PutWord( uLenZWS, _awcBufZWS, // stripped word iCur - iBeginWord, pTextSource->iCur + iBeginWord ); else sc = pWordSink->PutWord( iCur - iBeginWord, _pwcChunk + iBeginWord, // the word iCur - iBeginWord, pTextSource->iCur + iBeginWord ); if ( FAILED( sc ) ) THROW( CException( sc ) ); iBeginWord = iCur; } cwcProcd = iBeginWord; } //+--------------------------------------------------------------------------- // // Member: CDefWordBreaker::ComposePhrase // // Synopsis: Convert a noun and a modifier into a phrase // // Arguments: [pwcNoun] -- pointer to noun. // [cwcNoun] -- count of chars in pwcNoun // [pwcModifier] -- pointer to word modifying pwcNoun // [cwcModifier] -- count of chars in pwcModifier // [ulAttachmentType] -- relationship between pwcNoun &pwcModifier // // History: 10-Aug-95 SitaramR Created Header // //---------------------------------------------------------------------------- SCODE STDMETHODCALLTYPE CDefWordBreaker::ComposePhrase( WCHAR const *pwcNoun, ULONG cwcNoun, WCHAR const *pwcModifier, ULONG cwcModifier, ULONG ulAttachmentType, WCHAR *pwcPhrase, ULONG *pcwcPhrase ) { // // Never emitted phrase in the first place. // ciDebugOut(( DEB_WARN, "IWordBreaker::ComposePhrase called on default word breaker\n" )); return( E_FAIL ); } //+--------------------------------------------------------------------------- // // Member: CWordBreaker::GetLicenseToUse // // Synopsis: Returns a pointer to vendors license information // // Arguments: [ppwcsLicense] -- ptr to ptr to which license info is returned // // History: 10-Aug-95 SitaramR Created Header // //---------------------------------------------------------------------------- SCODE STDMETHODCALLTYPE CDefWordBreaker::GetLicenseToUse( const WCHAR **ppwcsLicense ) { if ( 0 == ppwcsLicense ) return E_INVALIDARG; static WCHAR const * wcsCopyright = L"Copyright (c) Microsoft Corporation, 1991-1998"; *ppwcsLicense = wcsCopyright; return( S_OK ); } extern long gulcInstances; //+------------------------------------------------------------------------- // // Method: CDefWordBreakerCF::CDefWordBreakerCF // // Synopsis: Default Word Breaker class factory constructor // // History: 07-Feb-1995 SitaramR Created // //-------------------------------------------------------------------------- CDefWordBreakerCF::CDefWordBreakerCF( ) : _cRefs( 1 ) { InterlockedIncrement( &gulcInstances ); } //+------------------------------------------------------------------------- // // Method: CDefWordBreakerCF::~CDefWordBreakerCF // // Synopsis: Default Word Breaker class factory destructor // // History: 07-Feb-1995 SitaramR Created // //-------------------------------------------------------------------------- CDefWordBreakerCF::~CDefWordBreakerCF() { InterlockedDecrement( &gulcInstances ); } //+------------------------------------------------------------------------- // // Method: CDefWordBreakerCF::QueryInterface // // Synopsis: Rebind to other interface // // Arguments: [riid] -- IID of new interface // [ppvObject] -- New interface * returned here // // Returns: S_OK if bind succeeded, E_NOINTERFACE if bind failed // // History: 07-Feb-1995 SitaramR Created // //-------------------------------------------------------------------------- SCODE STDMETHODCALLTYPE CDefWordBreakerCF::QueryInterface( REFIID riid, void ** ppvObject ) { if ( IID_IClassFactory == riid ) *ppvObject = (IUnknown *)(IClassFactory *)this; else if ( IID_IUnknown == riid ) *ppvObject = (IUnknown *)this; else { *ppvObject = 0; return E_NOINTERFACE; } AddRef(); return S_OK; } //+------------------------------------------------------------------------- // // Method: CDefWordBreakerCF::AddRef // // Synopsis: Increments refcount // // History: 07-Feb-1995 SitaramR Created // //-------------------------------------------------------------------------- ULONG STDMETHODCALLTYPE CDefWordBreakerCF::AddRef() { return InterlockedIncrement( &_cRefs ); } //+------------------------------------------------------------------------- // // Method: CDefWordBreakerCF::Release // // Synopsis: Decrement refcount. Delete if necessary. // // History: 07-Feb-1995 SitaramR Created // //-------------------------------------------------------------------------- ULONG STDMETHODCALLTYPE CDefWordBreakerCF::Release() { unsigned long uTmp = InterlockedDecrement( &_cRefs ); if ( 0 == uTmp ) delete this; return uTmp; } //+------------------------------------------------------------------------- // // Method: CDefWordBreakerCF::CreateInstance // // Synopsis: Creates new CDefWordBreaker object // // Arguments: [pUnkOuter] -- 'Outer' IUnknown // [riid] -- Interface to bind // [ppvObject] -- Interface returned here // // History: 07-Feb-1995 SitaramR Created // //-------------------------------------------------------------------------- SCODE STDMETHODCALLTYPE CDefWordBreakerCF::CreateInstance( IUnknown * pUnkOuter, REFIID riid, void * * ppvObject ) { CDefWordBreaker *pIUnk = 0; SCODE sc = S_OK; TRY { pIUnk = new CDefWordBreaker(); sc = pIUnk->QueryInterface( riid , ppvObject ); pIUnk->Release(); // Release extra refcount from QueryInterface } CATCH(CException, e) { Win4Assert( 0 == pIUnk ); switch( e.GetErrorCode() ) { case E_OUTOFMEMORY: sc = (E_OUTOFMEMORY); break; default: sc = (E_UNEXPECTED); } } END_CATCH; return (sc); } //+------------------------------------------------------------------------- // // Method: CDefWordBreakerCF::LockServer // // Synopsis: Force class factory to remain loaded // // Arguments: [fLock] -- TRUE if locking, FALSE if unlocking // // Returns: S_OK // // History: 07-Feb-1995 SitaramR Created // //-------------------------------------------------------------------------- SCODE STDMETHODCALLTYPE CDefWordBreakerCF::LockServer(BOOL fLock) { if(fLock) InterlockedIncrement( &gulcInstances ); else InterlockedDecrement( &gulcInstances ); return(S_OK); }