|
|
//+---------------------------------------------------------------------------
//
// Microsoft Windows
// Copyright (C) Microsoft Corporation, 1997
//
// File: IWBreak.cxx
//
// Contents: Thai Word Breaker glue code
//
// History: weibz, 10-Nov-1997 created
//
//----------------------------------------------------------------------------
#include <pch.cxx>
#include <filterr.h>
#include "iwbreak.hxx"
#include "thwbint.h"
#define MAX_BREAKS 255
#define WB_NORMAL 1
extern long gulcInstances;
//+---------------------------------------------------------------------------
//
// Member: CWordBreaker::CWordBreaker
//
// Synopsis: Constructs a word breaker holding one reference and records the
//           locale it was created for. Also bumps the module-wide instance
//           counter (gulcInstances).
//
// Arguments: [lcid] -- locale id, stored in _lcid
//
// Notes: _fQuery and _ulMaxTokenSize are normally set by Init(). They are
//        given defined defaults here because ComposePhrase() and
//        ProcessItem() read _fQuery, and would otherwise read an
//        indeterminate value if a caller skipped Init().
//
//----------------------------------------------------------------------------
CWordBreaker::CWordBreaker( LCID lcid )
    : _cRefs(1),
      _lcid(lcid)
{
    // Assigned in the body (not the initializer list) so the result does not
    // depend on the member declaration order in the class header.
    _fQuery = FALSE;
    _ulMaxTokenSize = 0;

    InterlockedIncrement( &gulcInstances );
}
//+---------------------------------------------------------------------------
//
// Member: CWordBreaker::~CWordBreaker
//
// Synopsis: Destructor for the CWordBreaker class.
//
// Notes: The only explicit work is dropping the module-wide instance
//        counter (gulcInstances) that the constructor incremented.
//
//----------------------------------------------------------------------------
CWordBreaker::~CWordBreaker()
{
    InterlockedDecrement( &gulcInstances );
}
//+-------------------------------------------------------------------------
//
// Method: CWordBreaker::QueryInterface
//
// Synopsis: Hands out IWordBreaker or IUnknown on this object.
//
// Arguments: [riid]      -- IID of the requested interface
//            [ppvObject] -- receives the interface pointer; set to 0 when
//                           the interface is not supported
//
// Returns: S_OK          -- interface returned and a reference added
//          E_NOINTERFACE -- [riid] is neither IWordBreaker nor IUnknown
//          E_INVALIDARG  -- [ppvObject] is null
//
//--------------------------------------------------------------------------
SCODE STDMETHODCALLTYPE CWordBreaker::QueryInterface( REFIID riid, void ** ppvObject )
{
    if ( 0 == ppvObject )
    {
        return E_INVALIDARG;
    }

    // Clear the out parameter first so a failed query never leaves it stale.
    *ppvObject = 0;

    IUnknown * pUnkFound;

    if ( IID_IWordBreaker == riid )
    {
        pUnkFound = (IUnknown *)(IWordBreaker *)this;
    }
    else if ( IID_IUnknown == riid )
    {
        pUnkFound = (IUnknown *)this;
    }
    else
    {
        return E_NOINTERFACE;
    }

    *ppvObject = pUnkFound;

    // The caller now owns a reference.
    AddRef();

    return S_OK;
}
//+-------------------------------------------------------------------------
//
// Method: CWordBreaker::AddRef
//
// Synopsis: Atomically increments the reference count.
//
// Returns: The reference count after the increment.
//
//--------------------------------------------------------------------------
ULONG STDMETHODCALLTYPE CWordBreaker::AddRef()
{
    ULONG cRefsNow = InterlockedIncrement( &_cRefs );

    return cRefsNow;
}
//+-------------------------------------------------------------------------
//
// Method: CWordBreaker::Release
//
// Synopsis: Atomically decrements the reference count and deletes the
//           object when the count reaches zero.
//
// Returns: The reference count after the decrement.
//
//--------------------------------------------------------------------------
ULONG STDMETHODCALLTYPE CWordBreaker::Release()
{
    unsigned long cRemaining = InterlockedDecrement( &_cRefs );

    if ( 0 != cRemaining )
    {
        return cRemaining;
    }

    // Last reference is gone; no member may be touched after this point.
    delete this;

    return 0;
}
//+-------------------------------------------------------------------------
//
// Method: CWordBreaker::Init
//
// Synopsis: Records the caller's settings for later calls.
//
// Arguments: [fQuery]         -- TRUE if query-time; stored in _fQuery
//            [ulMaxTokenSize] -- maximum size token stored by caller;
//                                stored in _ulMaxTokenSize
//            [pfLicense]      -- receives TRUE on success
//
// Returns: S_OK on success, E_INVALIDARG if [pfLicense] is null or is not
//          writable.
//
//--------------------------------------------------------------------------
SCODE STDMETHODCALLTYPE CWordBreaker::Init( BOOL fQuery, ULONG ulMaxTokenSize, BOOL *pfLicense )
{
    // Null is rejected before the pointer is probed for writability.
    if ( NULL == pfLicense || IsBadWritePtr( pfLicense, sizeof(DWORD) ) )
    {
        return E_INVALIDARG;
    }

    *pfLicense = TRUE;

    _fQuery = fQuery;
    _ulMaxTokenSize = ulMaxTokenSize;

    return S_OK;
}
//+---------------------------------------------------------------------------
//
// Member: CWordBreaker::ComposePhrase
//
// Synopsis: Placeholder for converting a noun and a modifier into a phrase.
//           Not implemented; none of the arguments are examined.
//
// Arguments: [pwcNoun]          -- pointer to noun
//            [cwcNoun]          -- count of chars in pwcNoun
//            [pwcModifier]      -- pointer to word modifying pwcNoun
//            [cwcModifier]      -- count of chars in pwcModifier
//            [ulAttachmentType] -- relationship between pwcNoun & pwcModifier
//            [pwcPhrase]        -- phrase buffer (never written)
//            [pcwcPhrase]       -- phrase length (never written)
//
// Returns: E_NOTIMPL when the breaker was initialized for query time,
//          WBREAK_E_QUERY_ONLY otherwise.
//
//----------------------------------------------------------------------------
SCODE STDMETHODCALLTYPE CWordBreaker::ComposePhrase( WCHAR const *pwcNoun,
                                                     ULONG cwcNoun,
                                                     WCHAR const *pwcModifier,
                                                     ULONG cwcModifier,
                                                     ULONG ulAttachmentType,
                                                     WCHAR *pwcPhrase,
                                                     ULONG *pcwcPhrase )
{
    //
    // Need to code in later
    //
    SCODE scResult = WBREAK_E_QUERY_ONLY;

    if ( _fQuery )
    {
        scResult = E_NOTIMPL;
    }

    return scResult;
}
//+---------------------------------------------------------------------------
//
// Member: CWordBreaker::GetLicenseToUse
//
// Synopsis: Returns a pointer to the vendor's license (copyright) string.
//
// Arguments: [ppwcsLicense] -- receives a pointer to a static string; the
//                              caller must not free or modify it
//
// Returns: S_OK on success, E_INVALIDARG if [ppwcsLicense] is null or is
//          not writable.
//
//----------------------------------------------------------------------------
SCODE STDMETHODCALLTYPE CWordBreaker::GetLicenseToUse( const WCHAR **ppwcsLicense )
{
    static WCHAR const * wcsCopyright = L"Copyright Microsoft, 1991-1998";

    if ( NULL == ppwcsLicense )
    {
        return E_INVALIDARG;
    }

    // Probe the full width of the slot that is about to be written. The slot
    // holds a pointer, which is wider than a DWORD on 64-bit builds, so a
    // sizeof(DWORD) probe would only validate half of it there.
    if ( IsBadWritePtr( ppwcsLicense, sizeof(*ppwcsLicense) ) )
    {
        return E_INVALIDARG;
    }

    *ppwcsLicense = wcsCopyright;

    return S_OK;
}
//+---------------------------------------------------------------------------
//
// Member: CWordBreaker::BreakText
//
// Synopsis: Break input stream into words.
//
// Arguments: [pTextSource] -- source of Unicode text
// [pWordSink] -- sink for collecting words
// [pPhraseSink] -- sink for collecting phrases
//
// History: 10-Nov-1997, WeibZ, Created.
//
// Notes: Since the input buffer may be greater than MAX_II_BUFFER_LEN
// we process the buffer in chunks of length MAX_II_BUFFER_LEN.
//
//----------------------------------------------------------------------------
SCODE STDMETHODCALLTYPE CWordBreaker::BreakText( TEXT_SOURCE *pTextSource, IWordSink *pWordSink, IPhraseSink *pPhraseSink ) { SCODE sc = S_OK; ULONG cwc; SCRIPT_ITEM *pItems, *pItem_Next, *pItem_org; SCRIPT_ANALYSIS *psa; PCWSTR pwcInChars; INT iItems; BOOL bItemProc; PCWSTR pwcChars; INT cChars; HRESULT retUSP; BOOL fSucceeded = true;
if ( NULL == pTextSource ) { return E_INVALIDARG; }
if ( NULL == pWordSink ) { // BUGBUG, propagate the null word sink error code
return sc; }
if ( 0 != pPhraseSink ) { // ignore the phrase sink for now
// return sc;
}
if (pTextSource->iEnd == pTextSource->iCur) { return S_OK; }
Assert( pTextSource->iCur < pTextSource->iEnd );
__try { do {
if ( pTextSource->iCur >= pTextSource->iEnd ) continue;
cwc = pTextSource->iEnd - pTextSource->iCur; pwcInChars = pTextSource->awcBuffer + pTextSource->iCur; pItems = (SCRIPT_ITEM *)LocalAlloc(LPTR,sizeof(SCRIPT_ITEM)*(cwc+1));
if ( !pItems) {
return E_UNEXPECTED; }
pItem_org = pItems;
iItems = 0; retUSP = ScriptItemize(pwcInChars,cwc,cwc+1, NULL, NULL, pItems, &iItems);
if (retUSP != S_OK) { LocalFree(pItem_org); return E_UNEXPECTED; }
while ( iItems > 1 ) { pItem_Next = pItems + 1; pwcChars = pwcInChars + pItems->iCharPos; cChars = pItem_Next->iCharPos - pItems->iCharPos;
sc = ProcessItem( pwcChars, cChars, pItems, FALSE, // no need to keep chars
pTextSource, pWordSink, pPhraseSink);
if ( ( FAILED( sc ) ) && ( FILTER_E_NO_MORE_VALUES != sc ) && ( FILTER_E_NO_TEXT != sc ) && ( FILTER_E_NO_VALUES != sc ) && ( FILTER_E_NO_MORE_TEXT != sc ) && ( FILTER_E_END_OF_CHUNKS != sc ) && ( FILTER_E_EMBEDDING_UNAVAILABLE != sc ) && ( WBREAK_E_END_OF_TEXT != sc ) ) { LocalFree(pItem_org); return sc; }
sc = S_OK;
pItems++; iItems--;
}
// special handle for the last item
if ( iItems == 1 ) {
pwcChars = pwcInChars + pItems->iCharPos; cChars = pTextSource->iEnd - pTextSource->iCur;
sc = ProcessItem(pwcChars, cChars, pItems, TRUE, // need to keep chars
pTextSource, pWordSink, pPhraseSink);
if ( ( FAILED( sc ) ) && ( FILTER_E_NO_MORE_VALUES != sc ) && ( FILTER_E_NO_TEXT != sc ) && ( FILTER_E_NO_VALUES != sc ) && ( FILTER_E_NO_MORE_TEXT != sc ) && ( FILTER_E_END_OF_CHUNKS != sc ) && ( FILTER_E_EMBEDDING_UNAVAILABLE != sc ) && ( WBREAK_E_END_OF_TEXT != sc ) ) { LocalFree(pItem_org); return sc; }
sc = S_OK; }
if (pItem_org) LocalFree(pItem_org);
// O11.17064. Under low memory it is possible to pfnFillTextBuffer to failed.
// We will need to return the error of TextSource for loging to Sharepoint.
sc = pTextSource->pfnFillTextBuffer(pTextSource); fSucceeded = SUCCEEDED(sc);
if ( ( FAILED( sc ) ) && ( FILTER_E_NO_MORE_VALUES != sc ) && ( FILTER_E_NO_TEXT != sc ) && ( FILTER_E_NO_VALUES != sc ) && ( FILTER_E_NO_MORE_TEXT != sc ) && ( FILTER_E_END_OF_CHUNKS != sc ) && ( FILTER_E_EMBEDDING_UNAVAILABLE != sc ) && ( WBREAK_E_END_OF_TEXT != sc ) ) { return sc; }
sc = S_OK;
} while (fSucceeded);
if ( pTextSource->iCur < pTextSource->iEnd ) {
cwc = pTextSource->iEnd - pTextSource->iCur; pwcInChars = pTextSource->awcBuffer + pTextSource->iCur; pItems = (SCRIPT_ITEM *)LocalAlloc(LPTR,sizeof(SCRIPT_ITEM)*(cwc+1));
if ( !pItems ) {
return E_UNEXPECTED; }
pItem_org = pItems;
iItems = 0; retUSP = ScriptItemize(pwcInChars,cwc,cwc+1, NULL, NULL, pItems, &iItems);
if (retUSP != S_OK) { LocalFree(pItem_org); return E_UNEXPECTED; }
while ( iItems > 1 ) { pItem_Next = pItems + 1; pwcChars = pwcInChars + pItems->iCharPos; cChars = pItem_Next->iCharPos - pItems->iCharPos; sc = ProcessItem(pwcChars, cChars, pItems, FALSE, // no need to keep chars
pTextSource, pWordSink, pPhraseSink);
if ( ( FAILED( sc ) ) && ( FILTER_E_NO_MORE_VALUES != sc ) && ( FILTER_E_NO_TEXT != sc ) && ( FILTER_E_NO_VALUES != sc ) && ( FILTER_E_NO_MORE_TEXT != sc ) && ( FILTER_E_END_OF_CHUNKS != sc ) && ( FILTER_E_EMBEDDING_UNAVAILABLE != sc ) && ( WBREAK_E_END_OF_TEXT != sc ) ) { LocalFree(pItem_org); return sc; }
sc = S_OK;
pItems++; iItems--; }
if ( iItems == 1 ) { pwcChars = pwcInChars + pItems->iCharPos; cChars = pTextSource->iEnd - pTextSource->iCur;
sc = ProcessItem(pwcChars, cChars, pItems, FALSE, // no need to keep chars
pTextSource, pWordSink, pPhraseSink);
if ( ( FAILED( sc ) ) && ( FILTER_E_NO_MORE_VALUES != sc ) && ( FILTER_E_NO_TEXT != sc ) && ( FILTER_E_NO_VALUES != sc ) && ( FILTER_E_NO_MORE_TEXT != sc ) && ( FILTER_E_END_OF_CHUNKS != sc ) && ( FILTER_E_EMBEDDING_UNAVAILABLE != sc ) && ( WBREAK_E_END_OF_TEXT != sc ) ) { LocalFree(pItem_org); return sc; }
sc = S_OK; }
if ( pItem_org ) LocalFree(pItem_org); }
} __except(1) {
sc = E_UNEXPECTED; }
return sc; }
//+---------------------------------------------------------------------------
//
// Member: CWordBreaker::ProcessItem
//
// Synopsis: Breaks one script run into words and feeds them to the word
//           sink, advancing pTextSource->iCur past the text it consumed.
//
// Arguments: [pwcChars]    -- first character of the run
//            [cChars]      -- number of characters in the run
//            [pItems]      -- script item describing the run; its
//                             a.eScript selects the language handling
//            [fKeep]       -- TRUE to leave an unterminated trailing word
//                             of an English run unconsumed (the Thai and
//                             default paths ignore it)
//            [pTextSource] -- text source whose iCur is advanced
//            [pWordSink]   -- receives PutWord / PutAltWord / PutBreak
//            [pPhraseSink] -- not used
//
// Returns: S_OK, the first non-S_OK status returned by the word sink,
//          E_OUTOFMEMORY if a working buffer cannot be allocated, or
//          E_UNEXPECTED if the script property table is unavailable.
//
// Notes: Thai runs are segmented with the THWB_* routines; alternate words
//        are emitted only when not at query time. English runs are split on
//        the WS / PS character classes from GetCharType. Runs in any other
//        language are skipped without emitting anything.
//
//----------------------------------------------------------------------------
SCODE CWordBreaker::ProcessItem( PCWSTR pwcChars,
                                 INT cChars,
                                 SCRIPT_ITEM *pItems,
                                 BOOL fKeep,
                                 TEXT_SOURCE *pTextSource,
                                 IWordSink *pWordSink,
                                 IPhraseSink *pPhraseSink )
{
    INT   iChar;
    INT   iWordStart, iWordLen;
    const SCRIPT_PROPERTIES **pScript_Properties = NULL;
    DWORD LangID;
    WORD  iScript;
    SCODE scRetVal = S_OK;

    // Without the property table the script index cannot be mapped to a
    // language, so fail instead of indexing through an unset pointer.
    if ( FAILED( ScriptGetProperties( &pScript_Properties, NULL ) ) ||
         NULL == pScript_Properties )
    {
        return E_UNEXPECTED;
    }

    iScript = pItems->a.eScript;
    LangID  = (pScript_Properties[iScript])->langid;

    switch ( LangID )
    {
    case LANG_THAI:
        {
            BYTE        *pBreakPos;
            int          iNumberOfBreak = 0;
            int          i;
            WCHAR       *pwch = (WCHAR *) pwcChars;
            THWB_STRUCT *pThwbStruct = NULL;

            // Nothing to segment; also keeps the pBreakPos[0] store below
            // from landing in a zero-length buffer.
            if ( cChars <= 0 )
            {
                break;
            }

            // LocalAlloc reports failure by returning NULL, so the check
            // below is reliable, and it matches the allocator BreakText
            // already uses.
            pBreakPos = (BYTE *)LocalAlloc( LPTR, cChars );
            if ( NULL == pBreakPos )
            {
                // Report the failure; FALSE would be read as S_OK.
                return E_OUTOFMEMORY;
            }

            pThwbStruct = THWB_CreateThwbStruct( cChars );
            if ( NULL == pThwbStruct )
            {
                LocalFree( pBreakPos );
                return E_OUTOFMEMORY;
            }

            pBreakPos[0] = 0;
            iNumberOfBreak = THWB_IndexWordBreak( pwch, cChars, pBreakPos, pThwbStruct, cChars );

            for ( i = 0; i < iNumberOfBreak; i++ )
            {
                // Search index alternate words.
                // If not query create Alternate word.
                if ( pThwbStruct[i].alt != 0 && !_fQuery )
                {
                    int    iNumAltWord = 0, k;
                    BYTE   pAltBreakPos[5];
                    WCHAR *word1 = pwch;
                    int    indexWord1 = 0;

                    // Find Alternate words
                    iNumAltWord = THWB_FindAltWord( word1, pBreakPos[i],
                                                    pThwbStruct[i].alt, pAltBreakPos );

                    // Put alternate words.
                    for ( k = 0; k < iNumAltWord; k++ )
                    {
                        scRetVal = pWordSink->PutAltWord( pAltBreakPos[k],
                                                          &word1[indexWord1],
                                                          pBreakPos[i],
                                                          pTextSource->iCur );
                        indexWord1 += pAltBreakPos[k];
                    }
                }

                // if PutAltWord not okay return.
                if ( scRetVal != S_OK )
                    break;

                // Only segments that start with a character in the
                // THAI_Ko_Kai..THAI_Vowel_MaiYaMok range are emitted.
                if ( *pwch >= THAI_Ko_Kai && *pwch <= THAI_Vowel_MaiYaMok )
                    scRetVal = pWordSink->PutWord( pBreakPos[i], pwch,
                                                   pBreakPos[i], pTextSource->iCur );

                if ( scRetVal != S_OK )
                    break;

                pTextSource->iCur += pBreakPos[i];
                pwch += pBreakPos[i];
            }

            LocalFree( pBreakPos );

            // Prefix bug 1055941 - clear allocated memory.
            THWB_DeleteThwbStruct( pThwbStruct );
        }
        break;

    case LANG_ENGLISH:  // handle English chars
        {
            BYTE         ct;
            BOOL         fRomanWord = FALSE;
            CONST WCHAR *pwcInput;
            WT           Type;

            Type = WT_START;
            pwcInput = pwcChars;
            iWordStart = 0;

            for ( iChar = 0; iChar < cChars; iChar++, pwcInput++ )
            {
                ct = GetCharType( *pwcInput );

                // Everything that is not a word or phrase separator counts
                // as an ordinary character.
                if ( (ct != WS) && (ct != PS) )
                    ct = CH;

                switch ( ct )
                {
                case CH:
                    if ( !fRomanWord )
                    {
                        iWordStart = iChar;
                        fRomanWord = TRUE;
                        Type = WT_ROMAJI;
                    }
                    break;

                case WS:
                    if ( fRomanWord )
                    {
                        iWordLen = iChar - iWordStart;
                        scRetVal = pWordSink->PutWord( iWordLen, pwcChars + iWordStart,
                                                       iWordLen, pTextSource->iCur );
                        pTextSource->iCur += iWordLen;
                        fRomanWord = FALSE;
                    }
                    Type = WT_WORD_SEP;
                    pTextSource->iCur++;
                    break;

                case PS:
                    if ( fRomanWord )
                    {
                        iWordLen = iChar - iWordStart;
                        scRetVal = pWordSink->PutWord( iWordLen, pwcChars + iWordStart,
                                                       iWordLen, pTextSource->iCur );
                        pTextSource->iCur += iWordLen;
                        fRomanWord = FALSE;
                    }
                    Type = WT_PHRASE_SEP;

                    // Do not let PutBreak overwrite a failure that PutWord
                    // just reported.
                    if ( S_OK == scRetVal )
                        scRetVal = pWordSink->PutBreak( WORDREP_BREAK_EOS );

                    pTextSource->iCur++;
                    break;
                }

                if ( scRetVal != S_OK )
                    break;
            }

            // The run ended on a separator: nothing is left pending.
            if ( (Type == WT_WORD_SEP) || (Type == WT_PHRASE_SEP) )
                break;

            // The caller asked to keep the trailing word unconsumed.
            if ( fKeep )
                break;

            if ( scRetVal != S_OK )
                break;

            iWordLen = cChars - iWordStart;
            scRetVal = pWordSink->PutWord( iWordLen, pwcChars + iWordStart,
                                           iWordLen, pTextSource->iCur );
            pTextSource->iCur += iWordLen;
        }
        break;

    default:
        // Other languages: consume the run without emitting anything.
        pTextSource->iCur += cChars;
        break;
    }

    return scRetVal;
}
|