You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
677 lines
24 KiB
677 lines
24 KiB
//+---------------------------------------------------------------------------
|
|
//
|
|
// Microsoft Windows
|
|
// Copyright (C) Microsoft Corporation, 1997
|
|
//
|
|
// File: IWBreak.cxx
|
|
//
|
|
// Contents: Thai Word Breaker glue code
|
|
//
|
|
// History: weibz, 10-Nov-1997 created
|
|
//
|
|
//----------------------------------------------------------------------------
|
|
|
|
#include <pch.cxx>
|
|
#include <filterr.h>
|
|
#include "iwbreak.hxx"
|
|
#include "thwbint.h"
|
|
#define MAX_BREAKS 255
|
|
#define WB_NORMAL 1
|
|
extern long gulcInstances;
|
|
|
|
//+---------------------------------------------------------------------------
|
|
//
|
|
// Member: CWordBreaker::CWordBreaker
|
|
//
|
|
// Synopsis: Constructor for the CWordBreaker class.
|
|
//
|
|
// Arguments: [lcid] -- locale id
|
|
//
|
|
//----------------------------------------------------------------------------
|
|
|
|
CWordBreaker::CWordBreaker( LCID lcid )
|
|
: _cRefs(1),
|
|
_lcid(lcid)
|
|
{
|
|
|
|
InterlockedIncrement( &gulcInstances );
|
|
|
|
}
|
|
|
|
//+---------------------------------------------------------------------------
|
|
//
|
|
// Member: CWordBreaker::~CWordBreaker
|
|
//
|
|
// Synopsis: Destructor for the CWordBreaker class.
|
|
//
|
|
// Notes: All termination/deallocation is done by embedded smart pointers
|
|
//
|
|
//----------------------------------------------------------------------------
|
|
|
|
CWordBreaker::~CWordBreaker()
|
|
{
|
|
InterlockedDecrement( &gulcInstances );
|
|
|
|
|
|
}
|
|
|
|
//+-------------------------------------------------------------------------
|
|
//
|
|
// Method: CWordBreaker::QueryInterface
|
|
//
|
|
// Synopsis: Rebind to other interface
|
|
//
|
|
// Arguments: [riid] -- IID of new interface
|
|
// [ppvObject] -- New interface * returned here
|
|
//
|
|
// Returns: S_OK if bind succeeded, E_NOINTERFACE if bind failed
|
|
//
|
|
//--------------------------------------------------------------------------
|
|
|
|
SCODE STDMETHODCALLTYPE
|
|
CWordBreaker::QueryInterface( REFIID riid, void ** ppvObject)
|
|
{
|
|
if ( 0 == ppvObject )
|
|
return E_INVALIDARG;
|
|
|
|
*ppvObject = 0;
|
|
|
|
if ( IID_IWordBreaker == riid )
|
|
*ppvObject = (IUnknown *)(IWordBreaker *)this;
|
|
else if ( IID_IUnknown == riid )
|
|
*ppvObject = (IUnknown *)this;
|
|
else
|
|
return E_NOINTERFACE;
|
|
|
|
AddRef();
|
|
|
|
return S_OK;
|
|
}
|
|
|
|
|
|
//+-------------------------------------------------------------------------
|
|
//
|
|
// Method: CWordBreaker::AddRef
|
|
//
|
|
// Synopsis: Increments refcount
|
|
//
|
|
//--------------------------------------------------------------------------
|
|
|
|
ULONG STDMETHODCALLTYPE
|
|
CWordBreaker::AddRef()
|
|
{
|
|
return InterlockedIncrement( &_cRefs );
|
|
}
|
|
|
|
//+-------------------------------------------------------------------------
|
|
//
|
|
// Method: CWordBreaker::Release
|
|
//
|
|
// Synopsis: Decrement refcount. Delete if necessary.
|
|
//
|
|
//--------------------------------------------------------------------------
|
|
|
|
ULONG STDMETHODCALLTYPE
|
|
CWordBreaker::Release()
|
|
{
|
|
unsigned long uTmp = InterlockedDecrement( &_cRefs );
|
|
|
|
if ( 0 == uTmp )
|
|
delete this;
|
|
|
|
|
|
return(uTmp);
|
|
}
|
|
|
|
//+-------------------------------------------------------------------------
|
|
//
|
|
// Method: CWordBreaker::Init
|
|
//
|
|
// Synopsis: Initialize word-breaker
|
|
//
|
|
// Arguments: [fQuery] -- TRUE if query-time
|
|
// [ulMaxTokenSize] -- Maximum size token stored by caller
|
|
// [pfLicense] -- Set to true if use restricted
|
|
//
|
|
// Returns: Status code
|
|
//
|
|
//--------------------------------------------------------------------------
|
|
|
|
SCODE STDMETHODCALLTYPE
|
|
CWordBreaker::Init(
|
|
BOOL fQuery,
|
|
ULONG ulMaxTokenSize,
|
|
BOOL *pfLicense )
|
|
{
|
|
if ( NULL == pfLicense ) {
|
|
return E_INVALIDARG;
|
|
}
|
|
|
|
|
|
if (IsBadWritePtr(pfLicense, sizeof(DWORD))) {
|
|
return E_INVALIDARG;
|
|
}
|
|
|
|
*pfLicense = TRUE;
|
|
_fQuery = fQuery;
|
|
_ulMaxTokenSize = ulMaxTokenSize;
|
|
|
|
|
|
return S_OK;
|
|
}
|
|
|
|
//+---------------------------------------------------------------------------
|
|
//
|
|
// Member: CWordBreaker::ComposePhrase
|
|
//
|
|
// Synopsis: Convert a noun and a modifier into a phrase.
|
|
//
|
|
// Arguments: [pwcNoun] -- pointer to noun.
|
|
// [cwcNoun] -- count of chars in pwcNoun
|
|
// [pwcModifier] -- pointer to word modifying pwcNoun
|
|
// [cwcModifier] -- count of chars in pwcModifier
|
|
// [ulAttachmentType] -- relationship between pwcNoun &pwcModifier
|
|
//
|
|
//----------------------------------------------------------------------------
|
|
SCODE STDMETHODCALLTYPE
|
|
CWordBreaker::ComposePhrase(
|
|
WCHAR const *pwcNoun,
|
|
ULONG cwcNoun,
|
|
WCHAR const *pwcModifier,
|
|
ULONG cwcModifier,
|
|
ULONG ulAttachmentType,
|
|
WCHAR *pwcPhrase,
|
|
ULONG *pcwcPhrase )
|
|
{
|
|
//
|
|
// Need to code in later
|
|
//
|
|
if ( _fQuery )
|
|
return( E_NOTIMPL );
|
|
else
|
|
return ( WBREAK_E_QUERY_ONLY );
|
|
}
|
|
|
|
//+---------------------------------------------------------------------------
|
|
//
|
|
// Member: CWordBreaker::GetLicenseToUse
|
|
//
|
|
// Synopsis: Returns a pointer to vendors license information
|
|
//
|
|
// Arguments: [ppwcsLicense] -- ptr to ptr to which license info is returned
|
|
//
|
|
//----------------------------------------------------------------------------
|
|
SCODE STDMETHODCALLTYPE
|
|
CWordBreaker::GetLicenseToUse(
|
|
const WCHAR **ppwcsLicense )
|
|
{
|
|
|
|
static WCHAR const * wcsCopyright = L"Copyright Microsoft, 1991-1998";
|
|
|
|
|
|
if ( NULL == ppwcsLicense ) {
|
|
return E_INVALIDARG;
|
|
}
|
|
|
|
if (IsBadWritePtr(ppwcsLicense, sizeof(DWORD))) {
|
|
return E_INVALIDARG;
|
|
}
|
|
|
|
*ppwcsLicense = wcsCopyright;
|
|
return( S_OK );
|
|
}
|
|
|
|
|
|
//+---------------------------------------------------------------------------
|
|
//
|
|
// Member: CWordBreaker::BreakText
|
|
//
|
|
// Synopsis: Break input stream into words.
|
|
//
|
|
// Arguments: [pTextSource] -- source of Unicode text
|
|
// [pWordSink] -- sink for collecting words
|
|
// [pPhraseSink] -- sink for collecting phrases
|
|
//
|
|
// History: 10-Nov-1997, WeibZ, Created.
|
|
//
|
|
// Notes: Since the input buffer may be greater than MAX_II_BUFFER_LEN
|
|
// we process the buffer in chunks of length MAX_II_BUFFER_LEN.
|
|
//
|
|
//----------------------------------------------------------------------------
|
|
|
|
SCODE STDMETHODCALLTYPE CWordBreaker::BreakText( TEXT_SOURCE *pTextSource,
|
|
IWordSink *pWordSink,
|
|
IPhraseSink *pPhraseSink )
|
|
{
|
|
SCODE sc = S_OK;
|
|
ULONG cwc;
|
|
SCRIPT_ITEM *pItems, *pItem_Next, *pItem_org;
|
|
SCRIPT_ANALYSIS *psa;
|
|
PCWSTR pwcInChars;
|
|
INT iItems;
|
|
BOOL bItemProc;
|
|
PCWSTR pwcChars;
|
|
INT cChars;
|
|
HRESULT retUSP;
|
|
BOOL fSucceeded = true;
|
|
|
|
if ( NULL == pTextSource ) {
|
|
return E_INVALIDARG;
|
|
}
|
|
|
|
if ( NULL == pWordSink )
|
|
{
|
|
// BUGBUG, propagate the null word sink error code
|
|
return sc;
|
|
}
|
|
|
|
|
|
if ( 0 != pPhraseSink )
|
|
{
|
|
// ignore the phrase sink for now
|
|
// return sc;
|
|
}
|
|
|
|
if (pTextSource->iEnd == pTextSource->iCur) {
|
|
return S_OK;
|
|
}
|
|
|
|
Assert( pTextSource->iCur < pTextSource->iEnd );
|
|
|
|
|
|
__try
|
|
{
|
|
do
|
|
{
|
|
|
|
if ( pTextSource->iCur >= pTextSource->iEnd )
|
|
continue;
|
|
|
|
cwc = pTextSource->iEnd - pTextSource->iCur;
|
|
pwcInChars = pTextSource->awcBuffer + pTextSource->iCur;
|
|
|
|
|
|
pItems = (SCRIPT_ITEM *)LocalAlloc(LPTR,sizeof(SCRIPT_ITEM)*(cwc+1));
|
|
|
|
if ( !pItems) {
|
|
|
|
return E_UNEXPECTED;
|
|
}
|
|
|
|
pItem_org = pItems;
|
|
|
|
|
|
iItems = 0;
|
|
retUSP = ScriptItemize(pwcInChars,cwc,cwc+1, NULL, NULL,
|
|
pItems, &iItems);
|
|
|
|
if (retUSP != S_OK) {
|
|
LocalFree(pItem_org);
|
|
return E_UNEXPECTED;
|
|
}
|
|
|
|
while ( iItems > 1 ) {
|
|
pItem_Next = pItems + 1;
|
|
pwcChars = pwcInChars + pItems->iCharPos;
|
|
cChars = pItem_Next->iCharPos - pItems->iCharPos;
|
|
|
|
sc = ProcessItem( pwcChars,
|
|
cChars,
|
|
pItems,
|
|
FALSE, // no need to keep chars
|
|
pTextSource,
|
|
pWordSink,
|
|
pPhraseSink);
|
|
|
|
if ( ( FAILED( sc ) ) &&
|
|
( FILTER_E_NO_MORE_VALUES != sc ) &&
|
|
( FILTER_E_NO_TEXT != sc ) &&
|
|
( FILTER_E_NO_VALUES != sc ) &&
|
|
( FILTER_E_NO_MORE_TEXT != sc ) &&
|
|
( FILTER_E_END_OF_CHUNKS != sc ) &&
|
|
( FILTER_E_EMBEDDING_UNAVAILABLE != sc ) &&
|
|
( WBREAK_E_END_OF_TEXT != sc ) ) {
|
|
LocalFree(pItem_org);
|
|
return sc;
|
|
}
|
|
|
|
sc = S_OK;
|
|
|
|
pItems++;
|
|
iItems--;
|
|
|
|
}
|
|
|
|
// special handle for the last item
|
|
if ( iItems == 1 ) {
|
|
|
|
pwcChars = pwcInChars + pItems->iCharPos;
|
|
cChars = pTextSource->iEnd - pTextSource->iCur;
|
|
|
|
sc = ProcessItem(pwcChars,
|
|
cChars,
|
|
pItems,
|
|
TRUE, // need to keep chars
|
|
pTextSource,
|
|
pWordSink,
|
|
pPhraseSink);
|
|
|
|
if ( ( FAILED( sc ) ) &&
|
|
( FILTER_E_NO_MORE_VALUES != sc ) &&
|
|
( FILTER_E_NO_TEXT != sc ) &&
|
|
( FILTER_E_NO_VALUES != sc ) &&
|
|
( FILTER_E_NO_MORE_TEXT != sc ) &&
|
|
( FILTER_E_END_OF_CHUNKS != sc ) &&
|
|
( FILTER_E_EMBEDDING_UNAVAILABLE != sc ) &&
|
|
( WBREAK_E_END_OF_TEXT != sc ) ) {
|
|
LocalFree(pItem_org);
|
|
return sc;
|
|
}
|
|
|
|
sc = S_OK;
|
|
}
|
|
|
|
if (pItem_org)
|
|
LocalFree(pItem_org);
|
|
|
|
// O11.17064. Under low memory it is possible to pfnFillTextBuffer to failed.
|
|
// We will need to return the error of TextSource for loging to Sharepoint.
|
|
sc = pTextSource->pfnFillTextBuffer(pTextSource);
|
|
fSucceeded = SUCCEEDED(sc);
|
|
|
|
if ( ( FAILED( sc ) ) &&
|
|
( FILTER_E_NO_MORE_VALUES != sc ) &&
|
|
( FILTER_E_NO_TEXT != sc ) &&
|
|
( FILTER_E_NO_VALUES != sc ) &&
|
|
( FILTER_E_NO_MORE_TEXT != sc ) &&
|
|
( FILTER_E_END_OF_CHUNKS != sc ) &&
|
|
( FILTER_E_EMBEDDING_UNAVAILABLE != sc ) &&
|
|
( WBREAK_E_END_OF_TEXT != sc ) ) {
|
|
return sc;
|
|
}
|
|
|
|
sc = S_OK;
|
|
|
|
} while (fSucceeded);
|
|
|
|
|
|
if ( pTextSource->iCur < pTextSource->iEnd ) {
|
|
|
|
cwc = pTextSource->iEnd - pTextSource->iCur;
|
|
pwcInChars = pTextSource->awcBuffer + pTextSource->iCur;
|
|
|
|
pItems = (SCRIPT_ITEM *)LocalAlloc(LPTR,sizeof(SCRIPT_ITEM)*(cwc+1));
|
|
|
|
if ( !pItems ) {
|
|
|
|
return E_UNEXPECTED;
|
|
}
|
|
|
|
pItem_org = pItems;
|
|
|
|
|
|
iItems = 0;
|
|
retUSP = ScriptItemize(pwcInChars,cwc,cwc+1, NULL, NULL,
|
|
pItems, &iItems);
|
|
|
|
if (retUSP != S_OK) {
|
|
LocalFree(pItem_org);
|
|
return E_UNEXPECTED;
|
|
}
|
|
|
|
while ( iItems > 1 ) {
|
|
pItem_Next = pItems + 1;
|
|
pwcChars = pwcInChars + pItems->iCharPos;
|
|
cChars = pItem_Next->iCharPos - pItems->iCharPos;
|
|
|
|
sc = ProcessItem(pwcChars,
|
|
cChars,
|
|
pItems,
|
|
FALSE, // no need to keep chars
|
|
pTextSource,
|
|
pWordSink,
|
|
pPhraseSink);
|
|
|
|
if ( ( FAILED( sc ) ) &&
|
|
( FILTER_E_NO_MORE_VALUES != sc ) &&
|
|
( FILTER_E_NO_TEXT != sc ) &&
|
|
( FILTER_E_NO_VALUES != sc ) &&
|
|
( FILTER_E_NO_MORE_TEXT != sc ) &&
|
|
( FILTER_E_END_OF_CHUNKS != sc ) &&
|
|
( FILTER_E_EMBEDDING_UNAVAILABLE != sc ) &&
|
|
( WBREAK_E_END_OF_TEXT != sc ) ) {
|
|
LocalFree(pItem_org);
|
|
return sc;
|
|
}
|
|
|
|
sc = S_OK;
|
|
|
|
pItems++;
|
|
iItems--;
|
|
}
|
|
|
|
if ( iItems == 1 ) {
|
|
|
|
pwcChars = pwcInChars + pItems->iCharPos;
|
|
cChars = pTextSource->iEnd - pTextSource->iCur;
|
|
|
|
sc = ProcessItem(pwcChars,
|
|
cChars,
|
|
pItems,
|
|
FALSE, // no need to keep chars
|
|
pTextSource,
|
|
pWordSink,
|
|
pPhraseSink);
|
|
|
|
if ( ( FAILED( sc ) ) &&
|
|
( FILTER_E_NO_MORE_VALUES != sc ) &&
|
|
( FILTER_E_NO_TEXT != sc ) &&
|
|
( FILTER_E_NO_VALUES != sc ) &&
|
|
( FILTER_E_NO_MORE_TEXT != sc ) &&
|
|
( FILTER_E_END_OF_CHUNKS != sc ) &&
|
|
( FILTER_E_EMBEDDING_UNAVAILABLE != sc ) &&
|
|
( WBREAK_E_END_OF_TEXT != sc ) ) {
|
|
LocalFree(pItem_org);
|
|
return sc;
|
|
}
|
|
|
|
sc = S_OK;
|
|
}
|
|
|
|
if ( pItem_org )
|
|
LocalFree(pItem_org);
|
|
}
|
|
|
|
} __except(1) {
|
|
|
|
sc = E_UNEXPECTED;
|
|
}
|
|
|
|
return sc;
|
|
}
|
|
|
|
//+---------------------------------------------------------------------------
//
//  Member:     CWordBreaker::ProcessItem
//
//  Synopsis:   Break one script item into words and send them to the word
//              sink, advancing pTextSource->iCur past what was consumed.
//
//  Arguments:  [pwcChars]    -- first character of the item
//              [cChars]      -- number of characters in the item
//              [pItems]      -- SCRIPT_ITEM for this item; only a.eScript
//                               is read
//              [fKeep]       -- English items only: when TRUE, a word that
//                               runs up to the end of the item is neither
//                               emitted nor consumed
//              [pTextSource] -- text source whose iCur is advanced
//              [pWordSink]   -- receives PutWord / PutAltWord / PutBreak
//              [pPhraseSink] -- unused
//
//  Returns:    S_OK           -- item processed (or cChars <= 0)
//              E_UNEXPECTED   -- script properties unavailable or the item's
//                               script index is out of range
//              E_OUTOFMEMORY  -- Thai break buffers could not be allocated
//              otherwise the first non-S_OK status returned by the sink
//
//  Notes:      The language of the item's script selects the handling:
//              Thai text goes through the THWB_* breaker, English text is
//              split on white space (WS) and punctuation (PS) as classified
//              by GetCharType, and anything else is skipped.
//
//----------------------------------------------------------------------------

SCODE CWordBreaker::ProcessItem(
    PCWSTR          pwcChars,
    INT             cChars,
    SCRIPT_ITEM    *pItems,
    BOOL            fKeep,
    TEXT_SOURCE    *pTextSource,
    IWordSink      *pWordSink,
    IPhraseSink    *pPhraseSink )
{
    const SCRIPT_PROPERTIES **ppScriptProperties = NULL;
    INT     cScripts = 0;
    WORD    iScript;
    DWORD   LangID;
    SCODE   scRetVal = S_OK;

    // Nothing to do for an empty item.  This also keeps the Thai path from
    // writing into a zero-length buffer and the English path from emitting
    // a zero-length word.
    if ( cChars <= 0 )
        return S_OK;

    iScript = pItems->a.eScript;

    // Do not index the script table unless it was really returned and the
    // index lies inside it.
    if ( FAILED( ScriptGetProperties( &ppScriptProperties, &cScripts ) ) ||
         NULL == ppScriptProperties ||
         (INT)iScript >= cScripts )
    {
        return E_UNEXPECTED;
    }

    LangID = (ppScriptProperties[iScript])->langid;

    switch ( LangID ) {

    case LANG_THAI:
        {
            WCHAR       *pwch        = (WCHAR *) pwcChars;
            BYTE        *pBreakPos   = NULL;    // length of each word found
            THWB_STRUCT *pThwbStruct = NULL;
            int          cBreaks;
            int          iBreak;

            pBreakPos = new BYTE[cChars];

            // Report the failure; FALSE would be read as S_OK by the caller.
            if ( NULL == pBreakPos )
                return E_OUTOFMEMORY;

            pThwbStruct = THWB_CreateThwbStruct( cChars );

            // pThwbStruct is indexed below, so it must not be null.
            if ( NULL == pThwbStruct )
            {
                delete [] pBreakPos;
                return E_OUTOFMEMORY;
            }

            pBreakPos[0] = 0;
            cBreaks = THWB_IndexWordBreak( pwch, cChars, pBreakPos,
                                           pThwbStruct, cChars );

            for ( iBreak = 0; iBreak < cBreaks; iBreak++ )
            {
                const BYTE cwcWord = pBreakPos[iBreak];

                // Alternate words are only produced when not at query time.
                if ( pThwbStruct[iBreak].alt != 0 && !_fQuery )
                {
                    BYTE aAltBreakPos[5];
                    int  cAltWords;
                    int  iAltStart = 0;     // offset of the next alt word
                    int  k;

                    // Find alternate words
                    cAltWords = THWB_FindAltWord( pwch, cwcWord,
                                                  pThwbStruct[iBreak].alt,
                                                  aAltBreakPos );

                    // Put alternate words.
                    for ( k = 0; k < cAltWords; k++ )
                    {
                        scRetVal = pWordSink->PutAltWord( aAltBreakPos[k],
                                                          &pwch[iAltStart],
                                                          cwcWord,
                                                          pTextSource->iCur );
                        iAltStart += aAltBreakPos[k];
                    }
                }

                // if PutAltWord not okay return.
                if ( scRetVal != S_OK )
                    break;

                // Only segments that start with a character in the range
                // THAI_Ko_Kai..THAI_Vowel_MaiYaMok are emitted as words.
                if ( *pwch >= THAI_Ko_Kai && *pwch <= THAI_Vowel_MaiYaMok )
                    scRetVal = pWordSink->PutWord( cwcWord, pwch, cwcWord,
                                                   pTextSource->iCur );

                if ( scRetVal != S_OK )
                    break;

                pTextSource->iCur += cwcWord;
                pwch += cwcWord;
            }

            // Allocated with new[], so it must be released with delete[].
            delete [] pBreakPos;

            // Prefix bug 1055941 - clear allocated memory.
            THWB_DeleteThwbStruct( pThwbStruct );

            break;
        }

    case LANG_ENGLISH :  // handle English chars
        {
            BOOL fInWord    = FALSE;    // TRUE while inside a word
            WT   Type       = WT_START; // kind of the last thing seen
            INT  iWordStart = 0;
            INT  iWordLen;
            INT  iChar;
            BYTE ct;

            for ( iChar = 0; iChar < cChars; iChar++ )
            {
                ct = GetCharType( pwcChars[iChar] );

                // Everything that is neither white space nor punctuation
                // counts as a word character.
                if ( (ct != WS) && (ct != PS) )
                {
                    if ( !fInWord ) {
                        iWordStart = iChar;
                        fInWord    = TRUE;
                        Type       = WT_ROMAJI;
                    }
                    continue;
                }

                // A separator ends the word in progress, if any.
                if ( fInWord ) {
                    iWordLen = iChar - iWordStart;

                    scRetVal = pWordSink->PutWord( iWordLen,
                                                   pwcChars + iWordStart,
                                                   iWordLen,
                                                   pTextSource->iCur );

                    pTextSource->iCur += iWordLen;
                    fInWord = FALSE;
                }

                if ( PS == ct )
                {
                    Type = WT_PHRASE_SEP;

                    // Do not let PutBreak overwrite a PutWord failure.
                    if ( S_OK == scRetVal )
                        scRetVal = pWordSink->PutBreak( WORDREP_BREAK_EOS );
                }
                else
                {
                    Type = WT_WORD_SEP;
                }

                // Consume the separator itself.
                pTextSource->iCur++;

                if ( scRetVal != S_OK )
                    break;
            }

            // The item ended on a separator (or stopped on an error at
            // one): there is no pending word.
            if ( (Type == WT_WORD_SEP) || (Type == WT_PHRASE_SEP) )
                break;

            // The caller asked for a trailing word to be left unconsumed.
            if ( fKeep )
                break;

            if ( scRetVal != S_OK )
                break;

            // Emit the word that runs to the end of the item.
            iWordLen = cChars - iWordStart;
            scRetVal = pWordSink->PutWord( iWordLen, pwcChars + iWordStart,
                                           iWordLen, pTextSource->iCur );
            pTextSource->iCur += iWordLen;

            break;
        }

    default:

        // Other scripts are not broken into words; just skip the item.
        pTextSource->iCur += cChars;
        break;
    }

    return scRetVal;
}
|