windows-server-2003/inetsrv/intlwb/thai2/wb/iwbreak.cxx


								//+---------------------------------------------------------------------------

								//

								//  Microsoft Windows

								//  Copyright (C) Microsoft Corporation, 1997

								//

								//  File:       IWBreak.cxx

								//

								//  Contents:   Thai  Word Breaker glue code

								//

								//  History:   weibz,   10-Nov-1997   created

								//

								//----------------------------------------------------------------------------


								#include <pch.cxx>

								#include <filterr.h>

								#include "iwbreak.hxx"

								#include "thwbint.h"

								#define MAX_BREAKS 255

								#define WB_NORMAL 1

								extern  long            gulcInstances;


								//+---------------------------------------------------------------------------
								//
								//  Member:     CWordBreaker::CWordBreaker
								//
								//  Synopsis:   Constructor for the CWordBreaker class.  The object starts
								//              with a reference count of 1 and is counted in the global
								//              instance counter gulcInstances.
								//
								//  Arguments:  [lcid] -- locale id, stored in _lcid
								//
								//  Notes:      _fQuery and _ulMaxTokenSize are normally set by Init(), but
								//              ComposePhrase() and ProcessItem() read _fQuery, so both
								//              members get a defined value here in case a caller skips
								//              Init().
								//
								//----------------------------------------------------------------------------

								CWordBreaker::CWordBreaker( LCID lcid )
								        : _cRefs(1),
								          _lcid(lcid)
								{
								    // Defined defaults until Init() supplies the real values.
								    _fQuery = FALSE;
								    _ulMaxTokenSize = 0;

								    InterlockedIncrement( &gulcInstances );
								}


								//+---------------------------------------------------------------------------
								//
								//  Member:     CWordBreaker::~CWordBreaker
								//
								//  Synopsis:   Destructor for the CWordBreaker class.
								//
								//  Notes:      The only work done here is removing this object from the
								//              global instance count (gulcInstances) that the constructor
								//              incremented.
								//
								//----------------------------------------------------------------------------

								CWordBreaker::~CWordBreaker()
								{
								    // Balance the InterlockedIncrement performed by the constructor.
								    InterlockedDecrement( &gulcInstances );
								}


								//+-------------------------------------------------------------------------
								//
								//  Method:     CWordBreaker::QueryInterface
								//
								//  Synopsis:   Rebind to other interface.  Only IID_IWordBreaker and
								//              IID_IUnknown are recognized.
								//
								//  Arguments:  [riid]      -- IID of new interface
								//              [ppvObject] -- New interface * returned here; set to 0 when
								//                             the interface is not supported
								//
								//  Returns:    S_OK if bind succeeded (the object is AddRef'ed),
								//              E_NOINTERFACE if bind failed,
								//              E_INVALIDARG if ppvObject is null
								//
								//--------------------------------------------------------------------------

								SCODE STDMETHODCALLTYPE
								CWordBreaker::QueryInterface( REFIID riid, void  ** ppvObject)
								{
								    IUnknown * pUnkResult = 0;

								    if ( 0 == ppvObject )
								        return E_INVALIDARG;

								    // Pick the interface pointer first, then publish it exactly once.
								    if ( IID_IWordBreaker == riid )
								    {
								        pUnkResult = (IUnknown *)(IWordBreaker *)this;
								    }
								    else if ( IID_IUnknown == riid )
								    {
								        pUnkResult = (IUnknown *)this;
								    }

								    *ppvObject = pUnkResult;

								    if ( 0 == pUnkResult )
								        return E_NOINTERFACE;

								    // The caller now holds a reference to this object.
								    AddRef();

								    return S_OK;
								}


								//+-------------------------------------------------------------------------
								//
								//  Method:     CWordBreaker::AddRef
								//
								//  Synopsis:   Increments refcount
								//
								//  Returns:    The value returned by InterlockedIncrement on _cRefs
								//
								//--------------------------------------------------------------------------

								ULONG STDMETHODCALLTYPE
								CWordBreaker::AddRef()
								{
								    ULONG cRefsNow = InterlockedIncrement( &_cRefs );

								    return cRefsNow;
								}


								//+-------------------------------------------------------------------------
								//
								//  Method:     CWordBreaker::Release
								//
								//  Synopsis:   Decrement refcount.  Delete if necessary.
								//
								//  Returns:    The decremented reference count; 0 means the object has
								//              been deleted and must not be touched again.
								//
								//--------------------------------------------------------------------------

								ULONG STDMETHODCALLTYPE
								CWordBreaker::Release()
								{
								    unsigned long cRemaining = InterlockedDecrement( &_cRefs );

								    // Still referenced: nothing else to do.
								    if ( 0 != cRemaining )
								        return(cRemaining);

								    // Last reference dropped.  No member may be accessed after this line.
								    delete this;

								    return(0);
								}


								//+-------------------------------------------------------------------------
								//
								//  Method:     CWordBreaker::Init
								//
								//  Synopsis:   Initialize word-breaker
								//
								//  Arguments:  [fQuery]         -- TRUE if query-time; saved in _fQuery
								//              [ulMaxTokenSize] -- Maximum size token stored by caller;
								//                                  saved in _ulMaxTokenSize
								//              [pfLicense]      -- Receives TRUE (use is restricted)
								//
								//  Returns:    S_OK on success; E_INVALIDARG if pfLicense is NULL or
								//              fails the IsBadWritePtr probe
								//
								//--------------------------------------------------------------------------

								SCODE STDMETHODCALLTYPE
								CWordBreaker::Init(
								    BOOL fQuery,
								    ULONG ulMaxTokenSize,
								    BOOL *pfLicense )
								{
								    // Reject a missing or unwritable out-parameter before touching any
								    // state.  The NULL test short-circuits, so the probe only runs on a
								    // non-NULL pointer.
								    if ( ( NULL == pfLicense ) ||
								         IsBadWritePtr(pfLicense, sizeof(DWORD)) )
								    {
								        return E_INVALIDARG;
								    }

								    *pfLicense = TRUE;

								    _ulMaxTokenSize = ulMaxTokenSize;
								    _fQuery = fQuery;

								    return S_OK;
								}


								//+---------------------------------------------------------------------------
								//
								//  Member:     CWordBreaker::ComposePhrase
								//
								//  Synopsis:   Convert a noun and a modifier into a phrase.
								//              Not implemented yet: no argument is read and nothing is
								//              written to the output parameters.
								//
								//  Arguments:  [pwcNoun] -- pointer to noun.
								//              [cwcNoun] -- count of chars in pwcNoun
								//              [pwcModifier] -- pointer to word modifying pwcNoun
								//              [cwcModifier] -- count of chars in pwcModifier
								//              [ulAttachmentType] -- relationship between pwcNoun &pwcModifier
								//              [pwcPhrase] -- output buffer (unused by this implementation)
								//              [pcwcPhrase] -- output length (unused by this implementation)
								//
								//  Returns:    E_NOTIMPL when the breaker was initialized for query time
								//              (_fQuery set), WBREAK_E_QUERY_ONLY otherwise.
								//
								//----------------------------------------------------------------------------
								SCODE STDMETHODCALLTYPE
								CWordBreaker::ComposePhrase(
								    WCHAR const *pwcNoun,
								    ULONG cwcNoun,
								    WCHAR const *pwcModifier,
								    ULONG cwcModifier,
								    ULONG ulAttachmentType,
								    WCHAR *pwcPhrase,
								    ULONG *pcwcPhrase )
								{
								    //
								    // Need to code in later
								    //
								    return ( _fQuery ? E_NOTIMPL : WBREAK_E_QUERY_ONLY );
								}


								//+---------------------------------------------------------------------------
								//
								//  Member:     CWordBreaker::GetLicenseToUse
								//
								//  Synopsis:   Returns a pointer to vendors license information
								//
								//  Arguments:  [ppwcsLicense] -- ptr to ptr to which license info is returned.
								//                                The string has static storage; the caller
								//                                must not free or modify it.
								//
								//  Returns:    S_OK on success; E_INVALIDARG if ppwcsLicense is NULL or
								//              fails the IsBadWritePtr probe
								//
								//----------------------------------------------------------------------------
								SCODE STDMETHODCALLTYPE
								CWordBreaker::GetLicenseToUse(
								    const WCHAR **ppwcsLicense )
								{

								    static WCHAR const * wcsCopyright = L"Copyright Microsoft, 1991-1998";

								    if ( NULL == ppwcsLicense )  {
								       return E_INVALIDARG;
								    }

								    // Probe the full pointer-sized slot that is written below.
								    // sizeof(DWORD) would cover only half of it on 64-bit builds.
								    if (IsBadWritePtr(ppwcsLicense, sizeof(*ppwcsLicense))) {
								        return E_INVALIDARG;
								    }

								    *ppwcsLicense = wcsCopyright;
								    return( S_OK );
								}


								//+---------------------------------------------------------------------------

								//

								//  Member:     CWordBreaker::BreakText

								//

								//  Synopsis:   Break input stream into words.

								//

								//  Arguments:  [pTextSource] -- source of Unicode text

								//              [pWordSink] -- sink for collecting words

								//              [pPhraseSink] -- sink for collecting phrases

								//

								//  History:    10-Nov-1997, WeibZ,       Created.

								//

								//  Notes:      Since the input buffer may be greater than MAX_II_BUFFER_LEN

								//              we process the buffer in chunks of length MAX_II_BUFFER_LEN.

								//

								//----------------------------------------------------------------------------


								SCODE STDMETHODCALLTYPE CWordBreaker::BreakText( TEXT_SOURCE *pTextSource,

								                                                 IWordSink *pWordSink,

								                                                 IPhraseSink *pPhraseSink )

								{

								    SCODE                               sc = S_OK;

								    ULONG                               cwc;

								    SCRIPT_ITEM                 *pItems, *pItem_Next, *pItem_org;

								    SCRIPT_ANALYSIS             *psa;

								    PCWSTR                              pwcInChars;

								    INT                                 iItems;

								    BOOL                                bItemProc;

								    PCWSTR                              pwcChars;

								    INT                                 cChars;

								    HRESULT                             retUSP;

								        BOOL                            fSucceeded = true;


								    if ( NULL == pTextSource ) {

								       return E_INVALIDARG;

								    }


								    if ( NULL == pWordSink )

								    {

								        // BUGBUG, propagate the null word sink error code

								        return sc;

								    }


								    if ( 0 != pPhraseSink )

								    {

								        // ignore the phrase sink for now

								        // return sc;

								    }


								    if (pTextSource->iEnd == pTextSource->iCur) {

								       return S_OK;

								    }


								    Assert( pTextSource->iCur < pTextSource->iEnd );


								    __try

								    {

								        do

								                {


								            if ( pTextSource->iCur >= pTextSource->iEnd )

								               continue;


								            cwc = pTextSource->iEnd - pTextSource->iCur;

								            pwcInChars = pTextSource->awcBuffer + pTextSource->iCur;


								            pItems = (SCRIPT_ITEM *)LocalAlloc(LPTR,sizeof(SCRIPT_ITEM)*(cwc+1));


								            if ( !pItems) {


								                return E_UNEXPECTED;

								            }


								            pItem_org = pItems;


								            iItems = 0;

								            retUSP = ScriptItemize(pwcInChars,cwc,cwc+1, NULL, NULL,

								                                   pItems, &iItems);


								            if (retUSP != S_OK) {

								                LocalFree(pItem_org);

								                return  E_UNEXPECTED;

								            }


								            while  ( iItems > 1 ) {

								                                pItem_Next = pItems + 1;

								                                pwcChars =  pwcInChars + pItems->iCharPos;

								                                cChars   =  pItem_Next->iCharPos - pItems->iCharPos;


								                                sc = ProcessItem(       pwcChars,

								                                                                        cChars,

								                                                                        pItems,

								                                                                        FALSE,   // no need to keep chars

								                                                                        pTextSource,

								                                                                        pWordSink,

								                                                                        pPhraseSink);


								                                if ( ( FAILED( sc ) ) &&

								                                         ( FILTER_E_NO_MORE_VALUES != sc ) &&

								                                         ( FILTER_E_NO_TEXT != sc ) &&

								                                         ( FILTER_E_NO_VALUES != sc ) &&

								                                         ( FILTER_E_NO_MORE_TEXT != sc ) &&

								                                         ( FILTER_E_END_OF_CHUNKS != sc ) &&

								                                         ( FILTER_E_EMBEDDING_UNAVAILABLE != sc ) &&

								                                         ( WBREAK_E_END_OF_TEXT != sc ) ) {

								                                        LocalFree(pItem_org);

								                                        return  sc;

								                                }


								                                sc = S_OK;


								                                pItems++;

								                                iItems--;


								            }


								            //  special handle for the last item

								            if ( iItems == 1 ) {


								                                pwcChars = pwcInChars + pItems->iCharPos;

								                                cChars = pTextSource->iEnd - pTextSource->iCur;


								                                sc = ProcessItem(pwcChars,

								                                                                        cChars,

								                                                                        pItems,

								                                                                        TRUE,  // need to keep chars

								                                                                        pTextSource,

								                                                                        pWordSink,

								                                                                        pPhraseSink);


								                                if ( ( FAILED( sc ) ) &&

								                                         ( FILTER_E_NO_MORE_VALUES != sc ) &&

								                                         ( FILTER_E_NO_TEXT != sc ) &&

								                                         ( FILTER_E_NO_VALUES != sc ) &&

								                                         ( FILTER_E_NO_MORE_TEXT != sc ) &&

								                                         ( FILTER_E_END_OF_CHUNKS != sc ) &&

								                                         ( FILTER_E_EMBEDDING_UNAVAILABLE != sc ) &&

								                                         ( WBREAK_E_END_OF_TEXT != sc ) ) {

								                                        LocalFree(pItem_org);

								                                        return  sc;

								                                }


								                                sc = S_OK;

								                        }


								            if (pItem_org)

								               LocalFree(pItem_org);


								                        // O11.17064. Under low memory it is possible to pfnFillTextBuffer to failed.

								                        // We will need to return the error of TextSource for loging to Sharepoint.

								                        sc = pTextSource->pfnFillTextBuffer(pTextSource);

								                        fSucceeded = SUCCEEDED(sc);


								                        if ( ( FAILED( sc ) ) &&

								                                 ( FILTER_E_NO_MORE_VALUES != sc ) &&

								                                 ( FILTER_E_NO_TEXT != sc ) &&

								                                 ( FILTER_E_NO_VALUES != sc ) &&

								                                 ( FILTER_E_NO_MORE_TEXT != sc ) &&

								                                 ( FILTER_E_END_OF_CHUNKS != sc ) &&

								                                 ( FILTER_E_EMBEDDING_UNAVAILABLE != sc ) &&

								                                 ( WBREAK_E_END_OF_TEXT != sc ) ) {

								                                return sc;

								                                }


								                        sc = S_OK;


								        } while (fSucceeded);


								        if ( pTextSource->iCur < pTextSource->iEnd ) {


								            cwc = pTextSource->iEnd - pTextSource->iCur;

								            pwcInChars = pTextSource->awcBuffer + pTextSource->iCur;


								            pItems = (SCRIPT_ITEM *)LocalAlloc(LPTR,sizeof(SCRIPT_ITEM)*(cwc+1));


								            if ( !pItems ) {


								                return E_UNEXPECTED;

								            }


								            pItem_org = pItems;


								            iItems = 0;

								            retUSP = ScriptItemize(pwcInChars,cwc,cwc+1, NULL, NULL,

								                                   pItems, &iItems);


								            if (retUSP != S_OK) {

								                LocalFree(pItem_org);

								                return  E_UNEXPECTED;

								            }


								            while  ( iItems > 1 ) {

								                                pItem_Next = pItems + 1;

								                                pwcChars =  pwcInChars + pItems->iCharPos;

								                                cChars   =  pItem_Next->iCharPos - pItems->iCharPos;


								                                sc = ProcessItem(pwcChars,

								                                                                        cChars,

								                                                                        pItems,

								                                                                        FALSE,  // no need to keep chars

								                                                                        pTextSource,

								                                                                        pWordSink,

								                                                                        pPhraseSink);


								                                if ( ( FAILED( sc ) ) &&

								                                         ( FILTER_E_NO_MORE_VALUES != sc ) &&

								                                         ( FILTER_E_NO_TEXT != sc ) &&

								                                         ( FILTER_E_NO_VALUES != sc ) &&

								                                         ( FILTER_E_NO_MORE_TEXT != sc ) &&

								                                         ( FILTER_E_END_OF_CHUNKS != sc ) &&

								                                         ( FILTER_E_EMBEDDING_UNAVAILABLE != sc ) &&

								                                         ( WBREAK_E_END_OF_TEXT != sc ) ) {

								                                        LocalFree(pItem_org);

								                                        return  sc;

								                                }


								                                sc = S_OK;


								                                pItems++;

								                                iItems--;

								                        }


								            if ( iItems == 1 ) {


								                                pwcChars = pwcInChars + pItems->iCharPos;

								                                cChars = pTextSource->iEnd - pTextSource->iCur;


								                                sc = ProcessItem(pwcChars,

								                                                                        cChars,

								                                                                        pItems,

								                                                                        FALSE,    // no need to keep chars

								                                                                        pTextSource,

								                                                                        pWordSink,

								                                                                        pPhraseSink);


								                                if ( ( FAILED( sc ) ) &&

								                                         ( FILTER_E_NO_MORE_VALUES != sc ) &&

								                                         ( FILTER_E_NO_TEXT != sc ) &&

								                                         ( FILTER_E_NO_VALUES != sc ) &&

								                                         ( FILTER_E_NO_MORE_TEXT != sc ) &&

								                                         ( FILTER_E_END_OF_CHUNKS != sc ) &&

								                                         ( FILTER_E_EMBEDDING_UNAVAILABLE != sc ) &&

								                                         ( WBREAK_E_END_OF_TEXT != sc ) ) {

								                                        LocalFree(pItem_org);

								                                        return  sc;

								                                }


								                                sc = S_OK;

								                        }


								            if ( pItem_org )

								               LocalFree(pItem_org);

								        }


								    } __except(1) {


								       sc = E_UNEXPECTED;

								   }


								    return sc;

								}


								//+---------------------------------------------------------------------------
								//
								//  Member:     CWordBreaker::ProcessItem
								//
								//  Synopsis:   Break one script item into words and hand them to the word
								//              sink.  The language is taken from the script properties of
								//              the item: Thai text goes through the THWB_* word breaker,
								//              English text is split on white space / phrase separators,
								//              and any other language is skipped.
								//
								//  Arguments:  [pwcChars]    -- first character of the item
								//              [cChars]      -- count of characters to process
								//              [pItems]      -- SCRIPT_ITEM describing the item
								//              [fKeep]       -- TRUE to leave a trailing unterminated English
								//                               word unconsumed; not used by the Thai branch
								//              [pTextSource] -- text source; iCur is advanced past the
								//                               characters that were consumed
								//              [pWordSink]   -- sink receiving PutWord/PutAltWord/PutBreak
								//              [pPhraseSink] -- not used
								//
								//  Returns:    S_OK, E_OUTOFMEMORY if a Thai work buffer cannot be
								//              allocated, or the status from the word sink call that
								//              stopped processing.
								//
								//----------------------------------------------------------------------------

								SCODE CWordBreaker::ProcessItem(
								                     PCWSTR        pwcChars,
								                     INT           cChars,
								                     SCRIPT_ITEM   *pItems,
								                     BOOL          fKeep,
								                     TEXT_SOURCE  *pTextSource,
								                     IWordSink    *pWordSink,
								                     IPhraseSink  *pPhraseSink )
								{
								    INT                         iChar;
								    INT                         iWordStart, iWordLen;
								    const SCRIPT_PROPERTIES     **pScript_Properties;
								    DWORD                       LangID;
								    WORD                        iScript;
								    SCODE                       scRetVal = S_OK;

								    ScriptGetProperties(&pScript_Properties, NULL);

								    iScript = pItems->a.eScript;

								    LangID = (pScript_Properties[iScript])->langid;

								    switch (LangID) {
								       case LANG_THAI:
								            {
								            BYTE*           pBreakPos;
								            int             iNumberOfBreak = 0;
								            int             i;
								            WCHAR*          pwch = (WCHAR*) pwcChars;
								            THWB_STRUCT*    pThwbStruct = NULL;

								            // Nothing to break; also keeps pBreakPos[0] below inside
								            // the allocation.
								            if ( cChars <= 0 )
								                break;

								            pBreakPos = new BYTE[cChars];

								            // Report allocation failure as an error (FALSE would read
								            // as S_OK to the caller).
								            if ( pBreakPos == NULL )
								                return E_OUTOFMEMORY;

								            pThwbStruct = THWB_CreateThwbStruct(cChars);

								            // pThwbStruct[i] is dereferenced below.
								            // NOTE(review): assumes THWB_CreateThwbStruct returns NULL
								            // on failure - confirm against thwbint.h.
								            if ( pThwbStruct == NULL )
								                {
								                delete [] pBreakPos;
								                return E_OUTOFMEMORY;
								                }

								            pBreakPos[0] = 0;
								            iNumberOfBreak = THWB_IndexWordBreak(pwch,cChars, pBreakPos, pThwbStruct,cChars);

								            // pBreakPos[i] is used as the length of the i-th word.
								            for (i=0;i < iNumberOfBreak; i++)
								                {

								                // Search index alternate words.
								                // If not query create Alternate word.
								                if (pThwbStruct[i].alt != 0 && !_fQuery)
								                    {
								                    int     iNumAltWord = 0, k;
								                    BYTE    pAltBreakPos[5];
								                    WCHAR*  word1 = pwch;
								                    int     indexWord1 = 0;

								                    // Find Alternate words
								                    iNumAltWord = THWB_FindAltWord(word1,pBreakPos[i], pThwbStruct[i].alt, pAltBreakPos);

								                    // Put alternate words.  Stop at the first status other
								                    // than S_OK so a later success cannot hide it.
								                    for(k=0; k<iNumAltWord && scRetVal == S_OK; k++)
								                        {
								                        scRetVal = pWordSink->PutAltWord(pAltBreakPos[k],&word1[indexWord1],pBreakPos[i],pTextSource->iCur);
								                        indexWord1 += pAltBreakPos[k];
								                        }
								                    }

								                // if PutAltWord not okay return.
								                if (scRetVal != S_OK)
								                    break;

								                // Only words starting with a character in the range
								                // THAI_Ko_Kai..THAI_Vowel_MaiYaMok are emitted.
								                if (*pwch >= THAI_Ko_Kai && *pwch <= THAI_Vowel_MaiYaMok)
								                    scRetVal = pWordSink->PutWord(pBreakPos[i], pwch, pBreakPos[i], pTextSource->iCur);

								                if (scRetVal != S_OK)
								                    break;

								                pTextSource->iCur += pBreakPos[i];

								                pwch += pBreakPos[i];
								                }

								            // Allocated with new[], so release with delete[].
								            delete [] pBreakPos;

								            // Prefix bug 1055941 - clear allocated memory.
								            THWB_DeleteThwbStruct(pThwbStruct);

								            break;
								            }

								       case LANG_ENGLISH :   // handle English chars

								            {
								                BYTE        ct;
								                BOOL        fRomanWord = FALSE;
								                CONST WCHAR *pwcInput;
								                WT          Type;

								                Type =  WT_START;

								                pwcInput = pwcChars;
								                iWordStart = 0;

								                for (iChar=0; iChar< cChars; iChar++, pwcInput++)
								                {
								                    ct = GetCharType(*pwcInput);

								                    // Anything that is neither a word separator nor a
								                    // phrase separator counts as a word character.
								                    if ( (ct != WS) && (ct != PS) )
								                       ct = CH;

								                    switch (ct) {
								                      case CH :
								                           if (!fRomanWord) {
								                               iWordStart = iChar;
								                               fRomanWord = TRUE;
								                               Type = WT_ROMAJI;
								                           }
								                           break;

								                      case WS :
								                           // A separator ends the pending word, if any.
								                           if (fRomanWord) {
								                              iWordLen = iChar - iWordStart;

								                              scRetVal = pWordSink->PutWord(iWordLen, pwcChars+iWordStart, iWordLen, pTextSource->iCur);

								                              pTextSource->iCur += iWordLen;
								                              fRomanWord = FALSE;
								                           }

								                           Type = WT_WORD_SEP;
								                           pTextSource->iCur++;
								                           break;

								                      case PS :
								                           if (fRomanWord) {
								                              iWordLen = iChar - iWordStart;

								                              scRetVal = pWordSink->PutWord(iWordLen, pwcChars+iWordStart, iWordLen, pTextSource->iCur);

								                              pTextSource->iCur += iWordLen;
								                              fRomanWord = FALSE;
								                           }

								                           Type = WT_PHRASE_SEP;
								                           scRetVal = pWordSink->PutBreak(WORDREP_BREAK_EOS);
								                           pTextSource->iCur++;
								                           break;
								                    }

								                    if (scRetVal != S_OK)
								                        break;
								                }

								                // The item ended on a separator: no pending word.
								                if ((Type == WT_WORD_SEP) || (Type == WT_PHRASE_SEP))
								                   break;

								                // The caller asked to keep the trailing word unconsumed.
								                if ( fKeep )
								                   break;

								                if (scRetVal != S_OK)
								                   break;

								                // Emit the trailing word that runs to the end of the item.
								                iWordLen =cChars - iWordStart;
								                scRetVal = pWordSink->PutWord(iWordLen, pwcChars+iWordStart, iWordLen,pTextSource->iCur);
								                pTextSource->iCur += iWordLen;
								            }

								            break;

								       default:

								               // Other languages are not broken; just skip the item.
								               pTextSource->iCur += cChars;
								               break;
								    }

								    return scRetVal;
								}