windows-server-2003/inetsrv/intlwb/enu/wordbreaker/custombreaking.cpp


								////////////////////////////////////////////////////////////////////////////////

								//

								//  Filename :  CustomBreaking.cpp

								//  Purpose  :  Custom word breaking definitions

								//

								//  Project  :  WordBreakers

								//  Component:  English word breaker

								//

								//  Author   :  yairh

								//

								//  Log:

								//

								//      Jan 06 2000 yairh creation

								//      Apr 05 2000 dovh - Fixed two problematic debug / tracer buffer size

								//          problems.  (Fix Bug 15449).

								//      May 07 2000 dovh - USE_WS_SENTINEL algorithm in BreakText

								//

								////////////////////////////////////////////////////////////////////////////////


								#include "base.h"

								#include "CustomBreaking.h"

								#include "proparray.h"

								#include "AutoPtr.h"

								#include "excption.h"

								#include "SpanishUtils.h"

								#include "WbUtils.h"

								#ifndef WHISTLER_BUILD

								#include "LanguageResources_i.c"

								#endif  // WHISTLER_BUILD


								//
								// Global custom-breaker holders, one per language variant. Judging by the
								// names these are presumably English, UK English, French, Spanish and
								// Italian - TODO confirm. They are only defined here; the code that creates
								// the CCustomBreaker instances is not part of this file section.
								//
								CAutoClassPointer<CCustomBreaker> g_apEngCustomBreaker;

								CAutoClassPointer<CCustomBreaker> g_apEngUKCustomBreaker;

								CAutoClassPointer<CCustomBreaker> g_apFrnCustomBreaker;

								CAutoClassPointer<CCustomBreaker> g_apSpnCustomBreaker;

								CAutoClassPointer<CCustomBreaker> g_apItlCustomBreaker;


								//
								// Builds a term from a single custom-word pattern.
								//
								//   pwcs - null-terminated pattern text. A private copy is stored in m_pwcs.
								//
								// While copying, the constructor locates the "base" part of the pattern:
								//   m_ulStartTxt - index of the first character that is not CUSTOM_PUNCT_HEAD
								//   m_ulEndTxt   - one past the last character that is not CUSTOM_PUNCT_TAIL
								//   m_ulLen      - length of the whole pattern
								//
								// Raises E_INVALIDARG (via THROW_HRESULT_EXCEPTION) when the pattern has no
								// base text left once the leading / trailing punctuation is skipped.
								//
								CCustomWordTerm::CCustomWordTerm(const WCHAR* pwcs) :
								    m_ulStartTxt(0),
								    m_ulEndTxt(0),
								    m_pwcs(NULL)
								{
								    const ULONG cchPattern = wcslen(pwcs);

								    // The copy stays owned by the auto pointer until the very end, so m_pwcs
								    // is only ever set for a pattern that passed both checks below.
								    CAutoArrayPointer<WCHAR> apCopy;
								    apCopy = new WCHAR[cchPattern + 1];
								    WCHAR* const pwcsCopy = apCopy.Get();
								    wcscpy(pwcsCopy, pwcs);

								    // Advance past the leading punctuation.
								    for (; m_ulStartTxt < cchPattern; m_ulStartTxt++)
								    {
								        if (!(TEST_PROP(GET_PROP(pwcsCopy[m_ulStartTxt]), CUSTOM_PUNCT_HEAD)))
								        {
								            break;
								        }
								    }

								    // Nothing but leading punctuation (or an empty string).
								    if (cchPattern == m_ulStartTxt)
								    {
								        THROW_HRESULT_EXCEPTION(E_INVALIDARG);
								    }

								    // Walk back from the end over the trailing punctuation.
								    for (m_ulEndTxt = cchPattern; m_ulEndTxt > 0; m_ulEndTxt--)
								    {
								        if (!(TEST_PROP(GET_PROP(pwcsCopy[m_ulEndTxt - 1]), CUSTOM_PUNCT_TAIL)))
								        {
								            break;
								        }
								    }

								    // The two scans met or crossed - there is no base text.
								    if (m_ulEndTxt <= m_ulStartTxt)
								    {
								        THROW_HRESULT_EXCEPTION(E_INVALIDARG);
								    }

								    m_ulLen = cchPattern;
								    m_pwcs = apCopy.Detach();
								}


								//
								// Checks whether this term's complete text - leading punctuation, base word
								// and trailing punctuation - is present in pwcsBuf around a base word that
								// has already been located there.
								//
								//   ulBufLen           - number of characters in pwcsBuf; upper bound for the
								//                        trailing scan.
								//   ulOffsetToBaseWord - index in pwcsBuf of the first base-word character.
								//   ulBaseWordLen      - length of the base word inside pwcsBuf.
								//   pwcsBuf            - text being examined.
								//   pulMatchOffset     - receives the index in pwcsBuf where the full term
								//                        starts. Written only on success.
								//   pulMatchLen        - receives the length of the full term in pwcsBuf.
								//                        Written only on success.
								//
								// Returns true when every leading and trailing punctuation character of the
								// term is found adjacent to the base word in pwcsBuf, false otherwise.
								// The base word characters themselves are not compared here; the caller is
								// expected to have matched them already.
								//
								bool CCustomWordTerm::CheckWord(
								    const ULONG ulBufLen,
								    ULONG ulOffsetToBaseWord,
								    ULONG ulBaseWordLen,
								    const WCHAR* pwcsBuf,
								    ULONG* pulMatchOffset,
								    ULONG* pulMatchLen)
								{
								    //
								    // Leading punctuation: walk backwards from the base word, comparing the
								    // character that immediately precedes the current position in both the
								    // term and the buffer. Using index - 1 guarantees that every prefix
								    // character of the term, including the one at index 0, is verified and
								    // that the base word's own first character is not re-tested.
								    //
								    ULONG ulStartTxt = m_ulStartTxt;

								    while (ulOffsetToBaseWord &&
								           ulStartTxt &&
								           m_pwcs[ulStartTxt - 1] == pwcsBuf[ulOffsetToBaseWord - 1])
								    {
								        ulOffsetToBaseWord--;
								        ulStartTxt--;
								        ulBaseWordLen++;
								    }

								    // Part of the term's prefix is missing from the buffer.
								    if (ulStartTxt)
								    {
								        return false;
								    }

								    //
								    // Trailing punctuation: walk forward from the end of the base word,
								    // never reading past ulBufLen.
								    //
								    ULONG ulEndTxt = m_ulEndTxt;

								    while ((ulEndTxt < m_ulLen) &&
								           (ulOffsetToBaseWord + ulBaseWordLen < ulBufLen) &&
								           (m_pwcs[ulEndTxt] == pwcsBuf[ulOffsetToBaseWord + ulBaseWordLen]))
								    {
								        ulEndTxt++;
								        ulBaseWordLen++;
								    }

								    // Part of the term's suffix is missing from the buffer.
								    if (ulEndTxt != m_ulLen)
								    {
								        return false;
								    }

								    *pulMatchOffset = ulOffsetToBaseWord;
								    *pulMatchLen = ulBaseWordLen;
								    return true;
								}


								void CCustomWordCollection::AddWord(const WCHAR* pwcs)

								{

								    CAutoClassPointer<CCustomWordTerm> ap;


								    ap = new CCustomWordTerm(pwcs);

								    m_vaWordCollection[m_ulCount] = ap.Get();

								    m_ulCount++;

								    ap.Detach();

								}


								bool CCustomWordCollection::CheckWord(

								    const ULONG ulLen,

								    const ULONG ulOffsetToBaseWord,

								    const ULONG ulBaseWordLen,

								    const WCHAR* pwcsBuf,

								    ULONG* pulMatchOffset,

								    ULONG* pulMatchLen)

								{

								    for (ULONG ul = 0; ul < m_ulCount; ul++)

								    {

								        bool fRet = m_vaWordCollection[ul]->CheckWord(

								                                                 ulLen,

								                                                 ulOffsetToBaseWord,

								                                                 ulBaseWordLen,

								                                                 pwcsBuf,

								                                                 pulMatchOffset,

								                                                 pulMatchLen);

								        if (fRet)

								        {

								            return true;

								        }

								    }


								    return false;

								}


								//
								// Loads the custom word list for the given locale into the trie.
								//
								//   lcid - locale passed to GetCustomWBFilePath to obtain the word file path.
								//
								// If no path is available, or the file cannot be opened, the breaker is left
								// empty. Otherwise the file is read one line at a time (at most 63 characters
								// per read). Each non-empty line becomes a CCustomWordTerm whose base form
								// is the trie key. Lines that share a base form are gathered in one
								// CCustomWordCollection.
								//
								// m_ulWordCount counts every line read, including lines that are skipped
								// afterwards. Lines rejected by the term constructor with E_INVALIDARG are
								// ignored; any other CHresultException is propagated to the caller.
								//
								CCustomBreaker::CCustomBreaker(LCID lcid) :
								    m_Trie(true),
								    m_ulWordCount(0)
								{
								    CVarString vsPath;

								    if (false == GetCustomWBFilePath(lcid, vsPath))
								    {
								        return;
								    }

								    CStandardCFile Words((LPWSTR)vsPath, L"r", false);
								    if (!((FILE*)Words))
								    {
								        return;
								    }

								    const int cchLineBuf = 64;
								    WCHAR pwcsBuf[cchLineBuf];

								    while (fgetws(pwcsBuf, cchLineBuf, (FILE*) Words))
								    {
								        m_ulWordCount++;

								        ULONG ulLen = wcslen(pwcsBuf);

								        // fgetws keeps the line terminator - strip it.
								        if (ulLen && pwcsBuf[ulLen - 1] == L'\n')
								        {
								            pwcsBuf[ulLen - 1] = L'\0';
								            ulLen--;
								        }

								        if (0 == ulLen)
								        {
								            continue;
								        }

								        try
								        {
								            CAutoClassPointer<CCustomWordCollection> apCollection = new CCustomWordCollection;
								            apCollection->AddWord(pwcsBuf);

								            // Cut the base form out of the line buffer in place to use it as
								            // the trie key. The term holds its own copy of the full pattern,
								            // so truncating pwcsBuf here does not affect it.
								            WCHAR* pwcsKey = pwcsBuf + apCollection->GetFirstWord()->GetTxtStart();
								            pwcsBuf[apCollection->GetFirstWord()->GetTxtEnd()] = L'\0';

								            DictStatus status;
								            CCustomWordCollection* pExistingCollection = NULL;

								            status = m_Trie.trie_Insert(
								                                    pwcsKey,
								                                    TRIE_DEFAULT,
								                                    apCollection.Get(),
								                                    &pExistingCollection);
								            if (DICT_ITEM_ALREADY_PRESENT == status)
								            {
								                // Same base form as an earlier line: add the full pattern to
								                // the collection already in the trie. The temporary
								                // collection is released when apCollection goes out of scope.
								                pExistingCollection->AddWord(apCollection->GetFirstWord()->GetTxt());
								            }
								            else if (DICT_SUCCESS == status)
								            {
								                // The trie now refers to the collection.
								                apCollection.Detach();
								            }
								            // Any other status: the line is dropped and apCollection frees it.
								        }
								        catch (CHresultException& h)
								        {
								            // A pattern without base text is simply skipped. Everything else
								            // is rethrown as the original exception object (no copy).
								            if (E_INVALIDARG != (HRESULT)h)
								            {
								                throw;
								            }
								        }
								    }
								}


								//

								// The idea behind the algorithm is to store a list of special patterns that should not

								// be broken. We also want to be able to recognize those patterns when a few punctuation

								// marks are attached to them. For example, if .NET is a special pattern, then in the following

								// patterns (.NET) .NET! .NET? we also want to recognize the .NET pattern and emit .NET

								// It is more complicated in the next case - NET!. The expected behavior is not to break it.

								// So the algorithm needs to identify when a punctuation mark is part of the token and must

								// not be broken off, and when it is just a breaker.

								// The algorithm is

								// 1. Initialization.

								//      for each token in the file

								//      a. Remove punctuation from the beginning and end of the token - we will

								//         refer to the result as the base form of the token.

								//      b. Insert the base form into a dictionary. Each base form points to the

								//         generating token. Several tokens can map to the same base form

								//         (NET? and NET!), so each base form points to a collection of generating tokens.

								// 2. Breaking.

								//       For each pattern you get from the document

								//          a.  perform 1a.

								//          b.  look for the resulting base form in the dictionary.

								//          c.  for each item in the collection, check whether the generating token exists in the

								//              pattern we got from the document.

								//


								//
								// Looks for a custom (non-breakable) pattern inside the given text.
								//
								//   ulLen     - number of characters in pwcsBuf.
								//   pwcsBuf   - text to examine.
								//   pulOutLen - receives the length of the matched pattern (set by CheckWord
								//               on success).
								//   pulOffset - receives the offset of the matched pattern in pwcsBuf (set by
								//               CheckWord on success).
								//
								// The leading CUSTOM_PUNCT_HEAD and trailing CUSTOM_PUNCT_TAIL characters are
								// skipped to obtain the base form. The base form is looked up in the trie with
								// a longest-match search, and the collection found there decides whether one
								// of its full patterns is present in the text.
								//
								// Returns true when a pattern matched, false otherwise.
								//
								bool CCustomBreaker::BreakText(
								    ULONG ulLen,
								    WCHAR* pwcsBuf,
								    ULONG* pulOutLen,
								    ULONG* pulOffset)
								{
								    // Find where the base form starts.
								    ULONG ulBaseStart = 0;
								    for (; ulBaseStart < ulLen; ulBaseStart++)
								    {
								        if (!(TEST_PROP(GET_PROP(pwcsBuf[ulBaseStart]), CUSTOM_PUNCT_HEAD)))
								        {
								            break;
								        }
								    }

								    // The text is empty or consists of leading punctuation only.
								    if (ulLen == ulBaseStart)
								    {
								        return false;
								    }

								    // Find where the base form ends (one past its last character).
								    ULONG ulBaseEnd = ulLen;
								    for (; ulBaseEnd > 0; ulBaseEnd--)
								    {
								        if (!(TEST_PROP(GET_PROP(pwcsBuf[ulBaseEnd - 1]), CUSTOM_PUNCT_TAIL)))
								        {
								            break;
								        }
								    }

								    if (ulBaseEnd <= ulBaseStart)
								    {
								        return false;
								    }

								    const ULONG ulBaseLen = ulBaseEnd - ulBaseStart;

								    // The returned status is not consulted; a non-zero match count is what
								    // signals that a collection was found.
								    CCustomWordCollection* pCollection;
								    short sMatches = 0;

								    m_Trie.trie_Find(
								                pwcsBuf + ulBaseStart,
								                TRIE_LONGEST_MATCH,
								                1,
								                &pCollection,
								                &sMatches);

								    if (0 == sMatches)
								    {
								        return false;
								    }

								    return pCollection->CheckWord(
								                            ulLen,
								                            ulBaseStart,
								                            ulBaseLen,
								                            pwcsBuf,
								                            pulOffset,
								                            pulOutLen);
								}