windows-xp/Source/XPSP1/NT/inetsrv/intlwb/enu/wordbreaker/tokenizer.h


								////////////////////////////////////////////////////////////////////////////////

								//

								//  Filename :  Tokenizer.h

								//  Purpose  :  Tokenizer declarations

								//

								//  Project  :  WordBreakers

								//  Component:  English word breaker

								//

								//  Author   :  yairh

								//

								//  Log:

								//

								//      Jan 06 2000 yairh creation

								//      Apr 05 2000 dovh - Fixed two problematic debug / tracer buffer size

								//          problems.  (Fix Bug 15449).

								//      May 07 2000 dovh - USE_WS_SENTINEL algorithm in BreakText

								//      Nov 11 2000 dovh - Special underscore treatment

								//          Added inline support routines (FindLeftmostUnderscore etc.)

								//

								////////////////////////////////////////////////////////////////////////////////


								#ifndef _TOKENIZER_H_

								#define _TOKENIZER_H_


								#include "tracer.h"

								#include "PropArray.h"

								#include "Query.h"

								#include "stdafx.h"

								#include "cierror.h"

								#include "LangSupport.h"

								#include "Formats.h"


								// Upper bound, in WCHARs, on the length of a single token.  CToken::m_awchBuf
								// is sized from it (plus one slot for the terminating L'\0').
								#define TOKENIZER_MAXBUFFERLIMIT 1024 // max size of a token is 1024 chars


								// Tracer tags referenced by the Trace() / CheckTraceRestrictions() calls in
								// this header.  Each invocation pairs a tag identifier with a display string.
								// NOTE(review): DECLARE_TAG is presumably supplied by tracer.h - confirm its
								// exact semantics (storage, linkage) there.
								DECLARE_TAG(s_tagTokenizer, "Tokenizer");

								DECLARE_TAG(s_tagTokenizerOutput, "Tokenizer Output");

								DECLARE_TAG(s_tagTokenizerTrace, "Tokenizer Trace");

								DECLARE_TAG(s_tagTokenizerDecision, "Tokenizer Decision");

								DECLARE_TAG(s_tagTokenizerSuspect, "Tokenizer Suspect");


								#if defined(DEBUG)

								///////////////////////////////////////////////////////////////////////////////

								// Class CTraceWordSink

								///////////////////////////////////////////////////////////////////////////////

								//
								// Debug-only IWordSink wrapper.  Every sink call is traced under the
								// s_tagTokenizerOutput tag and then forwarded to the wrapped sink.
								//
								// The wrapper itself is not reference counted: AddRef / Release return
								// constants and QueryInterface always fails.  CTokenizer holds it by value in
								// DEBUG builds; operator-> returns `this` so that call sites can use the same
								// `m_apWordSink->Method()` syntax as with the CComPtr used in non-DEBUG builds.
								//
								class CTraceWordSink : public IWordSink
								{
								public:
								    // p - the real sink that receives every forwarded call.
								    CTraceWordSink(IWordSink* p) : m_apWordSink(p)
								    {
								    }

								    // No reference counting is performed; a constant is returned.
								    ULONG __stdcall AddRef()
								    {
								        return 1;
								    }

								    // No reference counting is performed; a constant is returned.
								    ULONG __stdcall Release()
								    {
								        return 0;
								    }

								    // Not supported: asserts and returns E_FAIL.
								    STDMETHOD(QueryInterface)(
								        IN  REFIID  riid,
								        IN  void    **ppvObject)
								    {
								        Assert(false);
								        return E_FAIL;
								    }

								    // Traces the word (text, length, source length, source position) and
								    // forwards it to the wrapped sink.
								    STDMETHOD(PutWord)(
								                ULONG cwc,
								                WCHAR const* pwcInBuf,
								                ULONG cwcSrcLen,
								                ULONG cwcSrcPos)
								    {
								        Assert(cwc < TOKENIZER_MAXBUFFERLIMIT + 10);
								#if (defined (DEBUG) && !defined(_NO_TRACER)) || defined(USE_TRACER)
								        if (CheckTraceRestrictions(elVerbose, s_tagTokenizerOutput))
								        {
								            Trace(
								                elVerbose,
								                s_tagTokenizerOutput,
								                ("PutWord: %*.*S, %d, %d, %d",
								                cwc,
								                cwc,
								                pwcInBuf,
								                cwc,
								                cwcSrcLen,
								                cwcSrcPos));
								        }
								#endif

								        return m_apWordSink->PutWord(cwc, pwcInBuf, cwcSrcLen, cwcSrcPos);
								    }

								    // Traces the alternative word and forwards it to the wrapped sink.
								    STDMETHOD(PutAltWord)(
								                ULONG cwc,
								                WCHAR const* pwcInBuf,
								                ULONG cwcSrcLen,
								                ULONG cwcSrcPos)
								    {
								        Assert(cwc < TOKENIZER_MAXBUFFERLIMIT + 10);
								#if (defined (DEBUG) && !defined(_NO_TRACER)) || defined(USE_TRACER)
								        if (CheckTraceRestrictions(elVerbose, s_tagTokenizerOutput))
								        {
								            Trace(
								                elVerbose,
								                s_tagTokenizerOutput,
								                ("PutAltWord: %*.*S, %d, %d, %d",
								                cwc,
								                cwc,
								                pwcInBuf,
								                cwc,
								                cwcSrcLen,
								                cwcSrcPos));
								        }
								#endif
								        return m_apWordSink->PutAltWord(cwc, pwcInBuf, cwcSrcLen, cwcSrcPos);
								    }

								    // Traces and forwards StartAltPhrase.
								    STDMETHOD(StartAltPhrase)()
								    {
								        Trace(
								            elVerbose,
								            s_tagTokenizerOutput,
								            ("StartAltPhrase"));

								        return m_apWordSink->StartAltPhrase();
								    }

								    // Traces and forwards EndAltPhrase.
								    STDMETHOD(EndAltPhrase)()
								    {
								        Trace(
								            elVerbose,
								            s_tagTokenizerOutput,
								            ("EndAltPhrase"));

								        return m_apWordSink->EndAltPhrase();
								    }

								    // Traces the symbolic name of the break type and forwards the break.
								    STDMETHOD(PutBreak)(WORDREP_BREAK_TYPE breakType)
								    {
								#if (defined (DEBUG) && !defined(_NO_TRACER)) || defined(USE_TRACER)
								        if (CheckTraceRestrictions(elVerbose, s_tagTokenizerOutput))
								        {
								            //
								            // The name is only needed for tracing, so it lives inside the
								            // tracer-enabled block.  It points at string literals and is
								            // therefore a pointer to const.
								            //
								            const WCHAR* p;

								            switch (breakType)
								            {
								            case WORDREP_BREAK_EOW:
								                p = L"WORDREP_BREAK_EOW";
								                break;
								            case WORDREP_BREAK_EOS:
								                p = L"WORDREP_BREAK_EOS";
								                break;
								            case WORDREP_BREAK_EOP:
								                p = L"WORDREP_BREAK_EOP";
								                break;
								            case WORDREP_BREAK_EOC:
								                p = L"WORDREP_BREAK_EOC";
								                break;
								            default:
								                p = L"Unknown break type";
								                break;
								            }
								            Trace(
								                elVerbose,
								                s_tagTokenizerOutput,
								                ("PutBreak %S", p));
								        }
								#endif
								        return m_apWordSink->PutBreak(breakType);
								    }

								    // Lets a CTraceWordSink value be used with pointer-style `->` syntax.
								    CTraceWordSink* operator ->()
								    {
								        return this;
								    }
								private:
								    // The wrapped sink; all calls end up here.
								    CComPtr<IWordSink> m_apWordSink;
								};

								#endif


								///////////////////////////////////////////////////////////////////////////////

								// Class CTokenState

								///////////////////////////////////////////////////////////////////////////////


								//
								// A view over part of a token string: the half-open index range
								// [m_ulStart, m_ulEnd) inside m_pwcsToken, plus the property flags that
								// describe it.  All members are public and manipulated directly by CToken.
								//
								class CTokenState
								{
								public:
								    // ---- construction / assignment ----
								    CTokenState();
								    CTokenState(CTokenState& s);
								    CTokenState& operator = (CTokenState& S);

								    // Resets the state; the end index is set to ulEnd.
								    void Clear(ULONG ulEnd);

								    // ---- data ----
								    ULONG       m_ulStart;      // index of the first character in the range
								    ULONG       m_ulEnd;        // index one past the last character in the range
								    CPropFlag   m_Properties;   // property flags associated with the range
								    WCHAR*      m_pwcsToken;    // string that the indices refer to
								};


								//
								// Default constructor: an empty range with no string attached.
								// m_pwcsToken is set to NULL explicitly so that a default-constructed state
								// never carries an indeterminate pointer.
								//
								inline CTokenState::CTokenState() :
								    m_ulStart(0),
								    m_ulEnd(0),
								    m_pwcsToken(NULL)
								{
								}


								//
								// Copy constructor: member-wise copy of the range, the properties and the
								// string pointer (the string itself is not duplicated).
								// Initializers are listed in member declaration order.
								//
								inline CTokenState::CTokenState(CTokenState& s)
								    : m_ulStart(s.m_ulStart)
								    , m_ulEnd(s.m_ulEnd)
								    , m_Properties(s.m_Properties)
								    , m_pwcsToken(s.m_pwcsToken)
								{
								}


								//
								// Assignment: member-wise copy from S; the string pointer is copied, not the
								// characters it points to.  Returns *this.
								//
								inline CTokenState& CTokenState::operator = (CTokenState& S)
								{
								    // index range
								    m_ulStart = S.m_ulStart;
								    m_ulEnd   = S.m_ulEnd;

								    // string and its properties
								    m_pwcsToken  = S.m_pwcsToken;
								    m_Properties = S.m_Properties;

								    return *this;
								}


								//
								// Resets the state: detaches the string, clears the properties, and sets the
								// range to [0, ulEnd).
								//
								inline void CTokenState::Clear(ULONG ulEnd)
								{
								    m_pwcsToken = NULL;
								    m_Properties.Clear();

								    m_ulStart = 0;
								    m_ulEnd   = ulEnd;
								}


								///////////////////////////////////////////////////////////////////////////////

								// Class CToken

								///////////////////////////////////////////////////////////////////////////////


								//
								// A token under construction: a fixed-size character buffer, the write
								// position inside it, and the CTokenState describing the whole token.
								// The helper methods operate on a caller-supplied CTokenState.
								//
								class CToken
								{
								public:
								    // ---- construction ----
								    CToken(ULONG ulMaxTokenSize);

								    // ---- buffer status ----
								    bool IsNotEmpty();
								    void Clear();
								    bool IsFull();

								    // Terminates the buffer and fills m_State for the accumulated token.
								    void MarkEndToken(ULONG ulCurPosInTxtSourceBuffer);

								    // ---- operations on a state ----
								    ULONG RemoveHeadPunct(CPropFlag& PunctProperties, CTokenState& State);
								    ULONG RemoveTailPunct(CPropFlag& PunctProperties, CTokenState& State);
								    void ComputeStateProperties(CTokenState& State);
								    ULONG CalculateStateOffsetInTxtSourceBuffer(CTokenState& State);

								    // ---- underscore support ----
								    ULONG FindLeftmostUnderscore(CTokenState& State);
								    ULONG FindRightmostUnderscore(CTokenState& State);

								    // ---- data ----
								    ULONG m_ulBufPos;                   // number of characters currently in m_awchBuf
								    bool m_fHasEos;                     // set by MarkEndToken
								    ULONG m_ulOffsetInTxtSourceBuffer;  // set by MarkEndToken

								    ULONG m_ulMaxTokenSize;             // IsFull() compares m_ulBufPos against this

								    CTokenState m_State;                // state covering the whole token

								    WCHAR m_awchBuf[TOKENIZER_MAXBUFFERLIMIT + 1];  // token text + terminator
								};


								//
								// Creates an empty token.
								//
								// ulMaxTokenSize - the maximal number of characters a token may hold.  It must
								//                  not exceed TOKENIZER_MAXBUFFERLIMIT: m_awchBuf holds
								//                  TOKENIZER_MAXBUFFERLIMIT + 1 characters and MarkEndToken
								//                  writes a terminator at index m_ulBufPos, which may be as
								//                  large as m_ulMaxTokenSize.
								//
								inline CToken::CToken(ULONG ulMaxTokenSize) :
								    m_ulBufPos(0),
								    m_fHasEos(false),
								    m_ulOffsetInTxtSourceBuffer(0),
								    m_ulMaxTokenSize(ulMaxTokenSize)
								{
								    // Catch (in debug builds) a limit that the fixed-size buffer cannot hold.
								    Assert(ulMaxTokenSize <= TOKENIZER_MAXBUFFERLIMIT);

								    m_awchBuf[0] = L'\0';

								    // Do not leave the state's string pointer indeterminate; it is attached
								    // to m_awchBuf later by MarkEndToken.
								    m_State.m_pwcsToken = NULL;
								}


								// Returns true when at least one character has been stored in the buffer.
								inline bool CToken::IsNotEmpty()
								{
								    return 0 != m_ulBufPos;
								}


								//
								// Returns the token to its empty state: empty buffer, cleared state, no
								// end-of-sentence flag and a zero offset in the text source buffer.
								//
								inline void CToken::Clear()
								{
								    // buffer
								    m_awchBuf[0] = L'\0';
								    m_ulBufPos = 0;

								    // bookkeeping
								    m_fHasEos = false;
								    m_ulOffsetInTxtSourceBuffer = 0;

								    // state
								    m_State.Clear(0);
								}


								// Returns true when the buffer holds exactly m_ulMaxTokenSize characters.
								inline bool CToken::IsFull()
								{
								    return m_ulMaxTokenSize == m_ulBufPos;
								}


								//
								// Finalizes the token accumulated in m_awchBuf:
								//   - terminates the buffer at m_ulBufPos,
								//   - points m_State at the buffer with the range [0, m_ulBufPos),
								//   - sets m_fHasEos when the token carries PROP_EOS, is shorter than the
								//     maximal size, and - ignoring trailing EOS_SUFFIX characters - ends with
								//     an end-of-sentence character,
								//   - records the token's offset in the text source buffer.
								//
								// ulCurPosInTxtSourceBuffer - current position in the text source buffer; the
								//                             token's offset is computed as this position
								//                             minus the token length.
								//
								inline void CToken::MarkEndToken(ULONG ulCurPosInTxtSourceBuffer)
								{
								    Assert(m_ulBufPos < m_ulMaxTokenSize + 1);
								    m_awchBuf[m_ulBufPos] = L'\0';
								    m_State.m_pwcsToken = m_awchBuf;
								    m_State.m_ulStart = 0;
								    m_State.m_ulEnd = m_ulBufPos;

								    //
								    // The (m_ulBufPos > 0) test keeps `m_ulEnd - 1` from wrapping around on
								    // an empty token.
								    //
								    if (TEST_PROP(m_State.m_Properties, PROP_EOS) &&
								        (m_ulBufPos > 0) &&
								        (m_ulBufPos < m_ulMaxTokenSize))
								    {
								        ULONG ulCur = m_State.m_ulEnd - 1;

								        //
								        // Skip the EOS suffix characters at the tail.  The scan stops at
								        // index 0 so that ulCur can never wrap around and index memory in
								        // front of the buffer.
								        //
								        while ((ulCur > 0) &&
								               TEST_PROP(GET_PROP(m_awchBuf[ulCur]), EOS_SUFFIX))
								        {
								            ulCur--;
								        }

								        if (IS_EOS(m_awchBuf[ulCur]))
								        {
								            m_fHasEos = true;
								        }
								    }

								    //
								    // BUGBUG need to enable the assert
								    //

								    // Assert(ulCurPosInTxtSourceBuffer > m_ulBufPos);

								    m_ulOffsetInTxtSourceBuffer = ulCurPosInTxtSourceBuffer - m_ulBufPos;
								}


								//
								// Returns the offset of State's first character: the token's recorded offset
								// (m_ulOffsetInTxtSourceBuffer), plus the distance of State.m_pwcsToken from
								// the start of m_awchBuf, plus State.m_ulStart.
								//
								inline ULONG CToken::CalculateStateOffsetInTxtSourceBuffer(CTokenState& State)
								{
								    // distance, in characters, of the state's string from the buffer start
								    const ULONG ulStringDelta = static_cast<ULONG>(State.m_pwcsToken - m_awchBuf);

								    return m_ulOffsetInTxtSourceBuffer + ulStringDelta + State.m_ulStart;
								}


								//
								// Advances State.m_ulStart past every leading character whose properties
								// match PunctProperties (as decided by TEST_PROP1).
								//
								// PunctProperties - the properties that mark a character as removable.
								// State           - the range to trim; must lie inside m_State's range.
								//
								// Returns the new value of State.m_ulStart.  Note that this equals the number
								// of characters removed only when State.m_ulStart was 0 on entry.
								//
								inline ULONG CToken::RemoveHeadPunct(CPropFlag& PunctProperties, CTokenState& State)
								{
								    Assert(m_State.m_ulStart <= State.m_ulStart);
								    Assert(State.m_ulStart <= State.m_ulEnd);
								    Assert(State.m_ulEnd <= m_State.m_ulEnd);

								    //
								    // The index is declared outside the for statement because it is used
								    // after the loop; relying on the loop variable staying in scope is a
								    // non-standard compiler extension.
								    //
								    ULONG ul = State.m_ulStart;

								    for (; ul < State.m_ulEnd; ul++)
								    {
								        if (!TEST_PROP1(GET_PROP(State.m_pwcsToken[ul]), PunctProperties) )
								        {
								            break;
								        }
								    }
								    State.m_ulStart = ul;

								    //
								    // return the new start index
								    //
								    return ul;
								}


								//
								// Moves State.m_ulEnd back over every trailing character whose properties
								// match PunctProperties (as decided by TEST_PROP1).
								//
								// PunctProperties - the properties that mark a character as removable.
								// State           - the range to trim; must lie inside m_State's range.
								//
								// Returns the number of characters removed from the tail.
								//
								inline ULONG CToken::RemoveTailPunct(CPropFlag& PunctProperties, CTokenState& State)
								{
								    Assert(m_State.m_ulStart <= State.m_ulStart);
								    Assert(State.m_ulStart <= State.m_ulEnd);
								    Assert(State.m_ulEnd <= m_State.m_ulEnd);

								    //
								    // The index is declared outside the for statement because it is used
								    // after the loop; relying on the loop variable staying in scope is a
								    // non-standard compiler extension.
								    //
								    ULONG ul = State.m_ulEnd;

								    for (; ul > State.m_ulStart; ul--)
								    {
								        if (!TEST_PROP1(GET_PROP(State.m_pwcsToken[ul - 1]), PunctProperties) )
								        {
								            break;
								        }
								    }

								    ULONG ulNumOfRemovedChars = State.m_ulEnd - ul;
								    State.m_ulEnd = ul;

								    return ulNumOfRemovedChars;
								}


								//
								// Recomputes State.m_Properties from scratch as the union (|=) of the
								// properties of every character in [State.m_ulStart, State.m_ulEnd).
								//
								inline void CToken::ComputeStateProperties(CTokenState& State)
								{
								    Assert(m_State.m_ulStart <= State.m_ulStart);
								    Assert(State.m_ulStart <= State.m_ulEnd);
								    Assert(State.m_ulEnd <= m_State.m_ulEnd);

								    State.m_Properties.Clear();

								    ULONG ulIdx = State.m_ulStart;
								    while (ulIdx < State.m_ulEnd)
								    {
								        State.m_Properties |= GET_PROP(State.m_pwcsToken[ulIdx]);
								        ++ulIdx;
								    }
								}


								////////////////////////////////////////////////////////////////////////////////

								//

								//  Support routines for UNDERSCORE '_' treatment.

								//

								//  Current algorithm has the following behavior for tokens containing

								//  ALPHANUMERIC characters and UNDERSCORES:

								//

								//  1.  Single underscores and consecutive underscore sequence surrounded by

								//      alphanumeric characters (IE underscores buried within words) are

								//      treated as alphanumeric characters, and do not break words, or get

								//      omitted.  Examples: Foo_Bar => Foo_Bar, and X___Y => X___Y

								//

								//  2.  An underscore / underscore sequence tacked to the right (left) end

								//      of an alphanumeric (+ embedded underscores) token, will be part of

								//      the token, as long as the sequence is attached only to one side of the

								//      alphanumeric token.  If there are BOTH header and trailer consecutive

								//      underscore sequences, both header & trailer sequence will be omitted.

								//      Examples: __Foo_Bar => __Foo_Bar , alpha_beta_ => alpha_beta_ ,

								//      __HEADERFILE__ => __HEADERFILE__ , __MY_FILE_H__ => MY_FILE_H

								//

								//  3.  Caveat: Note that other than the two rules stated above underscores are

								//      NOT treated as ALPHANUMERIC characters. The behavior on a mixed sequence

								//      of underscores, and other non-alphanumeric characters is undefined!

								//

								////////////////////////////////////////////////////////////////////////////////


								//
								//  Extends State to the left over the run of underscores that immediately
								//  precedes it, without going below m_State.m_ulStart.
								//
								//  Assumes: on entry State.m_ulStart is the first alphanumeric in token and
								//           the character just before it is an underscore (asserted).
								//  returns: num of underscores scanned; State.m_ulStart is moved to the
								//           leftmost underscore of the run.
								//
								inline ULONG
								CToken::FindLeftmostUnderscore(CTokenState& State)
								{
								    Assert(m_State.m_ulStart < State.m_ulStart);
								    Assert(State.m_ulStart <= State.m_ulEnd);
								    Assert(State.m_ulEnd <= m_State.m_ulEnd);
								    Assert( TEST_PROP(GET_PROP(State.m_pwcsToken[State.m_ulStart-1]), PROP_UNDERSCORE) );

								    //
								    // The index is declared outside the loop because it is needed after it;
								    // relying on a for-loop variable staying in scope is a non-standard
								    // compiler extension.
								    //
								    ULONG ul = State.m_ulStart;

								    while ((ul > m_State.m_ulStart) &&
								           (TEST_PROP(GET_PROP(State.m_pwcsToken[ul-1]), PROP_UNDERSCORE) ))
								    {
								        ul--;
								    }

								    ULONG ulNumUnderscores = State.m_ulStart - ul;

								    State.m_ulStart = ul;

								    //
								    // return num of underscores scanned
								    //
								    return (ulNumUnderscores);

								} // CToken::FindLeftmostUnderscore


								//
								//  Extends State to the right over the run of underscores that starts at
								//  State.m_ulEnd, without going past m_State.m_ulEnd.
								//
								//  Assumes: on entry State.m_ulEnd is the index just past the last
								//           alphanumeric in token, and the character at that index is an
								//           underscore (asserted).
								//  returns: num of underscores scanned; State.m_ulEnd is moved to just past
								//           the rightmost underscore of the run.
								//
								inline ULONG
								CToken::FindRightmostUnderscore(CTokenState& State)
								{
								    Assert(m_State.m_ulStart <= State.m_ulStart);
								    Assert(State.m_ulStart <= State.m_ulEnd);
								    Assert(State.m_ulEnd < m_State.m_ulEnd);
								    Assert( TEST_PROP(GET_PROP(State.m_pwcsToken[State.m_ulEnd]), PROP_UNDERSCORE) );

								    //
								    // The index is declared outside the loop because it is needed after it;
								    // relying on a for-loop variable staying in scope is a non-standard
								    // compiler extension.
								    //
								    ULONG ul = State.m_ulEnd;

								    while ((ul < m_State.m_ulEnd) &&
								           (TEST_PROP(GET_PROP(State.m_pwcsToken[ul]), PROP_UNDERSCORE) ))
								    {
								        ul++;
								    }

								    ULONG ulNumUnderscores = ul - State.m_ulEnd;

								    State.m_ulEnd = ul;

								    //
								    // return num of underscores scanned
								    //
								    return (ulNumUnderscores);

								} // CToken::FindRightmostUnderscore


								///////////////////////////////////////////////////////////////////////////////

								// Class CTokenizer

								///////////////////////////////////////////////////////////////////////////////


								//
								// The tokenizer: reads text from a TEXT_SOURCE, and reports what it finds to
								// a word sink and a phrase sink.  BreakText() is the only public operation;
								// everything else is protected so that derived tokenizers can reuse it, and
								// OutputHyphenation / OutputSimpleToken are virtual so they can be overridden.
								//
								class CTokenizer
								{
								public:

								    CTokenizer(
								        TEXT_SOURCE* pTxtSource,
								        IWordSink   * pWordSink,
								        IPhraseSink * pPhraseSink,
								        LCID lcid,
								        BOOL bQueryTime,
								        ULONG ulMaxTokenSize);

								    // Virtual because the class has virtual methods and is meant to be
								    // derived from.  The body is empty: nothing is freed explicitly here.
								    virtual ~CTokenizer(void)
								    {
								    }

								    void BreakText();

								protected:

								    //
								    // methods
								    //

								    void ProcessToken();
								    void ProcessTokenInternal();
								    void BreakCompundString(CTokenState& State, CPropFlag& prop);

								    // Buffer handling (both defined inline below the class).
								    HRESULT FillBuffer();
								    void CalculateUpdateEndOfBuffer();

								    bool CheckAndCreateNumber(
								                         WCHAR* pwcsStr,
								                         ULONG ulLen,
								                         WCHAR* pwcsOut,
								                         ULONG* pulOffsetToTxt,
								                         ULONG* pulOutLen);

								    int CheckAndCreateNumber(
								                         WCHAR* pwcsStr,
								                         ULONG ulLen,
								                         WCHAR wchSDecimal,
								                         WCHAR wchSThousand,
								                         WCHAR* pwcsOut,
								                         ULONG* pulOffsetToTxt,
								                         ULONG* pulOutLen);

								    short ConvertHexCharToNumber(WCHAR wch);
								    void GetValuesFromDateString(
								        CDateTerm* pFormat,
								        WCHAR* pwcsDate,
								        LONG* plD_M1,     // we can't tell in this stage whether this is a Day or a month.
								        LONG* plD_M2,
								        LONG* plYear);

								    void GetValuesFromTimeString(
								        CTimeTerm* pFormat,
								        WCHAR* pwcsTime,
								        LONG* plHour,
								        LONG* plMin,
								        LONG* plSec,
								        TimeFormat* pAmPm);

								    LONG ConvertCharToDigit(WCHAR wch);
								#ifdef DEBUG
								    void TraceToken();
								#endif // DEBUG

								    // Token classification.
								    bool VerifyAlphaUrl();
								    bool VerifyWwwUrl();
								    bool VerifyAcronym();
								    bool VerifyAbbreviation();
								    bool VerifySpecialAbbreviation();
								    bool VerifyHyphenation();
								    bool VerifyParens();
								    const CCliticsTerm* VerifyClitics(CTokenState& State);
								    bool VerifyNumber(CTokenState& State);
								    bool VerifyNumberOrTimeOrDate();
								    bool VerifyTime(CTokenState& State);
								    bool VerifyDate(CTokenState& State);
								    bool VerifyCurrency();
								    bool VerifyMisc();
								    bool VerifyCommersialSign();

								    void ProcessDefault();

								    // Underscore handling.
								    ULONG
								    AddBackUnderscores(
								        IN CTokenState& State,
								        IN bool hasFrontUnderscore,
								        IN bool hasBackUnderscore
								        );
								    bool CheckAndRemoveOneSidedUnderscores(CTokenState& State);

								    // Output routines, one per token kind.
								    void OutputUrl(
								                CTokenState& State);
								    void OutputAcronym(
								                CTokenState& State,
								                const CCliticsTerm* pCliticsTerm);
								    void OutputAbbreviation(
								                CTokenState& State);
								    void OutputSpecialAbbreviation(
								                CTokenState& State,
								                CAbbTerm* pTerm,
								                const CCliticsTerm* pCliticsTerm);
								    virtual void OutputHyphenation(
								                CTokenState& State,
								                const CCliticsTerm* pCliticsTerm);
								    void OutputParens(
								                CTokenState& State);
								    void OutputNumbers(
								                CTokenState& State,
								                ULONG ulLen,
								                WCHAR* pwcsNumber,
								                const CCliticsTerm* pCliticsTerm);
								    void OutputTime(
								                WCHAR* pwcsTime,
								                CTokenState& State);
								    void OutputDate(
								                WCHAR* pwcsDate1,
								                WCHAR* pwcsDate2,
								                CTokenState& State);
								    virtual void OutputSimpleToken(
								                CTokenState& State,
								                const CCliticsTerm* pTerm);
								    void OutputCurrency(
								                ULONG ulLen,
								                WCHAR* pwcsCurrency,
								                CTokenState& State,
								                const CCliticsTerm* pTerm);
								    void OutputMisc(
								                CTokenState& State,
								                bool bPatternContainOnlyUpperCase,
								                ULONG ulSuffixSize,
								                const CCliticsTerm* pCliticsTerm);
								    void OutputCommersialSignToken(CTokenState& State);

								    //
								    // members
								    //

								    LCID m_Lcid;
								    CAutoClassPointer<CLangSupport> m_apLangSupport;

								    CToken* m_pCurToken;
								    CToken m_Token;

								    //
								    // In DEBUG builds the word sink is wrapped by the tracing sink; both
								    // variants are used with the same `m_apWordSink->...` syntax.
								    //
								#if defined(DEBUG)
								    CTraceWordSink m_apWordSink;
								#else
								    CComPtr<IWordSink> m_apWordSink;
								#endif
								    CComPtr<IPhraseSink> m_apPhraseSink;
								    TEXT_SOURCE* m_pTxtSource;

								    BOOL m_bQueryTime;

								    // Computed by CalculateUpdateEndOfBuffer().
								    ULONG m_ulUpdatedEndOfBuffer;
								    // Set by FillBuffer() once filling the text buffer fails.
								    bool m_bNoMoreTxt;

								    //
								    //  All Chunks in buffer have a white space
								    //
								    bool m_bWhiteSpaceGuarranteed;
								    ULONG m_ulMaxTokenSize;

								};


								//
								// Asks the text source for more text and recomputes the end-of-buffer marker.
								//
								// Returns WBREAK_E_END_OF_TEXT when the source has failed to deliver text and
								// the current buffer is used up (iCur >= iEnd); S_OK otherwise.
								//
								inline HRESULT CTokenizer::FillBuffer()
								{
								    Trace(
								        elVerbose,
								        s_tagTokenizer,
								        ("WBreakGetNextChar: Filling the buffer"));

								    if (!m_bNoMoreTxt)
								    {
								        //
								        // Normally a single call is enough.  The call is repeated for as long
								        // as the source reports success but leaves the buffer empty, because
								        // the code that follows assumes that success means a non-empty buffer.
								        //
								        for (;;)
								        {
								            const HRESULT hrFill = m_pTxtSource->pfnFillTextBuffer(m_pTxtSource);

								            if (FAILED(hrFill))
								            {
								                m_bNoMoreTxt = true;
								                break;
								            }

								            if (m_pTxtSource->iEnd > m_pTxtSource->iCur)
								            {
								                break;
								            }
								        }
								    }

								    if (m_bNoMoreTxt && m_pTxtSource->iCur >= m_pTxtSource->iEnd)
								    {
								        // no more text is coming and the buffer is used up.
								        return WBREAK_E_END_OF_TEXT;
								    }

								    CalculateUpdateEndOfBuffer();
								    return S_OK;
								}


								//
								// Computes m_ulUpdatedEndOfBuffer and m_bWhiteSpaceGuarranteed for the text
								// currently held by m_pTxtSource between iCur and iEnd.
								// Requires iEnd > iCur (asserted).
								//
								inline void CTokenizer::CalculateUpdateEndOfBuffer()

								{

								    //

								    // m_ulUpdatedEndOfBuffer is a marker for the last character that we can read

								    // from the current buffer before an additional call to fill buffer is needed.

								    // we use this marker to avoid terms split between two consecutive buffers.

								    // in order to achieve the above m_ulUpdatedEndOfBuffer will point to a breaker

								    // character. (the only exception to that is when we have a very long term that does

								    // not contain breaker characters).

								    //


								    //

								    // we split the buffer into chunks of at most m_ulMaxTokenSize characters. in

								    // each chunk we make sure that there is a breaker.

								    //


								    ULONG ulStartChunk = m_pTxtSource->iCur;

								    ULONG ulEndChunk ;

								    bool fLastRound = false;


								    Assert(m_pTxtSource->iEnd > m_pTxtSource->iCur);


								    // the first chunk ends m_ulMaxTokenSize characters after iCur, or at the

								    // last character of the buffer (iEnd - 1), whichever comes first.

								    ulEndChunk = m_pTxtSource->iCur + m_ulMaxTokenSize > (m_pTxtSource->iEnd - 1) ?

								            (m_pTxtSource->iEnd - 1) : m_pTxtSource->iCur + m_ulMaxTokenSize;

								    ULONG ulCur;

								    ULONG ulBreakerMarker = 0;

								    m_bWhiteSpaceGuarranteed = false;


								    while(true)

								    {

								        ulCur = ulEndChunk;


								        //

								        // per each chunk we go backward and try to find a WS. the character at

								        // ulStartChunk itself is not tested: the scan stops when it gets there.

								        //

								        while ((ulCur > ulStartChunk) &&

								               (!IS_WS(m_pTxtSource->awcBuffer[ulCur])))

								        {

								            ulCur--;

								        }


								        if (ulCur == ulStartChunk)

								        {


								            //

								            // the last chunk that we checked did not contain any WS

								            //


								            if (m_ulMaxTokenSize == (ulEndChunk - ulStartChunk))

								            {

								                //

								                // full chunk case. we look for a default breaker.

								                //


								                ulCur = ulEndChunk;


								                while ( (ulCur > ulStartChunk) &&

								                        !IS_BREAKER( m_pTxtSource->awcBuffer[ulCur] )

								                      )

								                {

								                    ulCur--;

								                }


								                //

								                // if we found a breaker then ulBreakerMarker will be set to it else

								                // the term does not contain any breakers and we set the ulBreakerMarker

								                // to the end of the chunk. this is the only case that we split terms.

								                //

								                ulBreakerMarker = ulCur > ulStartChunk ? ulCur : ulEndChunk;

								            }

								            else

								            {

								                // the chunk is shorter than m_ulMaxTokenSize.

								                if (ulStartChunk > m_pTxtSource->iCur)

								                {

								                    //

								                    // case we had a previous chunk.

								                    //


								                    //

								                    // ulStartChunk is the index just past the WS that was found in

								                    // the previous chunk.

								                    //

								                    ulBreakerMarker = ulStartChunk;

								                }

								                else

								                {

								                    // this is the first chunk: the marker is the end of the buffer.

								                    ulBreakerMarker = m_pTxtSource->iEnd;

								                }

								            }


								            break;

								        }


								        if (fLastRound)

								        {

								            //

								            // ulCur points to a WS; the marker is placed just past it. every

								            // chunk that was examined contained a WS.

								            //

								            ulBreakerMarker = ulCur + 1;

								            m_bWhiteSpaceGuarranteed = true;


								            break;

								        }


								        //

								        // move to the next chunk

								        //

								        ulStartChunk = ulCur + 1; // ulStartChunk is the index just past the WS found at ulCur

								        if (ulStartChunk + m_ulMaxTokenSize < (m_pTxtSource->iEnd - 1))

								        {

								            ulEndChunk = ulStartChunk + m_ulMaxTokenSize;


								        }

								        else

								        {

								            // the next chunk reaches the last character of the buffer, so it

								            // is the last one to examine.

								            ulEndChunk = m_pTxtSource->iEnd - 1;

								            fLastRound = true;

								        }

								    }


								    Assert(ulBreakerMarker <= m_pTxtSource->iEnd);

								    m_ulUpdatedEndOfBuffer = ulBreakerMarker;


								}


								//
								// Converts a hexadecimal digit character to its numeric value (0..15).
								//
								// wch must be a valid HEX character: '0'-'9', 'A'-'F', 'a'-'f', or one of the
								// corresponding characters in the ranges 0xFF10-0xFF19, 0xFF21-0xFF26 and
								// 0xFF41-0xFF46.  The ranges are tested in ascending order of their upper
								// bounds; the lower bounds are only asserted.
								//
								inline short CTokenizer::ConvertHexCharToNumber(WCHAR wch)
								{
								    Assert(wch >= L'0');

								    // '0' - '9'
								    if (wch <= L'9')
								    {
								        return (wch - L'0');
								    }

								    // 'A' - 'F'
								    if (wch <= L'F')
								    {
								        Assert(wch >= L'A');
								        return (wch - L'A' + 10);
								    }

								    // 'a' - 'f'
								    if (wch <= L'f')
								    {
								        Assert(wch >= L'a');
								        return (wch - L'a' + 10);
								    }

								    // 0xFF10 - 0xFF19
								    if (wch <= 0xFF19)
								    {
								        Assert(wch >= 0xFF10);
								        return (wch - 0xFF10);
								    }

								    // 0xFF21 - 0xFF26
								    if (wch <= 0xFF26)
								    {
								        Assert(wch >= 0xFF21);
								        return (wch - 0xFF21 + 10);
								    }

								    // 0xFF41 - 0xFF46
								    Assert((wch >= 0xFF41) && (wch <= 0xFF46));
								    return (wch - 0xFF41 + 10);
								}


								//
								// Converts a decimal digit character to its numeric value (0..9).
								// wch must be '0'-'9' or a full width digit in 0xFF10-0xFF19 (asserted).
								//
								inline LONG CTokenizer::ConvertCharToDigit(WCHAR wch)
								{
								    Assert((wch >= L'0' && wch <= L'9') || ((wch >= 0xFF10) && (wch <= 0xFF19)));

								    // anything above '9' is taken to be a full width character.
								    const bool fAsciiDigit = (wch <= L'9');

								    return fAsciiDigit ? (wch - L'0') : (wch - 0xFF10);
								}


								#endif _TOKENIZER_H_