windows-server-2003/inetcore/mshtml/tried/triedit/lexer.cpp

// Copyright (c)1997-1999 Microsoft Corporation, All Rights Reserved
/* copied from ..\htmed\lexer.cpp */

/*++

  Copyright (c) 1995 Microsoft Corporation

  File: lexer.cpp

  Abstract:
        Nitty Gritty Lexer stuff

  Contents:
      SetValueSeen()
      IsSingleOp()
      IsWhiteSpace()
      MapToken()
      FindEndTag()
      MakeSublang()
      SetLanguage()
      FindTable()
      FindTable()
      RemoveTable()
      MakeTableSet()
      GetToken()
      IfHackComment()
      FindServerScript()
      FindEndComment()
      FindEndEntity()
      FindEntityRef()
      FindValue()
      FindEndString()
      FindTagOpen()
      FindText()
      FindNextToken()
      GetTextHint()
      GetHint()
      GetTokenLength()
      GetValueTokenLength()
      IsElementName()
      IsAttributeName()
      IsIdentifier()
      IsUnknownID()
      IsNumber()
      CColorHtml::SetTable()
      CColorHtml::InitSublanguages()

  History:
      2/14/97   cgomes:   Created


--*/

#include "stdafx.h"

#include "resource.h"
#include "guids.h"
#include "token.h"
#include "table.h"
#include "lexer.h"

// Forward declaration; the definition appears later in this file.
UINT FindClientScriptEnd(LPCTSTR pchLine, UINT cbLen, UINT cbCur, DWORD * plxs, TXTB & token);

// Route this file's ASSERT through _ASSERTE.
#undef ASSERT
#define ASSERT(b) _ASSERTE(b)
// HACK: we keep a copy of a ptr to the ASP table and sublang
// so we can do special behavior for ASP files
CTableSet* g_ptabASP = 0;
PSUBLANG g_psublangASP = 0;

// Table sets, indexed by sublanguage slot.  SetLanguage() stores the table
// whose name matches the default language in slot 0.
PTABLESET g_arpTables[CV_MAX+1];

// NOTE: added to handle value tokens properly.
UINT GetValueTokenLength(LPCTSTR pchLine, UINT cbLen, UINT cbCur);

// Mark the state transition from "value" to "next attribute".
//
// If the inValue bit is set in *plxs it is replaced by inAttribute and TRUE
// is returned; otherwise *plxs is left untouched and FALSE is returned.
inline int SetValueSeen(DWORD *plxs)
{
    if (!(*plxs & inValue))
        return FALSE;

    *plxs = (*plxs & ~inValue) | inAttribute;
    return TRUE;
}

// REVIEW (walts) - need better way
//
// Examine the start of a LANGUAGE attribute value and record the script
// language in *plxs.  Exactly one of inJavaScript / inVBScript / inServerASP
// ends up set when the text begins (case-insensitively) with a known language
// name; *plxs is left unchanged when nothing matches.
inline void SetScriptLanguage(LPCTSTR pchLine, DWORD *plxs)
{
    // "serverasp" is triedit's special dummy language.  It is set when we
    // convert server-side scripts into client-side scripts; if we find it as
    // the language we set inServerASP.  It is reset (removed) in FindNextToken().
    const struct
    {
        LPCTSTR szPrefix;   // language name to look for
        DWORD   lxsLang;    // state bit recorded on a match
    } rgLang[] =
    {
        { _T("javascript"), inJavaScript },
        { _T("vbscript"),   inVBScript   },
        { _T("serverasp"),  inServerASP  },
    };
    const DWORD lxsAllLangs = inJavaScript | inVBScript | inServerASP;

    // The language attribute may have quotes around it; if so, step past the
    // opening quote.
    //      ex. <SCRIPT LANGUAGE="VBScript">
    if (_T('"') == *pchLine)
        ++pchLine;

    // First matching prefix wins (same order as the table above).
    for (int iLang = 0; iLang < 3; iLang++)
    {
        if (_tcsnicmp(pchLine, rgLang[iLang].szPrefix, lstrlen(rgLang[iLang].szPrefix)) == 0)
        {
            *plxs = (*plxs & ~lxsAllLangs) | rgLang[iLang].lxsLang;
            return;
        }
    }
}

// TRUE when hint lies in the half-open range [tokOP_SINGLE, tokOP_MAX).
inline BOOL IsSingleOp(HINT hint)
{
    if (hint < tokOP_SINGLE)
        return FALSE;
    return (hint < tokOP_MAX);
}

// Thin wrapper over _istspace(); returns its result unchanged (nonzero for
// whitespace).
inline BOOL IsWhiteSpace(TCHAR c)
{
    const BOOL fSpace = _istspace(c);
    return fSpace;
}


// NOTE: Added to handle value tokens properly
inline IsValueChar(TCHAR ch)
{
    // REVIEW(cgomes): specify all the invalid value characters
    return ch != _T('<') && ch != _T('>');
};

////////////////////////////////////////////////////////////////////////////
//
// map parsed token to returned token

// Pairs of (parsed token class, returned token class), terminated by a 0
// entry.  MapToken() scans every pair up to the terminator, so the lookup
// does not depend on the order of the left column.
static TOKEN _rgTokenMap[] =
{
    tokName,     tokSpace,
    tokNum,      tokSpace,
    tokParEnt,   tokSpace,
    tokResName,  tokSpace,
    0,           0
};

// Translate the token class produced by the scanner into the class that is
// reported to the caller:
//   - any single-operator class becomes tokOp;
//   - tokTag seen while inside a server-side script tag (inHTXTag) becomes tokSSS;
//   - classes listed in _rgTokenMap become their mapped class;
//   - everything else is returned unchanged.
static TOKEN MapToken(TOKEN tokClass, DWORD lxs)
{
    if (IsSingleOp((HINT)tokClass))
        return tokOp;
    else if ((tokClass == tokTag) && (lxs & inHTXTag))
        return tokSSS;

    // The previous loop kept going only while "_rgTokenMap[i] >= tokClass".
    // With the left column in ascending order that stops as soon as the first
    // entry is smaller than tokClass, so later entries could never match.
    // Scan to the terminator instead; the table is tiny.
    for (int i = 0; _rgTokenMap[i] != 0; i += 2)
    {
        if (_rgTokenMap[i] == tokClass)
            return _rgTokenMap[i + 1];
    }
    return tokClass;
}

////////////////////////////////////////////////////////////////////////////

// Scan special text (a TEXTMASK state) for the matching end tag.
//
// Builds "</name>" from the element returned by pellexFromTextState(*plxs)
// and scans forward from cbCur:
//   - on the end tag: clears the TEXTMASK bits and stops in front of it;
//   - on "<%": sets inHTXTag and stops in front of it;
//   - otherwise runs to cbLen.
// token.ibTokMac and the return value are the offset where scanning stopped.
// If the end tag cannot be built (no element for the state, or the name does
// not fit the local buffer) nothing is consumed and cbCur is returned.
UINT FindEndTag(LPCTSTR pchLine, UINT cbLen, UINT cbCur, DWORD *plxs, TXTB & token)
{
    ASSERT(pchLine);
    TCHAR szEnd[16];
    // Capacity in characters; sizeof(szEnd) alone is a byte count and is too
    // large when TCHAR is wider than one byte.
    const UINT cchEndMax = (UINT)(sizeof(szEnd) / sizeof(szEnd[0]));

    ELLEX * pellex = pellexFromTextState(*plxs);
    ASSERT(0 != pellex); // shouldn't be called with something other than special text state
    if (0 == pellex)
    {
        // ASSERT compiles away in release builds; do not dereference null.
        token.ibTokMac = cbCur;
        return cbCur;
    }

    UINT cbCmp = 3 + pellex->cb; // length of end tag
    ASSERT(cbCmp < cchEndMax);
    if ((cbCmp >= cchEndMax) || ((UINT)lstrlen(pellex->sz) + 3 >= cchEndMax))
    {
        // "</" + name + ">" + terminator would overflow szEnd.
        token.ibTokMac = cbCur;
        return cbCur;
    }

    _tcscpy(szEnd, _T("</"));
    _tcscat(szEnd, pellex->sz);
    _tcscat(szEnd, _T(">"));

    while (cbCur < cbLen)
    {
        if (_T('<') == pchLine[cbCur])
        {
            if ((cbLen - cbCur >= cbCmp) && (0 == _tcsnicmp(szEnd, &pchLine[cbCur], cbCmp)))
            {
                *plxs &= ~TEXTMASK; // special text modes are exclusive
                break;
            }
            else if ((cbCur + 1 < cbLen) && (_T('%') == pchLine[cbCur+1]))
            {
                // server-side script starts here; let the caller lex it
                *plxs |= inHTXTag;
                break;
            }
            else
                cbCur++;
        }
        else
            cbCur += _tclen(&pchLine[cbCur]);
    }
    token.ibTokMac = cbCur;
    return cbCur;
}

////////////////////////////////////////////////////////////////////////////

// Fill in a SUBLANG entry.
//
// The scalar members are always written.  szSubLang receives a freshly
// allocated copy of strName; it stays NULL (and FALSE is returned) when the
// name is empty or the allocation yields NULL.
BOOL MakeSublang(PSUBLANG ps, UINT id, const TCHAR *strName, UINT nIdTemplate, CLSID clsid)
{
    ASSERT( NULL != ps );

    ps->szSubLang     = NULL;
    ps->lxsInitial    = LxsFromSubLangIndex(id);
    ps->nIdTemplate   = nIdTemplate;
    ps->clsidTemplate = clsid;

    const int cchName = lstrlen(strName);
    if (0 == cchName)
        return FALSE;

    LPTSTR szCopy = new TCHAR [cchName + 1];
    if (NULL == szCopy)
        return FALSE;

    _tcscpy(szCopy, strName);
    ps->szSubLang = szCopy;
    return TRUE;
}

// Set sublang and tableset array members,
// putting the default one in 0th position.
//
// A table whose name equals strDefault goes into slot 0 and also becomes
// g_pTable; any other table goes into slot 'index', which is then advanced.
// If the sublang entry cannot be created the table is deleted.
// A NULL pTab is ignored.
//
void SetLanguage(TCHAR * strDefault, PSUBLANG rgSublang,
                 PTABLESET pTab, UINT & index, UINT nIdTemplate, CLSID clsid)
{
    if (NULL == pTab)
        return;

    int iSlot = 0;
    if (lstrcmp(strDefault, pTab->Name()) != 0)
        iSlot = index;

    if (!MakeSublang(rgSublang + iSlot, iSlot, pTab->Name(), nIdTemplate, clsid))
    {
        delete pTab;
        return;
    }

    g_arpTables[iSlot] = pTab;
    if (0 == iSlot)
        g_pTable = pTab;
    else
        index++;
}

// Look up a table set by name in a NULL-terminated array.
// Returns the first entry whose Name() equals strName, or NULL if none does.
//
// Names are compared by content with lstrcmp(), the same way SetLanguage()
// compares them.  The previous "Name() == strName" test is an address
// comparison whenever Name() yields a plain character pointer.
// NOTE(review): the declaration of Name() is not visible from this file --
// confirm its return type.
CTableSet * FindTable(CTableSet ** rgpts, TCHAR *strName)
{
    for (int n = 0; rgpts[n]; n++)
    {
        if (lstrcmp(rgpts[n]->Name(), strName) == 0)
            return rgpts[n];
    }
    return NULL;
}

// Look up a table set by pointer identity in a NULL-terminated array.
// Returns pts when it is present in rgpts, NULL otherwise.
CTableSet * FindTable(CTableSet ** rgpts, CTableSet * pts)
{
    CTableSet ** ppCur = rgpts;
    while (*ppCur)
    {
        if (*ppCur == pts)
            return *ppCur;
        ++ppCur;
    }
    return NULL;
}

// Remove the first occurrence of pts from a NULL-terminated array by
// shifting every later entry (including the terminator) down one slot.
// Does nothing when pts is not present.
void RemoveTable(CTableSet ** rgpts, CTableSet *pts)
{
    // locate the entry
    int iFound = 0;
    while (rgpts[iFound] && rgpts[iFound] != pts)
        ++iFound;

    if (!rgpts[iFound])
        return;     // hit the terminator: not in the array

    // close the gap
    for (int i = iFound; rgpts[i]; ++i)
        rgpts[i] = rgpts[i + 1];
}

// Allocate a new CStaticTableSet for the given attribute/name id.
// The first parameter is unused.
CTableSet * MakeTableSet(CTableSet ** /*rgpts*/, RWATT_T att, UINT nIdName)
{
    CTableSet * ptsNew = new CStaticTableSet(att, nIdName);
    return ptsNew;
}

////////////////////////////////////////////////////////////////////////
// GetToken()
//
// Scan one token starting at pchLine[cbCur].
//
// Picks a scanner from the lexer state in *plxs (server script, client
// script body, comment, string, or general), lets it fill in 'token' and
// update *plxs, then maps token.tokClass through MapToken().
// Returns the offset reported by the scanner that ran.
//
UINT GetToken(LPCTSTR pchLine, UINT cbLen, UINT cbCur, DWORD * plxs, TXTB & token)
{
    ASSERT (cbCur < cbLen);
    // NOTE(review): the assert requires cbCur < cbLen but this guard only
    // rejects cbCur > cbLen, so cbCur == cbLen is still scanned in release
    // builds.  Confirm against the callers before tightening.
    if (cbCur > cbLen)
        return cbCur;

    // start a fresh token at the current position
    token.tok = 0;
    token.ibTokMin = cbCur;

    UINT cbNext = 0;
    const DWORD lxsIn = *plxs;

    // NOTE that we want to skip tokenizing scripts that are special to triedit:
    // when we wrap server-side scripts in client-side scripts, we set a dummy
    // language 'serverasp', and inServerASP is set in that case.
    const BOOL fClientScriptBody =
        (lxsIn & inSCRIPT) && !(lxsIn & inTag) && !(lxsIn & inServerASP);

    if (lxsIn & inHTXTag)
    {
        cbNext = FindServerScript(pchLine, cbLen, cbCur, plxs, token);
    }
    else if (fClientScriptBody)
    {
        cbNext = FindClientScriptEnd(pchLine, cbLen, cbCur, plxs, token);
    }
    else if (lxsIn & inComment)  // in a comment
    {
        if (*plxs & inSCRIPT)
            *plxs |= inScriptText;

        switch (IfHackComment(pchLine, cbLen, cbCur, plxs, token))
        {
        case CT_METADATA:
            // Treat as an element, then drop inBangTag
            cbNext = FindNextToken(pchLine, cbLen, cbCur, plxs, token);
            *plxs &= ~inBangTag;
            break;

        case CT_IECOMMENT:
            cbNext = token.ibTokMac;
            break;

        default:
            cbNext = FindEndComment(pchLine, cbLen, cbCur, plxs, token);
            break;
        }
    }
    else if (lxsIn & INSTRING)  // in a string
    {
        cbNext = FindEndString(pchLine, cbLen, cbCur, plxs, token);
    }
    else
    {
        cbNext = FindNextToken(pchLine, cbLen, cbCur, plxs, token);
    }

    token.tokClass = MapToken(token.tokClass, *plxs);
    return cbNext;
}

///////////////////////////////////////////////////////////////////////////////////
// IfHackComment
//
// Probe ahead in the current line to see if we have what IE recognizes
// as the end of a comment ("->"). This does not conform to RFC 1866 or SGML,
// but supports browser behavior. This lets us tolerate comments of the
// form: "<!--- whatever ->"
// (note how it ends)
//
// Also recognizes the word METADATA inside the comment, in which case the
// comment is turned into a tag (inComment/inBangTag cleared, inTag set).
//
// Returns a COMMENTTYPE enum.
//  CT_NORMAL    (0)  if normal comment
//  CT_IECOMMENT (1)  if IE comment
//  CT_METADATA  (-1) if METADATA comment
//
// token.tokClass is always set to tokComment; token.ibTokMac is set only
// when something other than CT_NORMAL is returned.
//
// Proper comments are scanned using FindEndComment().
//
COMMENTTYPE IfHackComment(LPCTSTR pchLine, UINT cbLen, UINT cbCur, DWORD * plxs, TXTB & token)
{
    static const TCHAR szMetadata[] = _T("METADATA");
    // length in characters, excluding the terminator
    const UINT cchMetadata = (UINT)(sizeof(szMetadata) / sizeof(szMetadata[0])) - 1;

    token.tokClass = tokComment;
    while (cbCur+1 < cbLen)
    {
        // Only compare when the whole keyword fits inside the line, so the
        // comparison never looks past cbLen (FindEndTag does the same).
        if ((cbLen - cbCur >= cchMetadata) &&
            (_tcsnicmp(&pchLine[cbCur], szMetadata, cchMetadata) == 0))
        {
            token.ibTokMac = cbCur + 1; // include second dash??
            *plxs &= ~inComment;
            // Remove inBangTag
            *plxs &= ~inBangTag;
            *plxs |= inTag;
            return CT_METADATA; // METADATA
        }
        else if (pchLine[cbCur] == '-' && pchLine[cbCur + 1] == '>')
        {
            // token ends on the '>' itself; the '>' is left for the next scan
            token.ibTokMac = cbCur + 1;
            *plxs &= ~inComment;
            *plxs &= ~inScriptText;
            return CT_IECOMMENT;
        }
        else
        {
            cbCur += _tclen(&pchLine[cbCur]);
        }
    }
    return CT_NORMAL;
}


// Scan one token of server-side script ("<% ... %>").
//
//   - at "<%": returns the 2-character open token and sets inHTXTag;
//   - at "%>": returns the 2-character close token and clears inHTXTag and
//     inNestedQuoteinSSS;
//   - otherwise: returns the run of script text up to (not including) the
//     next "%>" or the end of the line, as tokSSS.
// While scanning script text, a '"' seen with inTag, inHTXTag, inAttribute
// and inString all set turns on inNestedQuoteinSSS.
UINT FindServerScript(LPCTSTR pchLine, UINT cbLen, UINT cbCur, DWORD * plxs, TXTB & token)
{
    LPCTSTR pch = &pchLine[cbCur];
    const BOOL fHasNext = (cbCur + 1 < cbLen);

    // parse HTX start tag
    if (fHasNext && pch[0] == _T('<') && pch[1] == '%')
    {
        token.tokClass = tokTag;
        token.tok      = TokTag_SSSOPEN;
        token.ibTokMac = cbCur + 2;
        *plxs |= inHTXTag;
        return token.ibTokMac;
    }

    ASSERT(*plxs & inHTXTag); // should be in HTXTag state here

    // parse HTX end tag
    if (fHasNext && pch[0] == _T('%') && pch[1] == '>')
    {
        token.tok      = TokTag_SSSCLOSE;
        token.tokClass = tokSSS; //tokTag;
        token.ibTokMac = cbCur + 2;
        *plxs &= ~inHTXTag;
        *plxs &= ~inNestedQuoteinSSS;
        return token.ibTokMac;
    }

    // script body
    token.tokClass = tokSSS;

    for (;;)
    {
        if (cbCur >= cbLen)
            break;

        if (pch[0] == _T('%') && (cbCur + 1 < cbLen) && pch[1] == _T('>'))
            break;      // stop in front of "%>"

        if (   (pch[0] == _T('"'))
            && (*plxs & inTag)
            && (*plxs & inHTXTag)
            && (*plxs & inAttribute)
            && (*plxs & inString))
        {
            *plxs |= inNestedQuoteinSSS;
        }

        const int cchStep = _tclen(pch);
        cbCur += cchStep;
        pch   += cchStep;
    }

    token.ibTokMac = cbCur;
    return cbCur;
}

///////////////////////////////////////////////////////////////////////////////////
// FindClientScriptEnd()
//
// HTMED CHANGE: Find the end of client script block
//
// At "</SCRIPT": returns the 2-character "</" token (tokTag / TokTag_END),
// clears inSCRIPT and sets inEndTag.
// Otherwise: returns the run of script text up to (not including) the next
// "</SCRIPT" or the end of the line, classified as tokSpace.
//
// The marker is matched case-insensitively and in full.  The earlier code
// measured it with wcslen() (wrong for a TCHAR array in non-Unicode builds)
// and subtracted one, so it compared only "</SCRIP" while still requiring
// eight characters to be present.
//
UINT FindClientScriptEnd(LPCTSTR pchLine, UINT cbLen, UINT cbCur, DWORD * plxs, TXTB & token)
{
    static const TCHAR rgEndScript[] = _T("</SCRIPT");
    // length in characters, excluding the terminator
    const UINT cchEndScript = (UINT)(sizeof(rgEndScript) / sizeof(rgEndScript[0])) - 1;

    LPCTSTR pCurrent = &pchLine[cbCur];
    int cb;

    if( cbCur + cchEndScript <= cbLen &&
        0 == _tcsnicmp(pCurrent, rgEndScript, cchEndScript))
    {
        token.tokClass = tokTag;
        token.tok = TokTag_END;
        *plxs &= ~inSCRIPT;
        *plxs |= inEndTag;
        token.ibTokMac = cbCur + 2;     // just the "</"
        return token.ibTokMac;
    }

    token.tokClass = tokSpace;

    while (cbCur < cbLen)
    {
        // cheap test on the first two characters before the full compare
        if (*pCurrent == _T('<') && (cbCur+1 < cbLen) && (*(pCurrent+1) == _T('/')))
        {
            // Check if found end </SCRIPT
            if( cbCur + cchEndScript <= cbLen &&
                0 == _tcsnicmp(pCurrent, rgEndScript, cchEndScript))
            {
                break;
            }
        }
        cb = _tclen(pCurrent);
        cbCur += cb;
        pCurrent += cb;
    }

    token.ibTokMac = cbCur;
    return cbCur;
}

///////////////////////////////////////////////////////////////////////////////////
// FindEndComment()
//
// Find the end of comment ("--").
//
// Scans from cbCur for two consecutive dashes.  When found, the token ends
// just past the second dash and inComment is cleared; otherwise the token
// runs to the end of the line and the comment state is kept.
//
UINT FindEndComment(LPCTSTR pchLine, UINT cbLen, UINT cbCur, DWORD * plxs, TXTB & token)
{
    ASSERT(*plxs & inComment); // must be in a comment now

    token.tokClass = tokComment;

    LPCTSTR pch = &pchLine[cbCur];
    BOOL fClosed = FALSE;
    int cchStep;

    while (cbCur < cbLen)
    {
        if (*pch != _T('-'))
        {
            // ordinary character: skip it
            cchStep = _tclen(pch);
            cbCur += cchStep;
            pch   += cchStep;
            continue;
        }

        // first "-" of a possible "--": step over it and look at the next one
        ++pch;
        ++cbCur;
        if ((cbCur < cbLen) && (*pch == _T('-')))
        {
            fClosed = TRUE;
            break;
        }
    }

    if (fClosed)
    {
        // consume the second dash as well
        cchStep = _tclen(pch);
        cbCur += cchStep;
        pch   += cchStep;
    }

    token.ibTokMac = cbCur;

    // reset state if we reach end of comment
    if (fClosed)
        *plxs &= ~inComment;

    return cbCur;
}

/////////////////////////////////////////////////////////////
// FindEndEntity()
//
// Find the end of the special character sequence (ends with ; or whitespace).
//
// The token is the identifier measured by GetTokenLength(), plus a trailing
// ';' when one follows it inside the line.  Sets token.tokClass to tokEntity
// and returns the end offset (also stored in token.ibTokMac).
//
UINT FindEndEntity(LPCTSTR pchLine, UINT cbLen, UINT cbCur, DWORD * /*plxs*/, TXTB & token)
{
    token.tokClass = tokEntity;
    int cb = GetTokenLength(pchLine, cbLen, cbCur);
    // Only look at the following character if it is still inside the line.
    if ((cbCur + cb < cbLen) && (pchLine[cbCur + cb] == ';'))
        cb++;
    token.ibTokMac = cbCur + cb;
    return token.ibTokMac;
}

/////////////////////////////////////////////////////////////
// Find an entity reference or non-entity ref, literal "&..."
//
// Must be called with pchLine[cbCur] == '&'.
//   - "&#ddd"  : numeric reference, accepted when it has at most 3 digits
//                and a value <= 255; any alphanumerics that follow and one
//                delimiter character are included in the token.
//   - "&name"  : accepted when g_pTable->FindEntity() knows the name; a
//                following ';' or whitespace character is included.
//   - otherwise: the '&' alone is returned as plain text (tokIDENTIFIER).
// Returns the end offset of the token (also stored in token.ibTokMac).
//
UINT FindEntityRef(LPCTSTR pchLine, UINT cbLen, UINT cbCur, DWORD * /*plxs*/, TXTB & token)
{
    ASSERT(cbCur < cbLen);
    ASSERT(pchLine[cbCur] == '&'); // must be on ERO

    cbCur++;
    if (cbCur == cbLen)
    {
NotEntity:
        token.tokClass = tokIDENTIFIER; // plain text
        token.ibTokMac = cbCur;
        return cbCur;
    }

    if (pchLine[cbCur] == '#')
    {
        // parse and check valid number
        if (!IsNumber(pchLine, cbLen, cbCur + 1, token))
            goto NotEntity;

        // must be <= 3 digits
        if (token.ibTokMac - (cbCur + 1) > 3)
            goto NotEntity;

        // validate range
        TCHAR szNum[4];
        _tcsncpy(szNum, &pchLine[cbCur + 1], 3);
        // _tcsncpy does not terminate the destination when the source has
        // three or more characters; terminate explicitly before parsing.
        szNum[3] = _T('\0');
        if (_tcstoul(szNum, 0, 10) > 255)
            goto NotEntity;

        // we now have a valid numeric entity ref

        token.tokClass = tokEntity;
        cbCur = token.ibTokMac;

        // scan for end of entity ref

        // scan rest of alphanumeric token
        // REVIEW: Is this correct? IE 4.40.308 behaves this way
        while ((cbCur < cbLen) && IsCharAlphaNumeric(pchLine[cbCur]))
            cbCur++;

        // scan delimiter
        if (cbCur < cbLen)
            cbCur++;
        token.ibTokMac = cbCur;
        return cbCur;
    }
    else if (!IsCharAlpha(pchLine[cbCur]))
    {
        goto NotEntity;
    }
    else
    {
        // parse and check entity name
        UINT nLen = GetTokenLength(pchLine, cbLen, cbCur);
        if (!g_pTable->FindEntity(&pchLine[cbCur], nLen))
            goto NotEntity;

        cbCur += nLen;
        // eat delimiter if necessary
        if ((cbCur < cbLen) &&
            (pchLine[cbCur] == ';' || IsWhiteSpace(pchLine[cbCur])))
            cbCur++;
        token.tokClass = tokEntity;
        token.ibTokMac = cbCur;
        return cbCur;
    }
}


/////////////////////////////////////////////////////////////
// FindValue
// Find the end of an unquoted value.
//
// Scan for whitespace or end of tag ('>').  The character at cbCur is
// always consumed.  The token is classified tokValue and the state moves
// from inValue to inAttribute.
//
UINT FindValue(LPCTSTR pchLine, UINT cbLen, UINT cbCur, DWORD * plxs, TXTB & token)
{
    ASSERT(cbCur < cbLen);

    // the first character is part of the value unconditionally
    ++cbCur;
    while (cbCur < cbLen &&
           !IsWhiteSpace(pchLine[cbCur]) &&
           pchLine[cbCur] != '>')
    {
        ++cbCur;
    }

    token.tokClass = tokValue;
    token.ibTokMac = cbCur;

    // switch from value to attribute
    *plxs = (*plxs & ~inValue) | inAttribute;

    return cbCur;
}

/////////////////////////////////////////////////////////////
// FindEndString()
// Find the end of the string.
// Should only be called when we are in the string mode already.
//
// The closing delimiter is ' when inStringA is set, " otherwise.
//   - delimiter found: it is included in the token, the INSTRING bits are
//     cleared and SetValueSeen() is applied;
//   - "<%" found: the token stops in front of it and inHTXTag is set;
//   - otherwise the token runs to the end of the line.
//
UINT FindEndString (LPCTSTR pchLine, UINT cbLen, UINT cbCur, DWORD * plxs, TXTB & token)
{
    ASSERT (*plxs & INSTRING); // must be in a string now

    token.tokClass = tokString;
    const TCHAR chClose = (*plxs & inStringA) ? _T('\'') : _T('"');

    LPCTSTR pch = &pchLine[cbCur];
    while (cbCur < cbLen)
    {
        if (*pch == chClose)
        {
            *plxs &= ~INSTRING;
            SetValueSeen(plxs);
            // consume the delimiter and finish
            const int cchDelim = _tclen(pch);
            cbCur += cchDelim;
            break;
        }

        if (*pch == _T('<') &&
            cbCur + 1 < cbLen &&
            pch[1] == _T('%'))
        {
            // embedded server script: leave it for the next token
            *plxs |= inHTXTag;
            break;
        }

        const int cchStep = _tclen(pch);
        cbCur += cchStep;
        pch   += cchStep;
    }

    token.ibTokMac = cbCur;
    return cbCur;
}

//////////////////////////////////////////////////////////////////
// FindTagOpen()
//
// Scan a tag opener starting at '<'.  The character after '<' selects the
// kind of tag and the state bit that is set:
//      "<!"  inBangTag     "</"  inEndTag
//      "<?"  inPITag       "<%"  inHTXTag
//      "<" + letter        inTag (only the '<' is consumed)
// A '<' at the very end of the line sets inTag; a '<' followed by anything
// else is reported as tokIDENTIFIER.  inScriptText is always cleared.
//
UINT FindTagOpen(LPCTSTR pchLine, UINT cbLen, UINT cbCur, DWORD * plxs, TXTB & token)
{
    ASSERT(pchLine[cbCur] == '<');
    token.tokClass = tokTag;
    *plxs &= ~inScriptText;     // turn off script coloring when inside tags
    cbCur++;

    // '<' is the last character of the line
    if (cbCur == cbLen)
    {
        *plxs |= inTag;
        token.ibTokMac = cbCur;
        return cbCur;
    }

    // HTMED CHANGE (copied from htmed\lexer.cpp):
    // REVIEW(cgomes): Figure out if I should turn off inSCRIPT in any of the
    // following cases.  Right now I only do it for the </ case.
    const TCHAR chKind = pchLine[cbCur];

    if (chKind == '!')              // MDO - Markup Declaration Open
    {
        token.tok = TokTag_BANG;
        *plxs |= inBangTag;
        cbCur++;
    }
    else if (chKind == '/')         // End tag
    {
        token.tok = TokTag_END;
        *plxs |= inEndTag;
        cbCur++;
#ifdef NEEDED // copied from htmed\lexer.cpp
        // HTMED CHANGE:
        // REVIEW(cgomes): Colorizer bug: it never removes the inSCRIPT state
        //  This removes the inSCRIPT in the case <SCRIPT <BODY>
        //  in this case <BODY is in error.
        //
        *plxs &= ~inSCRIPT;
#endif //NEEDED
    }
    else if (chKind == '?')         // PI - Processing Instruction
    {
        // REVIEW: PI is SGML -- not in HTML, but might be added
        token.tok = TokTag_PI;
        *plxs |= inPITag;
        cbCur++;
    }
    else if (chKind == '%')         // HTX -- ODBC server HTML extension
    {
        token.tok = TokTag_SSSOPEN;
        *plxs |= inHTXTag;
        cbCur++;
    }
    else if (IsCharAlpha(chKind))   // Tag
    {
        *plxs |= inTag;
        token.tok = TokTag_START;
    }
    else
    {
        token.tokClass = tokIDENTIFIER; // NOT a TAG
    }

    token.ibTokMac = cbCur;
    return cbCur;
}

//////////////////////////////////////////////////////////////////
//  FindText
//  Scan a token of text
//      NOTE DO NOT MODIFY this function, mainly b/c the side effects
//              will be hard to find, and will break the way
//              that everything works.
//
//  The character at cbCur is always consumed; scanning then continues
//  until a NUL, a newline, '<' or '&' is reached, or the line ends.
//  The token is classified tokIDENTIFIER.  Returns the end offset, which
//  is also stored in token.ibTokMac.
//
UINT FindText(LPCTSTR pchLine, UINT cbLen, UINT cbCur, TXTB & token)
{
    //BOOL fExtraSpace = FALSE;
    //int cSpace = 0;

    ASSERT (cbCur < cbLen);

    token.tokClass = tokIDENTIFIER;

    //if (pchLine[cbCur] == ' ' && !fExtraSpace)
    //  fExtraSpace = TRUE;
    // first character belongs to the token unconditionally
    cbCur += _tclen(&pchLine[cbCur]);
    while (cbCur < cbLen)
    {
        switch (pchLine[cbCur])
        {
        // characters that end a run of text
        case _T('\0'):
        case _T('\n'):
        case _T('<'):
        case _T('&'):
            //if (cSpace > 0) // found extra spaces so remember them somewhere
            goto ret;
            break;
        //case _T(' '):
        //  if (!fExtraSpace)
        //      fExtraSpace = TRUE;
        //  else
        //      cSpace++;
        //  break;
        default:
            //if (cSpace > 0) // found extra spaces so remember them somewhere
            //cSpace = 0;
            //fExtraSpace = FALSE;
            break;
        }
        cbCur += _tclen(&pchLine[cbCur]);
    }

ret:
    token.ibTokMac = cbCur;
    return cbCur;
}

//////////////////////////////////////////////////////////////////
// FindNextToken()
//  Find the next token in the line
//
//  Two modes, selected by the INTAG bits of *plxs:
//
//  - Outside a tag (text): if a TEXTMASK state is active, FindEndTag() is
//    tried first and its result returned when it consumed something.
//    Otherwise GetTextHint() classifies the current character and the
//    token is scanned by FindTagOpen(), FindEntityRef() or FindText().
//
//  - Inside a tag: GetHint() classifies the current character and the
//    switch below scans names, values, strings, whitespace, '=' and the
//    closing '>', updating the attribute/value state bits as it goes.
//    This path returns token.ibTokMac.
//
UINT FindNextToken(LPCTSTR pchLine, UINT cbLen, UINT cbCur, DWORD * plxs, TXTB & token)
{
    ASSERT (cbCur < cbLen);
    HINT hint;

    if (!(*plxs & INTAG)) // scanning text
    {
        if (*plxs & TEXTMASK)
        {
            if (*plxs & inCOMMENT)
                token.tokClass = tokComment;
            else
                token.tokClass = tokIDENTIFIER;
            // probe for end tag </comment>
            UINT cbEnd = FindEndTag(pchLine, cbLen, cbCur, plxs, token);
            if (cbEnd > cbCur) // parsed a nonzero-length token
            {
                return cbEnd;
            }
            //else fall through to normal processing
        }
        hint = GetTextHint(pchLine, cbLen, cbCur, plxs, token);
        switch (hint)
        {
        case HTA:
            // begin a tag
            return FindTagOpen(pchLine, cbLen, cbCur, plxs, token);

        case HEN:
            // scan an entity reference
            token.ibTokMac = FindEntityRef(pchLine, cbLen, cbCur, plxs, token);
            return token.ibTokMac;

        case EOS:
        case ONL:
            // GetTextHint has already filled in the token for these
            return token.ibTokMac;

        case ERR:
        default:
            // scan text as a single token
            // If the editor uses token info for more than coloring
            //   (e.g. extended selections), then this will need to
            //   return smaller chunks.
            if (*plxs & inSCRIPT)
                *plxs |= inScriptText;
            return FindText(pchLine, cbLen, cbCur, token);
            break;
        }

        return cbCur;
    }

    ASSERT(*plxs & INTAG); // must be in a tag here

    BOOL bError = FALSE;
    hint = GetHint(pchLine, cbLen, cbCur, plxs, token);
    switch (hint)
    {
    case HTE:
        // Tag end: remove all tag state bits
        *plxs &= ~TAGMASK;
        cbCur++;
        token.tokClass = tokTag;
        token.tok = TokTag_CLOSE;
        token.ibTokMac = cbCur;
        break;

    case HNU:
#if 0  // lexing HTML instance, not a DTD!
        if (!IsNumber(pchLine, cbLen, cbCur, token))
            bError = TRUE;
        if (SetValueSeen(plxs))
            token.tokClass = tokValue;
        break;
#else
        // fall through
#endif

    case HRN: // reserved name start: #
#if 1  // lexing HTML instance, not a DTD!
        // simple nonwhitespace stream
        // Outside a value position the run is still scanned by FindValue(),
        // but reported as tokSpace instead of tokValue.
        if (!(*plxs & inValue))
            bError = TRUE;
        FindValue(pchLine, cbLen, cbCur, plxs, token);
        if (bError)
        {
            token.tokClass = tokSpace;
            bError = FALSE; //"corrected" the error
        }
#else
        cbCur++;
        if (cbCur == cbLen)
            token.tokClass = tokOp;
        else
        {
            if (IsIdChar(pchLine[cbCur]))
            {
                cbCur++;
                while (cbCur < cbLen && IsIdChar(pchLine[cbCur]))
                    cbCur++;
                token.tokClass = tokResName;
            }
            else
                token.tokClass = tokOp;
        }
        token.ibTokMac = cbCur;
        if (SetValueSeen(plxs))
            token.tokClass = tokValue;
#endif
        break;

    case HEP: // parameter entity: %
#if 1  // lexing HTML instance, not a DTD!
        goto BadChar;
#else
        cbCur++;
        if (cbCur == cbLen)
        {
            token.tokClass = tokOp;
            token.ibTokMac = cbCur;
        }
        else
        {
            if (IsIdChar(pchLine[cbCur]))
            {
                token.ibTokMac = FindEndEntity(pchLine, cbLen, cbCur, plxs, token);
                token.tokClass = tokParEnt;
            }
            else
            {
                token.ibTokMac = cbCur;
                token.tokClass = tokOp;
            }
        }
        if (SetValueSeen(plxs))
            token.tokClass = tokValue;
#endif
        break;

    // ported HTMED change (walts) -- handle some chars as valid start char for attribute values.
    case HAV:
        {
        if (!(*plxs & inTag) || !SetValueSeen(plxs))
            goto BadChar;   // not in tag or attribute value.

        int iTokenLength = GetValueTokenLength(pchLine, cbLen, cbCur);
        token.ibTokMac = token.ibTokMin + iTokenLength;
        token.tokClass = tokValue;
        break;
        }
    // ported HTMED change (walts) -- handle some chars as valid start char for attribute values.

    case HKW:  // identifier
        {
            int iTokenLength = GetTokenLength(pchLine, cbLen, cbCur);
            token.ibTokMac = token.ibTokMin + iTokenLength;
            token.tokClass = tokName;
            //FUTURE: Don't scan attributes in an end tag
            if (*plxs & (inTag|inEndTag))
            {
                if (*plxs & inAttribute)
                {
                    IsAttributeName(pchLine, cbCur, iTokenLength, token);
                    // don't change attribute/value state here
                    // we only look for values after we've seen "=" in case OEQ below

                    // REVIEW(cgomes): what if more attributes follow
                    // the SPAN??
                    // if found STARTSPAN then pretend I am not in a tag
                    if(token.tok == TokAttrib_STARTSPAN)
                        *plxs &= ~(inTag | inAttribute);
                    // if found ENDSPAN then goback to comment state
                    else if(token.tok == TokAttrib_ENDSPAN)
                    {
                        *plxs &= ~(inTag | inAttribute);
                        *plxs |= inBangTag | inComment;
                    }
                }
                else if (SetValueSeen(plxs))
                {
                    // REVIEW (walts)
                    // Handle the client side script language detection here for the
                    // following case (language attribute value is NOT wrapped by quotes.)
                    // <SCRIPT LANGUAGE=VBScript>
                    if (*plxs & inSCRIPT)
                    {
                        SetScriptLanguage(&pchLine[cbCur], plxs);
                    }

                    //
                    // REVIEW(cgomes): It seems that any non-white space character
                    //      is valid for non-quoted attribute values.
                    //      Problem is that GetTokenLength is used to determine
                    //      the token length, which works great for non-values
                    //      but not for values.
                    //      I use GetValueTokenLength here to get the length
                    //      of value token.  GetValueTokenLength will not
                    //      stop till it hits a white space character.
                    //

                    iTokenLength = GetValueTokenLength(pchLine, cbLen, cbCur);
                    token.ibTokMac = token.ibTokMin + iTokenLength;
                    token.tokClass = tokName;

                    // the assignment just above is immediately overridden
                    token.tokClass = tokValue;
                }
                else
                {
                    // first name in the tag: the element name
                    IsElementName(pchLine, cbCur, iTokenLength, token);
                    // look for attributes
                    *plxs |= inAttribute;
                    // set content state
                    if (*plxs & inTag)
                        *plxs |= TextStateFromElement(&pchLine[token.ibTokMin], iTokenLength);
                    else if ((*plxs & inEndTag) && (*plxs & TEXTMASK))
                        *plxs &= ~TextStateFromElement(&pchLine[token.ibTokMin], iTokenLength);
                    else if ((*plxs & inEndTag) && (*plxs & inSCRIPT))
                        *plxs &= ~(inSCRIPT | inScriptText | inServerASP/* | inVBScript | inJavaScript*/);
                }
            }
            else if (*plxs & inBangTag)
            {
                // FUTURE: other <!...> items like "HTML", "PUBLIC"? -- nice for DTDs
                //   Use a RW table for it if we do

                // recognize <!DOCTYPE ...>  as 'element'
                if ((iTokenLength == 7) &&
                    (0 == _tcsnicmp(&pchLine[cbCur], _T("doctype"), 7)))
                    token.tokClass = tokElem;
            }
            break;
        }

    case HST:  // string "..."
        *plxs |= inString;
        goto String;

    case HSL:  // string alternate '...'
        *plxs |= inStringA;
String:
        // step over the opening quote, then scan to the closing one
        cbCur++;
        token.ibTokMac = FindEndString(pchLine, cbLen, cbCur, plxs, token);
        SetValueSeen(plxs);
        // Handle the client side script language detection here for the
        // following case (language attribute value is wrapped by quotes.)
        // <SCRIPT LANGUAGE="VBScript">
        if((*plxs & inSCRIPT) && (*plxs & inAttribute))
        {
            SetScriptLanguage(&pchLine[cbCur], plxs);
        }
        break;

    case HWS: // tag whitespace
        do
        {
            cbCur++;
        } while (cbCur < cbLen && IsWhiteSpace(pchLine[cbCur]));
        token.tokClass = tokSpace;
        token.ibTokMac = cbCur;
        break;

    case OEQ:
        // GetHint has set token info
        if (*plxs & inAttribute)
        {
            // start looking for values
            *plxs &= ~inAttribute;
            *plxs |= inValue;
        }
        else
            goto BadChar;
        break;

    case HTA:
        // "<%" inside a tag starts a server-side script; any other '<' here
        // is treated as a bad character
        if (cbCur+1 < cbLen && '%' == pchLine[cbCur+1])
        {
            SetValueSeen(plxs);
            return FindTagOpen(pchLine, cbLen, cbCur, plxs, token);
        }
        // else fall through
    case ERR:
    case HEN:
BadChar:
        token.tokClass = tokSpace;

        // DS96# 10116 [CFlaat]: we can be in DBCS here, and so we need
        //     to make sure that our increment is double-byte aware
        cbCur += _tcsnbcnt(pchLine + cbCur, 1); // byte count for current char
        ASSERT(cbCur <= cbLen);
        token.ibTokMac = cbCur;
        break;

    // ported HTMED CHANGE (walts) - added this case to handle dbcs attribute values.
    case HDB:
        {
        // DBCS char.  Handle for attribute values within tag.
        if (!SetValueSeen(plxs))
            goto BadChar;

        int iTokenLength = GetValueTokenLength(pchLine, cbLen, cbCur);
        token.ibTokMac = token.ibTokMin + iTokenLength;
        token.tokClass = tokValue;
        }
        break;
    // ported HTMED CHANGE END

    default:
        // GetHint has set token info
        if (token.tokClass != tokComment && (*plxs & inValue))
            FindValue(pchLine, cbLen, cbCur, plxs, token);
        break;
    }
    // bError is reset wherever it is set in the enabled code above
    if (bError)
        IsUnknownID(pchLine, cbLen, cbCur, token);
    return token.ibTokMac;
}

////////////////////////////////////////////////////////////////////
// GetTextHint()
//      Hint lookup used while scanning plain text.
//
//      pchLine - line being scanned
//      cbCur   - offset of the character to classify
//      token   - receives tokClass/ibTokMac when the hint is ONL or EOS
//      (the cbLen and plxs parameters are unused)
//
//      Returns HDB for a character with any bit above 0x7F set, ERR for
//      a character whose hint is a single operator, otherwise the hint
//      table entry for the character.
//
HINT GetTextHint(LPCTSTR pchLine, UINT /*cbLen*/, UINT cbCur, DWORD * /*plxs*/, TXTB & token)
{
    // only 7-bit characters are looked up in the hint table
    if ((pchLine[cbCur] & ~0x7F) != 0)
        return HDB;

    const HINT hintEntry = g_hintTable[pchLine[cbCur]];

    // single operators are reported as ERR while scanning text
    if (IsSingleOp(hintEntry))
        return ERR;

    if (ONL == hintEntry || EOS == hintEntry)
    {
        // new line / end of stream: one-character operator token
        token.ibTokMac = cbCur + 1;
        token.tokClass = tokOp;
    }
    return hintEntry;
}

////////////////////////////////////////////////////////////////////
// GetHint()
//      Use hint table to guess what the next token is going to be.
//      If it is a single operator, new line or end of stream, the
//      token info is filled in as well.
//
//      pchLine - line being scanned
//      cbLen   - length of the line
//      cbCur   - offset of the character to classify (passed by value;
//                the adjustments below are local to this function)
//      plxs    - lexer state; inComment is set when "--" is found
//                while inBangTag is set
//      token   - receives tokClass/ibTokMac for the cases handled here
//
//      Returns HDB for a character with any bit above 0x7F set,
//      otherwise the hint table entry, except that a "--" comment
//      which IfHackComment reports as CT_METADATA returns HTA.
//
HINT GetHint(LPCTSTR pchLine, UINT cbLen, UINT cbCur, DWORD * plxs, TXTB & token)
{
    // only 7-bit characters may be used as hint table indexes;
    // anything else (dbcs) is reported as HDB
    if (pchLine[cbCur] & ~0x7F)
        return HDB;

    HINT hint = g_hintTable[pchLine[cbCur]];

    // check if it is a single op, new line or end of stream
    if (IsSingleOp(hint) || hint == ONL || hint == EOS)
    {
        token.tokClass = hint;
        token.ibTokMac = cbCur + 1;
    }
    else if (hint == ODA)
    {
        // Look for a second dash. The lookahead character gets the same
        // 7-bit test as the current one BEFORE it is used as a table
        // index: a DBCS/extended byte following the dash would otherwise
        // index g_hintTable with a negative or out-of-range value.
        if ((cbCur + 1 < cbLen) &&
            !(pchLine[cbCur + 1] & ~0x7F) &&
            (g_hintTable[pchLine[cbCur + 1]] == ODA) &&
            (*plxs & inBangTag))
        {
            // step over the "--" and enter comment state
            cbCur += 2;
            *plxs |= inComment;
            COMMENTTYPE ct = IfHackComment(pchLine, cbLen, cbCur, plxs, token);
            if (ct == 0)
            {
                // ordinary comment: the token ends right after the "--"
                token.tokClass = tokComment;
                token.ibTokMac = cbCur;
            }
            else if(ct == CT_METADATA)
                hint = HTA; // tag open
            // NOTE(review): for any other comment type neither hint nor
            // token is touched here - presumably IfHackComment has set
            // the token; confirm against IfHackComment.
        }
        else
        {
            // single -
            token.tokClass = tokOp;
            token.ibTokMac = cbCur + 1;
        }
    }
    return hint;
}

///////////////////////////////////////////////////////////////////
// GetTokenLength ()
//  Measure the identifier/keyword that starts at pchLine[cbCur].
//
//  The scan only starts when the first character is alphanumeric and
//  continues while the position is below cbLen and the character
//  satisfies IsIdChar.  The position advances by _tclen of each
//  character.
//
//  Returns the number of units scanned, but never less than 1.
//
UINT GetTokenLength(LPCTSTR pchLine, UINT cbLen, UINT cbCur)
{
    const UINT cbStart = cbCur;

    if (IsCharAlphaNumeric(pchLine[cbCur]))
    {
        while (cbCur < cbLen && IsIdChar(pchLine[cbCur]))
        {
            // step over one (possibly multi-unit) character
            const UINT cbChar = _tclen(&pchLine[cbCur]);
            cbCur += cbChar;
        }
    }

    // report at least one unit even if nothing was scanned
    const UINT cbToken = cbCur - cbStart;
    return (cbToken != 0) ? cbToken : 1;
}

/*

    UINT GetValueTokenLength

    Description:
        Gets the length of the value token that starts at pchLine[cbCur].
        The scan continues while the position is below cbLen and the
        character is not whitespace and satisfies IsValueChar.  The
        position advances by _tclen of each character.

    Parameters:
        pchLine - line being scanned
        cbLen   - length of the line
        cbCur   - offset of the first character of the token

    Returns:
        The number of units scanned, never less than 1.  When a scan
        took place the result never reaches past cbLen.

*/
UINT GetValueTokenLength(LPCTSTR pchLine, UINT cbLen, UINT cbCur)
{
    LPCTSTR pCurrent = &pchLine[cbCur];
    UINT cb;
    UINT cbOld = cbCur;

    while (cbCur < cbLen && !_istspace(*pCurrent) && IsValueChar(*pCurrent))
    {
        cb = _tclen(pCurrent);
        cbCur += cb;
        pCurrent += cb;
    }

    // _tclen can report a two-byte character for a lead byte that is the
    // last byte before cbLen; do not let the token run past the line.
    // This is only reachable after the loop ran, i.e. when cbOld < cbLen,
    // so the clamped length is still at least 1.
    if (cbCur > cbLen)
        cbCur = cbLen;

    // always report at least one unit, even if nothing was scanned
    return max((cbCur - cbOld), 1);
}


////////////////////////////////////////////////////////////////
// IsElementName ()
//  Ask the current table (g_pTable) whether the iTokenLength units
//  starting at pchLine[cbCur] name an element.
//
//  On a match the token is marked tokElem, ends at
//  cbCur + iTokenLength and carries the value returned by FindElement;
//  TRUE is returned.  Otherwise the token is left alone and FALSE is
//  returned.
//
BOOL IsElementName(LPCTSTR pchLine, UINT cbCur, int iTokenLength, TXTB & token)
{
    const int iElement = g_pTable->FindElement(&pchLine[cbCur], iTokenLength);

    if (NOT_FOUND == iElement)
        return FALSE;

    token.tok = iElement; // set token
    token.ibTokMac = cbCur + iTokenLength;
    token.tokClass = tokElem;
    return TRUE;
}

////////////////////////////////////////////////////////////////
// IndexFromElementName ()
//  Look up a null-terminated element name in the current table
//  (g_pTable) and return whatever FindElement returns for it.
//
int IndexFromElementName(LPCTSTR pszName)
{
    const int cchName = lstrlen(pszName);
    return g_pTable->FindElement(pszName, cchName);
}

////////////////////////////////////////////////////////////////
// IsAttributeName ()
//  Ask the current table (g_pTable) whether the iTokenLength units
//  starting at pchLine[cbCur] name an attribute.
//
//  On a match the token is marked tokAttr and TRUE is returned;
//  otherwise the token is left alone and FALSE is returned.
//
BOOL IsAttributeName(LPCTSTR pchLine, UINT cbCur, int iTokenLength, TXTB & token)
{
    const int iAttr = g_pTable->FindAttribute(&pchLine[cbCur], iTokenLength);

    if (NOT_FOUND == iAttr)
        return FALSE;

    token.tokClass = tokAttr;

    if (TokAttrib_ENDSPAN__ != iAttr)
    {
        // ordinary attribute: the token covers the whole name
        token.tok = iAttr; // set token
        token.ibTokMac = cbCur + iTokenLength;
    }
    else
    {
        // ENDSPAN__ exists because the lexer does not recognize
        // "endspan--" as 2 separate tokens.  Report it as
        // TokAttrib_ENDSPAN and end the token before the trailing "--".
        token.tok = TokAttrib_ENDSPAN;
        token.ibTokMac = cbCur + iTokenLength - 2;
    }
    return TRUE;
}

//////////////////////////////////////////////////////////////////////////
// IsIdentifier()
//  Treat a token of positive length as an identifier: mark it tokName
//  and end it iTokenLength past token.ibTokMin.
//
//  Returns TRUE when iTokenLength is positive, FALSE (token untouched)
//  otherwise.
//
BOOL IsIdentifier (int iTokenLength, TXTB & token)
{
    if (iTokenLength <= 0)
        return FALSE;

    token.ibTokMac = token.ibTokMin + iTokenLength;
    token.tokClass = tokName;
    return TRUE;
}

////////////////////////////////////////////////////////////////////
// IsUnknownID ()
//  Swallow an unrecognized run: the character at pchLine[cbCur] is
//  always consumed, followed by every character that satisfies
//  IsIdChar while the position stays below cbLen.
//
//  The resulting token is classified as tokSpace and ends at the
//  position reached.  Always returns TRUE.
//
BOOL IsUnknownID (LPCTSTR pchLine, UINT cbLen, UINT cbCur, TXTB & token)
{
    ASSERT(cbCur < cbLen);

    // the first character is taken unconditionally, the rest only
    // while they are id characters
    do
    {
        const UINT cbChar = _tclen(&pchLine[cbCur]);
        cbCur += cbChar;
    } while ((cbCur < cbLen) && IsIdChar(pchLine[cbCur]));

    token.ibTokMac = cbCur;
    token.tokClass = tokSpace;
    return TRUE;
}

/////////////////////////////////////////////////////////////////////////
// IsNumber()
//  Check whether a run of digits starts at pchLine[cbCur].
//
//  Returns FALSE (token untouched) when cbCur is not below cbLen or the
//  character there is not a digit.  Otherwise the token is marked
//  tokNum, ends after the last consecutive digit below cbLen, and TRUE
//  is returned.
//
BOOL IsNumber(LPCTSTR pchLine, UINT cbLen, UINT cbCur, TXTB & token)
{
    if (cbCur >= cbLen || !_istdigit(pchLine[cbCur]))
        return FALSE;

    token.tokClass = tokNum;

    do
    {
        // assume all digits are one byte
        ASSERT(1 == _tclen(&pchLine[cbCur]));
        ++cbCur;
    } while (cbCur < cbLen && _istdigit(pchLine[cbCur]));

    token.ibTokMac = cbCur;
    return TRUE;
}


/* end of file */