Leaked source code of windows server 2003
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

1393 lines
39 KiB

// Copyright (c)1997-1999 Microsoft Corporation, All Rights Reserved
/* copied from ..\htmed\lexer.cpp */
/*++
Copyright (c) 1995 Microsoft Corporation
File: lexer.cpp
Abstract:
Nitty Gritty Lexer stuff
Contents:
SetValueSeen()
IsSingleOp()
IsWhiteSpace()
MapToken()
FindEndTag()
MakeSublang()
SetLanguage()
FindTable()
FindTable()
RemoveTable()
MakeTableSet()
GetToken()
IfHackComment()
FindServerScript()
FindEndComment()
FindEndEntity()
FindEntityRef()
FindValue()
FindEndString()
FindTagOpen()
FindText()
FindNextToken()
GetTextHint()
GetHint()
GetTokenLength()
GetValueTokenLength()
IsElementName()
IsAttributeName()
IsIdentifier()
IsUnknownID()
IsNumber()
CColorHtml::SetTable()
CColorHtml::InitSublanguages()
History:
2/14/97 cgomes: Created
--*/
#include "stdafx.h"
#include "resource.h"
#include "guids.h"
#include "token.h"
#include "table.h"
#include "lexer.h"
// Forward declaration: FindClientScriptEnd() is defined further down in this
// file but is called from GetToken() before its definition.
UINT FindClientScriptEnd(LPCTSTR pchLine, UINT cbLen, UINT cbCur, DWORD * plxs, TXTB & token);
// Route ASSERT in this file through the CRT debug assertion macro.
#undef ASSERT
#define ASSERT(b) _ASSERTE(b)
// HACK: we keep a copy of a ptr to the ASP table and sublang
// so we can do special behavior for ASP files
CTableSet* g_ptabASP = 0;
PSUBLANG g_psublangASP = 0;
// Table sets by sublanguage slot. SetLanguage() stores the default
// language's table at index 0 and the others at increasing indices.
PTABLESET g_arpTables[CV_MAX+1];
// NOTE: added to handle value tokens properly.
// (Forward declaration; the definition is further down in this file.)
UINT GetValueTokenLength(LPCTSTR pchLine, UINT cbLen, UINT cbCur);
// Record that an attribute value has just been consumed: if the lexer state
// is "in value", flip it to "in attribute" so the next attribute can follow.
//
// plxs   [in/out] lexer state bits.
// Returns TRUE when the state was changed, FALSE when it was not in a value.
inline int SetValueSeen(DWORD *plxs)
{
    if (!(*plxs & inValue))
        return FALSE;

    *plxs = (*plxs & ~inValue) | inAttribute;
    return TRUE;
}
// REVIEW (walts) - need better way
//
// Inspect the text of a LANGUAGE attribute value and record the script
// language in the lexer state. Exactly one of inJavaScript / inVBScript /
// inServerASP ends up set when the value starts with a known language name;
// the state is left untouched for any other value.
//
// pchLine  points at the value text; a leading double quote is skipped.
// plxs     [in/out] lexer state bits.
inline void SetScriptLanguage(LPCTSTR pchLine, DWORD *plxs)
{
    LPCTSTR szJava = _T("javascript");
    LPCTSTR szVB = _T("vbscript");
    // "serverasp" is triedit's own dummy language. It is written when
    // server-side scripts are converted into client-side scripts, and the
    // resulting inServerASP bit is removed again in FindNextToken().
    LPCTSTR szAsp = _T("serverasp");

    // The value may be quoted, e.g. <SCRIPT LANGUAGE="VBScript">;
    // step over the opening quote if there is one.
    if (*pchLine == L'\"')
        ++pchLine;

    DWORD lxsLang;
    if (0 == _tcsnicmp(pchLine, szJava, lstrlen(szJava)))
        lxsLang = inJavaScript;
    else if (0 == _tcsnicmp(pchLine, szVB, lstrlen(szVB)))
        lxsLang = inVBScript;
    else if (0 == _tcsnicmp(pchLine, szAsp, lstrlen(szAsp)))
        lxsLang = inServerASP;
    else
        return;     // unknown language: keep the current state

    // The three language bits are mutually exclusive.
    *plxs &= ~(inJavaScript | inVBScript | inServerASP);
    *plxs |= lxsLang;
}
// TRUE when the hint lies in the single-character operator range
// [tokOP_SINGLE, tokOP_MAX).
inline BOOL IsSingleOp(HINT hint)
{
    if (hint < tokOP_SINGLE)
        return FALSE;
    return (hint < tokOP_MAX);
}
// Nonzero when c is a whitespace character according to _istspace().
inline BOOL IsWhiteSpace(TCHAR c)
{
    const BOOL fSpace = _istspace(c);
    return fSpace;
}
// NOTE: Added to handle value tokens properly
inline IsValueChar(TCHAR ch)
{
// REVIEW(cgomes): specify all the invalid value characters
return ch != _T('<') && ch != _T('>');
};
////////////////////////////////////////////////////////////////////////////
//
// map parsed token to returned token
// Pairs of (parsed class, returned class), terminated by a 0, 0 pair.
// The lookup scans the whole table, so the entries need not be sorted.
static TOKEN _rgTokenMap[] =
{
    tokName, tokSpace,
    tokNum, tokSpace,
    tokParEnt, tokSpace,
    tokResName, tokSpace,
    0, 0
};

// Translate the token class produced by the scanner into the class that is
// reported to the caller.
//
// tokClass  class assigned while scanning.
// lxs       lexer state after scanning the token.
// Returns   tokOp for any single-character operator, tokSSS for a tag token
//           seen while inside server-side script, the mapped class for
//           entries of _rgTokenMap, and tokClass unchanged otherwise.
static TOKEN MapToken(TOKEN tokClass, DWORD lxs)
{
    if (IsSingleOp((HINT)tokClass))
        return tokOp;
    if ((tokClass == tokTag) && (lxs & inHTXTag))
        return tokSSS;

    // The previous loop condition (_rgTokenMap[i] >= tokClass) stopped at the
    // first entry smaller than tokClass, so with the table in ascending order
    // only the first entry could ever match. Scan to the sentinel instead.
    for (int i = 0; _rgTokenMap[i] != 0; i += 2)
    {
        if (_rgTokenMap[i] == tokClass)
            return _rgTokenMap[i + 1];
    }
    return tokClass;
}
////////////////////////////////////////////////////////////////////////////
// Scan special text content (the state selected by TEXTMASK) for the end tag
// "</name>" of the element that opened it, or for the start of a "<%" block.
//
// pchLine  line being scanned.
// cbLen    length of the line.
// cbCur    offset at which to start scanning.
// plxs     [in/out] lexer state. TEXTMASK bits are cleared when the end tag
//          is found; inHTXTag is set when "<%" is found.
// token    [out] ibTokMac receives the returned offset.
// Returns  the offset of the end tag or of "<%", or cbLen if neither occurs.
//          Returns cbCur (a zero-length token) if the end tag cannot be
//          built; FindNextToken() treats that as "fall through to normal
//          processing".
UINT FindEndTag(LPCTSTR pchLine, UINT cbLen, UINT cbCur, DWORD *plxs, TXTB & token)
{
    ASSERT(pchLine);
    TCHAR szEnd[16];
    // Capacity in characters. sizeof(szEnd) alone is a byte count, which is
    // twice the capacity when TCHAR is a wide character.
    const UINT cchEndMax = sizeof(szEnd) / sizeof(szEnd[0]);

    ELLEX * pellex = pellexFromTextState(*plxs);
    ASSERT(0 != pellex); // shouldn't be called with something other than special text state
    if (0 == pellex)
    {
        token.ibTokMac = cbCur;
        return cbCur;
    }

    // "</" + name + ">"
    // NOTE(review): assumes pellex->cb is the character length of pellex->sz.
    UINT cbCmp = 3 + pellex->cb;
    ASSERT(cbCmp < cchEndMax);
    if (cbCmp >= cchEndMax)
    {
        // The end tag would overflow szEnd; refuse instead of corrupting the
        // stack in builds where ASSERT compiles to nothing.
        token.ibTokMac = cbCur;
        return cbCur;
    }

    _tcscpy(szEnd, _T("</"));
    _tcscat(szEnd, pellex->sz);
    _tcscat(szEnd, _T(">"));

    while (cbCur < cbLen)
    {
        if (_T('<') != pchLine[cbCur])
        {
            cbCur += _tclen(&pchLine[cbCur]);
            continue;
        }
        if ((cbLen - cbCur >= cbCmp) && (0 == _tcsnicmp(szEnd, &pchLine[cbCur], cbCmp)))
        {
            *plxs &= ~TEXTMASK; // special text modes are exclusive
            break;
        }
        if ((cbCur + 1 < cbLen) && (_T('%') == pchLine[cbCur+1]))
        {
            *plxs |= inHTXTag;
            break;
        }
        cbCur++;
    }
    token.ibTokMac = cbCur;
    return cbCur;
}
////////////////////////////////////////////////////////////////////////////
// Fill in a sublanguage descriptor.
//
// ps           descriptor to initialise (must not be NULL).
// id           sublanguage index; converted to the initial lexer state.
// strName      sublanguage name; a private copy is allocated with new[].
// nIdTemplate  stored in ps->nIdTemplate.
// clsid        stored in ps->clsidTemplate.
// Returns TRUE when the name was copied. Returns FALSE when the name is empty
// or the allocation yields NULL; ps->szSubLang is NULL in that case, while
// the other fields have still been set.
BOOL MakeSublang(PSUBLANG ps, UINT id, const TCHAR *strName, UINT nIdTemplate, CLSID clsid)
{
    ASSERT( NULL != ps );

    ps->szSubLang = NULL;
    ps->lxsInitial = LxsFromSubLangIndex(id);
    ps->nIdTemplate = nIdTemplate;
    ps->clsidTemplate = clsid;

    const int cchName = lstrlen(strName);
    if (0 == cchName)
        return FALSE;

    LPTSTR szCopy = new TCHAR [cchName + 1];
    if (NULL == szCopy)
        return FALSE;

    _tcscpy(szCopy, strName);
    ps->szSubLang = szCopy;
    return TRUE;
}
// Register a table set and its sublanguage descriptor.
//
// The language whose name equals strDefault goes into slot 0 and becomes
// g_pTable; every other language goes into slot 'index', which is then
// incremented. If the descriptor cannot be created, pTab is deleted.
// A NULL pTab is ignored.
void SetLanguage(TCHAR * strDefault, PSUBLANG rgSublang,
PTABLESET pTab, UINT & index, UINT nIdTemplate, CLSID clsid)
{
    if (NULL == pTab)
        return;

    int iSlot = index;
    if (0 == lstrcmp(strDefault, pTab->Name()))
        iSlot = 0;

    if (!MakeSublang(rgSublang + iSlot, iSlot, pTab->Name(), nIdTemplate, clsid))
    {
        delete pTab;
        return;
    }

    g_arpTables[iSlot] = pTab;
    if (0 != iSlot)
        index++;
    else
        g_pTable = pTab;
}
// Find a table set by name in a NULL-terminated array of table sets.
// Returns the first entry whose Name() compares equal to strName, or NULL
// when no entry matches.
// NOTE(review): the comparison uses operator== between Name() and a TCHAR*.
// Whether that compares string contents or pointer values depends on the
// return type of Name(), which is not visible in this file -- confirm.
CTableSet * FindTable(CTableSet ** rgpts, TCHAR *strName)
{
for (int n = 0; rgpts[n]; n++)
{
if (rgpts[n]->Name() == strName)
//if (strcmp(rgpts[n]->Name(), strName) == 0)
return rgpts[n];
}
return NULL;
}
// Look for a specific table set pointer in a NULL-terminated array.
// Returns pts when it is present in the array, NULL otherwise.
CTableSet * FindTable(CTableSet ** rgpts, CTableSet * pts)
{
    CTableSet ** ppCur = rgpts;
    while (*ppCur != NULL)
    {
        if (*ppCur == pts)
            return *ppCur;
        ++ppCur;
    }
    return NULL;
}
// Remove the first occurrence of pts from a NULL-terminated array by shifting
// the following entries (including the terminator) down one slot.
// Does nothing when pts is not in the array.
void RemoveTable(CTableSet ** rgpts, CTableSet *pts)
{
    int iFound = 0;
    while (rgpts[iFound] && rgpts[iFound] != pts)
        ++iFound;

    if (!rgpts[iFound])
        return;     // reached the terminator without a match

    int iDst = iFound;
    while (rgpts[iDst])
    {
        rgpts[iDst] = rgpts[iDst + 1];
        ++iDst;
    }
}
// Create a new static table set for the given attribute set and name
// resource id. The first parameter is unused.
CTableSet * MakeTableSet(CTableSet ** /*rgpts*/, RWATT_T att, UINT nIdName)
{
    CTableSet * ptsNew = new CStaticTableSet(att, nIdName);
    return ptsNew;
}
////////////////////////////////////////////////////////////////////////
// GetToken()
//
// Scan one token starting at cbCur and pick the scanner from the current
// lexer state: server-side script, client script body, comment, string, or
// the general tag/text scanner.
//
// pchLine  line being scanned.
// cbLen    length of the line.
// cbCur    offset of the first character of the token.
// plxs     [in/out] lexer state, updated by the scanner that runs.
// token    [out] tok is reset to 0, ibTokMin is set to cbCur; the scanner
//          fills in the rest and tokClass is passed through MapToken().
// Returns  the offset just past the token, or cbCur unchanged (token not
//          touched) when cbCur is greater than cbLen.
//
UINT GetToken(LPCTSTR pchLine, UINT cbLen, UINT cbCur, DWORD * plxs, TXTB & token)
{
    ASSERT (cbCur < cbLen);
    // NOTE(review): the assert requires cbCur < cbLen, but this guard only
    // rejects cbCur > cbLen, so cbCur == cbLen is still scanned -- confirm
    // whether that is intended.
    if (cbCur > cbLen)
        return cbCur;

    // init token and remember where it starts
    token.tok = 0;
    token.ibTokMin = cbCur;

    UINT cbNext = 0;

    if (*plxs & inHTXTag)
    {
        cbNext = FindServerScript(pchLine, cbLen, cbCur, plxs, token);
    }
    else if ((*plxs & inSCRIPT) && !(*plxs & inTag) && !(*plxs & inServerASP))
    {
        // Script bodies are skipped as a unit, except for triedit's own
        // scripts: when server-side scripts are wrapped in client-side
        // scripts the dummy language 'serverasp' is used and inServerASP
        // is set, and those are tokenized normally.
        cbNext = FindClientScriptEnd(pchLine, cbLen, cbCur, plxs, token);
    }
    else if (*plxs & inComment)
    {
        if (*plxs & inSCRIPT)
            *plxs |= inScriptText;

        const COMMENTTYPE ctKind = IfHackComment(pchLine, cbLen, cbCur, plxs, token);
        if (CT_METADATA == ctKind)
        {
            // Treat as an element
            cbNext = FindNextToken(pchLine, cbLen, cbCur, plxs, token);
            // Remove inBangTag
            *plxs &= ~inBangTag;
        }
        else if (CT_IECOMMENT == ctKind)
        {
            cbNext = token.ibTokMac;
        }
        else
        {
            cbNext = FindEndComment(pchLine, cbLen, cbCur, plxs, token);
        }
    }
    else if (*plxs & INSTRING)
    {
        cbNext = FindEndString(pchLine, cbLen, cbCur, plxs, token);
    }
    else
    {
        cbNext = FindNextToken(pchLine, cbLen, cbCur, plxs, token);
    }

    token.tokClass = MapToken(token.tokClass, *plxs);
    return cbNext;
}
///////////////////////////////////////////////////////////////////////////////////
// IfHackComment
//
// Probe ahead in the current line to see if we have what IE recognizes
// as the end of a comment ("->"). This does not conform to RFC 1866 or SGML,
// but supports browser behavior. This lets us tolerate comments of the
// form: "<!--- whatever ->"
// (note how it ends)
//
// Returns a COMMENTTYPE enum:
//   CT_NORMAL     normal comment (neither of the cases below was found)
//   CT_IECOMMENT  IE-style comment end "->" found; inComment and
//                 inScriptText are cleared and token.ibTokMac is set
//   CT_METADATA   the word METADATA found; the state is switched from
//                 comment/bang-tag to inTag
//
// token.tokClass is always set to tokComment.
// Proper comments are scanned using FindEndComment().
//
COMMENTTYPE IfHackComment(LPCTSTR pchLine, UINT cbLen, UINT cbCur, DWORD * plxs, TXTB & token)
{
    // Loop-invariant length of the keyword being probed for.
    const UINT cchMetadata = lstrlen(_T("METADATA"));

    token.tokClass = tokComment;
    while (cbCur+1 < cbLen)
    {
        // Only compare when the whole keyword fits inside the line, so the
        // comparison never reads or matches text past cbLen.
        if ((cbLen - cbCur >= cchMetadata) &&
            (_tcsnicmp(&pchLine[cbCur], _T("METADATA"), cchMetadata) == 0))
        {
            token.ibTokMac = cbCur + 1; // include second dash??
            *plxs &= ~inComment;
            // Remove inBangTag
            *plxs &= ~inBangTag;
            *plxs |= inTag;
            return CT_METADATA; // METADATA
        }
        else if (pchLine[cbCur] == '-' && pchLine[cbCur + 1] == '>')
        {
            token.ibTokMac = cbCur + 1;
            *plxs &= ~inComment;
            *plxs &= ~inScriptText;
            return CT_IECOMMENT;
        }
        else
        {
            cbCur += _tclen(&pchLine[cbCur]);
        }
    }
    return CT_NORMAL;
}
// Scan one token of server-side script.
//
// Three kinds of token are produced:
//   "<%"  open delimiter  (tokTag / TokTag_SSSOPEN), sets inHTXTag
//   "%>"  close delimiter (tokSSS / TokTag_SSSCLOSE), clears inHTXTag and
//         inNestedQuoteinSSS
//   otherwise the script text up to (not including) the next "%>" or the end
//   of the line, as tokSSS.
//
// While scanning script text, a double quote seen when the state is inTag,
// inHTXTag, inAttribute and inString all at once sets inNestedQuoteinSSS.
//
// Returns the offset just past the token; token.ibTokMac is set to the same.
UINT FindServerScript(LPCTSTR pchLine, UINT cbLen, UINT cbCur, DWORD * plxs, TXTB & token)
{
    LPCTSTR pch = &pchLine[cbCur];

    // parse HTX start tag
    if (pch[0] == _T('<') && (cbCur+1 < cbLen) && pch[1] == '%')
    {
        token.tokClass = tokTag;
        token.tok = TokTag_SSSOPEN;
        token.ibTokMac = cbCur + 2;
        *plxs |= inHTXTag;
        return token.ibTokMac;
    }

    ASSERT(*plxs & inHTXTag); // should be in HTXTag state here

    // parse HTX end tag
    if (pch[0] == _T('%') && (cbCur+1 < cbLen) && pch[1] == '>')
    {
        token.tok = TokTag_SSSCLOSE;
        token.tokClass = tokSSS; //tokTag;
        token.ibTokMac = cbCur + 2;
        *plxs &= ~(inHTXTag | inNestedQuoteinSSS);
        return token.ibTokMac;
    }

    // script text: run up to the closing delimiter or the end of the line
    token.tokClass = tokSSS;
    for (;;)
    {
        if (!(cbCur < cbLen))
            break;
        if (pch[0] == _T('%') && (cbCur+1 < cbLen) && (pch[1] == _T('>')))
            break;

        if (pch[0] == _T('"')
            && (*plxs & inTag)
            && (*plxs & inHTXTag)
            && (*plxs & inAttribute)
            && (*plxs & inString))
        {
            *plxs |= inNestedQuoteinSSS;
        }

        const int cbStep = _tclen(pch);
        cbCur += cbStep;
        pch += cbStep;
    }

    token.ibTokMac = cbCur;
    return cbCur;
}
///////////////////////////////////////////////////////////////////////////////////
// FindClientScriptEnd()
//
// HTMED CHANGE: Find the end of client script block
//
UINT FindClientScriptEnd(LPCTSTR pchLine, UINT cbLen, UINT cbCur, DWORD * plxs, TXTB & token)
{
LPCTSTR pCurrent = &pchLine[cbCur];
int cb;
TCHAR rgEndScript[] = _T("</SCRIPT");
int cchEndScript = (wcslen(rgEndScript) - 1);
if( cbCur + cchEndScript < cbLen &&
0 == _tcsnicmp(pCurrent, rgEndScript, cchEndScript))
{
token.tokClass = tokTag;
token.tok = TokTag_END;
*plxs &= ~inSCRIPT;
*plxs |= inEndTag;
token.ibTokMac = cbCur + 2;
return token.ibTokMac;
}
token.tokClass = tokSpace;
while (cbCur < cbLen)
{
if (*pCurrent == _T('<') && (cbCur+1 < cbLen) && (*(pCurrent+1) == _T('/')))
{
// Check if found end </SCRIPT
if( cbCur + cchEndScript < cbLen &&
0 == _tcsnicmp(pCurrent, rgEndScript, cchEndScript))
{
// Check if found end </SCRIPT
break;
}
}
cb = _tclen(pCurrent);
cbCur += cb;
pCurrent += cb;
}
token.ibTokMac = cbCur;
return cbCur;
}
///////////////////////////////////////////////////////////////////////////////////
// FindEndComment()
//
// Find the end of comment ("--").
//
// Scans from cbCur for two consecutive dashes. When they are found the token
// ends just past the second dash and inComment is cleared; otherwise the
// token runs to the end of the line and the comment state is kept.
// token.tokClass is set to tokComment. Returns the offset past the token.
//
UINT FindEndComment(LPCTSTR pchLine, UINT cbLen, UINT cbCur, DWORD * plxs, TXTB & token)
{
    ASSERT(*plxs & inComment); // must be in a comment now

    LPCTSTR pch = &pchLine[cbCur];
    BOOL fFoundEnd = FALSE;
    int cbStep;

    token.tokClass = tokComment;

    while (cbCur < cbLen)
    {
        if (*pch != _T('-'))
        {
            // ordinary character: step over it
            cbStep = _tclen(pch);
            cbCur += cbStep;
            pch += cbStep;
            continue;
        }

        // first dash: consume it and look at what follows
        ++pch;
        ++cbCur;
        if ((cbCur < cbLen) && (*pch == _T('-')))
        {
            fFoundEnd = TRUE;
            break;
        }
    }

    if (fFoundEnd)
    {
        // consume the second dash (cbCur < cbLen is known to hold here)
        cbStep = _tclen(pch);
        cbCur += cbStep;
        pch += cbStep;

        // reset state since we reached the end of the comment
        *plxs &= ~inComment;
    }

    token.ibTokMac = cbCur;
    return cbCur;
}
/////////////////////////////////////////////////////////////
// FindEndEntity()
//
// Find the end of the special character sequence: the identifier measured by
// GetTokenLength(), plus a trailing ';' when one follows inside the line.
//
// token.tokClass is set to tokEntity and token.ibTokMac to the returned
// offset, which is the position just past the entity.
//
UINT FindEndEntity(LPCTSTR pchLine, UINT cbLen, UINT cbCur, DWORD * /*plxs*/, TXTB & token)
{
    token.tokClass = tokEntity;

    UINT cb = GetTokenLength(pchLine, cbLen, cbCur);

    // Only look at the following character if it is still part of the line;
    // an entity name that ends exactly at cbLen has no delimiter to test.
    if ((cbCur + cb < cbLen) && (pchLine[cbCur + cb] == ';'))
        cb++;

    token.ibTokMac = cbCur + cb;
    return token.ibTokMac;
}
/////////////////////////////////////////////////////////////
// Find an entity reference or non-entity ref, literal "&..."
//
// On entry pchLine[cbCur] must be '&'. Three outcomes:
//   numeric reference "&#N" (1-3 digits, value <= 255): tokEntity, covering
//     the digits, any directly following alphanumerics and one delimiter
//     character;
//   named reference known to g_pTable->FindEntity(): tokEntity, covering the
//     name and a trailing ';' or whitespace character if present;
//   anything else: tokIDENTIFIER (plain text) ending at the offset reached
//     when the reference was rejected.
//
// Returns the offset just past the token; token.ibTokMac is set to the same.
//
UINT FindEntityRef(LPCTSTR pchLine, UINT cbLen, UINT cbCur, DWORD * /*plxs*/, TXTB & token)
{
    ASSERT(cbCur < cbLen);
    ASSERT(pchLine[cbCur] == '&'); // must be on ERO
    cbCur++;
    if (cbCur == cbLen)
    {
NotEntity:
        token.tokClass = tokIDENTIFIER; // plain text
        token.ibTokMac = cbCur;
        return cbCur;
    }
    if (pchLine[cbCur] == '#')
    {
        // parse and check valid number (IsNumber sets token.ibTokMac to the
        // offset just past the digits)
        if (!IsNumber(pchLine, cbLen, cbCur + 1, token))
            goto NotEntity;

        // must be <= 3 digits
        const UINT cchDigits = token.ibTokMac - (cbCur + 1);
        if (cchDigits > 3)
            goto NotEntity;

        // validate range
        // The buffer is zero-filled and only the digits are copied, so the
        // string handed to _tcstoul is always NUL-terminated. Copying a fixed
        // 3 characters left the terminator missing for 3-digit numbers.
        TCHAR szNum[4] = { 0 };
        _tcsncpy(szNum, &pchLine[cbCur + 1], cchDigits);
        if (_tcstoul(szNum, 0, 10) > 255)
            goto NotEntity;

        // we now have a valid numeric entity ref
        token.tokClass = tokEntity;
        cbCur = token.ibTokMac;

        // scan for end of entity ref
        // scan rest of alphanumeric token
        // REVIEW: Is this correct? IE 4.40.308 behaves this way
        while ((cbCur < cbLen) && IsCharAlphaNumeric(pchLine[cbCur]))
            cbCur++;

        // scan delimiter
        if (cbCur < cbLen)
            cbCur++;

        token.ibTokMac = cbCur;
        return cbCur;
    }
    else if (!IsCharAlpha(pchLine[cbCur]))
    {
        goto NotEntity;
    }
    else
    {
        // parse and check entity name
        UINT nLen = GetTokenLength(pchLine, cbLen, cbCur);
        if (!g_pTable->FindEntity(&pchLine[cbCur], nLen))
            goto NotEntity;
        cbCur += nLen;

        // eat delimiter if necessary
        if ((cbCur < cbLen) &&
            (pchLine[cbCur] == ';' || IsWhiteSpace(pchLine[cbCur])))
            cbCur++;

        token.tokClass = tokEntity;
        token.ibTokMac = cbCur;
        return cbCur;
    }
}
/////////////////////////////////////////////////////////////
// FindValue
// Find the end of an unquoted value.
//
// The character at cbCur always belongs to the value; scanning then
// continues until whitespace, '>' (end of tag) or the end of the line.
// The token is classified tokValue and the state switches from inValue to
// inAttribute. Returns the offset just past the value.
//
UINT FindValue(LPCTSTR pchLine, UINT cbLen, UINT cbCur, DWORD * plxs, TXTB & token)
{
    ASSERT(cbCur < cbLen);

    ++cbCur;    // first character is consumed unconditionally
    while (cbCur < cbLen)
    {
        const TCHAR ch = pchLine[cbCur];
        if (IsWhiteSpace(ch) || ch == '>')
            break;
        ++cbCur;
    }

    token.tokClass = tokValue;
    token.ibTokMac = cbCur;

    // switch from value to attribute
    *plxs = (*plxs & ~inValue) | inAttribute;
    return cbCur;
}
/////////////////////////////////////////////////////////////
// FindEndString()
// Find the end of the string.
// Should only be called when we are in the string mode already.
//
// The closing delimiter is a single quote when inStringA is set, otherwise a
// double quote. The token (tokString) ends:
//   - just past the closing delimiter: INSTRING is cleared and
//     SetValueSeen() is applied;
//   - in front of a "<%" sequence: inHTXTag is set, string state is kept;
//   - at the end of the line: string state is kept.
// Returns the offset past the token.
//
UINT FindEndString (LPCTSTR pchLine, UINT cbLen, UINT cbCur, DWORD * plxs, TXTB & token)
{
    ASSERT (*plxs & INSTRING); // must be in a string now

    token.tokClass = tokString;

    const TCHAR chQuote = (*plxs & inStringA) ? _T('\'') : _T('"');
    LPCTSTR pch = &pchLine[cbCur];
    int cbStep;

    while (cbCur < cbLen)
    {
        if (*pch == chQuote)
        {
            // closing quote: leave string state, consume the quote and stop
            *plxs &= ~INSTRING;
            SetValueSeen(plxs);
            cbStep = _tclen(pch);
            cbCur += cbStep;
            pch += cbStep;
            break;
        }

        if (*pch == _T('<') && cbCur+1 < cbLen && pch[1] == _T('%'))
        {
            // server-side script inside the string: stop in front of it
            *plxs |= inHTXTag;
            break;
        }

        cbStep = _tclen(pch);
        cbCur += cbStep;
        pch += cbStep;
    }

    token.ibTokMac = cbCur;
    return cbCur;
}
//////////////////////////////////////////////////////////////////
// FindTagOpen()
// Scan a tag opener. pchLine[cbCur] must be '<'.
//
// The character after '<' selects the kind of tag:
//   '!'     "<!"  inBangTag, TokTag_BANG
//   '/'     "</"  inEndTag,  TokTag_END
//   '?'     "<?"  inPITag,   TokTag_PI
//   '%'     "<%"  inHTXTag,  TokTag_SSSOPEN
//   letter  "<"   inTag,     TokTag_START (the letter is not consumed)
//   other   "<"   not a tag: token class becomes tokIDENTIFIER
// A '<' that is the last character of the line sets inTag.
// inScriptText is always cleared. Returns the offset just past the opener.
//
UINT FindTagOpen(LPCTSTR pchLine, UINT cbLen, UINT cbCur, DWORD * plxs, TXTB & token)
{
    ASSERT(pchLine[cbCur] == '<');

    token.tokClass = tokTag;
    *plxs &= ~inScriptText; // turn off script coloring when inside tags
    cbCur++;

    if (cbCur == cbLen)
    {
        *plxs |= inTag;
        token.ibTokMac = cbCur;
        return cbCur;
    }

#ifdef NEEDED // copied from htmed\lexer.cpp
    //
    // HTMED CHANGE:
    // REVIEW(cgomes): Figure out if I should turn off inSCRIPT in any of the
    // following cases. Right now I only do it for the </ case.
    //
#endif //NEEDED

    const TCHAR chNext = pchLine[cbCur];
    if (chNext == '!')
    {
        // MDO - Markup Declaration Open
        cbCur++;
        *plxs |= inBangTag;
        token.tok = TokTag_BANG;
    }
    else if (chNext == '/')
    {
        // End tag
        cbCur++;
        *plxs |= inEndTag;
        token.tok = TokTag_END;
#ifdef NEEDED // copied from htmed\lexer.cpp
        // HTMED CHANGE:
        // REVIEW(cgomes): Colorizer bug: it never removes the inSCRIPT state
        // This removes the inSCRIPT in the case <SCRIPT <BODY>
        // in this case <BODY is in error.
        //
        *plxs &= ~inSCRIPT;
#endif //NEEDED
    }
    else if (chNext == '?')
    {
        // PI - Processing Instruction
        // REVIEW: PI is SGML -- not in HTML, but might be added
        cbCur++;
        *plxs |= inPITag;
        token.tok = TokTag_PI;
    }
    else if (chNext == '%')
    {
        // HTX -- ODBC server HTML extension
        cbCur++;
        *plxs |= inHTXTag;
        token.tok = TokTag_SSSOPEN;
    }
    else if (IsCharAlpha(chNext))
    {
        // Tag
        *plxs |= inTag;
        token.tok = TokTag_START;
    }
    else
    {
        token.tokClass = tokIDENTIFIER; // NOT a TAG
    }

    token.ibTokMac = cbCur;
    return cbCur;
}
//////////////////////////////////////////////////////////////////
// FindText
// Scan a token of text
// NOTE DO NOT MODIFY this function, mainly b/c the side effects
// will be hard to find, and will break the way
// that everything works.
//
// The character at cbCur is always consumed. Scanning then continues until
// a NUL, newline, '<' or '&' is met, or the end of the line is reached.
// token.tokClass is set to tokIDENTIFIER and token.ibTokMac to the returned
// offset, which is the position of the terminating character (or cbLen).
//
// The commented-out lines below are the remains of extra-space tracking
// that is not active.
//
UINT FindText(LPCTSTR pchLine, UINT cbLen, UINT cbCur, TXTB & token)
{
//BOOL fExtraSpace = FALSE;
//int cSpace = 0;
ASSERT (cbCur < cbLen);
token.tokClass = tokIDENTIFIER;
//if (pchLine[cbCur] == ' ' && !fExtraSpace)
// fExtraSpace = TRUE;
// first character is part of the token no matter what it is
cbCur += _tclen(&pchLine[cbCur]);
while (cbCur < cbLen)
{
switch (pchLine[cbCur])
{
// characters that end a run of text
case _T('\0'):
case _T('\n'):
case _T('<'):
case _T('&'):
//if (cSpace > 0) // found extra spaces so remember them somewhere
goto ret;
break;
//case _T(' '):
// if (!fExtraSpace)
// fExtraSpace = TRUE;
// else
// cSpace++;
// break;
default:
//if (cSpace > 0) // found extra spaces so remember them somewhere
//cSpace = 0;
//fExtraSpace = FALSE;
break;
}
cbCur += _tclen(&pchLine[cbCur]);
}
ret:
token.ibTokMac = cbCur;
return cbCur;
}
//////////////////////////////////////////////////////////////////
// FindNextToken()
// Find the next token in the line
//
// Two modes, selected by the INTAG bits of the lexer state:
//
//  * Outside a tag (scanning text): if a special text state (TEXTMASK) is
//    active, FindEndTag() is tried first. Otherwise GetTextHint() classifies
//    the current character and the token is a tag opener, an entity
//    reference, a newline/end-of-stream operator, or a run of plain text.
//
//  * Inside a tag: GetHint() classifies the current character and the switch
//    below handles tag close, names (element / attribute / unquoted value),
//    quoted strings, whitespace, '=' and stray characters.
//
// pchLine  line being scanned.
// cbLen    length of the line.
// cbCur    offset of the first character of the token.
// plxs     [in/out] lexer state.
// token    [in/out] ibTokMin is expected to be set by the caller (GetToken
//          does so); tokClass, tok and ibTokMac are filled in here.
// Returns  the offset just past the token.
//
UINT FindNextToken(LPCTSTR pchLine, UINT cbLen, UINT cbCur, DWORD * plxs, TXTB & token)
{
ASSERT (cbCur < cbLen);
HINT hint;
if (!(*plxs & INTAG)) // scanning text
{
if (*plxs & TEXTMASK)
{
if (*plxs & inCOMMENT)
token.tokClass = tokComment;
else
token.tokClass = tokIDENTIFIER;
// probe for end tag </comment>
UINT cbEnd = FindEndTag(pchLine, cbLen, cbCur, plxs, token);
if (cbEnd > cbCur) // parsed a nonzero-length token
{
return cbEnd;
}
//else fall through to normal processing
}
hint = GetTextHint(pchLine, cbLen, cbCur, plxs, token);
switch (hint)
{
case HTA:
// begin a tag
return FindTagOpen(pchLine, cbLen, cbCur, plxs, token);
case HEN:
// scan an entity reference
token.ibTokMac = FindEntityRef(pchLine, cbLen, cbCur, plxs, token);
return token.ibTokMac;
case EOS:
case ONL:
// GetTextHint has already set token.tokClass and token.ibTokMac
return token.ibTokMac;
case ERR:
default:
// scan text as a single token
// If the editor uses token info for more than coloring
// (e.g. extended selections), then this will need to
// return smaller chunks.
if (*plxs & inSCRIPT)
*plxs |= inScriptText;
return FindText(pchLine, cbLen, cbCur, token);
break;
}
return cbCur;
}
ASSERT(*plxs & INTAG); // must be in a tag here
BOOL bError = FALSE;
hint = GetHint(pchLine, cbLen, cbCur, plxs, token);
switch (hint)
{
case HTE:
// Tag end: remove all tag state bits
*plxs &= ~TAGMASK;
cbCur++;
token.tokClass = tokTag;
token.tok = TokTag_CLOSE;
token.ibTokMac = cbCur;
break;
case HNU:
#if 0 // lexing HTML instance, not a DTD!
if (!IsNumber(pchLine, cbLen, cbCur, token))
bError = TRUE;
if (SetValueSeen(plxs))
token.tokClass = tokValue;
break;
#else
// fall through
#endif
case HRN: // reserved name start: #
#if 1 // lexing HTML instance, not a DTD!
// simple nonwhitespace stream
// Scanned as a value either way; when no value was expected the token
// is reclassified as tokSpace below.
if (!(*plxs & inValue))
bError = TRUE;
FindValue(pchLine, cbLen, cbCur, plxs, token);
if (bError)
{
token.tokClass = tokSpace;
bError = FALSE; //"corrected" the error
}
#else
cbCur++;
if (cbCur == cbLen)
token.tokClass = tokOp;
else
{
if (IsIdChar(pchLine[cbCur]))
{
cbCur++;
while (cbCur < cbLen && IsIdChar(pchLine[cbCur]))
cbCur++;
token.tokClass = tokResName;
}
else
token.tokClass = tokOp;
}
token.ibTokMac = cbCur;
if (SetValueSeen(plxs))
token.tokClass = tokValue;
#endif
break;
case HEP: // parameter entity: %
#if 1 // lexing HTML instance, not a DTD!
goto BadChar;
#else
cbCur++;
if (cbCur == cbLen)
{
token.tokClass = tokOp;
token.ibTokMac = cbCur;
}
else
{
if (IsIdChar(pchLine[cbCur]))
{
token.ibTokMac = FindEndEntity(pchLine, cbLen, cbCur, plxs, token);
token.tokClass = tokParEnt;
}
else
{
token.ibTokMac = cbCur;
token.tokClass = tokOp;
}
}
if (SetValueSeen(plxs))
token.tokClass = tokValue;
#endif
break;
// ported HTMED change (walts) -- handle some chars as valid start char for attribute values.
case HAV:
{
// Note: SetValueSeen() has the side effect of switching inValue to
// inAttribute when it returns TRUE.
if (!(*plxs & inTag) || !SetValueSeen(plxs))
goto BadChar; // not in tag or attribute value.
int iTokenLength = GetValueTokenLength(pchLine, cbLen, cbCur);
token.ibTokMac = token.ibTokMin + iTokenLength;
token.tokClass = tokValue;
break;
}
// ported HTMED change (walts) -- handle some chars as valid start char for attribute values.
case HKW: // identifier
{
int iTokenLength = GetTokenLength(pchLine, cbLen, cbCur);
token.ibTokMac = token.ibTokMin + iTokenLength;
token.tokClass = tokName;
//FUTURE: Don't scan attributes in an end tag
if (*plxs & (inTag|inEndTag))
{
if (*plxs & inAttribute)
{
// expecting an attribute name
IsAttributeName(pchLine, cbCur, iTokenLength, token);
// don't change attribute/value state here
// we only look for values after we've seen "=" in case OEQ below
// REVIEW(cgomes): what if more attributes follow
// the SPAN??
// if found STARTSPAN then pretend I am not in a tag
if(token.tok == TokAttrib_STARTSPAN)
*plxs &= ~(inTag | inAttribute);
// if found ENDSPAN then goback to comment state
else if(token.tok == TokAttrib_ENDSPAN)
{
*plxs &= ~(inTag | inAttribute);
*plxs |= inBangTag | inComment;
}
}
else if (SetValueSeen(plxs))
{
// expecting an (unquoted) attribute value
// REVIEW (walts)
// Handle the client side script language detection here for the
// following case (language attribute value is NOT wrapped by quotes.)
// <SCRIPT LANGUAGE=VBScript>
if (*plxs & inSCRIPT)
{
SetScriptLanguage(&pchLine[cbCur], plxs);
}
//
// REVIEW(cgomes): It seems that any non-white space character
// is valid for non-quoted attribute values.
// Problem is that GetTokenLength is used to determine
// the token length, which works great for non-values,
// but not for values.
// I use GetValueTokenLength here to get the length
// of value token. GetValueTokenLength will not
// stop till it hits a white space character.
//
iTokenLength = GetValueTokenLength(pchLine, cbLen, cbCur);
token.ibTokMac = token.ibTokMin + iTokenLength;
// (the first assignment is immediately overwritten by the second)
token.tokClass = tokName;
token.tokClass = tokValue;
}
else
{
// first name in the tag: the element name
IsElementName(pchLine, cbCur, iTokenLength, token);
// look for attributes
*plxs |= inAttribute;
// set content state
if (*plxs & inTag)
*plxs |= TextStateFromElement(&pchLine[token.ibTokMin], iTokenLength);
else if ((*plxs & inEndTag) && (*plxs & TEXTMASK))
*plxs &= ~TextStateFromElement(&pchLine[token.ibTokMin], iTokenLength);
else if ((*plxs & inEndTag) && (*plxs & inSCRIPT))
*plxs &= ~(inSCRIPT | inScriptText | inServerASP/* | inVBScript | inJavaScript*/);
}
}
else if (*plxs & inBangTag)
{
// FUTURE: other <!...> items like "HTML", "PUBLIC"? -- nice for DTDs
// Use a RW table for it if we do
// recognize <!DOCTYPE ...> as 'element'
if ((iTokenLength == 7) &&
(0 == _tcsnicmp(&pchLine[cbCur], _T("doctype"), 7)))
token.tokClass = tokElem;
}
break;
}
case HST: // string "..."
*plxs |= inString;
goto String;
case HSL: // string alternate '...'
*plxs |= inStringA;
String:
// step over the opening quote; cbCur now points at the string contents
cbCur++;
token.ibTokMac = FindEndString(pchLine, cbLen, cbCur, plxs, token);
SetValueSeen(plxs);
// Handle the client side script language detection here for the
// following case (language attribute value is wrapped by quotes.)
// <SCRIPT LANGUAGE="VBScript">
if((*plxs & inSCRIPT) && (*plxs & inAttribute))
{
SetScriptLanguage(&pchLine[cbCur], plxs);
}
break;
case HWS: // tag whitespace
do
{
cbCur++;
} while (cbCur < cbLen && IsWhiteSpace(pchLine[cbCur]));
token.tokClass = tokSpace;
token.ibTokMac = cbCur;
break;
case OEQ:
// GetHint has set token info
if (*plxs & inAttribute)
{
// start looking for values
*plxs &= ~inAttribute;
*plxs |= inValue;
}
else
goto BadChar;
break;
case HTA:
// '<' inside a tag: only "<%" (server-side script) is accepted here
if (cbCur+1 < cbLen && '%' == pchLine[cbCur+1])
{
SetValueSeen(plxs);
return FindTagOpen(pchLine, cbLen, cbCur, plxs, token);
}
// else fall through
case ERR:
case HEN:
BadChar:
// unexpected character: emit it as a one-character tokSpace token
token.tokClass = tokSpace;
// DS96# 10116 [CFlaat]: we can be in DBCS here, and so we need
// to make sure that our increment is double-byte aware
cbCur += _tcsnbcnt(pchLine + cbCur, 1); // byte count for current char
ASSERT(cbCur <= cbLen);
token.ibTokMac = cbCur;
break;
// ported HTMED CHANGE (walts) - added this case to handle dbcs attribute values.
case HDB:
{
// DBCS char. Handle for attribute values within tag.
if (!SetValueSeen(plxs))
goto BadChar;
int iTokenLength = GetValueTokenLength(pchLine, cbLen, cbCur);
token.ibTokMac = token.ibTokMin + iTokenLength;
token.tokClass = tokValue;
}
break;
// ported HTMED CHANGE END
default:
// GetHint has set token info
if (token.tokClass != tokComment && (*plxs & inValue))
FindValue(pchLine, cbLen, cbCur, plxs, token);
break;
}
// bError can only still be TRUE via the disabled (#if 0) HNU code above.
if (bError)
IsUnknownID(pchLine, cbLen, cbCur, token);
return token.ibTokMac;
}
////////////////////////////////////////////////////////////////////
// GetTextHint()
// Like GetHint when scanning text -- look only for tags and entities
//
// Returns HDB for any character with bits set outside the low 7 bits,
// otherwise the g_hintTable entry for the character, with every
// single-character operator hint replaced by ERR. For a newline or
// end-of-stream hint the token is filled in as a one-character tokOp.
//
HINT GetTextHint(LPCTSTR pchLine, UINT /*cbLen*/, UINT cbCur, DWORD * /*plxs*/, TXTB & token)
{
    const TCHAR ch = pchLine[cbCur];

    // characters outside 7-bit ASCII (e.g. DBCS) are not in the hint table
    if (ch & ~0x7F)
        return HDB;

    HINT hint = g_hintTable[ch];

    if (IsSingleOp(hint))
    {
        // operators mean nothing in text
        hint = ERR;
        return hint;
    }

    if (hint == ONL || hint == EOS)
    {
        token.tokClass = tokOp;
        token.ibTokMac = cbCur + 1;
    }
    return hint;
}
////////////////////////////////////////////////////////////////////
// GetHint()
// Use hint table to guess what the next token going to be
// If it is a single operator, it will fill in the token info
// as well
//
// Returns HDB for any character with bits set outside the low 7 bits,
// otherwise the g_hintTable entry for the character, except that a "--"
// inside a <! tag that introduces a METADATA comment returns HTA.
//
// Side effects:
//   - single operator, newline, end of stream: token.tokClass = hint and the
//     token is one character long;
//   - "--" while inBangTag: inComment is set and IfHackComment() probes the
//     rest of the line; for a normal comment the token is the "--" itself
//     (tokComment);
//   - a lone '-': one-character tokOp token.
//
HINT GetHint(LPCTSTR pchLine, UINT cbLen, UINT cbCur, DWORD * plxs, TXTB & token)
{
    // if the character is bigger than 128 (dbcs) then return HDB
    if (pchLine[cbCur] & ~0x7F)
        return HDB;

    HINT hint = g_hintTable[pchLine[cbCur]];

    // check if it is a single op, new line or end of stream
    if (IsSingleOp(hint) || hint == ONL || hint == EOS)
    {
        token.tokClass = hint;
        token.ibTokMac = cbCur + 1;
    }
    else if (hint == ODA)
    {
        // Is the next character a second dash? Apply the same 7-bit check as
        // for the first character before indexing g_hintTable with it.
        const BOOL fSecondDash =
            (cbCur + 1 < cbLen) &&
            !(pchLine[cbCur + 1] & ~0x7F) &&
            (g_hintTable[pchLine[cbCur + 1]] == ODA);

        if (fSecondDash && (*plxs & inBangTag))
        {
            cbCur += 2;
            *plxs |= inComment;
            COMMENTTYPE ct = IfHackComment(pchLine, cbLen, cbCur, plxs, token);
            if (ct == CT_NORMAL)
            {
                token.tokClass = tokComment;
                token.ibTokMac = cbCur;
            }
            else if (ct == CT_METADATA)
                hint = HTA; // tag open
        }
        else
        {
            // single -
            token.tokClass = tokOp;
            token.ibTokMac = cbCur + 1;
        }
    }
    return hint;
}
///////////////////////////////////////////////////////////////////
// GetTokenLength ()
// return the length of a token identifier/keyword
//
// The token must start with an alphanumeric character and continues over
// identifier characters (IsIdChar) up to the end of the line. The result is
// never less than 1, even when nothing was scanned.
//
UINT GetTokenLength(LPCTSTR pchLine, UINT cbLen, UINT cbCur)
{
    const UINT cbStart = cbCur;
    LPCTSTR pch = &pchLine[cbCur];

    if (!IsCharAlphaNumeric(*pch))
        return 1;

    while (cbCur < cbLen && IsIdChar(*pch))
    {
        const UINT cbStep = _tclen(pch);
        cbCur += cbStep;
        pch += cbStep;
    }

    const UINT cbScanned = cbCur - cbStart;
    return (cbScanned > 0) ? cbScanned : 1;
}
/*
UINT GetValueTokenLength
Description:
Gets the length of a value token starting at cbCur.
Unlike GetTokenLength, any character is accepted except whitespace
and the characters rejected by IsValueChar.
The result is never less than 1, even when nothing was scanned.
*/
UINT GetValueTokenLength(LPCTSTR pchLine, UINT cbLen, UINT cbCur)
{
    const UINT cbStart = cbCur;
    LPCTSTR pch = &pchLine[cbCur];

    for (;;)
    {
        if (!(cbCur < cbLen))
            break;
        if (_istspace(*pch) || !IsValueChar(*pch))
            break;

        const UINT cbStep = _tclen(pch);
        cbCur += cbStep;
        pch += cbStep;
    }

    const UINT cbScanned = cbCur - cbStart;
    return (cbScanned > 0) ? cbScanned : 1;
}
////////////////////////////////////////////////////////////////
// IsElementName ()
// Look the text up in the current table's element list.
//
// When found, the token becomes tokElem, ends at cbCur + iTokenLength and
// token.tok receives the element index. The token is left untouched when the
// name is not found. Returns TRUE when found, FALSE otherwise.
//
BOOL IsElementName(LPCTSTR pchLine, UINT cbCur, int iTokenLength, TXTB & token)
{
    const int iElem = g_pTable->FindElement(&pchLine[cbCur], iTokenLength);
    if (NOT_FOUND == iElem)
        return FALSE;

    token.tokClass = tokElem;
    token.ibTokMac = cbCur + iTokenLength;
    token.tok = iElem; // set token
    return TRUE;
}
// Return the result of looking up a NUL-terminated element name in the
// current table (g_pTable->FindElement).
int IndexFromElementName(LPCTSTR pszName)
{
    const int cchName = lstrlen(pszName);
    return g_pTable->FindElement(pszName, cchName);
}
////////////////////////////////////////////////////////////////
// IsAttributeName ()
// Look the text up in the current table's attribute list.
//
// When found, the token becomes tokAttr and token.tok receives the attribute
// index. The special entry TokAttrib_ENDSPAN__ ("endspan--") is reported as
// TokAttrib_ENDSPAN with the trailing "--" excluded from the token.
// The token is left untouched when the name is not found.
// Returns TRUE when found, FALSE otherwise.
//
BOOL IsAttributeName(LPCTSTR pchLine, UINT cbCur, int iTokenLength, TXTB & token)
{
    const int iAttr = g_pTable->FindAttribute(&pchLine[cbCur], iTokenLength);
    if (NOT_FOUND == iAttr)
        return FALSE;

    token.tokClass = tokAttr;

    if (iAttr != TokAttrib_ENDSPAN__)
    {
        token.ibTokMac = cbCur + iTokenLength;
        token.tok = iAttr; // set token
        return TRUE;
    }

    // ENDSPAN__ exists because the lexer does not recognize "endspan--"
    // as two separate tokens: report ENDSPAN and leave the "--" out.
    token.tok = TokAttrib_ENDSPAN;
    token.ibTokMac = cbCur + iTokenLength - 2;
    return TRUE;
}
//////////////////////////////////////////////////////////////////////////
// IsIdentifier()
// Mark the token as a name of the given length.
//
// Returns FALSE (token untouched) when iTokenLength is not positive;
// otherwise sets tokClass to tokName, ends the token iTokenLength past
// token.ibTokMin and returns TRUE.
//
BOOL IsIdentifier (int iTokenLength, TXTB & token)
{
    if (iTokenLength <= 0)
        return FALSE;

    token.tokClass = tokName;
    token.ibTokMac = token.ibTokMin + iTokenLength;
    return TRUE;
}
////////////////////////////////////////////////////////////////////
// IsUnknownID ()
// Mark the next token as an ID
//
// Consumes the character at cbCur, then any following identifier characters
// (IsIdChar) up to the end of the line. The token is classified tokSpace and
// ends where scanning stopped. Always returns TRUE.
//
BOOL IsUnknownID (LPCTSTR pchLine, UINT cbLen, UINT cbCur, TXTB & token)
{
    ASSERT(cbCur < cbLen);

    LPCTSTR pch = &pchLine[cbCur];
    do
    {
        const UINT cbStep = _tclen(pch);
        cbCur += cbStep;
        pch += cbStep;
    } while ((cbCur < cbLen) && IsIdChar(*pch));

    token.tokClass = tokSpace;
    token.ibTokMac = cbCur;
    return TRUE;
}
/////////////////////////////////////////////////////////////////////////
// IsNumber()
// Check whether the next token is an SGML NUMTOKEN
//
// Returns FALSE (token untouched) when cbCur is at or past the end of the
// line or the character there is not a digit. Otherwise scans the run of
// digits, sets token.tokClass to tokNum and token.ibTokMac to the offset
// just past the last digit, and returns TRUE.
//
BOOL IsNumber(LPCTSTR pchLine, UINT cbLen, UINT cbCur, TXTB & token)
{
    if (cbCur >= cbLen || !_istdigit(pchLine[cbCur]))
        return FALSE;

    token.tokClass = tokNum;

    do
    {
        // assume all digits are one byte
        ASSERT(1 == _tclen(&pchLine[cbCur]));
        cbCur++;
    } while (cbCur < cbLen && _istdigit(pchLine[cbCur]));

    token.ibTokMac = cbCur;
    return TRUE;
}
/* end of file */