|
|
// Copyright (c)1997-1999 Microsoft Corporation, All Rights Reserved
/* copied from ..\htmed\lexer.cpp */
/*++
Copyright (c) 1995 Microsoft Corporation
File: lexer.cpp
Abstract: Nitty Gritty Lexer stuff
Contents: SetValueSeen() IsSingleOp() IsWhiteSpace() MapToken() FindEndTag() MakeSublang() SetLanguage() FindTable() FindTable() RemoveTable() MakeTableSet() GetToken() IfHackComment() FindServerScript() FindEndComment() FindEndEntity() FindEntityRef() FindValue() FindEndString() FindTagOpen() FindText() FindNextToken() GetTextHint() GetHint() GetTokenLength() GetValueTokenLength() IsElementName() IsAttributeName() IsIdentifier() IsUnknownID() IsNumber() CColorHtml::SetTable() CColorHtml::InitSublanguages()
History: 2/14/97 cgomes: Created
--*/
#include "stdafx.h"
#include "resource.h"
#include "guids.h"
#include "token.h"
#include "table.h"
#include "lexer.h"
UINT FindClientScriptEnd(LPCTSTR pchLine, UINT cbLen, UINT cbCur, DWORD * plxs, TXTB & token);
#undef ASSERT
#define ASSERT(b) _ASSERTE(b)
// HACK: we keep a copy of a ptr to the ASP table and sublang
// so we can do special behavior for ASP files
CTableSet* g_ptabASP = 0; PSUBLANG g_psublangASP = 0;
PTABLESET g_arpTables[CV_MAX+1];
// NOTE: added to handle value tokens properly.
UINT GetValueTokenLength(LPCTSTR pchLine, UINT cbLen, UINT cbCur);
// mark state transition from value -> next attribute
inline int SetValueSeen(DWORD *plxs) { if (*plxs & inValue) { *plxs &= ~inValue; *plxs |= inAttribute; return TRUE; } else return FALSE; }
// REVIEW (walts) - need better way
inline void SetScriptLanguage(LPCTSTR pchLine, DWORD *plxs) { LPCTSTR strJavaScript = _T("javascript"); LPCTSTR strVBScript = _T("vbscript"); // triedit's special language. Its set when we convert server-side scripts into
// client-side scripts. Its a dummy language. if we find that as language, we
// set in ServerASP. It is reset(removed) in FindNextToken().
LPCTSTR strServerAsp = _T("serverasp");
// language attribute may have quotes around it.
// if it does then advance past the first quote.
// ex. <SCRIPT LANGUAGE="VBScript">
if(*pchLine == L'\"') pchLine++;
if (_tcsnicmp(pchLine, strJavaScript, lstrlen(strJavaScript)) == 0) { *plxs &= ~inVBScript; *plxs &= ~inServerASP; *plxs |= inJavaScript; } else if (_tcsnicmp(pchLine, strVBScript, lstrlen(strVBScript)) == 0) { *plxs &= ~inJavaScript; *plxs &= ~inServerASP; *plxs |= inVBScript; } else if (_tcsnicmp(pchLine, strServerAsp, lstrlen(strServerAsp)) == 0) { *plxs &= ~inJavaScript; *plxs &= ~inVBScript; *plxs |= inServerASP; } }
inline BOOL IsSingleOp(HINT hint) { return ((hint >= tokOP_SINGLE) && (hint < tokOP_MAX)); };
inline BOOL IsWhiteSpace(TCHAR c) { return _istspace(c); };
// NOTE: Added to handle value tokens properly
inline IsValueChar(TCHAR ch) { // REVIEW(cgomes): specify all the invalid value characters
return ch != _T('<') && ch != _T('>'); };
////////////////////////////////////////////////////////////////////////////
//
// map parsed token to returned token
// left column must be in ascending order
static TOKEN _rgTokenMap[] = { tokName, tokSpace, tokNum, tokSpace, tokParEnt, tokSpace, tokResName, tokSpace, 0, 0 };
static TOKEN MapToken(TOKEN tokClass, DWORD lxs) { if (IsSingleOp((HINT)tokClass)) return tokOp; else if ((tokClass == tokTag) && (lxs & inHTXTag)) return tokSSS; for (int i = 0; (_rgTokenMap[i] != 0) && (_rgTokenMap[i] >= tokClass); i += 2) { if (_rgTokenMap[i] == tokClass) return _rgTokenMap[i + 1]; } return tokClass; }
////////////////////////////////////////////////////////////////////////////
UINT FindEndTag(LPCTSTR pchLine, UINT cbLen, UINT cbCur, DWORD *plxs, TXTB & token) { ASSERT(pchLine); TCHAR szEnd[16]; ELLEX * pellex = pellexFromTextState(*plxs); ASSERT(0 != pellex); // shouldn't be called with something other than special text state
UINT cbCmp = 3 + pellex->cb; // length of end tag
ASSERT(cbCmp < sizeof szEnd); _tcscpy(szEnd, _T("</")); _tcscat(szEnd, pellex->sz); _tcscat(szEnd, _T(">"));
while (cbCur < cbLen) { if (_T('<') == pchLine[cbCur]) { if ((cbLen - cbCur >= cbCmp) && (0 == _tcsnicmp(szEnd, &pchLine[cbCur], cbCmp))) { *plxs &= ~TEXTMASK; // special text modes are exclusive
token.ibTokMac = cbCur; return cbCur; } else if ((cbCur + 1 < cbLen) && (_T('%') == pchLine[cbCur+1])) { *plxs |= inHTXTag; token.ibTokMac = cbCur; break; } else cbCur++; } else cbCur += _tclen(&pchLine[cbCur]); } token.ibTokMac = cbCur; return cbCur; }
////////////////////////////////////////////////////////////////////////////
BOOL MakeSublang(PSUBLANG ps, UINT id, const TCHAR *strName, UINT nIdTemplate, CLSID clsid) { int len;
ASSERT( NULL != ps );
ps->szSubLang = NULL; ps->lxsInitial = LxsFromSubLangIndex(id); ps->nIdTemplate = nIdTemplate; ps->clsidTemplate = clsid;
if ((len = lstrlen(strName)) != 0) { LPTSTR szNew = new TCHAR [len+1]; if (NULL != szNew) { _tcscpy(szNew,strName); ps->szSubLang = szNew; return TRUE; } } return FALSE; }
// Set sublang and tableset array members,
// putting the default one in 0th position.
//
void SetLanguage(TCHAR * strDefault, PSUBLANG rgSublang, PTABLESET pTab, UINT & index, UINT nIdTemplate, CLSID clsid) { if (pTab != NULL) { int i; if (lstrcmp(strDefault, pTab->Name()) == 0) i = 0; else i = index; if (MakeSublang(rgSublang+i, i, pTab->Name(), nIdTemplate, clsid)) { g_arpTables[i] = pTab; if (i) index++; else g_pTable = pTab; } else delete pTab; } }
CTableSet * FindTable(CTableSet ** rgpts, TCHAR *strName) { for (int n = 0; rgpts[n]; n++) { if (rgpts[n]->Name() == strName) //if (strcmp(rgpts[n]->Name(), strName) == 0)
return rgpts[n]; } return NULL; }
CTableSet * FindTable(CTableSet ** rgpts, CTableSet * pts) { for (int n = 0; rgpts[n]; n++) { if (rgpts[n] == pts) return rgpts[n]; } return NULL; }
void RemoveTable(CTableSet ** rgpts, CTableSet *pts) { int n; for (n = 0; rgpts[n]; n++) { if (rgpts[n] == pts) { for(; rgpts[n]; n++) rgpts[n] = rgpts[n+1]; return; } } }
CTableSet * MakeTableSet(CTableSet ** /*rgpts*/, RWATT_T att, UINT nIdName) { return new CStaticTableSet(att, nIdName); }
////////////////////////////////////////////////////////////////////////
// GetToken()
//
UINT GetToken(LPCTSTR pchLine, UINT cbLen, UINT cbCur, DWORD * plxs, TXTB & token) { ASSERT (cbCur < cbLen); if(cbCur > cbLen) return cbCur;
UINT cbCount = 0;
// init token
token.tok = 0;
// initialize location where token starts
token.ibTokMin = cbCur;
if (*plxs & inHTXTag) cbCount = FindServerScript(pchLine, cbLen, cbCur, plxs, token); else if (*plxs & inSCRIPT && !(*plxs & inTag) && !(*plxs & inServerASP)) { // NOTE that we want to skip tokenizing scripts that are special to triedit
// when we wrap server-side scripts in client-side scripts, we set a dummy
// language as 'serverasp'. inServerASP is set in that case.
cbCount = FindClientScriptEnd(pchLine, cbLen, cbCur, plxs, token); } else if (*plxs & inComment) // in a comment
{ if (*plxs & inSCRIPT) *plxs |= inScriptText; COMMENTTYPE ct = IfHackComment(pchLine, cbLen, cbCur, plxs, token); if (ct == CT_METADATA) { // Treat as an element
cbCount = FindNextToken(pchLine, cbLen, cbCur, plxs, token); // Remove inBangTag
*plxs &= ~inBangTag; } else if (ct == CT_IECOMMENT) cbCount = token.ibTokMac; else cbCount = FindEndComment(pchLine, cbLen, cbCur, plxs, token); } else if (*plxs & INSTRING) // in a string
cbCount = FindEndString(pchLine, cbLen, cbCur, plxs, token); else cbCount = FindNextToken(pchLine, cbLen, cbCur, plxs, token);
token.tokClass = MapToken(token.tokClass, *plxs); return cbCount; }
///////////////////////////////////////////////////////////////////////////////////
// IfHackComment
//
// Probe ahead in the current line to see if we have what IE recognizes
// as the end of a comment ("->"). This does not conform to RFC 1866 or SGML,
// but suppports browser behavior. This lets us tolerate comments of the
// form: "<!--- whatever ->"
// (note how it ends)
//
// Returns a COMMENTTYPE enum.
// 0 if norma comment
// 1 if IE comment
// -1 if METADATA comment
//
// Proper comments are scanned using FindEndComment().
//
COMMENTTYPE IfHackComment(LPCTSTR pchLine, UINT cbLen, UINT cbCur, DWORD * plxs, TXTB & token) { token.tokClass = tokComment; while (cbCur+1 < cbLen) { if(_tcsnicmp(&pchLine[cbCur], _T("METADATA"), lstrlen(_T("METADATA"))) == 0) { token.ibTokMac = cbCur + 1; // include second dash??
*plxs &= ~inComment; // Remove inBangTag
*plxs &= ~inBangTag; *plxs |= inTag; return CT_METADATA; // METADATA
} else if (pchLine[cbCur] == '-' && pchLine[cbCur + 1] == '>') { token.ibTokMac = cbCur + 1; *plxs &= ~inComment; *plxs &= ~inScriptText; return CT_IECOMMENT; } else { cbCur += _tclen(&pchLine[cbCur]); } } return CT_NORMAL; }
UINT FindServerScript(LPCTSTR pchLine, UINT cbLen, UINT cbCur, DWORD * plxs, TXTB & token) { LPCTSTR pCurrent = &pchLine[cbCur]; int cb;
// parse HTX start tag
if (*pCurrent == _T('<') && (cbCur+1 < cbLen) && *(pCurrent+1) == '%') { token.tokClass = tokTag; token.tok = TokTag_SSSOPEN; token.ibTokMac = cbCur + 2; *plxs |= inHTXTag; return token.ibTokMac; }
ASSERT(*plxs & inHTXTag); // should be in HTXTag state here
if (*pCurrent == _T('%') && (cbCur+1 < cbLen) && *(pCurrent+1) == '>') { token.tok = TokTag_SSSCLOSE; token.tokClass = tokSSS; //tokTag;
token.ibTokMac = cbCur + 2; *plxs &= ~inHTXTag; if (*plxs & inNestedQuoteinSSS) *plxs &= ~inNestedQuoteinSSS; return token.ibTokMac; }
token.tokClass = tokSSS;
while (cbCur < cbLen) { if (*pCurrent == _T('%') && (cbCur+1 < cbLen) && (*(pCurrent+1) == _T('>'))) break; if ( *pCurrent == _T('"') && *plxs&inTag && *plxs&inHTXTag && *plxs&inAttribute && *plxs&inString ) *plxs |= inNestedQuoteinSSS;
cb = _tclen(pCurrent); cbCur += cb; pCurrent += cb; }
token.ibTokMac = cbCur; return cbCur; }
///////////////////////////////////////////////////////////////////////////////////
// FindClientScriptEnd()
//
// HTMED CHANGE: Find the end of client script block
//
UINT FindClientScriptEnd(LPCTSTR pchLine, UINT cbLen, UINT cbCur, DWORD * plxs, TXTB & token) { LPCTSTR pCurrent = &pchLine[cbCur]; int cb;
TCHAR rgEndScript[] = _T("</SCRIPT"); int cchEndScript = (wcslen(rgEndScript) - 1);
if( cbCur + cchEndScript < cbLen && 0 == _tcsnicmp(pCurrent, rgEndScript, cchEndScript)) { token.tokClass = tokTag; token.tok = TokTag_END; *plxs &= ~inSCRIPT; *plxs |= inEndTag; token.ibTokMac = cbCur + 2; return token.ibTokMac; }
token.tokClass = tokSpace;
while (cbCur < cbLen) { if (*pCurrent == _T('<') && (cbCur+1 < cbLen) && (*(pCurrent+1) == _T('/'))) { // Check if found end </SCRIPT
if( cbCur + cchEndScript < cbLen && 0 == _tcsnicmp(pCurrent, rgEndScript, cchEndScript)) { // Check if found end </SCRIPT
break; } } cb = _tclen(pCurrent); cbCur += cb; pCurrent += cb; }
token.ibTokMac = cbCur; return cbCur; }
///////////////////////////////////////////////////////////////////////////////////
// FindEndComment()
//
// Find the end of comment ("--").
//
UINT FindEndComment(LPCTSTR pchLine, UINT cbLen, UINT cbCur, DWORD * plxs, TXTB & token) { LPCTSTR pCurrent = &pchLine[cbCur]; BOOL bEndComment = FALSE; int cb;
ASSERT(*plxs & inComment); // must be in a comment now
token.tokClass = tokComment;
while (!bEndComment && cbCur < cbLen) { if (*pCurrent == _T('-')) // check the character to see if it's the first "-" in "--"
{ pCurrent++; cbCur++; if ((cbCur < cbLen) && (*pCurrent == _T('-'))) // we're possibly at the end, so search for the final "--" pair
{ bEndComment = TRUE; } } else { cb = _tclen(pCurrent); cbCur += cb; pCurrent += cb; } } if (cbCur < cbLen) { cb = _tclen(pCurrent); cbCur += cb; pCurrent += cb; }
token.ibTokMac = cbCur;
// reset state if we reach end of comment
if (bEndComment) *plxs &= ~inComment;
return cbCur; }
/////////////////////////////////////////////////////////////
// FindEndEntity()
//
// Find the end of the special character sequence (ends with ; or whitespace).
//
UINT FindEndEntity(LPCTSTR pchLine, UINT cbLen, UINT cbCur, DWORD * /*plxs*/, TXTB & token) { token.tokClass = tokEntity; int cb = GetTokenLength(pchLine, cbLen, cbCur); if (pchLine[cbCur + cb] == ';') cb++; token.ibTokMac = cbCur + cb; return token.ibTokMac; }
/////////////////////////////////////////////////////////////
// Find an entity reference or non-entity ref, literal "&..."
//
UINT FindEntityRef(LPCTSTR pchLine, UINT cbLen, UINT cbCur, DWORD * /*plxs*/, TXTB & token) { ASSERT(cbCur < cbLen); ASSERT(pchLine[cbCur] == '&'); // must be on ERO
cbCur++; if (cbCur == cbLen) { NotEntity: token.tokClass = tokIDENTIFIER; // plain text
token.ibTokMac = cbCur; return cbCur; }
if (pchLine[cbCur] == '#') { // parse and check valid number
if (!IsNumber(pchLine, cbLen, cbCur + 1, token)) goto NotEntity;
// must be <= 3 digits
if (token.ibTokMac - (cbCur + 1) > 3) goto NotEntity;
// validate range
TCHAR szNum[4]; _tcsncpy(szNum, &pchLine[cbCur + 1], 3); if (_tcstoul(szNum, 0, 10) > 255) goto NotEntity;
// we now have a valid numeric entity ref
token.tokClass = tokEntity; cbCur = token.ibTokMac;
// scan for end of entity ref
// scan rest of alphanumeric token
// REVIEW: Is this correct? IE 4.40.308 behaves this way
while ((cbCur < cbLen) && IsCharAlphaNumeric(pchLine[cbCur])) cbCur++;
// scan delimiter
if (cbCur < cbLen) cbCur++; token.ibTokMac = cbCur; return cbCur; } else if (!IsCharAlpha(pchLine[cbCur])) { goto NotEntity; } else { // parse and check entity name
UINT nLen = GetTokenLength(pchLine, cbLen, cbCur); if (!g_pTable->FindEntity(&pchLine[cbCur], nLen)) goto NotEntity;
cbCur += nLen; // eat delimiter if necessary
if ((cbCur < cbLen) && (pchLine[cbCur] == ';' || IsWhiteSpace(pchLine[cbCur]))) cbCur++; token.tokClass = tokEntity; token.ibTokMac = cbCur; return cbCur; } }
/////////////////////////////////////////////////////////////
// FindEndValue
// Find the end of an unquoted value.
//
// Scan for whitespace or end if tag
//
UINT FindValue(LPCTSTR pchLine, UINT cbLen, UINT cbCur, DWORD * plxs, TXTB & token) { ASSERT(cbCur < cbLen);
do { cbCur++; } while ( cbCur < cbLen && !IsWhiteSpace(pchLine[cbCur]) && pchLine[cbCur] != '>' );
token.tokClass = tokValue; token.ibTokMac = cbCur;
// switch from value to attribute
*plxs &= ~inValue; *plxs |= inAttribute;
return cbCur; }
/////////////////////////////////////////////////////////////
// FindEndString()
// Find the end of the string.
// Should only be called when we are in the string mode already.
//
UINT FindEndString (LPCTSTR pchLine, UINT cbLen, UINT cbCur, DWORD * plxs, TXTB & token) { LPCTSTR pCurrent = &pchLine[cbCur]; int cb; BOOL bInString = TRUE; TCHAR chDelim;
ASSERT (*plxs & INSTRING); // must be in a string now
token.tokClass = tokString; chDelim = (*plxs & inStringA) ? _T('\'') : _T('"');
while (bInString && cbCur < cbLen) { if (*pCurrent == chDelim) { *plxs &= ~INSTRING; bInString = FALSE; SetValueSeen(plxs); } else if (*pCurrent == _T('<') && cbCur+1 < cbLen && *(pCurrent+1) == _T('%')) { *plxs |= inHTXTag; break; } cb = _tclen(pCurrent); cbCur += cb; pCurrent += cb; } token.ibTokMac = cbCur; return cbCur; }
//////////////////////////////////////////////////////////////////
//
UINT FindTagOpen(LPCTSTR pchLine, UINT cbLen, UINT cbCur, DWORD * plxs, TXTB & token) { ASSERT(pchLine[cbCur] == '<'); token.tokClass = tokTag; *plxs &= ~inScriptText; // turn off script coloring when inside tags
cbCur++;
if (cbCur == cbLen) { *plxs |= inTag; } else { #ifdef NEEDED // copied from htmed\lexer.cpp
//
// HTMED CHANGE:
// REVIEW(cgomes): Figure out if I should turn off inSCRIPT in any of the
// following cases. Right now I only do it for the </ case.
//
#endif //NEEDED
switch (pchLine[cbCur]) { case '!': // MDO - Markup Declaration Open
cbCur++; *plxs |= inBangTag; token.tok = TokTag_BANG; break;
case '/': // End tag
cbCur++; *plxs |= inEndTag; token.tok = TokTag_END; #ifdef NEEDED // copied from htmed\lexer.cpp
// HTMED CHANGE:
// REVIEW(cgomes): Colorizer bug: it never removes the inSCRIPT state
// This removes the inSCRIPT in the case <SCRIPT <BODY>
// in this case <BODY is in error.
//
*plxs &= ~inSCRIPT; #endif //NEEDED
break;
// REVIEW: PI is SGML -- not in HTML, but might be added
case '?': // PI - Processing Instruction
cbCur++; *plxs |= inPITag; token.tok = TokTag_PI; break;
case '%': // HTX -- ODBC server HTML extension
cbCur++; *plxs |= inHTXTag; token.tok = TokTag_SSSOPEN; break;
default: // Tag
if (IsCharAlpha(pchLine[cbCur])) { *plxs |= inTag; token.tok = TokTag_START; } else token.tokClass = tokIDENTIFIER; // NOT a TAG
break; } } token.ibTokMac = cbCur; return cbCur; }
//////////////////////////////////////////////////////////////////
// FindText
// Scan a token of text
// NOTE DO NOT MODIFY this function, mainly b/c the side effects
// will be hard to find, and will break the way
// that everything works.
//
UINT FindText(LPCTSTR pchLine, UINT cbLen, UINT cbCur, TXTB & token) { //BOOL fExtraSpace = FALSE;
//int cSpace = 0;
ASSERT (cbCur < cbLen);
token.tokClass = tokIDENTIFIER;
//if (pchLine[cbCur] == ' ' && !fExtraSpace)
// fExtraSpace = TRUE;
cbCur += _tclen(&pchLine[cbCur]); while (cbCur < cbLen) { switch (pchLine[cbCur]) { case _T('\0'): case _T('\n'): case _T('<'): case _T('&'): //if (cSpace > 0) // found extra spaces so remember them somewhere
goto ret; break; //case _T(' '):
// if (!fExtraSpace)
// fExtraSpace = TRUE;
// else
// cSpace++;
// break;
default: //if (cSpace > 0) // found extra spaces so remember them somewhere
//cSpace = 0;
//fExtraSpace = FALSE;
break; } cbCur += _tclen(&pchLine[cbCur]); }
ret: token.ibTokMac = cbCur; return cbCur; }
//////////////////////////////////////////////////////////////////
// FindNextToken()
// Find the next token in the line
//
UINT FindNextToken(LPCTSTR pchLine, UINT cbLen, UINT cbCur, DWORD * plxs, TXTB & token) { ASSERT (cbCur < cbLen); HINT hint;
if (!(*plxs & INTAG)) // scanning text
{ if (*plxs & TEXTMASK) { if (*plxs & inCOMMENT) token.tokClass = tokComment; else token.tokClass = tokIDENTIFIER; // probe for end tag </comment>
UINT cbEnd = FindEndTag(pchLine, cbLen, cbCur, plxs, token); if (cbEnd > cbCur) // parsed a nonzero-length token
{ return cbEnd; } //else fall through to normal processing
} hint = GetTextHint(pchLine, cbLen, cbCur, plxs, token); switch (hint) { case HTA: // begin a tag
return FindTagOpen(pchLine, cbLen, cbCur, plxs, token);
case HEN: // scan an entity reference
token.ibTokMac = FindEntityRef(pchLine, cbLen, cbCur, plxs, token); return token.ibTokMac;
case EOS: case ONL: return token.ibTokMac;
case ERR: default: // scan text as a single token
// If the editor uses token info for more than coloring
// (e.g. extended selections), then this will need to
// return smaller chunks.
if (*plxs & inSCRIPT) *plxs |= inScriptText; return FindText(pchLine, cbLen, cbCur, token); break; }
return cbCur; }
ASSERT(*plxs & INTAG); // must be in a tag here
BOOL bError = FALSE; hint = GetHint(pchLine, cbLen, cbCur, plxs, token); switch (hint) { case HTE: // Tag end: remove all tag state bits
*plxs &= ~TAGMASK; cbCur++; token.tokClass = tokTag; token.tok = TokTag_CLOSE; token.ibTokMac = cbCur; break;
case HNU: #if 0 // lexing HTML instance, not a DTD!
if (!IsNumber(pchLine, cbLen, cbCur, token)) bError = TRUE; if (SetValueSeen(plxs)) token.tokClass = tokValue; break; #else
// fall through
#endif
case HRN: // reserved name start: #
#if 1 // lexing HTML instance, not a DTD!
// simple nonwhitespace stream
if (!(*plxs & inValue)) bError = TRUE; FindValue(pchLine, cbLen, cbCur, plxs, token); if (bError) { token.tokClass = tokSpace; bError = FALSE; //"corrected" the error
} #else
cbCur++; if (cbCur == cbLen) token.tokClass = tokOp; else { if (IsIdChar(pchLine[cbCur])) { cbCur++; while (cbCur < cbLen && IsIdChar(pchLine[cbCur])) cbCur++; token.tokClass = tokResName; } else token.tokClass = tokOp; } token.ibTokMac = cbCur; if (SetValueSeen(plxs)) token.tokClass = tokValue; #endif
break;
case HEP: // parameter entity: %
#if 1 // lexing HTML instance, not a DTD!
goto BadChar; #else
cbCur++; if (cbCur == cbLen) { token.tokClass = tokOp; token.ibTokMac = cbCur; } else { if (IsIdChar(pchLine[cbCur])) { token.ibTokMac = FindEndEntity(pchLine, cbLen, cbCur, plxs, token); token.tokClass = tokParEnt; } else { token.ibTokMac = cbCur; token.tokClass = tokOp; } } if (SetValueSeen(plxs)) token.tokClass = tokValue; #endif
break;
// ported HTMED change (walts) -- handle some chars as valid start char for attribute values.
case HAV: { if (!(*plxs & inTag) || !SetValueSeen(plxs)) goto BadChar; // not in tag or attribute value.
int iTokenLength = GetValueTokenLength(pchLine, cbLen, cbCur); token.ibTokMac = token.ibTokMin + iTokenLength; token.tokClass = tokValue; break; } // ported HTMED change (walts) -- handle some chars as valid start char for attribute values.
case HKW: // identifier
{ int iTokenLength = GetTokenLength(pchLine, cbLen, cbCur); token.ibTokMac = token.ibTokMin + iTokenLength; token.tokClass = tokName; //FUTURE: Don't scan attributes in an end tag
if (*plxs & (inTag|inEndTag)) { if (*plxs & inAttribute) { IsAttributeName(pchLine, cbCur, iTokenLength, token); // don't change attribute/value state here
// we only look for values after we've seen "=" in case OEQ below
// REVIEW(cgomes): what if more attributes follow
// the SPAN??
// if found STARTSPAN then pretend I am not in a tag
if(token.tok == TokAttrib_STARTSPAN) *plxs &= ~(inTag | inAttribute); // if found ENDSPAN then goback to comment state
else if(token.tok == TokAttrib_ENDSPAN) { *plxs &= ~(inTag | inAttribute); *plxs |= inBangTag | inComment; } } else if (SetValueSeen(plxs)) { // REVIEW (walts)
// Handle the client side script language detection here for the
// following case (language attribute value is NOT wrapped by quotes.)
// <SCRIPT LANGUAGE=VBScript>
if (*plxs & inSCRIPT) { SetScriptLanguage(&pchLine[cbCur], plxs); }
//
// REVIEW(cgomes): It seems that any non-white space character
// is valid for non-quoted attribute values.
// Problem is that GetTokenLength is used to determine
// the token length, which works great non-values,
// but pulls egss for values.
// I use GetValueTokenLength here to get the length
// of value token. GetValueTokenLength will not
// stop till it hits a white space character.
//
iTokenLength = GetValueTokenLength(pchLine, cbLen, cbCur); token.ibTokMac = token.ibTokMin + iTokenLength; token.tokClass = tokName;
token.tokClass = tokValue; } else { IsElementName(pchLine, cbCur, iTokenLength, token); // look for attributes
*plxs |= inAttribute; // set content state
if (*plxs & inTag) *plxs |= TextStateFromElement(&pchLine[token.ibTokMin], iTokenLength); else if ((*plxs & inEndTag) && (*plxs & TEXTMASK)) *plxs &= ~TextStateFromElement(&pchLine[token.ibTokMin], iTokenLength); else if ((*plxs & inEndTag) && (*plxs & inSCRIPT)) *plxs &= ~(inSCRIPT | inScriptText | inServerASP/* | inVBScript | inJavaScript*/); } } else if (*plxs & inBangTag) { // FUTURE: other <!...> items like "HTML", "PUBLIC"? -- nice for DTDs
// Use a RW table for it if we do
// recognize <!DOCTYPE ...> as 'element'
if ((iTokenLength == 7) && (0 == _tcsnicmp(&pchLine[cbCur], _T("doctype"), 7))) token.tokClass = tokElem; } break; }
case HST: // string "..."
*plxs |= inString; goto String;
case HSL: // string alternate '...'
*plxs |= inStringA; String: cbCur++; token.ibTokMac = FindEndString(pchLine, cbLen, cbCur, plxs, token); SetValueSeen(plxs); // Handle the client side script language detection here for the
// following case (language attribute value is wrapped by quotes.)
// <SCRIPT LANGUAGE="VBScript">
if((*plxs & inSCRIPT) && (*plxs & inAttribute)) { SetScriptLanguage(&pchLine[cbCur], plxs); } break;
case HWS: // tag whitespace
do { cbCur++; } while (cbCur < cbLen && IsWhiteSpace(pchLine[cbCur])); token.tokClass = tokSpace; token.ibTokMac = cbCur; break;
case OEQ: // GetHint has set token info
if (*plxs & inAttribute) { // start looking for values
*plxs &= ~inAttribute; *plxs |= inValue; } else goto BadChar; break;
case HTA: if (cbCur+1 < cbLen && '%' == pchLine[cbCur+1]) { SetValueSeen(plxs); return FindTagOpen(pchLine, cbLen, cbCur, plxs, token); } // else fall through
case ERR: case HEN: BadChar: token.tokClass = tokSpace;
// DS96# 10116 [CFlaat]: we can be in DBCS here, and so we need
// to make sure that our increment is double-byte aware
cbCur += _tcsnbcnt(pchLine + cbCur, 1); // byte count for current char
ASSERT(cbCur <= cbLen); token.ibTokMac = cbCur; break;
// ported HTMED CHANGE (walts) - added this case to handle dbcs attribute values.
case HDB: { // DBCS char. Handle for attribute values within tag.
if (!SetValueSeen(plxs)) goto BadChar;
int iTokenLength = GetValueTokenLength(pchLine, cbLen, cbCur); token.ibTokMac = token.ibTokMin + iTokenLength; token.tokClass = tokValue; } break; // ported HTMED CHANGE END
default: // GetHint has set token info
if (token.tokClass != tokComment && (*plxs & inValue)) FindValue(pchLine, cbLen, cbCur, plxs, token); break; } if (bError) IsUnknownID(pchLine, cbLen, cbCur, token); return token.ibTokMac; }
////////////////////////////////////////////////////////////////////
// GetTextHint()
// Like GetHint when scanning text -- look only for tags and entities
//
HINT GetTextHint(LPCTSTR pchLine, UINT /*cbLen*/, UINT cbCur, DWORD * /*plxs*/, TXTB & token) { // if the character is bigger than 128 (dbcs) then return error
if (pchLine[cbCur] & ~0x7F) return HDB;
HINT hint = g_hintTable[pchLine[cbCur]];
if (IsSingleOp(hint)) { hint = ERR; } else if (hint == ONL || hint == EOS) { token.tokClass = tokOp; token.ibTokMac = cbCur + 1; } return hint; }
////////////////////////////////////////////////////////////////////
// GetHint()
// Use hint table to guess what the next token going to be
// If it is a single operator, it will fill in the token info
// as well
//
HINT GetHint(LPCTSTR pchLine, UINT cbLen, UINT cbCur, DWORD * plxs, TXTB & token) { // if the character is bigger than 128 (dbcs) then return error
if (pchLine[cbCur] & ~0x7F) return HDB;
HINT hint = g_hintTable[pchLine[cbCur]];
// check if it is a single op, new line or end of stream
if (IsSingleOp(hint) || hint == ONL || hint == EOS) { token.tokClass = hint; token.ibTokMac = cbCur + 1; } else if (hint == ODA) { if ((cbCur + 1 < cbLen) && (g_hintTable[pchLine[cbCur + 1]] == ODA) && (*plxs & inBangTag)) { cbCur += 2; *plxs |= inComment; COMMENTTYPE ct = IfHackComment(pchLine, cbLen, cbCur, plxs, token); if (ct == 0) { token.tokClass = tokComment; token.ibTokMac = cbCur; } else if(ct == CT_METADATA) hint = HTA; // tag open
} else { // single -
token.tokClass = tokOp; token.ibTokMac = cbCur + 1; } } return hint; }
///////////////////////////////////////////////////////////////////
// GetTokenLength ()
// return the length of a token identifier/keyword
//
UINT GetTokenLength(LPCTSTR pchLine, UINT cbLen, UINT cbCur) { LPCTSTR pCurrent = &pchLine[cbCur]; UINT cb; UINT cbOld = cbCur;
if (IsCharAlphaNumeric(*pCurrent)) { while (cbCur < cbLen && IsIdChar(*pCurrent)) { cb = _tclen(pCurrent); cbCur += cb; pCurrent += cb; } } return (int) max((cbCur - cbOld), 1); }
/*
UINT GetValueTokenLength
Description: Gets the length of the token. This version will accept any non whitespace character in the token.
*/ UINT GetValueTokenLength(LPCTSTR pchLine, UINT cbLen, UINT cbCur) { LPCTSTR pCurrent = &pchLine[cbCur]; UINT cb; UINT cbOld = cbCur;
while (cbCur < cbLen && !_istspace(*pCurrent) && IsValueChar(*pCurrent)) { cb = _tclen(pCurrent); cbCur += cb; pCurrent += cb; } return (int) max((cbCur - cbOld), 1); }
////////////////////////////////////////////////////////////////
// IsElementName ()
// lookup the keyword table to determine if it is a keyword or not
//
BOOL IsElementName(LPCTSTR pchLine, UINT cbCur, int iTokenLength, TXTB & token) { LPCTSTR pCurrent = &pchLine[cbCur]; int iFound = NOT_FOUND;
if (NOT_FOUND != (iFound = g_pTable->FindElement(pCurrent, iTokenLength))) { token.tokClass = tokElem; token.ibTokMac = cbCur + iTokenLength; token.tok = iFound; // set token
} return (iFound != NOT_FOUND); }
int IndexFromElementName(LPCTSTR pszName) { return g_pTable->FindElement(pszName, lstrlen(pszName)); }
////////////////////////////////////////////////////////////////
// IsAttributeName ()
// lookup the keyword table to determine if it is a keyword or not
//
BOOL IsAttributeName(LPCTSTR pchLine, UINT cbCur, int iTokenLength, TXTB & token) { LPCTSTR pCurrent = &pchLine[cbCur]; int iFound = NOT_FOUND;
if (NOT_FOUND != (iFound = g_pTable->FindAttribute(pCurrent, iTokenLength))) { token.tokClass = tokAttr; // ENDSPAN__ is needed b/c the lexer does not recognize the
// endspan-- as 2 seperate tokens.
if(iFound == TokAttrib_ENDSPAN__) { // endspan-- found. return TokAttrib_ENDSPAN
// set ibTokMac to not include --.
token.tok = TokAttrib_ENDSPAN; token.ibTokMac = cbCur + iTokenLength - 2; } else { token.ibTokMac = cbCur + iTokenLength; token.tok = iFound; // set token
} } return (iFound != NOT_FOUND); }
//////////////////////////////////////////////////////////////////////////
// IsIdentifier()
// check if it is an identifier
//
BOOL IsIdentifier (int iTokenLength, TXTB & token) { if (iTokenLength > 0) { token.tokClass = tokName; token.ibTokMac = token.ibTokMin + iTokenLength; return TRUE; } else return FALSE; }
////////////////////////////////////////////////////////////////////
// IsUnknownID ()
// Mark the next token as an ID
//
BOOL IsUnknownID (LPCTSTR pchLine, UINT cbLen, UINT cbCur, TXTB & token) { ASSERT(cbCur < cbLen); UINT cb; LPCTSTR pCurrent = &pchLine[cbCur];
cb = _tclen(pCurrent); cbCur += cb; pCurrent += cb;
while ((cbCur < cbLen) && IsIdChar(*pCurrent)) { cb = _tclen(pCurrent); cbCur += cb; pCurrent += cb; }
token.tokClass = tokSpace; token.ibTokMac = cbCur; return TRUE; }
/////////////////////////////////////////////////////////////////////////
// IsNumber()
// Check whether the next token is an SGML NUMTOKEN
//
BOOL IsNumber(LPCTSTR pchLine, UINT cbLen, UINT cbCur, TXTB & token) { if (cbCur >= cbLen) return FALSE;
if (!_istdigit(pchLine[cbCur])) return FALSE;
token.tokClass = tokNum;
// assume all digits are one byte
ASSERT(1 == _tclen(&pchLine[cbCur])); cbCur++;
while (cbCur < cbLen && _istdigit(pchLine[cbCur])) { // assume all digits are one byte
ASSERT(1 == _tclen(&pchLine[cbCur])); cbCur++; }
token.ibTokMac = cbCur; return TRUE; }
/* end of file */
|