// Copyright (c)1997-1999 Microsoft Corporation, All Rights Reserved /* copied from ..\htmed\lexer.cpp */ /*++ Copyright (c) 1995 Microsoft Corporation File: lexer.cpp Abstract: Nitty Gritty Lexer stuff Contents: SetValueSeen() IsSingleOp() IsWhiteSpace() MapToken() FindEndTag() MakeSublang() SetLanguage() FindTable() FindTable() RemoveTable() MakeTableSet() GetToken() IfHackComment() FindServerScript() FindEndComment() FindEndEntity() FindEntityRef() FindValue() FindEndString() FindTagOpen() FindText() FindNextToken() GetTextHint() GetHint() GetTokenLength() GetValueTokenLength() IsElementName() IsAttributeName() IsIdentifier() IsUnknownID() IsNumber() CColorHtml::SetTable() CColorHtml::InitSublanguages() History: 2/14/97 cgomes: Created --*/ #include "stdafx.h" #include "resource.h" #include "guids.h" #include "token.h" #include "table.h" #include "lexer.h" UINT FindClientScriptEnd(LPCTSTR pchLine, UINT cbLen, UINT cbCur, DWORD * plxs, TXTB & token); #undef ASSERT #define ASSERT(b) _ASSERTE(b) // HACK: we keep a copy of a ptr to the ASP table and sublang // so we can do special behavior for ASP files CTableSet* g_ptabASP = 0; PSUBLANG g_psublangASP = 0; PTABLESET g_arpTables[CV_MAX+1]; // NOTE: added to handle value tokens properly. UINT GetValueTokenLength(LPCTSTR pchLine, UINT cbLen, UINT cbCur); // mark state transition from value -> next attribute inline int SetValueSeen(DWORD *plxs) { if (*plxs & inValue) { *plxs &= ~inValue; *plxs |= inAttribute; return TRUE; } else return FALSE; } // REVIEW (walts) - need better way inline void SetScriptLanguage(LPCTSTR pchLine, DWORD *plxs) { LPCTSTR strJavaScript = _T("javascript"); LPCTSTR strVBScript = _T("vbscript"); // triedit's special language. Its set when we convert server-side scripts into // client-side scripts. Its a dummy language. if we find that as language, we // set in ServerASP. It is reset(removed) in FindNextToken(). LPCTSTR strServerAsp = _T("serverasp"); // language attribute may have quotes around it. // if it does then advance past the first quote. // ex. 3) goto NotEntity; // validate range TCHAR szNum[4]; _tcsncpy(szNum, &pchLine[cbCur + 1], 3); if (_tcstoul(szNum, 0, 10) > 255) goto NotEntity; // we now have a valid numeric entity ref token.tokClass = tokEntity; cbCur = token.ibTokMac; // scan for end of entity ref // scan rest of alphanumeric token // REVIEW: Is this correct? IE 4.40.308 behaves this way while ((cbCur < cbLen) && IsCharAlphaNumeric(pchLine[cbCur])) cbCur++; // scan delimiter if (cbCur < cbLen) cbCur++; token.ibTokMac = cbCur; return cbCur; } else if (!IsCharAlpha(pchLine[cbCur])) { goto NotEntity; } else { // parse and check entity name UINT nLen = GetTokenLength(pchLine, cbLen, cbCur); if (!g_pTable->FindEntity(&pchLine[cbCur], nLen)) goto NotEntity; cbCur += nLen; // eat delimiter if necessary if ((cbCur < cbLen) && (pchLine[cbCur] == ';' || IsWhiteSpace(pchLine[cbCur]))) cbCur++; token.tokClass = tokEntity; token.ibTokMac = cbCur; return cbCur; } } ///////////////////////////////////////////////////////////// // FindEndValue // Find the end of an unquoted value. // // Scan for whitespace or end if tag // UINT FindValue(LPCTSTR pchLine, UINT cbLen, UINT cbCur, DWORD * plxs, TXTB & token) { ASSERT(cbCur < cbLen); do { cbCur++; } while ( cbCur < cbLen && !IsWhiteSpace(pchLine[cbCur]) && pchLine[cbCur] != '>' ); token.tokClass = tokValue; token.ibTokMac = cbCur; // switch from value to attribute *plxs &= ~inValue; *plxs |= inAttribute; return cbCur; } ///////////////////////////////////////////////////////////// // FindEndString() // Find the end of the string. // Should only be called when we are in the string mode already. // UINT FindEndString (LPCTSTR pchLine, UINT cbLen, UINT cbCur, DWORD * plxs, TXTB & token) { LPCTSTR pCurrent = &pchLine[cbCur]; int cb; BOOL bInString = TRUE; TCHAR chDelim; ASSERT (*plxs & INSTRING); // must be in a string now token.tokClass = tokString; chDelim = (*plxs & inStringA) ? _T('\'') : _T('"'); while (bInString && cbCur < cbLen) { if (*pCurrent == chDelim) { *plxs &= ~INSTRING; bInString = FALSE; SetValueSeen(plxs); } else if (*pCurrent == _T('<') && cbCur+1 < cbLen && *(pCurrent+1) == _T('%')) { *plxs |= inHTXTag; break; } cb = _tclen(pCurrent); cbCur += cb; pCurrent += cb; } token.ibTokMac = cbCur; return cbCur; } ////////////////////////////////////////////////////////////////// // UINT FindTagOpen(LPCTSTR pchLine, UINT cbLen, UINT cbCur, DWORD * plxs, TXTB & token) { ASSERT(pchLine[cbCur] == '<'); token.tokClass = tokTag; *plxs &= ~inScriptText; // turn off script coloring when inside tags cbCur++; if (cbCur == cbLen) { *plxs |= inTag; } else { #ifdef NEEDED // copied from htmed\lexer.cpp // // HTMED CHANGE: // REVIEW(cgomes): Figure out if I should turn off inSCRIPT in any of the // following cases. Right now I only do it for the // in this case 0) // found extra spaces so remember them somewhere goto ret; break; //case _T(' '): // if (!fExtraSpace) // fExtraSpace = TRUE; // else // cSpace++; // break; default: //if (cSpace > 0) // found extra spaces so remember them somewhere //cSpace = 0; //fExtraSpace = FALSE; break; } cbCur += _tclen(&pchLine[cbCur]); } ret: token.ibTokMac = cbCur; return cbCur; } ////////////////////////////////////////////////////////////////// // FindNextToken() // Find the next token in the line // UINT FindNextToken(LPCTSTR pchLine, UINT cbLen, UINT cbCur, DWORD * plxs, TXTB & token) { ASSERT (cbCur < cbLen); HINT hint; if (!(*plxs & INTAG)) // scanning text { if (*plxs & TEXTMASK) { if (*plxs & inCOMMENT) token.tokClass = tokComment; else token.tokClass = tokIDENTIFIER; // probe for end tag UINT cbEnd = FindEndTag(pchLine, cbLen, cbCur, plxs, token); if (cbEnd > cbCur) // parsed a nonzero-length token { return cbEnd; } //else fall through to normal processing } hint = GetTextHint(pchLine, cbLen, cbCur, plxs, token); switch (hint) { case HTA: // begin a tag return FindTagOpen(pchLine, cbLen, cbCur, plxs, token); case HEN: // scan an entity reference token.ibTokMac = FindEntityRef(pchLine, cbLen, cbCur, plxs, token); return token.ibTokMac; case EOS: case ONL: return token.ibTokMac; case ERR: default: // scan text as a single token // If the editor uses token info for more than coloring // (e.g. extended selections), then this will need to // return smaller chunks. if (*plxs & inSCRIPT) *plxs |= inScriptText; return FindText(pchLine, cbLen, cbCur, token); break; } return cbCur; } ASSERT(*plxs & INTAG); // must be in a tag here BOOL bError = FALSE; hint = GetHint(pchLine, cbLen, cbCur, plxs, token); switch (hint) { case HTE: // Tag end: remove all tag state bits *plxs &= ~TAGMASK; cbCur++; token.tokClass = tokTag; token.tok = TokTag_CLOSE; token.ibTokMac = cbCur; break; case HNU: #if 0 // lexing HTML instance, not a DTD! if (!IsNumber(pchLine, cbLen, cbCur, token)) bError = TRUE; if (SetValueSeen(plxs)) token.tokClass = tokValue; break; #else // fall through #endif case HRN: // reserved name start: # #if 1 // lexing HTML instance, not a DTD! // simple nonwhitespace stream if (!(*plxs & inValue)) bError = TRUE; FindValue(pchLine, cbLen, cbCur, plxs, token); if (bError) { token.tokClass = tokSpace; bError = FALSE; //"corrected" the error } #else cbCur++; if (cbCur == cbLen) token.tokClass = tokOp; else { if (IsIdChar(pchLine[cbCur])) { cbCur++; while (cbCur < cbLen && IsIdChar(pchLine[cbCur])) cbCur++; token.tokClass = tokResName; } else token.tokClass = tokOp; } token.ibTokMac = cbCur; if (SetValueSeen(plxs)) token.tokClass = tokValue; #endif break; case HEP: // parameter entity: % #if 1 // lexing HTML instance, not a DTD! goto BadChar; #else cbCur++; if (cbCur == cbLen) { token.tokClass = tokOp; token.ibTokMac = cbCur; } else { if (IsIdChar(pchLine[cbCur])) { token.ibTokMac = FindEndEntity(pchLine, cbLen, cbCur, plxs, token); token.tokClass = tokParEnt; } else { token.ibTokMac = cbCur; token.tokClass = tokOp; } } if (SetValueSeen(plxs)) token.tokClass = tokValue; #endif break; // ported HTMED change (walts) -- handle some chars as valid start char for attribute values. case HAV: { if (!(*plxs & inTag) || !SetValueSeen(plxs)) goto BadChar; // not in tag or attribute value. int iTokenLength = GetValueTokenLength(pchLine, cbLen, cbCur); token.ibTokMac = token.ibTokMin + iTokenLength; token.tokClass = tokValue; break; } // ported HTMED change (walts) -- handle some chars as valid start char for attribute values. case HKW: // identifier { int iTokenLength = GetTokenLength(pchLine, cbLen, cbCur); token.ibTokMac = token.ibTokMin + iTokenLength; token.tokClass = tokName; //FUTURE: Don't scan attributes in an end tag if (*plxs & (inTag|inEndTag)) { if (*plxs & inAttribute) { IsAttributeName(pchLine, cbCur, iTokenLength, token); // don't change attribute/value state here // we only look for values after we've seen "=" in case OEQ below // REVIEW(cgomes): what if more attributes follow // the SPAN?? // if found STARTSPAN then pretend I am not in a tag if(token.tok == TokAttrib_STARTSPAN) *plxs &= ~(inTag | inAttribute); // if found ENDSPAN then goback to comment state else if(token.tok == TokAttrib_ENDSPAN) { *plxs &= ~(inTag | inAttribute); *plxs |= inBangTag | inComment; } } else if (SetValueSeen(plxs)) { // REVIEW (walts) // Handle the client side script language detection here for the // following case (language attribute value is NOT wrapped by quotes.) //