// tokhtml.h - Tokens and lex state for HTML // Copyright (c)1997-1999 Microsoft Corporation, All Rights Reserved // // Include lex.h before including this file. // #ifndef __TOKHTML_H__ #define __TOKHTML_H__ #if 0 // Moved to the IDL enum HtmlToken { tokUNKNOWN = tokclsError, tokIDENTIFIER = tokclsIdentMin, // identifier/plain text tokNEWLINE = tokclsUserMin, // // colored HTML items // tokElem, // element name tokAttr, // attribute name tokValue, // attribute value tokComment, // comment tokEntity, // entity reference: e.g. " " tokTag, // tag delimiter tokString, // string tokSpace, // whitespace and unrecognized text in a tag tokOp, // operator tokSSS, // server-side script <%...%> // // parsed HTML and SGML items - tokens folded with items above // tokName, // NAMETOKEN tokNum, // NUMTOKEN tokParEnt, // parameter entity: e.g. "%name;" tokResName, // reserved name // // operators - colors folded with tokOp above // tokOP_MIN, tokOpDash = tokOP_MIN, // - tokOP_SINGLE, tokOpQuestion = tokOP_SINGLE, // ? tokOpComma, // , tokOpPipe, // | tokOpPlus, // + tokOpEqual, // = tokOpStar, // * tokOpAmp, // & tokOpCent, // % tokOpLP, // ( tokOpRP, // ) tokOpLB, // [ tokOpRB, // ] tokOP_MAX, // token op MAX tokEOF }; // the state of lexical analyser // // We're generally in one of two states: // 1. scanning text // 2. scanning tag info // // Within these states, the lexer can be in several substates. // // Text substates: // // inText HTML text content -- process markup // inPLAINTEXT after a tag - remainder of file is not HTML // inCOMMENT COMMENT content -- suppress all markup but </COMMENT> // color text as comment // inXMP XMP content -- suppress all markup but </XMP> // inLISTING LISTING content -- suppress all markup but </LISTING> // inSCRIPT SCRIPT content -- color using script engine. // // Tag substates: // // inTag inside a tag < ... > // inBangTag inside an SGML MDO tag <! ... > // inPITag inside an SGML Prcessing Instruction tag <? ... > // inHTXTag inside an ODBC HTML Extension template tag <% ... %> // inEndTag inside an end tag </name> // inAttribute expecting an attribute // inValue expecting an attribute value (right of =) // inComment inside a comment // inString inside a " string, terminated by " // inStringA inside a ' (Alternate) string, terminated by ' // enum HtmlLexState { // tag types inTag = 0x00000001, // < ... > inBangTag = 0x00000002, // <! ... > inPITag = 0x00000004, // <? ... > inHTXTag = 0x00000008, // <% ... %> inEndTag = 0x00000010, // </ ... > // tag scanning states inAttribute = 0x00000020, inValue = 0x00000040, inComment = 0x00000080, inString = 0x00000100, inStringA = 0x00000200, // text content model states inPLAINTEXT = 0x00001000, inCOMMENT = 0x00002000, inXMP = 0x00004000, inLISTING = 0x00008000, inSCRIPT = 0x00010000, // sublanguages inVariant = 0x00F00000, // mask for sublang index inHTML2 = 0x00000000, inIExplore2 = 0x00100000, inIExplore3 = 0x00200000, // script languages inJavaScript = 0x01000000, inVBScript = 0x02000000, }; // masks for subsets of the state #define INTAG (inTag|inBangTag|inPITag|inHTXTag|inEndTag) #define INSTRING (inString|inStringA) #define TAGMASK (INTAG|inAttribute|inValue|inComment|INSTRING) #define TEXTMASK (inPLAINTEXT|inCOMMENT|inXMP|inLISTING|inSCRIPT) #define STATEMASK (TAGMASK|TEXTMASK) #endif // convert state <-> sublang index inline DWORD SubLangIndexFromLxs(DWORD lxs) { return (lxs & inVariant) >> 20UL; } inline DWORD LxsFromSubLangIndex(DWORD isl) { return (isl << 20UL) & inVariant; } #endif // __TOKHTML_H__