/* * * Copyright (c) 1998,1999 Microsoft Corporation. All rights reserved. * EXEMPT: copyright change only, no build required * */ #ifndef _XMLSTREAM_HXX #define _XMLSTREAM_HXX #pragma once #include "bufferedstream.hxx" #include "encodingstream.hxx" #include "_rawstack.hxx" class XMLParser; // the XMLStream class uses the error code and token types defined in the xmlparser #include #include //============================================================================== // This enum and StateEntry struct are used in table driven parsing for DTD // stuff - so that the parser isn't bloated by this stuff. This is about 15% // slower than a hand written parser. typedef enum { OP_OWS, // optional whitespace OP_WS, // required whitespace OP_CHAR, // char comparison, _pch[0] is char, _sArg1 is else goto state or error code OP_CHAR2, // same os OP_CHAR - except it doesn't do _pInput->Mark. OP_PEEK, // same as OP_CHAR - except it doesn't advance. OP_NAME, // scan name OP_TOKEN, // return token, _sArg1 = token OP_STRING, // scan a string OP_EXTID, // scan an external id. OP_STRCMP, // string comparison. OP_POP, // pop state OP_NWS, // not whitespace conditional OP_SUBSET, // skip an internal subset OP_PUBIDOPTION, // conditional for _fShortPubIdOption OP_NMTOKEN, OP_TABLE, // push a new table. (pointer in _pch field). OP_STABLE, // switch to new table. (pointer in _pch field). OP_COMMENT, OP_CONDSECT, OP_SNCHAR, // conditional 'is start name char' OP_EQUALS, // scan ' = ' OP_ENCODING, // switch encoding. OP_CHARWS, // match char or must be white space. OP_ATTRVAL, //parse attribute values.(_sArg1 = return PCDATA token or not) OP_PETEST, OP_ATTEXPAND, OP_NMSTRING, // unqualified name within quote OP_FAKESYSTEM, } OPCODE; typedef struct { OPCODE _sOp; const WCHAR* _pch; DWORD _sGoto; DWORD _sArg1; long _lDelta; // for when we do a Mark(), or Token if OP_CHAR } StateEntry; //================================================================================ class XMLStream { public: XMLStream(XMLParser * pXMLParser); ~XMLStream(); //------------------------------------------------------------------------ // These are some more tokens that the XMLStream returns. // xiaoyu : only few are used in fusion manifest typedef enum { // ADDITIONAL TOKENS THAT THE PARSER PULLS UP XML_PENDING = 0, // still parsing. XML_NUMENTITYREF = XML_LASTSUBNODETYPE, // &23; XML_HEXENTITYREF, // &x0cf7; XML_BUILTINENTITYREF, //> XML_TAGEND, // > XML_EMPTYTAGEND, // /> (text = tag name) XML_ENDTAG, // ' XML_ENDXMLDECL, // end of xml declaration XML_ENDDECL, // '>' XML_CLOSEPAREN, XML_ENDCONDSECT, // ']]>' XML_STARTDTDSUBSET, XML_ENDPROLOG, XML_DATAAVAILABLE, XML_DATAREALLOCATE, } XMLToken; HRESULT PushStream( /* [in] */ EncodingStream *pStm, /* [in] */ bool fExternalPE); HRESULT AppendData( /* [in] */ const BYTE *buffer, /* [in] */ long length, /* [in] */ BOOL lastBuffer); HRESULT Reset( void); HRESULT GetNextToken( /* [out] */ DWORD *token, /* [out] */ const WCHAR **text, /* [out] */ long *length, /* [out] */ long *nslen); ULONG GetLine(); ULONG GetLinePosition(); ULONG GetInputPosition(); HRESULT GetLineBuffer( /* [out] */ const WCHAR * *buf, /* [out] */ ULONG* len, /* [out] */ ULONG* startpos); void SetFlags( /* [in] */ unsigned short usFlags); unsigned short GetFlags(); HRESULT ErrorCallback(HRESULT hr); WCHAR getAttrValueQuoteChar() { return _chTerminator; } private: HRESULT init(); void _init(); HRESULT firstAdvance(); HRESULT parseContent(); HRESULT parseElement(); HRESULT parseEndTag(); HRESULT parsePI(); HRESULT parseComment(); HRESULT parseName(); HRESULT parseAttributes(); HRESULT parseAttrValue(); HRESULT parsePCData(); HRESULT parseEntityRef(); HRESULT parseCondSect(); HRESULT parseCData(); HRESULT parseTable(); HRESULT parseEquals(); HRESULT skipWhiteSpace(); inline void mark(long back = 0) { _pInput->Mark(back); } typedef HRESULT (XMLStream::* StateFunc)(); // The state machine consists of functions where each // function can determine for itself its own substates // so that when it is reactivated by a pop() it can pick // up where it left off. The current substate is set // to zero on a push() and at pop() time it is restored // to whatever it was told to be in the push(). HRESULT push(StateFunc f, short substate = 0); HRESULT pushTable(short substate = 0, const StateEntry* table = NULL, DWORD le = 0); HRESULT pop(bool boundary = true); HRESULT switchTo(StateFunc f); // pop & push // Advance and jump to state HRESULT AdvanceTo(short substate); HRESULT PopStream(); HRESULT ScanHexDigits(); HRESULT ScanDecimalDigits(); bool PreEntityText(); // Always use this function instead of calling _pInput->getToken inline void getToken(const WCHAR** ppText, long* pLen) { _pInput->getToken(ppText,pLen); } BufferedStream* getCurrentStream(); StateFunc _fnState; // current function. short _sSubState; // current substate. short _sSavedState; struct StateInfo { StateFunc _fnState; short _sSubState; const StateEntry* _pTable; //DWORD _lEOFError; int _cStreamDepth; }; _rawstack _pStack; struct InputInfo { BufferedStream* _pInput; WCHAR _chLookahead; //bool _fPE; //bool _fExternalPE; //bool _fInternalSubset; // remember that we were in internal subset. StateFunc _fnState; // remember the state function when pushstream // it is used to check parameter entity replacement text // is properly nested with markup declarations. }; _rawstack _pStreams; // Cache the current value of _pStreams.used() which is used to making sure // a parameter entity doesn't pop out of the scope in which it was entered. int _cStreamDepth; BufferedStream* _pInput; // current input stream. WCHAR _chNextLookahead; bool _fWasUsingBuffer; long _lParseStringLevel; DWORD _nPreToken; DWORD _nToken; long _lLengthDelta; // amount to adjust token length by long _lMarkDelta; // amount to adjust mark position by bool _fDelayMark; bool _fFoundFirstElement; // special trick for EndProlog. WCHAR _chLookahead; bool _fWhitespace; // found whitespace while parsing PCDATA WCHAR _chTerminator; WCHAR _chEndChar; // for parseAttributes. bool _fEOF; // reached end of file. long _lNslen; // namespace length long _lNssep; // namespace separator length ':' or '::'. long _lEntityPos; // for parsing entity references. bool _fPCDataPending; // whether pcdata is pending during parseEntityRef. const WCHAR* _pchCDataState; int _cAttrCount; int _nEntityNSLen; // saved namespace info for entity references. // Switches. unsigned short _usFlags; // bool _fFloatingAmp; // used in ParseEntityRef() bool _fShortEndTags; // used in ParserEndTag() bool _fCaseInsensitive; bool _fNoNamespaces; // used in parseName() //bool _fNoWhitespaceNodes; // used in DTD data //bool _fIE4Quirks; // xiaoyu : what it means? bool _fNoDTDNodes; // only used in GetDTDNextToken(). may be deleted later //bool _fHandlePE; // This flag is used to turn on and off parameter entity handling in DTD // xiaoyu: used in ParsePI(), ParseDTD(), parseComment(), // parsePEDecl(), parseIgnoSet() // for table driven parsing. const StateEntry* _pTable; //DWORD _lEOFError; // used in parsePEDecl(), pushTable(), parseTable(), // buffer used during whitespace normalization WCHAR* _pchBuffer; long _lBufLen; long _lBufSize; bool _fFoundWhitespace; bool _fUsingBuffer; bool _fFoundNonWhitespace; bool _fCheckAttribute; // need to check the attribute name // xiaoyu : used for complicate attribute type, such as "xml:lang", "xmlns" bool _fDTD; // xiaoyu whether xml contains DTD //bool _fInternalSubset; // xiaoyu used in ParseDTD //int _cConditionalSection; //bool _fFoundPEREf; //bool _fWasDTD; // bool _fParsingNames; bool _fParsingAttDef; // used in ParseAttrValue() //int _cIgnoreSectLevel; //bool _fResolved; // used in ParseEntity(); bool _fReturnAttributeValue; //int _cStreams; // used to identify if PushStream was called. // used in parseEntity(); WCHAR _wcEntityValue; // used in parseEntityRef() XMLParser * _pXMLParser; // regular pointer pointing back to the parser inline HRESULT PushChar(WCHAR ch) { if ((_lBufLen < _lBufSize) && _pchBuffer) { _pchBuffer[_lBufLen++] = ch; return S_OK; } else return _PushChar(ch); } HRESULT _PushChar(WCHAR ch); // grow the buffer. }; #endif // _XML_STREAM_HXX