Leaked source code of windows server 2003
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

159 lines
5.2 KiB

  1. //------------------------------------------------------------------------
  2. //
  3. // Tabular Data Control Parse Module
  4. // Copyright (C) Microsoft Corporation, 1996, 1997
  5. //
  6. // File: TDCParse.h
  7. //
  8. // Contents: Declaration of the TDC parser classes.
  9. //
  10. // The intent of these classes once was to create a pipeline.
  11. //
  12. //
  13. //                |
  14. //                |   Wide-character stream
  15. //                |   ~~~~~~~~~~~~~~~~~~~~~
  16. //               \|/
  17. //     ------------------------
  18. //     | CTDCTokenise object  |   Created with field & row delimiters, quote &
  19. //     |   AddWcharBuffer()   |   escape characters
  20. //     ------------------------
  21. //                |
  22. //                |   Stream of <field>, <eoln> and <eof> tokens
  23. //                |   ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  24. //               \|/
  25. //     ------------------------
  26. //     | CTDCFieldSink object |   Abstract class, e.g. STD object created with
  27. //     |   AddField()         |   sort/filter criteria & fUseHeader flag
  28. //     |   EOLN()             |   to interpret the sequence of fields.
  29. //     |   EOF()              |
  30. //     ------------------------
  31. //
  32. //------------------------------------------------------------------------
  // Default delimiter and quote strings, as wide-character string literals.
  // NOTE(review): presumably the fallbacks used when no explicit field
  // delimiter, row delimiter or quote character is supplied -- the code that
  // consumes these macros is outside this header, so confirm there.
  33. #define DEFAULT_FIELD_DELIM L","
  34. #define DEFAULT_ROW_DELIM L"\n"
  35. #define DEFAULT_QUOTE_CHAR L"\""
  // Numeric code page identifiers used when describing the input encoding.
  36. #define UNICODE_CP 1200 // Win32's Unicode codepage
  37. #define UNICODE_REVERSE_CP 1201 // Byte-swapped Unicode codepage
  38. #define CP_1252 1252 // Ansi, Western Europe
  39. #define CP_AUTO 50001 // cross language detection
  40. // number of bytes for MLang to make a good guess for the codepage
  41. // (this is a somewhat arbitrary number)
  42. #define CODEPAGE_BYTE_THRESHOLD (4096)
  // NOTE(review): presumably the number of encoding candidates requested from
  // MLang's detection call -- confirm at the use site.
  43. #define N_DETECTENCODINGINFO (5)
  // NOTE(review): presumably the marker text that introduces an allow-domain
  // list inside the data (see CTDCUnify::CheckForAllowDomainList) -- confirm
  // against the implementation.
  44. #define ALLOW_DOMAIN_STRING L"@!allow_domains"
  45. //------------------------------------------------------------------------
  46. //
  47. // Class: CTDCFieldSink
  48. //
  49. // This class accumulates a sequence of <fields> and <eoln> tokens
  50. // into a 2-D array.
  51. //
  52. // An admissible calling sequence on this object is:
  53. // * 0 or more calls to AddField() or EOLN()
  54. // * 1 call to EOF()
  55. //
  56. //------------------------------------------------------------------------
  // Abstract token sink: declares three pure methods (STDMETHOD ... PURE) and
  // no data members. Per the header comment above, an implementation receives
  // <field> and <eoln> tokens followed by a single <eof>.
  // NOTE(review): no virtual destructor is declared, so deleting an
  // implementation through a CTDCFieldSink* would be undefined behaviour --
  // confirm that sinks are never destroyed through this type.
  57. class CTDCFieldSink
  58. {
  59. public:
  // One <field> token. Presumably pwch points at the field text and dwSize is
  // its length -- TODO confirm the units (characters vs. bytes) and whether
  // the text is NUL-terminated.
  60. STDMETHOD(AddField)(LPWCH pwch, DWORD dwSize) PURE;
  // End of the current row (<eoln> token).
  61. STDMETHOD(EOLN)() PURE;
  // End of input (<eof> token); the header comment allows exactly one call.
  // NOTE(review): EOF is also a macro defined by <stdio.h>; if that header is
  // included before this one the method name would be macro-expanded --
  // confirm the include order or consider a rename.
  62. STDMETHOD(EOF)() PURE;
  63. };
  64. //------------------------------------------------------------------------
  65. //
  66. // Class: CTDCUnify
  67. //
  68. // This class takes a series of byte buffers and breaks them up into
  69. // UNICODE buffers.
  70. // The resulting buffers are passed to a CTDCTokenise object.
  71. //
  72. // An admissible calling sequence on this object is:
  73. // * Exactly 1 call to Create()
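  74. // * 0 or more calls to AddByteBuffer() with a non-zero-sized buffer
  75. // * Exactly 1 call to AddByteBuffer() with a zero-sized buffer
  //
  // NOTE(review): the class declaration below has no AddByteBuffer(); it
  // declares ConvertByteBuffer() and AddWcharBuffer(fAtEnd) instead. This
  // calling sequence (and the mention of AddByteBuffer() under Caveats)
  // appears to predate that change -- confirm against the implementation.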
  76. //
  77. // Calls to query the characteristics of the parsed data are allowed
  78. // after the call to Create(), but are only meaningful after a
  79. // reasonable amount of data has been collected.
  80. //
  81. //
  82. // Caveats:
  83. // ~~~~~~~
  84. // The class characterises the input stream as ASCII/UNICODE/COMPOSITE
  85. // based on the buffer passed in the initial call to AddByteBuffer().
  86. // If this buffer is too small, the class may make an incorrect
  87. // characterisation.
  88. //
  89. //------------------------------------------------------------------------
  // Converts incoming byte buffers to wide characters and carries the
  // tokenizer configuration/state (delimiters, quote, escape, folding flags)
  // together with code page detection state. Only declarations are visible
  // here; the behavioural notes below are hedged accordingly.
  // NOTE(review): the class declares a destructor and holds raw pointers
  // (m_psByteBuf, m_psWcharBuf, m_pML) but declares no copy constructor or
  // assignment operator. If the destructor releases any of them, an implicit
  // copy would lead to a double release -- confirm and consider disabling
  // copying.
  90. class CTDCUnify
  91. {
  92. public:
  93. CTDCUnify();
  94. ~CTDCUnify();
  // Takes the requested code page, the ambient code page and an
  // IMultiLanguage interface; presumably stored in m_nCodePage,
  // m_nAmbientCodePage and m_pML -- TODO confirm, including whether pML is
  // AddRef'ed.
  95. HRESULT Create(UINT nCodePage, UINT nAmbientCodePage, IMultiLanguage *pML);
  // Accepts dwSize bytes at pBytes for conversion. Presumably fills the
  // wide-character buffer (m_psWcharBuf) -- confirm in the implementation.
  96. HRESULT ConvertByteBuffer(BYTE *pBytes, DWORD dwSize);
  // Supplies the token sink plus the field delimiter, row delimiter, quote and
  // escape characters; the parameter names mirror the m_pFieldSink / m_wch*
  // members declared below.
  97. HRESULT InitTokenizer(CTDCFieldSink *pFieldSink,
  98. WCHAR wchDelimField,
  99. WCHAR wchDelimRow,
  100. WCHAR wchQuote,
  101. WCHAR wchEscape);
  // Presumably tokenizes the buffered wide characters into the field sink;
  // fAtEnd presumably marks the final call -- TODO confirm.
  102. HRESULT AddWcharBuffer(BOOL fAtEnd);
  // Returns an int rather than a BOOL. NOTE(review): the meaning of the
  // return value is not visible from this header -- confirm.
  103. int IsUnicode(BYTE * pBytes, DWORD dwSize);
  // NOTE(review): presumably decides the input code page once enough bytes
  // have been seen (see CODEPAGE_BYTE_THRESHOLD), with fForce overriding the
  // threshold -- confirm.
  104. BOOL DetermineCodePage(BOOL fForce);
  // Three-valued answer returned by CheckForAllowDomainList().
  105. enum ALLOWDOMAINLIST
  106. {
  107. ALLOW_DOMAINLIST_YES,
  108. ALLOW_DOMAINLIST_NO,
  109. ALLOW_DOMAINLIST_DONTKNOW
  110. };
  111. ALLOWDOMAINLIST CheckForAllowDomainList();
  // Takes a URL as a wide string. NOTE(review): which HRESULT values denote
  // "match" and "no match" is not visible from this header -- confirm.
  112. HRESULT MatchAllowDomainList(LPCWSTR pwzURL);
  // Returns the current value of m_fProcessedAllowDomainList.
  // NOTE(review): this accessor does not modify the object and could be
  // declared const.
  113. boolean ProcessedAllowDomainList() {return m_fProcessedAllowDomainList;}
  114. private:
  // Tokenizer configuration; names match the InitTokenizer() parameters.
  115. CTDCFieldSink *m_pFieldSink;
  116. WCHAR m_wchDelimField;
  117. WCHAR m_wchDelimRow;
  118. WCHAR m_wchQuote;
  119. WCHAR m_wchEscape;
  // NOTE(review): declared WCHAR although the other m_uc* members are ULONG
  // counters; if this is a count it is limited to the range of WCHAR --
  // confirm the intended type.
  120. WCHAR m_ucParsed;
  // Tokenizer state and option flags.
  121. boolean m_fEscapeActive;
  122. boolean m_fQuoteActive;
  123. boolean m_fIgnoreNextLF;
  124. boolean m_fIgnoreNextCR;
  125. boolean m_fIgnoreNextWhiteSpace;
  126. boolean m_fFoldCRLF;
  127. boolean m_fFoldWhiteSpace;
  // Encoding detection and conversion state.
  128. UINT m_nUnicode;
  129. boolean m_fDataMarkedUnicode;
  130. boolean m_fDataIsUnicode;
  131. boolean m_fCanConvertToUnicode;
  132. boolean m_fProcessedAllowDomainList;
  133. DWORD m_dwBytesProcessed;
  134. DWORD m_dwConvertMode;
  135. UINT m_nCodePage;
  136. UINT m_nAmbientCodePage;
  // Byte buffer with its size/count pair. Presumably Size is the allocated
  // capacity and Count the amount in use -- TODO confirm.
  137. BYTE *m_psByteBuf;
  138. ULONG m_ucByteBufSize;
  139. ULONG m_ucByteBufCount;
  // Wide-character buffer with its size/count pair (same hedge as above).
  140. WCHAR *m_psWcharBuf;
  141. ULONG m_ucWcharBufSize;
  142. ULONG m_ucWcharBufCount;
  // MLang interface pointer; same type as the pML parameter of Create().
  143. IMultiLanguage *m_pML;
  144. };