Source code of Windows XP (NT5)
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

423 lines
12 KiB

  1. /*
  2. * @(#)EncodingStream.cxx 1.0 6/10/97
  3. *
  4. * Copyright (c) 1997 - 1999 Microsoft Corporation. All rights reserved. *
  5. */
  6. #include "stdinc.h"
  7. #include "core.hxx"
  8. #include "xmlhelper.hxx"
  9. #include "encodingstream.hxx"
  10. #pragma hdrstop
  11. const int EncodingStream::BUFFERSIZE = 4096*sizeof(WCHAR);
  12. //////////////////////////////////////////////////////////////////////////////////
  13. EncodingStream::EncodingStream(IStream * pStream): stream(pStream), encoding(NULL), buf(NULL)
  14. {
  15. #ifdef FUSION_USE_OLD_XML_PARSER_SOURCE
  16. // These objects are sometimes handed out to external clients.
  17. ::IncrementComponents();
  18. #endif
  19. pfnWideCharFromMultiByte = NULL;
  20. #ifdef FUSION_USE_OLD_XML_PARSER_SOURCE
  21. pfnWideCharToMultiByte = NULL;
  22. #endif
  23. btotal = bnext = startAt = 0;
  24. lastBuffer = false;
  25. bufsize = 0;
  26. _fEOF = false;
  27. _fReadStream = true;
  28. _fUTF8BOM = false;
  29. //_fTextXML = false;
  30. //_fSetCharset = false;
  31. _dwMode = 0;
  32. codepage = CP_UNDEFINED;
  33. }
  34. //////////////////////////////////////////////////////////////////////////////////
  35. /**
  36. * Builds the EncodingStream for input.
  37. * Reads the first two bytes of the InputStream * in order to make a guess
  38. * as to the character encoding of the file.
  39. */
  40. IStream * EncodingStream::newEncodingStream(IStream * pStream)
  41. {
  42. EncodingStream * es = NEW (EncodingStream(pStream));
  43. if (es == NULL)
  44. return NULL;
  45. es->AddRef(); // xwu@@ : check this addRef()!
  46. es->isInput = true;
  47. es->buf = NULL;
  48. return es;
  49. }
  50. //////////////////////////////////////////////////////////////////////////////////
  51. EncodingStream::~EncodingStream()
  52. {
  53. if (buf)
  54. delete [] buf;
  55. if (encoding != NULL)
  56. delete encoding;
  57. stream = NULL; // smart pointer
  58. }
  59. //////////////////////////////////////////////////////////////////////////////////
  60. /**
  61. * Reads characters from stream and encode it to Unicode
  62. */
  63. HRESULT STDMETHODCALLTYPE EncodingStream::Read(void * pv, ULONG cb, ULONG * pcbRead)
  64. {
  65. HRESULT hr;
  66. ULONG num = 0;
  67. if (pcbRead != NULL)
  68. *pcbRead = 0;
  69. if (btotal == 0 && _fEOF) // we already hit EOF - so return right away.
  70. return S_OK;
  71. // Calculate how many UNICODE chars we are allowed to return,
  72. // xiaoyu : which is the same as the number of BYTES read from the file
  73. cb /= sizeof(WCHAR);
  74. checkhr2(prepareForInput(cb));
  75. if (stream && _fReadStream)
  76. {
  77. // btotal = number of bytes already in start of buffer.
  78. if (cb > btotal)
  79. {
  80. hr = stream->Read(buf + btotal, cb - btotal, &num);
  81. // Let's show what we've seen in the debugger so that we can diagnose bad manifests
  82. // more easily. mgrier 12/28/2000
  83. if (::FusionpDbgWouldPrintAtFilterLevel(FUSION_DBG_LEVEL_XMLSTREAM))
  84. {
  85. ::FusionpDbgPrintEx(
  86. FUSION_DBG_LEVEL_XMLSTREAM,
  87. "SXS.DLL: Read %lu bytes from XML stream; HRESULT returned = 0x%08lx\n", num, hr);
  88. if (num > 0)
  89. {
  90. ::FusionpDbgPrintBlob(
  91. FUSION_DBG_LEVEL_XMLSTREAM,
  92. buf + btotal,
  93. num,
  94. L" ");
  95. }
  96. }
  97. if (hr == E_PENDING && num > 0)
  98. {
  99. // in which case we ignore the error, and continue on !!.
  100. // BUGBUG - this may be a problem.since we are changing the
  101. // return code returned from the stream. This may mean we
  102. // should not ever hand out this stream outside of MSXML.
  103. hr = 0;
  104. }
  105. if (FAILED(hr))
  106. {
  107. return hr;
  108. }
  109. if (btotal == 0 && num == 0)
  110. {
  111. _fEOF = true;
  112. return hr;
  113. }
  114. }
  115. else
  116. {
  117. hr = S_OK;
  118. }
  119. }
  120. else if (btotal == 0)
  121. {
  122. return (lastBuffer) ? S_FALSE : E_PENDING;
  123. }
  124. btotal += num;
  125. UINT b = btotal, utotal = cb;
  126. if (b > cb)
  127. {
  128. // If we have more bytes in our buffer than the caller has
  129. // room for, then only return the number of bytes the caller
  130. // asked for -- otherwise pfnWideCharFromMultiByte will write
  131. // off the end of the caller's buffer.
  132. b = cb;
  133. }
  134. if (pfnWideCharFromMultiByte == NULL) // first read() call
  135. {
  136. checkhr2(autoDetect());
  137. if (pfnWideCharFromMultiByte == NULL) // failed to fully determine encoding
  138. return (lastBuffer) ? S_FALSE : E_PENDING;
  139. b -= bnext;
  140. startAt -= bnext;
  141. }
  142. hr = (this->pfnWideCharFromMultiByte)(&_dwMode, codepage, buf + bnext, &b, (WCHAR *)pv, &utotal);
  143. if (hr != S_OK)
  144. return hr;
  145. if (b == 0 && num == 0 && (stream || lastBuffer))
  146. {
  147. // stream says we're at the end, but pfnWideCharFromMultiByte
  148. // disagrees !!
  149. ::FusionpDbgPrintEx(
  150. FUSION_DBG_LEVEL_ERROR,
  151. "SXS.DLL: XML Parser found incomplete encoding\n");
  152. return XML_E_INCOMPLETE_ENCODING;
  153. }
  154. bnext += b;
  155. if (pcbRead != NULL)
  156. *pcbRead = utotal*sizeof(WCHAR);
  157. return (utotal == 0) ? E_PENDING : S_OK;
  158. }
  159. //////////////////////////////////////////////////////////////////////////////////
  160. /**
  161. * Checks the first two/four bytes of the input Stream in order to
  162. * detect UTF-16/UCS-4 or UTF-8 encoding;
  163. * otherwise assume it is UTF-8
  164. * xiaoyu : since only UCS-2 and UTF-8 are support, we do not deal with others...
  165. */
  166. HRESULT EncodingStream::autoDetect()
  167. {
  168. // wait until we have enough to be sure.
  169. if (btotal < 2)
  170. return S_OK;
  171. unsigned int guess = (((unsigned char)buf[0]) << 8) + ((unsigned char)buf[1]);
  172. HRESULT hr;
  173. if (guess == 0xFEFF || guess == 0xFFFE) // BOM found
  174. {
  175. // wait until we have enough to be sure.
  176. if (btotal < 4)
  177. return S_OK;
  178. unsigned int guess1 = (((unsigned char)buf[2]) << 8) + ((unsigned char)buf[3]);
  179. if (guess == guess1)
  180. {
  181. /*
  182. if (!encoding)
  183. {
  184. static const WCHAR* wchUCS4 = TEXT("UCS-4");
  185. encoding = Encoding::newEncoding(wchUCS4, 5, (0xFFFE == guess), true);
  186. }
  187. bnext = 4;
  188. */
  189. // FUSION_XML_PARSER does not support UCS4
  190. return XML_E_INVALIDENCODING;
  191. }
  192. else
  193. {
  194. if (!encoding)
  195. {
  196. static const WCHAR* wchUCS2 = L"UCS-2";
  197. encoding = Encoding::newEncoding(wchUCS2, 5, (0xFFFE == guess), true);
  198. }
  199. bnext = 2;
  200. }
  201. if (NULL == encoding)
  202. return E_OUTOFMEMORY;
  203. encoding->littleendian = (0xFFFE == guess);
  204. }
  205. else
  206. {
  207. if (!encoding)
  208. {
  209. encoding = Encoding::newEncoding(); // default encoding : UTF-8
  210. if (NULL == encoding)
  211. return E_OUTOFMEMORY;
  212. }
  213. // In some system, such as win2k, there is BOM 0xEF BB BF for UTF8
  214. if (guess == 0xEFBB)
  215. {
  216. if (btotal < 3)
  217. return S_OK;
  218. if (buf[2] == 0xBF)
  219. _fUTF8BOM = true;
  220. bnext = 3;
  221. }
  222. else
  223. {
  224. encoding->byteOrderMark = false;
  225. }
  226. }
  227. checkhr2(CharEncoder::getWideCharFromMultiByteInfo(encoding, &codepage, &pfnWideCharFromMultiByte, &maxCharSize));
  228. return S_OK;
  229. }
  230. /////////////////////////////////////////////////////////////////////////////////////////
  231. /**
  232. * Switchs the character encoding of the input stream
  233. * Returns:
  234. * S_OK: succeeded, and do not need re-read
  235. * S_FALSE: succeeded, needs to re-read from <code> newPosition </code>
  236. * Otherwise: error code
  237. * Notice:
  238. * This method only works for input stream, newPosition starts with 1
  239. */
  240. HRESULT EncodingStream::switchEncodingAt(Encoding * newEncoding, int newPosition)
  241. {
  242. // Ignore encoding information in the document when charset information is set from outside
  243. // xwu: fusion xml parsed does not use Charset
  244. //if (_fSetCharset)
  245. // return S_OK;
  246. int l = newPosition - startAt;
  247. if (l < 0 || l > (int)bnext)
  248. {
  249. // out of range
  250. delete newEncoding;
  251. return E_INVALIDARG;
  252. }
  253. UINT newcodepage;
  254. UINT newCharSize;
  255. //
  256. // get and check charset information
  257. //
  258. WideCharFromMultiByteFunc * pfn;
  259. HRESULT hr = CharEncoder::getWideCharFromMultiByteInfo(newEncoding, &newcodepage, &pfn, &newCharSize);
  260. if (hr != S_OK)
  261. {
  262. delete newEncoding;
  263. return E_INVALIDARG;
  264. }
  265. if (codepage == newcodepage)
  266. {
  267. delete newEncoding;
  268. return S_OK;
  269. }
  270. // Now if we are in UCS-2/UCS-4 we cannot switch out of UCS-2/UCS-4 and if we are
  271. // not in UCS-2/UCS-4 we cannot switch into UCS-2/UCS-4.
  272. // Also if UTF-8 BOM is presented, we cannot switch away
  273. if ((codepage != CP_UCS_2 && newcodepage == CP_UCS_2) ||
  274. (codepage == CP_UCS_2 && newcodepage != CP_UCS_2) ||
  275. /* xuw: fusion xml parser only support UTF-8 and UCS-2
  276. (codepage != CP_UCS_4 && newcodepage == CP_UCS_4) ||
  277. (codepage == CP_UCS_4 && newcodepage != CP_UCS_4) ||
  278. */
  279. (codepage == CP_UTF_8 && newcodepage != CP_UTF_8 && _fUTF8BOM))
  280. {
  281. delete newEncoding;
  282. return E_FAIL;
  283. }
  284. // Ok, then, let's make the switch.
  285. delete encoding;
  286. encoding = newEncoding;
  287. maxCharSize = newCharSize;
  288. codepage = newcodepage;
  289. pfnWideCharFromMultiByte = pfn;
  290. // Because the XML declaration is encoded in UTF-8,
  291. // Mapping input characters to wide characters is one-to-one mapping
  292. if ((int)bnext != l)
  293. {
  294. bnext = l;
  295. return S_FALSE;
  296. }
  297. return S_OK;
  298. }
  299. //////////////////////////////////////////////////////////////////////////////////
  300. // minlen is the number of UNICODE, which is the same number of byte we read from the file
  301. HRESULT EncodingStream::prepareForInput(ULONG minlen)
  302. {
  303. Assert(btotal >= bnext);
  304. btotal -= bnext;
  305. if (bufsize < minlen)
  306. {
  307. BYTE* newbuf = NEW (BYTE[minlen]);
  308. if (newbuf == NULL) {
  309. return E_OUTOFMEMORY;
  310. }
  311. if (buf){
  312. ::memcpy(newbuf, buf+bnext, btotal);
  313. delete[] buf;
  314. }
  315. buf = newbuf;
  316. bufsize = minlen;
  317. }
  318. else if (bnext > 0 && btotal > 0)
  319. {
  320. // Shift remaining bytes down to beginning of buffer.
  321. ::memmove(buf, buf + bnext, btotal);
  322. }
  323. startAt += bnext;
  324. bnext = 0;
  325. return S_OK;
  326. }
  327. //////////////////////////////////////////////////////////////////////////////////
  328. // xiaoyu : here it assumes that it is a BYTE buffer, not a WCHAR byte, so it can be copied directly
  329. HRESULT EncodingStream::AppendData( const BYTE* buffer, ULONG length, BOOL fLastBuffer)
  330. {
  331. Assert(btotal >= bnext);
  332. lastBuffer = (fLastBuffer != FALSE);
  333. HRESULT hr;
  334. ULONG minlen = length + (btotal - bnext); // make sure we don't loose any data
  335. if (minlen < BUFFERSIZE)
  336. minlen = BUFFERSIZE;
  337. checkhr2( prepareForInput(minlen)); // guarantee enough space in the array
  338. if (length > 0 && buffer != NULL){
  339. // Copy raw data into new buffer.
  340. ::memcpy(buf + btotal, buffer, length);
  341. btotal += length;
  342. }
  343. if (pfnWideCharFromMultiByte == NULL) // first AppendData call
  344. {
  345. checkhr2(autoDetect());
  346. }
  347. return hr;
  348. }
  349. //////////////////////////////////////////////////////////////////////////////////
  350. HRESULT EncodingStream::BufferData()
  351. {
  352. HRESULT hr = S_OK;
  353. checkhr2(prepareForInput(0)); // 0 is used just for shift down (so bnext=0).
  354. if (_fEOF) // already hit the end of the stream.
  355. return S_FALSE;
  356. const DWORD BUFSIZE = 4096;
  357. DWORD dwRead = 1;
  358. while (S_OK == hr && dwRead > 0)
  359. {
  360. // if we cannot fit another buffer full, then re-allocate.
  361. DWORD minsize = (btotal+BUFSIZE > bufsize) ? bufsize + BUFSIZE : bufsize;
  362. checkhr2( prepareForInput(minsize)); // make space available.
  363. dwRead = 0;
  364. hr = stream->Read(buf + btotal, BUFSIZE, &dwRead);
  365. btotal += dwRead;
  366. }
  367. if (SUCCEEDED(hr) && dwRead == 0)
  368. {
  369. _fEOF = true;
  370. hr = S_FALSE; // return S_FALSE when at eof.
  371. }
  372. return hr;
  373. }