Leaked source code of windows server 2003
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

420 lines
12 KiB

  1. /*
  2. * @(#)EncodingStream.cxx 1.0 6/10/97
  3. *
  4. * Copyright (c) 1997 - 1999 Microsoft Corporation. All rights reserved. *
  5. */
  6. #include "stdinc.h"
  7. #include "core.hxx"
  8. #include "xmlhelper.hxx"
  9. #include "encodingstream.hxx"
  10. #pragma hdrstop
  11. const int EncodingStream::BUFFERSIZE = 4096*sizeof(WCHAR);
  12. //////////////////////////////////////////////////////////////////////////////////
  13. EncodingStream::EncodingStream(IStream * pStream):
  14. stream(pStream), encoding(NULL), buf(NULL), pfnWideCharFromMultiByte(NULL),
  15. btotal(0), bnext(0), startAt(0), lastBuffer(false), bufsize(0), _fEOF(false),
  16. _fReadStream(true), _fUTF8BOM(false), _dwMode(0), codepage(CP_UNDEFINED)
  17. {
  18. }
  19. //////////////////////////////////////////////////////////////////////////////////
  20. /**
  21. * Builds the EncodingStream for input.
  22. * Reads the first two bytes of the InputStream * in order to make a guess
  23. * as to the character encoding of the file.
  24. */
  25. IStream * EncodingStream::newEncodingStream(IStream * pStream)
  26. {
  27. EncodingStream * es = NEW (EncodingStream(pStream));
  28. if (es == NULL)
  29. return NULL;
  30. //
  31. // REVIEW REVIEW:
  32. // Shouldn't this rewind the stream cursor back to a known good point? The
  33. // comment above is useless, too - there's no data reading at all here.
  34. //
  35. es->AddRef(); // xwu@@ : check this addRef()!
  36. es->isInput = true;
  37. es->buf = NULL;
  38. return es;
  39. }
  40. //////////////////////////////////////////////////////////////////////////////////
  41. EncodingStream::~EncodingStream()
  42. {
  43. if (buf)
  44. {
  45. delete [] buf;
  46. buf = NULL;
  47. }
  48. if (encoding != NULL)
  49. {
  50. delete encoding;
  51. encoding = NULL;
  52. }
  53. stream = NULL; // smart pointer
  54. }
  55. //////////////////////////////////////////////////////////////////////////////////
  56. /**
  57. * Reads characters from stream and encode it to Unicode
  58. */
  59. HRESULT STDMETHODCALLTYPE EncodingStream::Read(void * pv, ULONG cb, ULONG * pcbRead)
  60. {
  61. HRESULT hr;
  62. ULONG num = 0;
  63. if (pcbRead != NULL)
  64. *pcbRead = 0;
  65. if (btotal == 0 && _fEOF) // we already hit EOF - so return right away.
  66. return S_OK;
  67. // Calculate how many UNICODE chars we are allowed to return,
  68. // xiaoyu : which is the same as the number of BYTES read from the file
  69. cb /= sizeof(WCHAR);
  70. checkhr2(prepareForInput(cb));
  71. if (stream && _fReadStream)
  72. {
  73. // btotal = number of bytes already in start of buffer.
  74. if (cb > btotal)
  75. {
  76. hr = stream->Read(buf + btotal, cb - btotal, &num);
  77. // Let's show what we've seen in the debugger so that we can diagnose bad manifests
  78. // more easily. mgrier 12/28/2000
  79. if (::FusionpDbgWouldPrintAtFilterLevel(FUSION_DBG_LEVEL_XMLSTREAM))
  80. {
  81. ::FusionpDbgPrintEx(
  82. FUSION_DBG_LEVEL_XMLSTREAM,
  83. "SXS.DLL: Read %lu bytes from XML stream; HRESULT returned = 0x%08lx\n", num, hr);
  84. if (num > 0)
  85. {
  86. ::FusionpDbgPrintBlob(
  87. FUSION_DBG_LEVEL_XMLSTREAM,
  88. buf + btotal,
  89. num,
  90. L" ");
  91. }
  92. }
  93. if ((hr == E_PENDING) && (num > 0))
  94. {
  95. // in which case we ignore the error, and continue on !!.
  96. // BUGBUG - this may be a problem.since we are changing the
  97. // return code returned from the stream. This may mean we
  98. // should not ever hand out this stream outside of MSXML.
  99. hr = 0;
  100. }
  101. if (FAILED(hr))
  102. {
  103. return hr;
  104. }
  105. if (btotal == 0 && num == 0)
  106. {
  107. _fEOF = true;
  108. return hr;
  109. }
  110. }
  111. else
  112. {
  113. hr = S_OK;
  114. }
  115. }
  116. else if (btotal == 0)
  117. {
  118. return (lastBuffer) ? S_FALSE : E_PENDING;
  119. }
  120. btotal += num;
  121. UINT b = btotal, utotal = cb;
  122. if (b > cb)
  123. {
  124. // If we have more bytes in our buffer than the caller has
  125. // room for, then only return the number of bytes the caller
  126. // asked for -- otherwise pfnWideCharFromMultiByte will write
  127. // off the end of the caller's buffer.
  128. b = cb;
  129. }
  130. if (pfnWideCharFromMultiByte == NULL) // first read() call
  131. {
  132. checkhr2(autoDetect());
  133. if (pfnWideCharFromMultiByte == NULL) // failed to fully determine encoding
  134. return (lastBuffer) ? S_FALSE : E_PENDING;
  135. b -= bnext;
  136. startAt -= bnext;
  137. }
  138. hr = (this->pfnWideCharFromMultiByte)(&_dwMode, codepage, buf + bnext, &b, (WCHAR *)pv, &utotal);
  139. if (hr != S_OK)
  140. return hr;
  141. if (b == 0 && num == 0 && (stream || lastBuffer))
  142. {
  143. // stream says we're at the end, but pfnWideCharFromMultiByte
  144. // disagrees !!
  145. ::FusionpDbgPrintEx(
  146. FUSION_DBG_LEVEL_ERROR,
  147. "SXS.DLL: XML Parser found incomplete encoding\n");
  148. return XML_E_INCOMPLETE_ENCODING;
  149. }
  150. bnext += b;
  151. if (pcbRead != NULL)
  152. *pcbRead = utotal*sizeof(WCHAR);
  153. return (utotal == 0) ? E_PENDING : S_OK;
  154. }
  155. //////////////////////////////////////////////////////////////////////////////////
  156. /**
  157. * Checks the first two/four bytes of the input Stream in order to
  158. * detect UTF-16/UCS-4 or UTF-8 encoding;
  159. * otherwise assume it is UTF-8
  160. * xiaoyu : since only UCS-2 and UTF-8 are support, we do not deal with others...
  161. */
  162. HRESULT EncodingStream::autoDetect()
  163. {
  164. // wait until we have enough to be sure.
  165. if (btotal < 2)
  166. return S_OK;
  167. unsigned int guess = (((unsigned char)buf[0]) << 8) + ((unsigned char)buf[1]);
  168. HRESULT hr;
  169. if (guess == 0xFEFF || guess == 0xFFFE) // BOM found
  170. {
  171. // wait until we have enough to be sure.
  172. if (btotal < 4)
  173. return S_OK;
  174. unsigned int guess1 = (((unsigned char)buf[2]) << 8) + ((unsigned char)buf[3]);
  175. if (guess == guess1)
  176. {
  177. /*
  178. if (!encoding)
  179. {
  180. static const WCHAR* wchUCS4 = TEXT("UCS-4");
  181. encoding = Encoding::newEncoding(wchUCS4, 5, (0xFFFE == guess), true);
  182. }
  183. bnext = 4;
  184. */
  185. // FUSION_XML_PARSER does not support UCS4
  186. return XML_E_INVALIDENCODING;
  187. }
  188. else
  189. {
  190. if (!encoding)
  191. {
  192. static const WCHAR wchUCS2[] = L"UCS-2";
  193. encoding = Encoding::newEncoding(wchUCS2, LENGTH(wchUCS2), (0xFFFE == guess), true);
  194. }
  195. bnext = 2;
  196. }
  197. if (NULL == encoding)
  198. return E_OUTOFMEMORY;
  199. encoding->littleendian = (0xFFFE == guess);
  200. }
  201. else
  202. {
  203. if (!encoding)
  204. {
  205. encoding = Encoding::newEncoding(L"UTF-8", 5, false, false);
  206. if (NULL == encoding)
  207. return E_OUTOFMEMORY;
  208. }
  209. // In some system, such as win2k, there is BOM 0xEF BB BF for UTF8
  210. if (guess == 0xEFBB)
  211. {
  212. if (btotal < 3)
  213. return S_OK;
  214. if (buf[2] == 0xBF)
  215. _fUTF8BOM = true;
  216. bnext = 3;
  217. }
  218. else
  219. {
  220. encoding->byteOrderMark = false;
  221. }
  222. }
  223. checkhr2(CharEncoder::getWideCharFromMultiByteInfo(encoding, &codepage, &pfnWideCharFromMultiByte, &maxCharSize));
  224. return S_OK;
  225. }
  226. /////////////////////////////////////////////////////////////////////////////////////////
  227. /**
  228. * Switchs the character encoding of the input stream
  229. * Returns:
  230. * S_OK: succeeded, and do not need re-read
  231. * S_FALSE: succeeded, needs to re-read from <code> newPosition </code>
  232. * Otherwise: error code
  233. * Notice:
  234. * This method only works for input stream, newPosition starts with 1
  235. */
  236. HRESULT EncodingStream::switchEncodingAt(Encoding * newEncoding, int newPosition)
  237. {
  238. // Ignore encoding information in the document when charset information is set from outside
  239. // xwu: fusion xml parsed does not use Charset
  240. //if (_fSetCharset)
  241. // return S_OK;
  242. int l = newPosition - startAt;
  243. if (l < 0 || l > (int)bnext)
  244. {
  245. // out of range
  246. delete newEncoding;
  247. return E_INVALIDARG;
  248. }
  249. UINT newcodepage;
  250. UINT newCharSize;
  251. //
  252. // get and check charset information
  253. //
  254. WideCharFromMultiByteFunc * pfn;
  255. HRESULT hr = CharEncoder::getWideCharFromMultiByteInfo(newEncoding, &newcodepage, &pfn, &newCharSize);
  256. if (hr != S_OK)
  257. {
  258. delete newEncoding;
  259. return E_INVALIDARG;
  260. }
  261. if (codepage == newcodepage)
  262. {
  263. delete newEncoding;
  264. return S_OK;
  265. }
  266. // Now if we are in UCS-2/UCS-4 we cannot switch out of UCS-2/UCS-4 and if we are
  267. // not in UCS-2/UCS-4 we cannot switch into UCS-2/UCS-4.
  268. // Also if UTF-8 BOM is presented, we cannot switch away
  269. if ((codepage != CP_UCS_2 && newcodepage == CP_UCS_2) ||
  270. (codepage == CP_UCS_2 && newcodepage != CP_UCS_2) ||
  271. (codepage == CP_UTF_8 && newcodepage != CP_UTF_8 && _fUTF8BOM))
  272. {
  273. delete newEncoding;
  274. return E_FAIL;
  275. }
  276. // Ok, then, let's make the switch.
  277. if (encoding)
  278. {
  279. delete encoding;
  280. }
  281. encoding = newEncoding;
  282. maxCharSize = newCharSize;
  283. codepage = newcodepage;
  284. pfnWideCharFromMultiByte = pfn;
  285. // Because the XML declaration is encoded in UTF-8,
  286. // Mapping input characters to wide characters is one-to-one mapping
  287. if ((int)bnext != l)
  288. {
  289. bnext = l;
  290. return S_FALSE;
  291. }
  292. return S_OK;
  293. }
  294. //////////////////////////////////////////////////////////////////////////////////
  295. // minlen is the number of UNICODE, which is the same number of byte we read from the file
  296. HRESULT EncodingStream::prepareForInput(ULONG minlen)
  297. {
  298. Assert(btotal >= bnext);
  299. btotal -= bnext;
  300. if (bufsize < minlen)
  301. {
  302. BYTE* newbuf = NEW (BYTE[minlen]);
  303. if (newbuf == NULL) {
  304. return E_OUTOFMEMORY;
  305. }
  306. if (buf){
  307. ::memcpy(newbuf, buf+bnext, btotal);
  308. delete[] buf;
  309. }
  310. buf = newbuf;
  311. bufsize = minlen;
  312. }
  313. else if (bnext > 0 && btotal > 0)
  314. {
  315. // Shift remaining bytes down to beginning of buffer.
  316. ::memmove(buf, buf + bnext, btotal);
  317. }
  318. startAt += bnext;
  319. bnext = 0;
  320. return S_OK;
  321. }
  322. //////////////////////////////////////////////////////////////////////////////////
  323. // xiaoyu : here it assumes that it is a BYTE buffer, not a WCHAR byte, so it can be copied directly
  324. HRESULT EncodingStream::AppendData( const BYTE* buffer, ULONG length, BOOL fLastBuffer)
  325. {
  326. Assert(btotal >= bnext);
  327. lastBuffer = (fLastBuffer != FALSE);
  328. HRESULT hr;
  329. ULONG minlen = length + (btotal - bnext); // make sure we don't loose any data
  330. if (minlen < BUFFERSIZE)
  331. minlen = BUFFERSIZE;
  332. checkhr2( prepareForInput(minlen)); // guarantee enough space in the array
  333. if (length > 0 && buffer != NULL){
  334. // Copy raw data into new buffer.
  335. ::memcpy(buf + btotal, buffer, length);
  336. btotal += length;
  337. }
  338. if (pfnWideCharFromMultiByte == NULL) // first AppendData call
  339. {
  340. checkhr2(autoDetect());
  341. }
  342. return hr;
  343. }
  344. //////////////////////////////////////////////////////////////////////////////////
  345. HRESULT EncodingStream::BufferData()
  346. {
  347. HRESULT hr = S_OK;
  348. checkhr2(prepareForInput(0)); // 0 is used just for shift down (so bnext=0).
  349. if (_fEOF) // already hit the end of the stream.
  350. return S_FALSE;
  351. const DWORD BUFSIZE = 4096;
  352. DWORD dwRead = 1;
  353. while (S_OK == hr && dwRead > 0)
  354. {
  355. // if we cannot fit another buffer full, then re-allocate.
  356. DWORD minsize = (btotal+BUFSIZE > bufsize) ? bufsize + BUFSIZE : bufsize;
  357. checkhr2( prepareForInput(minsize)); // make space available.
  358. dwRead = 0;
  359. hr = stream->Read(buf + btotal, BUFSIZE, &dwRead);
  360. btotal += dwRead;
  361. }
  362. if (SUCCEEDED(hr) && dwRead == 0)
  363. {
  364. _fEOF = true;
  365. hr = S_FALSE; // return S_FALSE when at eof.
  366. }
  367. return hr;
  368. }