Source code of Windows XP (NT5)
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

818 lines
25 KiB

  1. /*
  2. * @(#)CharEncoder.cxx 1.0 6/10/97
  3. *
  4. * Copyright (c) 1997 - 1999 Microsoft Corporation. All rights reserved. *
  5. */
  6. #include "stdinc.h"
  7. #include "core.hxx"
  8. #pragma hdrstop
  9. #include "charencoder.hxx"
  10. #ifdef FUSION_USE_OLD_XML_PARSER_SOURCE
  11. #include <shlwapip.h> // IsCharSpace
  12. #ifdef UNIX
  13. #include <lendian.hpp>
  14. #endif
  15. #ifdef UNIX
  16. // Not needed under UNIX
  17. #else
  18. #ifndef _WIN64
  19. #include <w95wraps.h>
  20. #endif // _WIN64
  21. #endif /* UNIX */
  22. #endif
  23. //
  24. // Delegate other charsets to mlang
  25. //
  26. const EncodingEntry CharEncoder::charsetInfo [] =
  27. {
  28. #ifdef FUSION_USE_OLD_XML_PARSER_SOURCE
  29. { CP_1250, _T("WINDOWS-1250"), 1, wideCharFromMultiByteWin32, wideCharToMultiByteWin32 },
  30. { CP_1251, _T("WINDOWS-1251"), 1, wideCharFromMultiByteWin32, wideCharToMultiByteWin32 },
  31. { CP_1252, _T("WINDOWS-1252"), 1, wideCharFromMultiByteWin32, wideCharToMultiByteWin32 },
  32. { CP_1253, _T("WINDOWS-1253"), 1, wideCharFromMultiByteWin32, wideCharToMultiByteWin32 },
  33. { CP_1254, _T("WINDOWS-1254"), 1, wideCharFromMultiByteWin32, wideCharToMultiByteWin32 },
  34. { CP_1257, _T("WINDOWS-1257"), 1, wideCharFromMultiByteWin32, wideCharToMultiByteWin32 },
  35. { CP_UCS_4, _T("UCS-4"), 4, wideCharFromUcs4Bigendian, wideCharToUcs4Bigendian },
  36. { CP_UCS_2, _T("ISO-10646-UCS-2"), 2, wideCharFromUcs2Bigendian, wideCharToUcs2Bigendian },
  37. { CP_UCS_2, _T("UNICODE-2-0-UTF-16"), 2, wideCharFromUcs2Bigendian, wideCharToUcs2Bigendian },
  38. { CP_UCS_2, _T("UTF-16"), 2, wideCharFromUcs2Bigendian, wideCharToUcs2Bigendian },
  39. { CP_UTF_8, _T("UNICODE-1-1-UTF-8"), 3, wideCharFromUtf8, wideCharToUtf8 },
  40. { CP_UTF_8, _T("UNICODE-2-0-UTF-8"), 3, wideCharFromUtf8, wideCharToUtf8 },
  41. #endif
  42. { CP_UCS_2, L"UTF-16", 2, wideCharFromUcs2Bigendian },
  43. { CP_UCS_2, L"UCS-2", 2, wideCharFromUcs2Bigendian },
  44. { CP_UTF_8, L"UTF-8", 3, wideCharFromUtf8 },
  45. };
  46. #ifdef FUSION_USE_OLD_XML_PARSER_SOURCE
  47. IMultiLanguage * CharEncoder::pMultiLanguage = NULL;
  48. #endif
  49. Encoding * Encoding::newEncoding(const WCHAR * s, ULONG len, bool endian, bool mark)
  50. {
  51. //Encoding * e = new Encoding();
  52. Encoding * e = NEW (Encoding());
  53. if (e == NULL)
  54. return NULL;
  55. e->charset = NEW (WCHAR[len + 1]);
  56. if (e->charset == NULL)
  57. {
  58. delete e;
  59. return NULL;
  60. }
  61. ::memcpy(e->charset, s, sizeof(WCHAR) * len);
  62. e->charset[len] = 0; // guarentee NULL termination.
  63. e->littleendian = endian;
  64. e->byteOrderMark = mark;
  65. return e;
  66. }
  67. Encoding::~Encoding()
  68. {
  69. if (charset != NULL)
  70. {
  71. delete [] charset;
  72. }
  73. }
  74. int CharEncoder::getCharsetInfo(const WCHAR * charset, CODEPAGE * pcodepage, UINT * mCharSize)
  75. {
  76. #ifdef FUSION_USE_OLD_XML_PARSER_SOURCE
  77. CPINFO cpinfo;
  78. #endif
  79. for (int i = LENGTH(charsetInfo) - 1; i >= 0; i--)
  80. {
  81. //if (StrCmpI(charset, charsetInfo[i].charset) == 0)
  82. if (::FusionpCompareStrings(charset, ::wcslen(charset), charsetInfo[i].charset, ::wcslen(charsetInfo[i].charset), true) == 0)
  83. {
  84. //
  85. // test whether we can handle it locally or not
  86. // BUGBUG(HACK) the index number may change if we change charsetInfo
  87. //
  88. #ifdef FUSION_USE_OLD_XML_PARSER_SOURCE
  89. if (i > 5 || GetCPInfo(charsetInfo[i].codepage, &cpinfo))
  90. #endif
  91. {
  92. *pcodepage = charsetInfo[i].codepage;
  93. *mCharSize = charsetInfo[i].maxCharSize;
  94. return i;
  95. }
  96. #ifdef FUSION_USE_OLD_XML_PARSER_SOURCE
  97. else
  98. {
  99. break;
  100. }
  101. #endif
  102. } // end of if
  103. }// end of for
  104. // xiaoyu: It is assumed that an error would return if neither UTF-8 nor UCS-2
  105. #ifdef FUSION_USE_OLD_XML_PARSER_SOURCE
  106. //
  107. // delegate to MLANG then
  108. //
  109. MIMECSETINFO mimeCharsetInfo;
  110. HRESULT hr;
  111. hr = _EnsureMultiLanguage();
  112. if (hr == S_OK)
  113. {
  114. hr = pMultiLanguage->GetCharsetInfo((WCHAR*)charset, &mimeCharsetInfo);
  115. if (hr == S_OK)
  116. {
  117. *pcodepage = mimeCharsetInfo.uiInternetEncoding;
  118. if (GetCPInfo(*pcodepage, &cpinfo))
  119. *mCharSize = cpinfo.MaxCharSize;
  120. else // if we don't know the max size, assume a large size
  121. *mCharSize = 4;
  122. return -1;
  123. }
  124. }
  125. #endif
  126. return -2;
  127. }
  128. #ifdef FUSION_USE_OLD_XML_PARSER_SOURCE
  129. extern HRESULT CreateMultiLanguage(IMultiLanguage ** ppUnk);
  130. HRESULT CharEncoder::_EnsureMultiLanguage()
  131. {
  132. return CreateMultiLanguage(&pMultiLanguage);
  133. }
  134. #endif
  135. /**
  136. * get information about a code page identified by <code> encoding </code>
  137. */
  138. HRESULT CharEncoder::getWideCharFromMultiByteInfo(Encoding * encoding, CODEPAGE * pcodepage, WideCharFromMultiByteFunc ** pfnWideCharFromMultiByte, UINT * mCharSize)
  139. {
  140. HRESULT hr = S_OK;
  141. int i = getCharsetInfo(encoding->charset, pcodepage, mCharSize);
  142. if (i >= 0) // in our short list
  143. {
  144. switch (*pcodepage)
  145. {
  146. case CP_UCS_2:
  147. if (encoding->littleendian)
  148. *pfnWideCharFromMultiByte = wideCharFromUcs2Littleendian;
  149. else
  150. *pfnWideCharFromMultiByte = wideCharFromUcs2Bigendian;
  151. break;
  152. #ifdef FUSION_USE_OLD_XML_PARSER_SOURCE
  153. case CP_UCS_4:
  154. if (encoding->littleendian)
  155. *pfnWideCharFromMultiByte = wideCharFromUcs4Littleendian;
  156. else
  157. *pfnWideCharFromMultiByte = wideCharFromUcs4Bigendian;
  158. break;
  159. #endif
  160. default:
  161. *pfnWideCharFromMultiByte = charsetInfo[i].pfnWideCharFromMultiByte;
  162. break;
  163. }
  164. }
  165. // xiaoyu : we do not deal this case
  166. #ifdef FUSION_USE_OLD_XML_PARSER_SOURCE
  167. else if (i == -1) // delegate to MLANG
  168. {
  169. hr = pMultiLanguage->IsConvertible(*pcodepage, CP_UCS_2);
  170. if (S_OK == hr)
  171. *pfnWideCharFromMultiByte = wideCharFromMultiByteMlang;
  172. }
  173. #endif
  174. else // invalid encoding
  175. {
  176. hr = E_FAIL;
  177. }
  178. return hr;
  179. }
  180. #ifdef FUSION_USE_OLD_XML_PARSER_SOURCE
  181. /**
  182. * get information about a code page identified by <code> encoding </code>
  183. */
  184. HRESULT CharEncoder::getWideCharToMultiByteInfo(Encoding * encoding, CODEPAGE * pcodepage, WideCharToMultiByteFunc ** pfnWideCharToMultiByte, UINT * mCharSize)
  185. {
  186. HRESULT hr = S_OK;
  187. int i = getCharsetInfo(encoding->charset, pcodepage, mCharSize);
  188. if (i >= 0) // in our short list
  189. {
  190. switch (*pcodepage)
  191. {
  192. case CP_UCS_2:
  193. if (encoding->littleendian)
  194. *pfnWideCharToMultiByte = wideCharToUcs2Littleendian;
  195. else
  196. *pfnWideCharToMultiByte = wideCharToUcs2Bigendian;
  197. break;
  198. case CP_UCS_4:
  199. if (encoding->littleendian)
  200. *pfnWideCharToMultiByte = wideCharToUcs4Littleendian;
  201. else
  202. *pfnWideCharToMultiByte = wideCharToUcs4Bigendian;
  203. break;
  204. default:
  205. *pfnWideCharToMultiByte = charsetInfo[i].pfnWideCharToMultiByte;
  206. break;
  207. }
  208. }
  209. else if (i == -1) // delegate to MLANG
  210. {
  211. hr = pMultiLanguage->IsConvertible(CP_UCS_2, *pcodepage);
  212. if (hr == S_OK)
  213. *pfnWideCharToMultiByte = wideCharToMultiByteMlang;
  214. else
  215. hr = E_FAIL;
  216. }
  217. else
  218. {
  219. hr = E_FAIL;
  220. }
  221. return hr;
  222. }
  223. #endif
  224. /**
  225. * Scans rawbuffer and translates UTF8 characters into UNICODE characters
  226. */
  227. HRESULT CharEncoder::wideCharFromUtf8(DWORD* pdwMode, CODEPAGE codepage, BYTE* bytebuffer,
  228. UINT * cb, WCHAR * buffer, UINT * cch)
  229. {
  230. UNUSED(pdwMode);
  231. UNUSED(codepage);
  232. #if 0
  233. // Just for the record - I tried this and measured it and it's twice as
  234. // slow as our hand-crafted code.
  235. // Back up if end of buffer is the second or third byte of a multi-byte
  236. // encoding since MultiByteToWideChar cannot handle this case. These second
  237. // and third bytes are easy to identify - they always start with the bit
  238. // pattern 0x10xxxxxx.
  239. UINT remaining = 0;
  240. UINT count;
  241. int endpos = (int)*cb;
  242. while (endpos > 0 && (bytebuffer[endpos-1] & 0xc0) == 0x80)
  243. {
  244. endpos--;
  245. remaining++;
  246. }
  247. if (endpos > 0)
  248. {
  249. count = MultiByteToWideChar(CP_UTF8, 0, bytebuffer, endpos, buffer, *cch);
  250. if (count == 0)
  251. {
  252. return HRESULT_FROM_WIN32(GetLastError());
  253. }
  254. }
  255. #else
  256. UINT remaining = *cb;
  257. UINT count = 0;
  258. UINT max = *cch;
  259. ULONG ucs4;
  260. // UTF-8 multi-byte encoding. See Appendix A.2 of the Unicode book for more info.
  261. //
  262. // Unicode value 1st byte 2nd byte 3rd byte 4th byte
  263. // 000000000xxxxxxx 0xxxxxxx
  264. // 00000yyyyyxxxxxx 110yyyyy 10xxxxxx
  265. // zzzzyyyyyyxxxxxx 1110zzzz 10yyyyyy 10xxxxxx
  266. // 110110wwwwzzzzyy+ 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx
  267. // 110111yyyyxxxxxx, where uuuuu = wwww + 1
  268. WCHAR c;
  269. bool valid = true;
  270. while (remaining > 0 && count < max)
  271. {
  272. // This is an optimization for straight runs of 7-bit ascii
  273. // inside the UTF-8 data.
  274. c = *bytebuffer;
  275. if (c & 0x80) // check 8th-bit and get out of here
  276. break; // so we can do proper UTF-8 decoding.
  277. *buffer++ = c;
  278. bytebuffer++;
  279. count++;
  280. remaining--;
  281. }
  282. while (remaining > 0 && count < max)
  283. {
  284. UINT bytes = 0;
  285. for (c = *bytebuffer; c & 0x80; c <<= 1)
  286. bytes++;
  287. if (bytes == 0)
  288. bytes = 1;
  289. if (remaining < bytes)
  290. {
  291. break;
  292. }
  293. c = 0;
  294. switch ( bytes )
  295. {
  296. case 6: bytebuffer++; // We do not handle ucs4 chars
  297. case 5: bytebuffer++; // except those on plane 1
  298. valid = false;
  299. // fall through
  300. case 4:
  301. // Do we have enough buffer?
  302. if (count >= max - 1)
  303. goto Cleanup;
  304. // surrogate pairs
  305. ucs4 = ULONG(*bytebuffer++ & 0x07) << 18;
  306. if ((*bytebuffer & 0xc0) != 0x80)
  307. valid = false;
  308. ucs4 |= ULONG(*bytebuffer++ & 0x3f) << 12;
  309. if ((*bytebuffer & 0xc0) != 0x80)
  310. valid = false;
  311. ucs4 |= ULONG(*bytebuffer++ & 0x3f) << 6;
  312. if ((*bytebuffer & 0xc0) != 0x80)
  313. valid = false;
  314. ucs4 |= ULONG(*bytebuffer++ & 0x3f);
  315. // For non-BMP code values of ISO/IEC 10646,
  316. // only those in plane 1 are valid xml characters
  317. if (ucs4 > 0x10ffff)
  318. valid = false;
  319. if (valid)
  320. {
  321. // first ucs2 char
  322. *buffer++ = static_cast<WCHAR>((ucs4 - 0x10000) / 0x400 + 0xd800);
  323. count++;
  324. // second ucs2 char
  325. c = static_cast<WCHAR>((ucs4 - 0x10000) % 0x400 + 0xdc00);
  326. }
  327. break;
  328. case 3: c = WCHAR(*bytebuffer++ & 0x0f) << 12; // 0x0800 - 0xffff
  329. if ((*bytebuffer & 0xc0) != 0x80)
  330. valid = false;
  331. // fall through
  332. case 2: c |= WCHAR(*bytebuffer++ & 0x3f) << 6; // 0x0080 - 0x07ff
  333. if ((*bytebuffer & 0xc0) != 0x80)
  334. valid = false;
  335. c |= WCHAR(*bytebuffer++ & 0x3f);
  336. break;
  337. case 1:
  338. c = WCHAR(*bytebuffer++); // 0x0000 - 0x007f
  339. break;
  340. default:
  341. valid = false; // not a valid UTF-8 character.
  342. break;
  343. }
  344. // If the multibyte sequence was illegal, store a FFFF character code.
  345. // The Unicode spec says this value may be used as a signal like this.
  346. // This will be detected later by the parser and an error generated.
  347. // We don't throw an exception here because the parser would not yet know
  348. // the line and character where the error occurred and couldn't produce a
  349. // detailed error message.
  350. if (! valid)
  351. {
  352. c = 0xffff;
  353. valid = true;
  354. }
  355. *buffer++ = c;
  356. count++;
  357. remaining -= bytes;
  358. }
  359. #endif
  360. Cleanup:
  361. // tell caller that there are bytes remaining in the buffer to
  362. // be processed next time around when we have more data.
  363. *cb -= remaining;
  364. *cch = count;
  365. return S_OK;
  366. }
  367. /**
  368. * Scans bytebuffer and translates UCS2 big endian characters into UNICODE characters
  369. */
  370. HRESULT CharEncoder::wideCharFromUcs2Bigendian(DWORD* pdwMode, CODEPAGE codepage, BYTE* bytebuffer,
  371. UINT * cb, WCHAR * buffer, UINT * cch)
  372. {
  373. UNUSED(codepage);
  374. UNUSED(pdwMode);
  375. UINT num = *cb >> 1;
  376. if (num > *cch)
  377. num = *cch;
  378. for (UINT i = num; i > 0; i--)
  379. {
  380. *buffer++ = ((*bytebuffer) << 8) | (*(bytebuffer + 1));
  381. bytebuffer += 2;
  382. }
  383. *cch = num;
  384. *cb = num << 1;
  385. return S_OK;
  386. }
  387. /**
  388. * Scans bytebuffer and translates UCS2 little endian characters into UNICODE characters
  389. */
  390. HRESULT CharEncoder::wideCharFromUcs2Littleendian(DWORD* pdwMode, CODEPAGE codepage, BYTE* bytebuffer,
  391. UINT * cb, WCHAR * buffer, UINT * cch)
  392. {
  393. UNUSED(codepage);
  394. UNUSED(pdwMode);
  395. UINT num = *cb / 2; // Ucs2 is two byte unicode.
  396. if (num > *cch)
  397. num = *cch;
  398. #ifndef UNIX
  399. // Optimization for windows platform where little endian maps directly to WCHAR.
  400. // (This increases overall parser performance by 5% for large unicode files !!)
  401. ::memcpy(buffer, bytebuffer, num * sizeof(WCHAR));
  402. #else
  403. for (UINT i = num; i > 0 ; i--)
  404. {
  405. // we want the letter 'a' to be 0x0000006a.
  406. *buffer++ = (*(bytebuffer+1)<<8) | (*bytebuffer);
  407. bytebuffer += 2;
  408. }
  409. #endif
  410. *cch = num;
  411. *cb = num * 2;
  412. return S_OK;
  413. }
  414. #ifdef FUSION_USE_OLD_XML_PARSER_SOURCE
  415. /**
  416. * Scans bytebuffer and translates UCS4 big endian characters into UNICODE characters
  417. */
  418. HRESULT CharEncoder::wideCharFromUcs4Bigendian(DWORD* pdwMode, CODEPAGE codepage, BYTE* bytebuffer,
  419. UINT * cb, WCHAR * buffer, UINT * cch)
  420. {
  421. UINT num = *cb >> 2;
  422. if (num > *cch)
  423. num = *cch;
  424. for (UINT i = num; i > 0; i--)
  425. {
  426. #ifndef UNIX
  427. if (*bytebuffer != 0 || *(bytebuffer + 1) != 0)
  428. {
  429. return XML_E_INVALID_UNICODE;
  430. }
  431. *buffer++ = (*(bytebuffer + 2) << 8) | (*(bytebuffer + 3));
  432. #else
  433. *buffer++ = ((*bytebuffer)<<24) | (*(bytebuffer+1)<<16) | (*(bytebuffer+2)<<8) | (*(bytebuffer+3));
  434. #endif
  435. bytebuffer += 4;
  436. }
  437. *cch = num;
  438. *cb = num << 2;
  439. return S_OK;
  440. }
  441. #endif
  442. #ifdef FUSION_USE_OLD_XML_PARSER_SOURCE
  443. /**
  444. * Scans bytebuffer and translates UCS4 little endian characters into UNICODE characters
  445. */
  446. HRESULT CharEncoder::wideCharFromUcs4Littleendian(DWORD* pdwMode, CODEPAGE codepage, BYTE* bytebuffer,
  447. UINT * cb, WCHAR * buffer, UINT * cch)
  448. {
  449. UINT num = *cb >> 2; // Ucs4 is two byte unicode.
  450. if (num > *cch)
  451. num = *cch;
  452. for (UINT i = num; i > 0 ; i--)
  453. {
  454. #ifndef UNIX
  455. *buffer++ = (*(bytebuffer+1)<<8) | (*bytebuffer);
  456. if (*(bytebuffer + 2) != 0 || *(bytebuffer + 3) != 0)
  457. {
  458. return XML_E_INVALID_UNICODE;
  459. }
  460. #else
  461. *buffer++ = (*(bytebuffer+3)<<24) | (*(bytebuffer+2)<<16) | (*(bytebuffer+1)<<8) | (*bytebuffer);
  462. #endif
  463. bytebuffer += 4;
  464. }
  465. *cch = num;
  466. *cb = num << 2;
  467. return S_OK;
  468. }
  469. #endif
  470. #ifdef FUSION_USE_OLD_XML_PARSER_SOURCE
  471. /**
  472. * Scans bytebuffer and translates characters of charSet identified by
  473. * <code> codepage </code> into UNICODE characters,
  474. * using Win32 function MultiByteToWideChar() for encoding
  475. */
  476. HRESULT CharEncoder::wideCharFromMultiByteWin32(DWORD* pdwMode, CODEPAGE codepage, BYTE* bytebuffer,
  477. UINT * cb, WCHAR * buffer, UINT * cch)
  478. {
  479. HRESULT hr = S_OK;
  480. *cch = ::MultiByteToWideChar(codepage, MB_PRECOMPOSED,
  481. (char*)bytebuffer, *cb,
  482. buffer, *cch);
  483. if (*cch == 0)
  484. hr = GetLastError();
  485. return hr;
  486. }
  487. #endif
  488. #ifdef FUSION_USE_OLD_XML_PARSER_SOURCE
  489. /**
  490. * Scans bytebuffer and translates multibyte characters into UNICODE characters,
  491. * using Mlang for encoding
  492. */
  493. HRESULT CharEncoder::wideCharFromMultiByteMlang(DWORD* pdwMode, CODEPAGE codepage, BYTE* bytebuffer,
  494. UINT * cb, WCHAR * buffer, UINT * cch)
  495. {
  496. HRESULT hr;
  497. checkhr2(_EnsureMultiLanguage());
  498. checkhr2(pMultiLanguage->ConvertStringToUnicode(pdwMode, codepage,
  499. (char*)bytebuffer, cb,
  500. buffer, cch ));
  501. return S_OK;
  502. }
  503. #endif
  504. #ifdef FUSION_USE_OLD_XML_PARSER_SOURCE
  505. /**
  506. * Scans buffer and translates Unicode characters into Ucs2 big endian characters
  507. */
  508. HRESULT CharEncoder::wideCharToUcs2Bigendian(DWORD* pdwMode, CODEPAGE codepage, WCHAR * buffer,
  509. UINT *cch, BYTE* bytebuffer, UINT * cb)
  510. {
  511. UINT num = (*cb) >> 1;
  512. if (num > *cch)
  513. num = *cch;
  514. // BUGBUG - what do we do about Unix where WCHAR is 4 bytes ?
  515. // Currently we just throw away the high WORD - but I don't know how else
  516. // to do it, since UCS2 is 2-byte unicode by definition.
  517. for (UINT i = num; i > 0; i--)
  518. {
  519. *bytebuffer++ = (*buffer) >> 8;
  520. *bytebuffer++ = (*buffer++) & 0xFF;
  521. }
  522. *cch = num;
  523. *cb = num << 1;
  524. return S_OK;
  525. }
  526. #endif
  527. #ifdef FUSION_USE_OLD_XML_PARSER_SOURCE
  528. /**
  529. * Scans buffer and translates Unicode characters into Ucs2 little endian characters
  530. */
  531. HRESULT CharEncoder::wideCharToUcs2Littleendian(DWORD* pdwMode, CODEPAGE codepage, WCHAR * buffer,
  532. UINT *cch, BYTE* bytebuffer, UINT * cb)
  533. {
  534. UINT num = (*cb) >> 1;
  535. if (num > *cch)
  536. num = *cch;
  537. // BUGBUG - what do we do about Unix where WCHAR is 4 bytes ?
  538. // Currently we just throw away the high WORD - but I don't know how else
  539. // to do it, since UCS2 is 2-byte unicode by definition.
  540. #ifndef UNIX
  541. // Optimization for windows platform where little endian maps directly to WCHAR.
  542. // (This increases overall parser performance by 5% for large unicode files !!)
  543. ::memcpy(bytebuffer, buffer, num * sizeof(WCHAR));
  544. #else
  545. for (UINT i = num; i > 0; i--)
  546. {
  547. *bytebuffer++ = (*buffer) & 0xFF;
  548. *bytebuffer++ = (*buffer++) >> 8;
  549. }
  550. #endif
  551. *cch = num;
  552. *cb = num << 1;
  553. return S_OK;
  554. }
  555. #endif
  556. #ifdef FUSION_USE_OLD_XML_PARSER_SOURCE
  557. /**
  558. * Scans buffer and translates Unicode characters into Ucs4 big endian characters
  559. */
  560. HRESULT CharEncoder::wideCharToUcs4Bigendian(DWORD* pdwMode, CODEPAGE codepage, WCHAR * buffer,
  561. UINT *cch, BYTE* bytebuffer, UINT * cb)
  562. {
  563. UINT num = (*cb) >> 2;
  564. if (num > *cch)
  565. num = *cch;
  566. for (UINT i = num; i > 0; i--)
  567. {
  568. #ifndef UNIX
  569. *bytebuffer++ = 0;
  570. *bytebuffer++ = 0;
  571. *bytebuffer++ = (*buffer) >> 8;
  572. *bytebuffer++ = (*buffer) & 0xFF;
  573. #else
  574. *bytebuffer++ = (*buffer) >> 24;
  575. *bytebuffer++ = ((*buffer) >> 16) & 0xFF;
  576. *bytebuffer++ = ((*buffer) >> 8) & 0xFF;
  577. *bytebuffer++ = (*buffer) & 0xFF;
  578. #endif
  579. buffer++;
  580. }
  581. *cch = num;
  582. *cb = num << 2;
  583. return S_OK;
  584. }
  585. #endif
  586. #ifdef FUSION_USE_OLD_XML_PARSER_SOURCE
  587. /**
  588. * Scans buffer and translates Unicode characters into Ucs4 little endian characters
  589. */
  590. HRESULT CharEncoder::wideCharToUcs4Littleendian(DWORD* pdwMode, CODEPAGE codepage, WCHAR * buffer,
  591. UINT *cch, BYTE* bytebuffer, UINT * cb)
  592. {
  593. UINT num = (*cb) >> 2;
  594. if (num > *cch)
  595. num = *cch;
  596. for (UINT i = num; i > 0; i--)
  597. {
  598. #ifndef UNIX
  599. *bytebuffer++ = (*buffer) & 0xFF;
  600. *bytebuffer++ = (*buffer) >> 8;
  601. *bytebuffer++ = 0;
  602. *bytebuffer++ = 0;
  603. #else
  604. *bytebuffer++ = (*buffer) & 0xFF;
  605. *bytebuffer++ = ((*buffer) >> 8) & 0xFF;
  606. *bytebuffer++ = ((*buffer) >> 16) & 0xFF;
  607. *bytebuffer++ = (*buffer) >> 24;
  608. #endif
  609. buffer++;
  610. }
  611. *cch = num;
  612. *cb = num << 2;
  613. return S_OK;
  614. }
  615. #endif
  616. #ifdef FUSION_USE_OLD_XML_PARSER_SOURCE
  617. /**
  618. * Scans buffer and translates Unicode characters into UTF8 characters
  619. */
  620. HRESULT CharEncoder::wideCharToUtf8(DWORD* pdwMode, CODEPAGE codepage, WCHAR * buffer,
  621. UINT *cch, BYTE* bytebuffer, UINT * cb)
  622. {
  623. UINT count = 0, num = *cch, m1 = *cb, m2 = m1 - 1, m3 = m2 - 1, m4 = m3 - 1;
  624. DWORD dw1;
  625. bool surrogate = false;
  626. for (UINT i = num; i > 0; i--)
  627. {
  628. #ifdef UNIX
  629. // Solaris a WCHAR is 4 bytes (DWORD)
  630. DWORD dw = 0;
  631. DWORD dwTemp[4];
  632. BYTE* pByte = (BYTE*)buffer;
  633. dwTemp[3] = (DWORD)pByte[0];
  634. dwTemp[2] = (DWORD)pByte[1];
  635. dwTemp[1] = (DWORD)pByte[2];
  636. dwTemp[0] = (DWORD)pByte[3];
  637. dw = dwTemp[0]+(dwTemp[1]<<8)+(dwTemp[2]<<16)+(dwTemp[3]<<24);
  638. #else
  639. DWORD dw = *buffer;
  640. #endif
  641. if (surrogate) // is it the second char of a surrogate pair?
  642. {
  643. if (dw >= 0xdc00 && dw <= 0xdfff)
  644. {
  645. // four bytes 0x11110xxx 0x10xxxxxx 0x10xxxxxx 0x10xxxxxx
  646. if (count < m4)
  647. count += 4;
  648. else
  649. break;
  650. ULONG ucs4 = (dw1 - 0xd800) * 0x400 + (dw - 0xdc00) + 0x10000;
  651. *bytebuffer++ = (byte)(( ucs4 >> 18) | 0xF0);
  652. *bytebuffer++ = (byte)((( ucs4 >> 12) & 0x3F) | 0x80);
  653. *bytebuffer++ = (byte)((( ucs4 >> 6) & 0x3F) | 0x80);
  654. *bytebuffer++ = (byte)(( ucs4 & 0x3F) | 0x80);
  655. surrogate = false;
  656. buffer++;
  657. continue;
  658. }
  659. else // Then dw1 must be a three byte character
  660. {
  661. if (count < m3)
  662. count += 3;
  663. else
  664. break;
  665. *bytebuffer++ = (byte)(( dw1 >> 12) | 0xE0);
  666. *bytebuffer++ = (byte)((( dw1 >> 6) & 0x3F) | 0x80);
  667. *bytebuffer++ = (byte)(( dw1 & 0x3F) | 0x80);
  668. }
  669. surrogate = false;
  670. }
  671. if (dw < 0x80) // one byte, 0xxxxxxx
  672. {
  673. if (count < m1)
  674. count++;
  675. else
  676. break;
  677. *bytebuffer++ = (byte)dw;
  678. }
  679. else if ( dw < 0x800) // two WORDS, 110xxxxx 10xxxxxx
  680. {
  681. if (count < m2)
  682. count += 2;
  683. else
  684. break;
  685. *bytebuffer++ = (byte)((dw >> 6) | 0xC0);
  686. *bytebuffer++ = (byte)((dw & 0x3F) | 0x80);
  687. }
  688. else if (dw >= 0xd800 && dw <= 0xdbff) // Assume that it is the first char of surrogate pair
  689. {
  690. if (i == 1) // last wchar in buffer
  691. break;
  692. dw1 = dw;
  693. surrogate = true;
  694. }
  695. else // three bytes, 1110xxxx 10xxxxxx 10xxxxxx
  696. {
  697. if (count < m3)
  698. count += 3;
  699. else
  700. break;
  701. *bytebuffer++ = (byte)(( dw >> 12) | 0xE0);
  702. *bytebuffer++ = (byte)((( dw >> 6) & 0x3F) | 0x80);
  703. *bytebuffer++ = (byte)(( dw & 0x3F) | 0x80);
  704. }
  705. buffer++;
  706. }
  707. *cch = surrogate ? num - i - 1 : num - i;
  708. *cb = count;
  709. return S_OK;
  710. }
  711. #endif
  712. #ifdef FUSION_USE_OLD_XML_PARSER_SOURCE
  713. /**
  714. * Scans buffer and translates Unicode characters into characters identified
  715. * by <code> codepage </>, using Win32 function WideCharToMultiByte for encoding
  716. */
  717. HRESULT CharEncoder::wideCharToMultiByteWin32(DWORD* pdwMode, CODEPAGE codepage, WCHAR * buffer,
  718. UINT *cch, BYTE* bytebuffer, UINT * cb)
  719. {
  720. HRESULT hr = S_OK;
  721. BOOL fBadChar = false;
  722. *cb = ::WideCharToMultiByte(codepage, NULL, buffer, *cch, (char*)bytebuffer, *cb, NULL, &fBadChar);
  723. if (*cb == 0)
  724. hr = ::GetLastError();
  725. else if (fBadChar)
  726. // BUGBUG: how do we inform the caller which character failed?
  727. hr = S_FALSE;
  728. return hr;
  729. }
  730. #endif
  731. #ifdef FUSION_USE_OLD_XML_PARSER_SOURCE
  732. /**
  733. * Scans buffer and translates Unicode characters into characters of charSet
  734. * identified by <code> codepage </code>, using Mlang for encoding
  735. */
  736. HRESULT CharEncoder::wideCharToMultiByteMlang(DWORD* pdwMode, CODEPAGE codepage, WCHAR * buffer,
  737. UINT *cch, BYTE* bytebuffer, UINT * cb)
  738. {
  739. HRESULT hr;
  740. checkhr2(_EnsureMultiLanguage());
  741. checkhr2(pMultiLanguage->ConvertStringFromUnicode(pdwMode, codepage,
  742. buffer, cch, (char*)bytebuffer, cb ));
  743. return S_OK;
  744. }
  745. #endif