Leaked source code of windows server 2003
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

278 lines
8.4 KiB

  1. /*
  2. * @(#)CharEncoder.cxx 1.0 6/10/97
  3. *
  4. * Copyright (c) 1997 - 1999 Microsoft Corporation. All rights reserved. *
  5. */
  6. #include "stdinc.h"
  7. #include "core.hxx"
  8. #pragma hdrstop
  9. #include "charencoder.hxx"
  10. //
  11. // Delegate other charsets to mlang
  12. //
  13. const EncodingEntry CharEncoder::charsetInfo [] =
  14. {
  15. { CP_UCS_2, L"UTF-16", 2, wideCharFromUcs2Bigendian },
  16. { CP_UCS_2, L"UCS-2", 2, wideCharFromUcs2Bigendian },
  17. { CP_UTF_8, L"UTF-8", 3, wideCharFromUtf8 },
  18. };
  19. Encoding * Encoding::newEncoding(const WCHAR * s, ULONG len, bool endian, bool mark)
  20. {
  21. //Encoding * e = new Encoding();
  22. Encoding * e = NEW (Encoding());
  23. if (e == NULL)
  24. return NULL;
  25. e->charset = NEW (WCHAR[len + 1]);
  26. if (e->charset == NULL)
  27. {
  28. delete e;
  29. return NULL;
  30. }
  31. ::memcpy(e->charset, s, sizeof(WCHAR) * len);
  32. e->charset[len] = 0; // guarentee NULL termination.
  33. e->littleendian = endian;
  34. e->byteOrderMark = mark;
  35. return e;
  36. }
  37. Encoding::~Encoding()
  38. {
  39. if (charset != NULL)
  40. {
  41. delete [] charset;
  42. }
  43. }
  44. int CharEncoder::getCharsetInfo(const WCHAR * charset, CODEPAGE * pcodepage, UINT * mCharSize)
  45. {
  46. for (int i = LENGTH(charsetInfo) - 1; i >= 0; i--)
  47. {
  48. if (::FusionpCompareStrings(charset, ::wcslen(charset), charsetInfo[i].charset, ::wcslen(charsetInfo[i].charset), true) == 0)
  49. {
  50. *pcodepage = charsetInfo[i].codepage;
  51. *mCharSize = charsetInfo[i].maxCharSize;
  52. return i;
  53. } // end of if
  54. }// end of for
  55. return -2;
  56. }
  57. /**
  58. * get information about a code page identified by <code> encoding </code>
  59. */
  60. HRESULT CharEncoder::getWideCharFromMultiByteInfo(Encoding * encoding, CODEPAGE * pcodepage, WideCharFromMultiByteFunc ** pfnWideCharFromMultiByte, UINT * mCharSize)
  61. {
  62. HRESULT hr = S_OK;
  63. int i = getCharsetInfo(encoding->charset, pcodepage, mCharSize);
  64. if (i >= 0) // in our short list
  65. {
  66. switch (*pcodepage)
  67. {
  68. case CP_UCS_2:
  69. if (encoding->littleendian)
  70. *pfnWideCharFromMultiByte = wideCharFromUcs2Littleendian;
  71. else
  72. *pfnWideCharFromMultiByte = wideCharFromUcs2Bigendian;
  73. break;
  74. default:
  75. *pfnWideCharFromMultiByte = charsetInfo[i].pfnWideCharFromMultiByte;
  76. break;
  77. }
  78. }
  79. else // invalid encoding
  80. {
  81. hr = E_FAIL;
  82. }
  83. return hr;
  84. }
  85. /**
  86. * Scans rawbuffer and translates UTF8 characters into UNICODE characters
  87. */
  88. HRESULT CharEncoder::wideCharFromUtf8(DWORD* pdwMode, CODEPAGE codepage, BYTE* bytebuffer,
  89. UINT * cb, WCHAR * buffer, UINT * cch)
  90. {
  91. UNUSED(pdwMode);
  92. UNUSED(codepage);
  93. UINT remaining = *cb;
  94. UINT count = 0;
  95. UINT max = *cch;
  96. ULONG ucs4;
  97. // UTF-8 multi-byte encoding. See Appendix A.2 of the Unicode book for more info.
  98. //
  99. // Unicode value 1st byte 2nd byte 3rd byte 4th byte
  100. // 000000000xxxxxxx 0xxxxxxx
  101. // 00000yyyyyxxxxxx 110yyyyy 10xxxxxx
  102. // zzzzyyyyyyxxxxxx 1110zzzz 10yyyyyy 10xxxxxx
  103. // 110110wwwwzzzzyy+ 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx
  104. // 110111yyyyxxxxxx, where uuuuu = wwww + 1
  105. WCHAR c;
  106. bool valid = true;
  107. while (remaining > 0 && count < max)
  108. {
  109. // This is an optimization for straight runs of 7-bit ascii
  110. // inside the UTF-8 data.
  111. c = *bytebuffer;
  112. if (c & 0x80) // check 8th-bit and get out of here
  113. break; // so we can do proper UTF-8 decoding.
  114. *buffer++ = c;
  115. bytebuffer++;
  116. count++;
  117. remaining--;
  118. }
  119. while (remaining > 0 && count < max)
  120. {
  121. UINT bytes = 0;
  122. for (c = *bytebuffer; c & 0x80; c <<= 1)
  123. bytes++;
  124. if (bytes == 0)
  125. bytes = 1;
  126. if (remaining < bytes)
  127. {
  128. break;
  129. }
  130. c = 0;
  131. switch ( bytes )
  132. {
  133. case 6: bytebuffer++; // We do not handle ucs4 chars
  134. case 5: bytebuffer++; // except those on plane 1
  135. valid = false;
  136. // fall through
  137. case 4:
  138. // Do we have enough buffer?
  139. if (count >= max - 1)
  140. goto Cleanup;
  141. // surrogate pairs
  142. ucs4 = ULONG(*bytebuffer++ & 0x07) << 18;
  143. if ((*bytebuffer & 0xc0) != 0x80)
  144. valid = false;
  145. ucs4 |= ULONG(*bytebuffer++ & 0x3f) << 12;
  146. if ((*bytebuffer & 0xc0) != 0x80)
  147. valid = false;
  148. ucs4 |= ULONG(*bytebuffer++ & 0x3f) << 6;
  149. if ((*bytebuffer & 0xc0) != 0x80)
  150. valid = false;
  151. ucs4 |= ULONG(*bytebuffer++ & 0x3f);
  152. // For non-BMP code values of ISO/IEC 10646,
  153. // only those in plane 1 are valid xml characters
  154. if (ucs4 > 0x10ffff)
  155. valid = false;
  156. if (valid)
  157. {
  158. // first ucs2 char
  159. *buffer++ = static_cast<WCHAR>((ucs4 - 0x10000) / 0x400 + 0xd800);
  160. count++;
  161. // second ucs2 char
  162. c = static_cast<WCHAR>((ucs4 - 0x10000) % 0x400 + 0xdc00);
  163. }
  164. break;
  165. case 3: c = WCHAR(*bytebuffer++ & 0x0f) << 12; // 0x0800 - 0xffff
  166. if ((*bytebuffer & 0xc0) != 0x80)
  167. valid = false;
  168. // fall through
  169. case 2: c |= WCHAR(*bytebuffer++ & 0x3f) << 6; // 0x0080 - 0x07ff
  170. if ((*bytebuffer & 0xc0) != 0x80)
  171. valid = false;
  172. c |= WCHAR(*bytebuffer++ & 0x3f);
  173. break;
  174. case 1:
  175. c = WCHAR(*bytebuffer++); // 0x0000 - 0x007f
  176. break;
  177. default:
  178. valid = false; // not a valid UTF-8 character.
  179. break;
  180. }
  181. // If the multibyte sequence was illegal, store a FFFF character code.
  182. // The Unicode spec says this value may be used as a signal like this.
  183. // This will be detected later by the parser and an error generated.
  184. // We don't throw an exception here because the parser would not yet know
  185. // the line and character where the error occurred and couldn't produce a
  186. // detailed error message.
  187. if (! valid)
  188. {
  189. c = 0xffff;
  190. valid = true;
  191. }
  192. *buffer++ = c;
  193. count++;
  194. remaining -= bytes;
  195. }
  196. Cleanup:
  197. // tell caller that there are bytes remaining in the buffer to
  198. // be processed next time around when we have more data.
  199. *cb -= remaining;
  200. *cch = count;
  201. return S_OK;
  202. }
  203. /**
  204. * Scans bytebuffer and translates UCS2 big endian characters into UNICODE characters
  205. */
  206. HRESULT CharEncoder::wideCharFromUcs2Bigendian(DWORD* pdwMode, CODEPAGE codepage, BYTE* bytebuffer,
  207. UINT * cb, WCHAR * buffer, UINT * cch)
  208. {
  209. UNUSED(codepage);
  210. UNUSED(pdwMode);
  211. UINT num = *cb >> 1;
  212. if (num > *cch)
  213. num = *cch;
  214. for (UINT i = num; i > 0; i--)
  215. {
  216. *buffer++ = ((*bytebuffer) << 8) | (*(bytebuffer + 1));
  217. bytebuffer += 2;
  218. }
  219. *cch = num;
  220. *cb = num << 1;
  221. return S_OK;
  222. }
  223. /**
  224. * Scans bytebuffer and translates UCS2 little endian characters into UNICODE characters
  225. */
  226. HRESULT CharEncoder::wideCharFromUcs2Littleendian(DWORD* pdwMode, CODEPAGE codepage, BYTE* bytebuffer,
  227. UINT * cb, WCHAR * buffer, UINT * cch)
  228. {
  229. UNUSED(codepage);
  230. UNUSED(pdwMode);
  231. UINT num = *cb / 2; // Ucs2 is two byte unicode.
  232. if (num > *cch)
  233. num = *cch;
  234. // Optimization for windows platform where little endian maps directly to WCHAR.
  235. // (This increases overall parser performance by 5% for large unicode files !!)
  236. ::memcpy(buffer, bytebuffer, num * sizeof(WCHAR));
  237. *cch = num;
  238. *cb = num * 2;
  239. return S_OK;
  240. }