Source code of Windows XP (NT5)
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

265 lines
5.7 KiB

  1. ///////////////////////////////////////////////////////////////////////////////
  2. //
  3. // Copyright (c) 1998, Microsoft Corp. All rights reserved.
  4. //
  5. // FILE
  6. //
  7. // iasutf8.c
  8. //
  9. // SYNOPSIS
  10. //
  11. // Defines functions for converting between UTF-8 and Unicode.
  12. //
  13. // MODIFICATION HISTORY
  14. //
  15. // 01/22/1999 Original version.
  16. //
  17. ///////////////////////////////////////////////////////////////////////////////
  18. #include <windows.h>
  19. #include <iasutf8.h>
  /////////
  // Evaluates to non-zero when 'b' is NOT a UTF-8 trail (continuation) byte.
  // A trail byte has the bit pattern 10vvvvvv, i.e. its two most significant
  // bits are binary 10.
  /////////
  #define NOT_TRAIL_BYTE(b) ((((BYTE)(b)) >> 6) != 0x02)
  24. //////////
  25. // Returns the number of characters required to hold the converted string. The
  26. // source string may not contain nulls. Returns -1 if 'src' is not a valid
  27. // UTF-8 string.
  28. //////////
  29. LONG
  30. WINAPI
  31. IASUtf8ToUnicodeLength(
  32. PCSTR src,
  33. DWORD srclen
  34. )
  35. {
  36. LONG nchar;
  37. PCSTR end;
  38. if (src == NULL) { return 0; }
  39. // Number of characters needed.
  40. nchar = 0;
  41. // End of string to be converted.
  42. end = src + srclen;
  43. // Loop through the UTF-8 string.
  44. while (src < end)
  45. {
  46. if (*src == 0)
  47. {
  48. // Do not allow embedded nulls.
  49. return -1;
  50. }
  51. else if ((BYTE)*src < 0x80)
  52. {
  53. // 0vvvvvvv = 1 byte character.
  54. }
  55. else if ((BYTE)*src < 0xC0)
  56. {
  57. // 10vvvvvv = Invalid lead byte.
  58. return -1;
  59. }
  60. else if ((BYTE)*src < 0xE0)
  61. {
  62. // 110vvvvv = 2 byte character.
  63. if (NOT_TRAIL_BYTE(*++src)) { return -1; }
  64. }
  65. else if ((BYTE)*src < 0xF0)
  66. {
  67. // 1110vvvv = 3 byte character.
  68. if (NOT_TRAIL_BYTE(*++src)) { return -1; }
  69. if (NOT_TRAIL_BYTE(*++src)) { return -1; }
  70. }
  71. else
  72. {
  73. // In theory, UTF-8 supports 4-6 byte characters, but Windows uses
  74. // 16-bit integers for Unicode, so we can't handle them.
  75. return -1;
  76. }
  77. // We successfully parsed a UTF-8 character.
  78. ++src;
  79. ++nchar;
  80. }
  81. // Return the number of characters needed.
  82. return nchar;
  83. }
  84. //////////
  85. // Returns the number of characters required to hold the converted string.
  86. //////////
  87. LONG
  88. WINAPI
  89. IASUnicodeToUtf8Length(
  90. PCWSTR src,
  91. DWORD srclen
  92. )
  93. {
  94. LONG nchar;
  95. PCWSTR end;
  96. if (src == NULL) { return 0; }
  97. // Number of characters needed.
  98. nchar = 0;
  99. // End of string to be converted.
  100. end = src + srclen;
  101. // Loop through the Unicode string.
  102. while (src < end)
  103. {
  104. if (*src < 0x80)
  105. {
  106. // 1 byte character.
  107. nchar += 1;
  108. }
  109. else if (*src < 0x800)
  110. {
  111. // 2 byte character.
  112. nchar += 2;
  113. }
  114. else
  115. {
  116. // 3 byte character.
  117. nchar += 3;
  118. }
  119. // Advance to the next character in the string.
  120. ++src;
  121. }
  122. // Return the number of characters needed.
  123. return nchar;
  124. }
  125. /////////
  126. // Converts a UTF-8 string to Unicode. Returns the number of characters in the
  127. // converted string. The source string may not contain nulls. Returns -1 if
  128. // 'src' is not a valid UTF-8 string.
  129. /////////
  130. LONG
  131. IASUtf8ToUnicode(
  132. PCSTR src,
  133. DWORD srclen,
  134. PWSTR dst
  135. )
  136. {
  137. PCWSTR start;
  138. PCSTR end;
  139. if (!src || !dst) { return 0; }
  140. // Remember where we started.
  141. start = dst;
  142. // End of the string to be converted.
  143. end = src + srclen;
  144. // Loop through the source UTF-8 string.
  145. while (src < end)
  146. {
  147. if (*src == 0)
  148. {
  149. // Do not allow embedded nulls.
  150. return -1;
  151. }
  152. else if ((BYTE)*src < 0x80)
  153. {
  154. // 1 byte character: 0vvvvvvv
  155. *dst = *src;
  156. }
  157. else if ((BYTE)*src < 0xC0)
  158. {
  159. // Invalid lead byte: 10vvvvvv
  160. return -1;
  161. }
  162. else if ((BYTE)*src < 0xE0)
  163. {
  164. // 2 byte character: 110vvvvv 10vvvvvv
  165. *dst = (*src & 0x1F) << 6;
  166. if (NOT_TRAIL_BYTE(*++src)) { return -1; }
  167. *dst |= (*src & 0x3F);
  168. }
  169. else if ((BYTE)*src < 0xF0)
  170. {
  171. // 3 byte character: 1110vvvv 10vvvvvv 10vvvvvv
  172. *dst = (*src & 0x0F) << 12;
  173. if (NOT_TRAIL_BYTE(*++src)) { return -1; }
  174. *dst |= (*src & 0x3f) << 6;
  175. if (NOT_TRAIL_BYTE(*++src)) { return -1; }
  176. *dst |= (*src & 0x3f);
  177. }
  178. else
  179. {
  180. // In theory, UTF-8 supports 4-6 byte characters, but Windows uses
  181. // 16-bit integers for Unicode, so we can't handle them.
  182. return -1;
  183. }
  184. // Advance to the next character.
  185. ++src;
  186. ++dst;
  187. }
  188. // Return the number of characters in the converted string.
  189. return (LONG)(dst - start);
  190. }
  191. /////////
  192. // Converts a Unicode string to UTF-8. Returns the number of characters in the
  193. // converted string.
  194. /////////
  195. LONG
  196. IASUnicodeToUtf8(
  197. PCWSTR src,
  198. DWORD srclen,
  199. PSTR dst
  200. )
  201. {
  202. PCSTR start;
  203. PCWSTR end;
  204. if (!src || !dst) { return 0; }
  205. // Remember where we started.
  206. start = dst;
  207. // End of the string to be converted.
  208. end = src + srclen;
  209. // Loop through the source Unicode string.
  210. while (src < end)
  211. {
  212. if (*src < 0x80)
  213. {
  214. // Pack as 0vvvvvvv
  215. *dst++ = (CHAR)*src;
  216. }
  217. else if (*src < 0x800)
  218. {
  219. // Pack as 110vvvvv 10vvvvvv 10vvvvvv
  220. *dst++ = (CHAR)(0xC0 | ((*src >> 6) & 0x3F));
  221. *dst++ = (CHAR)(0x80 | ((*src ) & 0x3F));
  222. }
  223. else
  224. {
  225. // Pack as 1110vvvv 10vvvvvv 10vvvvvv
  226. *dst++ = (CHAR)(0xE0 | ((*src >> 12) ));
  227. *dst++ = (CHAR)(0x80 | ((*src >> 6) & 0x3F));
  228. *dst++ = (CHAR)(0x80 | ((*src ) & 0x3F));
  229. }
  230. // Advance to the next character.
  231. ++src;
  232. }
  233. // Return the number of characters in the converted string.
  234. return (LONG)(dst - start);
  235. }