Source code of Windows XP (NT5)
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

691 lines
20 KiB

  1. //+-------------------------------------------------------------------------
  2. //
  3. // Microsoft Windows
  4. //
  5. // Copyright (C) Microsoft Corporation, 1995 - 1999
  6. //
  7. // File: utf8.cpp
  8. //
  9. // Contents: WideChar to/from UTF8 APIs
  10. //
  11. // Functions: WideCharToUTF8
  12. // UTF8ToWideChar
  13. //
  14. // History: 19-Feb-97 philh created
  15. // 28-Aug-99 philh added surrogate support. Copied from
  16. // nt\private\windows\winnls\utf.c or
  17. // \\rastaman\ntwin\src\winnls\utf.c.
  18. //
  19. //--------------------------------------------------------------------------
  20. #include "global.hxx"
  21. #include <dbgdef.h>
  22. #include "utf8.h"
  23. #if 1
  24. // NEW SURROGATE VERSION
  25. //
  26. // Constant Declarations.
  27. //
  28. #define ASCII 0x007f
  29. #define SHIFT_IN '+' // beginning of a shift sequence
  30. #define SHIFT_OUT '-' // end of a shift sequence
  31. #define UTF8_2_MAX 0x07ff // max UTF8 2-byte sequence (32 * 64 = 2048)
  32. #define UTF8_1ST_OF_2 0xc0 // 110x xxxx
  33. #define UTF8_1ST_OF_3 0xe0 // 1110 xxxx
  34. #define UTF8_1ST_OF_4 0xf0 // 1111 xxxx
  35. #define UTF8_TRAIL 0x80 // 10xx xxxx
  36. #define HIGHER_6_BIT(u) ((u) >> 12)
  37. #define MIDDLE_6_BIT(u) (((u) & 0x0fc0) >> 6)
  38. #define LOWER_6_BIT(u) ((u) & 0x003f)
  39. #define BIT7(a) ((a) & 0x80)
  40. #define BIT6(a) ((a) & 0x40)
  41. #define HIGH_SURROGATE_START 0xd800
  42. #define HIGH_SURROGATE_END 0xdbff
  43. #define LOW_SURROGATE_START 0xdc00
  44. #define LOW_SURROGATE_END 0xdfff
  45. ////////////////////////////////////////////////////////////////////////////
  46. //
  47. // UTF8ToUnicode
  48. //
  49. // Maps a UTF-8 character string to its wide character string counterpart.
  50. //
  51. // 02-06-96 JulieB Created.
  52. // 03-20-99 SamerA Surrogate support.
  53. ////////////////////////////////////////////////////////////////////////////
  54. int
  55. WINAPI
  56. UTF8ToWideChar(
  57. LPCSTR lpSrcStr,
  58. int cchSrc,
  59. LPWSTR lpDestStr,
  60. int cchDest)
  61. {
  62. int nTB = 0; // # trail bytes to follow
  63. int cchWC = 0; // # of Unicode code points generated
  64. LPCSTR pUTF8 = lpSrcStr;
  65. DWORD dwSurrogateChar = 0; // Full surrogate char
  66. BOOL bSurrogatePair = FALSE; // Indicate we'r collecting a surrogate pair
  67. char UTF8;
  68. // BEGIN ADDED CHECKS
  69. if (cchDest < 0)
  70. goto InvalidParameter;
  71. if (cchSrc < 0)
  72. cchSrc = strlen(lpSrcStr) + 1;
  73. // END ADDED CHECKS
  74. while ((cchSrc--) && ((cchDest == 0) || (cchWC < cchDest)))
  75. {
  76. //
  77. // See if there are any trail bytes.
  78. //
  79. if (BIT7(*pUTF8) == 0)
  80. {
  81. // BEGIN FIX
  82. if (nTB != 0)
  83. goto InvalidParameter;
  84. // END FIX
  85. //
  86. // Found ASCII.
  87. //
  88. if (cchDest)
  89. {
  90. lpDestStr[cchWC] = (WCHAR)*pUTF8;
  91. }
  92. bSurrogatePair = FALSE;
  93. cchWC++;
  94. }
  95. else if (BIT6(*pUTF8) == 0)
  96. {
  97. //
  98. // Found a trail byte.
  99. // Note : Ignore the trail byte if there was no lead byte.
  100. //
  101. if (nTB != 0)
  102. {
  103. //
  104. // Decrement the trail byte counter.
  105. //
  106. nTB--;
  107. if (bSurrogatePair)
  108. {
  109. dwSurrogateChar <<= 6;
  110. dwSurrogateChar |= LOWER_6_BIT(*pUTF8);
  111. if (nTB == 0)
  112. {
  113. if (cchDest)
  114. {
  115. if ((cchWC + 1) < cchDest)
  116. {
  117. lpDestStr[cchWC] = (WCHAR)
  118. (((dwSurrogateChar - 0x10000) >> 10) + HIGH_SURROGATE_START);
  119. lpDestStr[cchWC+1] = (WCHAR)
  120. ((dwSurrogateChar - 0x10000)%0x400 + LOW_SURROGATE_START);
  121. }
  122. // BEGIN FIX
  123. else
  124. {
  125. SetLastError(ERROR_INSUFFICIENT_BUFFER);
  126. return (0);
  127. }
  128. // END FIX
  129. }
  130. cchWC += 2;
  131. bSurrogatePair = FALSE;
  132. }
  133. }
  134. else
  135. {
  136. //
  137. // Make room for the trail byte and add the trail byte
  138. // value.
  139. //
  140. if (cchDest)
  141. {
  142. lpDestStr[cchWC] <<= 6;
  143. lpDestStr[cchWC] |= LOWER_6_BIT(*pUTF8);
  144. }
  145. if (nTB == 0)
  146. {
  147. //
  148. // End of sequence. Advance the output counter.
  149. //
  150. cchWC++;
  151. }
  152. }
  153. }
  154. else
  155. {
  156. // error - not expecting a trail byte
  157. // BEGIN FIX
  158. // bSurrogatePair = FALSE;
  159. goto InvalidParameter;
  160. // END FIX
  161. }
  162. }
  163. else
  164. {
  165. //
  166. // Found a lead byte.
  167. //
  168. if (nTB > 0)
  169. {
  170. //
  171. // Error - previous sequence not finished.
  172. //
  173. // BEGIN FIX
  174. // nTB = 0;
  175. // bSurrogatePair = FALSE;
  176. // cchWC++;
  177. goto InvalidParameter;
  178. // END FIX
  179. }
  180. else
  181. {
  182. //
  183. // Calculate the number of bytes to follow.
  184. // Look for the first 0 from left to right.
  185. //
  186. UTF8 = *pUTF8;
  187. while (BIT7(UTF8) != 0)
  188. {
  189. UTF8 <<= 1;
  190. nTB++;
  191. }
  192. //
  193. // If this is a surrogate unicode pair
  194. //
  195. if (nTB == 4)
  196. {
  197. dwSurrogateChar = UTF8 >> nTB;
  198. bSurrogatePair = TRUE;
  199. }
  200. // BEGIN FIX
  201. else if (nTB >= 5)
  202. {
  203. goto InvalidParameter;
  204. }
  205. // END FIX
  206. //
  207. // Store the value from the first byte and decrement
  208. // the number of bytes to follow.
  209. //
  210. if (cchDest)
  211. {
  212. lpDestStr[cchWC] = (WCHAR) (UTF8 >> nTB);
  213. }
  214. nTB--;
  215. }
  216. }
  217. pUTF8++;
  218. }
  219. // BEGIN FIX
  220. if (nTB != 0)
  221. goto InvalidParameter;
  222. // END FIX
  223. //
  224. // Make sure the destination buffer was large enough.
  225. //
  226. if (cchDest && (cchSrc >= 0))
  227. {
  228. SetLastError(ERROR_INSUFFICIENT_BUFFER);
  229. return (0);
  230. }
  231. //
  232. // Return the number of Unicode characters written.
  233. //
  234. return (cchWC);
  235. // BEGIN FIX
  236. InvalidParameter:
  237. SetLastError(ERROR_INVALID_PARAMETER);
  238. return (0);
  239. // END FIX
  240. }
  241. ////////////////////////////////////////////////////////////////////////////
  242. //
  243. // UnicodeToUTF8
  244. //
  245. // Maps a Unicode character string to its UTF-8 string counterpart.
  246. //
  247. // 02-06-96 JulieB Created.
  248. // 03-20-99 SamerA Surrogate support.
  249. ////////////////////////////////////////////////////////////////////////////
  250. int
  251. WINAPI
  252. WideCharToUTF8(
  253. LPCWSTR lpSrcStr,
  254. int cchSrc,
  255. LPSTR lpDestStr,
  256. int cchDest)
  257. {
  258. LPCWSTR lpWC = lpSrcStr;
  259. int cchU8 = 0; // # of UTF8 chars generated
  260. DWORD dwSurrogateChar;
  261. WCHAR wchHighSurrogate = 0;
  262. BOOL bHandled;
  263. // BEGIN ADDED CHECKS
  264. if (cchDest < 0)
  265. {
  266. SetLastError(ERROR_INVALID_PARAMETER);
  267. return (0);
  268. }
  269. if (cchSrc < 0)
  270. cchSrc = wcslen(lpSrcStr) + 1;
  271. // END ADDED CHECKS
  272. while ((cchSrc--) && ((cchDest == 0) || (cchU8 < cchDest)))
  273. {
  274. bHandled = FALSE;
  275. //
  276. // Check if high surrogate is available
  277. //
  278. if ((*lpWC >= HIGH_SURROGATE_START) && (*lpWC <= HIGH_SURROGATE_END))
  279. {
  280. if (cchDest)
  281. {
  282. // Another high surrogate, then treat the 1st as normal
  283. // Unicode character.
  284. if (wchHighSurrogate)
  285. {
  286. if ((cchU8 + 2) < cchDest)
  287. {
  288. lpDestStr[cchU8++] = (char) (UTF8_1ST_OF_3 | HIGHER_6_BIT(wchHighSurrogate));
  289. lpDestStr[cchU8++] = (char) (UTF8_TRAIL | MIDDLE_6_BIT(wchHighSurrogate));
  290. lpDestStr[cchU8++] = (char) (UTF8_TRAIL | LOWER_6_BIT(wchHighSurrogate));
  291. }
  292. else
  293. {
  294. // not enough buffer
  295. cchSrc++;
  296. break;
  297. }
  298. }
  299. }
  300. else
  301. {
  302. cchU8 += 3;
  303. }
  304. wchHighSurrogate = *lpWC;
  305. bHandled = TRUE;
  306. }
  307. if (!bHandled && wchHighSurrogate)
  308. {
  309. if ((*lpWC >= LOW_SURROGATE_START) && (*lpWC <= LOW_SURROGATE_END))
  310. {
  311. // wheee, valid surrogate pairs
  312. if (cchDest)
  313. {
  314. if ((cchU8 + 3) < cchDest)
  315. {
  316. dwSurrogateChar = (((wchHighSurrogate-0xD800) << 10) + (*lpWC - 0xDC00) + 0x10000);
  317. lpDestStr[cchU8++] = (UTF8_1ST_OF_4 |
  318. (unsigned char)(dwSurrogateChar >> 18)); // 3 bits from 1st byte
  319. lpDestStr[cchU8++] = (UTF8_TRAIL |
  320. (unsigned char)((dwSurrogateChar >> 12) & 0x3f)); // 6 bits from 2nd byte
  321. lpDestStr[cchU8++] = (UTF8_TRAIL |
  322. (unsigned char)((dwSurrogateChar >> 6) & 0x3f)); // 6 bits from 3rd byte
  323. lpDestStr[cchU8++] = (UTF8_TRAIL |
  324. (unsigned char)(0x3f & dwSurrogateChar)); // 6 bits from 4th byte
  325. }
  326. else
  327. {
  328. // not enough buffer
  329. cchSrc++;
  330. break;
  331. }
  332. }
  333. else
  334. {
  335. // we already counted 3 previously (in high surrogate)
  336. cchU8 += 1;
  337. }
  338. bHandled = TRUE;
  339. }
  340. else
  341. {
  342. // Bad Surrogate pair : ERROR
  343. // Just process wchHighSurrogate , and the code below will
  344. // process the current code point
  345. if (cchDest)
  346. {
  347. if ((cchU8 + 2) < cchDest)
  348. {
  349. lpDestStr[cchU8++] = (char) (UTF8_1ST_OF_3 | HIGHER_6_BIT(wchHighSurrogate));
  350. lpDestStr[cchU8++] = (char) (UTF8_TRAIL | MIDDLE_6_BIT(wchHighSurrogate));
  351. lpDestStr[cchU8++] = (char) (UTF8_TRAIL | LOWER_6_BIT(wchHighSurrogate));
  352. }
  353. else
  354. {
  355. // not enough buffer
  356. cchSrc++;
  357. break;
  358. }
  359. }
  360. }
  361. wchHighSurrogate = 0;
  362. }
  363. if (!bHandled)
  364. {
  365. if (*lpWC <= ASCII)
  366. {
  367. //
  368. // Found ASCII.
  369. //
  370. if (cchDest)
  371. {
  372. lpDestStr[cchU8] = (char)*lpWC;
  373. }
  374. cchU8++;
  375. }
  376. else if (*lpWC <= UTF8_2_MAX)
  377. {
  378. //
  379. // Found 2 byte sequence if < 0x07ff (11 bits).
  380. //
  381. if (cchDest)
  382. {
  383. if ((cchU8 + 1) < cchDest)
  384. {
  385. //
  386. // Use upper 5 bits in first byte.
  387. // Use lower 6 bits in second byte.
  388. //
  389. lpDestStr[cchU8++] = (char) (UTF8_1ST_OF_2 | (*lpWC >> 6));
  390. lpDestStr[cchU8++] = (char) (UTF8_TRAIL | LOWER_6_BIT(*lpWC));
  391. }
  392. else
  393. {
  394. //
  395. // Error - buffer too small.
  396. //
  397. cchSrc++;
  398. break;
  399. }
  400. }
  401. else
  402. {
  403. cchU8 += 2;
  404. }
  405. }
  406. else
  407. {
  408. //
  409. // Found 3 byte sequence.
  410. //
  411. if (cchDest)
  412. {
  413. if ((cchU8 + 2) < cchDest)
  414. {
  415. //
  416. // Use upper 4 bits in first byte.
  417. // Use middle 6 bits in second byte.
  418. // Use lower 6 bits in third byte.
  419. //
  420. lpDestStr[cchU8++] = (char) (UTF8_1ST_OF_3 | HIGHER_6_BIT(*lpWC));
  421. lpDestStr[cchU8++] = (char) (UTF8_TRAIL | MIDDLE_6_BIT(*lpWC));
  422. lpDestStr[cchU8++] = (char) (UTF8_TRAIL | LOWER_6_BIT(*lpWC));
  423. }
  424. else
  425. {
  426. //
  427. // Error - buffer too small.
  428. //
  429. cchSrc++;
  430. break;
  431. }
  432. }
  433. else
  434. {
  435. cchU8 += 3;
  436. }
  437. }
  438. }
  439. lpWC++;
  440. }
  441. //
  442. // If the last character was a high surrogate, then handle it as a normal
  443. // unicode character.
  444. //
  445. if ((cchSrc < 0) && (wchHighSurrogate != 0))
  446. {
  447. if (cchDest)
  448. {
  449. if ((cchU8 + 2) < cchDest)
  450. {
  451. lpDestStr[cchU8++] = (char) (UTF8_1ST_OF_3 | HIGHER_6_BIT(wchHighSurrogate));
  452. lpDestStr[cchU8++] = (char) (UTF8_TRAIL | MIDDLE_6_BIT(wchHighSurrogate));
  453. lpDestStr[cchU8++] = (char) (UTF8_TRAIL | LOWER_6_BIT(wchHighSurrogate));
  454. }
  455. else
  456. {
  457. cchSrc++;
  458. }
  459. }
  460. }
  461. //
  462. // Make sure the destination buffer was large enough.
  463. //
  464. if (cchDest && (cchSrc >= 0))
  465. {
  466. SetLastError(ERROR_INSUFFICIENT_BUFFER);
  467. return (0);
  468. }
  469. //
  470. // Return the number of UTF-8 characters written.
  471. //
  472. return (cchU8);
  473. }
  474. #else
  475. // OLD IMPLEMENTATION NOT SUPPORTING SURROGATE PAIRS
  476. //+-------------------------------------------------------------------------
  477. // Maps a wide-character (Unicode) string to a new UTF-8 encoded character
  478. // string.
  479. //
  480. // The wide characters are mapped as follows:
  481. //
  482. // Start End Bits UTF-8 Characters
  483. // ------ ------ ---- --------------------------------
  484. // 0x0000 0x007F 7 0x0xxxxxxx
  485. // 0x0080 0x07FF 11 0x110xxxxx 0x10xxxxxx
  486. // 0x0800 0xFFFF 16 0x1110xxxx 0x10xxxxxx 0x10xxxxxx
  487. //
  488. // The parameter and return value semantics are the same as for the
  489. // Win32 API, WideCharToMultiByte.
  490. //
  491. // Note, starting with NT 4.0, WideCharToMultiByte supports CP_UTF8. CP_UTF8
  492. // isn't supported on Win95.
  493. //--------------------------------------------------------------------------
  494. int
  495. WINAPI
  496. WideCharToUTF8(
  497. IN LPCWSTR lpWideCharStr,
  498. IN int cchWideChar,
  499. OUT LPSTR lpUTF8Str,
  500. IN int cchUTF8
  501. )
  502. {
  503. int cchRemainUTF8;
  504. if (cchUTF8 < 0)
  505. goto InvalidParameter;
  506. cchRemainUTF8 = cchUTF8;
  507. if (cchWideChar < 0)
  508. cchWideChar = wcslen(lpWideCharStr) + 1;
  509. while (cchWideChar--) {
  510. WCHAR wch = *lpWideCharStr++;
  511. if (wch <= 0x7F) {
  512. // 7 bits
  513. cchRemainUTF8 -= 1;
  514. if (cchRemainUTF8 >= 0)
  515. *lpUTF8Str++ = (char) wch;
  516. } else if (wch <= 0x7FF) {
  517. // 11 bits
  518. cchRemainUTF8 -= 2;
  519. if (cchRemainUTF8 >= 0) {
  520. *lpUTF8Str++ = (char) (0xC0 | ((wch >> 6) & 0x1F));
  521. *lpUTF8Str++ = (char) (0x80 | (wch & 0x3F));
  522. }
  523. } else {
  524. // 16 bits
  525. cchRemainUTF8 -= 3;
  526. if (cchRemainUTF8 >= 0) {
  527. *lpUTF8Str++ = (char) (0xE0 | ((wch >> 12) & 0x0F));
  528. *lpUTF8Str++ = (char) (0x80 | ((wch >> 6) & 0x3F));
  529. *lpUTF8Str++ = (char) (0x80 | (wch & 0x3F));
  530. }
  531. }
  532. }
  533. if (cchRemainUTF8 >= 0)
  534. cchUTF8 = cchUTF8 - cchRemainUTF8;
  535. else if (cchUTF8 == 0)
  536. cchUTF8 = -cchRemainUTF8;
  537. else {
  538. cchUTF8 = 0;
  539. SetLastError(ERROR_INSUFFICIENT_BUFFER);
  540. }
  541. return cchUTF8;
  542. InvalidParameter:
  543. SetLastError(ERROR_INVALID_PARAMETER);
  544. return 0;
  545. }
  546. //+-------------------------------------------------------------------------
  547. // Maps a UTF-8 encoded character string to a new wide-character (Unicode)
  548. // string.
  549. //
  550. // See CertWideCharToUTF8 for how the UTF-8 characters are mapped to wide
  551. // characters.
  552. //
  553. // The parameter and return value semantics are the same as for the
  554. // Win32 API, MultiByteToWideChar.
  555. //
  556. // If the UTF-8 characters don't contain the expected high order bits,
  557. // ERROR_INVALID_PARAMETER is set and 0 is returned.
  558. //
  559. // Note, starting with NT 4.0, MultiByteToWideChar supports CP_UTF8. CP_UTF8
  560. // isn't supported on Win95.
  561. //--------------------------------------------------------------------------
  562. int
  563. WINAPI
  564. UTF8ToWideChar(
  565. IN LPCSTR lpUTF8Str,
  566. IN int cchUTF8,
  567. OUT LPWSTR lpWideCharStr,
  568. IN int cchWideChar
  569. )
  570. {
  571. int cchRemainWideChar;
  572. if (cchWideChar < 0)
  573. goto InvalidParameter;
  574. cchRemainWideChar = cchWideChar;
  575. if (cchUTF8 < 0)
  576. cchUTF8 = strlen(lpUTF8Str) + 1;
  577. while (cchUTF8--) {
  578. char ch = *lpUTF8Str++;
  579. WCHAR wch;
  580. if (0 == (ch & 0x80))
  581. // 7 bits, 1 byte
  582. wch = (WCHAR) ch;
  583. else if (0xC0 == (ch & 0xE0)) {
  584. // 11 bits, 2 bytes
  585. char ch2;
  586. if (--cchUTF8 < 0)
  587. goto InvalidParameter;
  588. ch2 = *lpUTF8Str++;
  589. if (0x80 != (ch2 & 0xC0))
  590. goto InvalidParameter;
  591. wch = (((WCHAR) ch & 0x1F) << 6) | ((WCHAR) ch2 & 0x3F);
  592. } else if (0xE0 == (ch & 0xF0)) {
  593. // 16 bits, 3 bytes
  594. char ch2;
  595. char ch3;
  596. cchUTF8 -= 2;
  597. if (cchUTF8 < 0)
  598. goto InvalidParameter;
  599. ch2 = *lpUTF8Str++;
  600. ch3 = *lpUTF8Str++;
  601. if (0x80 != (ch2 & 0xC0) || 0x80 != (ch3 & 0xC0))
  602. goto InvalidParameter;
  603. wch = (((WCHAR) ch & 0x0F) << 12) | (((WCHAR) ch2 & 0x3F) << 6) |
  604. ((WCHAR) ch3 & 0x3F);
  605. } else
  606. goto InvalidParameter;
  607. if (--cchRemainWideChar >= 0)
  608. *lpWideCharStr++ = wch;
  609. }
  610. if (cchRemainWideChar >= 0)
  611. cchWideChar = cchWideChar - cchRemainWideChar;
  612. else if (cchWideChar == 0)
  613. cchWideChar = -cchRemainWideChar;
  614. else {
  615. cchWideChar = 0;
  616. SetLastError(ERROR_INSUFFICIENT_BUFFER);
  617. }
  618. return cchWideChar;
  619. InvalidParameter:
  620. SetLastError(ERROR_INVALID_PARAMETER);
  621. return 0;
  622. }
  623. #endif