Source code of Windows XP (NT5)
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1359 lines
38 KiB

  1. /*++
  2. Copyright (c) 1991-2001, Microsoft Corporation All rights reserved.
  3. Module Name:
  4. utf.c
  5. Abstract:
  6. This file contains functions that convert UTF strings to Unicode
  7. strings and Unicode string to UTF strings.
  8. External Routines found in this file:
  9. UTFCPInfo
  10. UTFToUnicode
  11. UnicodeToUTF
  12. Revision History:
  13. 02-06-96 JulieB Created.
  14. 03-20-99 SamerA Surrogate support.
  15. 01-23-01 v-michka Ported to Godot
  16. 03-16-01 v-michka Ported UTF-8 corrigendum compliance version
  17. 05-01-01 v-michka Picked up yslin's UTF-7/8 Whistler bug fixes: 371215/381323/381433/376403
  18. --*/
  19. //
  20. // Include Files.
  21. //
  22. #include "precomp.h"
  // v-michka: cut some stuff out of utf.c. Since it is holding the forward
  // declarations for callers of functions in *this* file, there were problems
  // with duplicate definitions. It's all part of not using nls.h for forward
  // declarations.
  //
  // Constant Declarations.
  //

  // Largest code point that is plain 7-bit ASCII.
  #define ASCII                 0x007f

  // UTF-7 shift sequence delimiters.
  #define SHIFT_IN              '+'       // opens a shifted (Base64) sequence
  #define SHIFT_OUT             '-'       // closes a shifted (Base64) sequence

  // UTF-8 encoding limits and byte markers.
  #define UTF8_2_MAX            0x07ff    // largest value of a 2-byte sequence (32 * 64 = 2048 values)
  #define UTF8_1ST_OF_2         0xc0      // 110x xxxx
  #define UTF8_1ST_OF_3         0xe0      // 1110 xxxx
  #define UTF8_1ST_OF_4         0xf0      // 1111 xxxx
  #define UTF8_TRAIL            0x80      // 10xx xxxx

  // Bit-field extraction used when building a 3-byte UTF-8 sequence.
  // For a 16-bit value HIGHER_6_BIT leaves the top 4 bits (bits 12-15),
  // MIDDLE_6_BIT yields bits 6-11 and LOWER_6_BIT yields bits 0-5.
  #define HIGHER_6_BIT(u)       ((u) >> 12)
  #define MIDDLE_6_BIT(u)       (((u) & 0x0fc0) >> 6)
  #define LOWER_6_BIT(u)        ((u) & 0x003f)

  // Single-bit tests used to classify UTF-8 bytes.
  #define BIT7(a)               ((a) & 0x80)
  #define BIT6(a)               ((a) & 0x40)

  // UTF-16 surrogate ranges.
  #define HIGH_SURROGATE_START  0xd800
  #define HIGH_SURROGATE_END    0xdbff
  #define LOW_SURROGATE_START   0xdc00
  #define LOW_SURROGATE_END     0xdfff

  // Length of a null-terminated wide string, routed through gwcslen.
  #define NlsStrLenW(wz)        gwcslen(wz)
  45. // content from utf.h in the Whistler project:
  46. //
  47. // Convert one Unicode to 2 2/3 Base64 chars in a shifted sequence.
  48. // Each char represents a 6-bit portion of the 16-bit Unicode char.
  49. //
  50. CONST char cBase64[] =
  51. "ABCDEFGHIJKLMNOPQRSTUVWXYZ" // A : 000000 .... 011001 ( 0 - 25)
  52. "abcdefghijklmnopqrstuvwxyz" // a : 011010 .... 110011 (26 - 51)
  53. "0123456789" // 0 : 110100 .... 111101 (52 - 61)
  54. "+/"; // + : 111110, / : 111111 (62 - 63)
  55. //
  56. // To determine if an ASCII char needs to be shifted.
  57. // 1 : to be shifted
  58. // 0 : not to be shifted
  59. //
  60. CONST BOOLEAN fShiftChar[] =
  61. {
  62. 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, // Null, Tab, LF, CR
  63. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  64. 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, // Space '() +,-./
  65. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, // 0123456789: ?
  66. 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // ABCDEFGHIJKLMNO
  67. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, // PQRSTUVWXYZ
  68. 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // abcdefghijklmno
  69. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1 // pqrstuvwxyz
  70. };
  71. /////////////////////////
  72. // //
  73. // UTF-7 -> Unicode //
  74. // //
  75. /////////////////////////
  76. //
  77. // Convert a Base64 char in a shifted sequence to a 6-bit portion of a
  78. // Unicode char.
  79. // -1 means it is not a Base64
  80. //
  81. CONST char nBitBase64[] =
  82. {
  83. -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  84. -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  85. -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 62, -1, -1, -1, 63, // + /
  86. 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -1, -1, -1, -1, -1, -1, // 0123456789
  87. -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, // ABCDEFGHIJKLMNO
  88. 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1, // PQRSTUVWXYZ
  89. -1, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, // abcdefghijklmno
  90. 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, -1, -1, -1, -1, -1 // pqrstuvwxyz
  91. };
  92. //
  93. // Forward Declarations.
  94. //
  95. int
  96. UTF7ToUnicode(
  97. LPCSTR lpSrcStr,
  98. int cchSrc,
  99. LPWSTR lpDestStr,
  100. int cchDest);
  101. int
  102. UTF8ToUnicode(
  103. LPCSTR lpSrcStr,
  104. int cchSrc,
  105. LPWSTR lpDestStr,
  106. int cchDest,
  107. DWORD dwFlags);
  108. int
  109. UnicodeToUTF7(
  110. LPCWSTR lpSrcStr,
  111. int cchSrc,
  112. LPSTR lpDestStr,
  113. int cchDest);
  114. int
  115. UnicodeToUTF8(
  116. LPCWSTR lpSrcStr,
  117. int cchSrc,
  118. LPSTR lpDestStr,
  119. int cchDest);
  120. /////////////////////////
  121. // //
  122. // Unicode -> UTF-7 //
  123. // //
  124. /////////////////////////
  125. //-------------------------------------------------------------------------//
  126. // EXTERNAL ROUTINES //
  127. //-------------------------------------------------------------------------//
  128. ////////////////////////////////////////////////////////////////////////////
  129. //
  130. // UTFCPInfo
  131. //
  132. // Gets the CPInfo for the given UTF code page.
  133. //
  134. // 10-23-96 JulieB Created.
  135. ////////////////////////////////////////////////////////////////////////////
  136. BOOL UTFCPInfo(
  137. UINT CodePage,
  138. LPCPINFO lpCPInfo,
  139. BOOL fExVer)
  140. {
  141. int ctr;
  142. //
  143. // Invalid Parameter Check:
  144. // - validate code page
  145. // - lpCPInfo is NULL
  146. //
  147. if ( (CodePage < CP_UTF7) || (CodePage > CP_UTF8) ||
  148. (lpCPInfo == NULL) )
  149. {
  150. SetLastError(ERROR_INVALID_PARAMETER);
  151. return (0);
  152. }
  153. switch (CodePage)
  154. {
  155. case ( CP_UTF7 ) :
  156. {
  157. lpCPInfo->MaxCharSize = 5;
  158. break;
  159. }
  160. case ( CP_UTF8 ) :
  161. {
  162. lpCPInfo->MaxCharSize = 4;
  163. break;
  164. }
  165. }
  166. (lpCPInfo->DefaultChar)[0] = '?';
  167. (lpCPInfo->DefaultChar)[1] = (BYTE)0;
  168. for (ctr = 0; ctr < MAX_LEADBYTES; ctr++)
  169. {
  170. (lpCPInfo->LeadByte)[ctr] = (BYTE)0;
  171. }
  172. if (fExVer)
  173. {
  174. LPCPINFOEXW lpCPInfoEx = (LPCPINFOEXW)lpCPInfo;
  175. lpCPInfoEx->UnicodeDefaultChar = L'?';
  176. lpCPInfoEx->CodePage = CodePage;
  177. }
  178. return (TRUE);
  179. }
  180. ////////////////////////////////////////////////////////////////////////////
  181. //
  182. // UTFToUnicode
  183. //
  184. // Maps a UTF character string to its wide character string counterpart.
  185. //
  186. // 02-06-96 JulieB Created.
  187. ////////////////////////////////////////////////////////////////////////////
  188. int UTFToUnicode(
  189. UINT CodePage,
  190. DWORD dwFlags,
  191. LPCSTR lpMultiByteStr,
  192. int cbMultiByte,
  193. LPWSTR lpWideCharStr,
  194. int cchWideChar)
  195. {
  196. int rc = 0;
  197. //
  198. // Invalid Parameter Check:
  199. // - validate code page
  200. // - length of MB string is 0
  201. // - wide char buffer size is negative
  202. // - MB string is NULL
  203. // - length of WC string is NOT zero AND
  204. // (WC string is NULL OR src and dest pointers equal)
  205. //
  206. if ( (CodePage < CP_UTF7) || (CodePage > CP_UTF8) ||
  207. (cbMultiByte == 0) || (cchWideChar < 0) ||
  208. (lpMultiByteStr == NULL) ||
  209. ((cchWideChar != 0) &&
  210. ((lpWideCharStr == NULL) ||
  211. (lpMultiByteStr == (LPSTR)lpWideCharStr))) )
  212. {
  213. SetLastError(ERROR_INVALID_PARAMETER);
  214. return (0);
  215. }
  216. //
  217. // Invalid Flags Check:
  218. // - UTF7: flags not 0.
  219. // - UTF8: flags not 0 nor MB_ERR_INVALID_CHARS.
  220. //
  221. if (CodePage == CP_UTF8)
  222. {
  223. // UTF8
  224. if ((dwFlags & ~MB_ERR_INVALID_CHARS) != 0)
  225. {
  226. SetLastError(ERROR_INVALID_FLAGS);
  227. return (0);
  228. }
  229. }
  230. else if (dwFlags != 0)
  231. {
  232. // UTF7
  233. SetLastError(ERROR_INVALID_FLAGS);
  234. return (0);
  235. }
  236. //
  237. // If cbMultiByte is -1, then the string is null terminated and we
  238. // need to get the length of the string. Add one to the length to
  239. // include the null termination. (This will always be at least 1.)
  240. //
  241. if (cbMultiByte <= -1)
  242. {
  243. cbMultiByte = strlen(lpMultiByteStr) + 1;
  244. }
  245. switch (CodePage)
  246. {
  247. case ( CP_UTF7 ) :
  248. {
  249. rc = UTF7ToUnicode( lpMultiByteStr,
  250. cbMultiByte,
  251. lpWideCharStr,
  252. cchWideChar );
  253. break;
  254. }
  255. case ( CP_UTF8 ) :
  256. {
  257. rc = UTF8ToUnicode( lpMultiByteStr,
  258. cbMultiByte,
  259. lpWideCharStr,
  260. cchWideChar,
  261. dwFlags);
  262. break;
  263. }
  264. }
  265. return (rc);
  266. }
  267. ////////////////////////////////////////////////////////////////////////////
  268. //
  269. // UnicodeToUTF
  270. //
  271. // Maps a Unicode character string to its UTF string counterpart.
  272. //
  273. // 02-06-96 JulieB Created.
  274. ////////////////////////////////////////////////////////////////////////////
  275. int UnicodeToUTF(
  276. UINT CodePage,
  277. DWORD dwFlags,
  278. LPCWSTR lpWideCharStr,
  279. int cchWideChar,
  280. LPSTR lpMultiByteStr,
  281. int cbMultiByte,
  282. LPCSTR lpDefaultChar,
  283. LPBOOL lpUsedDefaultChar)
  284. {
  285. int rc = 0;
  286. //
  287. // Invalid Parameter Check:
  288. // - validate code page
  289. // - length of WC string is 0
  290. // - multibyte buffer size is negative
  291. // - WC string is NULL
  292. // - length of WC string is NOT zero AND
  293. // (MB string is NULL OR src and dest pointers equal)
  294. // - lpDefaultChar and lpUsedDefaultChar not NULL
  295. //
  296. if ( (CodePage < CP_UTF7) || (CodePage > CP_UTF8) ||
  297. (cchWideChar == 0) || (cbMultiByte < 0) ||
  298. (lpWideCharStr == NULL) ||
  299. ((cbMultiByte != 0) &&
  300. ((lpMultiByteStr == NULL) ||
  301. (lpWideCharStr == (LPWSTR)lpMultiByteStr))) ||
  302. (lpDefaultChar != NULL) || (lpUsedDefaultChar != NULL) )
  303. {
  304. SetLastError(ERROR_INVALID_PARAMETER);
  305. return (0);
  306. }
  307. //
  308. // Invalid Flags Check:
  309. // - flags not 0
  310. //
  311. if (dwFlags != 0)
  312. {
  313. SetLastError(ERROR_INVALID_FLAGS);
  314. return (0);
  315. }
  316. //
  317. // If cchWideChar is -1, then the string is null terminated and we
  318. // need to get the length of the string. Add one to the length to
  319. // include the null termination. (This will always be at least 1.)
  320. //
  321. if (cchWideChar <= -1)
  322. {
  323. cchWideChar = NlsStrLenW(lpWideCharStr) + 1;
  324. }
  325. switch (CodePage)
  326. {
  327. case ( CP_UTF7 ) :
  328. {
  329. rc = UnicodeToUTF7( lpWideCharStr,
  330. cchWideChar,
  331. lpMultiByteStr,
  332. cbMultiByte );
  333. break;
  334. }
  335. case ( CP_UTF8 ) :
  336. {
  337. rc = UnicodeToUTF8( lpWideCharStr,
  338. cchWideChar,
  339. lpMultiByteStr,
  340. cbMultiByte );
  341. break;
  342. }
  343. }
  344. return (rc);
  345. }
  346. //-------------------------------------------------------------------------//
  347. // INTERNAL ROUTINES //
  348. //-------------------------------------------------------------------------//
  349. ////////////////////////////////////////////////////////////////////////////
  350. //
  351. // UTF7ToUnicode
  352. //
  353. // Maps a UTF-7 character string to its wide character string counterpart.
  354. //
  355. // 02-06-96 JulieB Created.
  356. ////////////////////////////////////////////////////////////////////////////
  357. int UTF7ToUnicode(
  358. LPCSTR lpSrcStr,
  359. int cchSrc,
  360. LPWSTR lpDestStr,
  361. int cchDest)
  362. {
  363. //CHAR is signed, so we have to cast lpSrcStr to an unsigned char below.
  364. BYTE* pUTF7 = (BYTE*)lpSrcStr;
  365. BOOL fShift = FALSE;
  366. DWORD dwBit = 0; // 32-bit buffer to hold temporary bits
  367. int iPos = 0; // 6-bit position pointer in the buffer
  368. int cchWC = 0; // # of Unicode code points generated
  369. while ((cchSrc--) && ((cchDest == 0) || (cchWC < cchDest)))
  370. {
  371. if (*pUTF7 > ASCII)
  372. {
  373. //
  374. // Error - non ASCII char, so zero extend it.
  375. //
  376. if (cchDest)
  377. {
  378. lpDestStr[cchWC] = (WCHAR)*pUTF7;
  379. }
  380. cchWC++;
  381. // Terminate the shifted sequence.
  382. fShift = FALSE;
  383. }
  384. else if (!fShift)
  385. {
  386. //
  387. // Not in shifted sequence.
  388. //
  389. if (*pUTF7 == SHIFT_IN)
  390. {
  391. if (cchSrc && (pUTF7[1] == SHIFT_OUT))
  392. {
  393. //
  394. // "+-" means "+"
  395. //
  396. if (cchDest)
  397. {
  398. lpDestStr[cchWC] = (WCHAR)*pUTF7;
  399. }
  400. pUTF7++;
  401. cchSrc--;
  402. cchWC++;
  403. }
  404. else
  405. {
  406. //
  407. // Start a new shift sequence.
  408. //
  409. fShift = TRUE;
  410. }
  411. }
  412. else
  413. {
  414. //
  415. // No need to shift.
  416. //
  417. if (cchDest)
  418. {
  419. lpDestStr[cchWC] = (WCHAR)*pUTF7;
  420. }
  421. cchWC++;
  422. }
  423. }
  424. else
  425. {
  426. //
  427. // Already in shifted sequence.
  428. //
  429. if (nBitBase64[*pUTF7] == -1)
  430. {
  431. //
  432. // Any non Base64 char also ends shift state.
  433. //
  434. if (*pUTF7 != SHIFT_OUT)
  435. {
  436. //
  437. // Not "-", so write it to the buffer.
  438. //
  439. if (cchDest)
  440. {
  441. lpDestStr[cchWC] = (WCHAR)*pUTF7;
  442. }
  443. cchWC++;
  444. }
  445. //
  446. // Reset bits.
  447. //
  448. fShift = FALSE;
  449. dwBit = 0;
  450. iPos = 0;
  451. }
  452. else
  453. {
  454. //
  455. // Store the bits in the 6-bit buffer and adjust the
  456. // position pointer.
  457. //
  458. dwBit |= ((DWORD)nBitBase64[*pUTF7]) << (26 - iPos);
  459. iPos += 6;
  460. }
  461. //
  462. // Output the 16-bit Unicode value.
  463. //
  464. while (iPos >= 16)
  465. {
  466. if (cchDest)
  467. {
  468. if (cchWC < cchDest)
  469. {
  470. lpDestStr[cchWC] = (WCHAR)(dwBit >> 16);
  471. }
  472. else
  473. {
  474. break;
  475. }
  476. }
  477. cchWC++;
  478. dwBit <<= 16;
  479. iPos -= 16;
  480. }
  481. if (iPos >= 16)
  482. {
  483. //
  484. // Error - buffer too small.
  485. //
  486. cchSrc++;
  487. break;
  488. }
  489. }
  490. pUTF7++;
  491. }
  492. //
  493. // Make sure the destination buffer was large enough.
  494. //
  495. if (cchDest && (cchSrc >= 0))
  496. {
  497. if (cchSrc == 0 && fShift && *(pUTF7--) == SHIFT_OUT)
  498. {
  499. //
  500. // Do nothing here.
  501. // If we are in shift-in mode previously, and the last byte is a shift-out byte ('-'),
  502. // we should absorb this byte. So don't set error.
  503. //
  504. } else
  505. {
  506. SetLastError(ERROR_INSUFFICIENT_BUFFER);
  507. return (0);
  508. }
  509. }
  510. //
  511. // Return the number of Unicode characters written.
  512. //
  513. return (cchWC);
  514. }
  515. ////////////////////////////////////////////////////////////////////////////
  516. //
  517. // UTF8ToUnicode
  518. //
  519. // Maps a UTF-8 character string to its wide character string counterpart.
  520. //
  521. // 02-06-96 JulieB Created.
  522. ////////////////////////////////////////////////////////////////////////////
  523. int UTF8ToUnicode(
  524. LPCSTR lpSrcStr,
  525. int cchSrc,
  526. LPWSTR lpDestStr,
  527. int cchDest,
  528. DWORD dwFlags
  529. )
  530. {
  531. int nTB = 0; // # trail bytes to follow
  532. int cchWC = 0; // # of Unicode code points generated
  533. LPCSTR pUTF8 = lpSrcStr;
  534. DWORD dwSurrogateChar; // Full surrogate char
  535. BOOL bSurrogatePair = FALSE; // Indicate we'r collecting a surrogate pair
  536. BOOL bCheckInvalidBytes = (dwFlags & MB_ERR_INVALID_CHARS);
  537. char UTF8;
  538. while ((cchSrc--) && ((cchDest == 0) || (cchWC < cchDest)))
  539. {
  540. //
  541. // See if there are any trail bytes.
  542. //
  543. if (BIT7(*pUTF8) == 0)
  544. {
  545. //
  546. // Found ASCII.
  547. //
  548. if (cchDest)
  549. {
  550. lpDestStr[cchWC] = (WCHAR)*pUTF8;
  551. }
  552. nTB = bSurrogatePair = 0;
  553. cchWC++;
  554. }
  555. else if (BIT6(*pUTF8) == 0)
  556. {
  557. //
  558. // Found a trail byte.
  559. // Note : Ignore the trail byte if there was no lead byte.
  560. //
  561. if (nTB != 0)
  562. {
  563. //
  564. // Decrement the trail byte counter.
  565. //
  566. nTB--;
  567. if (bSurrogatePair)
  568. {
  569. dwSurrogateChar <<= 6;
  570. dwSurrogateChar |= LOWER_6_BIT(*pUTF8);
  571. if (nTB == 0)
  572. {
  573. if (cchDest)
  574. {
  575. if ((cchWC + 1) < cchDest)
  576. {
  577. lpDestStr[cchWC] = (WCHAR)
  578. (((dwSurrogateChar - 0x10000) >> 10) + HIGH_SURROGATE_START);
  579. lpDestStr[cchWC+1] = (WCHAR)
  580. ((dwSurrogateChar - 0x10000)%0x400 + LOW_SURROGATE_START);
  581. }
  582. else
  583. {
  584. // Error : Buffer too small
  585. cchSrc++;
  586. break;
  587. }
  588. }
  589. cchWC += 2;
  590. bSurrogatePair = FALSE;
  591. }
  592. }
  593. else
  594. {
  595. //
  596. // Make room for the trail byte and add the trail byte
  597. // value.
  598. //
  599. if (cchDest)
  600. {
  601. lpDestStr[cchWC] <<= 6;
  602. lpDestStr[cchWC] |= LOWER_6_BIT(*pUTF8);
  603. }
  604. if (nTB == 0)
  605. {
  606. //
  607. // End of sequence. Advance the output counter.
  608. //
  609. cchWC++;
  610. }
  611. }
  612. }
  613. else
  614. {
  615. if (bCheckInvalidBytes)
  616. {
  617. SetLastError(ERROR_NO_UNICODE_TRANSLATION);
  618. return (0);
  619. }
  620. // error - not expecting a trail byte. That is, there is a trailing byte without leading byte.
  621. bSurrogatePair = FALSE;
  622. }
  623. }
  624. else
  625. {
  626. //
  627. // Found a lead byte.
  628. //
  629. if (nTB > 0)
  630. {
  631. // error - A leading byte before the previous sequence is completed.
  632. if (bCheckInvalidBytes)
  633. {
  634. SetLastError(ERROR_NO_UNICODE_TRANSLATION);
  635. return (0);
  636. }
  637. //
  638. // Error - previous sequence not finished.
  639. //
  640. nTB = 0;
  641. bSurrogatePair = FALSE;
  642. // Put this character back so that we can start over another sequence.
  643. cchSrc++;
  644. pUTF8--;
  645. }
  646. else
  647. {
  648. //
  649. // Calculate the number of bytes to follow.
  650. // Look for the first 0 from left to right.
  651. //
  652. UTF8 = *pUTF8;
  653. while (BIT7(UTF8) != 0)
  654. {
  655. UTF8 <<= 1;
  656. nTB++;
  657. }
  658. //
  659. // Check for non-shortest form.
  660. //
  661. switch (nTB) {
  662. case 1:
  663. nTB = 0;
  664. break;
  665. case 2:
  666. // Make sure that bit 8 ~ bit 11 is not all zero.
  667. // 110XXXXx 10xxxxxx
  668. if ((*pUTF8 & 0x1e) == 0)
  669. {
  670. nTB = 0;
  671. }
  672. break;
  673. case 3:
  674. // Look ahead to check for non-shortest form.
  675. // 1110XXXX 10Xxxxxx 10xxxxxx
  676. if (cchSrc >= 2)
  677. {
  678. if (((*pUTF8 & 0x0f) == 0) && (*(pUTF8 + 1) & 0x20) == 0)
  679. {
  680. nTB = 0;
  681. }
  682. }
  683. break;
  684. case 4:
  685. //
  686. // This is a surrogate unicode pair
  687. //
  688. if (cchSrc >= 3)
  689. {
  690. WORD word = (((WORD)*pUTF8) << 8) | *(pUTF8 + 1);
  691. // Look ahead to check for non-shortest form.
  692. // 11110XXX 10XXxxxx 10xxxxxx 10xxxxxx
  693. // Check for the 5 bits are not all zero.
  694. // 0x0730 == 00000111 11000000
  695. if ((word & 0x0730) == 0)
  696. {
  697. nTB = 0;
  698. } else if ((word & 0x0400) == 0x0400)
  699. {
  700. // The 21st bit is 1.
  701. // Make sure that the resulting Unicode is within the valid surrogate range.
  702. // The 4 byte code sequence can hold up to 21 bits, and the maximum valid code point ragne
  703. // that Unicode (with surrogate) could represent are from U+000000 ~ U+10FFFF.
  704. // Therefore, if the 21 bit (the most significant bit) is 1, we should verify that the 17 ~ 20
  705. // bit are all zero.
  706. // I.e., in 11110XXX 10XXxxxx 10xxxxxx 10xxxxxx,
  707. // XXXXX can only be 10000.
  708. // 0x0330 = 0000 0011 0011 0000
  709. if ((word & 0x0330) != 0)
  710. {
  711. nTB = 0;
  712. }
  713. } else
  714. {
  715. dwSurrogateChar = UTF8 >> nTB;
  716. bSurrogatePair = TRUE;
  717. }
  718. }
  719. break;
  720. default:
  721. //
  722. // If the bits is greater than 4, this is an invalid
  723. // UTF8 lead byte.
  724. //
  725. nTB = 0;
  726. break;
  727. }
  728. if (nTB != 0)
  729. {
  730. //
  731. // Store the value from the first byte and decrement
  732. // the number of bytes to follow.
  733. //
  734. if (cchDest)
  735. {
  736. lpDestStr[cchWC] = UTF8 >> nTB;
  737. }
  738. nTB--;
  739. } else
  740. {
  741. if (bCheckInvalidBytes)
  742. {
  743. SetLastError(ERROR_NO_UNICODE_TRANSLATION);
  744. return (0);
  745. }
  746. }
  747. }
  748. }
  749. pUTF8++;
  750. }
  751. if ((bCheckInvalidBytes && nTB != 0) || (cchWC == 0))
  752. {
  753. // About (cchWC == 0):
  754. // Because we now throw away non-shortest form, it is possible that we generate 0 chars.
  755. // In this case, we have to set error to ERROR_NO_UNICODE_TRANSLATION so that we conform
  756. // to the spec of MultiByteToWideChar.
  757. SetLastError(ERROR_NO_UNICODE_TRANSLATION);
  758. return (0);
  759. }
  760. //
  761. // Make sure the destination buffer was large enough.
  762. //
  763. if (cchDest && (cchSrc >= 0))
  764. {
  765. SetLastError(ERROR_INSUFFICIENT_BUFFER);
  766. return (0);
  767. }
  768. //
  769. // Return the number of Unicode characters written.
  770. //
  771. return (cchWC);
  772. }
  773. ////////////////////////////////////////////////////////////////////////////
  774. //
  775. // UnicodeToUTF7
  776. //
  777. // Maps a Unicode character string to its UTF-7 string counterpart.
  778. //
  779. // 02-06-96 JulieB Created.
  780. ////////////////////////////////////////////////////////////////////////////
  781. int UnicodeToUTF7(
  782. LPCWSTR lpSrcStr,
  783. int cchSrc,
  784. LPSTR lpDestStr,
  785. int cchDest)
  786. {
  787. LPCWSTR lpWC = lpSrcStr;
  788. BOOL fShift = FALSE;
  789. DWORD dwBit = 0; // 32-bit buffer
  790. int iPos = 0; // 6-bit position in buffer
  791. int cchU7 = 0; // # of UTF7 chars generated
  792. while ((cchSrc--) && ((cchDest == 0) || (cchU7 < cchDest)))
  793. {
  794. if ((*lpWC > ASCII) || (fShiftChar[*lpWC]))
  795. {
  796. //
  797. // Need shift. Store 16 bits in buffer.
  798. //
  799. dwBit |= ((DWORD)*lpWC) << (16 - iPos);
  800. iPos += 16;
  801. if (!fShift)
  802. {
  803. //
  804. // Not in shift state, so add "+".
  805. //
  806. if (cchDest)
  807. {
  808. lpDestStr[cchU7] = SHIFT_IN;
  809. }
  810. cchU7++;
  811. //
  812. // Go into shift state.
  813. //
  814. fShift = TRUE;
  815. }
  816. //
  817. // Output 6 bits at a time as Base64 chars.
  818. //
  819. while (iPos >= 6)
  820. {
  821. if (cchDest)
  822. {
  823. if (cchU7 < cchDest)
  824. {
  825. //
  826. // 26 = 32 - 6
  827. //
  828. lpDestStr[cchU7] = cBase64[(int)(dwBit >> 26)];
  829. }
  830. else
  831. {
  832. break;
  833. }
  834. }
  835. cchU7++;
  836. dwBit <<= 6; // remove from bit buffer
  837. iPos -= 6; // adjust position pointer
  838. }
  839. if (iPos >= 6)
  840. {
  841. //
  842. // Error - buffer too small.
  843. //
  844. cchSrc++;
  845. break;
  846. }
  847. }
  848. else
  849. {
  850. //
  851. // No need to shift.
  852. //
  853. if (fShift)
  854. {
  855. //
  856. // End the shift sequence.
  857. //
  858. fShift = FALSE;
  859. if (iPos != 0)
  860. {
  861. //
  862. // Some bits left in dwBit.
  863. //
  864. if (cchDest)
  865. {
  866. if ((cchU7 + 1) < cchDest)
  867. {
  868. lpDestStr[cchU7++] = cBase64[(int)(dwBit >> 26)];
  869. lpDestStr[cchU7++] = SHIFT_OUT;
  870. }
  871. else
  872. {
  873. //
  874. // Error - buffer too small.
  875. //
  876. cchSrc++;
  877. break;
  878. }
  879. }
  880. else
  881. {
  882. cchU7 += 2;
  883. }
  884. dwBit = 0; // reset bit buffer
  885. iPos = 0; // reset postion pointer
  886. }
  887. else
  888. {
  889. //
  890. // Simply end the shift sequence.
  891. //
  892. if (cchDest)
  893. {
  894. lpDestStr[cchU7++] = SHIFT_OUT;
  895. }
  896. else
  897. {
  898. cchU7++;
  899. }
  900. }
  901. }
  902. //
  903. // Write the character to the buffer.
  904. // If the character is "+", then write "+-".
  905. //
  906. if (cchDest)
  907. {
  908. if (cchU7 < cchDest)
  909. {
  910. lpDestStr[cchU7++] = (char)*lpWC;
  911. if (*lpWC == SHIFT_IN)
  912. {
  913. if (cchU7 < cchDest)
  914. {
  915. lpDestStr[cchU7++] = SHIFT_OUT;
  916. }
  917. else
  918. {
  919. //
  920. // Error - buffer too small.
  921. //
  922. cchSrc++;
  923. break;
  924. }
  925. }
  926. }
  927. else
  928. {
  929. //
  930. // Error - buffer too small.
  931. //
  932. cchSrc++;
  933. break;
  934. }
  935. }
  936. else
  937. {
  938. cchU7++;
  939. if (*lpWC == SHIFT_IN)
  940. {
  941. cchU7++;
  942. }
  943. }
  944. }
  945. lpWC++;
  946. }
  947. //
  948. // See if we're still in the shift state.
  949. //
  950. if (fShift)
  951. {
  952. if (iPos != 0)
  953. {
  954. //
  955. // Some bits left in dwBit.
  956. //
  957. if (cchDest)
  958. {
  959. if ((cchU7 + 1) < cchDest)
  960. {
  961. lpDestStr[cchU7++] = cBase64[(int)(dwBit >> 26)];
  962. lpDestStr[cchU7++] = SHIFT_OUT;
  963. }
  964. else
  965. {
  966. //
  967. // Error - buffer too small.
  968. //
  969. cchSrc++;
  970. }
  971. }
  972. else
  973. {
  974. cchU7 += 2;
  975. }
  976. }
  977. else
  978. {
  979. //
  980. // Simply end the shift sequence.
  981. //
  982. if (cchDest)
  983. {
  984. lpDestStr[cchU7++] = SHIFT_OUT;
  985. }
  986. else
  987. {
  988. cchU7++;
  989. }
  990. }
  991. }
  992. //
  993. // Make sure the destination buffer was large enough.
  994. //
  995. if (cchDest && (cchSrc >= 0))
  996. {
  997. SetLastError(ERROR_INSUFFICIENT_BUFFER);
  998. return (0);
  999. }
  1000. //
  1001. // Return the number of UTF-7 characters written.
  1002. //
  1003. return (cchU7);
  1004. }
  1005. ////////////////////////////////////////////////////////////////////////////
  1006. //
  1007. // UnicodeToUTF8
  1008. //
  1009. // Maps a Unicode character string to its UTF-8 string counterpart.
  1010. //
  1011. // 02-06-96 JulieB Created.
  1012. ////////////////////////////////////////////////////////////////////////////
  1013. int UnicodeToUTF8(
  1014. LPCWSTR lpSrcStr,
  1015. int cchSrc,
  1016. LPSTR lpDestStr,
  1017. int cchDest)
  1018. {
  1019. LPCWSTR lpWC = lpSrcStr;
  1020. int cchU8 = 0; // # of UTF8 chars generated
  1021. DWORD dwSurrogateChar;
  1022. WCHAR wchHighSurrogate = 0;
  1023. BOOL bHandled;
  1024. while ((cchSrc--) && ((cchDest == 0) || (cchU8 < cchDest)))
  1025. {
  1026. bHandled = FALSE;
  1027. //
  1028. // Check if high surrogate is available
  1029. //
  1030. if ((*lpWC >= HIGH_SURROGATE_START) && (*lpWC <= HIGH_SURROGATE_END))
  1031. {
  1032. if (cchDest)
  1033. {
  1034. // Another high surrogate, then treat the 1st as normal
  1035. // Unicode character.
  1036. if (wchHighSurrogate)
  1037. {
  1038. if ((cchU8 + 2) < cchDest)
  1039. {
  1040. lpDestStr[cchU8++] = UTF8_1ST_OF_3 | HIGHER_6_BIT(wchHighSurrogate);
  1041. lpDestStr[cchU8++] = UTF8_TRAIL | MIDDLE_6_BIT(wchHighSurrogate);
  1042. lpDestStr[cchU8++] = UTF8_TRAIL | LOWER_6_BIT(wchHighSurrogate);
  1043. }
  1044. else
  1045. {
  1046. // not enough buffer
  1047. cchSrc++;
  1048. break;
  1049. }
  1050. }
  1051. }
  1052. else
  1053. {
  1054. cchU8 += 3;
  1055. }
  1056. wchHighSurrogate = *lpWC;
  1057. bHandled = TRUE;
  1058. }
  1059. if (!bHandled && wchHighSurrogate)
  1060. {
  1061. if ((*lpWC >= LOW_SURROGATE_START) && (*lpWC <= LOW_SURROGATE_END))
  1062. {
  1063. // wheee, valid surrogate pairs
  1064. if (cchDest)
  1065. {
  1066. if ((cchU8 + 3) < cchDest)
  1067. {
  1068. dwSurrogateChar = (((wchHighSurrogate-0xD800) << 10) + (*lpWC - 0xDC00) + 0x10000);
  1069. lpDestStr[cchU8++] = (UTF8_1ST_OF_4 |
  1070. (unsigned char)(dwSurrogateChar >> 18)); // 3 bits from 1st byte
  1071. lpDestStr[cchU8++] = (UTF8_TRAIL |
  1072. (unsigned char)((dwSurrogateChar >> 12) & 0x3f)); // 6 bits from 2nd byte
  1073. lpDestStr[cchU8++] = (UTF8_TRAIL |
  1074. (unsigned char)((dwSurrogateChar >> 6) & 0x3f)); // 6 bits from 3rd byte
  1075. lpDestStr[cchU8++] = (UTF8_TRAIL |
  1076. (unsigned char)(0x3f & dwSurrogateChar)); // 6 bits from 4th byte
  1077. }
  1078. else
  1079. {
  1080. // not enough buffer
  1081. cchSrc++;
  1082. break;
  1083. }
  1084. }
  1085. else
  1086. {
  1087. // we already counted 3 previously (in high surrogate)
  1088. cchU8 += 1;
  1089. }
  1090. bHandled = TRUE;
  1091. }
  1092. else
  1093. {
  1094. // Bad Surrogate pair : ERROR
  1095. // Just process wchHighSurrogate , and the code below will
  1096. // process the current code point
  1097. if (cchDest)
  1098. {
  1099. if ((cchU8 + 2) < cchDest)
  1100. {
  1101. lpDestStr[cchU8++] = UTF8_1ST_OF_3 | HIGHER_6_BIT(wchHighSurrogate);
  1102. lpDestStr[cchU8++] = UTF8_TRAIL | MIDDLE_6_BIT(wchHighSurrogate);
  1103. lpDestStr[cchU8++] = UTF8_TRAIL | LOWER_6_BIT(wchHighSurrogate);
  1104. }
  1105. else
  1106. {
  1107. // not enough buffer
  1108. cchSrc++;
  1109. break;
  1110. }
  1111. }
  1112. }
  1113. wchHighSurrogate = 0;
  1114. }
  1115. if (!bHandled)
  1116. {
  1117. if (*lpWC <= ASCII)
  1118. {
  1119. //
  1120. // Found ASCII.
  1121. //
  1122. if (cchDest)
  1123. {
  1124. lpDestStr[cchU8] = (char)*lpWC;
  1125. }
  1126. cchU8++;
  1127. }
  1128. else if (*lpWC <= UTF8_2_MAX)
  1129. {
  1130. //
  1131. // Found 2 byte sequence if < 0x07ff (11 bits).
  1132. //
  1133. if (cchDest)
  1134. {
  1135. if ((cchU8 + 1) < cchDest)
  1136. {
  1137. //
  1138. // Use upper 5 bits in first byte.
  1139. // Use lower 6 bits in second byte.
  1140. //
  1141. lpDestStr[cchU8++] = UTF8_1ST_OF_2 | (*lpWC >> 6);
  1142. lpDestStr[cchU8++] = UTF8_TRAIL | LOWER_6_BIT(*lpWC);
  1143. }
  1144. else
  1145. {
  1146. //
  1147. // Error - buffer too small.
  1148. //
  1149. cchSrc++;
  1150. break;
  1151. }
  1152. }
  1153. else
  1154. {
  1155. cchU8 += 2;
  1156. }
  1157. }
  1158. else
  1159. {
  1160. //
  1161. // Found 3 byte sequence.
  1162. //
  1163. if (cchDest)
  1164. {
  1165. if ((cchU8 + 2) < cchDest)
  1166. {
  1167. //
  1168. // Use upper 4 bits in first byte.
  1169. // Use middle 6 bits in second byte.
  1170. // Use lower 6 bits in third byte.
  1171. //
  1172. lpDestStr[cchU8++] = UTF8_1ST_OF_3 | HIGHER_6_BIT(*lpWC);
  1173. lpDestStr[cchU8++] = UTF8_TRAIL | MIDDLE_6_BIT(*lpWC);
  1174. lpDestStr[cchU8++] = UTF8_TRAIL | LOWER_6_BIT(*lpWC);
  1175. }
  1176. else
  1177. {
  1178. //
  1179. // Error - buffer too small.
  1180. //
  1181. cchSrc++;
  1182. break;
  1183. }
  1184. }
  1185. else
  1186. {
  1187. cchU8 += 3;
  1188. }
  1189. }
  1190. }
  1191. lpWC++;
  1192. }
  1193. //
  1194. // If the last character was a high surrogate, then handle it as a normal
  1195. // unicode character.
  1196. //
  1197. if ((cchSrc < 0) && (wchHighSurrogate != 0))
  1198. {
  1199. if (cchDest)
  1200. {
  1201. if ((cchU8 + 2) < cchDest)
  1202. {
  1203. lpDestStr[cchU8++] = UTF8_1ST_OF_3 | HIGHER_6_BIT(wchHighSurrogate);
  1204. lpDestStr[cchU8++] = UTF8_TRAIL | MIDDLE_6_BIT(wchHighSurrogate);
  1205. lpDestStr[cchU8++] = UTF8_TRAIL | LOWER_6_BIT(wchHighSurrogate);
  1206. }
  1207. else
  1208. {
  1209. cchSrc++;
  1210. }
  1211. }
  1212. }
  1213. //
  1214. // Make sure the destination buffer was large enough.
  1215. //
  1216. if (cchDest && (cchSrc >= 0))
  1217. {
  1218. SetLastError(ERROR_INSUFFICIENT_BUFFER);
  1219. return (0);
  1220. }
  1221. //
  1222. // Return the number of UTF-8 characters written.
  1223. //
  1224. return (cchU8);
  1225. }