Source code of Windows XP (NT5)
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

1295 lines
36 KiB

  1. /*++
  2. Copyright (c) 1991-2000, Microsoft Corporation All rights reserved.
  3. Module Name:
  4. utf.c
  5. Abstract:
  6. This file contains functions that convert UTF strings to Unicode
  7. strings and Unicode string to UTF strings.
  8. External Routines found in this file:
  9. UTFCPInfo
  10. UTFToUnicode
  11. UnicodeToUTF
  12. Revision History:
  13. 02-06-96 JulieB Created.
  14. 03-20-99 SamerA Surrogate support.
  15. --*/
  16. //
  17. // Include Files.
  18. //
  19. #include "nls.h"
  20. #include "utf.h"
  21. //
  22. // Forward Declarations.
  23. //
  24. int
  25. UTF7ToUnicode(
  26. LPCSTR lpSrcStr,
  27. int cchSrc,
  28. LPWSTR lpDestStr,
  29. int cchDest);
  30. int
  31. UTF8ToUnicode(
  32. LPCSTR lpSrcStr,
  33. int cchSrc,
  34. LPWSTR lpDestStr,
  35. int cchDest,
  36. DWORD dwFlags);
  37. int
  38. UnicodeToUTF7(
  39. LPCWSTR lpSrcStr,
  40. int cchSrc,
  41. LPSTR lpDestStr,
  42. int cchDest);
  43. int
  44. UnicodeToUTF8(
  45. LPCWSTR lpSrcStr,
  46. int cchSrc,
  47. LPSTR lpDestStr,
  48. int cchDest);
  49. //-------------------------------------------------------------------------//
  50. // EXTERNAL ROUTINES //
  51. //-------------------------------------------------------------------------//
  52. ////////////////////////////////////////////////////////////////////////////
  53. //
  54. // UTFCPInfo
  55. //
  56. // Gets the CPInfo for the given UTF code page.
  57. //
  58. // 10-23-96 JulieB Created.
  59. ////////////////////////////////////////////////////////////////////////////
  60. BOOL UTFCPInfo(
  61. UINT CodePage,
  62. LPCPINFO lpCPInfo,
  63. BOOL fExVer)
  64. {
  65. int ctr;
  66. //
  67. // Invalid Parameter Check:
  68. // - validate code page
  69. // - lpCPInfo is NULL
  70. //
  71. if ( (CodePage < CP_UTF7) || (CodePage > CP_UTF8) ||
  72. (lpCPInfo == NULL) )
  73. {
  74. SetLastError(ERROR_INVALID_PARAMETER);
  75. return (0);
  76. }
  77. switch (CodePage)
  78. {
  79. case ( CP_UTF7 ) :
  80. {
  81. lpCPInfo->MaxCharSize = 5;
  82. break;
  83. }
  84. case ( CP_UTF8 ) :
  85. {
  86. lpCPInfo->MaxCharSize = 4;
  87. break;
  88. }
  89. }
  90. (lpCPInfo->DefaultChar)[0] = '?';
  91. (lpCPInfo->DefaultChar)[1] = (BYTE)0;
  92. for (ctr = 0; ctr < MAX_LEADBYTES; ctr++)
  93. {
  94. (lpCPInfo->LeadByte)[ctr] = (BYTE)0;
  95. }
  96. if (fExVer)
  97. {
  98. LPCPINFOEXW lpCPInfoEx = (LPCPINFOEXW)lpCPInfo;
  99. lpCPInfoEx->UnicodeDefaultChar = L'?';
  100. lpCPInfoEx->CodePage = CodePage;
  101. }
  102. return (TRUE);
  103. }
  104. ////////////////////////////////////////////////////////////////////////////
  105. //
  106. // UTFToUnicode
  107. //
  108. // Maps a UTF character string to its wide character string counterpart.
  109. //
  110. // 02-06-96 JulieB Created.
  111. ////////////////////////////////////////////////////////////////////////////
  112. int UTFToUnicode(
  113. UINT CodePage,
  114. DWORD dwFlags,
  115. LPCSTR lpMultiByteStr,
  116. int cbMultiByte,
  117. LPWSTR lpWideCharStr,
  118. int cchWideChar)
  119. {
  120. int rc = 0;
  121. //
  122. // Invalid Parameter Check:
  123. // - validate code page
  124. // - length of MB string is 0
  125. // - wide char buffer size is negative
  126. // - MB string is NULL
  127. // - length of WC string is NOT zero AND
  128. // (WC string is NULL OR src and dest pointers equal)
  129. //
  130. if ( (CodePage < CP_UTF7) || (CodePage > CP_UTF8) ||
  131. (cbMultiByte == 0) || (cchWideChar < 0) ||
  132. (lpMultiByteStr == NULL) ||
  133. ((cchWideChar != 0) &&
  134. ((lpWideCharStr == NULL) ||
  135. (lpMultiByteStr == (LPSTR)lpWideCharStr))) )
  136. {
  137. SetLastError(ERROR_INVALID_PARAMETER);
  138. return (0);
  139. }
  140. //
  141. // Invalid Flags Check:
  142. // - UTF7: flags not 0.
  143. // - UTF8: flags not 0 nor MB_ERR_INVALID_CHARS.
  144. //
  145. if (CodePage == CP_UTF8)
  146. {
  147. // UTF8
  148. if ((dwFlags & ~MB_ERR_INVALID_CHARS) != 0)
  149. {
  150. SetLastError(ERROR_INVALID_FLAGS);
  151. return (0);
  152. }
  153. }
  154. else if (dwFlags != 0)
  155. {
  156. // UTF7
  157. SetLastError(ERROR_INVALID_FLAGS);
  158. return (0);
  159. }
  160. //
  161. // If cbMultiByte is -1, then the string is null terminated and we
  162. // need to get the length of the string. Add one to the length to
  163. // include the null termination. (This will always be at least 1.)
  164. //
  165. if (cbMultiByte <= -1)
  166. {
  167. cbMultiByte = strlen(lpMultiByteStr) + 1;
  168. }
  169. switch (CodePage)
  170. {
  171. case ( CP_UTF7 ) :
  172. {
  173. rc = UTF7ToUnicode( lpMultiByteStr,
  174. cbMultiByte,
  175. lpWideCharStr,
  176. cchWideChar );
  177. break;
  178. }
  179. case ( CP_UTF8 ) :
  180. {
  181. rc = UTF8ToUnicode( lpMultiByteStr,
  182. cbMultiByte,
  183. lpWideCharStr,
  184. cchWideChar,
  185. dwFlags);
  186. break;
  187. }
  188. }
  189. return (rc);
  190. }
  191. ////////////////////////////////////////////////////////////////////////////
  192. //
  193. // UnicodeToUTF
  194. //
  195. // Maps a Unicode character string to its UTF string counterpart.
  196. //
  197. // 02-06-96 JulieB Created.
  198. ////////////////////////////////////////////////////////////////////////////
  199. int UnicodeToUTF(
  200. UINT CodePage,
  201. DWORD dwFlags,
  202. LPCWSTR lpWideCharStr,
  203. int cchWideChar,
  204. LPSTR lpMultiByteStr,
  205. int cbMultiByte,
  206. LPCSTR lpDefaultChar,
  207. LPBOOL lpUsedDefaultChar)
  208. {
  209. int rc = 0;
  210. //
  211. // Invalid Parameter Check:
  212. // - validate code page
  213. // - length of WC string is 0
  214. // - multibyte buffer size is negative
  215. // - WC string is NULL
  216. // - length of WC string is NOT zero AND
  217. // (MB string is NULL OR src and dest pointers equal)
  218. // - lpDefaultChar and lpUsedDefaultChar not NULL
  219. //
  220. if ( (CodePage < CP_UTF7) || (CodePage > CP_UTF8) ||
  221. (cchWideChar == 0) || (cbMultiByte < 0) ||
  222. (lpWideCharStr == NULL) ||
  223. ((cbMultiByte != 0) &&
  224. ((lpMultiByteStr == NULL) ||
  225. (lpWideCharStr == (LPWSTR)lpMultiByteStr))) ||
  226. (lpDefaultChar != NULL) || (lpUsedDefaultChar != NULL) )
  227. {
  228. SetLastError(ERROR_INVALID_PARAMETER);
  229. return (0);
  230. }
  231. //
  232. // Invalid Flags Check:
  233. // - flags not 0
  234. //
  235. if (dwFlags != 0)
  236. {
  237. SetLastError(ERROR_INVALID_FLAGS);
  238. return (0);
  239. }
  240. //
  241. // If cchWideChar is -1, then the string is null terminated and we
  242. // need to get the length of the string. Add one to the length to
  243. // include the null termination. (This will always be at least 1.)
  244. //
  245. if (cchWideChar <= -1)
  246. {
  247. cchWideChar = NlsStrLenW(lpWideCharStr) + 1;
  248. }
  249. switch (CodePage)
  250. {
  251. case ( CP_UTF7 ) :
  252. {
  253. rc = UnicodeToUTF7( lpWideCharStr,
  254. cchWideChar,
  255. lpMultiByteStr,
  256. cbMultiByte );
  257. break;
  258. }
  259. case ( CP_UTF8 ) :
  260. {
  261. rc = UnicodeToUTF8( lpWideCharStr,
  262. cchWideChar,
  263. lpMultiByteStr,
  264. cbMultiByte );
  265. break;
  266. }
  267. }
  268. return (rc);
  269. }
  270. //-------------------------------------------------------------------------//
  271. // INTERNAL ROUTINES //
  272. //-------------------------------------------------------------------------//
  273. ////////////////////////////////////////////////////////////////////////////
  274. //
  275. // UTF7ToUnicode
  276. //
  277. // Maps a UTF-7 character string to its wide character string counterpart.
  278. //
  279. // 02-06-96 JulieB Created.
  280. ////////////////////////////////////////////////////////////////////////////
  281. int UTF7ToUnicode(
  282. LPCSTR lpSrcStr,
  283. int cchSrc,
  284. LPWSTR lpDestStr,
  285. int cchDest)
  286. {
  287. //CHAR is signed, so we have to cast lpSrcStr to an unsigned char below.
  288. BYTE* pUTF7 = (BYTE*)lpSrcStr;
  289. BOOL fShift = FALSE;
  290. DWORD dwBit = 0; // 32-bit buffer to hold temporary bits
  291. int iPos = 0; // 6-bit position pointer in the buffer
  292. int cchWC = 0; // # of Unicode code points generated
  293. while ((cchSrc--) && ((cchDest == 0) || (cchWC < cchDest)))
  294. {
  295. if (*pUTF7 > ASCII)
  296. {
  297. //
  298. // Error - non ASCII char, so zero extend it.
  299. //
  300. if (cchDest)
  301. {
  302. lpDestStr[cchWC] = (WCHAR)*pUTF7;
  303. }
  304. cchWC++;
  305. // Terminate the shifted sequence.
  306. fShift = FALSE;
  307. }
  308. else if (!fShift)
  309. {
  310. //
  311. // Not in shifted sequence.
  312. //
  313. if (*pUTF7 == SHIFT_IN)
  314. {
  315. if (cchSrc && (pUTF7[1] == SHIFT_OUT))
  316. {
  317. //
  318. // "+-" means "+"
  319. //
  320. if (cchDest)
  321. {
  322. lpDestStr[cchWC] = (WCHAR)*pUTF7;
  323. }
  324. pUTF7++;
  325. cchSrc--;
  326. cchWC++;
  327. }
  328. else
  329. {
  330. //
  331. // Start a new shift sequence.
  332. //
  333. fShift = TRUE;
  334. }
  335. }
  336. else
  337. {
  338. //
  339. // No need to shift.
  340. //
  341. if (cchDest)
  342. {
  343. lpDestStr[cchWC] = (WCHAR)*pUTF7;
  344. }
  345. cchWC++;
  346. }
  347. }
  348. else
  349. {
  350. //
  351. // Already in shifted sequence.
  352. //
  353. if (nBitBase64[*pUTF7] == -1)
  354. {
  355. //
  356. // Any non Base64 char also ends shift state.
  357. //
  358. if (*pUTF7 != SHIFT_OUT)
  359. {
  360. //
  361. // Not "-", so write it to the buffer.
  362. //
  363. if (cchDest)
  364. {
  365. lpDestStr[cchWC] = (WCHAR)*pUTF7;
  366. }
  367. cchWC++;
  368. }
  369. //
  370. // Reset bits.
  371. //
  372. fShift = FALSE;
  373. dwBit = 0;
  374. iPos = 0;
  375. }
  376. else
  377. {
  378. //
  379. // Store the bits in the 6-bit buffer and adjust the
  380. // position pointer.
  381. //
  382. dwBit |= ((DWORD)nBitBase64[*pUTF7]) << (26 - iPos);
  383. iPos += 6;
  384. }
  385. //
  386. // Output the 16-bit Unicode value.
  387. //
  388. while (iPos >= 16)
  389. {
  390. if (cchDest)
  391. {
  392. if (cchWC < cchDest)
  393. {
  394. lpDestStr[cchWC] = (WCHAR)(dwBit >> 16);
  395. }
  396. else
  397. {
  398. break;
  399. }
  400. }
  401. cchWC++;
  402. dwBit <<= 16;
  403. iPos -= 16;
  404. }
  405. if (iPos >= 16)
  406. {
  407. //
  408. // Error - buffer too small.
  409. //
  410. cchSrc++;
  411. break;
  412. }
  413. }
  414. pUTF7++;
  415. }
  416. //
  417. // Make sure the destination buffer was large enough.
  418. //
  419. if (cchDest && (cchSrc >= 0))
  420. {
  421. if (cchSrc == 0 && fShift && *(pUTF7--) == SHIFT_OUT)
  422. {
  423. //
  424. // Do nothing here.
  425. // If we are in shift-in mode previously, and the last byte is a shift-out byte ('-'),
  426. // we should absorb this byte. So don't set error.
  427. //
  428. } else
  429. {
  430. SetLastError(ERROR_INSUFFICIENT_BUFFER);
  431. return (0);
  432. }
  433. }
  434. //
  435. // Return the number of Unicode characters written.
  436. //
  437. return (cchWC);
  438. }
  439. ////////////////////////////////////////////////////////////////////////////
  440. //
  441. // UTF8ToUnicode
  442. //
  443. // Maps a UTF-8 character string to its wide character string counterpart.
  444. //
  445. // 02-06-96 JulieB Created.
  446. ////////////////////////////////////////////////////////////////////////////
  447. int UTF8ToUnicode(
  448. LPCSTR lpSrcStr,
  449. int cchSrc,
  450. LPWSTR lpDestStr,
  451. int cchDest,
  452. DWORD dwFlags
  453. )
  454. {
  455. int nTB = 0; // # trail bytes to follow
  456. int cchWC = 0; // # of Unicode code points generated
  457. CONST BYTE* pUTF8 = (CONST BYTE*)lpSrcStr;
  458. DWORD dwSurrogateChar; // Full surrogate char
  459. BOOL bSurrogatePair = FALSE; // Indicate we'r collecting a surrogate pair
  460. BOOL bCheckInvalidBytes = (dwFlags & MB_ERR_INVALID_CHARS);
  461. BYTE UTF8;
  462. while ((cchSrc--) && ((cchDest == 0) || (cchWC < cchDest)))
  463. {
  464. //
  465. // See if there are any trail bytes.
  466. //
  467. if (BIT7(*pUTF8) == 0)
  468. {
  469. //
  470. // Found ASCII.
  471. //
  472. if (cchDest)
  473. {
  474. lpDestStr[cchWC] = (WCHAR)*pUTF8;
  475. }
  476. nTB = bSurrogatePair = 0;
  477. cchWC++;
  478. }
  479. else if (BIT6(*pUTF8) == 0)
  480. {
  481. //
  482. // Found a trail byte.
  483. // Note : Ignore the trail byte if there was no lead byte.
  484. //
  485. if (nTB != 0)
  486. {
  487. //
  488. // Decrement the trail byte counter.
  489. //
  490. nTB--;
  491. if (bSurrogatePair)
  492. {
  493. dwSurrogateChar <<= 6;
  494. dwSurrogateChar |= LOWER_6_BIT(*pUTF8);
  495. if (nTB == 0)
  496. {
  497. if (cchDest)
  498. {
  499. if ((cchWC + 1) < cchDest)
  500. {
  501. lpDestStr[cchWC] = (WCHAR)
  502. (((dwSurrogateChar - 0x10000) >> 10) + HIGH_SURROGATE_START);
  503. lpDestStr[cchWC+1] = (WCHAR)
  504. ((dwSurrogateChar - 0x10000)%0x400 + LOW_SURROGATE_START);
  505. }
  506. else
  507. {
  508. // Error : Buffer too small
  509. cchSrc++;
  510. break;
  511. }
  512. }
  513. cchWC += 2;
  514. bSurrogatePair = FALSE;
  515. }
  516. }
  517. else
  518. {
  519. //
  520. // Make room for the trail byte and add the trail byte
  521. // value.
  522. //
  523. if (cchDest)
  524. {
  525. lpDestStr[cchWC] <<= 6;
  526. lpDestStr[cchWC] |= LOWER_6_BIT(*pUTF8);
  527. }
  528. if (nTB == 0)
  529. {
  530. //
  531. // End of sequence. Advance the output counter.
  532. //
  533. cchWC++;
  534. }
  535. }
  536. }
  537. else
  538. {
  539. if (bCheckInvalidBytes)
  540. {
  541. SetLastError(ERROR_NO_UNICODE_TRANSLATION);
  542. return (0);
  543. }
  544. // error - not expecting a trail byte. That is, there is a trailing byte without leading byte.
  545. bSurrogatePair = FALSE;
  546. }
  547. }
  548. else
  549. {
  550. //
  551. // Found a lead byte.
  552. //
  553. if (nTB > 0)
  554. {
  555. // error - A leading byte before the previous sequence is completed.
  556. if (bCheckInvalidBytes)
  557. {
  558. SetLastError(ERROR_NO_UNICODE_TRANSLATION);
  559. return (0);
  560. }
  561. //
  562. // Error - previous sequence not finished.
  563. //
  564. nTB = 0;
  565. bSurrogatePair = FALSE;
  566. // Put this character back so that we can start over another sequence.
  567. cchSrc++;
  568. pUTF8--;
  569. }
  570. else
  571. {
  572. //
  573. // Calculate the number of bytes to follow.
  574. // Look for the first 0 from left to right.
  575. //
  576. UTF8 = *pUTF8;
  577. while (BIT7(UTF8) != 0)
  578. {
  579. UTF8 <<= 1;
  580. nTB++;
  581. }
  582. //
  583. // Check for non-shortest form.
  584. //
  585. switch (nTB) {
  586. case 1:
  587. nTB = 0;
  588. break;
  589. case 2:
  590. // Make sure that bit 8 ~ bit 11 is not all zero.
  591. // 110XXXXx 10xxxxxx
  592. if ((*pUTF8 & 0x1e) == 0)
  593. {
  594. nTB = 0;
  595. }
  596. break;
  597. case 3:
  598. // Look ahead to check for non-shortest form.
  599. // 1110XXXX 10Xxxxxx 10xxxxxx
  600. if (cchSrc >= 2)
  601. {
  602. if (((*pUTF8 & 0x0f) == 0) && (*(pUTF8 + 1) & 0x20) == 0)
  603. {
  604. nTB = 0;
  605. }
  606. }
  607. break;
  608. case 4:
  609. //
  610. // This is a surrogate unicode pair
  611. //
  612. if (cchSrc >= 3)
  613. {
  614. WORD word = (((WORD)*pUTF8) << 8) | *(pUTF8 + 1);
  615. // Look ahead to check for non-shortest form.
  616. // 11110XXX 10XXxxxx 10xxxxxx 10xxxxxx
  617. // Check for the 5 bits are not all zero.
  618. // 0x0730 == 00000111 11000000
  619. if ((word & 0x0730) == 0)
  620. {
  621. nTB = 0;
  622. } else if ((word & 0x0400) == 0x0400)
  623. {
  624. // The 21st bit is 1.
  625. // Make sure that the resulting Unicode is within the valid surrogate range.
  626. // The 4 byte code sequence can hold up to 21 bits, and the maximum valid code point ragne
  627. // that Unicode (with surrogate) could represent are from U+000000 ~ U+10FFFF.
  628. // Therefore, if the 21 bit (the most significant bit) is 1, we should verify that the 17 ~ 20
  629. // bit are all zero.
  630. // I.e., in 11110XXX 10XXxxxx 10xxxxxx 10xxxxxx,
  631. // XXXXX can only be 10000.
  632. // 0x0330 = 0000 0011 0011 0000
  633. if ((word & 0x0330) != 0)
  634. {
  635. nTB = 0;
  636. }
  637. }
  638. if (nTB != 0)
  639. {
  640. dwSurrogateChar = UTF8 >> nTB;
  641. bSurrogatePair = TRUE;
  642. }
  643. }
  644. break;
  645. default:
  646. //
  647. // If the bits is greater than 4, this is an invalid
  648. // UTF8 lead byte.
  649. //
  650. nTB = 0;
  651. break;
  652. }
  653. if (nTB != 0)
  654. {
  655. //
  656. // Store the value from the first byte and decrement
  657. // the number of bytes to follow.
  658. //
  659. if (cchDest)
  660. {
  661. lpDestStr[cchWC] = UTF8 >> nTB;
  662. }
  663. nTB--;
  664. } else
  665. {
  666. if (bCheckInvalidBytes)
  667. {
  668. SetLastError(ERROR_NO_UNICODE_TRANSLATION);
  669. return (0);
  670. }
  671. }
  672. }
  673. }
  674. pUTF8++;
  675. }
  676. if ((bCheckInvalidBytes && nTB != 0) || (cchWC == 0))
  677. {
  678. // About (cchWC == 0):
  679. // Because we now throw away non-shortest form, it is possible that we generate 0 chars.
  680. // In this case, we have to set error to ERROR_NO_UNICODE_TRANSLATION so that we conform
  681. // to the spec of MultiByteToWideChar.
  682. SetLastError(ERROR_NO_UNICODE_TRANSLATION);
  683. return (0);
  684. }
  685. //
  686. // Make sure the destination buffer was large enough.
  687. //
  688. if (cchDest && (cchSrc >= 0))
  689. {
  690. SetLastError(ERROR_INSUFFICIENT_BUFFER);
  691. return (0);
  692. }
  693. //
  694. // Return the number of Unicode characters written.
  695. //
  696. return (cchWC);
  697. }
  698. ////////////////////////////////////////////////////////////////////////////
  699. //
  700. // UnicodeToUTF7
  701. //
  702. // Maps a Unicode character string to its UTF-7 string counterpart.
  703. //
  704. // 02-06-96 JulieB Created.
  705. ////////////////////////////////////////////////////////////////////////////
  706. int UnicodeToUTF7(
  707. LPCWSTR lpSrcStr,
  708. int cchSrc,
  709. LPSTR lpDestStr,
  710. int cchDest)
  711. {
  712. LPCWSTR lpWC = lpSrcStr;
  713. BOOL fShift = FALSE;
  714. DWORD dwBit = 0; // 32-bit buffer
  715. int iPos = 0; // 6-bit position in buffer
  716. int cchU7 = 0; // # of UTF7 chars generated
  717. while ((cchSrc--) && ((cchDest == 0) || (cchU7 < cchDest)))
  718. {
  719. if ((*lpWC > ASCII) || (fShiftChar[*lpWC]))
  720. {
  721. //
  722. // Need shift. Store 16 bits in buffer.
  723. //
  724. dwBit |= ((DWORD)*lpWC) << (16 - iPos);
  725. iPos += 16;
  726. if (!fShift)
  727. {
  728. //
  729. // Not in shift state, so add "+".
  730. //
  731. if (cchDest)
  732. {
  733. lpDestStr[cchU7] = SHIFT_IN;
  734. }
  735. cchU7++;
  736. //
  737. // Go into shift state.
  738. //
  739. fShift = TRUE;
  740. }
  741. //
  742. // Output 6 bits at a time as Base64 chars.
  743. //
  744. while (iPos >= 6)
  745. {
  746. if (cchDest)
  747. {
  748. if (cchU7 < cchDest)
  749. {
  750. //
  751. // 26 = 32 - 6
  752. //
  753. lpDestStr[cchU7] = cBase64[(int)(dwBit >> 26)];
  754. }
  755. else
  756. {
  757. break;
  758. }
  759. }
  760. cchU7++;
  761. dwBit <<= 6; // remove from bit buffer
  762. iPos -= 6; // adjust position pointer
  763. }
  764. if (iPos >= 6)
  765. {
  766. //
  767. // Error - buffer too small.
  768. //
  769. cchSrc++;
  770. break;
  771. }
  772. }
  773. else
  774. {
  775. //
  776. // No need to shift.
  777. //
  778. if (fShift)
  779. {
  780. //
  781. // End the shift sequence.
  782. //
  783. fShift = FALSE;
  784. if (iPos != 0)
  785. {
  786. //
  787. // Some bits left in dwBit.
  788. //
  789. if (cchDest)
  790. {
  791. if ((cchU7 + 1) < cchDest)
  792. {
  793. lpDestStr[cchU7++] = cBase64[(int)(dwBit >> 26)];
  794. lpDestStr[cchU7++] = SHIFT_OUT;
  795. }
  796. else
  797. {
  798. //
  799. // Error - buffer too small.
  800. //
  801. cchSrc++;
  802. break;
  803. }
  804. }
  805. else
  806. {
  807. cchU7 += 2;
  808. }
  809. dwBit = 0; // reset bit buffer
  810. iPos = 0; // reset postion pointer
  811. }
  812. else
  813. {
  814. //
  815. // Simply end the shift sequence.
  816. //
  817. if (cchDest)
  818. {
  819. lpDestStr[cchU7++] = SHIFT_OUT;
  820. }
  821. else
  822. {
  823. cchU7++;
  824. }
  825. }
  826. }
  827. //
  828. // Write the character to the buffer.
  829. // If the character is "+", then write "+-".
  830. //
  831. if (cchDest)
  832. {
  833. if (cchU7 < cchDest)
  834. {
  835. lpDestStr[cchU7++] = (char)*lpWC;
  836. if (*lpWC == SHIFT_IN)
  837. {
  838. if (cchU7 < cchDest)
  839. {
  840. lpDestStr[cchU7++] = SHIFT_OUT;
  841. }
  842. else
  843. {
  844. //
  845. // Error - buffer too small.
  846. //
  847. cchSrc++;
  848. break;
  849. }
  850. }
  851. }
  852. else
  853. {
  854. //
  855. // Error - buffer too small.
  856. //
  857. cchSrc++;
  858. break;
  859. }
  860. }
  861. else
  862. {
  863. cchU7++;
  864. if (*lpWC == SHIFT_IN)
  865. {
  866. cchU7++;
  867. }
  868. }
  869. }
  870. lpWC++;
  871. }
  872. //
  873. // See if we're still in the shift state.
  874. //
  875. if (fShift)
  876. {
  877. if (iPos != 0)
  878. {
  879. //
  880. // Some bits left in dwBit.
  881. //
  882. if (cchDest)
  883. {
  884. if ((cchU7 + 1) < cchDest)
  885. {
  886. lpDestStr[cchU7++] = cBase64[(int)(dwBit >> 26)];
  887. lpDestStr[cchU7++] = SHIFT_OUT;
  888. }
  889. else
  890. {
  891. //
  892. // Error - buffer too small.
  893. //
  894. cchSrc++;
  895. }
  896. }
  897. else
  898. {
  899. cchU7 += 2;
  900. }
  901. }
  902. else
  903. {
  904. //
  905. // Simply end the shift sequence.
  906. //
  907. if (cchDest)
  908. {
  909. if (cchU7 < cchDest)
  910. {
  911. lpDestStr[cchU7++] = SHIFT_OUT;
  912. }
  913. else
  914. {
  915. //
  916. // Error - buffer too small.
  917. //
  918. cchSrc++;
  919. }
  920. }
  921. else
  922. {
  923. cchU7++;
  924. }
  925. }
  926. }
  927. //
  928. // Make sure the destination buffer was large enough.
  929. //
  930. if (cchDest && (cchSrc >= 0))
  931. {
  932. SetLastError(ERROR_INSUFFICIENT_BUFFER);
  933. return (0);
  934. }
  935. //
  936. // Return the number of UTF-7 characters written.
  937. //
  938. return (cchU7);
  939. }
  940. ////////////////////////////////////////////////////////////////////////////
  941. //
  942. // UnicodeToUTF8
  943. //
  944. // Maps a Unicode character string to its UTF-8 string counterpart.
  945. //
  946. // 02-06-96 JulieB Created.
  947. ////////////////////////////////////////////////////////////////////////////
  948. int UnicodeToUTF8(
  949. LPCWSTR lpSrcStr,
  950. int cchSrc,
  951. LPSTR lpDestStr,
  952. int cchDest)
  953. {
  954. LPCWSTR lpWC = lpSrcStr;
  955. int cchU8 = 0; // # of UTF8 chars generated
  956. DWORD dwSurrogateChar;
  957. WCHAR wchHighSurrogate = 0;
  958. BOOL bHandled;
  959. while ((cchSrc--) && ((cchDest == 0) || (cchU8 < cchDest)))
  960. {
  961. bHandled = FALSE;
  962. //
  963. // Check if high surrogate is available
  964. //
  965. if ((*lpWC >= HIGH_SURROGATE_START) && (*lpWC <= HIGH_SURROGATE_END))
  966. {
  967. if (cchDest)
  968. {
  969. // Another high surrogate, then treat the 1st as normal
  970. // Unicode character.
  971. if (wchHighSurrogate)
  972. {
  973. if ((cchU8 + 2) < cchDest)
  974. {
  975. lpDestStr[cchU8++] = UTF8_1ST_OF_3 | HIGHER_6_BIT(wchHighSurrogate);
  976. lpDestStr[cchU8++] = UTF8_TRAIL | MIDDLE_6_BIT(wchHighSurrogate);
  977. lpDestStr[cchU8++] = UTF8_TRAIL | LOWER_6_BIT(wchHighSurrogate);
  978. }
  979. else
  980. {
  981. // not enough buffer
  982. cchSrc++;
  983. break;
  984. }
  985. }
  986. }
  987. else
  988. {
  989. cchU8 += 3;
  990. }
  991. wchHighSurrogate = *lpWC;
  992. bHandled = TRUE;
  993. }
  994. if (!bHandled && wchHighSurrogate)
  995. {
  996. if ((*lpWC >= LOW_SURROGATE_START) && (*lpWC <= LOW_SURROGATE_END))
  997. {
  998. // wheee, valid surrogate pairs
  999. if (cchDest)
  1000. {
  1001. if ((cchU8 + 3) < cchDest)
  1002. {
  1003. dwSurrogateChar = (((wchHighSurrogate-0xD800) << 10) + (*lpWC - 0xDC00) + 0x10000);
  1004. lpDestStr[cchU8++] = (UTF8_1ST_OF_4 |
  1005. (unsigned char)(dwSurrogateChar >> 18)); // 3 bits from 1st byte
  1006. lpDestStr[cchU8++] = (UTF8_TRAIL |
  1007. (unsigned char)((dwSurrogateChar >> 12) & 0x3f)); // 6 bits from 2nd byte
  1008. lpDestStr[cchU8++] = (UTF8_TRAIL |
  1009. (unsigned char)((dwSurrogateChar >> 6) & 0x3f)); // 6 bits from 3rd byte
  1010. lpDestStr[cchU8++] = (UTF8_TRAIL |
  1011. (unsigned char)(0x3f & dwSurrogateChar)); // 6 bits from 4th byte
  1012. }
  1013. else
  1014. {
  1015. // not enough buffer
  1016. cchSrc++;
  1017. break;
  1018. }
  1019. }
  1020. else
  1021. {
  1022. // we already counted 3 previously (in high surrogate)
  1023. cchU8 ++;
  1024. }
  1025. bHandled = TRUE;
  1026. }
  1027. else
  1028. {
  1029. // Bad Surrogate pair : ERROR
  1030. // Just process wchHighSurrogate , and the code below will
  1031. // process the current code point
  1032. if (cchDest)
  1033. {
  1034. if ((cchU8 + 2) < cchDest)
  1035. {
  1036. lpDestStr[cchU8++] = UTF8_1ST_OF_3 | HIGHER_6_BIT(wchHighSurrogate);
  1037. lpDestStr[cchU8++] = UTF8_TRAIL | MIDDLE_6_BIT(wchHighSurrogate);
  1038. lpDestStr[cchU8++] = UTF8_TRAIL | LOWER_6_BIT(wchHighSurrogate);
  1039. }
  1040. else
  1041. {
  1042. // not enough buffer
  1043. cchSrc++;
  1044. break;
  1045. }
  1046. }
  1047. }
  1048. wchHighSurrogate = 0;
  1049. }
  1050. if (!bHandled)
  1051. {
  1052. if (*lpWC <= ASCII)
  1053. {
  1054. //
  1055. // Found ASCII.
  1056. //
  1057. if (cchDest)
  1058. {
  1059. if (cchU8 < cchDest)
  1060. {
  1061. lpDestStr[cchU8] = (char)*lpWC;
  1062. }
  1063. else
  1064. {
  1065. //
  1066. // Error - buffer too small.
  1067. //
  1068. cchSrc++;
  1069. break;
  1070. }
  1071. }
  1072. cchU8++;
  1073. }
  1074. else if (*lpWC <= UTF8_2_MAX)
  1075. {
  1076. //
  1077. // Found 2 byte sequence if < 0x07ff (11 bits).
  1078. //
  1079. if (cchDest)
  1080. {
  1081. if ((cchU8 + 1) < cchDest)
  1082. {
  1083. //
  1084. // Use upper 5 bits in first byte.
  1085. // Use lower 6 bits in second byte.
  1086. //
  1087. lpDestStr[cchU8++] = UTF8_1ST_OF_2 | (*lpWC >> 6);
  1088. lpDestStr[cchU8++] = UTF8_TRAIL | LOWER_6_BIT(*lpWC);
  1089. }
  1090. else
  1091. {
  1092. //
  1093. // Error - buffer too small.
  1094. //
  1095. cchSrc++;
  1096. break;
  1097. }
  1098. }
  1099. else
  1100. {
  1101. cchU8 += 2;
  1102. }
  1103. }
  1104. else
  1105. {
  1106. //
  1107. // Found 3 byte sequence.
  1108. //
  1109. if (cchDest)
  1110. {
  1111. if ((cchU8 + 2) < cchDest)
  1112. {
  1113. //
  1114. // Use upper 4 bits in first byte.
  1115. // Use middle 6 bits in second byte.
  1116. // Use lower 6 bits in third byte.
  1117. //
  1118. lpDestStr[cchU8++] = UTF8_1ST_OF_3 | HIGHER_6_BIT(*lpWC);
  1119. lpDestStr[cchU8++] = UTF8_TRAIL | MIDDLE_6_BIT(*lpWC);
  1120. lpDestStr[cchU8++] = UTF8_TRAIL | LOWER_6_BIT(*lpWC);
  1121. }
  1122. else
  1123. {
  1124. //
  1125. // Error - buffer too small.
  1126. //
  1127. cchSrc++;
  1128. break;
  1129. }
  1130. }
  1131. else
  1132. {
  1133. cchU8 += 3;
  1134. }
  1135. }
  1136. }
  1137. lpWC++;
  1138. }
  1139. //
  1140. // If the last character was a high surrogate, then handle it as a normal
  1141. // unicode character.
  1142. //
  1143. if ((cchSrc < 0) && (wchHighSurrogate != 0))
  1144. {
  1145. if (cchDest)
  1146. {
  1147. if ((cchU8 + 2) < cchDest)
  1148. {
  1149. lpDestStr[cchU8++] = UTF8_1ST_OF_3 | HIGHER_6_BIT(wchHighSurrogate);
  1150. lpDestStr[cchU8++] = UTF8_TRAIL | MIDDLE_6_BIT(wchHighSurrogate);
  1151. lpDestStr[cchU8++] = UTF8_TRAIL | LOWER_6_BIT(wchHighSurrogate);
  1152. }
  1153. else
  1154. {
  1155. cchSrc++;
  1156. }
  1157. }
  1158. }
  1159. //
  1160. // Make sure the destination buffer was large enough.
  1161. //
  1162. if (cchDest && (cchSrc >= 0))
  1163. {
  1164. SetLastError(ERROR_INSUFFICIENT_BUFFER);
  1165. return (0);
  1166. }
  1167. //
  1168. // Return the number of UTF-8 characters written.
  1169. //
  1170. return (cchU8);
  1171. }