Leaked source code of Windows Server 2003
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

1299 lines
38 KiB

  1. /*++
  2. Copyright (c) 1991-2000, Microsoft Corporation All rights reserved.
  3. Module Name:
  4. utf.c
  5. Abstract:
  6. This file contains functions that convert UTF strings to Unicode
  7. strings and Unicode string to UTF strings.
  8. External Routines found in this file:
  9. UTFCPInfo
  10. UTFToUnicode
  11. UnicodeToUTF
  12. Revision History:
  13. 02-06-96 JulieB Created.
  14. 03-20-99 SamerA Surrogate support.
  15. --*/
  16. //
  17. // Include Files.
  18. //
  19. #include "nls.h"
  20. #include "nlssafe.h"
  21. #include "utf.h"
  22. //
  23. // Forward Declarations.
  24. //
  25. int
  26. UTF7ToUnicode(
  27. LPCSTR lpSrcStr,
  28. int cchSrc,
  29. LPWSTR lpDestStr,
  30. int cchDest);
  31. int
  32. UTF8ToUnicode(
  33. LPCSTR lpSrcStr,
  34. int cchSrc,
  35. LPWSTR lpDestStr,
  36. int cchDest,
  37. DWORD dwFlags);
  38. int
  39. UnicodeToUTF7(
  40. LPCWSTR lpSrcStr,
  41. int cchSrc,
  42. LPSTR lpDestStr,
  43. int cchDest);
  44. int
  45. UnicodeToUTF8(
  46. LPCWSTR lpSrcStr,
  47. int cchSrc,
  48. LPSTR lpDestStr,
  49. int cchDest);
  50. //-------------------------------------------------------------------------//
  51. // EXTERNAL ROUTINES //
  52. //-------------------------------------------------------------------------//
  53. ////////////////////////////////////////////////////////////////////////////
  54. //
  55. // UTFCPInfo
  56. //
  57. // Gets the CPInfo for the given UTF code page.
  58. //
  59. // 10-23-96 JulieB Created.
  60. ////////////////////////////////////////////////////////////////////////////
  61. BOOL UTFCPInfo(
  62. UINT CodePage,
  63. LPCPINFO lpCPInfo,
  64. BOOL fExVer)
  65. {
  66. int ctr;
  67. //
  68. // Invalid Parameter Check:
  69. // - validate code page
  70. // - lpCPInfo is NULL
  71. //
  72. if ( (CodePage < CP_UTF7) || (CodePage > CP_UTF8) ||
  73. (lpCPInfo == NULL) )
  74. {
  75. SetLastError(ERROR_INVALID_PARAMETER);
  76. return (0);
  77. }
  78. switch (CodePage)
  79. {
  80. case ( CP_UTF7 ) :
  81. {
  82. lpCPInfo->MaxCharSize = 5;
  83. break;
  84. }
  85. case ( CP_UTF8 ) :
  86. {
  87. lpCPInfo->MaxCharSize = 4;
  88. break;
  89. }
  90. }
  91. (lpCPInfo->DefaultChar)[0] = '?';
  92. (lpCPInfo->DefaultChar)[1] = (BYTE)0;
  93. for (ctr = 0; ctr < MAX_LEADBYTES; ctr++)
  94. {
  95. (lpCPInfo->LeadByte)[ctr] = (BYTE)0;
  96. }
  97. if (fExVer)
  98. {
  99. LPCPINFOEXW lpCPInfoEx = (LPCPINFOEXW)lpCPInfo;
  100. lpCPInfoEx->UnicodeDefaultChar = L'?';
  101. lpCPInfoEx->CodePage = CodePage;
  102. }
  103. return (TRUE);
  104. }
  105. ////////////////////////////////////////////////////////////////////////////
  106. //
  107. // UTFToUnicode
  108. //
  109. // Maps a UTF character string to its wide character string counterpart.
  110. //
  111. // 02-06-96 JulieB Created.
  112. ////////////////////////////////////////////////////////////////////////////
  113. int UTFToUnicode(
  114. UINT CodePage,
  115. DWORD dwFlags,
  116. LPCSTR lpMultiByteStr,
  117. int cbMultiByte,
  118. LPWSTR lpWideCharStr,
  119. int cchWideChar)
  120. {
  121. int rc = 0;
  122. //
  123. // Invalid Parameter Check:
  124. // - validate code page
  125. // - length of MB string is 0
  126. // - wide char buffer size is negative
  127. // - MB string is NULL
  128. // - length of WC string is NOT zero AND
  129. // (WC string is NULL OR src and dest pointers equal)
  130. //
  131. if ( (CodePage < CP_UTF7) || (CodePage > CP_UTF8) ||
  132. (cbMultiByte == 0) || (cchWideChar < 0) ||
  133. (lpMultiByteStr == NULL) ||
  134. ((cchWideChar != 0) &&
  135. ((lpWideCharStr == NULL) ||
  136. (lpMultiByteStr == (LPSTR)lpWideCharStr))) )
  137. {
  138. SetLastError(ERROR_INVALID_PARAMETER);
  139. return (0);
  140. }
  141. //
  142. // Invalid Flags Check:
  143. // - UTF7: flags not 0.
  144. // - UTF8: flags not 0 nor MB_ERR_INVALID_CHARS.
  145. //
  146. if (CodePage == CP_UTF8)
  147. {
  148. // UTF8
  149. if ((dwFlags & ~MB_ERR_INVALID_CHARS) != 0)
  150. {
  151. SetLastError(ERROR_INVALID_FLAGS);
  152. return (0);
  153. }
  154. }
  155. else if (dwFlags != 0)
  156. {
  157. // UTF7
  158. SetLastError(ERROR_INVALID_FLAGS);
  159. return (0);
  160. }
  161. //
  162. // If cbMultiByte is -1, then the string is null terminated and we
  163. // need to get the length of the string. Add one to the length to
  164. // include the null termination. (This will always be at least 1.)
  165. //
  166. if (cbMultiByte <= -1)
  167. {
  168. cbMultiByte = strlen(lpMultiByteStr) + 1;
  169. }
  170. switch (CodePage)
  171. {
  172. case ( CP_UTF7 ) :
  173. {
  174. rc = UTF7ToUnicode( lpMultiByteStr,
  175. cbMultiByte,
  176. lpWideCharStr,
  177. cchWideChar );
  178. break;
  179. }
  180. case ( CP_UTF8 ) :
  181. {
  182. rc = UTF8ToUnicode( lpMultiByteStr,
  183. cbMultiByte,
  184. lpWideCharStr,
  185. cchWideChar,
  186. dwFlags);
  187. break;
  188. }
  189. }
  190. return (rc);
  191. }
  192. ////////////////////////////////////////////////////////////////////////////
  193. //
  194. // UnicodeToUTF
  195. //
  196. // Maps a Unicode character string to its UTF string counterpart.
  197. //
  198. // 02-06-96 JulieB Created.
  199. ////////////////////////////////////////////////////////////////////////////
  200. int UnicodeToUTF(
  201. UINT CodePage,
  202. DWORD dwFlags,
  203. LPCWSTR lpWideCharStr,
  204. int cchWideChar,
  205. LPSTR lpMultiByteStr,
  206. int cbMultiByte,
  207. LPCSTR lpDefaultChar,
  208. LPBOOL lpUsedDefaultChar)
  209. {
  210. int rc = 0;
  211. //
  212. // Invalid Parameter Check:
  213. // - validate code page
  214. // - length of WC string is 0
  215. // - multibyte buffer size is negative
  216. // - WC string is NULL
  217. // - length of WC string is NOT zero AND
  218. // (MB string is NULL OR src and dest pointers equal)
  219. // - lpDefaultChar and lpUsedDefaultChar not NULL
  220. //
  221. if ( (CodePage < CP_UTF7) || (CodePage > CP_UTF8) ||
  222. (cchWideChar == 0) || (cbMultiByte < 0) ||
  223. (lpWideCharStr == NULL) ||
  224. ((cbMultiByte != 0) &&
  225. ((lpMultiByteStr == NULL) ||
  226. (lpWideCharStr == (LPWSTR)lpMultiByteStr))) ||
  227. (lpDefaultChar != NULL) || (lpUsedDefaultChar != NULL) )
  228. {
  229. SetLastError(ERROR_INVALID_PARAMETER);
  230. return (0);
  231. }
  232. //
  233. // Invalid Flags Check:
  234. // - flags not 0
  235. //
  236. if (dwFlags != 0)
  237. {
  238. SetLastError(ERROR_INVALID_FLAGS);
  239. return (0);
  240. }
  241. //
  242. // If cchWideChar is -1, then the string is null terminated and we
  243. // need to get the length of the string. Add one to the length to
  244. // include the null termination. (This will always be at least 1.)
  245. //
  246. if (cchWideChar <= -1)
  247. {
  248. cchWideChar = NlsStrLenW(lpWideCharStr) + 1;
  249. }
  250. switch (CodePage)
  251. {
  252. case ( CP_UTF7 ) :
  253. {
  254. rc = UnicodeToUTF7( lpWideCharStr,
  255. cchWideChar,
  256. lpMultiByteStr,
  257. cbMultiByte );
  258. break;
  259. }
  260. case ( CP_UTF8 ) :
  261. {
  262. rc = UnicodeToUTF8( lpWideCharStr,
  263. cchWideChar,
  264. lpMultiByteStr,
  265. cbMultiByte );
  266. break;
  267. }
  268. }
  269. return (rc);
  270. }
  271. //-------------------------------------------------------------------------//
  272. // INTERNAL ROUTINES //
  273. //-------------------------------------------------------------------------//
  274. ////////////////////////////////////////////////////////////////////////////
  275. //
  276. // UTF7ToUnicode
  277. //
  278. // Maps a UTF-7 character string to its wide character string counterpart.
  279. //
  280. // 02-06-96 JulieB Created.
  281. ////////////////////////////////////////////////////////////////////////////
  282. int UTF7ToUnicode(
  283. LPCSTR lpSrcStr,
  284. int cchSrc,
  285. LPWSTR lpDestStr,
  286. int cchDest)
  287. {
  288. //CHAR is signed, so we have to cast lpSrcStr to an unsigned char below.
  289. BYTE* pUTF7 = (BYTE*)lpSrcStr;
  290. BOOL fShift = FALSE;
  291. DWORD dwBit = 0; // 32-bit buffer to hold temporary bits
  292. int iPos = 0; // 6-bit position pointer in the buffer
  293. int cchWC = 0; // # of Unicode code points generated
  294. while ((cchSrc--) && ((cchDest == 0) || (cchWC < cchDest)))
  295. {
  296. if (*pUTF7 > ASCII)
  297. {
  298. //
  299. // Error - non ASCII char, so zero extend it.
  300. //
  301. if (cchDest)
  302. {
  303. lpDestStr[cchWC] = (WCHAR)*pUTF7;
  304. }
  305. cchWC++;
  306. // Terminate the shifted sequence.
  307. fShift = FALSE;
  308. }
  309. else if (!fShift)
  310. {
  311. //
  312. // Not in shifted sequence.
  313. //
  314. if (*pUTF7 == SHIFT_IN)
  315. {
  316. if (cchSrc && (pUTF7[1] == SHIFT_OUT))
  317. {
  318. //
  319. // "+-" means "+"
  320. //
  321. if (cchDest)
  322. {
  323. lpDestStr[cchWC] = (WCHAR)*pUTF7;
  324. }
  325. pUTF7++;
  326. cchSrc--;
  327. cchWC++;
  328. }
  329. else
  330. {
  331. //
  332. // Start a new shift sequence.
  333. //
  334. fShift = TRUE;
  335. }
  336. }
  337. else
  338. {
  339. //
  340. // No need to shift.
  341. //
  342. if (cchDest)
  343. {
  344. lpDestStr[cchWC] = (WCHAR)*pUTF7;
  345. }
  346. cchWC++;
  347. }
  348. }
  349. else
  350. {
  351. //
  352. // Already in shifted sequence.
  353. //
  354. if (nBitBase64[*pUTF7] == -1)
  355. {
  356. //
  357. // Any non Base64 char also ends shift state.
  358. //
  359. if (*pUTF7 != SHIFT_OUT)
  360. {
  361. //
  362. // Not "-", so write it to the buffer.
  363. //
  364. if (cchDest)
  365. {
  366. lpDestStr[cchWC] = (WCHAR)*pUTF7;
  367. }
  368. cchWC++;
  369. }
  370. //
  371. // Reset bits.
  372. //
  373. fShift = FALSE;
  374. dwBit = 0;
  375. iPos = 0;
  376. }
  377. else
  378. {
  379. //
  380. // Store the bits in the 6-bit buffer and adjust the
  381. // position pointer.
  382. //
  383. dwBit |= ((DWORD)nBitBase64[*pUTF7]) << (26 - iPos);
  384. iPos += 6;
  385. }
  386. //
  387. // Output the 16-bit Unicode value.
  388. //
  389. while (iPos >= 16)
  390. {
  391. if (cchDest)
  392. {
  393. if (cchWC < cchDest)
  394. {
  395. lpDestStr[cchWC] = (WCHAR)(dwBit >> 16);
  396. }
  397. else
  398. {
  399. break;
  400. }
  401. }
  402. cchWC++;
  403. dwBit <<= 16;
  404. iPos -= 16;
  405. }
  406. if (iPos >= 16)
  407. {
  408. //
  409. // Error - buffer too small.
  410. //
  411. cchSrc++;
  412. break;
  413. }
  414. }
  415. pUTF7++;
  416. }
  417. //
  418. // Make sure the destination buffer was large enough.
  419. //
  420. if (cchDest && (cchSrc >= 0))
  421. {
  422. if (cchSrc == 0 && fShift && *(pUTF7--) == SHIFT_OUT)
  423. {
  424. //
  425. // Do nothing here.
  426. // If we are in shift-in mode previously, and the last byte is a shift-out byte ('-'),
  427. // we should absorb this byte. So don't set error.
  428. //
  429. } else
  430. {
  431. SetLastError(ERROR_INSUFFICIENT_BUFFER);
  432. return (0);
  433. }
  434. }
  435. //
  436. // Return the number of Unicode characters written.
  437. //
  438. return (cchWC);
  439. }
  440. ////////////////////////////////////////////////////////////////////////////
  441. //
  442. // UTF8ToUnicode
  443. //
  444. // Maps a UTF-8 character string to its wide character string counterpart.
  445. //
  446. // 04-22-2002 ShawnSte Fix bug 533476 where final characters are broken.
  447. // 02-06-96 JulieB Created.
  448. ////////////////////////////////////////////////////////////////////////////
  449. int UTF8ToUnicode(
  450. LPCSTR lpSrcStr,
  451. int cchSrc,
  452. LPWSTR lpDestStr,
  453. int cchDest,
  454. DWORD dwFlags
  455. )
  456. {
  457. int nTB = 0; // # trail bytes to follow
  458. int cchWC = 0; // # of Unicode code points generated
  459. CONST BYTE* pUTF8 = (CONST BYTE*)lpSrcStr;
  460. DWORD dwUnicodeChar; // Our character with room for full surrogate char
  461. BOOL bSurrogatePair = FALSE; // Indicate we'r collecting a surrogate pair
  462. BOOL bCheckInvalidBytes = (dwFlags & MB_ERR_INVALID_CHARS);
  463. BYTE UTF8;
  464. // Note that we can't test destination buffer length here because we may have to
  465. // iterate through thousands of broken characters which won't be output, even though
  466. // the buffer has no more room.
  467. while (cchSrc--)
  468. {
  469. //
  470. // See if there are any trail bytes.
  471. //
  472. if (BIT7(*pUTF8) == 0)
  473. {
  474. //
  475. // Found ASCII.
  476. //
  477. if (cchDest)
  478. {
  479. // In this function always test buffer size before using it
  480. if (cchWC >= cchDest)
  481. {
  482. // Error: Buffer too small, we didn't process this character
  483. SetLastError(ERROR_INSUFFICIENT_BUFFER);
  484. return (0);
  485. }
  486. lpDestStr[cchWC] = (WCHAR)*pUTF8;
  487. }
  488. nTB = bSurrogatePair = 0;
  489. cchWC++;
  490. }
  491. else if (BIT6(*pUTF8) == 0)
  492. {
  493. //
  494. // Found a trail byte.
  495. // Note : Ignore the trail byte if there was no lead byte.
  496. //
  497. if (nTB != 0)
  498. {
  499. //
  500. // Decrement the trail byte counter.
  501. //
  502. nTB--;
  503. // Add room for trail byte and add the trail byte falue
  504. dwUnicodeChar <<= 6;
  505. dwUnicodeChar |= LOWER_6_BIT(*pUTF8);
  506. // If we're done then we may need to store the data
  507. if (nTB == 0)
  508. {
  509. if (bSurrogatePair)
  510. {
  511. if (cchDest)
  512. {
  513. if ((cchWC + 1) >= cchDest)
  514. {
  515. // Error: Buffer too small, we didn't process this character
  516. SetLastError(ERROR_INSUFFICIENT_BUFFER);
  517. return (0);
  518. }
  519. lpDestStr[cchWC] = (WCHAR)
  520. (((dwUnicodeChar - 0x10000) >> 10) + HIGH_SURROGATE_START);
  521. lpDestStr[cchWC+1] = (WCHAR)
  522. ((dwUnicodeChar - 0x10000)%0x400 + LOW_SURROGATE_START);
  523. }
  524. //
  525. // End of sequence. Advance the output counter, turn off surrogateness
  526. //
  527. cchWC += 2;
  528. bSurrogatePair = FALSE;
  529. }
  530. else
  531. {
  532. if (cchDest)
  533. {
  534. if (cchWC >= cchDest)
  535. {
  536. // Error: Buffer too small, we didn't process this character
  537. SetLastError(ERROR_INSUFFICIENT_BUFFER);
  538. return (0);
  539. }
  540. lpDestStr[cchWC] = (WCHAR)dwUnicodeChar;
  541. }
  542. //
  543. // End of sequence. Advance the output counter.
  544. //
  545. cchWC++;
  546. }
  547. }
  548. }
  549. else
  550. {
  551. if (bCheckInvalidBytes)
  552. {
  553. SetLastError(ERROR_NO_UNICODE_TRANSLATION);
  554. return (0);
  555. }
  556. // error - not expecting a trail byte. That is, there is a trailing byte without leading byte.
  557. bSurrogatePair = FALSE;
  558. }
  559. }
  560. else
  561. {
  562. //
  563. // Found a lead byte.
  564. //
  565. if (nTB > 0)
  566. {
  567. // error - A leading byte before the previous sequence is completed.
  568. if (bCheckInvalidBytes)
  569. {
  570. SetLastError(ERROR_NO_UNICODE_TRANSLATION);
  571. return (0);
  572. }
  573. //
  574. // Error - previous sequence not finished.
  575. //
  576. nTB = 0;
  577. bSurrogatePair = FALSE;
  578. // Put this character back so that we can start over another sequence.
  579. cchSrc++;
  580. pUTF8--;
  581. }
  582. else
  583. {
  584. //
  585. // Calculate the number of bytes to follow.
  586. // Look for the first 0 from left to right.
  587. //
  588. UTF8 = *pUTF8;
  589. while (BIT7(UTF8) != 0)
  590. {
  591. UTF8 <<= 1;
  592. nTB++;
  593. }
  594. // Recover the data from the byte
  595. UTF8 >>= nTB;
  596. //
  597. // Check for non-shortest form.
  598. //
  599. switch (nTB)
  600. {
  601. case 1:
  602. nTB = 0;
  603. break;
  604. case 2:
  605. // Make sure that bit 8 ~ bit 11 is not all zero.
  606. // 110XXXXx 10xxxxxx
  607. if ((*pUTF8 & 0x1e) == 0)
  608. {
  609. nTB = 0;
  610. }
  611. break;
  612. case 3:
  613. // Look ahead to check for non-shortest form.
  614. // 1110XXXX 10Xxxxxx 10xxxxxx
  615. if (cchSrc >= 2)
  616. {
  617. if (((*pUTF8 & 0x0f) == 0) && (*(pUTF8 + 1) & 0x20) == 0)
  618. {
  619. nTB = 0;
  620. }
  621. }
  622. break;
  623. case 4:
  624. //
  625. // This is a surrogate unicode pair
  626. //
  627. if (cchSrc >= 3)
  628. {
  629. WORD word = (((WORD)*pUTF8) << 8) | *(pUTF8 + 1);
  630. // Look ahead to check for non-shortest form.
  631. // 11110XXX 10XXxxxx 10xxxxxx 10xxxxxx
  632. // Check if the 5 X bits are all zero.
  633. // 0x0730 == 00000111 00110000
  634. if ( (word & 0x0730) == 0 ||
  635. // If the 21st bit is 1, we have extra work
  636. ( (word & 0x0400) == 0x0400 &&
  637. // The 21st bit is 1.
  638. // Make sure that the resulting Unicode is within the valid surrogate range.
  639. // The 4 byte code sequence can hold up to 21 bits, and the maximum valid code point range
  640. // that Unicode (with surrogate) could represent are from U+000000 ~ U+10FFFF.
  641. // Therefore, if the 21 bit (the most significant bit) is 1, we should verify that the 17 ~ 20
  642. // bit are all zero.
  643. // I.e., in 11110XXX 10XXxxxx 10xxxxxx 10xxxxxx,
  644. // XXXXX can only be 10000.
  645. // 0x0330 = 0000 0011 0011 0000
  646. (word & 0x0330) != 0 ) )
  647. {
  648. // Not shortest form
  649. nTB = 0;
  650. }
  651. else
  652. {
  653. // A real surrogate pair
  654. bSurrogatePair = TRUE;
  655. }
  656. }
  657. break;
  658. default:
  659. //
  660. // If the bits is greater than 4, this is an invalid
  661. // UTF8 lead byte.
  662. //
  663. nTB = 0;
  664. break;
  665. }
  666. if (nTB != 0)
  667. {
  668. //
  669. // Store the value from the first byte and decrement
  670. // the number of bytes to follow.
  671. //
  672. dwUnicodeChar = UTF8;
  673. nTB--;
  674. } else
  675. {
  676. if (bCheckInvalidBytes)
  677. {
  678. SetLastError(ERROR_NO_UNICODE_TRANSLATION);
  679. return (0);
  680. }
  681. }
  682. }
  683. }
  684. pUTF8++;
  685. }
  686. if ((bCheckInvalidBytes && nTB != 0) || (cchWC == 0))
  687. {
  688. // About (cchWC == 0):
  689. // Because we now throw away non-shortest form, it is possible that we generate 0 chars.
  690. // In this case, we have to set error to ERROR_NO_UNICODE_TRANSLATION so that we conform
  691. // to the spec of MultiByteToWideChar.
  692. SetLastError(ERROR_NO_UNICODE_TRANSLATION);
  693. return (0);
  694. }
  695. //
  696. // Return the number of Unicode characters written.
  697. //
  698. return (cchWC);
  699. }
  700. ////////////////////////////////////////////////////////////////////////////
  701. //
  702. // UnicodeToUTF7
  703. //
  704. // Maps a Unicode character string to its UTF-7 string counterpart.
  705. //
  706. // 02-06-96 JulieB Created.
  707. ////////////////////////////////////////////////////////////////////////////
  708. int UnicodeToUTF7(
  709. LPCWSTR lpSrcStr,
  710. int cchSrc,
  711. LPSTR lpDestStr,
  712. int cchDest)
  713. {
  714. LPCWSTR lpWC = lpSrcStr;
  715. BOOL fShift = FALSE;
  716. DWORD dwBit = 0; // 32-bit buffer
  717. int iPos = 0; // 6-bit position in buffer
  718. int cchU7 = 0; // # of UTF7 chars generated
  719. while ((cchSrc--) && ((cchDest == 0) || (cchU7 < cchDest)))
  720. {
  721. if ((*lpWC > ASCII) || (fShiftChar[*lpWC]))
  722. {
  723. //
  724. // Need shift. Store 16 bits in buffer.
  725. //
  726. dwBit |= ((DWORD)*lpWC) << (16 - iPos);
  727. iPos += 16;
  728. if (!fShift)
  729. {
  730. //
  731. // Not in shift state, so add "+".
  732. //
  733. if (cchDest)
  734. {
  735. lpDestStr[cchU7] = SHIFT_IN;
  736. }
  737. cchU7++;
  738. //
  739. // Go into shift state.
  740. //
  741. fShift = TRUE;
  742. }
  743. //
  744. // Output 6 bits at a time as Base64 chars.
  745. //
  746. while (iPos >= 6)
  747. {
  748. if (cchDest)
  749. {
  750. if (cchU7 < cchDest)
  751. {
  752. //
  753. // 26 = 32 - 6
  754. //
  755. lpDestStr[cchU7] = cBase64[(int)(dwBit >> 26)];
  756. }
  757. else
  758. {
  759. break;
  760. }
  761. }
  762. cchU7++;
  763. dwBit <<= 6; // remove from bit buffer
  764. iPos -= 6; // adjust position pointer
  765. }
  766. if (iPos >= 6)
  767. {
  768. //
  769. // Error - buffer too small.
  770. //
  771. cchSrc++;
  772. break;
  773. }
  774. }
  775. else
  776. {
  777. //
  778. // No need to shift.
  779. //
  780. if (fShift)
  781. {
  782. //
  783. // End the shift sequence.
  784. //
  785. fShift = FALSE;
  786. if (iPos != 0)
  787. {
  788. //
  789. // Some bits left in dwBit.
  790. //
  791. if (cchDest)
  792. {
  793. if ((cchU7 + 1) < cchDest)
  794. {
  795. lpDestStr[cchU7++] = cBase64[(int)(dwBit >> 26)];
  796. lpDestStr[cchU7++] = SHIFT_OUT;
  797. }
  798. else
  799. {
  800. //
  801. // Error - buffer too small.
  802. //
  803. cchSrc++;
  804. break;
  805. }
  806. }
  807. else
  808. {
  809. cchU7 += 2;
  810. }
  811. dwBit = 0; // reset bit buffer
  812. iPos = 0; // reset postion pointer
  813. }
  814. else
  815. {
  816. //
  817. // Simply end the shift sequence.
  818. //
  819. if (cchDest)
  820. {
  821. lpDestStr[cchU7++] = SHIFT_OUT;
  822. }
  823. else
  824. {
  825. cchU7++;
  826. }
  827. }
  828. }
  829. //
  830. // Write the character to the buffer.
  831. // If the character is "+", then write "+-".
  832. //
  833. if (cchDest)
  834. {
  835. if (cchU7 < cchDest)
  836. {
  837. lpDestStr[cchU7++] = (char)*lpWC;
  838. if (*lpWC == SHIFT_IN)
  839. {
  840. if (cchU7 < cchDest)
  841. {
  842. lpDestStr[cchU7++] = SHIFT_OUT;
  843. }
  844. else
  845. {
  846. //
  847. // Error - buffer too small.
  848. //
  849. cchSrc++;
  850. break;
  851. }
  852. }
  853. }
  854. else
  855. {
  856. //
  857. // Error - buffer too small.
  858. //
  859. cchSrc++;
  860. break;
  861. }
  862. }
  863. else
  864. {
  865. cchU7++;
  866. if (*lpWC == SHIFT_IN)
  867. {
  868. cchU7++;
  869. }
  870. }
  871. }
  872. lpWC++;
  873. }
  874. //
  875. // See if we're still in the shift state.
  876. //
  877. if (fShift)
  878. {
  879. if (iPos != 0)
  880. {
  881. //
  882. // Some bits left in dwBit.
  883. //
  884. if (cchDest)
  885. {
  886. if ((cchU7 + 1) < cchDest)
  887. {
  888. lpDestStr[cchU7++] = cBase64[(int)(dwBit >> 26)];
  889. lpDestStr[cchU7++] = SHIFT_OUT;
  890. }
  891. else
  892. {
  893. //
  894. // Error - buffer too small.
  895. //
  896. cchSrc++;
  897. }
  898. }
  899. else
  900. {
  901. cchU7 += 2;
  902. }
  903. }
  904. else
  905. {
  906. //
  907. // Simply end the shift sequence.
  908. //
  909. if (cchDest)
  910. {
  911. if (cchU7 < cchDest)
  912. {
  913. lpDestStr[cchU7++] = SHIFT_OUT;
  914. }
  915. else
  916. {
  917. //
  918. // Error - buffer too small.
  919. //
  920. cchSrc++;
  921. }
  922. }
  923. else
  924. {
  925. cchU7++;
  926. }
  927. }
  928. }
  929. //
  930. // Make sure the destination buffer was large enough.
  931. //
  932. if (cchDest && (cchSrc >= 0))
  933. {
  934. SetLastError(ERROR_INSUFFICIENT_BUFFER);
  935. return (0);
  936. }
  937. //
  938. // Return the number of UTF-7 characters written.
  939. //
  940. return (cchU7);
  941. }
  942. ////////////////////////////////////////////////////////////////////////////
  943. //
  944. // UnicodeToUTF8
  945. //
  946. // Maps a Unicode character string to its UTF-8 string counterpart.
  947. //
  948. // 02-06-96 JulieB Created.
  949. ////////////////////////////////////////////////////////////////////////////
  950. int UnicodeToUTF8(
  951. LPCWSTR lpSrcStr,
  952. int cchSrc,
  953. LPSTR lpDestStr,
  954. int cchDest)
  955. {
  956. LPCWSTR lpWC = lpSrcStr;
  957. int cchU8 = 0; // # of UTF8 chars generated
  958. DWORD dwSurrogateChar;
  959. WCHAR wchHighSurrogate = 0;
  960. BOOL bHandled;
  961. while ((cchSrc--) && ((cchDest == 0) || (cchU8 < cchDest)))
  962. {
  963. bHandled = FALSE;
  964. //
  965. // Check if high surrogate is available
  966. //
  967. if ((*lpWC >= HIGH_SURROGATE_START) && (*lpWC <= HIGH_SURROGATE_END))
  968. {
  969. if (cchDest)
  970. {
  971. // Another high surrogate, then treat the 1st as normal
  972. // Unicode character.
  973. if (wchHighSurrogate)
  974. {
  975. if ((cchU8 + 2) < cchDest)
  976. {
  977. lpDestStr[cchU8++] = UTF8_1ST_OF_3 | HIGHER_6_BIT(wchHighSurrogate);
  978. lpDestStr[cchU8++] = UTF8_TRAIL | MIDDLE_6_BIT(wchHighSurrogate);
  979. lpDestStr[cchU8++] = UTF8_TRAIL | LOWER_6_BIT(wchHighSurrogate);
  980. }
  981. else
  982. {
  983. // not enough buffer
  984. cchSrc++;
  985. break;
  986. }
  987. }
  988. }
  989. else
  990. {
  991. cchU8 += 3;
  992. }
  993. wchHighSurrogate = *lpWC;
  994. bHandled = TRUE;
  995. }
  996. if (!bHandled && wchHighSurrogate)
  997. {
  998. if ((*lpWC >= LOW_SURROGATE_START) && (*lpWC <= LOW_SURROGATE_END))
  999. {
  1000. // wheee, valid surrogate pairs
  1001. if (cchDest)
  1002. {
  1003. if ((cchU8 + 3) < cchDest)
  1004. {
  1005. dwSurrogateChar = (((wchHighSurrogate-0xD800) << 10) + (*lpWC - 0xDC00) + 0x10000);
  1006. lpDestStr[cchU8++] = (UTF8_1ST_OF_4 |
  1007. (unsigned char)(dwSurrogateChar >> 18)); // 3 bits from 1st byte
  1008. lpDestStr[cchU8++] = (UTF8_TRAIL |
  1009. (unsigned char)((dwSurrogateChar >> 12) & 0x3f)); // 6 bits from 2nd byte
  1010. lpDestStr[cchU8++] = (UTF8_TRAIL |
  1011. (unsigned char)((dwSurrogateChar >> 6) & 0x3f)); // 6 bits from 3rd byte
  1012. lpDestStr[cchU8++] = (UTF8_TRAIL |
  1013. (unsigned char)(0x3f & dwSurrogateChar)); // 6 bits from 4th byte
  1014. }
  1015. else
  1016. {
  1017. // not enough buffer
  1018. cchSrc++;
  1019. break;
  1020. }
  1021. }
  1022. else
  1023. {
  1024. // we already counted 3 previously (in high surrogate)
  1025. cchU8 ++;
  1026. }
  1027. bHandled = TRUE;
  1028. }
  1029. else
  1030. {
  1031. // Bad Surrogate pair : ERROR
  1032. // Just process wchHighSurrogate , and the code below will
  1033. // process the current code point
  1034. if (cchDest)
  1035. {
  1036. if ((cchU8 + 2) < cchDest)
  1037. {
  1038. lpDestStr[cchU8++] = UTF8_1ST_OF_3 | HIGHER_6_BIT(wchHighSurrogate);
  1039. lpDestStr[cchU8++] = UTF8_TRAIL | MIDDLE_6_BIT(wchHighSurrogate);
  1040. lpDestStr[cchU8++] = UTF8_TRAIL | LOWER_6_BIT(wchHighSurrogate);
  1041. }
  1042. else
  1043. {
  1044. // not enough buffer
  1045. cchSrc++;
  1046. break;
  1047. }
  1048. }
  1049. }
  1050. wchHighSurrogate = 0;
  1051. }
  1052. if (!bHandled)
  1053. {
  1054. if (*lpWC <= ASCII)
  1055. {
  1056. //
  1057. // Found ASCII.
  1058. //
  1059. if (cchDest)
  1060. {
  1061. if (cchU8 < cchDest)
  1062. {
  1063. lpDestStr[cchU8] = (char)*lpWC;
  1064. }
  1065. else
  1066. {
  1067. //
  1068. // Error - buffer too small.
  1069. //
  1070. cchSrc++;
  1071. break;
  1072. }
  1073. }
  1074. cchU8++;
  1075. }
  1076. else if (*lpWC <= UTF8_2_MAX)
  1077. {
  1078. //
  1079. // Found 2 byte sequence if < 0x07ff (11 bits).
  1080. //
  1081. if (cchDest)
  1082. {
  1083. if ((cchU8 + 1) < cchDest)
  1084. {
  1085. //
  1086. // Use upper 5 bits in first byte.
  1087. // Use lower 6 bits in second byte.
  1088. //
  1089. lpDestStr[cchU8++] = UTF8_1ST_OF_2 | (*lpWC >> 6);
  1090. lpDestStr[cchU8++] = UTF8_TRAIL | LOWER_6_BIT(*lpWC);
  1091. }
  1092. else
  1093. {
  1094. //
  1095. // Error - buffer too small.
  1096. //
  1097. cchSrc++;
  1098. break;
  1099. }
  1100. }
  1101. else
  1102. {
  1103. cchU8 += 2;
  1104. }
  1105. }
  1106. else
  1107. {
  1108. //
  1109. // Found 3 byte sequence.
  1110. //
  1111. if (cchDest)
  1112. {
  1113. if ((cchU8 + 2) < cchDest)
  1114. {
  1115. //
  1116. // Use upper 4 bits in first byte.
  1117. // Use middle 6 bits in second byte.
  1118. // Use lower 6 bits in third byte.
  1119. //
  1120. lpDestStr[cchU8++] = UTF8_1ST_OF_3 | HIGHER_6_BIT(*lpWC);
  1121. lpDestStr[cchU8++] = UTF8_TRAIL | MIDDLE_6_BIT(*lpWC);
  1122. lpDestStr[cchU8++] = UTF8_TRAIL | LOWER_6_BIT(*lpWC);
  1123. }
  1124. else
  1125. {
  1126. //
  1127. // Error - buffer too small.
  1128. //
  1129. cchSrc++;
  1130. break;
  1131. }
  1132. }
  1133. else
  1134. {
  1135. cchU8 += 3;
  1136. }
  1137. }
  1138. }
  1139. lpWC++;
  1140. }
  1141. //
  1142. // If the last character was a high surrogate, then handle it as a normal
  1143. // unicode character.
  1144. //
  1145. if ((cchSrc < 0) && (wchHighSurrogate != 0))
  1146. {
  1147. if (cchDest)
  1148. {
  1149. if ((cchU8 + 2) < cchDest)
  1150. {
  1151. lpDestStr[cchU8++] = UTF8_1ST_OF_3 | HIGHER_6_BIT(wchHighSurrogate);
  1152. lpDestStr[cchU8++] = UTF8_TRAIL | MIDDLE_6_BIT(wchHighSurrogate);
  1153. lpDestStr[cchU8++] = UTF8_TRAIL | LOWER_6_BIT(wchHighSurrogate);
  1154. }
  1155. else
  1156. {
  1157. cchSrc++;
  1158. }
  1159. }
  1160. }
  1161. //
  1162. // Make sure the destination buffer was large enough.
  1163. //
  1164. if (cchDest && (cchSrc >= 0))
  1165. {
  1166. SetLastError(ERROR_INSUFFICIENT_BUFFER);
  1167. return (0);
  1168. }
  1169. //
  1170. // Return the number of UTF-8 characters written.
  1171. //
  1172. return (cchU8);
  1173. }