Leaked source code of windows server 2003
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

852 lines
32 KiB

  1. /*++
  2. Copyright (c) 1991-1999, Microsoft Corporation All rights reserved.
  3. Module Name:
  4. c_gb18030.c
  5. Abstract:
  6. This file contains functions to convert GB18030-2000 (code page 54936) into Unicode, and vice versa.
  7. The target module is c_g18030.dll. This will be the external DLL used by WideCharToMultiByte()
  8. and MultiByteToWideChar() to perform the conversion for GB18030 codepage.
  9. External Routines in this file:
  10. DllEntry
  11. NlsDllCodePageTranslation
  12. Notes:
  13. GB18030-2000 (aka GBK2K) is designed to be mostly compatible with GBK (codepage 936),
  14. while supporting the full range of Unicode code points (BMP + 16 supplementary planes).
  15. The structure for GB18030 is:
  16. * Single byte:
  17. 0x00 ~ 0x7f
  18. * Two-byte:
  19. 0x81 ~ 0xfe, 0x40 ~ 0x7e (leading byte, trailing byte)
  20. 0x81 ~ 0xfe, 0x80 ~ 0xfe (leading byte, trailing byte)
  21. * Four-byte:
  22. 0x81 ~ 0xfe, 0x30 ~ 0x39, 0x81 ~ 0xfe, 0x30 ~ 0x39.
  23. Surrogate pairs are encoded starting from 0x90, 0x30, 0x81, 0x30.
  24. The BMP range is fully supported in GB18030 using 1-byte, 2-byte and 4-byte sequences.
  25. In valid 4-byte GB18030, there are two gaps that can not be mapped to Unicode characters.
  26. 0x84, 0x31, 0xa5, 0x30 (just after the GB18030 bytes for U+FFFF(*)) ~ 0x8f, 0x39, 0xfe, 0x39 (just before the first GB18030 bytes for U+D800,U+DC00)
  27. 0xe3, 0x32, 0x9a, 0x36 (just after the GB18030 bytes for U+DBFF U+DFFF(**)) ~ 0xfe, 0x39, 0xfe, 0x39
  28. Note1: U+FFFF = 0x84, 0x31, 0xa4, 0x39
  29. Note2: U+DBFF U+DFFF = 0xe3, 0x32, 0x9a, 0x35
  30. Tables used in c_g18030.dll:
  31. * From Unicode to bytes:
  32. * g_wUnicodeToGB:
  33. Used to convert Unicode character to 2-byte GBK, 2-byte GB18030, or 4-byte GB18030.
  34. The index is 0x0000 ~ 0xffff, for Unicode BMP range.
  35. When the values are:
  36. Value Meaning
  37. ====== =======
  38. 0xffff 2-byte GB18030, which is compatible with GBK. Call WC2MB(936,...) to convert.
  39. 0xfffe ~ [0xfffe - (ARRAYSIZE(g_wUnicodeToGBTwoBytes))+1]
  40. 2-byte GB18030, which is NOT compatible with GBK. (0xfffe - Value) will be indexed into
  41. a second table g_wUnicodeToGBTwoBytes, which contains the two-byte GB18030 values.
  42. E.g. if the value is 0xfffe, the index into g_wUnicodeToGBTwoBytes is 0, so the two-byte
  43. GB18030 will be 0xa8, 0xbf (which are stored g_wUnicodeToGBTwoBytes[0],g_wUnicodeToGBTwoBytes[1])
  44. 0x0000 ~ 0x99fb
  45. An offset value that can be used to convert to 4-byte GB18030
  46. If the value is 0x0000, the 4-byte GB18030 is 0x81, 0x30, 0x81, 0x30.
  47. * From bytes to Unicode
  48. * Two-byte GB18030 to Unicode:
  49. * g_wGBLeadByteOffset
  50. The index into this table is lead byte 0x80 ~ 0xff (converted to index 0x00 ~ 0x7f).
  51. If the value is 0x0000, it means that this lead byte is compatible with GBK.
  52. Otherwise, the value can be:
  53. 0x0100 This is used to index into g_wUnicodeFromGBTwoBytes[0x0000 ~ 0x00ff].
  54. The value of g_wUnicodeFromGBTwoBytes is the Unicode value for this lead byte with the next valid trailing byte.
  55. 0x0200 This is used to index into g_wUnicodeFromGBTwoBytes[0x0100 ~ 0x01ff].
  56. 0x0300 This is used to index into g_wUnicodeFromGBTwoBytes[0x0200 ~ 0x02ff].
  57. 0x0400 This is used to index into g_wUnicodeFromGBTwoBytes[0x0300 ~ 0x03ff].
  58. E.g. g_wGBLeadByteOffset[0x07] = 0x0000. It means that GB18030 two-byte lead byte 0x87 is compatible with GBK.
  59. g_wGBLeadByteOffset[0x28] = 0x0200. It means that GB18030 two-byte lead byte 0xa8 (0x28+0x80 = 0xa8) is NOT compatible with GBK.
  60. The Unicode value for 0xa8, <trail byte> will be stored in g_wUnicodeFromGBTwoBytes[0x0100+<trail byte>]
  61. * Four-byte GB18030 to Unicode:
  62. * g_wGBFourBytesToUnicode
  63. The table is used to convert 4-byte GB18030 into a Unicode.
  64. The index value is the offset of the 4-byte GB18030.
  65. 4-byte GB18030 Index value
  66. ============== ===========
  67. 81,30,81,30 0
  68. 81,30,81,31 1
  69. 81,30,81,32 2
  70. ... ...
  71. The value of g_wGBFourBytesToUnicode contains the Unicode codepoint for the offset of the
  72. corresponding 4-byte GB18030.
  73. E.g. g_wGBFourBytesToUnicode[0] = 0x0080. This means that GB18030 0x81, 0x30, 0x81, 0x30 will be converted to Unicode U+0080.
  74. Revision History:
  75. 02-20-2001 YSLin Created.
  76. --*/
  77. //
  78. // Include Files.
  79. //
  80. #include <share.h>
  81. #include "c_gb18030.h"
  82. //
  83. // Constant Declarations.
  84. //
  85. //
  86. // Structure used in GetCPInfo().
  87. //
  88. CPINFO g_CPInfo =
  89. {
  90. //UINT MaxCharSize;
  91. 4,
  92. //BYTE DefaultChar[MAX_DEFAULTCHAR];
  93. {0x3f, 0x00},
  94. //BYTE LeadByte[MAX_LEADBYTES];
  95. // Since GBK2K can have up to 4 bytes, we don't return
  96. // 0x81-0xfe as lead bytes here.
  97. {0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  98. 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
  99. };
// Four-byte offset of 0x90, 0x30, 0x81, 0x30: the GB18030 sequence for the
// first surrogate pair (U+D800, U+DC00).  Four-byte offsets at or above this
// value (up to SURROGATE_MAX_OFFSET) convert to a surrogate pair.
#define SURROGATE_OFFSET GET_FOUR_BYTES_OFFSET_FROM_BYTES(0x90, 0x30, 0x81, 0x30)
// Four-byte offset of 0xe3, 0x32, 0x9a, 0x35: the GB18030 sequence for the
// last surrogate pair (U+DBFF, U+DFFF).
#define SURROGATE_MAX_OFFSET GET_FOUR_BYTES_OFFSET_FROM_BYTES(0xe3, 0x32, 0x9a, 0x35)
  104. //-------------------------------------------------------------------------//
  105. // EXTERNAL ROUTINES //
  106. //-------------------------------------------------------------------------//
  107. ////////////////////////////////////////////////////////////////////////////
  108. //
  109. // NlsDllCodePageTranslation
  110. //
  111. // This routine is the main exported procedure for the functionality in
  112. // this DLL. All calls to this DLL must go through this function.
  113. //
  114. // 02-20-2001 YSLin Created.
  115. ////////////////////////////////////////////////////////////////////////////
  116. STDAPI_(DWORD) NlsDllCodePageTranslation(
  117. DWORD CodePage,
  118. DWORD dwFlags,
  119. LPSTR lpMultiByteStr,
  120. int cchMultiByte,
  121. LPWSTR lpWideCharStr,
  122. int cchWideChar,
  123. LPCPINFO lpCPInfo)
  124. {
  125. //
  126. // Error out if internally needed c_*.nls file is not installed.
  127. //
  128. if (!IsValidCodePage(CODEPAGE_GBK))
  129. {
  130. SetLastError(ERROR_INVALID_PARAMETER);
  131. return (0);
  132. }
  133. switch (dwFlags)
  134. {
  135. case ( NLS_CP_CPINFO ) :
  136. {
  137. memcpy(lpCPInfo, &g_CPInfo, sizeof(CPINFO));
  138. return (TRUE);
  139. }
  140. case ( NLS_CP_MBTOWC ) :
  141. {
  142. return (BytesToUnicode((BYTE*)lpMultiByteStr, cchMultiByte, NULL, lpWideCharStr, cchWideChar));
  143. }
  144. case ( NLS_CP_WCTOMB ) :
  145. {
  146. return (UnicodeToBytes(lpWideCharStr, cchWideChar, lpMultiByteStr, cchMultiByte));
  147. }
  148. }
  149. //
  150. // This shouldn't happen since this gets called by the NLS APIs.
  151. //
  152. SetLastError(ERROR_INVALID_PARAMETER);
  153. return (0);
  154. }
  155. //-------------------------------------------------------------------------//
  156. // INTERNAL ROUTINES //
  157. //-------------------------------------------------------------------------//
  158. ////////////////////////////////////////////////////////////////////////////
  159. //
  160. // GetBytesToUnicodeCount
  161. //
  162. // Return the Unicode character count needed to convert the specified
  163. // GB18030 multi-byte string.
  164. //
  165. // Parameters:
  166. // lpMultiByteStr The multi-byte string to be converted.
  167. // cchMultiByte The byte size of the multi-byte string to be converted
  168. // bSupportEncoder If TRUE and we have a lead byte at the end of string,
  169. // we will not convert that lead byte. Otherwise,
  170. // convert it to the default character.
  171. //
  172. // 02-21-2001 YSLin Created.
  173. ////////////////////////////////////////////////////////////////////////////
  174. DWORD GetBytesToUnicodeCount(BYTE* lpMultiByteStr, int cchMultiByte, BOOL bSupportEncoder)
  175. {
  176. int i = 0;
  177. BYTE ch;
  178. DWORD cchWCCount = 0;
  179. WORD wOffset;
  180. BYTE offset1, offset2, offset3, offset4;
  181. DWORD dwFourBytesOffset;
  182. if (cchMultiByte == -1)
  183. {
  184. cchMultiByte = strlen((LPSTR)lpMultiByteStr);
  185. }
  186. while (i < cchMultiByte)
  187. {
  188. ch = lpMultiByteStr[i];
  189. if (ch <= 0x7f)
  190. {
  191. cchWCCount++;
  192. i++;
  193. } else if (IS_GB_LEAD_BYTE(ch))
  194. {
  195. offset1 = (ch - GBK2K_BYTE1_MIN);
  196. //
  197. // If this is a lead byte, look ahead to see if this is
  198. // a two-byte GB18030 or four-byte GB18030.
  199. //
  200. if (i+1 < cchMultiByte)
  201. {
  202. if (IS_GB_TWO_BYTES_TRAILING(lpMultiByteStr[i+1]))
  203. {
  204. //
  205. // The trailing byte is a GB18030 two-byte.
  206. //
  207. cchWCCount++;
  208. i += 2;
  209. } else if (i+3 < cchMultiByte)
  210. {
  211. //
  212. // Check if this is a four-byte GB18030.
  213. //
  214. if (IS_GB_FOUR_BYTES_TRAILING(lpMultiByteStr[i+1]) &&
  215. IS_GB_LEAD_BYTE(lpMultiByteStr[i+2]) &&
  216. IS_GB_FOUR_BYTES_TRAILING(lpMultiByteStr[i+3]))
  217. {
  218. offset2 = lpMultiByteStr[i+1] - GBK2K_BYTE2_MIN;
  219. offset3 = lpMultiByteStr[i+2] - GBK2K_BYTE3_MIN;
  220. offset4 = lpMultiByteStr[i+3] - GBK2K_BYTE4_MIN;
  221. //
  222. // Four-byte GB18030
  223. //
  224. dwFourBytesOffset = GET_FOUR_BYTES_OFFSET(offset1, offset2, offset3, offset4);
  225. if (dwFourBytesOffset <= g_wMax4BytesOffset)
  226. {
  227. //
  228. // The Unicode will be in the BMP range.
  229. //
  230. cchWCCount++;
  231. } else if (dwFourBytesOffset >= SURROGATE_OFFSET && dwFourBytesOffset <= SURROGATE_MAX_OFFSET)
  232. {
  233. //
  234. // This will be converted to a surrogate pair.
  235. //
  236. cchWCCount+=2;
  237. } else {
  238. //
  239. // Valid GBK2K code point, but can not be mapped to Unicode.
  240. //
  241. cchWCCount++;
  242. }
  243. i += 4;
  244. } else
  245. {
  246. if (bSupportEncoder)
  247. {
  248. // Set i to cchMultiByte so that we will bail out the while loop.
  249. i = cchMultiByte;
  250. } else
  251. {
  252. //
  253. // We have a lead byte, but do have have a valid trailing byte.
  254. //
  255. // Use default Unicode char.
  256. i++;
  257. cchWCCount++;
  258. }
  259. }
  260. }else
  261. {
  262. if (bSupportEncoder)
  263. {
  264. // Set i to cchMultiByte so that we will bail out the while loop.
  265. i = cchMultiByte;
  266. } else
  267. {
  268. //
  269. // We have a lead byte, but do have have a valid trailing byte.
  270. //
  271. // Use default Unicode char.
  272. i++;
  273. cchWCCount++;
  274. }
  275. }
  276. } else
  277. {
  278. //
  279. // We have a lead byte at the end of the string.
  280. //
  281. if (bSupportEncoder)
  282. {
  283. i++;
  284. } else
  285. {
  286. // Use default Unicode char.
  287. i++;
  288. cchWCCount++;
  289. }
  290. }
  291. }else
  292. {
  293. //
  294. // This byte is NOT between 0x00 ~ 0x7f, and not a lead byte.
  295. // Use the default character.
  296. //
  297. i++;
  298. cchWCCount++;
  299. }
  300. }
  301. return (cchWCCount);
  302. }
  303. BOOL __forceinline PutDefaultCharacter(UINT* pCchWCCount, UINT cchWideChar, LPWSTR lpWideCharStr)
  304. {
  305. //
  306. // This byte is NOT between 0x00 ~ 0x7f, not a lead byte.
  307. //
  308. if (*pCchWCCount >= cchWideChar)
  309. {
  310. SetLastError(ERROR_INSUFFICIENT_BUFFER);
  311. return (FALSE);
  312. }
  313. lpWideCharStr[(*pCchWCCount)++] = GB18030_DEFAULT_UNICODE_CHAR;
  314. return (TRUE);
  315. }
  316. STDAPI_(DWORD) BytesToUnicode(
  317. BYTE* lpMultiByteStr,
  318. UINT cchMultiByte,
  319. UINT* pcchLeftOverBytes,
  320. LPWSTR lpWideCharStr,
  321. UINT cchWideChar)
  322. {
  323. UINT i = 0;
  324. BYTE ch;
  325. UINT cchWCCount = 0;
  326. BYTE offset1, offset2, offset3, offset4;
  327. WORD wOffset;
  328. DWORD dwOffset;
  329. int nResult;
  330. if ((lpWideCharStr == NULL) || (cchWideChar == 0))
  331. {
  332. return (GetBytesToUnicodeCount(lpMultiByteStr, cchMultiByte, (pcchLeftOverBytes != NULL)));
  333. }
  334. if (cchMultiByte == -1)
  335. {
  336. cchMultiByte = strlen((LPSTR)lpMultiByteStr);
  337. }
  338. if (pcchLeftOverBytes != NULL)
  339. {
  340. *pcchLeftOverBytes = 0;
  341. }
  342. //
  343. // NOTENOTE YSLin:
  344. // If you make fix in the following code, remember to make the appropriate fix
  345. // in GetBytesToUnicodeCount() as well.
  346. //
  347. while (i < cchMultiByte)
  348. {
  349. ch = lpMultiByteStr[i];
  350. if (ch <= 0x7f)
  351. {
  352. //
  353. // This byte is from 0x00 ~ 0x7f.
  354. //
  355. if (cchWCCount >= cchWideChar)
  356. {
  357. SetLastError(ERROR_INSUFFICIENT_BUFFER);
  358. return (0);
  359. }
  360. lpWideCharStr[cchWCCount++] = ch;
  361. i++;
  362. } else if (IS_GB_LEAD_BYTE(ch))
  363. {
  364. offset1 = ch - GBK2K_BYTE1_MIN;
  365. //
  366. // If this is a lead byte, just look ahead to see if this is
  367. // a two-byte GB18030 or four-byte GB18030.
  368. //
  369. if (i+1 < cchMultiByte)
  370. {
  371. if (IS_GB_TWO_BYTES_TRAILING(lpMultiByteStr[i+1]))
  372. {
  373. //
  374. // The trailing byte is a GB18030 two-byte.
  375. //
  376. //
  377. // Look up the table to see if we have the table for
  378. // the mapping Unicode character.
  379. //
  380. wOffset = g_wGBLeadByteOffset[ch - 0x80];
  381. if (wOffset == 0x0000)
  382. {
  383. if (cchWCCount == cchWideChar)
  384. {
  385. SetLastError(ERROR_INSUFFICIENT_BUFFER);
  386. return (0);
  387. }
  388. //
  389. // We don't have the table, because this is a GBK compatible two-byte GB18030.
  390. //
  391. //
  392. // Two-byte GB18030
  393. //
  394. nResult = MultiByteToWideChar(CODEPAGE_GBK, 0, (LPCSTR)(lpMultiByteStr+i), 2, lpWideCharStr+cchWCCount, 1);
  395. if (nResult == 0)
  396. {
  397. return (0);
  398. }
  399. cchWCCount++;
  400. i += 2;
  401. } else
  402. {
  403. if (cchWCCount == cchWideChar)
  404. {
  405. SetLastError(ERROR_INSUFFICIENT_BUFFER);
  406. return (0);
  407. }
  408. wOffset -= 0x0100;
  409. lpWideCharStr[cchWCCount++] = g_wUnicodeFromGBTwoBytes[wOffset + lpMultiByteStr[i+1]];
  410. i+= 2;
  411. }
  412. } else if (i+3 < cchMultiByte)
  413. {
  414. if (IS_GB_FOUR_BYTES_TRAILING(lpMultiByteStr[i+1]) &&
  415. IS_GB_LEAD_BYTE(lpMultiByteStr[i+2]) &&
  416. IS_GB_FOUR_BYTES_TRAILING(lpMultiByteStr[i+3]))
  417. {
  418. offset2 = lpMultiByteStr[i+1] - GBK2K_BYTE2_MIN;
  419. offset3 = lpMultiByteStr[i+2] - GBK2K_BYTE3_MIN;
  420. offset4 = lpMultiByteStr[i+3] - GBK2K_BYTE4_MIN;
  421. //
  422. // Four-byte GB18030
  423. //
  424. dwOffset = GET_FOUR_BYTES_OFFSET(offset1, offset2, offset3, offset4);
  425. if (dwOffset <= g_wMax4BytesOffset)
  426. {
  427. if (cchWCCount == cchWideChar)
  428. {
  429. SetLastError(ERROR_INSUFFICIENT_BUFFER);
  430. return (0);
  431. }
  432. //
  433. // The Unicode will be in the BMP range.
  434. //
  435. lpWideCharStr[cchWCCount++] = g_wGBFourBytesToUnicode[dwOffset];
  436. } else if (dwOffset >= SURROGATE_OFFSET && dwOffset <= SURROGATE_MAX_OFFSET)
  437. {
  438. if (cchWCCount + 2 > cchWideChar)
  439. {
  440. SetLastError(ERROR_INSUFFICIENT_BUFFER);
  441. return (0);
  442. }
  443. //
  444. // This will be converted to a surrogate pair.
  445. //
  446. dwOffset -= SURROGATE_OFFSET;
  447. lpWideCharStr[cchWCCount++] = 0xd800 + (WORD)(dwOffset / 0x400);
  448. lpWideCharStr[cchWCCount++] = 0xdc00 + (WORD)(dwOffset % 0x400);
  449. } else
  450. {
  451. //
  452. // Valid GBK2K code point, but can not be mapped to Unicode.
  453. //
  454. if (!PutDefaultCharacter(&cchWCCount, cchWideChar, lpWideCharStr))
  455. {
  456. return (0);
  457. }
  458. }
  459. i += 4;
  460. }else
  461. {
  462. if (!PutDefaultCharacter(&cchWCCount, cchWideChar, lpWideCharStr))
  463. {
  464. return (0);
  465. }
  466. i++;
  467. }
  468. }else
  469. {
  470. if (pcchLeftOverBytes != NULL)
  471. {
  472. *pcchLeftOverBytes = cchMultiByte - i;
  473. // Set i to cchMultiByte so that we will bail out the while loop.
  474. i = cchMultiByte;
  475. } else
  476. {
  477. //
  478. // We have a lead byte, but do have have a valid trailing byte.
  479. //
  480. // Use default Unicode char.
  481. if (!PutDefaultCharacter(&cchWCCount, cchWideChar, lpWideCharStr))
  482. {
  483. return (0);
  484. }
  485. i++;
  486. }
  487. }
  488. } else
  489. {
  490. if (pcchLeftOverBytes != NULL)
  491. {
  492. *pcchLeftOverBytes = 1;
  493. i++;
  494. } else
  495. {
  496. // We have a lead byte, but do have have a trailing byte.
  497. // Use default Unicode char.
  498. if (!PutDefaultCharacter(&cchWCCount, cchWideChar, lpWideCharStr))
  499. {
  500. return (0);
  501. }
  502. i++;
  503. }
  504. }
  505. } else
  506. {
  507. if (!PutDefaultCharacter(&cchWCCount, cchWideChar, lpWideCharStr))
  508. {
  509. return (0);
  510. }
  511. i++;
  512. }
  513. }
  514. return (cchWCCount);
  515. }
  516. DWORD GetUnicodeToBytesCount(LPWSTR lpWideCharStr, int cchWideChar)
  517. {
  518. int i;
  519. WORD wch;
  520. int cchMBCount = 0;
  521. DWORD wOffset;
  522. if (cchWideChar == -1)
  523. {
  524. cchWideChar = wcslen(lpWideCharStr);
  525. }
  526. for (i = 0; i < cchWideChar; i++)
  527. {
  528. wch = lpWideCharStr[i];
  529. if (wch <= 0x7f)
  530. {
  531. // One-byte GB18030.
  532. cchMBCount++;
  533. } else if (IS_HIGH_SURROGATE(wch))
  534. {
  535. //
  536. // Look ahead one character to see if the next char is a low surrogate.
  537. //
  538. if (i + 1 < cchWideChar)
  539. {
  540. if (IS_LOW_SURROGATE(lpWideCharStr[ i+1 ]))
  541. {
  542. //
  543. // Found a surrogate pair. This will be a four-byte GB18030.
  544. //
  545. cchMBCount += 4;
  546. i++;
  547. } else
  548. {
  549. //
  550. // A High surrogate character without a trailing low surrogate character.
  551. // In this case, we will convert this character to a default character.
  552. //
  553. cchMBCount++;
  554. }
  555. } else
  556. {
  557. //
  558. // A High surrogate character without a valid trailing low surrogate character.
  559. // In this case, we will convert this character to a default character.
  560. //
  561. cchMBCount++;
  562. }
  563. } else if (IS_LOW_SURROGATE(wch))
  564. {
  565. //
  566. // Only a low surrogate character without a leading high surrogate.
  567. // In this case, we will convert this character to a default character.
  568. //
  569. cchMBCount++;
  570. } else
  571. {
  572. //
  573. // Not a surrogate character. Look up the table to see this BMP Unicode character
  574. // will be converted to a two-byte GB18030 or four-byte GB18030.
  575. //
  576. wOffset = g_wUnicodeToGB[wch];
  577. if (wOffset == 0xFFFF)
  578. {
  579. //
  580. // This Unicode character will be converted to GBK compatible two-byte code.
  581. //
  582. cchMBCount += 2;
  583. } else if (wOffset <= g_wMax4BytesOffset)
  584. {
  585. //
  586. // This Unicode character will be converted to four-byte GB18030.
  587. //
  588. cchMBCount += 4;
  589. } else
  590. {
  591. //
  592. // This Unicode character will be converted to two-byte GB18030, which is not compatible
  593. // with GBK.
  594. //
  595. cchMBCount += 2;
  596. }
  597. }
  598. }
  599. return (cchMBCount);
  600. }
  601. STDAPI_(DWORD) UnicodeToBytes(
  602. LPWSTR lpWideCharStr,
  603. UINT cchWideChar,
  604. LPSTR lpMultiByteStr,
  605. UINT cchMultiByte)
  606. {
  607. UINT i;
  608. WORD wch;
  609. UINT cchMBCount = 0;
  610. CHAR MBTwoBytes[2];
  611. BYTE MBFourBytes[4];
  612. WORD wOffset;
  613. DWORD dwSurrogateOffset;
  614. int nResult;
  615. if ((lpMultiByteStr == NULL) || (cchMultiByte == 0))
  616. {
  617. return (GetUnicodeToBytesCount(lpWideCharStr, cchWideChar));
  618. }
  619. if (cchWideChar == -1)
  620. {
  621. cchWideChar = wcslen(lpWideCharStr);
  622. }
  623. //
  624. // NOTENOTE YSLin:
  625. // If you make fix in the following code, remember to make the appropriate fix
  626. // in GetUnicodeToBytesCount() as well.
  627. //
  628. for (i = 0; i < cchWideChar; i++)
  629. {
  630. wch = lpWideCharStr[i];
  631. if (wch <= 0x7f)
  632. {
  633. if (cchMBCount == cchMultiByte)
  634. {
  635. SetLastError(ERROR_INSUFFICIENT_BUFFER);
  636. return (0);
  637. }
  638. lpMultiByteStr[cchMBCount++] = (BYTE)wch;
  639. } else if (IS_HIGH_SURROGATE(wch))
  640. {
  641. //
  642. // Look ahead one character to see if the next char is a low surrogate.
  643. //
  644. if (i + 1 < cchWideChar)
  645. {
  646. if (IS_LOW_SURROGATE(lpWideCharStr[ i+1 ]))
  647. {
  648. if (cchMBCount + 4 > cchMultiByte)
  649. {
  650. SetLastError(ERROR_INSUFFICIENT_BUFFER);
  651. return (0);
  652. }
  653. i++;
  654. //
  655. // A surrogate pair will be converted to GB 18030 four-byte from
  656. // 0x90308130 ~ 0xe339fe39.
  657. //
  658. dwSurrogateOffset = (wch - 0xd800) * 0x0400 + (lpWideCharStr[i] - 0xdc00);
  659. lpMultiByteStr[cchMBCount+3] = (BYTE)(dwSurrogateOffset % GBK2K_BYTE4_RANGE) + GBK2K_BYTE4_MIN;
  660. dwSurrogateOffset /= GBK2K_BYTE4_RANGE;
  661. lpMultiByteStr[cchMBCount+2] = (BYTE)(dwSurrogateOffset % GBK2K_BYTE3_RANGE) + GBK2K_BYTE3_MIN;
  662. dwSurrogateOffset /= GBK2K_BYTE3_RANGE;
  663. lpMultiByteStr[cchMBCount+1] = (BYTE)(dwSurrogateOffset % GBK2K_BYTE2_RANGE) + GBK2K_BYTE2_MIN;
  664. dwSurrogateOffset /= GBK2K_BYTE2_RANGE;
  665. lpMultiByteStr[cchMBCount] = (BYTE)(dwSurrogateOffset % GBK2K_BYTE1_RANGE) + 0x90;
  666. cchMBCount += 4;
  667. } else
  668. {
  669. if (cchMBCount == cchMultiByte)
  670. {
  671. SetLastError(ERROR_INSUFFICIENT_BUFFER);
  672. return (0);
  673. }
  674. //
  675. // A High surrogate character is at the end of string.
  676. // In this case, we will convert this character to a default character.
  677. //
  678. lpMultiByteStr[cchMBCount++] = GB18030_DEFAULT_CHAR;
  679. }
  680. }else
  681. {
  682. if (cchMBCount >= cchMultiByte)
  683. {
  684. SetLastError(ERROR_INSUFFICIENT_BUFFER);
  685. return (0);
  686. }
  687. //
  688. // A High surrogate character without a valid trailing low surrogate character.
  689. // In this case, we will convert this character to a default character.
  690. //
  691. lpMultiByteStr[cchMBCount++] = GB18030_DEFAULT_CHAR;
  692. }
  693. } else if (IS_LOW_SURROGATE(wch))
  694. {
  695. if (cchMBCount == cchMultiByte)
  696. {
  697. SetLastError(ERROR_INSUFFICIENT_BUFFER);
  698. return (0);
  699. }
  700. //
  701. // Only a low surrogate character without a leading high surrogate.
  702. // In this case, we will convert this character to a default character.
  703. //
  704. lpMultiByteStr[cchMBCount++] = GB18030_DEFAULT_CHAR;
  705. } else
  706. {
  707. //
  708. // This character is not below 0x7f, not a surrogate character.
  709. // Check the table to see how this Unicode character should be
  710. // converted. It could be:
  711. // 1. Two-byte GB18030, which is compatible with GBK. (wOffset == 0xffff)
  712. // 2. Two-byte GB18030, which is NOT compatible with GBK. (wOffset = 0xfffe and below)
  713. // 3. Four-byte GB18030. (wOffset >= 0 && wOffset < g_wMax4BytesOffset)
  714. //
  715. wOffset = g_wUnicodeToGB[wch];
  716. if (wOffset == 0xffff)
  717. {
  718. //
  719. // This Unicode character will be converted to the same two-byte GBK code, so use GBK table.
  720. //
  721. if (cchMBCount + 2 > cchMultiByte)
  722. {
  723. SetLastError(ERROR_INSUFFICIENT_BUFFER);
  724. return (0);
  725. }
  726. nResult = WideCharToMultiByte(CODEPAGE_GBK, 0, lpWideCharStr+i, 1, lpMultiByteStr+cchMBCount, 2, NULL, NULL);
  727. if (nResult == 0)
  728. {
  729. return (0);
  730. }
  731. if (cchMBCount + nResult > cchMultiByte)
  732. {
  733. SetLastError(ERROR_INSUFFICIENT_BUFFER);
  734. return (0);
  735. }
  736. cchMBCount += nResult;
  737. } else if (wOffset <= g_wMax4BytesOffset)
  738. {
  739. if (cchMBCount + 4 > cchMultiByte)
  740. {
  741. SetLastError(ERROR_INSUFFICIENT_BUFFER);
  742. return (0);
  743. }
  744. //
  745. // This Unicode character will be converted to four-byte GB18030.
  746. //
  747. lpMultiByteStr[cchMBCount+3] = (wOffset % GBK2K_BYTE4_RANGE) + GBK2K_BYTE4_MIN;
  748. wOffset /= GBK2K_BYTE4_RANGE;
  749. lpMultiByteStr[cchMBCount+2] = (wOffset % GBK2K_BYTE3_RANGE) + GBK2K_BYTE3_MIN;
  750. wOffset /= GBK2K_BYTE3_RANGE;
  751. lpMultiByteStr[cchMBCount+1] = (wOffset % GBK2K_BYTE2_RANGE) + GBK2K_BYTE2_MIN;
  752. wOffset /= GBK2K_BYTE2_RANGE;
  753. lpMultiByteStr[cchMBCount] = (wOffset % GBK2K_BYTE1_RANGE) + GBK2K_BYTE1_MIN;
  754. cchMBCount += 4;
  755. } else
  756. {
  757. if (cchMBCount + 2 > cchMultiByte)
  758. {
  759. SetLastError(ERROR_INSUFFICIENT_BUFFER);
  760. return (0);
  761. }
  762. //
  763. // This Unicode character will be converted to two-byte GB18030, which is not compatible
  764. // with GBK.
  765. //
  766. wOffset = 0xfffe - wOffset;
  767. // We don't have to check the range of wOffset here, since the value of wOffset is coming from
  768. // g_wUnicodeToGB.
  769. CopyMemory(lpMultiByteStr+cchMBCount, &g_wUnicodeToGBTwoBytes[wOffset * 2], 2);
  770. // Copy two bytes (a WORD) into lpMultiByteStr[cchMBCount].
  771. // Instead od CompMemory(), This is probably faster:
  772. // *((LPWORD)lpMultiByteStr[cchMBCount]) = *((LPWORD)g_wUnicodeToGBTwoBytes[wOffset * 2]);
  773. cchMBCount += 2;
  774. }
  775. }
  776. }
  777. return (cchMBCount);
  778. }