Leaked source code of windows server 2003
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

396 lines
11 KiB

  1. #
  2. # GenGBK2K.pl
  3. #
  4. # Generate GBK2K (GB18030) codepage files by taking input from 936.txt and no80.txt.
  5. #
  6. use strict "vars";
  7. if ($#ARGV < 0)
  8. {
  9. ShowUsage();
  10. exit 1;
  11. }
  12. my $sCodePage936File = $ARGV[0]; # The NLS 936 codepage table.
  13. my $sGB18030File = $ARGV[1]; # The Unicode to GB18030 conversion table.
  14. my $sTableFile = "tables.cpp"; # The generated C source.
  15. my $nTwoBytesDiffCount = 0;
  16. my $nFourBytesCount = 0;
  17. my @FourBytesBuffer; # Store the value of the four bytes buffer.
  18. # This could be a four bytes encoding or an offset to the first 128 positions.
  19. my @TwoBytesBuffer; # Store the two bytes encoding values.
  20. my %GBK2KTwoBytes; # Used to store the mapping of GBK2K two bytes to Unicode.
  21. my %gGBTwoBytesToUnicode;
  22. if (!(open CODEPAGE936, $sCodePage936File))
  23. {
  24. die "Error in openning $sGB18030File.\n";
  25. }
  26. if (!(open GB18030, $sGB18030File))
  27. {
  28. die "Error in openning $sGB18030File.\n";
  29. }
  30. if (!(open TABLES, ">$sTableFile"))
  31. {
  32. die "Error in creating $sTableFile.\n";
  33. }
  34. Goto936WCTable();
  35. while (<CODEPAGE936>)
  36. {
  37. if (length($_) == 0)
  38. {
  39. next;
  40. }
  41. if (/0x(\w\w\w\w)\s+0x(\w+)\s+(.*)/)
  42. {
  43. my $ucp = hex(lc($1));
  44. my $gb = lc($2);
  45. while (<GB18030>)
  46. {
  47. if (length($_) == 0)
  48. {
  49. next;
  50. }
  51. if (/(\w\w\w\w)\s+(\w+)/)
  52. {
  53. my $ucpGBK2K = hex(lc($1)); # Unicode code point
  54. my $gbk2k = lc($2); # GBK encoding for this UCP.
  55. if ($ucp == $ucpGBK2K)
  56. {
  57. if ($ucp <= 0x7f)
  58. {
  59. #printf "%04x:%s\n", $ucp,$gb;
  60. } elsif ($gb eq $gbk2k)
  61. {
  62. #printf "%04x:%s\n", $ucp,$gb;
  63. $gGBTwoBytesToUnicode{$gb} = $ucp;
  64. } else
  65. {
  66. if (length($gbk2k) == 8)
  67. {
  68. # The GBK and GBK2K encoding are different, and the new
  69. # GBK2K encoding has 4 bytes.
  70. #printf ">>%04x:%s:%s\n", $ucp,$gb,$gbk2k;
  71. if (GetFourBytesOffset($gbk2k) != $nFourBytesCount)
  72. {
  73. die "Error in assumption";
  74. }
  75. $nFourBytesCount++;
  76. $FourBytesBuffer[$ucp] = $gbk2k;
  77. } else
  78. {
  79. # The GBK and GBK2K encoding are different, and the new
  80. # GBK2K encoding has 2 bytes.
  81. #printf "##%04x:%s:%s\n", $ucp,$gb,$gbk2k;
  82. $TwoBytesBuffer[$nTwoBytesDiffCount] = $gbk2k;
  83. $FourBytesBuffer[$ucp] = $nTwoBytesDiffCount;
  84. $nTwoBytesDiffCount++;
  85. $GBK2KTwoBytes{$gbk2k} = $ucp;
  86. $gGBTwoBytesToUnicode{$gbk2k} = $ucp;
  87. }
  88. }
  89. last;
  90. } else
  91. {
  92. GenerateGBK2K($ucpGBK2K, $gbk2k);
  93. }
  94. }
  95. } # while (<GB18030>)
  96. }
  97. if ($nFourBytesCount % 256 == 0)
  98. {
  99. print ".";
  100. }
  101. }
  102. while (<GB18030>)
  103. {
  104. if (length($_) == 0)
  105. {
  106. next;
  107. }
  108. if (/(\w\w\w\w)\s+(\w+)/)
  109. {
  110. my $ucpGBK2K = hex(lc($1)); # Unicode code point
  111. my $gbk2k = lc($2); # GBK encoding for this UCP.
  112. GenerateGBK2K($ucpGBK2K, $gbk2k);
  113. }
  114. }
  115. ################################################################
  116. #
  117. # Generate data table for Unicode to GB18030 conversion.
  118. #
  119. ################################################################
  120. print "\n";
  121. printf "Two Bytes diff : %d (%x)\n", $nTwoBytesDiffCount, $nTwoBytesDiffCount;
  122. printf "Four Bytes count: %d (%x)\n", $nFourBytesCount, $nFourBytesCount;
  123. print "Generating $sTableFile file...";
  124. my $nWCLines = $nFourBytesCount + $nTwoBytesDiffCount;
  125. my $i;
  126. my $j;
  127. my $lineCount = 0;
  128. print TABLES '#include <share.h>';
  129. print TABLES "\n";
  130. printf TABLES '#include "c_gb18030.h"';
  131. print TABLES "\n\n";
  132. printf TABLES "BYTE g_wUnicodeToGBTwoBytes[] = \n";
  133. printf TABLES "{\n";
  134. for ($i = 0; $i <= $#TwoBytesBuffer; $i++)
  135. {
  136. if ($i % 8 == 0)
  137. {
  138. printf TABLES "\n";
  139. }
  140. printf TABLES "0x%02x,", hex($TwoBytesBuffer[$i])/256;
  141. printf TABLES "0x%02x, ", hex($TwoBytesBuffer[$i])%256;
  142. }
  143. printf TABLES "\n\n};\n\n";
  144. printf TABLES "WORD g_wMax4BytesOffset = 0x%04x; // %d \n\n", $nFourBytesCount + $nTwoBytesDiffCount, $nFourBytesCount + $nTwoBytesDiffCount;
  145. printf TABLES "WORD g_wUnicodeToGB[] = \n";
  146. printf TABLES "{\n";
  147. for ($i = 0; $i <= 0xffff; $i++)
  148. {
  149. if (defined($FourBytesBuffer[$i]))
  150. {
  151. if (length($FourBytesBuffer[$i]) == 8)
  152. {
  153. #
  154. # Add 1 to offset since 0x0000 means that we should fallback to 936.txt.
  155. #
  156. printf TABLES ("0x%04x, ", GetFourBytesOffset($FourBytesBuffer[$i]));
  157. } else
  158. {
  159. printf TABLES ("0x%04x, ", 0xfffe - $FourBytesBuffer[$i]);
  160. }
  161. $lineCount++;
  162. } else
  163. {
  164. printf TABLES ("0x%04x, ", 0xffff);
  165. }
  166. if (($i+1) % 8 == 0)
  167. {
  168. printf TABLES " // U+%04x ~ U+%04x\n", $i - 7, $i;
  169. }
  170. }
  171. printf TABLES "\n\n};\n";
  172. #
  173. # Generate 54936MB.txt
  174. #
  175. my @GBK2KTwoBytesArray = sort(keys(%GBK2KTwoBytes));
  176. my %GBK2KLeadBytes; # Used to store the total number of bytes for a specific lead byte.
  177. #
  178. # Get the lead byte range.
  179. #
  180. for ($i = 0; $i <= $#GBK2KTwoBytesArray; $i++)
  181. {
  182. my $sLeadByte = lc(substr($GBK2KTwoBytesArray[$i], 0, 2)); # Get the first two character as the lead byte.
  183. if (!(defined($GBK2KLeadBytes{$sLeadByte})))
  184. {
  185. $GBK2KLeadBytes{$sLeadByte} = 1;
  186. } else
  187. {
  188. $GBK2KLeadBytes{$sLeadByte}++;
  189. }
  190. }
  191. my @GBK2KLeadBytesArray = sort(keys(%GBK2KLeadBytes));
  192. ################################################################
  193. #
  194. # Generate data table for GB18030 to Unicode conversion.
  195. #
  196. ################################################################
  197. $nWCLines = $nFourBytesCount;
  198. printf TABLES "\n\n";
  199. #
  200. # Generate data table for GB18030 four-byte data to Unicode conversion.
  201. #
  202. printf TABLES "WORD g_wGBFourBytesToUnicode[] = {\n\n";
  203. my $nCount = 0;
  204. for ($i = 0; $i <= $#FourBytesBuffer; $i++)
  205. {
  206. if (defined($FourBytesBuffer[$i]))
  207. {
  208. if (length($FourBytesBuffer[$i]) == 8)
  209. {
  210. printf TABLES ("0x%04x, ", $i);
  211. $nCount++;
  212. if ($nCount % 8 == 0)
  213. {
  214. printf TABLES " // Offset: %04x ~ %04x\n", ($nCount - 8 ), $nCount - 1;
  215. }
  216. }
  217. }
  218. }
  219. printf TABLES "\n\n};\n\n\n";
  220. #
  221. # Generate lead bytes array
  222. #
  223. printf TABLES "// The following lead bytes will be converted to different Unicode values compared with GBK:\n// ";
  224. for ($i = 0x80; $i <= 0xff; $i++)
  225. {
  226. my $sLeadByte = sprintf("%02x", $i);
  227. if (defined($GBK2KLeadBytes{$sLeadByte}))
  228. {
  229. printf TABLES "0x%02x, ", $i;
  230. }
  231. }
  232. printf TABLES "\n\n";
  233. printf TABLES "WORD g_wGBLeadByteOffset[] =\n";
  234. printf TABLES "{\n\n";
  235. my $sOffsetIndex = 1;
  236. for ($i = 0x80; $i <= 0xff; $i++)
  237. {
  238. my $sLeadByte = sprintf("%02x", $i);
  239. if (!(defined($GBK2KLeadBytes{$sLeadByte})))
  240. {
  241. printf TABLES "0x0000, ";
  242. } else
  243. {
  244. printf TABLES "0x%04x, ", ($sOffsetIndex * 256);
  245. $sOffsetIndex++;
  246. }
  247. if ((($i+1) % 8) == 0)
  248. {
  249. printf TABLES " // Lead byte %02x ~ %02x\n", ($i - 7), $i;
  250. }
  251. }
  252. printf TABLES "};\n\n";
  253. #
  254. # Generate data table for GB18030 two-byte data to Unicode conversion.
  255. #
  256. printf TABLES "WORD g_wUnicodeFromGBTwoBytes[] =\n";
  257. printf TABLES "{\n";
  258. for ($i = 0x81; $i <= 0xff; $i++)
  259. {
  260. my $sLeadByte = sprintf("%02x", $i);
  261. if ((defined($GBK2KLeadBytes{$sLeadByte})))
  262. {
  263. printf TABLES "\n// Lead Byte: 0x$sLeadByte\n";
  264. for ($j = 0x00; $j <= 0xff; $j++)
  265. {
  266. my $sGBK2K = sprintf("%02x%02x", $i, $j);
  267. if (defined($gGBTwoBytesToUnicode{$sGBK2K}))
  268. {
  269. printf TABLES "0x%04x, ", $gGBTwoBytesToUnicode{$sGBK2K};
  270. } else
  271. {
  272. printf TABLES "0x0000, ";
  273. }
  274. if ((($j + 1) % 8) == 0)
  275. {
  276. printf TABLES " // Trailing byte %02x ~ %02x\n", ($j - 7), $j;
  277. }
  278. }
  279. }
  280. }
  281. printf TABLES "\n};\n";
  282. close CODEPAGE936;
  283. close GB18030;
  284. close TABLES;
  285. print "\n\n$sTableFile is created successfully.\n";
  286. sub ShowUsage()
  287. {
  288. print "Generate GB18030 data table tables.cpp from 936 table and GB18030 mapping table.\n";
  289. print "tables.cpp is needed for compiling c_g18030.dll.";
  290. print "\n";
  291. print "Usage: GenTables.pl [Path to 936.txt] [Path to no80.txt]\n";
  292. }
  293. sub Goto936WCTable()
  294. {
  295. while (<CODEPAGE936>)
  296. {
  297. if (/WCTABLE/)
  298. {
  299. last;
  300. }
  301. }
  302. }
  303. sub GetFourBytesOffset
  304. {
  305. my ($sGBK2K) = @_;
  306. my $n1 = hex(substr($sGBK2K, 0, 2));
  307. my $n2 = hex(substr($sGBK2K, 2, 2));
  308. my $n3 = hex(substr($sGBK2K, 4, 2));
  309. my $n4 = hex(substr($sGBK2K, 6, 2));
  310. return (($n1 - 0x81)* 10 * 126 * 10 + ($n2 - 0x30) * 126 * 10 + ($n3 - 0x81) * 10 + ($n4 - 0x30));
  311. }
  312. sub GenerateGBK2K
  313. {
  314. my ($ucpGBK2K, $gbk2k) = @_;
  315. if (length($gbk2k) == 8)
  316. {
  317. #
  318. # There is no GBK encoding for this character. And
  319. # the new GBK2K encoding is 4 bytes.
  320. #
  321. #printf ">>%04x::%s\n", $ucpGBK2K, $gbk2k;
  322. if (GetFourBytesOffset($gbk2k) != $nFourBytesCount)
  323. {
  324. die "Error in assumption, $gbk2k, $nFourBytesCount";
  325. }
  326. $nFourBytesCount++;
  327. $FourBytesBuffer[$ucpGBK2K] = $gbk2k;
  328. } else
  329. {
  330. #
  331. # There is no GBK encoding for this character. And
  332. # the new GBK2K encoding is 2 bytes.
  333. #
  334. #printf "##%04x::%s\n", $ucpGBK2K, $gbk2k;
  335. $TwoBytesBuffer[$nTwoBytesDiffCount] = $gbk2k;
  336. $FourBytesBuffer[$ucpGBK2K] = $nTwoBytesDiffCount;
  337. $GBK2KTwoBytes{$gbk2k} = $ucpGBK2K;
  338. $nTwoBytesDiffCount++;
  339. $gGBTwoBytesToUnicode{$gbk2k} = $ucpGBK2K;
  340. }
  341. }