Leaked source code of windows server 2003
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

125 lines
3.5 KiB

  1. #
  2. # Compare.pl
  3. #
  4. # Compare the WCTABLE section of 936.txt against no80.txt and report the differences
  5. #
  use strict;

  #
  # Usage: Compare [Path to 936.txt] [Path to no80.txt]
  #
  # Walks the WCTABLE section of the code page 936 file and the GB18030 file
  # side by side and prints one line per GB18030 entry that is visited:
  #
  #   ucp:gb          same encoding in both files (or ucp <= 0x7f)
  #   >>ucp:gb:gbk2k  encodings differ, GB18030 encoding is 4 bytes
  #   ##ucp:gb:gbk2k  encodings differ, GB18030 encoding is not 4 bytes
  #   >>ucp::gbk2k    no 936 entry for this code point, GB18030 is 4 bytes
  #   ##ucp::gbk2k    no 936 entry for this code point, GB18030 is not 4 bytes
  #
  # A summary of the three counters is printed at the end.
  #
  # NOTE(review): the walk only works if both files list code points in the
  # same ascending order -- TODO confirm against the real input files.  A 936
  # code point that never appears in the GB18030 file drains the GB18030
  # handle, and GB18030 entries after the last 936 entry are never reported.
  #

  # Both file names are required; a single argument is not enough.
  if (@ARGV < 2)
  {
      ShowUsage();
      exit 1;
  }

  my $CodePage936File = $ARGV[0];
  my $GB18030File     = $ARGV[1];

  my $twoBytesDiffCount  = 0;   # code point in both files, encodings differ
  my $twoBytesDiffCount2 = 0;   # code point only in GB18030, not 4 bytes
  my $fourBytesCount     = 0;   # code point only in GB18030, 4 bytes

  # The bareword handles are kept because Goto936WCTable() reads CODEPAGE936.
  # Three-argument open keeps odd characters in a file name from being
  # interpreted as an open mode.
  open(CODEPAGE936, '<', $CodePage936File)
      or die "Error in opening $CodePage936File: $!\n";
  open(GB18030, '<', $GB18030File)
      or die "Error in opening $GB18030File: $!\n";

  # Skip everything in the 936 file up to and including the WCTABLE marker.
  Goto936WCTable();

  ENTRY936:
  while (<CODEPAGE936>)
  {
      # Only lines of the form "0xUUUU 0xGGGG comment" are table entries.
      next ENTRY936 unless /0x(\w\w\w\w)\s+0x(\w+)\s+(.*)/;

      my $ucp = hex($1);    # Unicode code point
      my $gb  = lc($2);     # GBK encoding for this code point

      # Advance through the GB18030 file until the same code point is found.
      # Every entry passed on the way has no counterpart in the 936 table.
      ENTRYGB:
      while (<GB18030>)
      {
          next ENTRYGB unless /(\w\w\w\w)\s+(\w+)/;

          my $ucpGBK2K    = hex($1);    # Unicode code point
          my $gbk2k       = lc($2);     # GBK2K encoding for this code point
          my $isFourBytes = (length($gbk2k) == 8);    # 8 hex digits = 4 bytes

          if ($ucp == $ucpGBK2K)
          {
              if ($ucp <= 0x7f || $gb eq $gbk2k)
              {
                  # ASCII range, or both files agree on the encoding.
                  printf "%04x:%s\n", $ucp, $gb;
              }
              elsif ($isFourBytes)
              {
                  # The GBK and GBK2K encodings are different, and the new
                  # GBK2K encoding has 4 bytes.
                  printf ">>%04x:%s:%s\n", $ucp, $gb, $gbk2k;
                  $twoBytesDiffCount++;
              }
              else
              {
                  # The GBK and GBK2K encodings are different, and the new
                  # GBK2K encoding does not have 4 bytes.
                  printf "##%04x:%s:%s\n", $ucp, $gb, $gbk2k;
                  $twoBytesDiffCount++;
              }

              # Matched: go on to the next 936 entry.
              last ENTRYGB;
          }

          if ($isFourBytes)
          {
              # There is no GBK encoding for this character, and the new
              # GBK2K encoding is 4 bytes.
              printf ">>%04x::%s\n", $ucpGBK2K, $gbk2k;
              $fourBytesCount++;
          }
          else
          {
              # There is no GBK encoding for this character, and the new
              # GBK2K encoding is not 4 bytes.
              printf "##%04x::%s\n", $ucpGBK2K, $gbk2k;
              $twoBytesDiffCount2++;
          }
      }
  }

  print "\n";
  printf "Two Bytes diff : %d (%x)\n", $twoBytesDiffCount, $twoBytesDiffCount;
  printf "Two Bytes diff2 : %d (%x)\n", $twoBytesDiffCount2, $twoBytesDiffCount2;
  printf "Four Bytes count: %d (%x)\n", $fourBytesCount, $fourBytesCount;

  close CODEPAGE936;
  close GB18030;
  #
  # ShowUsage
  #
  # Print the one-line command synopsis to the currently selected output
  # handle.  Takes no arguments.
  #
  sub ShowUsage()
  {
      my $usage = "Compare [Path to 936.txt] [Path to no80.txt]\n";
      print $usage;
  }
  #
  # Goto936WCTable
  #
  # Read lines from the CODEPAGE936 handle and discard them, stopping right
  # after the first line that contains "WCTABLE".  If no such line exists the
  # handle is left at end of file.  Takes no arguments; uses $_ as the read
  # buffer, so the caller's $_ is overwritten.
  #
  sub Goto936WCTable()
  {
      while (defined($_ = <CODEPAGE936>))
      {
          last if m/WCTABLE/;
      }
  }