Leaked source code of windows server 2003
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

396 lines
11 KiB

#
# GenGBK2K.pl
#
# Generate GBK2K (GB18030) codepage files by taking input from 936.txt and no80.txt.
#
use strict "vars";
#
# Command line handling and file-scope state.
#
# Two arguments are required: the NLS 936 code page table and the
# Unicode-to-GB18030 mapping table.  Both are read below, so the usage
# text is shown unless both were supplied.
#
if (@ARGV < 2)
{
    ShowUsage();
    exit 1;
}

my $sCodePage936File = $ARGV[0];   # The NLS 936 codepage table.
my $sGB18030File = $ARGV[1];       # The Unicode to GB18030 conversion table.
my $sTableFile = "tables.cpp";     # The generated C source.

my $nTwoBytesDiffCount = 0;        # Number of entries stored in @TwoBytesBuffer.
my $nFourBytesCount = 0;           # Number of four-byte encodings recorded so far.

# Indexed by Unicode code point.  An element is either an 8-hex-digit
# four-byte encoding string, or a numeric index into @TwoBytesBuffer.
my @FourBytesBuffer;
my @TwoBytesBuffer;                # Two-byte encoding strings, in the order recorded.
my %GBK2KTwoBytes;                 # GB18030 two-byte encoding => Unicode code point.
my %gGBTwoBytesToUnicode;          # Two-byte encoding => Unicode code point (all recorded pairs).

# Three-argument open keeps the file name from being parsed as a mode, and
# $! tells the user why the open failed.  The bareword handle names are used
# throughout the rest of the script, so they are kept.
open(CODEPAGE936, '<', $sCodePage936File)
    or die "Error in opening $sCodePage936File: $!\n";
open(GB18030, '<', $sGB18030File)
    or die "Error in opening $sGB18030File: $!\n";
open(TABLES, '>', $sTableFile)
    or die "Error in creating $sTableFile: $!\n";
#
# Walk the WCTABLE section of the 936 table and, for each of its entries,
# advance through the GB18030 table until the same Unicode code point is
# reached.
#
#  - GB18030 lines passed over on the way (code points with no 936 entry)
#    are recorded through GenerateGBK2K().
#  - On the matching line: ASCII code points (<= 0x7f) record nothing, an
#    encoding identical to the 936 one is only added to
#    %gGBTwoBytesToUnicode, and a differing encoding is recorded through
#    GenerateGBK2K() exactly like a code point that has no 936 entry.
#
# The GB18030 handle is only ever read forward, so this presumably relies on
# both tables being sorted by code point -- TODO confirm against the inputs.
#
Goto936WCTable();
while (my $sLine936 = <CODEPAGE936>)
{
    if ($sLine936 =~ /0x(\w\w\w\w)\s+0x(\w+)\s+(.*)/)
    {
        my $ucp = hex(lc($1));   # Unicode code point from the 936 table.
        my $gb = lc($2);         # Its encoding in code page 936.

        while (my $sLineGB = <GB18030>)
        {
            # Lines that do not look like a mapping are skipped.
            next unless $sLineGB =~ /(\w\w\w\w)\s+(\w+)/;

            my $ucpGBK2K = hex(lc($1));   # Unicode code point
            my $gbk2k = lc($2);           # GB18030 encoding for this code point.

            if ($ucp != $ucpGBK2K)
            {
                # No 936 entry for this code point.
                GenerateGBK2K($ucpGBK2K, $gbk2k);
                next;
            }

            if ($ucp <= 0x7f)
            {
                # ASCII range: nothing is recorded.
            }
            elsif ($gb eq $gbk2k)
            {
                # Same encoding in both tables.
                $gGBTwoBytesToUnicode{$gb} = $ucp;
            }
            else
            {
                # The encodings differ.  The bookkeeping is the same as for
                # a code point without a 936 entry, so share the helper
                # (which also reports the offending encoding if the
                # four-byte offset check fails).
                GenerateGBK2K($ucp, $gbk2k);
            }
            last;   # Resume with the next 936 entry.
        }
    }

    # Progress indicator on the console.
    if ($nFourBytesCount % 256 == 0)
    {
        print ".";
    }
}
#
# The 936 table is exhausted; every mapping still left in the GB18030
# table is recorded through GenerateGBK2K().
#
while (defined($_ = <GB18030>))
{
    next unless length($_);
    next unless /(\w\w\w\w)\s+(\w+)/;
    # $1 is the Unicode code point, $2 its GB18030 encoding (both hex text).
    GenerateGBK2K(hex(lc($1)), lc($2));
}
################################################################
#
# Emit the Unicode to GB18030 lookup tables into the C source.
#
################################################################
print "\n";
printf "Two Bytes diff : %d (%x)\n", $nTwoBytesDiffCount, $nTwoBytesDiffCount;
printf "Four Bytes count: %d (%x)\n", $nFourBytesCount, $nFourBytesCount;
print "Generating $sTableFile file...";

# File-scope scratch variables.  $i, $j and $nWCLines are used again by the
# table generation code further down in the script, so they are declared here.
my $nWCLines = $nFourBytesCount + $nTwoBytesDiffCount;
my $i;
my $j;
my $lineCount = 0;

print TABLES '#include <share.h>', "\n";
print TABLES '#include "c_gb18030.h"', "\n\n";

#
# g_wUnicodeToGBTwoBytes: every recorded two-byte encoding, written as a
# high byte followed by a low byte, eight encodings per output line.
#
print TABLES "BYTE g_wUnicodeToGBTwoBytes[] = \n";
print TABLES "{\n";
my $nTwoBytesIndex = 0;
foreach my $sTwoBytes (@TwoBytesBuffer)
{
    print TABLES "\n" if $nTwoBytesIndex % 8 == 0;
    my $nEncoding = hex($sTwoBytes);
    printf TABLES "0x%02x,", $nEncoding >> 8;
    printf TABLES "0x%02x, ", $nEncoding & 0xff;
    $nTwoBytesIndex++;
}
print TABLES "\n\n};\n\n";

my $nMaxOffset = $nFourBytesCount + $nTwoBytesDiffCount;
printf TABLES "WORD g_wMax4BytesOffset = 0x%04x; // %d \n\n", $nMaxOffset, $nMaxOffset;

#
# g_wUnicodeToGB: one WORD per code point U+0000 .. U+FFFF.
#   - four-byte encoding : the value of GetFourBytesOffset(), written as-is
#   - two-byte encoding  : 0xfffe minus the index into the two-byte table
#   - nothing recorded   : 0xffff
#
print TABLES "WORD g_wUnicodeToGB[] = \n";
print TABLES "{\n";
foreach my $ucp (0 .. 0xffff)
{
    my $entry = $FourBytesBuffer[$ucp];
    my $wValue = 0xffff;
    if (defined($entry))
    {
        $wValue = (length($entry) == 8)
            ? GetFourBytesOffset($entry)
            : 0xfffe - $entry;
        $lineCount++;
    }
    printf TABLES ("0x%04x, ", $wValue);

    # Close each row of eight values with the code point range it covers.
    if ($ucp % 8 == 7)
    {
        printf TABLES " // U+%04x ~ U+%04x\n", $ucp - 7, $ucp;
    }
}
print TABLES "\n\n};\n";
#
# Tally the lead bytes of the two-byte encodings stored in %GBK2KTwoBytes.
# The lead byte is the first two hex characters of the encoding string;
# %GBK2KLeadBytes maps it to the number of encodings that start with it.
#
my @GBK2KTwoBytesArray = sort(keys(%GBK2KTwoBytes));
my %GBK2KLeadBytes;
foreach my $sTwoBytes (@GBK2KTwoBytesArray)
{
    $GBK2KLeadBytes{lc(substr($sTwoBytes, 0, 2))}++;
}
my @GBK2KLeadBytesArray = sort(keys(%GBK2KLeadBytes));
################################################################
#
# Emit the GB18030 to Unicode lookup tables into the C source.
#
################################################################
$nWCLines = $nFourBytesCount;
print TABLES "\n\n";

#
# g_wGBFourBytesToUnicode: the code points that were given a four-byte
# encoding, listed in ascending code point order, eight per output line.
# Presumably the Nth value belongs to four-byte linear offset N; that only
# holds if the encodings were recorded in ascending code point order --
# TODO confirm.
#
print TABLES "WORD g_wGBFourBytesToUnicode[] = {\n\n";
my $nCount = 0;
foreach my $ucp (0 .. $#FourBytesBuffer)
{
    my $entry = $FourBytesBuffer[$ucp];
    next unless defined($entry) && length($entry) == 8;

    printf TABLES ("0x%04x, ", $ucp);
    $nCount++;

    # After every eighth value, note the range of positions just written.
    next if $nCount % 8 != 0;
    printf TABLES " // Offset: %04x ~ %04x\n", ($nCount - 8 ), $nCount - 1;
}
print TABLES "\n\n};\n\n\n";
#
# Write, as a C comment, the lead bytes (0x80 .. 0xff) that occur in
# %GBK2KLeadBytes.
#
print TABLES "// The following lead bytes will be converted to different Unicode values compared with GBK:\n// ";
foreach my $nLead (0x80 .. 0xff)
{
    next unless defined($GBK2KLeadBytes{sprintf("%02x", $nLead)});
    printf TABLES "0x%02x, ", $nLead;
}
print TABLES "\n\n";

#
# g_wGBLeadByteOffset: one WORD for each lead byte 0x80 .. 0xff.
# A lead byte without recorded encodings gets 0x0000; the others get
# 256, 512, 768, ... in ascending lead byte order.
#
print TABLES "WORD g_wGBLeadByteOffset[] =\n";
print TABLES "{\n\n";
my $sOffsetIndex = 1;
foreach my $nLead (0x80 .. 0xff)
{
    my $nOffset = 0;
    if (defined($GBK2KLeadBytes{sprintf("%02x", $nLead)}))
    {
        $nOffset = $sOffsetIndex * 256;
        $sOffsetIndex++;
    }
    printf TABLES "0x%04x, ", $nOffset;

    # Close each row of eight values with the lead byte range it covers.
    if ($nLead % 8 == 7)
    {
        printf TABLES " // Lead byte %02x ~ %02x\n", ($nLead - 7), $nLead;
    }
}
print TABLES "};\n\n";
#
# g_wUnicodeFromGBTwoBytes: for every lead byte 0x81 .. 0xff that occurs in
# %GBK2KLeadBytes, a run of 256 WORDs indexed by trailing byte.  Each WORD is
# the code point stored in %gGBTwoBytesToUnicode for that lead/trail pair,
# or 0x0000 when the pair has no entry.
#
# NOTE(review): this loop starts at 0x81 while the lead byte offset table is
# generated from 0x80; the two only line up if 0x80 never occurs as a lead
# byte -- confirm.
#
print TABLES "WORD g_wUnicodeFromGBTwoBytes[] =\n";
print TABLES "{\n";
foreach my $nLead (0x81 .. 0xff)
{
    my $sLeadByte = sprintf("%02x", $nLead);
    next unless defined($GBK2KLeadBytes{$sLeadByte});

    print TABLES "\n// Lead Byte: 0x$sLeadByte\n";
    foreach my $nTrail (0x00 .. 0xff)
    {
        my $ucp = $gGBTwoBytesToUnicode{sprintf("%02x%02x", $nLead, $nTrail)};
        printf TABLES "0x%04x, ", (defined($ucp) ? $ucp : 0);

        # Close each row of eight values with the trailing byte range.
        if ($nTrail % 8 == 7)
        {
            printf TABLES " // Trailing byte %02x ~ %02x\n", ($nTrail - 7), $nTrail;
        }
    }
}
print TABLES "\n};\n";
#
# Release the file handles.  The two inputs were only read, so their close
# status is not interesting.  The output is different: buffered data is
# flushed when the handle is closed, so a write failure (for example a full
# disk) may only surface here and must not be reported as success.
#
close CODEPAGE936;
close GB18030;
close(TABLES) or die "Error in writing $sTableFile: $!\n";
print "\n\n$sTableFile is created successfully.\n";
#
# ShowUsage
#
# Print a short description of the script and its two arguments to
# standard output.  Takes no arguments.
#
# NOTE(review): the usage line names the script GenTables.pl while the
# header comment of this file says GenGBK2K.pl -- confirm which is current.
#
sub ShowUsage()
{
    my @UsageText = (
        "Generate GB18030 data table tables.cpp from 936 table and GB18030 mapping table.\n",
        "tables.cpp is needed for compiling c_g18030.dll.",
        "\n",
        "Usage: GenTables.pl [Path to 936.txt] [Path to no80.txt]\n",
    );
    print @UsageText;
}
#
# Goto936WCTable
#
# Read lines from the CODEPAGE936 handle until one containing the text
# WCTABLE has been consumed, so that the next read returns the line after
# it.  If no such line exists the handle is left at end of file.
# The line read last is left in $_.
#
sub Goto936WCTable()
{
    while (defined($_ = <CODEPAGE936>))
    {
        last if /WCTABLE/;
    }
}
#
# GetFourBytesOffset
#
# Convert a four-byte encoding, given as a string of eight hex digits, into
# a linear offset.  The four bytes are treated as digits of a mixed-radix
# number: bytes 1 and 3 are counted from 0x81, bytes 2 and 4 from 0x30, and
# the weights are 10*126*10, 126*10, 10 and 1.
#
#   $sGBK2K  - eight hex digits, e.g. "81308130"
#   returns  - the offset; "81308130" gives 0
#
sub GetFourBytesOffset
{
    my ($sGBK2K) = @_;

    # Split the string into its four byte values.
    my ($b1, $b2, $b3, $b4) = map { hex(substr($sGBK2K, 2 * $_, 2)) } 0 .. 3;

    # Horner form of the weighted sum described above.
    my $nOffset = $b1 - 0x81;
    $nOffset = $nOffset * 10 + ($b2 - 0x30);
    $nOffset = $nOffset * 126 + ($b3 - 0x81);
    $nOffset = $nOffset * 10 + ($b4 - 0x30);
    return $nOffset;
}
#
# GenerateGBK2K
#
# Record the GB18030 encoding of one Unicode code point in the file-scope
# tables.
#
#   first argument  - Unicode code point (number)
#   second argument - its encoding as a hex string
#
# An encoding of eight hex digits is a four-byte encoding: its linear offset
# must equal the number of four-byte encodings recorded so far (the script
# dies otherwise), and the string itself is stored in @FourBytesBuffer.
# Any other length is handled as a two-byte encoding: the string is appended
# to @TwoBytesBuffer, its index is stored in @FourBytesBuffer, and both
# encoding-to-code-point hashes are updated.
#
sub GenerateGBK2K
{
    my ($ucp, $gbk2k) = @_;

    if (length($gbk2k) != 8)
    {
        # Two-byte encoding: take the next free slot of the two-byte table.
        my $nSlot = $nTwoBytesDiffCount++;
        $TwoBytesBuffer[$nSlot] = $gbk2k;
        $FourBytesBuffer[$ucp] = $nSlot;
        $GBK2KTwoBytes{$gbk2k} = $ucp;
        $gGBTwoBytesToUnicode{$gbk2k} = $ucp;
        return;
    }

    # Four-byte encoding: offsets are expected to arrive without gaps.
    die "Error in assumption, $gbk2k, $nFourBytesCount"
        unless GetFourBytesOffset($gbk2k) == $nFourBytesCount;
    $FourBytesBuffer[$ucp] = $gbk2k;
    $nFourBytesCount++;
}