You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
96 lines
2.2 KiB
96 lines
2.2 KiB
#!/usr/bin/perl
#
# unicodepartition_extract.pl
#
# [email protected]
# June 16, 1998
#
# Generates a 64K line output of the form HHHH NNNN, where HHHH is the Unicode
# codepoint in hex, and NNNN is the partition name. The input file is the
# spec HTML file as prepared by michelsu. As of this writing, this file can
# be located at http://ie/specs/secure/trident/text/unicode_partitions.htm
#
# The output of this script file can be analyzed by the associated script
# unicodepartition_analyze.pl. That script will indicate what codepoints are
# multiply defined, and which are not covered.
#

use strict;
use warnings;

# Main script body.
#
# Reads the input named on the command line (or STDIN) via <>, skips ahead
# to the first line beginning with "<h3>1.", and then prints one
# "hhhh tag" line (lower-case 4-digit hex codepoint, then the tag) for
# every codepoint in every range it recognises.
#
# Three line shapes are recognised once the start marker has been passed:
#
#   1. A line starting with <p class="partition" -- opens a partition
#      paragraph; the tag and range are looked for after a '>'.
#   2. Inside a paragraph, a line starting with a space -- carries only a
#      range (after "nbsp;"), and reuses the most recently seen tag.
#   3. Inside a paragraph, any other line -- must start with the tag,
#      followed by the range.
#
# A range is either "HHHH-HHHH" or a single "HHHH".  In every shape the
# two-ended form is tried first and the single-codepoint form second.
# The paragraph ends on a line whose text ends with "/p>" (optionally
# followed by spaces or tabs).

# Exactly four hexadecimal digits.
my $HEX4 = qr/[0-9a-fA-F]{4}/;

# Skip everything up to and including the start marker.  The scan stops at
# end of input: without that check a missing marker would make <> return
# undef indefinitely (or fall back to reading STDIN) and never terminate.
my $found_start = 0;
while (defined(my $line = <>))
{
    if (substr($line, 0, 6) eq "<h3>1.")
    {
        $found_start = 1;
        last;
    }
}
die "$0: no line starting with '<h3>1.' found in input\n"
    unless $found_start;

my $in_partition = 0;   # true while inside a <p class="partition"> paragraph
my $tag          = '';  # tag from the most recent tagged line
my ($rmin, $rmax);      # inclusive codepoint range parsed from the current line

while (defined(my $line = <>))
{
    # Set only when this line yielded a usable range.
    my $have_range = 0;

    if ($line =~ /^<p class="partition"/)
    {
        $in_partition = 1;

        if ($line =~ />(\w{4}) ($HEX4)-($HEX4)/)
        {
            ($tag, $rmin, $rmax) = ($1, hex($2), hex($3));
            $have_range = 1;
        }
        elsif ($line =~ />(\w{4}) ($HEX4)/)
        {
            $tag  = $1;
            $rmin = $rmax = hex($2);
            $have_range = 1;
        }
    }
    elsif ($in_partition)
    {
        if ($line =~ /^ /)
        {
            # Continuation line: range only, tag carried over.
            if ($line =~ /nbsp; +($HEX4)-($HEX4)/)
            {
                ($rmin, $rmax) = (hex($1), hex($2));
                $have_range = 1;
            }
            elsif ($line =~ /nbsp; +($HEX4)/)
            {
                $rmin = $rmax = hex($1);
                $have_range = 1;
            }
        }
        else
        {
            # Tagged line inside the paragraph.
            if ($line =~ /^(\w{4}) +($HEX4)-($HEX4)/)
            {
                ($tag, $rmin, $rmax) = ($1, hex($2), hex($3));
                $have_range = 1;
            }
            # The single codepoint must be followed by a space, '&' or '<'.
            elsif ($line =~ /^(\w{4}) +($HEX4)[ &<]/)
            {
                $tag  = $1;
                $rmin = $rmax = hex($2);
                $have_range = 1;
            }
        }
    }

    # $have_range is only ever set while inside a partition paragraph.
    if ($have_range)
    {
        # A reversed range ($rmin > $rmax) yields no output.
        for my $cp ($rmin .. $rmax)
        {
            printf("%04x %s\n", $cp, $tag);
        }
    }

    # Checked after emitting so that a closing line's own range is printed.
    $in_partition = 0 if $line =~ m{/p>[ \t]*$};
}