Leaked source code of Windows Server 2003
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

188 lines
4.1 KiB

  #
  # dirclass_extract.pl
  #
  # [email protected]
  # 31 Jul 1998
  #
  # Generates a 64K line file of the directional classifications of the Unicode
  # characters. Each line is of the format UUUU DDD where UUUU is the Unicode
  # codepoint (in hex) and DDD is the three-letter direction classification.
  # The input file is the Unicode character database, which can be found at
  # ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData-Latest.txt
  #
  # Input is read from the files named on the command line (or STDIN when
  # none are given); the table is written to STDOUT.
  #
  use strict;
  use warnings;

  # Highest codepoint written; the output always covers 0000..FFFF.
  use constant LAST_CODEPOINT => 0xFFFF;

  # Maps the bidirectional category field of the character database to the
  # classification name written to the output. A category that is not listed
  # here is a fatal error.
  my %DIR_CLASS_FOR = (
      L   => 'LTR',
      R   => 'RTL',
      AL  => 'ARA',
      WS  => 'WSP',
      S   => 'SEG',
      B   => 'BLK',
      ON  => 'NEU',
      BN  => 'NEU',
      NSM => 'CBN',
      AN  => 'ANM',
      EN  => 'ENM',
      ET  => 'ETM',
      ES  => 'ESP',
      CS  => 'CSP',
      LRE => 'FMT',
      LRO => 'FMT',
      RLE => 'FMT',
      RLO => 'FMT',
      PDF => 'FMT',
  );

  # Splits one line of the character database into the fields this script
  # needs.
  #
  #   $line    - raw input line
  #   $line_no - input line number, used only in the error message
  #
  # Returns (codepoint as a number, name, combining class, bidi category).
  # Returns an empty list for lines that contribute nothing to a 64K table:
  # blank lines and records whose codepoint has more than four hex digits.
  # Dies on any other line that does not have the expected field layout.
  sub parse_record {
      my ($line, $line_no) = @_;

      return if $line =~ /^\s*$/;
      return if $line =~ /^[0-9a-fA-F]{5,6};/;

      my ($hex, $name, $comb, $dir) =
          $line =~ /^([0-9a-fA-F]{4});([^;]*);[^;]*;([\d]*);(\w+);/
          or die sprintf("Malformed input on line %d\n", $line_no);

      return (hex($hex), $name, $comb, $dir);
  }

  # Returns the output classification for one character.
  #
  #   $char - codepoint (number)
  #   $comb - combining class field (digits, possibly empty)
  #   $dir  - bidi category field
  #
  # Dies if $dir is not a known category.
  sub direction_class {
      my ($char, $comb, $dir) = @_;

      my $class = $DIR_CLASS_FOR{$dir};
      if (!defined $class) {
          die sprintf("Unknown direction type \'%s\' for character %d\n", $dir, $char);
      }

      # NB (mikejoch) The '+' and '-' characters have classifications
      # which are not strictly Unicode. This is for compatibility with older
      # Windows implementations, which used the following classifications.
      # If Unicode changes these classifications then we can remove the
      # special casing.
      if ($char == 0x002B || $char == 0x002D) {
          $class = 'NEU';
      }

      # A non-zero combining class overrides everything else, including
      # the special cases above.
      if (length($comb) && $comb != 0) {
          $class = 'CBN';
      }

      return $class;
  }

  # Reads character database records from $in and writes one "UUUU DDD" line
  # to $out for every codepoint from 0000 to FFFF.
  #
  # Codepoints missing from the input are written as UNK, except those lying
  # between a "<..., First>" record and its "<..., Last>" record, which take
  # the classification of the Last record.
  #
  # Dies if the records are not in strictly increasing codepoint order, if a
  # First record is followed (after a gap) by anything but a Last record, or
  # if a record cannot be parsed or classified.
  sub extract_dirclasses {
      my ($in, $out) = @_;

      my $last_char = -1;    # last codepoint written so far
      my $in_range  = 0;     # true when the previous record was a First marker

      while (my $line = <$in>) {
          my @fields = parse_record($line, $.);
          next if !@fields;
          my ($char, $name, $comb, $dir) = @fields;

          my $class = direction_class($char, $comb, $dir);

          if ($char <= $last_char) {
              die sprintf("Character %04X out of order!\n", $char);
          }

          if ($char != $last_char + 1) {
              # There is a gap before this record; decide how to fill it.
              my $fill = 'UNK';
              if ($in_range) {
                  $name =~ /^<[^,]*, Last>$/
                      or die sprintf("Unclosed range before character %04X!\n", $char);
                  $fill = $class;
              }
              printf {$out} "%04X %s\n", $_, $fill
                  for $last_char + 1 .. $char - 1;
          }

          printf {$out} "%04X %s\n", $char, $class;
          $in_range  = ($name =~ /^<[^,]*, First>$/);
          $last_char = $char;
      }

      # Pad the table out to FFFF. Starting from $last_char (not from the
      # loop's working variable) keeps 0000 in the output for empty input.
      printf {$out} "%04X UNK\n", $_
          for $last_char + 1 .. LAST_CODEPOINT;

      return;
  }

  # Run the conversion only when executed as a script, so the subroutines
  # above can be loaded (require/do) without consuming input.
  extract_dirclasses(\*ARGV, \*STDOUT) unless caller;