Source code of Windows XP (NT5)
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

285 lines
12 KiB

  1. //+--------------------------------------------------------------------------
  2. //
  3. // Microsoft Windows
  4. // Copyright (C) Microsoft Corporation, 1995 - 1999.
  5. //
  6. // File: ctplus.c
  7. //
  8. // Contents: Contains character type (orthography) data and routine
  9. // to get at it.
  10. //
  11. // History: 23-May-96 pathal Created.
  12. //
  13. //---------------------------------------------------------------------------
  14. #include "precomp.h"
  15. #include <winnls.h>
//----------------------------------------------------------------------------
// s_abCharTypeList
//
// Character-type lookup table used by GetCharType.
//
// The array storage starts at -1, so that EOF can be found in the array. It
// depends on (EOF == -1) being true. Also, all references to it must be
// of the form (s_abCharTypeList+1)[x].
//
// The index ranges below are relative to (s_abCharTypeList+1):
//
// -1
//      EOF
//
// 000-07F
//      The lower 0x80 entries of the ASCII code page (0000-007F) are mapped
//      in place (ex. UNICODE 0009 (HT) == 009).
//      The word characters are:   $,0-9,A-Z,_,a-z
//      The word separators are:   bs,tab,lf,vtab,cr,spc,
//                                 ",#,%,&,',(,),*,+,comma,-,/,
//                                 :,;,<,=,>,@,[,],`
//      The phrase separators are: !,.,?,\,^,{,|,},~
//
//      NOTE: Symbols are treated as WS or PS.
//
//      NOTE(review): the data below does not match the lists above in a
//      few places: 0x1E, '%' and '~' are stored as CH, and the single
//      quote (0x27) is stored as PS. Confirm which side is intended
//      before changing either.
//
// 080-0FF
//      The lower 0x80 entries of the Half Width Variant code page
//      (FF00-FF7F) are mapped to 080-0FF.
//
// 100-1FF
//      The General Punctuation code page (2000-20FF) is mapped to 100-1FF.
//
// 200-2FF
//      The CJK Auxiliary code page (3000-30FF) is mapped to 200-2FF.
//
// pathal - 5/20/96
// Special default character processing for selection
// The following is a list of white space characters that T-Hammer will not
// right select on:
//      0x0009 (tab), 0x0020 (ansi space), 0x2005 (narrow space),
//      0x3000 (wide space)
// (Note: see AnalyzeHPBs for special end SPB processing of adjacent white
// space)
// The following is a list of nls characters to be treated as text by
// T-Hammer (in other words T-Hammer will neither right nor left-select on
// them):
//      0x001F (non-required hyphen), 0x0027 (single quote),
//      0x2019 (right quote), 0x200C (non-width optional break),
//      0x200D (non-width no break)
// NOTE(review): in this table 0x2019, 0x200C and 0x200D are CH, but the CH
// entry in the control range sits at 0x1E (not 0x1F) and 0x27 is PS;
// presumably those cases are handled elsewhere or the data/comment drifted.
//----------------------------------------------------------------------------
const BYTE
s_abCharTypeList[0x301] =
{
    (BYTE) -1, // EOF (-1)

    // 0000-007F: ASCII, mapped in place
    PS,PS,PS,PS,PS,PS,PS,PS, WS,WS,WS,WS,PS,WS,PS,PS, // 0000-000F (decimal 0 - 15)
    PS,PS,PS,PS,PS,PS,PS,PS, PS,PS,PS,PS,PS,PS,CH,PS, // 0010-001F (decimal 16 - 31)
    WS,PS,WS,WS,CH,CH,WS,PS, // sp ! " # $ % & '
    WS,WS,WS,WS,WS,WS,PS,WS, // ( ) * + , - . /
    CH,CH,CH,CH,CH,CH,CH,CH, // 0 1 2 3 4 5 6 7
    CH,CH,WS,WS,WS,WS,WS,PS, // 8 9 : ; < = > ?
    WS,CH,CH,CH,CH,CH,CH,CH, // @ A B C D E F G
    CH,CH,CH,CH,CH,CH,CH,CH, // H I J K L M N O
    CH,CH,CH,CH,CH,CH,CH,CH, // P Q R S T U V W
    CH,CH,CH,WS,PS,WS,PS,CH, // X Y Z [ \ ] ^ _
    WS,CH,CH,CH,CH,CH,CH,CH, // ` a b c d e f g
    CH,CH,CH,CH,CH,CH,CH,CH, // h i j k l m n o
    CH,CH,CH,CH,CH,CH,CH,CH, // p q r s t u v w
    CH,CH,CH,PS,PS,PS,CH,PS, // x y z { | } ~ del

    // FF00-FF7F: half-width variants, mapped to 080-0FF.
    // Glyph labels name the ASCII character at the same low-byte offset.
    WS,PS,WS,WS,CH,CH,WS,WS, // FF00-FF07 (sp ! " # $ % & ')
    WS,WS,WS,WS,WS,WS,PS,WS, // FF08-FF0F ( ) * + , - . /
    VC,VC,VC,VC,VC,VC,VC,VC, // FF10-FF17 0 1 2 3 4 5 6 7
    VC,VC,WS,WS,WS,WS,WS,PS, // FF18-FF1F 8 9 : ; < = > ?
    WS,VC,VC,VC,VC,VC,VC,VC, // FF20-FF27 @ A B C D E F G
    VC,VC,VC,VC,VC,VC,VC,VC, // FF28-FF2F H I J K L M N O
    VC,VC,VC,VC,VC,VC,VC,VC, // FF30-FF37 P Q R S T U V W
    VC,VC,VC,WS,VC,WS,PS,VC, // FF38-FF3F X Y Z [ \ ] ^ _
    WS,VC,VC,VC,VC,VC,VC,VC, // FF40-FF47 ` a b c d e f g
    VC,VC,VC,VC,VC,VC,VC,VC, // FF48-FF4F h i j k l m n o
    VC,VC,VC,VC,VC,VC,VC,VC, // FF50-FF57 p q r s t u v w
    VC,VC,VC,PS,PS,PS,VC,PS, // FF58-FF5F x y z { | } ~ del
    VC,PS,WS,WS,WS,WS,VC,VC, // FF60-FF67
    VC,VC,VC,VC,VC,VC,VC,VC, // FF68-FF6F
    VC,VC,VC,VC,VC,VC,VC,VC, // FF70-FF77
    VC,VC,VC,VC,VC,VC,VC,VC, // FF78-FF7F

    // 2000-20FF: General Punctuation, mapped to 100-1FF
    WS,WS,WS,WS,WS,WS,WS,WS, // 2000-2007
    WS,WS,WS,WS,CH,CH,WS,WS, // 2008-200F
    WS,CH,WS,WS,WS,KC,PS,WS, // 2010-2017
    WS,CH,WS,WS,WS,WS,WS,WS, // 2018-201F
    WS,WS,PS,PS,PS,PS,PS,CH, // 2020-2027
    PS,PS,CH,CH,CH,CH,CH,PS, // 2028-202F
    WS,WS,WS,WS,WS,WS,WS,WS, // 2030-2037
    WS,WS,WS,WS,WS,WS,WS,WS, // 2038-203F
    WS,WS,WS,PS,WS,PS,PS,PS, PS,PS,PS,PS,PS,PS,PS,PS, // 2040-204F
    PS,PS,PS,PS,PS,PS,PS,PS, PS,PS,PS,PS,PS,PS,PS,PS, // 2050-205F
    PS,PS,PS,PS,PS,PS,PS,PS, PS,PS,PS,PS,PS,PS,PS,PS, // 2060-206F
    PS,PS,PS,PS,PS,PS,PS,PS, PS,PS,PS,PS,PS,PS,PS,PS, // 2070-207F
    PS,PS,PS,PS,PS,PS,PS,PS, PS,PS,PS,PS,PS,PS,PS,PS, // 2080-208F
    PS,PS,PS,PS,PS,PS,PS,PS, PS,PS,PS,PS,PS,PS,PS,PS, // 2090-209F
    PS,PS,PS,PS,PS,PS,PS,PS, PS,PS,PS,PS,PS,PS,PS,PS, // 20A0-20AF
    PS,PS,PS,PS,PS,PS,PS,PS, PS,PS,PS,PS,PS,PS,PS,PS, // 20B0-20BF
    PS,PS,PS,PS,PS,PS,PS,PS, PS,PS,PS,PS,PS,PS,PS,PS, // 20C0-20CF
    PS,PS,PS,PS,PS,PS,PS,PS, PS,PS,PS,PS,PS,PS,PS,PS, // 20D0-20DF
    PS,PS,PS,PS,PS,PS,PS,PS, PS,PS,PS,PS,PS,PS,PS,PS, // 20E0-20EF
    PS,PS,PS,PS,PS,PS,PS,PS, PS,PS,PS,PS,PS,PS,PS,PS, // 20F0-20FF

    // 3000-30FF: CJK Auxiliary, mapped to 200-2FF
    WS,WS,PS,HC,HC,IC,IC,HC, // 3000-3007
    WS,WS,WS,WS,WS,WS,WS,WS, // 3008-300F
    WS,WS,WS,WS,WS,WS,WS,WS, // 3010-3017
    WS,WS,WS,WS,WS,WS,WS,WS, // 3018-301F
    HC,HC,HC,HC,HC,HC,HC,HC, // 3020-3027
    HC,HC,HC,HC,HC,HC,HC,HC, // 3028-302F
    WS,HC,IC,HC,IC,HC,HC,HC, // 3030-3037
    PS,PS,PS,PS,PS,PS,PS,WS, // 3038-303F
    WS,HC,HC,HC,HC,HC,HC,HC, HC,HC,HC,HC,HC,HC,HC,HC, // 3040-304F
    HC,HC,HC,HC,HC,HC,HC,HC, HC,HC,HC,HC,HC,HC,HC,HC, // 3050-305F
    HC,HC,HC,HC,HC,HC,HC,HC, HC,HC,HC,HC,HC,HC,HC,HC, // 3060-306F
    HC,HC,HC,HC,HC,HC,HC,HC, HC,HC,HC,HC,HC,HC,HC,HC, // 3070-307F
    HC,HC,HC,HC,HC,HC,HC,HC, HC,HC,HC,HC,HC,HC,HC,HC, // 3080-308F
    HC,HC,HC,HC,HC,PS,PS,PS, // 3090-3097
    PS,HC,HC,WS,WS,HC,HC,PS, // 3098-309F
    WS,KC,KC,KC,KC,KC,KC,KC, KC,KC,KC,KC,KC,KC,KC,KC, // 30A0-30AF
    KC,KC,KC,KC,KC,KC,KC,KC, KC,KC,KC,KC,KC,KC,KC,KC, // 30B0-30BF
    KC,KC,KC,KC,KC,KC,KC,KC, KC,KC,KC,KC,KC,KC,KC,KC, // 30C0-30CF
    KC,KC,KC,KC,KC,KC,KC,KC, KC,KC,KC,KC,KC,KC,KC,KC, // 30D0-30DF
    KC,KC,KC,KC,KC,KC,KC,KC, KC,KC,KC,KC,KC,KC,KC,KC, // 30E0-30EF
    KC,KC,KC,KC,KC,KC,IC,PS, // 30F0-30F7
    PS,PS,PS,WS,KC,KC,KC,PS, // 30F8-30FF
};
//
// s_abCTypeList
//
// Maps a (right-shifted) CT_CTYPE1 result to a character type.
//
// Type C1 bits are (the last column is the bit value, in hex, after the
// right shift by two described below):
//
//      C1_UPPER  0x0001 // upper case
//      C1_LOWER  0x0002 // lower case
//      C1_DIGIT  0x0004 // decimal digits          1
//      C1_SPACE  0x0008 // spacing characters      2
//      C1_PUNCT  0x0010 // punctuation characters  4
//      C1_CNTRL  0x0020 // control characters      8
//      C1_BLANK  0x0040 // blank characters        10
//      C1_XDIGIT 0x0080 // other digits            20
//      C1_ALPHA  0x0100 // any linguistic character 40
//
// But since I don't care about C1_UPPER and C1_LOWER I can right-shift
// the output of GetStringTypeEx by two and keep a 128 byte lookup table.
//
// The precedence rules are: (Alpha, XDigit, Digit)  --> CH
//                           (Punct)                 --> PS
//                           (Space, Blank, Control) --> WS
//
// As stored: any index with the XDigit or Alpha bit (0x20 and up) is CH;
// below that, odd indices (Digit) are CH, Punct alone (with or without
// Control/Blank) is PS, and everything else -- including Space together
// with Punct -- is WS.
//
// NOTE(review): valid indices are 0x00-0x7F only. A CT_CTYPE1 value with
// bits above C1_ALPHA set (winnls.h documents C1_DEFINED as 0x0200) shifts
// to 0x80 or higher, so callers must mask the index to seven bits.
//
const BYTE
s_abCTypeList[128] =
{
    WS, CH, WS, CH, PS, CH, WS, CH, // 00 - 07
    WS, CH, WS, CH, PS, CH, WS, CH, // 08 - 0F
    WS, CH, WS, CH, PS, CH, WS, CH, // 10 - 17
    WS, CH, WS, CH, PS, CH, WS, CH, // 18 - 1F
    CH, CH, CH, CH, CH, CH, CH, CH, // 20 - 27
    CH, CH, CH, CH, CH, CH, CH, CH, // 28 - 2F
    CH, CH, CH, CH, CH, CH, CH, CH, // 30 - 37
    CH, CH, CH, CH, CH, CH, CH, CH, // 38 - 3F
    CH, CH, CH, CH, CH, CH, CH, CH, // 40 - 47
    CH, CH, CH, CH, CH, CH, CH, CH, // 48 - 4F
    CH, CH, CH, CH, CH, CH, CH, CH, // 50 - 57
    CH, CH, CH, CH, CH, CH, CH, CH, // 58 - 5F
    CH, CH, CH, CH, CH, CH, CH, CH, // 60 - 67
    CH, CH, CH, CH, CH, CH, CH, CH, // 68 - 6F
    CH, CH, CH, CH, CH, CH, CH, CH, // 70 - 77
    CH, CH, CH, CH, CH, CH, CH, CH, // 78 - 7F
};
  176. //+---------------------------------------------------------------------------
  177. // Member: WBREAKJ::TypeOf
  178. //
  179. // Synopsis: Returns the type of a character
  180. //
  181. // Arguments: [c] -- Unicode Character
  182. //
  183. // Returns: type, one of CH, WS, PS, EOF
  184. //
  185. // History: 08-Apr-94 PatHal Adapted for Japanese WB
  186. //
  187. // Notes: This returns the type of a character, using the static
  188. // array s_abCharTypeList. It adds 1 so that EOF (-1) can be in
  189. // the array, and accessed normally.
  190. //
  191. // This is not done by overloading the [] opeator, because in
  192. // future versions it will not necessarly be a table lookup.
  193. //
  194. // See above (typeof comments) for an explanation of the mapping
  195. //
  196. //----------------------------------------------------------------------------
  197. BYTE
  198. GetCharType(WCHAR wc )
  199. {
  200. WCHAR wc2;
  201. // Map interesting stuff (0000, 2000, 3000, FF00) to the table range,
  202. // 0x0000 - 0x0300.
  203. //
  204. wc2 = (wc & 0x00FF);
  205. switch (wc & 0xFF00) {
  206. case 0xFF00: // Half-Width Variants
  207. if (wc2 & 0x80) {
  208. return(VC); // including Hangul
  209. }
  210. wc2 |= 0x0080;
  211. break;
  212. case 0xFE00: // Small Variants
  213. if ((wc2 <= 0x006B) && (wc2 != 0x0069)) {
  214. return(WS);
  215. }
  216. // Treat Small $ and arabic symbols as CH
  217. return(CH);
  218. // break;
  219. case 0x3000: // CJK Auxiliary
  220. wc2 |= 0x0200;
  221. break;
  222. case 0x2000: // General Punctuation
  223. wc2 |= 0x0100;
  224. break;
  225. case 0x0000: // Code page 0
  226. // Use System NLS map for code page 0
  227. if (wc2 & 0x80)
  228. {
  229. WORD wCharType = 0;
  230. GetStringTypeEx( MAKELANGID( LANG_CHINESE, SUBLANG_CHINESE_SIMPLIFIED ),
  231. CT_CTYPE1,
  232. &wc2,
  233. 1,
  234. &wCharType );
  235. return s_abCTypeList[wCharType >> 2];
  236. }
  237. break;
  238. default:
  239. //
  240. // Treat the whole CJK Range as Kanji
  241. //
  242. if ((wc >= 0x4E00) && (wc <= 0x9FFF)) {
  243. return(IC);
  244. }
  245. //
  246. // Treat All Gaiji as Kanji Char, too
  247. //
  248. if ((wc >= 0xE000) && (wc < 0xE758)) {
  249. return(IC);
  250. }
  251. //
  252. // Treat all CJK symbols as word separators
  253. // NOTE: This means that the stemmer must be smart about searching
  254. // for zipcodes when given one with a preceding zipcode char.
  255. //
  256. if ((wc >= 0x3200) && (wc <= 0x33DD)) {
  257. return(WS);
  258. }
  259. // If it's not interesting return CH as default;
  260. return(CH);
  261. // break;
  262. }
  263. return( (s_abCharTypeList+1)[wc2] );
  264. }