Source code of Windows XP (NT5)
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

295 lines
12 KiB

  1. //+--------------------------------------------------------------------------
  2. //
  3. // Microsoft Windows
  4. // Copyright (C) Microsoft Corporation, 1997 - 1999.
  5. //
  6. // File: ctplus.c
  7. //
  8. // Contents: Contains character type (orthography) data and routine
  9. // to get at it.
  10. //
  11. // History: 23-May-96 pathal Created.
  12. // 28-Aug-97 weibz Add Hanguel char support
  13. //
  14. //---------------------------------------------------------------------------
  15. //#include <windows.h>
  16. //#include "ctplus0.h"
  17. #include "pch.cxx"
  18. //----------------------------------------------------------------------------
  19. // s_abCharTypeList
  20. //
  21. // This array starts at -1, so that EOF can be found in the array. It
  22. // depends on (EOF == -1) being true. Also, all references to it must be
  23. // of the form (s_abCharTypeList+1)[x]
  24. //
  25. // 000
  26. // EOF
  27. //
  28. // 001-080
  29. // The lower 7F entries from the ASCII Code Page (0000-00ff) are mapped in place
  30. // (ex. UNICODE 0009 (HT) == 009)
  31. // The word characters are: $,0-9,A-Z,_,a-z
  32. // The word separators are: bs,tab,lf,vtab,cr,spc,
  33. // ",#,%,&,',(,),*,+,comma,-,/,
  34. // :,;,<,=,>,@,[,],`
  35. // The phrase separators are: !,.,?,\,^,{,|,},~
  36. //
  37. // NOTE: Symbols are treated as WS or PS.
  38. //
  39. // 081-0FF
  40. // The lower 7E entries from the Half Width Variant Code Page (FF00-FF7F) are
  41. // mapped to 081-0FF.
  42. //
  43. // 100-1FF
  44. // The lower FF entries from the General Punctuation Code Page (2000-2044) are
  45. // mapped to 100-1ff.
  46. //
  47. // 200-2FF
  48. // The lower FF entries from the CJK Auxiliary Code Page (3000-30FF) are mapped
  49. // to 200-2ff.
  50. //
  51. // pathal - 5/20/96
  52. // Special default character processing for selection
  53. // The following is a list of white space characters that T-Hammer will not right select on:
  54. // 0x0009 (tab), 0x0020 (ansi space), 0x2005 (narrow space), 0x3000 (wide space)
  55. // (Note: see AnalyzeHPBs for special end SPB processing of adjacent white space)
  56. // The following is a list of nls characters to be treated as text by T-Hammer:
  57. // (in other words T-Hammer will neither right nor left-select on them):
  58. // 0x001F (non-required hyphen), 0x0027 (single quote), 0x2019 (right quote),
  59. // 0x200C (non-width optional break), 0x200D (non-width no break)
  60. //----------------------------------------------------------------------------
  61. const BYTE // Character-type lookup table; always indexed as (s_abCharTypeList+1)[x]
  62. s_abCharTypeList[0x301] = // 1 EOF slot + 0x300 mapped entries
  63. {
  64. (BYTE) -1, // EOF (-1): reached only through index -1 of (s_abCharTypeList+1)
  65. PS,PS,PS,PS,PS,PS,PS,PS, WS,WS,WS,WS,PS,WS,PS,PS, // 000 - 015 (decimal); bs,tab,lf,vtab,cr are WS
  66. PS,PS,PS,PS,PS,PS,PS,PS, PS,PS,PS,PS,PS,PS,CH,PS, // 016 - 031 (decimal); NOTE(review): the lone CH is at 30 (0x1E) while the file header names 0x001F as the non-required hyphen -- confirm
  67. WS,PS,WS,WS,CH,CH,WS,PS, // sp ! " # $ % & ' -- NOTE(review): % is CH and ' is PS here, but the file header lists both as word separators -- confirm
  68. WS,WS,WS,WS,WS,WS,PS,WS, // ( ) * + , - . /
  69. CH,CH,CH,CH,CH,CH,CH,CH, // 0 1 2 3 4 5 6 7
  70. CH,CH,WS,WS,WS,WS,WS,PS, // 8 9 : ; < = > ?
  71. WS,CH,CH,CH,CH,CH,CH,CH, // @ A B C D E F G
  72. CH,CH,CH,CH,CH,CH,CH,CH, // H I J K L M N O
  73. CH,CH,CH,CH,CH,CH,CH,CH, // P Q R S T U V W
  74. CH,CH,CH,WS,PS,WS,PS,CH, // X Y Z [ \ ] ^ _
  75. WS,CH,CH,CH,CH,CH,CH,CH, // ` a b c d e f g
  76. CH,CH,CH,CH,CH,CH,CH,CH, // h i j k l m n o
  77. CH,CH,CH,CH,CH,CH,CH,CH, // p q r s t u v w
  78. CH,CH,CH,PS,PS,PS,CH,PS, // x y z { | } ~ del -- NOTE(review): ~ is CH here, but the file header lists it as a phrase separator -- confirm
  79. WS,PS,WS,WS,CH,CH,WS,WS, // FF00-FF07 (sp ! " # $ % & ') -- half-width variants start at table index 080
  80. WS,WS,WS,WS,WS,WS,PS,WS, // ( ) * + , - . /
  81. VC,VC,VC,VC,VC,VC,VC,VC, // 0 1 2 3 4 5 6 7
  82. VC,VC,WS,WS,WS,WS,WS,PS, // 8 9 : ; < = > ?
  83. WS,VC,VC,VC,VC,VC,VC,VC, // @ A B C D E F G
  84. VC,VC,VC,VC,VC,VC,VC,VC, // H I J K L M N O
  85. VC,VC,VC,VC,VC,VC,VC,VC, // P Q R S T U V W
  86. VC,VC,VC,WS,VC,WS,PS,VC, // X Y Z [ \ ] ^ _
  87. WS,VC,VC,VC,VC,VC,VC,VC, // ` a b c d e f g
  88. VC,VC,VC,VC,VC,VC,VC,VC, // h i j k l m n o
  89. VC,VC,VC,VC,VC,VC,VC,VC, // p q r s t u v w
  90. VC,VC,VC,PS,PS,PS,VC,PS, // x y z { | } ~ del
  91. VC,PS,WS,WS,WS,WS,VC,VC, // FF60-FF67
  92. VC,VC,VC,VC,VC,VC,VC,VC, // FF68-FF6F
  93. VC,VC,VC,VC,VC,VC,VC,VC, // FF70-FF77
  94. VC,VC,VC,VC,VC,VC,VC,VC, // FF78-FF7F
  95. WS,WS,WS,WS,WS,WS,WS,WS, // 2000-2007 -- general punctuation starts at table index 100
  96. WS,WS,WS,WS,CH,CH,WS,WS, // 2008-200F
  97. WS,CH,WS,WS,WS,KC,PS,WS, // 2010-2017
  98. WS,CH,WS,WS,WS,WS,WS,WS, // 2018-201F
  99. WS,WS,PS,PS,PS,PS,PS,CH, // 2020-2027
  100. PS,PS,CH,CH,CH,CH,CH,PS, // 2028-202F
  101. WS,WS,WS,WS,WS,WS,WS,WS, // 2030-2037
  102. WS,WS,WS,WS,WS,WS,WS,WS, // 2038-203F
  103. WS,WS,WS,PS,WS,PS,PS,PS, PS,PS,PS,PS,PS,PS,PS,PS, // 2040-204F
  104. PS,PS,PS,PS,PS,PS,PS,PS, PS,PS,PS,PS,PS,PS,PS,PS, // 2050-205F
  105. PS,PS,PS,PS,PS,PS,PS,PS, PS,PS,PS,PS,PS,PS,PS,PS, // 2060-206F
  106. PS,PS,PS,PS,PS,PS,PS,PS, PS,PS,PS,PS,PS,PS,PS,PS, // 2070-207F
  107. PS,PS,PS,PS,PS,PS,PS,PS, PS,PS,PS,PS,PS,PS,PS,PS, // 2080-208F
  108. PS,PS,PS,PS,PS,PS,PS,PS, PS,PS,PS,PS,PS,PS,PS,PS, // 2090-209F
  109. PS,PS,PS,PS,PS,PS,PS,PS, PS,PS,PS,PS,PS,PS,PS,PS, // 20A0-20AF
  110. PS,PS,PS,PS,PS,PS,PS,PS, PS,PS,PS,PS,PS,PS,PS,PS, // 20B0-20BF
  111. PS,PS,PS,PS,PS,PS,PS,PS, PS,PS,PS,PS,PS,PS,PS,PS, // 20C0-20CF
  112. PS,PS,PS,PS,PS,PS,PS,PS, PS,PS,PS,PS,PS,PS,PS,PS, // 20D0-20DF
  113. PS,PS,PS,PS,PS,PS,PS,PS, PS,PS,PS,PS,PS,PS,PS,PS, // 20E0-20EF
  114. PS,PS,PS,PS,PS,PS,PS,PS, PS,PS,PS,PS,PS,PS,PS,PS, // 20F0-20FF
  115. WS,WS,PS,HC,HC,IC,IC,HC, // 3000-3007 -- CJK auxiliary starts at table index 200
  116. WS,WS,WS,WS,WS,WS,WS,WS, // 3008-300F
  117. WS,WS,WS,WS,WS,WS,WS,WS, // 3010-3017
  118. WS,WS,WS,WS,WS,WS,WS,WS, // 3018-301F
  119. HC,HC,HC,HC,HC,HC,HC,HC, // 3020-3027
  120. HC,HC,HC,HC,HC,HC,HC,HC, // 3028-302F
  121. WS,HC,IC,HC,IC,HC,HC,HC, // 3030-3037
  122. PS,PS,PS,PS,PS,PS,PS,WS, // 3038-303F
  123. WS,HC,HC,HC,HC,HC,HC,HC, HC,HC,HC,HC,HC,HC,HC,HC, // 3040-304F
  124. HC,HC,HC,HC,HC,HC,HC,HC, HC,HC,HC,HC,HC,HC,HC,HC, // 3050-305F
  125. HC,HC,HC,HC,HC,HC,HC,HC, HC,HC,HC,HC,HC,HC,HC,HC, // 3060-306F
  126. HC,HC,HC,HC,HC,HC,HC,HC, HC,HC,HC,HC,HC,HC,HC,HC, // 3070-307F
  127. HC,HC,HC,HC,HC,HC,HC,HC, HC,HC,HC,HC,HC,HC,HC,HC, // 3080-308F
  128. HC,HC,HC,HC,HC,PS,PS,PS, // 3090-3097
  129. PS,HC,HC,WS,WS,HC,HC,PS, // 3098-309F
  130. WS,KC,KC,KC,KC,KC,KC,KC, KC,KC,KC,KC,KC,KC,KC,KC, // 30A0-30AF
  131. KC,KC,KC,KC,KC,KC,KC,KC, KC,KC,KC,KC,KC,KC,KC,KC, // 30B0-30BF
  132. KC,KC,KC,KC,KC,KC,KC,KC, KC,KC,KC,KC,KC,KC,KC,KC, // 30C0-30CF
  133. KC,KC,KC,KC,KC,KC,KC,KC, KC,KC,KC,KC,KC,KC,KC,KC, // 30D0-30DF
  134. KC,KC,KC,KC,KC,KC,KC,KC, KC,KC,KC,KC,KC,KC,KC,KC, // 30E0-30EF
  135. KC,KC,KC,KC,KC,KC,IC,PS, // 30F0-30F7
  136. PS,PS,PS,WS,KC,KC,KC,PS, // 30F8-30FF
  137. };
  138. //
  139. // Type C1 bits are:
  140. //
  141. // C1_UPPER 0x0001 // upper case
  142. // C1_LOWER 0x0002 // lower case
  143. // C1_DIGIT 0x0004 // decimal digits 1
  144. // C1_SPACE 0x0008 // spacing characters 2
  145. // C1_PUNCT 0x0010 // punctuation characters 4
  146. // C1_CNTRL 0x0020 // control characters 8
  147. // C1_BLANK 0x0040 // blank characters 10
  148. // C1_XDIGIT 0x0080 // other digits 20
  149. // C1_ALPHA 0x0100 // any linguistic character 40
  150. //
  151. // But since I don't care about C1_UPPER and C1_LOWER, I can right-shift the
  152. // output of GetStringTypeEx by 2 (rightmost column above) and keep a 128-byte lookup table.
  153. //
  154. // The precedence rules are: (Alpha, XDigit, Digit) --> CH
  155. // (Punct) --> PS
  156. // (Space, Blank, Control) --> WS
  157. //
  158. const BYTE // Maps a CT_CTYPE1 type word, shifted right by 2, to CH / PS / WS
  159. s_abCTypeList[128] = // index bits: 01 DIGIT, 02 SPACE, 04 PUNCT, 08 CNTRL, 10 BLANK, 20 XDIGIT, 40 ALPHA
  160. {
  161. WS, CH, WS, CH, PS, CH, WS, CH, // 00 - 07  (odd index = DIGIT set -> CH)
  162. WS, CH, WS, CH, PS, CH, WS, CH, // 08 - 0F  NOTE(review): x6/xE (PUNCT+SPACE) are WS, so SPACE wins over PUNCT here -- confirm against the precedence comment
  163. WS, CH, WS, CH, PS, CH, WS, CH, // 10 - 17
  164. WS, CH, WS, CH, PS, CH, WS, CH, // 18 - 1F
  165. CH, CH, CH, CH, CH, CH, CH, CH, // 20 - 27  (XDIGIT set -> CH from here on)
  166. CH, CH, CH, CH, CH, CH, CH, CH, // 28 - 2F
  167. CH, CH, CH, CH, CH, CH, CH, CH, // 30 - 37
  168. CH, CH, CH, CH, CH, CH, CH, CH, // 38 - 3F
  169. CH, CH, CH, CH, CH, CH, CH, CH, // 40 - 47  (ALPHA set -> CH from here on)
  170. CH, CH, CH, CH, CH, CH, CH, CH, // 48 - 4F
  171. CH, CH, CH, CH, CH, CH, CH, CH, // 50 - 57
  172. CH, CH, CH, CH, CH, CH, CH, CH, // 58 - 5F
  173. CH, CH, CH, CH, CH, CH, CH, CH, // 60 - 67
  174. CH, CH, CH, CH, CH, CH, CH, CH, // 68 - 6F
  175. CH, CH, CH, CH, CH, CH, CH, CH, // 70 - 77
  176. CH, CH, CH, CH, CH, CH, CH, CH, // 78 - 7F
  177. };
  178. //+---------------------------------------------------------------------------
  179. //
  180. // Synopsis: Returns the type of a character
  181. //
  182. // Arguments: [c] -- Unicode Character
  183. //
  184. // Returns: type, one of CH, WS, PS, EOF
  185. //
  186. // History: 10-Sep-97 Weibz
  187. //
  188. // Notes: This returns the type of a character, using the static
  189. // array s_abCharTypeList. It adds 1 so that EOF (-1) can be in
  190. // the array, and accessed normally.
  191. //
  192. // This is not done by overloading the [] opeator, because in
  193. // future versions it will not necessarly be a table lookup.
  194. //
  195. // See above (typeof comments) for an explanation of the mapping
  196. //
  197. //----------------------------------------------------------------------------
  198. BYTE
  199. GetCharType(WCHAR wc )
  200. {
  201. WCHAR wc2;
  202. // Map interesting stuff (0000, 2000, 3000, FF00) to the table range,
  203. // 0x0000 - 0x0300.
  204. //
  205. wc2 = (wc & 0x00FF);
  206. switch (wc & 0xFF00) {
  207. case 0xFF00: // Half-Width Variants
  208. if (wc2 & 0x80) {
  209. return(VC); // including Hangul
  210. }
  211. wc2 |= 0x0080;
  212. break;
  213. case 0xFE00: // Small Variants
  214. if ((wc2 <= 0x006B) && (wc2 != 0x0069)) {
  215. return(WS);
  216. }
  217. // Treat Small $ and arabic symbols as CH
  218. return(CH);
  219. // break;
  220. case 0x3000: // CJK Auxiliary
  221. wc2 |= 0x0200;
  222. break;
  223. case 0x2000: // General Punctuation
  224. wc2 |= 0x0100;
  225. break;
  226. case 0x0000: // Code page 0
  227. // Use System NLS map for code page 0
  228. if (wc2 & 0x80)
  229. {
  230. WORD wCharType = 0;
  231. GetStringTypeExW( MAKELANGID( LANG_KOREAN, SUBLANG_KOREAN ),
  232. CT_CTYPE1,
  233. &wc2,
  234. 1,
  235. &wCharType );
  236. return s_abCTypeList[wCharType >> 2];
  237. }
  238. break;
  239. default:
  240. //
  241. // Treat the whole CJK Range as Kanji
  242. //
  243. if ((wc >= 0x4E00) && (wc <= 0x9FFF)) {
  244. return(IC);
  245. }
  246. //
  247. // Treat All Gaiji as Kanji Char, too
  248. //
  249. if ((wc >= 0xE000) && (wc < 0xE758)) {
  250. return(IC);
  251. }
  252. //
  253. // Treat all CJK symbols as word separators
  254. // NOTE: This means that the stemmer must be smart about searching
  255. // for zipcodes when given one with a preceding zipcode char.
  256. //
  257. if ((wc >= 0x3200) && (wc <= 0x33DD)) {
  258. return(WS);
  259. }
  260. // Treat Hanguel Region as hanguel char.
  261. if ((wc >= 0xac00) && (wc <= 0xd7a3)) {
  262. return(HG);
  263. }
  264. // If it's not interesting return CH as default;
  265. return(CH);
  266. // break;
  267. }
  268. return( (s_abCharTypeList+1)[wc2] );
  269. }