Source code of Windows XP (NT5)
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

973 lines
29 KiB

  1. /*
  2. * @doc INTERNAL
  3. *
  4. * @module clasifyc.cpp -- Kinsoku classify characters |
  5. *
  6. * Used in word breaking procs, particularly important
  7. * for properly wrapping a line.
  8. *
  9. * Authors: <nl>
  10. * Jon Matousek
  11. *
  12. * Copyright (c) 1995-2000 Microsoft Corporation. All rights reserved.
  13. */
  14. //FUTURE (keithcu) Some of this data we could get from GetStringTypeEx which
  15. //would make us smaller.
  16. #include "_common.h"
  17. #include "_clasfyc.h"
  18. #include "_array.h"
  19. ASSERTDATA
  20. // Data for Kinsoku character classifications. Each setN table below is a
  21. // 0-terminated list of the UNICODE characters that belong to Kinsoku class N.
  22. // Class 0: "dumb" quotes and other characters with no left/right orientation.
  23. // This is a workaround for the Kinsoku rules: these are treated
  24. // like an opening paren when leading and rather like a closing
  25. // paren when following -- but will only break on white space in the former case.
  26. #define brkclsQuote 0
  27. #define C3_FullWidth (C3_KATAKANA | C3_HIRAGANA | C3_IDEOGRAPH | C3_FULLWIDTH) // CT_CTYPE3 bits that GetKinsokuClass() maps to class 11
  28. const WCHAR set0[] = {
  29. 0x0022, // QUOTATION MARK
  30. 0x0027, // APOSTROPHE
  31. 0x2019, // RIGHT SINGLE QUOTATION MARK (listed here rather than in set 2)
  32. 0x301F, // LOW DOUBLE PRIME QUOTATION MARK
  33. 0xFF02, // FULLWIDTH QUOTATION MARK
  34. 0xFF07, // FULLWIDTH APOSTROPHE
  35. 0 // end-of-list terminator
  36. };
  37. // Class 1: opening-parenthesis characters (0-terminated list)
  38. #define brkclsOpen 1
  39. const WCHAR set1[] = {
  40. 0x0028, // LEFT PARENTHESIS
  41. 0x003C, // LESS-THAN SIGN (treated as a left angle bracket)
  42. 0x005B, // LEFT SQUARE BRACKET
  43. 0x007B, // LEFT CURLY BRACKET
  44. 0x00AB, // LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
  45. 0x2018, // LEFT SINGLE QUOTATION MARK
  46. 0x201C, // LEFT DOUBLE QUOTATION MARK
  47. 0x2039, // SINGLE LEFT-POINTING ANGLE QUOTATION MARK
  48. 0x2045, // LEFT SQUARE BRACKET WITH QUILL
  49. 0x207D, // SUPERSCRIPT LEFT PARENTHESIS
  50. 0x208D, // SUBSCRIPT LEFT PARENTHESIS
  51. 0x3008, // LEFT ANGLE BRACKET
  52. 0x300A, // LEFT DOUBLE ANGLE BRACKET
  53. 0x300C, // LEFT CORNER BRACKET
  54. 0x300E, // LEFT WHITE CORNER BRACKET
  55. 0x3010, // LEFT BLACK LENTICULAR BRACKET
  56. 0x3014, // LEFT TORTOISE SHELL BRACKET
  57. 0x3016, // LEFT WHITE LENTICULAR BRACKET
  58. 0x3018, // LEFT WHITE TORTOISE SHELL BRACKET
  59. 0x301A, // LEFT WHITE SQUARE BRACKET
  60. 0x301D, // REVERSED DOUBLE PRIME QUOTATION MARK
  61. 0xFD3E, // ORNATE LEFT PARENTHESIS
  62. 0xFE59, // SMALL LEFT PARENTHESIS
  63. 0xFE5B, // SMALL LEFT CURLY BRACKET
  64. 0xFE5D, // SMALL LEFT TORTOISE SHELL BRACKET
  65. 0xFF08, // FULLWIDTH LEFT PARENTHESIS
  66. 0xFF3B, // FULLWIDTH LEFT SQUARE BRACKET
  67. 0xFF5B, // FULLWIDTH LEFT CURLY BRACKET
  68. 0xFF62, // HALFWIDTH LEFT CORNER BRACKET
  69. 0xFFE9, // HALFWIDTH LEFTWARDS ARROW
  70. 0 // end-of-list terminator
  71. };
  72. // Class 2: closing-parenthesis characters; also holds dashes and commas (0-terminated list)
  73. #define brkclsClose 2
  74. //FUTURE (keithcu) A dash next to a dash should be a break opportunity.
  75. const WCHAR set2[] = {
  76. // 0x002C, // COMMA moved to set 6 to conjoin numerals.
  77. 0x002D, // HYPHEN-MINUS
  78. 0x2013, // EN-DASH
  79. 0x2014, // EM-DASH
  80. 0x00AD, // SOFT (OPTIONAL) HYPHEN
  81. 0x055D, // ARMENIAN COMMA
  82. 0x060C, // ARABIC COMMA
  83. 0x3001, // IDEOGRAPHIC COMMA
  84. 0xFE50, // SMALL COMMA
  85. 0xFE51, // SMALL IDEOGRAPHIC COMMA
  86. 0xFF0C, // FULLWIDTH COMMA
  87. 0xFF64, // HALFWIDTH IDEOGRAPHIC COMMA
  88. 0x0029, // RIGHT PARENTHESIS
  89. 0x003E, // GREATER-THAN SIGN (treated as a right angle bracket)
  90. 0x005D, // RIGHT SQUARE BRACKET
  91. 0x007D, // RIGHT CURLY BRACKET
  92. 0x00BB, // RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
  93. //0x2019, // RIGHT SINGLE QUOTATION MARK moved to set 0
  94. 0x201D, // RIGHT DOUBLE QUOTATION MARK
  95. 0x203A, // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
  96. 0x2046, // RIGHT SQUARE BRACKET WITH QUILL
  97. 0x207E, // SUPERSCRIPT RIGHT PARENTHESIS
  98. 0x208E, // SUBSCRIPT RIGHT PARENTHESIS
  99. 0x3009, // RIGHT ANGLE BRACKET
  100. 0x300B, // RIGHT DOUBLE ANGLE BRACKET
  101. 0x300D, // RIGHT CORNER BRACKET
  102. 0x300F, // RIGHT WHITE CORNER BRACKET
  103. 0x3011, // RIGHT BLACK LENTICULAR BRACKET
  104. 0x3015, // RIGHT TORTOISE SHELL BRACKET
  105. 0x3017, // RIGHT WHITE LENTICULAR BRACKET
  106. 0x3019, // RIGHT WHITE TORTOISE SHELL BRACKET
  107. 0x301B, // RIGHT WHITE SQUARE BRACKET
  108. 0x301E, // DOUBLE PRIME QUOTATION MARK
  109. 0xFD3F, // ORNATE RIGHT PARENTHESIS
  110. 0xFE5A, // SMALL RIGHT PARENTHESIS
  111. 0xFE5C, // SMALL RIGHT CURLY BRACKET
  112. 0xFE5E, // SMALL RIGHT TORTOISE SHELL BRACKET
  113. 0xFF09, // FULLWIDTH RIGHT PARENTHESIS
  114. 0xFF3D, // FULLWIDTH RIGHT SQUARE BRACKET
  115. 0xFF5D, // FULLWIDTH RIGHT CURLY BRACKET
  116. 0xFF63, // HALFWIDTH RIGHT CORNER BRACKET
  117. 0xFFEB, // HALFWIDTH RIGHTWARDS ARROW
  118. 0 // end-of-list terminator
  119. };
  120. // Class 3: 'non-breaking' em-characters at a line-starting point (0-terminated list)
  121. #define brkclsGlueA 3
  122. const WCHAR set3[] = {
  123. 0x3005, // IDEOGRAPHIC ITERATION MARK
  124. 0x309D, // HIRAGANA ITERATION MARK
  125. 0x309E, // HIRAGANA VOICED ITERATION MARK
  126. 0x30FC, // KATAKANA-HIRAGANA PROLONGED SOUND MARK
  127. 0x30FD, // KATAKANA ITERATION MARK
  128. 0x30FE, // KATAKANA VOICED ITERATION MARK
  129. 0x3041, // HIRAGANA LETTER SMALL A
  130. 0x3043, // HIRAGANA LETTER SMALL I
  131. 0x3045, // HIRAGANA LETTER SMALL U
  132. 0x3047, // HIRAGANA LETTER SMALL E
  133. 0x3049, // HIRAGANA LETTER SMALL O
  134. 0x3063, // HIRAGANA LETTER SMALL TU
  135. 0x3083, // HIRAGANA LETTER SMALL YA
  136. 0x3085, // HIRAGANA LETTER SMALL YU
  137. 0x3087, // HIRAGANA LETTER SMALL YO
  138. 0x308E, // HIRAGANA LETTER SMALL WA
  139. 0x309B, // KATAKANA-HIRAGANA VOICED SOUND MARK
  140. 0x309C, // KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
  141. 0x30A1, // KATAKANA LETTER SMALL A
  142. 0x30A3, // KATAKANA LETTER SMALL I
  143. 0x30A5, // KATAKANA LETTER SMALL U
  144. 0x30A7, // KATAKANA LETTER SMALL E
  145. 0x30A9, // KATAKANA LETTER SMALL O
  146. 0x30C3, // KATAKANA LETTER SMALL TU
  147. 0x30E3, // KATAKANA LETTER SMALL YA
  148. 0x30E5, // KATAKANA LETTER SMALL YU
  149. 0x30E7, // KATAKANA LETTER SMALL YO
  150. 0x30EE, // KATAKANA LETTER SMALL WA
  151. 0x30F5, // KATAKANA LETTER SMALL KA
  152. 0x30F6, // KATAKANA LETTER SMALL KE
  153. 0xFF67, // HALFWIDTH KATAKANA LETTER SMALL A
  154. 0xFF68, // HALFWIDTH KATAKANA LETTER SMALL I
  155. 0xFF69, // HALFWIDTH KATAKANA LETTER SMALL U
  156. 0xFF6A, // HALFWIDTH KATAKANA LETTER SMALL E
  157. 0xFF6B, // HALFWIDTH KATAKANA LETTER SMALL O
  158. 0xFF6C, // HALFWIDTH KATAKANA LETTER SMALL YA
  159. 0xFF6D, // HALFWIDTH KATAKANA LETTER SMALL YU
  160. 0xFF6E, // HALFWIDTH KATAKANA LETTER SMALL YO
  161. 0xFF6F, // HALFWIDTH KATAKANA LETTER SMALL TU
  162. 0xFF70, // HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
  163. 0xFF9E, // HALFWIDTH KATAKANA VOICED SOUND MARK
  164. 0xFF9F, // HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK
  165. 0 // end-of-list terminator
  166. };
  167. // Class 4: expression marks -- exclamation and question marks (0-terminated list)
  168. #define brkclsExclaInterr 4
  169. const WCHAR set4[] = {
  170. 0x0021, // EXCLAMATION MARK
  171. 0x003F, // QUESTION MARK
  172. 0x00A1, // INVERTED EXCLAMATION MARK
  173. 0x00BF, // INVERTED QUESTION MARK
  174. 0x01C3, // LATIN LETTER RETROFLEX CLICK
  175. 0x037E, // GREEK QUESTION MARK
  176. 0x055C, // ARMENIAN EXCLAMATION MARK
  177. 0x055E, // ARMENIAN QUESTION MARK
  178. 0x055F, // ARMENIAN ABBREVIATION MARK
  179. 0x061F, // ARABIC QUESTION MARK
  180. 0x203C, // DOUBLE EXCLAMATION MARK
  181. 0x203D, // INTERROBANG
  182. 0x2762, // HEAVY EXCLAMATION MARK ORNAMENT
  183. 0x2763, // HEAVY HEART EXCLAMATION MARK ORNAMENT
  184. 0xFE56, // SMALL QUESTION MARK
  185. 0xFE57, // SMALL EXCLAMATION MARK
  186. 0xFF01, // FULLWIDTH EXCLAMATION MARK
  187. 0xFF1F, // FULLWIDTH QUESTION MARK
  188. 0 // end-of-list terminator
  189. };
  190. // Class 5: centered punctuation marks (no brkcls name is defined for this class; 0-terminated list)
  191. const WCHAR set5[] = {
  192. // 0x003A, // COLON moved to set 6 to conjoin numerals.
  193. // 0x003B, // SEMICOLON moved to set 6 to conjoin numerals
  194. 0x00B7, // MIDDLE DOT
  195. 0x30FB, // KATAKANA MIDDLE DOT
  196. 0xFF65, // HALFWIDTH KATAKANA MIDDLE DOT
  197. 0x061B, // ARABIC SEMICOLON
  198. 0xFE54, // SMALL SEMICOLON
  199. 0xFE55, // SMALL COLON
  200. 0xFF1A, // FULLWIDTH COLON
  201. 0xFF1B, // FULLWIDTH SEMICOLON
  202. 0 // end-of-list terminator
  203. };
  204. // Class 6: punctuation marks. NOTE: this set has diverged from the Kinsoku tables to enhance
  205. #define brkclsSlash 6
  206. const WCHAR set6[] = { // how colon, comma, and full stop are treated around
  207. 0x002C, // COMMA // numerals and set 15 (roman text).
  208. 0x002f, // SLASH // But don't break up URLs (see IsURLDelimiter())!
  209. 0x003A, // COLON
  210. 0x003B, // SEMICOLON
  211. 0x002E, // FULL STOP (PERIOD)
  212. 0x0589, // ARMENIAN FULL STOP
  213. 0x06D4, // ARABIC FULL STOP
  214. 0x3002, // IDEOGRAPHIC FULL STOP
  215. 0xFE52, // SMALL FULL STOP
  216. 0xFF0E, // FULLWIDTH FULL STOP
  217. 0xFF61, // HALFWIDTH IDEOGRAPHIC FULL STOP
  218. 0 // end-of-list terminator
  219. };
  220. // Class 7: inseparable characters (table is currently empty)
  221. #define brkclsInseparable 7
  222. const WCHAR set7[] = {
  223. 0 // Terminator only. FUTURE (alexgo): maybe handle these.
  224. };
  225. // Class 8: pre-numeral abbreviations, i.e. currency signs (0-terminated list)
  226. #define brkclsPrefix 8
  227. const WCHAR set8[] = {
  228. 0x0024, // DOLLAR SIGN
  229. 0x00A3, // POUND SIGN
  230. 0x00A4, // CURRENCY SIGN
  231. 0x00A5, // YEN SIGN
  232. 0x005C, // REVERSE SOLIDUS (looks like Yen in FE fonts.)
  233. 0x0E3F, // THAI CURRENCY SYMBOL BAHT
  234. 0x20AC, // EURO SIGN
  235. 0x20A1, // COLON SIGN
  236. 0x20A2, // CRUZEIRO SIGN
  237. 0x20A3, // FRENCH FRANC SIGN
  238. 0x20A4, // LIRA SIGN
  239. 0x20A5, // MILL SIGN
  240. 0x20A6, // NAIRA SIGN
  241. 0x20A7, // PESETA SIGN
  242. 0x20A8, // RUPEE SIGN
  243. 0x20A9, // WON SIGN
  244. 0x20AA, // NEW SHEQEL SIGN
  245. 0xFF04, // FULLWIDTH DOLLAR SIGN
  246. 0xFFE5, // FULLWIDTH YEN SIGN
  247. 0xFFE6, // FULLWIDTH WON SIGN
  248. 0xFFE1, // FULLWIDTH POUND SIGN
  249. 0 // end-of-list terminator
  250. };
  251. // Class 9: post-numeral abbreviations, e.g. percent, degree and prime signs (0-terminated list)
  252. #define brkclsPostfix 9
  253. const WCHAR set9[] = {
  254. 0x00A2, // CENT SIGN
  255. 0x00B0, // DEGREE SIGN
  256. 0x2103, // DEGREE CELSIUS
  257. 0x2109, // DEGREE FAHRENHEIT
  258. 0x212A, // KELVIN SIGN
  259. 0x0025, // PERCENT SIGN
  260. 0x066A, // ARABIC PERCENT SIGN
  261. 0xFE6A, // SMALL PERCENT SIGN
  262. 0xFF05, // FULLWIDTH PERCENT SIGN
  263. 0x2030, // PER MILLE SIGN
  264. 0x2031, // PER TEN THOUSAND SIGN
  265. 0x2032, // PRIME
  266. 0x2033, // DOUBLE PRIME
  267. 0x2034, // TRIPLE PRIME
  268. 0x2035, // REVERSED PRIME
  269. 0x2036, // REVERSED DOUBLE PRIME
  270. 0x2037, // REVERSED TRIPLE PRIME
  271. 0xFF05, // FULLWIDTH PERCENT SIGN (duplicate of the entry above; it maps to the same class either way)
  272. 0xFFE0, // FULLWIDTH CENT SIGN
  273. 0 // end-of-list terminator
  274. };
  275. // Class 10: Japanese space (blank) character
  276. #define brkclsNoStartIdeo 10
  277. const WCHAR set10[] = {
  278. 0x3000, // IDEOGRAPHIC SPACE
  279. 0 // end-of-list terminator
  280. };
  281. // Class 11: Japanese characters other than above
  282. #define brkclsIdeographic 11
  283. const WCHAR set11[] = {
  284. 0 // Empty table: GetKinsokuClass() assigns class 11 from IsKorean() / CT_CTYPE3 data (GetStringTypeEx)
  285. };
  286. // Class 12: characters included in a numeral sequence -- digits of several scripts (0-terminated list)
  287. #define brkclsNumeral 12
  288. const WCHAR set12[] = {
  289. 0x0030, // DIGIT ZERO
  290. 0x0031, // DIGIT ONE
  291. 0x0032, // DIGIT TWO
  292. 0x0033, // DIGIT THREE
  293. 0x0034, // DIGIT FOUR
  294. 0x0035, // DIGIT FIVE
  295. 0x0036, // DIGIT SIX
  296. 0x0037, // DIGIT SEVEN
  297. 0x0038, // DIGIT EIGHT
  298. 0x0039, // DIGIT NINE
  299. 0x0660, // ARABIC-INDIC DIGIT ZERO
  300. 0x0661, // ARABIC-INDIC DIGIT ONE
  301. 0x0662, // ARABIC-INDIC DIGIT TWO
  302. 0x0663, // ARABIC-INDIC DIGIT THREE
  303. 0x0664, // ARABIC-INDIC DIGIT FOUR
  304. 0x0665, // ARABIC-INDIC DIGIT FIVE
  305. 0x0666, // ARABIC-INDIC DIGIT SIX
  306. 0x0667, // ARABIC-INDIC DIGIT SEVEN
  307. 0x0668, // ARABIC-INDIC DIGIT EIGHT
  308. 0x0669, // ARABIC-INDIC DIGIT NINE
  309. 0x06F0, // EXTENDED ARABIC-INDIC DIGIT ZERO
  310. 0x06F1, // EXTENDED ARABIC-INDIC DIGIT ONE
  311. 0x06F2, // EXTENDED ARABIC-INDIC DIGIT TWO
  312. 0x06F3, // EXTENDED ARABIC-INDIC DIGIT THREE
  313. 0x06F4, // EXTENDED ARABIC-INDIC DIGIT FOUR
  314. 0x06F5, // EXTENDED ARABIC-INDIC DIGIT FIVE
  315. 0x06F6, // EXTENDED ARABIC-INDIC DIGIT SIX
  316. 0x06F7, // EXTENDED ARABIC-INDIC DIGIT SEVEN
  317. 0x06F8, // EXTENDED ARABIC-INDIC DIGIT EIGHT
  318. 0x06F9, // EXTENDED ARABIC-INDIC DIGIT NINE
  319. 0x0966, // DEVANAGARI DIGIT ZERO
  320. 0x0967, // DEVANAGARI DIGIT ONE
  321. 0x0968, // DEVANAGARI DIGIT TWO
  322. 0x0969, // DEVANAGARI DIGIT THREE
  323. 0x096A, // DEVANAGARI DIGIT FOUR
  324. 0x096B, // DEVANAGARI DIGIT FIVE
  325. 0x096C, // DEVANAGARI DIGIT SIX
  326. 0x096D, // DEVANAGARI DIGIT SEVEN
  327. 0x096E, // DEVANAGARI DIGIT EIGHT
  328. 0x096F, // DEVANAGARI DIGIT NINE
  329. 0x09E6, // BENGALI DIGIT ZERO
  330. 0x09E7, // BENGALI DIGIT ONE
  331. 0x09E8, // BENGALI DIGIT TWO
  332. 0x09E9, // BENGALI DIGIT THREE
  333. 0x09EA, // BENGALI DIGIT FOUR
  334. 0x09EB, // BENGALI DIGIT FIVE
  335. 0x09EC, // BENGALI DIGIT SIX
  336. 0x09ED, // BENGALI DIGIT SEVEN
  337. 0x09EE, // BENGALI DIGIT EIGHT
  338. 0x09EF, // BENGALI DIGIT NINE
  339. 0x0A66, // GURMUKHI DIGIT ZERO
  340. 0x0A67, // GURMUKHI DIGIT ONE
  341. 0x0A68, // GURMUKHI DIGIT TWO
  342. 0x0A69, // GURMUKHI DIGIT THREE
  343. 0x0A6A, // GURMUKHI DIGIT FOUR
  344. 0x0A6B, // GURMUKHI DIGIT FIVE
  345. 0x0A6C, // GURMUKHI DIGIT SIX
  346. 0x0A6D, // GURMUKHI DIGIT SEVEN
  347. 0x0A6E, // GURMUKHI DIGIT EIGHT
  348. 0x0A6F, // GURMUKHI DIGIT NINE
  349. 0x0AE6, // GUJARATI DIGIT ZERO
  350. 0x0AE7, // GUJARATI DIGIT ONE
  351. 0x0AE8, // GUJARATI DIGIT TWO
  352. 0x0AE9, // GUJARATI DIGIT THREE
  353. 0x0AEA, // GUJARATI DIGIT FOUR
  354. 0x0AEB, // GUJARATI DIGIT FIVE
  355. 0x0AEC, // GUJARATI DIGIT SIX
  356. 0x0AED, // GUJARATI DIGIT SEVEN
  357. 0x0AEE, // GUJARATI DIGIT EIGHT
  358. 0x0AEF, // GUJARATI DIGIT NINE
  359. 0x0B66, // ORIYA DIGIT ZERO
  360. 0x0B67, // ORIYA DIGIT ONE
  361. 0x0B68, // ORIYA DIGIT TWO
  362. 0x0B69, // ORIYA DIGIT THREE
  363. 0x0B6A, // ORIYA DIGIT FOUR
  364. 0x0B6B, // ORIYA DIGIT FIVE
  365. 0x0B6C, // ORIYA DIGIT SIX
  366. 0x0B6D, // ORIYA DIGIT SEVEN
  367. 0x0B6E, // ORIYA DIGIT EIGHT
  368. 0x0B6F, // ORIYA DIGIT NINE
  369. 0x0BE7, // TAMIL DIGIT ONE
  370. 0x0BE8, // TAMIL DIGIT TWO
  371. 0x0BE9, // TAMIL DIGIT THREE
  372. 0x0BEA, // TAMIL DIGIT FOUR
  373. 0x0BEB, // TAMIL DIGIT FIVE
  374. 0x0BEC, // TAMIL DIGIT SIX
  375. 0x0BED, // TAMIL DIGIT SEVEN
  376. 0x0BEE, // TAMIL DIGIT EIGHT
  377. 0x0BEF, // TAMIL DIGIT NINE
  378. 0x0BF0, // TAMIL NUMBER TEN
  379. 0x0BF1, // TAMIL NUMBER ONE HUNDRED
  380. 0x0BF2, // TAMIL NUMBER ONE THOUSAND
  381. 0x0C66, // TELUGU DIGIT ZERO
  382. 0x0C67, // TELUGU DIGIT ONE
  383. 0x0C68, // TELUGU DIGIT TWO
  384. 0x0C69, // TELUGU DIGIT THREE
  385. 0x0C6A, // TELUGU DIGIT FOUR
  386. 0x0C6B, // TELUGU DIGIT FIVE
  387. 0x0C6C, // TELUGU DIGIT SIX
  388. 0x0C6D, // TELUGU DIGIT SEVEN
  389. 0x0C6E, // TELUGU DIGIT EIGHT
  390. 0x0C6F, // TELUGU DIGIT NINE
  391. 0x0CE6, // KANNADA DIGIT ZERO
  392. 0x0CE7, // KANNADA DIGIT ONE
  393. 0x0CE8, // KANNADA DIGIT TWO
  394. 0x0CE9, // KANNADA DIGIT THREE
  395. 0x0CEA, // KANNADA DIGIT FOUR
  396. 0x0CEB, // KANNADA DIGIT FIVE
  397. 0x0CEC, // KANNADA DIGIT SIX
  398. 0x0CED, // KANNADA DIGIT SEVEN
  399. 0x0CEE, // KANNADA DIGIT EIGHT
  400. 0x0CEF, // KANNADA DIGIT NINE
  401. 0x0D66, // MALAYALAM DIGIT ZERO
  402. 0x0D67, // MALAYALAM DIGIT ONE
  403. 0x0D68, // MALAYALAM DIGIT TWO
  404. 0x0D69, // MALAYALAM DIGIT THREE
  405. 0x0D6A, // MALAYALAM DIGIT FOUR
  406. 0x0D6B, // MALAYALAM DIGIT FIVE
  407. 0x0D6C, // MALAYALAM DIGIT SIX
  408. 0x0D6D, // MALAYALAM DIGIT SEVEN
  409. 0x0D6E, // MALAYALAM DIGIT EIGHT
  410. 0x0D6F, // MALAYALAM DIGIT NINE
  411. 0x0E50, // THAI DIGIT ZERO
  412. 0x0E51, // THAI DIGIT ONE
  413. 0x0E52, // THAI DIGIT TWO
  414. 0x0E53, // THAI DIGIT THREE
  415. 0x0E54, // THAI DIGIT FOUR
  416. 0x0E55, // THAI DIGIT FIVE
  417. 0x0E56, // THAI DIGIT SIX
  418. 0x0E57, // THAI DIGIT SEVEN
  419. 0x0E58, // THAI DIGIT EIGHT
  420. 0x0E59, // THAI DIGIT NINE
  421. 0x0ED0, // LAO DIGIT ZERO
  422. 0x0ED1, // LAO DIGIT ONE
  423. 0x0ED2, // LAO DIGIT TWO
  424. 0x0ED3, // LAO DIGIT THREE
  425. 0x0ED4, // LAO DIGIT FOUR
  426. 0x0ED5, // LAO DIGIT FIVE
  427. 0x0ED6, // LAO DIGIT SIX
  428. 0x0ED7, // LAO DIGIT SEVEN
  429. 0x0ED8, // LAO DIGIT EIGHT
  430. 0x0ED9, // LAO DIGIT NINE
  431. 0xFF10, // FULLWIDTH DIGIT ZERO
  432. 0xFF11, // FULLWIDTH DIGIT ONE
  433. 0xFF12, // FULLWIDTH DIGIT TWO
  434. 0xFF13, // FULLWIDTH DIGIT THREE
  435. 0xFF14, // FULLWIDTH DIGIT FOUR
  436. 0xFF15, // FULLWIDTH DIGIT FIVE
  437. 0xFF16, // FULLWIDTH DIGIT SIX
  438. 0xFF17, // FULLWIDTH DIGIT SEVEN
  439. 0xFF18, // FULLWIDTH DIGIT EIGHT
  440. 0xFF19, // FULLWIDTH DIGIT NINE
  441. 0x3007, // IDEOGRAPHIC NUMBER ZERO
  442. 0x3021, // HANGZHOU NUMERAL ONE
  443. 0x3022, // HANGZHOU NUMERAL TWO
  444. 0x3023, // HANGZHOU NUMERAL THREE
  445. 0x3024, // HANGZHOU NUMERAL FOUR
  446. 0x3025, // HANGZHOU NUMERAL FIVE
  447. 0x3026, // HANGZHOU NUMERAL SIX
  448. 0x3027, // HANGZHOU NUMERAL SEVEN
  449. 0x3028, // HANGZHOU NUMERAL EIGHT
  450. 0x3029, // HANGZHOU NUMERAL NINE
  451. 0 // end-of-list terminator
  452. };
  453. // Class 13: characters included in the unit symbol group (no brkcls name is defined for this class)
  454. const WCHAR set13[] = {
  455. 0 // Empty table: GetKinsokuClass() returns 13 when the CT_CTYPE3 data (GetStringTypeEx) has C3_SYMBOL set
  456. };
  457. // Class 14: Roman inter-word space (0-terminated list)
  458. #define brkclsSpaceN 14
  459. const WCHAR set14[] = {
  460. 0x0009, // TAB
  461. 0x0020, // SPACE
  462. 0x2002, // EN SPACE
  463. 0x2003, // EM SPACE
  464. 0x2004, // THREE-PER-EM SPACE
  465. 0x2005, // FOUR-PER-EM SPACE
  466. 0x2006, // SIX-PER-EM SPACE
  467. 0x2007, // FIGURE SPACE
  468. 0x2008, // PUNCTUATION SPACE
  469. 0x2009, // THIN SPACE
  470. 0x200A, // HAIR SPACE
  471. 0x200B, // ZERO WIDTH SPACE
  472. WCH_EMBEDDING, // OBJECT EMBEDDING (0xFFFC)
  473. 0 // end-of-list terminator
  474. };
  475. // Class 15: Roman characters
  476. #define brkclsAlpha 15
  477. const WCHAR set15[] = {
  478. 0 // Empty table: 15 is the fallback class GetKinsokuClass() returns when nothing else matches
  479. };
  480. // So we can easily loop over all Kinsoku categories. The position of a table in this array is the class value stored for its characters, so the order must match the brkcls* numbering.
  481. const WCHAR *charCategories[] = {
  482. set0,
  483. set1,
  484. set2,
  485. set3,
  486. set4,
  487. set5,
  488. set6,
  489. set7,
  490. set8,
  491. set9,
  492. set10,
  493. set11,
  494. set12,
  495. set13,
  496. set14,
  497. set15
  498. };
  499. static const INT classifyChunkSize = 64; // Characters covered by one chunk
  500. static const INT indexSize = 65536 / classifyChunkSize; // Chunks needed to cover all 16-bit characters
  501. static const INT classifyBitMapSize = indexSize / 8; // Bytes in a one-bit-per-chunk bitmap
  502. static const INT bitmapShift = 6; // 16 - log(indexSize)/log(2); ch >> bitmapShift is the chunk number of ch
  503. typedef struct {
  504. CHAR classifications[classifyChunkSize]; // NOTE(review): InitKinsokuClassify fills these with -1 and GetKinsokuClass tests for >= 0, so they must read back as SIGNED bytes
  505. } ClassifyChunk;
  506. static ClassifyChunk *classifyData; // Chunk array, sparse chrs; entry 0 is the shared "no data" chunk
  507. static BYTE *classifyIndex; // Indexes into chunk array, one per chunk number
  508. /*
  509. * BOOL InitKinsokuClassify()
  510. *
  511. * @func
  512. * Map the static character tables into a compact array for
  513. * quick lookup of the characters Kinsoku classification.
  514. *
  515. * @comm
  516. * Kinsoku classification is necessary for word breaking and
  517. * may be neccessary for proportional line layout, Kinsoku style.
  518. *
  519. * @devnote
  520. * We break the entire Unicode range in to chunks of characters.
  521. * Not all of the chunks will have data in them. We do not
  522. * maintain information on empty chunks, therefore we create
  523. * a compact, contiguous array of chunks for only the chunks
  524. * that do contain information. We prepend 1 empty chunk to the
  525. * beginning of this array, where all of the empty chunks map to,
  526. * this prevents a contiontional test on NULL data. The lookup
  527. * will return 0 for any character not in the tables, so the client
  528. * will then need to process the character further in such cases.
  529. *
  530. * @rdesc
  531. * return TRUE if we successfully created the lookup table.
  532. */
  533. BOOL InitKinsokuClassify()
  534. {
  535. TRACEBEGIN(TRCSUBSYSFE, TRCSCOPEINTERN, "InitKinsokuClassify");
  536. WORD bitMapKey; // For calcing total chunks
  537. BYTE bitData; // For calcing total chunks
  538. WCHAR ch;
  539. LPCWSTR pWChar; // Looping over char sets.
  540. INT i, j, count; // Loop support.
  541. BYTE classifyBitMap[classifyBitMapSize], // Temp bitmap.
  542. *pIndex; // Index into chunk array.
  543. // See how many chunks we'll need. We loop over all of the special
  544. // characters
  545. AssertSz(cKinsokuCategories == ARRAY_SIZE(charCategories),
  546. "InitKinsokuClassify: incorrect Kinsoku-category count");
  547. ZeroMemory(classifyBitMap, sizeof(classifyBitMap));
  548. for (i = 0; i < cKinsokuCategories; i++ )
  549. {
  550. pWChar = charCategories[i];
  551. while ( ch = *pWChar++ )
  552. {
  553. bitMapKey = ch >> bitmapShift;
  554. classifyBitMap[bitMapKey >> 3] |= 1 << (bitMapKey & 7);
  555. }
  556. }
  557. // Now that we know how many chunks we'll need, allocate the memory.
  558. count = 1 + CountMatchingBits((DWORD *)classifyBitMap, (DWORD *)classifyBitMap, sizeof(classifyBitMap)/sizeof(DWORD));
  559. classifyData = (ClassifyChunk *) PvAlloc( sizeof(ClassifyChunk) * count, GMEM_ZEROINIT);
  560. classifyIndex = (BYTE *) PvAlloc( sizeof(BYTE) * indexSize, GMEM_ZEROINIT);
  561. // We failed if we did not get the memory.
  562. if ( !classifyData || !classifyIndex )
  563. return FALSE; // FAILED.
  564. // Set Default missing value.
  565. FillMemory( classifyData, -1, sizeof(ClassifyChunk) * count );
  566. // Init the pointers to the chunks, which are really just indexes into
  567. // a contiguous block of memory -- an one-based array of chunks.
  568. pIndex = classifyIndex;
  569. count = 1; // 1 based array.
  570. for (i = 0; i < sizeof(classifyBitMap); i++ ) // Loop over all bytes.
  571. { // Get the bitmap data.
  572. bitData = classifyBitMap[i]; // For each bit in the byte
  573. for (j = 0; j < 8; j++, bitData >>= 1, pIndex++)
  574. {
  575. if(bitData & 1)
  576. *pIndex = count++; // We used a chunk.
  577. }
  578. }
  579. // Store the classifications of each character.
  580. // Note: classifications are 1 based, a zero value
  581. // means the category was not set.
  582. for (i = 0; i < cKinsokuCategories; i++ )
  583. {
  584. pWChar = charCategories[i]; // Loop over all chars in
  585. while ( ch = *pWChar++ ) // category.
  586. {
  587. bitMapKey = ch >> bitmapShift;
  588. Assert( classifyIndex[bitMapKey] > 0 );
  589. Assert( classifyIndex[bitMapKey] < count );
  590. classifyData[classifyIndex[bitMapKey]].
  591. classifications[ ch & ( classifyChunkSize-1 )] = (char)i;
  592. }
  593. }
  594. return TRUE; // Successfully created.
  595. }
  596. void UninitKinsokuClassify()
  597. {
  598. TRACEBEGIN(TRCSUBSYSFE, TRCSCOPEINTERN, "UninitKinsokuClassify");
  599. FreePv(classifyData);
  600. FreePv(classifyIndex);
  601. }
  602. /*
  603. * KinsokuClassify(ch)
  604. *
  605. * @func
  606. * Kinsoku classify the character iff it was a given from
  607. * one of the classification tables.
  608. *
  609. * @comm
  610. * Hi order bits of ch are used to get an index value used to index
  611. * into an array of chunks. Each chunk contains the classifications
  612. * for that character as well as some number of characters adjacent
  613. * to that character. The low order bits are used to index into
  614. * the chunk of adjacent characters.
  615. *
  616. * @devnote
  617. * Because of the way we constructed the array, all that we need to
  618. * do is look up the data; no conditionals necessary.
  619. *
  620. * The routine is inline to avoid the call overhead. It is static
  621. * because it only returns characters from the tables; i.e., this
  622. * routine does NOT classify all Unicode characters.
  623. *
  624. * @rdesc
  625. * Returns the classification.
  626. */
  627. static __forceinline INT
  628. KinsokuClassify(
  629. WCHAR ch ) // @parm char to classify.
  630. {
  631. //TRACEBEGIN(TRCSUBSYSFE, TRCSCOPEINTERN, "KinsokuClassify");
  632. return classifyData[ classifyIndex[ ch >> bitmapShift ] ].
  633. classifications[ ch & ( classifyChunkSize-1 )];
  634. }
  // TRUE if two word-break results carry the same class bits (WBF_CLASS).
  #define IsSameNonFEClass(_c1, _c2)  (!(((_c1) ^ (_c2)) & WBF_CLASS))
  // CT_CTYPE3 width/kana bits compared when grouping kana characters.
  #define IdeoKanaTypes   (C3_HALFWIDTH | C3_FULLWIDTH | C3_KATAKANA | C3_HIRAGANA)
  // Kana bits plus the ideograph bit.
  #define IdeoTypes       (IdeoKanaTypes | C3_IDEOGRAPH)
  // TRUE if the CT_CTYPE3 value marks a katakana, hiragana or ideograph char.
  // The argument is parenthesized so that an expression such as (a | b) is
  // masked as a whole.
  #define IsIdeographic(_c1)  ( 0 != ((_c1) & (C3_KATAKANA | C3_HIRAGANA | C3_IDEOGRAPH)) )
  639. /*
  640. * IsSameClass(currType1, startType1, currType3, startType3 )
  641. *
  642. * @func Used to determine word breaks.
  643. *
  644. * @comm Ideographic chars are all considered to be unique, so that only
  645. * one at a time is selected
  646. */
  647. BOOL IsSameClass(WORD currType1, WORD startType1,
  648. WORD currType3, WORD startType3 )
  649. {
  650. BOOL fIdeographic = IsIdeographic(currType3);
  651. // Do classifications for startType3 being ideographic
  652. if(IsIdeographic(startType3))
  653. {
  654. int checkTypes = (currType3 & IdeoTypes) ^ (startType3 & IdeoTypes);
  655. // We only get picky with non-ideographic Kana chars
  656. // C3_HALFWIDTH | C3_FULLWIDTH | C3_KATAKANA | C3_HIRAGANA.
  657. return fIdeographic && (startType3 & IdeoKanaTypes) &&
  658. (!checkTypes || checkTypes == C3_FULLWIDTH || checkTypes == C3_HIRAGANA ||
  659. checkTypes == (C3_FULLWIDTH | C3_HIRAGANA));
  660. }
  661. // Do classifications for nonideographic startType3
  662. return !fIdeographic && IsSameNonFEClass(currType1, startType1);
  663. }
  664. WORD ClassifyChar(
  665. WCHAR ch,
  666. LCID lcid)
  667. {
  668. TRACEBEGIN(TRCSUBSYSBACK, TRCSCOPEINTERN, "ClassifyChar");
  669. WORD wRes, cType3;
  670. int kinsokuclass;
  671. BatchClassify(&ch, 1, lcid, &cType3, &kinsokuclass, &wRes);
  672. return wRes;
  673. }
  674. //
  675. // This is a cache of the wRes (word-break result) values for characters 0..255, indexed by character code; BatchClassify() reads it for every ch <= 255.
  676. //
  677. const byte rgwresAnsi[256] = {
  678. 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x14, //0x00
  679. 0x00, 0x13, 0x14, 0x14, 0x14, 0x14, 0x00, 0x00, //0x08
  680. 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, //0x10
  681. 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, //0x18
  682. 0x32, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, //0x20
  683. 0x01, 0x01, 0x01, 0x01, 0x01, 0x41, 0x01, 0x01, //0x28
  684. 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, //0x30
  685. 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, //0x38
  686. 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, //0x40
  687. 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, //0x48
  688. 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, //0x50
  689. 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, //0x58
  690. 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, //0x60
  691. 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, //0x68
  692. 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, //0x70
  693. 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x00, //0x78
  694. 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, //0x80
  695. 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, //0x88
  696. 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, //0x90
  697. 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, //0x98
  698. 0x12, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, //0xA0
  699. 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, //0xA8
  700. 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, //0xB0
  701. 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, //0xB8
  702. 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, //0xC0
  703. 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, //0xC8
  704. 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, //0xD0
  705. 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, //0xD8
  706. 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, //0xE0
  707. 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, //0xE8
  708. 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, //0xF0
  709. 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};//0xF8
  710. /*
  711. * BatchClassify (pch, cch, lcid, pcType3, kinsokuClassifications, pwRes)
  712. *
  713. * @func
  714. * Kinsoku classify and ClassifyChar() each character of the given string.
  715. *
  716. * @comm
  717. * The Kinsoku classifications are passed to the CanBreak() routine. We
  718. * do process in batch to save on overhead.
  719. *
  720. * If the character is not in the Kinsoku classification tables then
  721. * GetStringTypeEx is used to classify any remaining character.
  722. *
  723. * *Note* Using CT_CTYPE1 values alone is unreliable since CT_CTYPE1
  724. * defines C1_PUNCT for all diacritic characters. According to KDChang,
  725. * this is by design for POSIX compatibility and it couldn't be changed
  726. * easily since Win9x shares the same NLS data with NT. (wchao)
  727. * Therefore we use CT_CTYPE3 data to distinguish diacritics, except on
  728. * Win9x, for which we use a range check, since GetStringTypeExW isn't
  729. * supported).
  730. *
  731. * @rdesc
  732. * Result in out param kinsokuClassifications.
  733. * pcType3 result from GetStringTypeEx for CT_CTYPE3
  734. */
  735. void BatchClassify (
  736. const WCHAR *pch, //@parm char string
  737. INT cch, //@parm Count of chars in string
  738. LCID lcid, //@parm lcid to use for GetStringTypeExA()
  739. WORD *pcType3, //@parm Result of GetStringTypeEx for CT_CTYPE3
  740. INT * kinsokuClassifications, // @parm Result of the classifications
  741. WORD *pwRes) //@parm ClassifyChar() result
  742. {
  743. TRACEBEGIN(TRCSUBSYSFE, TRCSCOPEINTERN, "BatchClassify");
  744. WCHAR ch;
  745. WORD wRes;
  746. Assert( cch < MAX_CLASSIFY_CHARS );
  747. Assert( pch );
  748. Assert( kinsokuClassifications );
  749. W32->GetStringTypes(lcid, pch, cch, pwRes, pcType3);
  750. while ( cch-- ) // For all ch...
  751. {
  752. wRes = *pwRes;
  753. ch = *pch++;
  754. if (ch <= 255)
  755. wRes = rgwresAnsi[ch];
  756. else if(IsKorean(ch))
  757. wRes = WBF_KOREAN; // Special Korean class
  758. else if (IsThai(ch))
  759. wRes = 0; // Thai class
  760. else if (ch == WCH_EMBEDDING) // Objects
  761. wRes = 2 | WBF_BREAKAFTER;
  762. else if(wRes & C1_SPACE)
  763. {
  764. if (wRes & C1_BLANK)
  765. wRes = 2 | WBF_ISWHITE | WBF_BREAKLINE;
  766. else
  767. wRes = 4 | WBF_ISWHITE | WBF_BREAKLINE;
  768. }
  769. else if((wRes & C1_PUNCT) && !IsDiacriticOrKashida(ch, *pcType3))
  770. wRes = 1;
  771. else
  772. wRes = 0;
  773. *pwRes++ = wRes;
  774. *kinsokuClassifications++ = GetKinsokuClass(ch, *pcType3, lcid);
  775. pcType3++;
  776. }
  777. }
  778. /*
  779. * GetKinsokuClass (ch, cType3, lcid)
  780. *
  781. * @func
  782. * Kinsoku classify ch
  783. *
  784. * @comm
  785. * The Kinsoku classifications are passed to the CanBreak() routine. This
  786. * single-character routine is for use with LineServices
  787. *
  788. * If the character is not in the Kinsoku classification tables then
  789. * GetStringTypeEx is used to classify any remaining character.
  790. *
  791. * @rdesc
  792. * Kinsoku classification for ch
  793. */
  794. INT GetKinsokuClass (
  795. WCHAR ch, //@parm char
  796. WORD cType3, //@parm cType3 info
  797. LCID lcid) //@parm lcid
  798. {
  799. //TRACEBEGIN(TRCSUBSYSFE, TRCSCOPEINTERN, "GetKinsokuClass");
  800. // surrogate classification
  801. if (IN_RANGE(0xD800, ch, 0xDFFF))
  802. return IN_RANGE(0xDC00, ch, 0xDFFF) ? brkclsClose : brkclsOpen;
  803. INT iCategory = KinsokuClassify(ch);
  804. if(iCategory >= 0)
  805. return iCategory;
  806. if (cType3 == 0xFFFF)
  807. W32->GetStringTypeEx(lcid, CT_CTYPE3, &ch, 1, &cType3);
  808. if(cType3 & C3_SYMBOL)
  809. return 13; // Symbol chars
  810. if(IsKorean(ch) || cType3 & C3_FullWidth)
  811. return 11; // Ideographic chars
  812. return 15; // All other chars.
  813. }
  814. /*
  815. * CanBreak(class1, class2)
  816. *
  817. * @func
  818. * Look into the truth table to see if two consecutive charcters
  819. * can have a line break between them.
  820. *
  821. * @comm
  822. * This determines whether two successive characters can break a line.
  823. * The matrix is taken from JIS X4051 and is based on categorizing
  824. * characters into 15 classifications.
  825. *
  826. * @devnote
  827. * The table is 1 based.
  828. *
  829. * @rdesc
  830. * Returns TRUE if the characters can be broken across a line.
  831. */
  832. BOOL CanBreak(
  833. INT class1, //@parm Kinsoku classification of character #1
  834. INT class2 ) //@parm Kinsoku classification of following character.
  835. {
  836. TRACEBEGIN(TRCSUBSYSFE, TRCSCOPEINTERN, "CanBreak");
  837. static const WORD br[16] = {// fedc ba98 7654 3210
  838. 0x0000, // 0 0000 0000 0000 0000
  839. 0x0000, // 1 0000 0000 0000 0000
  840. 0xfd82, // 2 1111 1101 1000 0010
  841. 0xfd82, // 3 1111 1101 1000 0010
  842. 0xfd82, // 4 1111 1101 1000 0010
  843. 0xfd82, // 5 1111 1101 1000 0010
  844. 0x6d82, // 6 0110 1101 1000 0010
  845. 0xfd02, // 7 1111 1101 0000 0010
  846. 0x0000, // 8 0000 0000 0000 0000
  847. 0xfd82, // 9 1111 1101 1000 0010
  848. 0xfd83, // a 1111 1101 1000 0011
  849. 0xfd82, // b 1111 1101 1000 0010
  850. 0x6d82, // c 0110 1101 1000 0010
  851. 0x5d82, // d 0101 1101 1000 0010
  852. 0xfd83, // e 1111 1101 1000 0011
  853. 0x4d82, // f 0100 1101 1000 0010
  854. };
  855. return (br[class1] >> class2) & 1;
  856. }
  857. /*
  858. * IsURLDelimiter(ch)
  859. *
  860. * @func
  861. * Punctuation characters are those of sets 0, 1, 2, 4, 5, and 6,
  862. * and < or > which we consider to be brackets, not "less" or
  863. * "greater" signs. On the other hand; "/" (in set 6) should not be
  864. * a delimiter, but rather a part of the URL.
  865. *
  866. * @comm This function is used in URL detection
  867. *
  868. * @rdesc
  869. * Returns TRUE if the character is a punctuation mark.
  870. */
  871. BOOL IsURLDelimiter(
  872. WCHAR ch)
  873. {
  874. if (IsKorean(ch))
  875. return TRUE;
  876. INT iset = KinsokuClassify(ch);
  877. return IN_RANGE(0, iset, 2) || (IN_RANGE(4, iset, 6) && ch != '/')
  878. || ch == '<' || ch == '>';
  879. }