Source code of Windows XP (NT5)
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1324 lines
53 KiB

  1. /*----------------------------------------------------------------------------
  2. %%File: validate.c
  3. %%Unit: fechmap
  4. %%Contact: jpick
  5. "Rolling" state machines that allow interactive verification of
  6. DBCS and EUC files. Currently, separate tables are stored for
  7. each encoding so that the state machines can be run in parallel
  8. (i.e., multiple parse streams).
  9. These routines are used by auto-detection and if caller wants
  10. conversion routines to return errors on invalid characters.
  11. Following is a description of the structure of the DBCS and EUC
  12. encodings handled by this module. This information is taken from
  13. CJK.INF (maintained by Ken Lunde, author of _Understanding Japanese
  14. Information Processing_). This information governs the structure
  15. of the class and validation state tables used in this module.
  16. Big5
  17. Two-byte Standard Characters Encoding Ranges
  18. first byte range 0xA1-0xFE
  19. second byte ranges 0x40-0x7E, 0xA1-0xFE
  20. One-byte Characters Encoding Range
  21. ASCII 0x21-0x7E
  22. GBK
  23. Two-byte Standard Characters Encoding Ranges
  24. first byte range 0x81-0xFE
  25. second byte ranges 0x40-0x7E and 0x80-0xFE
  26. One-byte Characters Encoding Range
  27. ASCII 0x21-0x7E
  28. HZ (information from HZ spec Fung F. Lee (lee@umunhum.stanford.edu))
  29. One-byte characters Encoding Ranges
  30. first GB byte range 0x21-0x77
  31. second GB byte range 0x21-0x7E
  32. ASCII 0x21-0x7E
  33. Mode switching Encoding sequence
  34. escape sequence from ASCII to GB 0x7E followed by 0x7B ("~{")
  35. escape sequence from GB to ASCII 0x7E followed by 0x7D ("~}")
  36. line continuation marker 0x7E followed by 0x0A
  37. (Note: ASCII mode is the default mode)
  38. Shift-Jis
  39. Two-byte Standard Characters Encoding Ranges
  40. first byte ranges 0x81-0x9F, 0xE0-0xEF
  41. second byte ranges 0x40-0x7E, 0x80-0xFC
  42. Two-byte User-defined Characters Encoding Ranges
  43. first byte range 0xF0-0xFC
  44. second byte ranges 0x40-0x7E, 0x80-0xFC
  45. One-byte Characters Encoding Range
  46. Half-width katakana 0xA1-0xDF
  47. ASCII/JIS-Roman 0x21-0x7E
  48. Wansung
  49. Two-byte Standard Characters Encoding Ranges
  50. first byte range 0x81-0xFE
  51. second byte ranges 0x40-0x7E and 0x80-0xFE
  52. One-byte Characters Encoding Range
  53. ASCII 0x21-0x7E
  54. EUC-Cn
  55. Code set 0 (ASCII or GB 1988-89): 0x21-0x7E
  56. Code set 1 (GB 2312-80): 0xA1A1-0xFEFE
  57. Code set 2: unused
  58. Code set 3: unused
  59. EUC-Jp
  60. Code set 0 (ASCII or JIS X 0201-1976 Roman): 0x21-0x7E
  61. Code set 1 (JIS X 0208): 0xA1A1-0xFEFE
  62. Code set 2 (half-width katakana): 0x8EA1-0x8EDF
  63. Code set 3 (JIS X 0212-1990): 0x8FA1A1-0x8FFEFE
  64. EUC-Kr
  65. Code set 0 (ASCII or KS C 5636-1993): 0x21-0x7E
  66. Code set 1 (KS C 5601-1992): 0xA1A1-0xFEFE
  67. Code set 2: unused
  68. Code set 3: unused
  69. EUC-Tw
  70. Code set 0 (ASCII): 0x21-0x7E
  71. Code set 1 (CNS 11643-1992 Plane 1): 0xA1A1-0xFEFE
  72. Code set 2 (CNS 11643-1992 Planes 1-16): 0x8EA1A1A1-0x8EB0FEFE
  73. Code set 3: unused
  74. UTF-7 (information from the RFC2152 by D.Goldsmith)
  75. One-byte characters Encoding Ranges
  76. Direct and Optionally direct 0x21-0x2A, 0x2C-0x5B,
  77. 0x5D-0x60, 0x7B-0x7D
  78. 0x09, 0x0A, 0x0D, 0x20
  79. Modified Base64 0x2B, 0x2F-0x39, 0x41-0x5A, 0x61-0x7A
  80. Mode switching
  81. escape sequence from D/O to M. Base64 0x2B
  82. escape sequence from M. Base64 to D/O 0x2D (or any control character)
  83. ----------------------------------------------------------------------------*/
  84. #include <stdio.h>
  85. #include <stddef.h>
  86. #include "private.h"
  87. #include "fechmap_.h"
  88. #include "lexint_.h"
  89. /*----------------------------------------------------------------------------
  90. Common Defs for all Sequence Validation
  91. ----------------------------------------------------------------------------*/
  92. // Characters are broken down into ranges -- the smallest ranges that
  93. // are treated as important by either EUC or DBCS (all flavors). In
  94. // some cases, the smallest range is a single character. It saves
  95. // some space to avoid having two class tables (even though more states
  96. // are added to the state machines), so both encodings share these
  97. // tokens.
  //
  // Each token value is also a column index into the [nStates][nTokens]
  // next-state tables defined later in this file, and _rgchCharClass maps
  // a byte value to one of these tokens. A token name spells the byte
  // range it covers (e.g. x2f39 covers 0x2f-0x39).
  98. // Common Tokens
  99. //
  100. #define ollow 0 // "other" legal low ascii character
  101. #define x000a 1 // 0x0a ("\n")
  102. #define x212a 2 // characters in range 0x21-0x2a
  103. #define x002b 3 // 0x2b ("+")
  104. #define x002c 4 // 0x2c (",")
  105. #define x002d 5 // 0x2d ("-")
  106. #define x002e 6 // 0x2e (".")
  107. #define x2f39 7 // characters in range 0x2f-0x39
  108. #define x3a3f 8 // characters in range 0x3a-0x3f
  109. #define x0040 9 // 0x40
  110. #define x415a 10 // characters in range 0x41-0x5a
  111. #define x005b 11 // 0x5b ("[")
  112. #define x005c 12 // 0x5c ("\")
  113. #define x5d60 13 // characters in range 0x5d-0x60
  114. #define x6177 14 // characters in range 0x61-0x77
  115. #define x787a 15 // characters in range 0x78-0x7a
  116. #define x007b 16 // 0x7b ("{")
  117. #define x007c 17 // 0x7c ("|")
  118. #define x007d 18 // 0x7d ("}")
  119. #define x007e 19 // 0x7e ("~")
  120. #define x007f 20 // 0x7f (DEL)
  121. #define x0080 21 // 0x80
  122. #define x818d 22 // characters in range 0x81-0x8d
  123. #define x008e 23 // 0x8e
  124. #define x008f 24 // 0x8f
  125. #define x909f 25 // characters in range 0x90-0x9f
  126. #define x00a0 26 // 0xa0
  127. #define xa1b0 27 // characters in range 0xa1-0xb0
  128. #define xb1df 28 // characters in range 0xb1-0xdf
  129. #define xe0ef 29 // characters in range 0xe0-0xef
  130. #define xf0fc 30 // characters in range 0xf0-0xfc
  131. #define xfdfe 31 // characters in range 0xfd-0xfe
  132. #define ateof 32 // end-of-file (no byte value maps to this token in _rgchCharClass)
  133. #define other 33 // character not covered by above tokens
  134. #define nTokens 34 // number of tokens == column count of every next-state table
  135. // Class table
  136. //
  // Maps each of the 256 possible byte values to one of the common tokens
  // above. One row per high nibble, one column per low nibble. In the
  // control range only tab (0x09), cr (0x0d), 0x1a and space (0x20) are
  // classified as ollow, and lf (0x0a) has its own token; every other
  // control character, and 0xff, is "other".
  137. static char _rgchCharClass[256] =
  138. // 0 1 2 3 4 5 6 7 8 9 a b c d e f
  139. {
  140. // 0 nul soh stx etx eot enq ack bel bs tab lf vt np cr so si 0
  141. other, other, other, other, other, other, other, other, other, ollow, x000a, other, other, ollow, other, other,
  142. // 1 dle dc1 dc2 dc3 dc4 nak syn etb can em eof esc fs gs rs us 1
  143. other, other, other, other, other, other, other, other, other, other, ollow, other, other, other, other, other,
  144. // 2 sp ! " # $ % & ' ( ) * + , - . / 2
  145. ollow, x212a, x212a, x212a, x212a, x212a, x212a, x212a, x212a, x212a, x212a, x002b, x002c, x002d, x002e, x2f39,
  146. // 3 0 1 2 3 4 5 6 7 8 9 : ; < = > ? 3
  147. x2f39, x2f39, x2f39, x2f39, x2f39, x2f39, x2f39, x2f39, x2f39, x2f39, x3a3f, x3a3f, x3a3f, x3a3f, x3a3f, x3a3f,
  148. // 4 @ A B C D E F G H I J K L M N O 4
  149. x0040, x415a, x415a, x415a, x415a, x415a, x415a, x415a, x415a, x415a, x415a, x415a, x415a, x415a, x415a, x415a,
  150. // 5 P Q R S T U V W X Y Z [ \ ] ^ _ 5
  151. x415a, x415a, x415a, x415a, x415a, x415a, x415a, x415a, x415a, x415a, x415a, x005b, x005c, x5d60, x5d60, x5d60,
  152. // 6 ` a b c d e f g h i j k l m n o 6
  153. x5d60, x6177, x6177, x6177, x6177, x6177, x6177, x6177, x6177, x6177, x6177, x6177, x6177, x6177, x6177, x6177,
  154. // 7 p q r s t u v w x y z { | } ~ del 7
  155. x6177, x6177, x6177, x6177, x6177, x6177, x6177, x6177, x787a, x787a, x787a, x007b, x007c, x007d, x007e, x007f,
  156. // 8 8
  157. x0080, x818d, x818d, x818d, x818d, x818d, x818d, x818d, x818d, x818d, x818d, x818d, x818d, x818d, x008e, x008f,
  158. // 9 9
  159. x909f, x909f, x909f, x909f, x909f, x909f, x909f, x909f, x909f, x909f, x909f, x909f, x909f, x909f, x909f, x909f,
  160. // a a
  161. x00a0, xa1b0, xa1b0, xa1b0, xa1b0, xa1b0, xa1b0, xa1b0, xa1b0, xa1b0, xa1b0, xa1b0, xa1b0, xa1b0, xa1b0, xa1b0,
  162. // b b
  163. xa1b0, xb1df, xb1df, xb1df, xb1df, xb1df, xb1df, xb1df, xb1df, xb1df, xb1df, xb1df, xb1df, xb1df, xb1df, xb1df,
  164. // c c
  165. xb1df, xb1df, xb1df, xb1df, xb1df, xb1df, xb1df, xb1df, xb1df, xb1df, xb1df, xb1df, xb1df, xb1df, xb1df, xb1df,
  166. // d d
  167. xb1df, xb1df, xb1df, xb1df, xb1df, xb1df, xb1df, xb1df, xb1df, xb1df, xb1df, xb1df, xb1df, xb1df, xb1df, xb1df,
  168. // e e
  169. xe0ef, xe0ef, xe0ef, xe0ef, xe0ef, xe0ef, xe0ef, xe0ef, xe0ef, xe0ef, xe0ef, xe0ef, xe0ef, xe0ef, xe0ef, xe0ef,
  170. // f f
  171. xf0fc, xf0fc, xf0fc, xf0fc, xf0fc, xf0fc, xf0fc, xf0fc, xf0fc, xf0fc, xf0fc, xf0fc, xf0fc, xfdfe, xfdfe, other,
  172. // 0 1 2 3 4 5 6 7 8 9 a b c d e f
  173. };
  174. // Common States -- All SM's use these
  175. //
  // ACC and ERR are the terminal values stored in the next-state tables.
  176. #define ACC 0x4e
  177. #define ERR 0x7f
  178. // Other States -- All SM's use some of these, not all use all
  179. //
  // STn is also the row index of state n in a next-state table; STnc is
  // the same state with bit 0x40 set, which marks it as "counting".
  180. #define ST0 0x00
  181. #define ST0c 0x40
  182. #define ST1 0x01
  183. #define ST1c 0x41
  184. #define ST2 0x02
  185. #define ST2c 0x42
  186. #define ST3 0x03
  187. #define ST3c 0x43
  188. #define ST4 0x04
  189. #define ST4c 0x44
  190. // Each state can have a corresponding counting state, i.e. a state
  191. // with the same transitions but during which we look for special sequences.
  192. //
  // NOTE(review): ERR (0x7f) also has bit 0x40 set, so FTstCounting(ERR)
  // is non-zero just as it is for ACC (0x4e) -- confirm callers test for
  // ERR before testing the counting bit.
  193. #define FTstCounting(tst) (((tst) & 0x40) != 0) // If the state is counting (including ACC)
  194. #define TstNotCountingFromTst(tst) ((tst) & 0x3f) // Obtain the real state from the counting
  195. /*----------------------------------------------------------------------------
  196. DBCS character sequence validation
  197. ----------------------------------------------------------------------------*/
  // Shift-JIS next-state table: one row per state, one column per token.
  // Each entry is the next state, or ACC / ERR.
  // State 0 accepts single bytes (including half-width katakana
  // 0xa1-0xdf) and moves to state 1 on lead bytes 0x81-0x9f and 0xe0-0xfc.
  // State 1 accepts trail bytes 0x40-0x7e and 0x80-0xfc.
  198. #define nSJisStates 2
  199. static signed char _rgchSJisNextState[nSJisStates][nTokens] =
  200. {
  201. // o x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x a o
  202. // l 0 2 0 0 0 0 2 3 0 4 0 0 5 6 7 0 0 0 0 0 0 8 0 0 9 0 a b e f f t t
  203. // l 0 1 0 0 0 0 f a 0 1 0 0 d 1 8 0 0 0 0 0 0 1 0 0 0 0 1 1 0 0 d e h
  204. // o 0 2 2 2 2 2 3 3 4 5 5 5 6 7 7 7 7 7 7 7 8 8 8 8 9 a b d e f f o e
  205. // w a a b c d e 9 f 0 a b c 0 7 a b c d e f 0 d e f f 0 0 f f c e f r
  206. //------------------------------------------------------------------------------
  207. //
  208. // DBCS State 0 -- start (look for legal single byte or lead byte)
  209. ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ERR, ERR, ST1, ST1, ST1, ST1, ERR, ACC, ACC, ST1, ST1, ERR, ACC, ERR,
  210. // DBCS State 1 -- saw lead byte, need legal trail byte
  211. ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ERR, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ERR, ERR, ERR,
  212. };
  // Big5 next-state table: one row per state, one column per token.
  // NOTE(review): state 0 treats every byte 0x81-0xfe as a lead byte,
  // which is wider than the 0xA1-0xFE first-byte range listed in the file
  // header -- presumably deliberate loosening; confirm.
  // State 1 accepts trail bytes 0x40-0x7e and 0xa1-0xfe.
  213. #define nBig5States 2
  214. static signed char _rgchBig5NextState[nBig5States][nTokens] =
  215. {
  216. //
  217. // o x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x a o
  218. // l 0 2 0 0 0 0 2 3 0 4 0 0 5 6 7 0 0 0 0 0 0 8 0 0 9 0 a b e f f t t
  219. // l 0 1 0 0 0 0 f a 0 1 0 0 d 1 8 0 0 0 0 0 0 1 0 0 0 0 1 1 0 0 d e h
  220. // o 0 2 2 2 2 2 3 3 4 5 5 5 6 7 7 7 7 7 7 7 8 8 8 8 9 a b d e f f o e
  221. // w a a b c d e 9 f 0 a b c 0 7 a b c d e f 0 d e f f 0 0 f f c e f r
  222. //------------------------------------------------------------------------------
  223. //
  224. // DBCS State 0 -- start (look for legal single byte or lead byte)
  225. ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ERR, ERR, ST1, ST1, ST1, ST1, ST1, ST1, ST1, ST1, ST1, ST1, ACC, ERR,
  226. // DBCS State 1 -- saw lead byte, need legal trail byte
  227. ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ACC, ACC, ACC, ACC, ACC, ERR, ERR,
  228. };
  // Next-state table named for both GBK and Wansung (the file header
  // lists identical byte ranges for the two encodings).
  // State 0 moves to state 1 on lead bytes 0x81-0xfe.
  // State 1 accepts trail bytes 0x40-0x7e and 0x80-0xfe.
  229. #define nGbkWanStates 2
  230. static signed char _rgchGbkWanNextState[nGbkWanStates][nTokens] =
  231. {
  232. //
  233. // o x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x a o
  234. // l 0 2 0 0 0 0 2 3 0 4 0 0 5 6 7 0 0 0 0 0 0 8 0 0 9 0 a b e f f t t
  235. // l 0 1 0 0 0 0 f a 0 1 0 0 d 1 8 0 0 0 0 0 0 1 0 0 0 0 1 1 0 0 d e h
  236. // o 0 2 2 2 2 2 3 3 4 5 5 5 6 7 7 7 7 7 7 7 8 8 8 8 9 a b d e f f o e
  237. // w a a b c d e 9 f 0 a b c 0 7 a b c d e f 0 d e f f 0 0 f f c e f r
  238. //------------------------------------------------------------------------------
  239. //
  240. // DBCS State 0 -- start (look for legal single byte or lead byte)
  241. ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ERR, ERR, ST1, ST1, ST1, ST1, ST1, ST1, ST1, ST1, ST1, ST1, ACC, ERR,
  242. // DBCS State 1 -- saw lead byte, need legal trail byte
  243. ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ERR, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ERR, ERR,
  244. };
  245. /*----------------------------------------------------------------------------
  246. EUC character sequence validation
  247. ----------------------------------------------------------------------------*/
  // EUC-JP next-state table: one row per state, one column per token.
  // From state 0, 0x8e leads to state 2 (code set 2), 0x8f leads to
  // state 3 (code set 3), and 0xa1-0xfe leads to state 1 (code set 1).
  // State 3 consumes one 0xa1-0xfe byte and then reuses state 1 for the
  // final byte of the three-byte form.
  248. #define nEucJpStates 4
  249. static signed char _rgchEucJpNextState[nEucJpStates][nTokens] =
  250. {
  251. //
  252. // o x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x a o
  253. // l 0 2 0 0 0 0 2 3 0 4 0 0 5 6 7 0 0 0 0 0 0 8 0 0 9 0 a b e f f t t
  254. // l 0 1 0 0 0 0 f a 0 1 0 0 d 1 8 0 0 0 0 0 0 1 0 0 0 0 1 1 0 0 d e h
  255. // o 0 2 2 2 2 2 3 3 4 5 5 5 6 7 7 7 7 7 7 7 8 8 8 8 9 a b d e f f o e
  256. // w a a b c d e 9 f 0 a b c 0 7 a b c d e f 0 d e f f 0 0 f f c e f r
  257. //------------------------------------------------------------------------------
  258. //
  259. // EUC State 0 -- start
  260. ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ERR, ERR, ERR, ST2, ST3, ERR, ERR, ST1, ST1, ST1, ST1, ST1, ACC, ERR,
  261. // EUC State 1 -- saw a1fe, need one more a1fe
  262. ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ACC, ACC, ACC, ACC, ACC, ERR, ERR,
  263. // EUC State 2 -- saw 8e, need a1df
  264. ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ACC, ACC, ERR, ERR, ERR, ERR, ERR,
  265. // EUC State 3 -- saw 8f, need 2 a1fe
  266. ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ST1, ST1, ST1, ST1, ST1, ERR, ERR,
  267. };
  // Next-state table named for both EUC-KR and EUC-CN. Only code sets 0
  // and 1 are recognised: 0x8e and 0x8f are errors in state 0, matching
  // the "unused" code sets 2 and 3 in the file header.
  268. #define nEucKrCnStates 2
  269. static signed char _rgchEucKrCnNextState[nEucKrCnStates][nTokens] =
  270. {
  271. //
  272. // o x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x a o
  273. // l 0 2 0 0 0 0 2 3 0 4 0 0 5 6 7 0 0 0 0 0 0 8 0 0 9 0 a b e f f t t
  274. // l 0 1 0 0 0 0 f a 0 1 0 0 d 1 8 0 0 0 0 0 0 1 0 0 0 0 1 1 0 0 d e h
  275. // o 0 2 2 2 2 2 3 3 4 5 5 5 6 7 7 7 7 7 7 7 8 8 8 8 9 a b d e f f o e
  276. // w a a b c d e 9 f 0 a b c 0 7 a b c d e f 0 d e f f 0 0 f f c e f r
  277. //------------------------------------------------------------------------------
  278. //
  279. // EUC State 0 -- start
  280. ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ST1, ST1, ST1, ST1, ST1, ACC, ERR,
  281. // EUC State 1 -- saw a1fe, need one more a1fe
  282. ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ACC, ACC, ACC, ACC, ACC, ERR, ERR,
  283. };
  // EUC-TW next-state table: one row per state, one column per token.
  // The four-byte code set 2 form is walked as 0x8e -> state 2,
  // 0xa1-0xb0 -> state 3, 0xa1-0xfe -> state 1, 0xa1-0xfe -> ACC.
  // 0x8f is an error in state 0 (code set 3 is unused).
  284. #define nEucTwStates 4
  285. static signed char _rgchEucTwNextState[nEucTwStates][nTokens] =
  286. {
  287. //
  288. // o x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x a o
  289. // l 0 2 0 0 0 0 2 3 0 4 0 0 5 6 7 0 0 0 0 0 0 8 0 0 9 0 a b e f f t t
  290. // l 0 1 0 0 0 0 f a 0 1 0 0 d 1 8 0 0 0 0 0 0 1 0 0 0 0 1 1 0 0 d e h
  291. // o 0 2 2 2 2 2 3 3 4 5 5 5 6 7 7 7 7 7 7 7 8 8 8 8 9 a b d e f f o e
  292. // w a a b c d e 9 f 0 a b c 0 7 a b c d e f 0 d e f f 0 0 f f c e f r
  293. //------------------------------------------------------------------------------
  294. //
  295. // EUC State 0 -- start
  296. ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ERR, ERR, ERR, ST2, ERR, ERR, ERR, ST1, ST1, ST1, ST1, ST1, ACC, ERR,
  297. // EUC State 1 -- saw a1fe, need one more a1fe
  298. ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ACC, ACC, ACC, ACC, ACC, ERR, ERR,
  299. // EUC State 2 -- saw 8e, need a1b0
  300. ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ST3, ERR, ERR, ERR, ERR, ERR, ERR,
  301. // EUC State 3 -- saw 8e, a1b0; need 2 a1fe
  302. ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ST1, ST1, ST1, ST1, ST1, ERR, ERR,
  303. };
  304. /*-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
  305. HZ character sequence validation
  306. ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------*/
  307. // Currently some of the rules for HZ encoding outlined above are a bit loosened up.
  308. // (e.g. the range for the first GB byte is expanded) The rules were adjusted based on real data.
  //
  // States 0-1 are ASCII mode and states 2-4 are GB mode. The "~"
  // transitions and the "~{" transition use the counting variants
  // (ST1c, ST2c, ST4c). Every byte 0x80 and above is an error in all
  // states.
  309. #define nHzStates 5
  310. static signed char _rgchHzNextState[nHzStates][nTokens] =
  311. {
  312. //
  313. // o x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x a o
  314. // l 0 2 0 0 0 0 2 3 0 4 0 0 5 6 7 0 0 0 0 0 0 8 0 0 9 0 a b e f f t t
  315. // l 0 1 0 0 0 0 f a 0 1 0 0 d 1 8 0 0 0 0 0 0 1 0 0 0 0 1 1 0 0 d e h
  316. // o 0 2 2 2 2 2 3 3 4 5 5 5 6 7 7 7 7 7 7 7 8 8 8 8 9 a b d e f f o e
  317. // w a a b c d e 9 f 0 a b c 0 7 a b c d e f 0 d e f f 0 0 f f c e f r
  318. //------------------------------------------------------------------------------
  319. //
  320. // HZ State 0 -- ASCII
  321. ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ST1c, ACC, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ACC, ERR,
  322. // HZ State 1 -- saw "~," looking for "{" to make transition to GB mode
  323. ERR, ACC, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ST2c, ERR, ERR, ACC, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR,
  324. // HZ State 2 -- just saw "{," expecting GB byte
  325. ST3, ST3, ST3, ST3, ST3, ST3, ST3, ST3, ST3, ST3, ST3, ST3, ST3, ST3, ST3, ST3, ERR, ERR, ERR, ST4c, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR,
  326. // HZ State 3 -- expecting GB byte
  327. ST3, ST3, ST3, ST3, ST3, ST3, ST3, ST3, ST3, ST3, ST3, ST3, ST3, ST3, ST3, ST3, ST3, ST3, ST3, ST4c, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR,
  328. // HZ State 4 -- saw "~," looking for "}" to make transition to ASCII mode
  329. ST3, ST3, ST3, ST3, ST3, ST3, ST3, ST3, ST3, ST3, ST3, ST3, ST3, ST3, ST3, ST3, ST3, ST3, ACC, ST3, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR,
  330. };
  331. /*-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
  332. UTF-7 character sequence validation
  333. ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------*/
  // UTF-7 next-state table: one row per state, one column per token.
  // "+" in state 0 enters the counting state ST1c. States 1 and 2 stay
  // in Base64 mode (ST2) on "+", 0x2f-0x39, 0x41-0x5a and 0x61-0x7a, and
  // "-" returns ACC. "\" and "~" are errors in state 0, and every byte
  // 0x80 and above is an error in all states.
  334. #define nUtf7States 3
  335. static signed char _rgchUtf7NextState[nUtf7States][nTokens] =
  336. {
  337. //
  338. // o x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x a o
  339. // l 0 2 0 0 0 0 2 3 0 4 0 0 5 6 7 0 0 0 0 0 0 8 0 0 9 0 a b e f f t t
  340. // l 0 1 0 0 0 0 f a 0 1 0 0 d 1 8 0 0 0 0 0 0 1 0 0 0 0 1 1 0 0 d e h
  341. // o 0 2 2 2 2 2 3 3 4 5 5 5 6 7 7 7 7 7 7 7 8 8 8 8 9 a b d e f f o e
  342. // w a a b c d e 9 f 0 a b c 0 7 a b c d e f 0 d e f f 0 0 f f c e f r
  343. //------------------------------------------------------------------------------
  344. //
  345. // UTF7 State 0 -- Direct/optionally direct ASCII mode, state transition can happen on "+"
  346. ACC, ACC, ACC, ST1c, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ERR, ACC, ACC, ACC, ACC, ACC, ACC, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ACC, ERR,
  347. // UTF7 State 1 -- Expecting first character from Modified Base64 alphabet
  348. ERR, ERR, ERR, ST2, ERR, ACC, ERR, ST2, ERR, ERR, ST2, ERR, ERR, ERR, ST2, ST2, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR,
  349. // UTF7 State 2 -- Modified Base64 alphabet mode, can be exited with "-" or any control character.
  350. ACC, ACC, ERR, ST2, ERR, ACC, ERR, ST2, ERR, ERR, ST2, ERR, ERR, ERR, ST2, ST2, ERR, ERR, ERR, ERR, ACC, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ACC, ERR,
  351. };
  352. /*----------------------------------------------------------------------------
  353. UTF-8 character sequence validation
  354. ----------------------------------------------------------------------------*/
  // Number of trail bytes still expected for the UTF-8 sequence currently
  // being validated. This is the validator's only state; 0 means the
  // validator is between characters.
  static int _nUtf8Tb = 0;

  #define BIT7(a) ((a) & 0x80)
  #define BIT6(a) ((a) & 0x40)

  /* N U T F 8 */
  /*----------------------------------------------------------------------------
      %%Function: NUtf8
      %%Contact: jpick

      UTF-8 doesn't require a state table for validation, just a count
      of the number of expected trail bytes.  See utf8lex.c for an
      explanation of this code.

      Parameters:
          uch   - next input byte.
          fEoi  - non-zero when the caller is signalling end-of-input.

      Returns:
           1  - uch completed a legal character (a single ASCII byte or the
                last expected trail byte).
           0  - error: input ended mid-sequence, an unexpected single, lead
                or trail byte was seen, or uch is 0xFE/0xFF.
          -1  - uch was consumed but more trail bytes are required.

      _nUtf8Tb is not reset on the error paths; only the 0xFE/0xFF rejection
      is guaranteed to leave it at zero (it is only reached when the count
      is already zero).
  ----------------------------------------------------------------------------*/
  static int __inline NUtf8(UCHAR uch, BOOL fEoi)
  {
      // BIT7(uch) == 0 implies single ASCII byte.
      // BIT6(uch) == 0 implies one of n trail bytes.
      // Otherwise, lead byte, with number of bits set
      // up to first 0 equal to the total number bytes
      // in the sequence.
      //
      // REVIEW: _nUtf8Tb *is* really the state of this
      // validator -- use nState in structure?
      //
      if (fEoi && (_nUtf8Tb != 0))
      {
          return 0;               // unexpected end-of-input
      }
      else if (BIT7(uch) == 0)
      {
          if (_nUtf8Tb != 0)      // unexpected single byte
              return 0;
          return 1;
      }
      else if (BIT6(uch) == 0)
      {
          if (_nUtf8Tb == 0)      // unexpected trail byte
              return 0;
          if ((--_nUtf8Tb) == 0)
              return 1;
      }
      else
      {
          if (_nUtf8Tb != 0)      // unexpected lead byte
              return 0;

          // 0xFE and 0xFF never occur in UTF-8. Without this check the
          // bit-counting loop below would treat them as lead bytes of
          // 7- and 8-byte sequences and report "need more bytes".
          if (uch >= 0xFE)
              return 0;

          // Count the leading 1 bits: that is the total sequence length.
          while (BIT7(uch) != 0)
          {
              uch <<= 1;
              _nUtf8Tb++;
          }
          _nUtf8Tb--;             // don't count lead byte
      }
      return -1;
  }
  407. /*----------------------------------------------------------------------------
  408. Character Mapping Defs
  409. ----------------------------------------------------------------------------*/
  410. // If caller wants us to check characters as part of validation
  411. //
  // PFNCHECKCHAR: character check routine taking the input encoding id.
  412. typedef BOOL (*PFNCHECKCHAR)(ICET icetIn);
  // Capacity, in bytes, of CC.rgchBuff.
  413. #define cchMaxBuff 5
  // Per-encoding character checker: a small byte buffer plus the code
  // page and check routine used with it.
  414. typedef struct _cc
  415. {
  416. int nCp; // code page
  417. int cchBuff; // fill count of character buffer
  418. PFNCHECKCHAR pfnCheckChar; // character check routine
  419. char rgchBuff[cchMaxBuff]; // character buffer
  420. } CC;
  421. // Character validation prototypes
  422. //
  423. static BOOL _FDbcsCheckChar(ICET icetIn);
  424. // DBCS character checker structures
  425. //
  // Initializers are positional: { nCp, cchBuff, pfnCheckChar }. rgchBuff
  // is not listed and is therefore zero-initialized. All four DBCS
  // checkers share _FDbcsCheckChar and differ only in code page.
  426. // Big5
  427. static CC _ccBig5 =
  428. {
  429. nCpTaiwan,
  430. 0,
  431. _FDbcsCheckChar,
  432. };
  433. // Gbk
  434. static CC _ccGbk =
  435. {
  436. nCpChina,
  437. 0,
  438. _FDbcsCheckChar,
  439. };
  440. // ShiftJis
  441. static CC _ccSJis =
  442. {
  443. nCpJapan,
  444. 0,
  445. _FDbcsCheckChar,
  446. };
  447. // Wansung
  448. static CC _ccWan =
  449. {
  450. nCpKorea,
  451. 0,
  452. _FDbcsCheckChar,
  453. };
  454. // Character checker structures just used as buffers.
  455. //
  // These have no code page and a null check routine
  // ({ nCp, cchBuff, pfnCheckChar } are all 0), so only rgchBuff and
  // cchBuff carry meaning.
  456. // Euc-Jp
  457. static CC _ccEucJp =
  458. {
  459. 0,
  460. 0,
  461. 0,
  462. };
  463. // Hz
  464. static CC _ccHz =
  465. {
  466. 0,
  467. 0,
  468. 0,
  469. };
  470. // Utf7
  471. static CC _ccUtf7 =
  472. {
  473. 0,
  474. 0,
  475. 0,
  476. };
  477. /*----------------------------------------------------------------------------
  478. Character Occurrence Counters
  479. ----------------------------------------------------------------------------*/
  480. // If calling app wants us to track occurrences of common character
  481. // sequences during validation (used only by auto-detection, so far).
  482. //
  // COCE: one tracked sequence and its hit counter.
  483. typedef struct _coce
  484. {
  485. int cHits; // hit counter; 0 in every table initializer below
  486. short cwch; // number of entries used in rgwch (2 in every table below)
  487. WCHAR rgwch[2]; // the sequence, one element per character
  488. } COCE;
  // COC: the sequence table for one encoding plus matching state.
  // NOTE(review): the exact roles of fMatching, nCoceCurr and nCoceIndex
  // are defined by the counting routine, which is not in this part of the
  // file -- confirm there before relying on them.
  489. typedef struct _coc
  490. {
  491. BOOL fMatching; // initialized to fFalse in every instance below
  492. short nCoceCurr; // initialized to 0 in every instance below
  493. short nCoceIndex; // initialized to 0 in every instance below
  494. int ccoce; // number of entries in rgcoce
  495. COCE *rgcoce; // table of tracked sequences
  496. } COC;
  497. // Big5
  498. //
  // Two-character Big5 sequences tracked for auto-detection. Each entry
  // is { cHits, cwch, { first char, second char } }; the trailing comment
  // is a romanization of the pair.
  499. static COCE _rgcoceBig5[] =
  500. {
  501. {0, 2, {(WCHAR)0xa7da, (WCHAR)0xadcc},}, // "wo men"
  502. {0, 2, {(WCHAR)0xa8e4, (WCHAR)0xb9ea},}, // "qi shi"
  503. {0, 2, {(WCHAR)0xa65d, (WCHAR)0xacb0},}, // "yin wei"
  504. {0, 2, {(WCHAR)0xb8ea, (WCHAR)0xb054},}, // "zi xun"
  505. {0, 2, {(WCHAR)0xb971, (WCHAR)0xb8a3},}, // "dian nao"
  506. {0, 2, {(WCHAR)0xbaf4, (WCHAR)0xb8f4},}, // "wang lu"
  507. {0, 2, {(WCHAR)0xbd75, (WCHAR)0xa457},}, // "xian shang"
  508. {0, 2, {(WCHAR)0xc577, (WCHAR)0xaaef},}, // "huan ying"
  509. {0, 2, {(WCHAR)0xa477, (WCHAR)0xb867},}, // "yi jing"
  510. };
  // ccoce is derived with sizeof so it always matches the table length.
  511. static COC _cocBig5 =
  512. {
  513. fFalse, // fMatching
  514. 0, // nCoceCurr
  515. 0, // nCoceIndex
  516. sizeof(_rgcoceBig5) / sizeof(_rgcoceBig5[0]), // ccoce
  517. _rgcoceBig5, // rgcoce
  518. };
  519. // Euc-Cn
  520. //
  // Two-character EUC-CN sequences tracked for auto-detection. The values
  // are identical to the GBK table (_rgcoceGbk) later in this file.
  521. static COCE _rgcoceEucCn[] =
  522. {
  523. {0, 2, {(WCHAR)0xcbfb, (WCHAR)0xc3c7},}, // "ta men"
  524. {0, 2, {(WCHAR)0xced2, (WCHAR)0xc3c7},}, // "wo men"
  525. {0, 2, {(WCHAR)0xd2f2, (WCHAR)0xb4cb},}, // "yin ci"
  526. {0, 2, {(WCHAR)0xcab2, (WCHAR)0xc3b4},}, // "shen mo"
  527. {0, 2, {(WCHAR)0xc8e7, (WCHAR)0xb9fb},}, // "ru guo"
  528. {0, 2, {(WCHAR)0xd2f2, (WCHAR)0xceaa},}, // "yin wei"
  529. {0, 2, {(WCHAR)0xcbf9, (WCHAR)0xd2d4},}, // "suo yi"
  530. {0, 2, {(WCHAR)0xbbb6, (WCHAR)0xd3ad},}, // "huan ying"
  531. {0, 2, {(WCHAR)0xcdf8, (WCHAR)0xc2e7},}, // "wang luo"
  532. {0, 2, {(WCHAR)0xd0c5, (WCHAR)0xcfa2},}, // "xin xi"
  533. {0, 2, {(WCHAR)0xbcc6, (WCHAR)0xcbe3},}, // "ji guan"
  534. };
  // ccoce is derived with sizeof so it always matches the table length.
  535. static COC _cocEucCn =
  536. {
  537. fFalse, // fMatching
  538. 0, // nCoceCurr
  539. 0, // nCoceIndex
  540. sizeof(_rgcoceEucCn) / sizeof(_rgcoceEucCn[0]), // ccoce
  541. _rgcoceEucCn, // rgcoce
  542. };
  543. // Euc-Kr
  544. //
  // Sequences tracked for EUC-KR auto-detection. Entries come in pairs:
  // the same double-byte character followed either by a single-byte
  // character (0x0020 or 0x002e) or by a double-byte one (0xa1a1 or
  // 0xa3ae). The values are identical to the Wansung table (_rgcoceWan)
  // later in this file.
  545. static COCE _rgcoceEucKr[] =
  546. {
  547. {0, 2, {(WCHAR)0xb0a1, (WCHAR)0x0020},},
  548. {0, 2, {(WCHAR)0xb0a1, (WCHAR)0xa1a1},},
  549. {0, 2, {(WCHAR)0xb4c2, (WCHAR)0x0020},},
  550. {0, 2, {(WCHAR)0xb4c2, (WCHAR)0xa1a1},},
  551. {0, 2, {(WCHAR)0xb4d9, (WCHAR)0x002e},},
  552. {0, 2, {(WCHAR)0xb4d9, (WCHAR)0xa3ae},},
  553. {0, 2, {(WCHAR)0xb8a6, (WCHAR)0x0020},},
  554. {0, 2, {(WCHAR)0xb8a6, (WCHAR)0xa1a1},},
  555. {0, 2, {(WCHAR)0xc0ba, (WCHAR)0x0020},},
  556. {0, 2, {(WCHAR)0xc0ba, (WCHAR)0xa1a1},},
  557. {0, 2, {(WCHAR)0xc0bb, (WCHAR)0x0020},},
  558. {0, 2, {(WCHAR)0xc0bb, (WCHAR)0xa1a1},},
  559. {0, 2, {(WCHAR)0xc0cc, (WCHAR)0x0020},},
  560. {0, 2, {(WCHAR)0xc0cc, (WCHAR)0xa1a1},},
  561. };
  // ccoce is derived with sizeof so it always matches the table length.
  562. static COC _cocEucKr =
  563. {
  564. fFalse, // fMatching
  565. 0, // nCoceCurr
  566. 0, // nCoceIndex
  567. sizeof(_rgcoceEucKr) / sizeof(_rgcoceEucKr[0]), // ccoce
  568. _rgcoceEucKr, // rgcoce
  569. };
  570. // EUC-Jp
  571. //
  // Two-character EUC-JP sequences tracked for auto-detection. The
  // trailing comments romanize the same pairs that appear in the
  // Shift-JIS table (_rgcoceSJis) later in this file.
  572. static COCE _rgcoceEucJp[] =
  573. {
  574. {0, 2, {(WCHAR)0xa4c7, (WCHAR)0xa4b9},}, // "de su"
  575. {0, 2, {(WCHAR)0xa4c0, (WCHAR)0xa1a3},}, // "da ."
  576. {0, 2, {(WCHAR)0xa4a4, (WCHAR)0xa4eb},}, // "i ru"
  577. {0, 2, {(WCHAR)0xa4de, (WCHAR)0xa4b9},}, // "ma su"
  578. {0, 2, {(WCHAR)0xa4b7, (WCHAR)0xa4bf},}, // "shi ta"
  579. {0, 2, {(WCHAR)0xa4b9, (WCHAR)0xa4eb},}, // "su ru"
  580. {0, 2, {(WCHAR)0xa4bf, (WCHAR)0xa1a3},}, // "ta ."
  581. {0, 2, {(WCHAR)0xa4eb, (WCHAR)0xa1a3},}, // "ru ."
  582. };
  // ccoce is derived with sizeof so it always matches the table length.
  583. static COC _cocEucJp =
  584. {
  585. fFalse, // fMatching
  586. 0, // nCoceCurr
  587. 0, // nCoceIndex
  588. sizeof(_rgcoceEucJp) / sizeof(_rgcoceEucJp[0]), // ccoce
  589. _rgcoceEucJp, // rgcoce
  590. };
  591. // GBK
  592. //
  // Two-character GBK sequences tracked for auto-detection. The values
  // are identical to the EUC-CN table (_rgcoceEucCn) earlier in this
  // file, but held in a separate array with its own hit counters.
  593. static COCE _rgcoceGbk[] =
  594. {
  595. {0, 2, {(WCHAR)0xcbfb, (WCHAR)0xc3c7},}, // "ta men"
  596. {0, 2, {(WCHAR)0xced2, (WCHAR)0xc3c7},}, // "wo men"
  597. {0, 2, {(WCHAR)0xd2f2, (WCHAR)0xb4cb},}, // "yin ci"
  598. {0, 2, {(WCHAR)0xcab2, (WCHAR)0xc3b4},}, // "shen mo"
  599. {0, 2, {(WCHAR)0xc8e7, (WCHAR)0xb9fb},}, // "ru guo"
  600. {0, 2, {(WCHAR)0xd2f2, (WCHAR)0xceaa},}, // "yin wei"
  601. {0, 2, {(WCHAR)0xcbf9, (WCHAR)0xd2d4},}, // "suo yi"
  602. {0, 2, {(WCHAR)0xbbb6, (WCHAR)0xd3ad},}, // "huan ying"
  603. {0, 2, {(WCHAR)0xcdf8, (WCHAR)0xc2e7},}, // "wang luo"
  604. {0, 2, {(WCHAR)0xd0c5, (WCHAR)0xcfa2},}, // "xin xi"
  605. {0, 2, {(WCHAR)0xbcc6, (WCHAR)0xcbe3},}, // "ji guan"
  606. };
  // ccoce is derived with sizeof so it always matches the table length.
  607. static COC _cocGbk =
  608. {
  609. fFalse, // fMatching
  610. 0, // nCoceCurr
  611. 0, // nCoceIndex
  612. sizeof(_rgcoceGbk) / sizeof(_rgcoceGbk[0]), // ccoce
  613. _rgcoceGbk, // rgcoce
  614. };
  615. // Shift-JIS
  616. //
  // Two-character Shift-JIS sequences tracked for auto-detection. The
  // trailing comments romanize the same pairs that appear in the EUC-JP
  // table (_rgcoceEucJp) earlier in this file.
  617. static COCE _rgcoceSJis[] =
  618. {
  619. {0, 2, {(WCHAR)0x82c5, (WCHAR)0x82b7},}, // "de su"
  620. {0, 2, {(WCHAR)0x82be, (WCHAR)0x8142},}, // "da ."
  621. {0, 2, {(WCHAR)0x82a2, (WCHAR)0x82e9},}, // "i ru"
  622. {0, 2, {(WCHAR)0x82dc, (WCHAR)0x82b7},}, // "ma su"
  623. {0, 2, {(WCHAR)0x82b5, (WCHAR)0x82bd},}, // "shi ta"
  624. {0, 2, {(WCHAR)0x82b7, (WCHAR)0x82e9},}, // "su ru"
  625. {0, 2, {(WCHAR)0x82bd, (WCHAR)0x8142},}, // "ta ."
  626. {0, 2, {(WCHAR)0x82e9, (WCHAR)0x8142},}, // "ru ."
  627. };
  // ccoce is derived with sizeof so it always matches the table length.
  628. static COC _cocSJis =
  629. {
  630. fFalse, // fMatching
  631. 0, // nCoceCurr
  632. 0, // nCoceIndex
  633. sizeof(_rgcoceSJis) / sizeof(_rgcoceSJis[0]), // ccoce
  634. _rgcoceSJis, // rgcoce
  635. };
  636. // Wansung
  637. //
  638. // REVIEW: bug (1/2 this table is being ignored)
  639. //
  640. static COCE _rgcoceWan[] =
  641. {
  642. {0, 2, {(WCHAR)0xb0a1, (WCHAR)0x0020},},
  643. {0, 2, {(WCHAR)0xb0a1, (WCHAR)0xa1a1},},
  644. {0, 2, {(WCHAR)0xb4c2, (WCHAR)0x0020},},
  645. {0, 2, {(WCHAR)0xb4c2, (WCHAR)0xa1a1},},
  646. {0, 2, {(WCHAR)0xb4d9, (WCHAR)0x002e},},
  647. {0, 2, {(WCHAR)0xb4d9, (WCHAR)0xa3ae},},
  648. {0, 2, {(WCHAR)0xb8a6, (WCHAR)0x0020},},
  649. {0, 2, {(WCHAR)0xb8a6, (WCHAR)0xa1a1},},
  650. {0, 2, {(WCHAR)0xc0ba, (WCHAR)0x0020},},
  651. {0, 2, {(WCHAR)0xc0ba, (WCHAR)0xa1a1},},
  652. {0, 2, {(WCHAR)0xc0bb, (WCHAR)0x0020},},
  653. {0, 2, {(WCHAR)0xc0bb, (WCHAR)0xa1a1},},
  654. {0, 2, {(WCHAR)0xc0cc, (WCHAR)0x0020},},
  655. {0, 2, {(WCHAR)0xc0cc, (WCHAR)0xa1a1},},
  656. };
  657. static COC _cocWan =
  658. {
  659. fFalse, // fMatching
  660. 0, // nCoceCurr
  661. 0, // nCoceIndex
  662. sizeof(_rgcoceWan) / sizeof(_rgcoceWan[0]), // ccoce
  663. _rgcoceWan, // rgcoce
  664. };
  665. // Hz
  666. //
  667. static COCE _rgcoceHz[] =
  668. {
  669. {0, 2, {(WCHAR)0x007e, (WCHAR)0x007b},}, // ~{
  670. {0, 2, {(WCHAR)0x007e, (WCHAR)0x007d},}, // ~}
  671. };
  672. static COC _cocHz =
  673. {
  674. fFalse, // fMatching
  675. 0, // nCoceCurr
  676. 0, // nCoceIndex
  677. sizeof(_rgcoceHz) / sizeof(_rgcoceHz[0]), // ccoce
  678. _rgcoceHz, // rgcoce
  679. };
  680. // Utf7
  681. //
  682. static COCE _rgcoceUtf7[] =
  683. {
  684. {0, 2, {(WCHAR)0x002b, (WCHAR)0x002d},}, // +-
  685. };
  686. static COC _cocUtf7 =
  687. {
  688. fFalse, // fMatching
  689. 0, // nCoceCurr
  690. 0, // nCoceIndex
  691. sizeof(_rgcoceUtf7) / sizeof(_rgcoceUtf7[0]), // ccoce
  692. _rgcoceUtf7, // rgcoce
  693. };
  694. // Character counter prototype.
  695. //
  696. static void _CountChars(ICET icetIn);
  697. /*----------------------------------------------------------------------------
  698. Main Definitions
  699. ----------------------------------------------------------------------------*/
  700. // Structure to keep state, state machine and other associated
  701. // information for a given character set "parse stream."
  702. //
  703. typedef struct _vr
  704. {
  705. BOOL fInUse;
  706. DWORD dwFlags;
  707. int nState;
  708. CC *ccCheck;
  709. signed char (*rgchNextState)[nTokens];
  710. } VR;
  711. // Array of validation records. We allow multiple, active parse
  712. // streams for auto-detect -- this way, it can concurrently keep
  713. // a parse stream for each encoding type, without needing to read
  714. // its input multiple times.
  715. //
  716. static VR _mpicetvr[icetCount] =
  717. {
  718. {fTrue, 0, ST0, 0, _rgchEucKrCnNextState,}, // icetEucCn
  719. {fTrue, 0, ST0, &_ccEucJp, _rgchEucJpNextState,}, // icetEucJp
  720. {fTrue, 0, ST0, 0, _rgchEucKrCnNextState,}, // icetEucKr
  721. {fTrue, 0, ST0, 0, _rgchEucTwNextState,}, // icetEucTw
  722. {fFalse, 0, ST0, 0, 0,}, // icetIso2022Cn
  723. {fFalse, 0, ST0, 0, 0,}, // icetIso2022Jp
  724. {fFalse, 0, ST0, 0, 0,}, // icetIso2022Kr
  725. {fFalse, 0, ST0, 0, 0,}, // icetIso2022Tw
  726. {fTrue, 0, ST0, &_ccBig5, _rgchBig5NextState,}, // icetBig5
  727. {fTrue, 0, ST0, &_ccGbk, _rgchGbkWanNextState,}, // icetGbk
  728. {fTrue, 0, ST0, &_ccHz, _rgchHzNextState,}, // icetHz
  729. {fTrue, 0, ST0, &_ccSJis, _rgchSJisNextState,}, // icetShiftJis
  730. {fTrue, 0, ST0, &_ccWan, _rgchGbkWanNextState,}, // icetWansung
  731. {fTrue, 0, ST0, &_ccUtf7, _rgchUtf7NextState,}, // icetUtf7
  732. {fTrue, 0, ST0, 0, 0,}, // icetUtf8
  733. };
  734. // Array of character sequence counters, one per encoding type.
  735. //
  736. static COC *_mpicetlpcoc[icetCount] =
  737. {
  738. &_cocEucCn, // icetEucCn
  739. &_cocEucJp, // icetEucJp
  740. &_cocEucKr, // icetEucKr
  741. 0, // icetEucTw
  742. 0, // icetIso2022Cn
  743. 0, // icetIso2022Jp
  744. 0, // icetIso2022Kr
  745. 0, // icetIso2022Tw
  746. &_cocBig5, // icetBig5
  747. &_cocGbk, // icetGbk
  748. &_cocHz, // icetHz
  749. &_cocSJis, // icetShiftJis
  750. &_cocWan, // icetWansung
  751. &_cocUtf7, // icetUtf7
  752. 0, // icetUtf8
  753. };
  754. /* V A L I D A T E I N I T */
  755. /*----------------------------------------------------------------------------
  756. %%Function: ValidateInit
  757. %%Contact: jpick
  758. Initialize the state machine for the given character set (set its
  759. state to ST0 (the start state) and store its parsing options).
  760. ----------------------------------------------------------------------------*/
  761. void ValidateInit(ICET icetIn, DWORD dwFlags)
  762. {
  763. // Initialize the character occurrence counter, if caller wants
  764. // us to count common character sequences (auto-detect, only,
  765. // for now). Turn off the count-common-chars flag if we're not
  766. // set up to count sequences (meaning we don't have a set of
  767. // common characters for this encoding type or have no place
  768. // to buffer them).
  769. //
  770. if (dwFlags & grfCountCommonChars)
  771. {
  772. if ((_mpicetlpcoc[icetIn]) && (_mpicetvr[icetIn].ccCheck))
  773. {
  774. int i;
  775. for (i = 0; i < _mpicetlpcoc[icetIn]->ccoce; i++)
  776. _mpicetlpcoc[icetIn]->rgcoce[i].cHits = 0;
  777. _mpicetlpcoc[icetIn]->fMatching = fFalse;
  778. }
  779. else
  780. {
  781. dwFlags &= ~grfCountCommonChars;
  782. }
  783. }
  784. // If validation not supported for the encoding type, there's
  785. // nothing else for us to do here.
  786. //
  787. if (!_mpicetvr[icetIn].fInUse)
  788. return;
  789. _mpicetvr[icetIn].nState = ST0;
  790. // Can't do character mapping validation without character
  791. // checker information. (If we do have the character checker,
  792. // initialize its buffer length to 0).
  793. //
  794. if (_mpicetvr[icetIn].ccCheck)
  795. _mpicetvr[icetIn].ccCheck->cchBuff = 0;
  796. else
  797. dwFlags &= ~grfValidateCharMapping;
  798. // It's also impossible without a valid code page.
  799. //
  800. if ((dwFlags & grfValidateCharMapping) && !IsValidCodePage(_mpicetvr[icetIn].ccCheck->nCp))
  801. dwFlags &= ~grfValidateCharMapping;
  802. _mpicetvr[icetIn].dwFlags = dwFlags;
  803. if (icetIn == icetUtf8)
  804. _nUtf8Tb = 0;
  805. }
  806. /* V A L I D A T E I N I T A L L */
  807. /*----------------------------------------------------------------------------
  808. %%Function: ValidateInitAll
  809. %%Contact: jpick
  810. Initialize the state machines for all character sets (set their
  811. states to ST0 (the start state) and store their parsing options).
  812. ----------------------------------------------------------------------------*/
  813. void ValidateInitAll(DWORD dwFlags)
  814. {
  815. int i;
  816. for (i = 0 ; i < icetCount; i++)
  817. {
  818. if (!_mpicetvr[i].fInUse)
  819. continue;
  820. ValidateInit((ICET)i, dwFlags);
  821. }
  822. }
  823. /* V A L I D A T E R E S E T */
  824. /*----------------------------------------------------------------------------
  825. %%Function: ValidateReset
  826. %%Contact: jpick
  827. Reset the state machine for the given character set (set its state
  828. to ST0 (the start state)).
  829. ----------------------------------------------------------------------------*/
  830. void ValidateReset(ICET icetIn)
  831. {
  832. // Initialize the character occurrence counter, if caller wants
  833. // us to count common character sequences (auto-detect, only,
  834. // for now). We're guaranteed to have the structures if the
  835. // flag is set by ValidateInit(), above.
  836. //
  837. if (_mpicetvr[icetIn].dwFlags & grfCountCommonChars)
  838. {
  839. int i;
  840. for (i = 0; i < _mpicetlpcoc[icetIn]->ccoce; i++)
  841. _mpicetlpcoc[icetIn]->rgcoce[i].cHits = 0;
  842. _mpicetlpcoc[icetIn]->fMatching = fFalse;
  843. }
  844. // If validation not supported for the encoding type, there's
  845. // nothing else for us to do here.
  846. //
  847. if (!_mpicetvr[icetIn].fInUse)
  848. return;
  849. _mpicetvr[icetIn].nState = ST0;
  850. if (_mpicetvr[icetIn].ccCheck)
  851. _mpicetvr[icetIn].ccCheck->cchBuff = 0;
  852. if (icetIn == icetUtf8)
  853. _nUtf8Tb = 0;
  854. }
  855. /* V A L I D A T E R E S E T A L L */
  856. /*----------------------------------------------------------------------------
  857. %%Function: ValidateResetAll
  858. %%Contact: jpick
  859. Reset the state machines for all character sets (set their states to
  860. ST0 (the start state)).
  861. ----------------------------------------------------------------------------*/
  862. void ValidateResetAll(void)
  863. {
  864. int i;
  865. for (i=0 ; i < icetCount; i++)
  866. {
  867. if (!_mpicetvr[i].fInUse)
  868. continue;
  869. ValidateReset((ICET)i);
  870. }
  871. }
  872. /* N V A L I D A T E U C H */
  873. /*----------------------------------------------------------------------------
  874. %%Function: NValidateUch
  875. %%Contact: jpick
  876. Single step parser, takes one transition through the state table
  877. for the given character set. Current state is kept for each
  878. character set's parse stream.
  879. Routine returns -1 if it does not reach a final state on this
  880. transition; 0 if transitioned to ERR(or) and 1 if transitioned
  881. to ACC(ept).
  882. If final state is ACC(ept), machine reset to ST0 (start state).
  883. (i.e., there's no need to manually reset on ACC(ept)).
  884. Routine is also a convenient collection point for certain
  885. statistics (currently only the counting of occurrences of common
  886. character sequences (defined for character sets, above)).
  887. ----------------------------------------------------------------------------*/
  888. int NValidateUch(ICET icetIn, UCHAR uch, BOOL fEoi)
  889. {
  890. int nToken;
  891. int nPrevState;
  892. int rc = -1;
  893. // If not validating this icet, nothing to do (so say
  894. // we accept the character).
  895. //
  896. if (!_mpicetvr[icetIn].fInUse)
  897. return 1;
  898. if (_mpicetvr[icetIn].nState == ERR)
  899. return 0;
  900. // Ignore all zeros in the detection file.
  901. if (!uch && !fEoi)
  902. {
  903. goto _LRet;
  904. }
  905. // Hack -- want to validate UTF-8, but don't need a state
  906. // table to do so. Treat as special case here and return.
  907. //
  908. if (icetIn == icetUtf8)
  909. {
  910. if ((rc = NUtf8(uch, fEoi)) == 0)
  911. _mpicetvr[icetIn].nState = ERR;
  912. return rc;
  913. }
  914. // Classify the character...
  915. //
  916. nPrevState = _mpicetvr[icetIn].nState;
  917. nToken = fEoi ? ateof : _rgchCharClass[uch];
  918. // First obtain a real number for a state based on the counting state...
  919. // Then do the transition...
  920. //
  921. _mpicetvr[icetIn].nState = (_mpicetvr[icetIn].rgchNextState)[TstNotCountingFromTst(_mpicetvr[icetIn].nState)][nToken];
  922. #if 0
  923. if (_mpicetvr[icetIn].nState == ERR)
  924. printf("Character 0x%.2x; Going from state %.2x to state %.2x\n", uch, nPrevState, _mpicetvr[icetIn].nState);
  925. #endif
  926. // If we're in an error state or have seen end-of-input, return.
  927. //
  928. if ((_mpicetvr[icetIn].nState == ERR) || (nToken == ateof))
  929. goto _LRet;
  930. // Are we to do character mapping validation? (If this flag
  931. // is set, we're guaranteed to have a character checker
  932. // structure). How about character occurrence counting?
  933. // (This also guarantees us a character checker structure).
  934. //
  935. if (!(_mpicetvr[icetIn].dwFlags & grfValidateCharMapping) &&
  936. !(_mpicetvr[icetIn].dwFlags & grfCountCommonChars))
  937. {
  938. goto _LRet;
  939. }
  940. // Buffer the current character (trusting that we'll never get
  941. // more than the max amount -- present tables enforce this)
  942. // (if it's Utf7 or Hz, buffer only if we are in the counting state
  943. //
  944. if (FTstCounting(_mpicetvr[icetIn].nState) || (icetIn != icetHz && icetIn != icetUtf7))
  945. _mpicetvr[icetIn].ccCheck->rgchBuff[_mpicetvr[icetIn].ccCheck->cchBuff++] = uch;
  946. // Return if we are not in the counting state
  947. //
  948. if (!(FTstCounting(_mpicetvr[icetIn].nState)))
  949. goto _LRet;
  950. // Call the character checker, if we have one.
  951. //
  952. if (_mpicetvr[icetIn].dwFlags & grfValidateCharMapping)
  953. {
  954. if (_mpicetvr[icetIn].ccCheck->pfnCheckChar && !(_mpicetvr[icetIn].ccCheck->pfnCheckChar)(icetIn))
  955. {
  956. _mpicetvr[icetIn].nState = ERR;
  957. goto _LRet;
  958. }
  959. }
  960. // If we're counting common characters, do so now.
  961. //
  962. if (_mpicetvr[icetIn].dwFlags & grfCountCommonChars)
  963. _CountChars(icetIn);
  964. // Reset the character checker/counter buffer.
  965. //
  966. _mpicetvr[icetIn].ccCheck->cchBuff = 0;
  967. _LRet:
  968. // Return the appropriate code.
  969. //
  970. switch (_mpicetvr[icetIn].nState)
  971. {
  972. case ERR:
  973. return 0;
  974. case ACC:
  975. _mpicetvr[icetIn].nState = ST0; // Reset
  976. return 1;
  977. default:
  978. return -1; // need more data
  979. }
  980. }
  981. /* F V A L I D A T E C H A R C O U N T */
  982. /*----------------------------------------------------------------------------
  983. %%Function: FValidateCharCount
  984. %%Contact: jpick
  985. Return the number of matched special character sequences for the
  986. given character set. If we're not keeping track of these sequences
  987. for the character set, either because we don't have the necessary
  988. static data or because the flag wasn't set by the calling routine,
  989. return fFalse. Otherwise, return the count in *lpcMatch and return
  990. fTrue;
  991. (We track the counts separately for each sequence, just in case
  992. we want to weight them differently in the future. Return the
  993. total, here).
  994. ----------------------------------------------------------------------------*/
  995. BOOL FValidateCharCount(ICET icetIn, int *lpcMatch)
  996. {
  997. int i;
  998. COC *lpcoc = _mpicetlpcoc[icetIn];
  999. VR *lpvr = &_mpicetvr[icetIn];
  1000. if (!lpcoc || !lpvr->fInUse || !(lpvr->dwFlags & grfCountCommonChars))
  1001. return fFalse;
  1002. for (i = 0, *lpcMatch = 0; i < lpcoc->ccoce; i++)
  1003. *lpcMatch += lpcoc->rgcoce[i].cHits;
  1004. return fTrue;
  1005. }
  1006. /* _ C O U N T C H A R S */
  1007. /*----------------------------------------------------------------------------
  1008. %%Function: _CountChars
  1009. %%Contact: jpick
  1010. We've just completed a legal character for the given character
  1011. set. Match it against the set of special character sequences for
  1012. the character set, if we have them. Update match counts and
  1013. current match indices (since sequences can span multiple legal
  1014. characters) as needed.
  1015. ----------------------------------------------------------------------------*/
  1016. static void _CountChars(ICET icetIn)
  1017. {
  1018. WCHAR wch;
  1019. int i;
  1020. BOOL fFound;
  1021. // Anything to do?
  1022. //
  1023. if (!_mpicetlpcoc[icetIn] || !_mpicetvr[icetIn].ccCheck)
  1024. return;
  1025. // Build the WCHAR.
  1026. //
  1027. switch (_mpicetvr[icetIn].ccCheck->cchBuff)
  1028. {
  1029. case 1:
  1030. wch = WchFromUchUch(0, _mpicetvr[icetIn].ccCheck->rgchBuff[0]);
  1031. break;
  1032. case 2:
  1033. wch = WchFromUchUch(_mpicetvr[icetIn].ccCheck->rgchBuff[0],
  1034. _mpicetvr[icetIn].ccCheck->rgchBuff[1]);
  1035. break;
  1036. case 3:
  1037. wch = WchFromUchUch(_mpicetvr[icetIn].ccCheck->rgchBuff[1],
  1038. _mpicetvr[icetIn].ccCheck->rgchBuff[2]);
  1039. break;
  1040. case 4:
  1041. wch = WchFromUchUch(_mpicetvr[icetIn].ccCheck->rgchBuff[2],
  1042. _mpicetvr[icetIn].ccCheck->rgchBuff[3]);
  1043. break;
  1044. default:
  1045. return;
  1046. }
  1047. // Are we currently working on matching a sequence?
  1048. //
  1049. if ((_mpicetlpcoc[icetIn]->fMatching) &&
  1050. (wch == _mpicetlpcoc[icetIn]->rgcoce[_mpicetlpcoc[icetIn]->nCoceCurr].rgwch[_mpicetlpcoc[icetIn]->nCoceIndex]))
  1051. {
  1052. // Did we just match the entire sequence? If so, increment the
  1053. // hit count and reset.
  1054. //
  1055. if (++_mpicetlpcoc[icetIn]->nCoceIndex >= _mpicetlpcoc[icetIn]->rgcoce[_mpicetlpcoc[icetIn]->nCoceCurr].cwch)
  1056. {
  1057. ++_mpicetlpcoc[icetIn]->rgcoce[_mpicetlpcoc[icetIn]->nCoceCurr].cHits;
  1058. _mpicetlpcoc[icetIn]->fMatching = fFalse;
  1059. }
  1060. // All done.
  1061. //
  1062. return;
  1063. }
  1064. // If we need to start matching again (either because we're not
  1065. // currently in a sequence or because a 2nd or later character
  1066. // didn't match), try the current character as a lead character.
  1067. //
  1068. // REVIEW: wrong for sequences longer than 2 wchars.
  1069. //
  1070. for (i = 0, fFound = fFalse; (!fFound && (i < _mpicetlpcoc[icetIn]->ccoce)); i++)
  1071. {
  1072. if (wch == _mpicetlpcoc[icetIn]->rgcoce[i].rgwch[0])
  1073. fFound = fTrue;
  1074. }
  1075. // Any luck?
  1076. //
  1077. if (!fFound)
  1078. {
  1079. _mpicetlpcoc[icetIn]->fMatching = fFalse;
  1080. return;
  1081. }
  1082. // Store the matching state.
  1083. //
  1084. _mpicetlpcoc[icetIn]->fMatching = fTrue;
  1085. _mpicetlpcoc[icetIn]->nCoceCurr = i - 1;
  1086. _mpicetlpcoc[icetIn]->nCoceIndex = 1; // where to look next
  1087. }
  1088. /* _ F D B C S C H E C K C H A R */
  1089. /*----------------------------------------------------------------------------
  1090. %%Function: _FDbcsCheckChar
  1091. %%Contact: jpick
  1092. Character validator for DBCS formats. Attempts to round-trip a
  1093. legal multi-byte sequence to ensure that it's valid for the given
  1094. character set.
  1095. REVIEW: Slow, slow, slow -- do we really gain anything from the
  1096. round-trip check, or is conversion *to* Unicode a sufficient test?
  1097. ----------------------------------------------------------------------------*/
  1098. static WCHAR _rgwBuff[10];
  1099. static UCHAR _rgchBuff[30];
  1100. static BOOL _FDbcsCheckChar(ICET icetIn)
  1101. {
  1102. int cCvt;
  1103. // skip 1 byte characters, mostly uninteresting (Shift-Jis ??).
  1104. //
  1105. if (_mpicetvr[icetIn].ccCheck->cchBuff == 1)
  1106. return fTrue;
  1107. if (!(cCvt = MultiByteToWideChar(_mpicetvr[icetIn].ccCheck->nCp,
  1108. MB_ERR_INVALID_CHARS,
  1109. _mpicetvr[icetIn].ccCheck->rgchBuff,
  1110. _mpicetvr[icetIn].ccCheck->cchBuff,
  1111. _rgwBuff, 10)))
  1112. {
  1113. if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
  1114. return fFalse;
  1115. }
  1116. return fTrue; // probably not always right
  1117. }