Source code of Windows XP (NT5)
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

485 lines
16 KiB

  1. /*----------------------------------------------------------------------------
  2. %%File: jislex.c
  3. %%Unit: fechmap
  4. %%Contact: jpick
  5. Simple converter for decoding a subset of possible ISO-2022-7 encoded
  6. files (ISO-2022). Data is translated to and from Unicode. Converter
  7. operates according to user options.
  8. Module currently handles ISO-2022-JP (and JIS) and ISO-2022-KR.
  9. Converter is set up to handle ISO-2022-TW and ISO-2022-CN, but there
  10. are as yet no conversion tables for these.
  11. ----------------------------------------------------------------------------*/
  12. #include <stdio.h>
  13. #include <stddef.h>
  14. #include "private.h"
  15. #include "fechmap_.h"
  16. #include "lexint_.h"
  17. // State table for reading ISO-2022-7 encoded text
  18. //
  19. // Lexer recognizes the following designator sequences, used
  20. // to select a one or two byte character set:
  21. //
  22. // <esc> $ @ -- JIS C 6226-1978 (synonym of <esc> $ ( @)
  23. // <esc> $ A -- GB 2312-80 (synonym of <esc> $ ( A)
  24. // <esc> $ B -- JIS X 0208-1983 (synonym of <esc> $ ( B)
  25. //
  26. // <esc> $ ( @ -- JIS C 6226-1978
  27. // <esc> $ ( A -- GB 2312-80
  28. // <esc> $ ( B -- JIS X 0208-1983
  29. // <esc> $ ( C -- KS C 5601-1992
  30. // <esc> $ ( D -- JIS X 0212-1990
  31. // <esc> $ ( E -- ??? (ISO-IR-165:1992) ???
  32. // <esc> $ ( G -- CNS 11643-1992 Plane 1
  33. // <esc> $ ( H -- CNS 11643-1992 Plane 2
  34. // <esc> $ ( I -- CNS 11643-1992 Plane 3
  35. // <esc> $ ( J -- CNS 11643-1992 Plane 4
  36. // <esc> $ ( K -- CNS 11643-1992 Plane 5
  37. // <esc> $ ( L -- CNS 11643-1992 Plane 6
  38. // <esc> $ ( M -- CNS 11643-1992 Plane 7
  39. //
  40. // <esc> $ ) C -- KSC 5601-1987 (Implies ISO-2022-KR ??)
  41. //
  42. // <esc> & @ <esc> $ B -- JIS X 0208-1990
  43. //
  44. // <esc> ( B -- Ascii
  45. // <esc> ( H -- Deprecated variant of JIS-Roman
  46. // <esc> ( I -- Half-Width Katakana
  47. // <esc> ( J -- JIS-Roman
  48. // <esc> ( T -- GB 1988-89 Roman
  49. //
  50. // Lexer recognizes the following shift sequences, used to allow
  51. // interpretation of a given byte or bytes:
  52. //
  53. // <si> -- locking shift, interpret bytes as G0
  54. // <so> -- locking shift, interpret bytes as G1
  55. // <esc> n -- locking shift, interpret bytes as G2
  56. // <esc> o -- locking shift, interpret bytes as G3
  57. // <esc> N -- single shift, interpret bytes as G2
  58. // <esc> O -- single shift, interpret bytes as G3
  59. //
  60. // REVIEW (jpick): don't currently need the final four shift
  61. // sequences. If we support ISO-2022-CN, we'll need to use
  62. // G2 and G3 and potentially, then, the last four shifts.
  63. //
  64. /*----------------------------------------------------------------------------
  65. Character Classification Table
  66. ----------------------------------------------------------------------------*/
  67. // Tokens -- the character classes stored in _rgjtkCharClass. Each value is
  68. // also the row index of that class in _rgchNextState.
  69. #define txt (JTK) 0 // ordinary text byte
  70. #define ext (JTK) 1 // extended characters that are legal under certain circumstances
  71. #define esc (JTK) 2 // <esc> (0x1b)
  72. #define si (JTK) 3 // <si> (0x0f)
  73. #define so (JTK) 4 // <so> (0x0e)
  74. #define dlr (JTK) 5 // '$'
  75. #define at (JTK) 6 // '@'
  76. #define amp (JTK) 7 // '&'
  77. #define opr (JTK) 8 // '(' (open paren)
  78. #define cpr (JTK) 9 // ')' (close paren)
  79. #define tkA (JTK) 10 // 'A'
  80. #define tkB (JTK) 11 // 'B'
  81. #define tkC (JTK) 12 // 'C'
  82. #define tkD (JTK) 13 // 'D'
  83. #define tkE (JTK) 14 // 'E'
  84. #define tkG (JTK) 15 // 'G'
  85. #define tkH (JTK) 16 // 'H'
  86. #define tkI (JTK) 17 // 'I'
  87. #define tkJ (JTK) 18 // 'J'
  88. #define tkK (JTK) 19 // 'K'
  89. #define tkL (JTK) 20 // 'L'
  90. #define tkM (JTK) 21 // 'M'
  91. #define tkT (JTK) 22 // 'T'
  92. #define unk (JTK) 23 // Unexpected character
  93. #define eof (JTK) 24 // end-of-file
  94. #define err (JTK) 25 // read error
  95. #define nTokens 26 // number of token classes (rows of _rgchNextState)
  96. // Lookup table for ISO-2022-7 encoded files: maps every byte value
  97. // (0x00-0xff) to its token class. Each row covers 16 byte values; the
  98. static JTK _rgjtkCharClass[256] =
  99. // 0 1 2 3 4 5 6 7 8 9 a b c d e f
  100. {
  101. // nul soh stx etx eot enq ack bel bs tab lf vt np cr so si 0
  102. txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, so, si,
  103. // dle dc1 dc2 dc3 dc4 nak syn etb can em eof esc fs gs rs us 1
  104. txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, esc, txt, txt, txt, txt,
  105. // sp ! " # $ % & ' ( ) * + , - . / 2
  106. txt, txt, txt, txt, dlr, txt, amp, txt, opr, cpr, txt, txt, txt, txt, txt, txt,
  107. // 0 1 2 3 4 5 6 7 8 9 : ; < = > ? 3
  108. txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt,
  109. // @ A B C D E F G H I J K L M N O 4
  110. at, tkA, tkB, tkC, tkD, tkE, txt, tkG, tkH, tkI, tkJ, tkK, tkL, tkM, txt, txt,
  111. // P Q R S T U V W X Y Z [ \ ] ^ _ 5
  112. txt, txt, txt, txt, tkT, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt,
  113. // ` a b c d e f g h i j k l m n o 6
  114. txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt,
  115. // p q r s t u v w x y z { | } ~ del 7
  116. txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt,
  117. // 8 -- 0x80-0x8f: unexpected
  118. unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk,
  119. // 9 -- 0x90-0x9f: unexpected
  120. unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk,
  121. // a -- 0xa0 is unexpected; 0xa1-0xaf are "extended"
  122. unk, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext,
  123. // b -- 0xb0-0xbf: extended
  124. ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext,
  125. // c -- 0xc0-0xcf: extended
  126. ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext,
  127. // d -- 0xd0-0xdf: extended
  128. ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext,
  129. // e -- 0xe0-0xef: unexpected
  130. unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk,
  131. // f -- 0xf0-0xff: unexpected
  132. unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk,
  133. // 0 1 2 3 4 5 6 7 8 9 a b c d e f
  134. };
  135. /*----------------------------------------------------------------------------
  136. State Table
  137. ----------------------------------------------------------------------------*/
  138. // Final states have the high-bit set. States that represent the reading
  139. // of a valid character escape sequence also encode the character set
  140. // "name" (moniker??) -- the state with the high bit masked off.
  141. //
  142. // Table State: one byte, high bit = "final" flag, low 7 bits = state id
  143. //
  144. typedef unsigned char TST;
  145. // Final State Mask, Related
  146. // (_NEscTypeFromState strips the final flag and yields the low 7 bits as an int)
  147. #define grfFinal (TST) 0x80
  148. #define _NEscTypeFromState(nState) (int) ((nState) & 0x7f)
  149. // ASCII Escape Sequence (Final State)
  150. #define ASC (TST) (grfFinal | 0x00) // Ascii
  151. // Japanese Escape Sequences (Final States)
  152. #define JS0 (TST) (grfFinal | 0x01) // JIS-Roman
  153. #define JS1 (TST) (grfFinal | 0x02) // Half-Width Katakana
  154. #define JS2 (TST) (grfFinal | 0x03) // JIS C 6226-1978
  155. #define JS3 (TST) (grfFinal | 0x04) // JIS X 0208-1983
  156. #define JS4 (TST) (grfFinal | 0x05) // JIS X 0208-1990
  157. #define JS5 (TST) (grfFinal | 0x06) // JIS X 0212-1990
  158. // Chinese (PRC) Escape Sequences (Final States)
  159. #define CS0 (TST) (grfFinal | 0x07) // GB 1988-89 Roman
  160. #define CS1 (TST) (grfFinal | 0x08) // GB 2312-80
  161. // Chinese (Taiwan) Escape Sequences (Final States)
  162. #define TS0 (TST) (grfFinal | 0x09) // CNS 11643-1992 Plane 1
  163. #define TS1 (TST) (grfFinal | 0x0a) // CNS 11643-1992 Plane 2
  164. #define TS2 (TST) (grfFinal | 0x0b) // CNS 11643-1992 Plane 3
  165. #define TS3 (TST) (grfFinal | 0x0c) // CNS 11643-1992 Plane 4
  166. #define TS4 (TST) (grfFinal | 0x0d) // CNS 11643-1992 Plane 5
  167. #define TS5 (TST) (grfFinal | 0x0e) // CNS 11643-1992 Plane 6
  168. #define TS6 (TST) (grfFinal | 0x0f) // CNS 11643-1992 Plane 7
  169. // Korean Escape Sequences (Final State)
  170. #define KS0 (TST) (grfFinal | 0x10) // KS C 5601-1992
  171. // Document "Signal" for ISO-2022-KR (Doc needs special processing)
  172. #define KSD (TST) (grfFinal | 0x11) // ISO-2022-KR Document Signal
  173. // Number of unique *character set* escape sequences
  174. // (the low-bit values 0x00 (ASC) through 0x11 (KSD) above)
  175. #define cCsEsc 18
  176. // Special States (not escape sequence) (Final States)
  177. // Numbered from cCsEsc + 1, so the low-bit value cCsEsc itself is unused.
  178. #define TXT (TST) (grfFinal | (cCsEsc + 1)) // Process Text
  179. #define EXT (TST) (grfFinal | (cCsEsc + 2)) // Process (Possibly Illegal) Extended Chars
  180. #define FIN (TST) (grfFinal | (cCsEsc + 3)) // Finish
  181. #define EOI (TST) (grfFinal | (cCsEsc + 4)) // Unexpected End-Of-Input
  182. #define UNK (TST) (grfFinal | (cCsEsc + 5)) // Unknown State (Unexpected Character)
  183. #define ERR (TST) (grfFinal | (cCsEsc + 6)) // Read Error
  184. // Shift Sequences (do not specify character set) (Final States)
  185. //
  186. #define LSO (TST) (grfFinal | (cCsEsc + 7)) // Locking shift out (g1 into GL)
  187. #define LSI (TST) (grfFinal | (cCsEsc + 8)) // Locking shift in (g0 into GL)
  188. // For convenience, also define constants for the sets
  189. // that the states represent (the state value with the final flag removed).
  190. //
  191. #define csNIL (-1) // Invalid Designator
  192. #define csASC (_NEscTypeFromState(ASC)) // Ascii
  193. #define csJS0 (_NEscTypeFromState(JS0)) // JIS-Roman
  194. #define csJS1 (_NEscTypeFromState(JS1)) // Half-Width Katakana
  195. #define csJS2 (_NEscTypeFromState(JS2)) // JIS C 6226-1978
  196. #define csJS3 (_NEscTypeFromState(JS3)) // JIS X 0208-1983
  197. #define csJS4 (_NEscTypeFromState(JS4)) // JIS X 0208-1990
  198. #define csJS5 (_NEscTypeFromState(JS5)) // JIS X 0212-1990
  199. #define csCS0 (_NEscTypeFromState(CS0)) // GB 1988-89 Roman
  200. #define csCS1 (_NEscTypeFromState(CS1)) // GB 2312-80
  201. #define csTS0 (_NEscTypeFromState(TS0)) // CNS 11643-1992 Plane 1
  202. #define csTS1 (_NEscTypeFromState(TS1)) // CNS 11643-1992 Plane 2
  203. #define csTS2 (_NEscTypeFromState(TS2)) // CNS 11643-1992 Plane 3
  204. #define csTS3 (_NEscTypeFromState(TS3)) // CNS 11643-1992 Plane 4
  205. #define csTS4 (_NEscTypeFromState(TS4)) // CNS 11643-1992 Plane 5
  206. #define csTS5 (_NEscTypeFromState(TS5)) // CNS 11643-1992 Plane 6
  207. #define csTS6 (_NEscTypeFromState(TS6)) // CNS 11643-1992 Plane 7
  208. #define csKS0 (_NEscTypeFromState(KS0)) // KS C 5601-1992 (into G0)
  209. #define csKSD (_NEscTypeFromState(KSD)) // KS C 5601-1992 (into G1)
  210. // Table States (Intermediate States) -- high bit clear; each value is a
  // column index into _rgchNextState
  211. #define ST0 (TST) 0
  212. #define ST1 (TST) 1
  213. #define ST2 (TST) 2
  214. #define ST3 (TST) 3
  215. #define ST4 (TST) 4
  216. #define ST5 (TST) 5
  217. #define ST6 (TST) 6
  218. #define ST7 (TST) 7
  219. #define ST8 (TST) 8
  220. #define ST9 (TST) 9
  221. // Number of "real" (table) states
  222. // (ST0 through ST9; final states are never used as a table column)
  223. #define nStates 10
  224. #define IsFinal(state) ((state) & grfFinal) // nonzero when the final flag is set
  225. // State Have Seen Looking For
  226. // ----------------------------------------------------------
  227. // ST0 -- Start State -- <ESC> Text
  228. // ST1 <ESC> $ & (
  229. // ST2 <ESC> $ ( ) @ A B (**)
  230. // ST3 <ESC> $ ( @ A B C D E G H I J K L M
  231. // ST4 <ESC> $ ) C
  232. // ST5 <ESC> & @
  233. // ST6 <ESC> & @ <ESC>
  234. // ST7 <ESC> & @ <ESC> $
  235. // ST8 <ESC> & @ <ESC> $ B
  236. // ST9 <ESC> ( B H I J T
  237. //
  238. // (**) "<ESC> $ ID" is a synonym of "<ESC> $ ( ID" for ID=(@, A, B)
  239. //
  240. // Because of the large number of tokens, this table is
  241. // inverted (tokens x states).
  242. //
  243. static signed char _rgchNextState[nTokens][nStates] =
  244. {
  245. //
  246. // S S S S S S S S S S
  247. // T T T T T T T T T T
  248. // 0 1 2 3 4 5 6 7 8 9
  249. //--------------------------------------------------------------------
  250. //
  251. /* txt */ TXT, UNK, UNK, UNK, UNK, UNK, UNK, UNK, UNK, UNK,
  252. /* ext */ EXT, UNK, UNK, UNK, UNK, UNK, UNK, UNK, UNK, UNK,
  253. /* esc */ ST1, UNK, UNK, UNK, UNK, UNK, ST7, UNK, UNK, UNK,
  254. /* si */ LSI, UNK, UNK, UNK, UNK, UNK, UNK, UNK, UNK, UNK,
  255. /* so */ LSO, UNK, UNK, UNK, UNK, UNK, UNK, UNK, UNK, UNK,
  256. /* $ */ TXT, ST2, UNK, UNK, UNK, UNK, UNK, ST8, UNK, UNK,
  257. /* @ */ TXT, UNK, JS2, JS2, UNK, ST6, UNK, UNK, UNK, UNK,
  258. /* & */ TXT, ST5, UNK, UNK, UNK, UNK, UNK, UNK, UNK, UNK,
  259. /* ( */ TXT, ST9, ST3, UNK, UNK, UNK, UNK, UNK, UNK, UNK,
  260. /* ) */ TXT, UNK, ST4, UNK, UNK, UNK, UNK, UNK, UNK, UNK,
  261. /* A */ TXT, UNK, CS1, CS1, UNK, UNK, UNK, UNK, UNK, UNK,
  262. /* B */ TXT, UNK, JS3, JS3, UNK, UNK, UNK, UNK, JS4, ASC,
  263. /* C */ TXT, UNK, UNK, KS0, KSD, UNK, UNK, UNK, UNK, UNK,
  264. /* D */ TXT, UNK, UNK, JS5, UNK, UNK, UNK, UNK, UNK, UNK,
  265. /* E */ TXT, UNK, UNK, UNK, UNK, UNK, UNK, UNK, UNK, UNK,
  266. /* G */ TXT, UNK, UNK, TS0, UNK, UNK, UNK, UNK, UNK, UNK,
  267. /* H */ TXT, UNK, UNK, TS1, UNK, UNK, UNK, UNK, UNK, JS0,
  268. /* I */ TXT, UNK, UNK, TS2, UNK, UNK, UNK, UNK, UNK, JS1,
  269. /* J */ TXT, UNK, UNK, TS3, UNK, UNK, UNK, UNK, UNK, JS0,
  270. /* K */ TXT, UNK, UNK, TS4, UNK, UNK, UNK, UNK, UNK, UNK,
  271. /* L */ TXT, UNK, UNK, TS5, UNK, UNK, UNK, UNK, UNK, UNK,
  272. /* M */ TXT, UNK, UNK, TS6, UNK, UNK, UNK, UNK, UNK, UNK,
  273. /* T */ TXT, UNK, UNK, UNK, UNK, UNK, UNK, UNK, UNK, CS0,
  274. /* unk */ UNK, UNK, UNK, UNK, UNK, UNK, UNK, UNK, UNK, UNK,
  275. /* eof */ FIN, EOI, EOI, EOI, EOI, EOI, EOI, EOI, EOI, EOI,
  276. /* err */ ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR,
  277. };
  278. // Also for ISO-2022 out. Build arrays of possible character
  279. // sets for each type of input character set. Character sets
  280. // should appear in order of hit probability (e.g., in 2022-Jp
  281. // JS3 is the most common set). Mark the end of array with -1.
  282. // (Only store these for non-ascii sets).
  283. //
  284. // The Cn, Kr and Tw arrays currently hold only the -1 terminator.
  285. // China (icetIso2022Cn)
  286. static int _rgceCn[] = { -1, };
  287. // Japan (icetIso2022Jp)
  288. static int _rgceJp[] = { csJS3, csJS1, csJS5, -1, };
  289. // Korea (icetIso2022Kr)
  290. static int _rgceKr[] = { -1, };
  291. // Taiwan (icetIso2022Tw)
  292. static int _rgceTw[] = { -1, };
  // Map from encoding type to its character set array above; entries that
  // are not ISO-2022 encodings are 0 (no array).
  // NOTE(review): the per-entry comments assume this order matches the
  // ICET enumeration declared elsewhere -- confirm against its definition.
  293. static int *_mpicetrgce[icetCount] =
  294. {
  295. 0, // icetEucCn
  296. 0, // icetEucJp
  297. 0, // icetEucKr
  298. 0, // icetEucTw
  299. _rgceCn, // icetIso2022Cn
  300. _rgceJp, // icetIso2022Jp
  301. _rgceKr, // icetIso2022Kr
  302. _rgceTw, // icetIso2022Tw
  303. 0, // icetBig5
  304. 0, // icetGbk
  305. 0, // icetShiftJis
  306. 0, // icetWansung
  307. 0, // icetUtf8
  308. };
  309. /* _ J T K G E T N E X T */
  310. /*----------------------------------------------------------------------------
  311. %%Function: _JtkGetNext
  312. %%Contact: jpick
  313. Get the next character and classify it. Return the token.
  314. ----------------------------------------------------------------------------*/
  315. static JTK __inline _JtkGetNext(IStream *pstmIn, PUCHAR puch)
  316. {
  317. ULONG rc;
  318. HRESULT hr;
  319. hr = pstmIn->Read(puch, 1, &rc);
  320. if (hr != S_OK )
  321. return err;
  322. else if (rc == 0)
  323. return eof;
  324. else
  325. return _rgjtkCharClass[*puch];
  326. }
  327. /* C C E R E A D E S C S E Q */
  328. /*----------------------------------------------------------------------------
  329. %%Function: CceReadEscSeq
  330. %%Contact: jpick
  331. Read pointer is positioned at an escape sequence, figure out
  332. which escape sequence it is.
  333. ----------------------------------------------------------------------------*/
  334. CCE CceReadEscSeq(IStream *pstmIn, ICET *lpicet)
  335. {
  336. UCHAR uch;
  337. TST tstCurr;
  338. JTK jtk;
  339. CCE cceRet;
  340. #ifdef DEBUG
  341. TST tstPrev;
  342. #endif
  343. // Sanity checks ...
  344. //
  345. #ifdef DEBUG
  346. if (!pstmIn || !lpicet)
  347. return cceInvalidParameter;
  348. #endif
  349. tstCurr = ST0;
  350. while (1)
  351. {
  352. // Find the next stopping state.
  353. //
  354. do
  355. {
  356. // Get the next character and clasify it.
  357. //
  358. jtk = _JtkGetNext(pstmIn, &uch);
  359. #ifdef DEBUG
  360. // Save the previous state for debugging purposes, only.
  361. //
  362. tstPrev = tstCurr;
  363. #endif
  364. // Transition -- note that order is different than
  365. // "normal" transition tables.
  366. //
  367. tstCurr = _rgchNextState[jtk][tstCurr];
  368. } while (!IsFinal(tstCurr));
  369. switch (tstCurr)
  370. {
  371. case JS0: // JIS-Roman
  372. case JS1: // Half-Width Katakana
  373. case JS2: // JIS C 6226-1978
  374. case JS3: // JIS X 0208-1983
  375. case JS4: // JIS X 0208-1990
  376. case JS5: // JIS X 0212-1990
  377. *lpicet = icetIso2022Jp;
  378. cceRet = cceSuccess;
  379. goto _LRet;
  380. case CS0: // GB 1988-89 Roman
  381. case CS1: // GB 2312-80
  382. *lpicet = icetIso2022Cn;
  383. cceRet = cceSuccess;
  384. goto _LRet;
  385. case TS0: // CNS 11643-1992 Plane 1
  386. case TS1: // CNS 11643-1992 Plane 2
  387. case TS2: // CNS 11643-1992 Plane 3
  388. case TS3: // CNS 11643-1992 Plane 4
  389. case TS4: // CNS 11643-1992 Plane 5
  390. case TS5: // CNS 11643-1992 Plane 6
  391. case TS6: // CNS 11643-1992 Plane 7
  392. *lpicet = icetIso2022Tw;
  393. cceRet = cceSuccess;
  394. goto _LRet;
  395. case KS0: // KS C 5601-1992
  396. case KSD: // ISO-2022-KR Document Signal
  397. *lpicet = icetIso2022Kr;
  398. cceRet = cceSuccess;
  399. goto _LRet;
  400. case ASC: // Ascii
  401. case LSO:
  402. case LSI:
  403. case TXT:
  404. case EXT:
  405. case FIN:
  406. // Insufficient information to choose a flavor ...
  407. cceRet = cceMayBeAscii;
  408. goto _LRet;
  409. case ERR:
  410. cceRet = cceRead;
  411. goto _LRet;
  412. default: // UNK, EOI
  413. cceRet = cceUnknownInput;
  414. goto _LRet;
  415. }
  416. }
  417. _LRet:
  418. return cceRet;
  419. }