windows-server-2003/shell/ext/mlang/jislex.cpp


								/*----------------------------------------------------------------------------

								    %%File: jislex.c

								    %%Unit: fechmap

								    %%Contact: jpick


								    Simple converter for decoding a subset of possible ISO-2022-7 encoded

								    files (ISO-2022).  Data is translated to and from Unicode.  Converter

								    operates according to user options.


								    Module currently handles ISO-2022-JP (and JIS) and ISO-2022-KR.


								    Converter is set up to handle ISO-2022-TW and ISO-2022-CN, but there

								    are as yet no conversion tables for these.

								----------------------------------------------------------------------------*/


								#include <stdio.h>

								#include <stddef.h>


								#include "private.h"

								#include "fechmap_.h"

								#include "lexint_.h"


								// State table for reading ISO-2022-7 encoded text

								//

								// Lexer recognizes the following designator sequences, used

								// to select a one or two byte character set:

								//

								//    <esc> $ @             -- JIS C 6226-1978  (synonym of <esc> $ ( @)

								//    <esc> $ A             -- GB 2312-80       (synonym of <esc> $ ( A)

								//    <esc> $ B             -- JIS X 0208-1983  (synonym of <esc> $ ( B)

								//

								//    <esc> $ ( @           -- JIS C 6226-1978

								//    <esc> $ ( A           -- GB 2312-80

								//    <esc> $ ( B           -- JIS X 0208-1983

								//    <esc> $ ( C           -- KS C 5601-1992

								//    <esc> $ ( D           -- JIS X 0212-1990

								//    <esc> $ ( E           -- ??? (ISO-IR-165:1992) ???

								//    <esc> $ ( G           -- CNS 11643-1992 Plane 1

								//    <esc> $ ( H           -- CNS 11643-1992 Plane 2

								//    <esc> $ ( I           -- CNS 11643-1992 Plane 3

								//    <esc> $ ( J           -- CNS 11643-1992 Plane 4

								//    <esc> $ ( K           -- CNS 11643-1992 Plane 5

								//    <esc> $ ( L           -- CNS 11643-1992 Plane 6

								//    <esc> $ ( M           -- CNS 11643-1992 Plane 7

								//

								//    <esc> $ ) C           -- KSC 5601-1987 (Implies ISO-2022-KR ??)

								//

								//    <esc> & @ <esc> $ B   -- JIS X 0208-1990

								//

								//    <esc> ( B             -- Ascii

								//    <esc> ( H             -- Deprecated variant of JIS-Roman

								//    <esc> ( I             -- Half-Width Katakana

								//    <esc> ( J             -- JIS-Roman

								//    <esc> ( T             -- GB 1988-89 Roman

								//

								// Lexer recognizes the following shift sequences, used to allow

								// interpretation of a given byte or bytes:

								//

								//    <si>                  -- locking shift, interpret bytes as G0

								//    <so>                  -- locking shift, interpret bytes as G1

								//    <esc> n               -- locking shift, interpret bytes as G2

								//    <esc> o               -- locking shift, interpret bytes as G3

								//    <esc> N               -- single shift, interpret bytes as G2

								//    <esc> O               -- single shift, interpret bytes as G3

								//

								// REVIEW (jpick): don't currently need the final four shift

								//   sequences.  If we support ISO-2022-CN, we'll need to use

								//   G2 and G3 and potentially, then, the last four shifts.

								//


								/*----------------------------------------------------------------------------

								    Character Classification Table

								----------------------------------------------------------------------------*/


								// Tokens
								//
								// Character classes returned by the classifier.  A token is the first
								// (row) index into the state transition table.  Every expansion is
								// fully parenthesized so that the cast stays attached to its value no
								// matter what operator surrounds the macro at the point of use.
								//
								#define txt         ((JTK) 0)
								#define ext         ((JTK) 1)     // extended characters that are legal under certain circumstances
								#define esc         ((JTK) 2)
								#define si          ((JTK) 3)
								#define so          ((JTK) 4)
								#define dlr         ((JTK) 5)
								#define at          ((JTK) 6)
								#define amp         ((JTK) 7)
								#define opr         ((JTK) 8)
								#define cpr         ((JTK) 9)
								#define tkA         ((JTK) 10)
								#define tkB         ((JTK) 11)
								#define tkC         ((JTK) 12)
								#define tkD         ((JTK) 13)
								#define tkE         ((JTK) 14)
								#define tkG         ((JTK) 15)
								#define tkH         ((JTK) 16)
								#define tkI         ((JTK) 17)
								#define tkJ         ((JTK) 18)
								#define tkK         ((JTK) 19)
								#define tkL         ((JTK) 20)
								#define tkM         ((JTK) 21)
								#define tkT         ((JTK) 22)
								#define unk         ((JTK) 23)    // Unexpected character
								#define eof         ((JTK) 24)    // end-of-file
								#define err         ((JTK) 25)    // read error

								// Number of distinct tokens (largest token value + 1).
								//
								#define nTokens     26


								// Lookup table for ISO-2022-7 encoded files

								//

								// Maps every possible byte value (0x00 - 0xff) to its token.  The

								// column header gives the low nibble of the byte; the digit at the

								// right-hand end of each row comment gives the high nibble.

								//

								//   0x0e / 0x0f            -- so / si

								//   0x1b                   -- esc

								//   $ & ( ) @              -- dlr, amp, opr, cpr, at

								//   A-E, G-M, T            -- tkA .. tkT

								//   all other 0x00 - 0x7f  -- txt

								//   0xa1 - 0xdf            -- ext

								//   0x80 - 0xa0, 0xe0-0xff -- unk

								//

								static JTK _rgjtkCharClass[256] =

								//  0    1    2    3    4    5    6    7    8    9    a    b    c    d    e    f

								    {

								//  nul  soh  stx  etx  eot  enq  ack  bel  bs   tab  lf   vt   np   cr   so   si       0

								    txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, so,  si,


								//  dle  dc1  dc2  dc3  dc4  nak  syn  etb  can  em   eof  esc  fs   gs   rs   us       1

								    txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, esc, txt, txt, txt, txt,


								//  sp   !    "    #    $    %    &    '    (    )    *    +    ,    -    .    /        2

								    txt, txt, txt, txt, dlr, txt, amp, txt, opr, cpr, txt, txt, txt, txt, txt, txt,


								//  0    1    2    3    4    5    6    7    8    9    :    ;    <    =    >    ?        3

								    txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt,


								//  @    A    B    C    D    E    F    G    H    I    J    K    L    M    N    O        4

								    at,  tkA, tkB, tkC, tkD, tkE, txt, tkG, tkH, tkI, tkJ, tkK, tkL, tkM, txt, txt,


								//  P    Q    R    S    T    U    V    W    X    Y    Z    [    \    ]    ^    _        5

								    txt, txt, txt, txt, tkT, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt,


								//  `    a    b    c    d    e    f    g    h    i    j    k    l    m    n    o        6

								    txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt,


								//  p    q    r    s    t    u    v    w    x    y    z    {    |    }    ~    del      7

								    txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt,


								//                                                                                      8

								    unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk,


								//                                                                                      9

								    unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk,


								//                                                                                      a

								    unk, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext,


								//                                                                                      b

								    ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext,


								//                                                                                      c

								    ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext,


								//                                                                                      d

								    ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext,


								//                                                                                      e

								    unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk,


								//                                                                                      f

								    unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk,


								//  0    1    2    3    4    5    6    7    8    9    a    b    c    d    e    f

								};


								/*----------------------------------------------------------------------------

								    State Table

								----------------------------------------------------------------------------*/


								// Final states have the high-bit set.  States that represent the reading
								// of a valid character escape sequence also encode the character set
								// identifier -- the state value with the high bit masked off.
								//
								// All value macros below are fully parenthesized so that the cast cannot
								// be separated from its operand by the surrounding expression.
								//
								// Table State
								//
								typedef unsigned char TST;

								// Final State Mask, Related
								//
								#define grfFinal                            ((TST) 0x80)
								#define _NEscTypeFromState(nState)          ((int) ((nState) & 0x7f))

								// ASCII Escape Sequence (Final State)
								#define ASC     ((TST) (grfFinal | 0x00))     // Ascii

								// Japanese Escape Sequences (Final States)
								#define JS0     ((TST) (grfFinal | 0x01))     // JIS-Roman
								#define JS1     ((TST) (grfFinal | 0x02))     // Half-Width Katakana
								#define JS2     ((TST) (grfFinal | 0x03))     // JIS C 6226-1978
								#define JS3     ((TST) (grfFinal | 0x04))     // JIS X 0208-1983
								#define JS4     ((TST) (grfFinal | 0x05))     // JIS X 0208-1990
								#define JS5     ((TST) (grfFinal | 0x06))     // JIS X 0212-1990

								// Chinese (PRC) Escape Sequences (Final States)
								#define CS0     ((TST) (grfFinal | 0x07))     // GB 1988-89 Roman
								#define CS1     ((TST) (grfFinal | 0x08))     // GB 2312-80

								// Chinese (Taiwan) Escape Sequences (Final States)
								#define TS0     ((TST) (grfFinal | 0x09))     // CNS 11643-1992 Plane 1
								#define TS1     ((TST) (grfFinal | 0x0a))     // CNS 11643-1992 Plane 2
								#define TS2     ((TST) (grfFinal | 0x0b))     // CNS 11643-1992 Plane 3
								#define TS3     ((TST) (grfFinal | 0x0c))     // CNS 11643-1992 Plane 4
								#define TS4     ((TST) (grfFinal | 0x0d))     // CNS 11643-1992 Plane 5
								#define TS5     ((TST) (grfFinal | 0x0e))     // CNS 11643-1992 Plane 6
								#define TS6     ((TST) (grfFinal | 0x0f))     // CNS 11643-1992 Plane 7

								// Korean Escape Sequences (Final State)
								#define KS0     ((TST) (grfFinal | 0x10))     // KS C 5601-1992

								// Document "Signal" for ISO-2022-KR (Doc needs special processing)
								#define KSD     ((TST) (grfFinal | 0x11))     // ISO-2022-KR Document Signal

								// Number of unique *character set* escape sequences
								// (identifiers 0x00 through 0x11 above).
								//
								#define cCsEsc  18

								// Special States (not escape sequence) (Final States)
								//
								#define TXT     ((TST) (grfFinal | (cCsEsc + 1)))     // Process Text
								#define EXT     ((TST) (grfFinal | (cCsEsc + 2)))     // Process (Possibly Illegal) Extended Chars
								#define FIN     ((TST) (grfFinal | (cCsEsc + 3)))     // Finish
								#define EOI     ((TST) (grfFinal | (cCsEsc + 4)))     // Unexpected End-Of-Input
								#define UNK     ((TST) (grfFinal | (cCsEsc + 5)))     // Unknown State (Unexpected Character)
								#define ERR     ((TST) (grfFinal | (cCsEsc + 6)))     // Read Error

								// Shift Sequences (do not specify character set) (Final States)
								//
								#define LSO     ((TST) (grfFinal | (cCsEsc + 7)))     // Locking shift out (g1 into GL)
								#define LSI     ((TST) (grfFinal | (cCsEsc + 8)))     // Locking shift in (g0 into GL)

								// For convenience, also define constants for the sets
								// that the states represent.
								//
								#define csNIL       (-1)                            // Invalid Designator
								#define csASC       (_NEscTypeFromState(ASC))       // Ascii
								#define csJS0       (_NEscTypeFromState(JS0))       // JIS-Roman
								#define csJS1       (_NEscTypeFromState(JS1))       // Half-Width Katakana
								#define csJS2       (_NEscTypeFromState(JS2))       // JIS C 6226-1978
								#define csJS3       (_NEscTypeFromState(JS3))       // JIS X 0208-1983
								#define csJS4       (_NEscTypeFromState(JS4))       // JIS X 0208-1990
								#define csJS5       (_NEscTypeFromState(JS5))       // JIS X 0212-1990
								#define csCS0       (_NEscTypeFromState(CS0))       // GB 1988-89 Roman
								#define csCS1       (_NEscTypeFromState(CS1))       // GB 2312-80
								#define csTS0       (_NEscTypeFromState(TS0))       // CNS 11643-1992 Plane 1
								#define csTS1       (_NEscTypeFromState(TS1))       // CNS 11643-1992 Plane 2
								#define csTS2       (_NEscTypeFromState(TS2))       // CNS 11643-1992 Plane 3
								#define csTS3       (_NEscTypeFromState(TS3))       // CNS 11643-1992 Plane 4
								#define csTS4       (_NEscTypeFromState(TS4))       // CNS 11643-1992 Plane 5
								#define csTS5       (_NEscTypeFromState(TS5))       // CNS 11643-1992 Plane 6
								#define csTS6       (_NEscTypeFromState(TS6))       // CNS 11643-1992 Plane 7
								#define csKS0       (_NEscTypeFromState(KS0))       // KS C 5601-1992 (into G0)
								#define csKSD       (_NEscTypeFromState(KSD))       // KS C 5601-1992 (into G1)

								// Table States (Intermediate States)
								#define ST0     ((TST)  0)
								#define ST1     ((TST)  1)
								#define ST2     ((TST)  2)
								#define ST3     ((TST)  3)
								#define ST4     ((TST)  4)
								#define ST5     ((TST)  5)
								#define ST6     ((TST)  6)
								#define ST7     ((TST)  7)
								#define ST8     ((TST)  8)
								#define ST9     ((TST)  9)

								// Number of "real" (table) states
								//
								#define nStates     10

								// Non-zero when the state has the final-state bit set.
								#define IsFinal(state)  ((state) & grfFinal)


								// State    Have Seen               Looking For

								// ----------------------------------------------------------

								// ST0      -- Start State --       <ESC> Text

								// ST1      <ESC>                   $ & (

								// ST2      <ESC> $                 ( ) @ A B   (**)

								// ST3      <ESC> $ (               @ A B C D E G H I J K L M

								// ST4      <ESC> $ )               C

								// ST5      <ESC> &                 @

								// ST6      <ESC> & @               <ESC>

								// ST7      <ESC> & @ <ESC>         $

								// ST8      <ESC> & @ <ESC> $       B

								// ST9      <ESC> (                 B H I J T

								//

								// (**)  "<ESC> $ ID" is a synonym of "<ESC> $ ( ID" for ID=(@, A, B)

								//

								// Because of the large number of tokens, this table is

								// inverted (tokens x states).

								//

								static signed char _rgchNextState[nTokens][nStates] =

								{

								//

								//           S     S     S     S     S     S     S     S     S     S

								//           T     T     T     T     T     T     T     T     T     T

								//           0     1     2     3     4     5     6     7     8     9

								//--------------------------------------------------------------------

								//

								/* txt */  TXT,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,

								/* ext */  EXT,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,

								/* esc */  ST1,  UNK,  UNK,  UNK,  UNK,  UNK,  ST7,  UNK,  UNK,  UNK,

								/* si  */  LSI,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,

								/* so  */  LSO,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,

								/* $   */  TXT,  ST2,  UNK,  UNK,  UNK,  UNK,  UNK,  ST8,  UNK,  UNK,

								/* @   */  TXT,  UNK,  JS2,  JS2,  UNK,  ST6,  UNK,  UNK,  UNK,  UNK,

								/* &   */  TXT,  ST5,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,

								/* (   */  TXT,  ST9,  ST3,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,

								/* )   */  TXT,  UNK,  ST4,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,

								/* A   */  TXT,  UNK,  CS1,  CS1,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,

								/* B   */  TXT,  UNK,  JS3,  JS3,  UNK,  UNK,  UNK,  UNK,  JS4,  ASC,

								/* C   */  TXT,  UNK,  UNK,  KS0,  KSD,  UNK,  UNK,  UNK,  UNK,  UNK,

								/* D   */  TXT,  UNK,  UNK,  JS5,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,

								/* E   */  TXT,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,

								/* G   */  TXT,  UNK,  UNK,  TS0,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,

								/* H   */  TXT,  UNK,  UNK,  TS1,  UNK,  UNK,  UNK,  UNK,  UNK,  JS0,

								/* I   */  TXT,  UNK,  UNK,  TS2,  UNK,  UNK,  UNK,  UNK,  UNK,  JS1,

								/* J   */  TXT,  UNK,  UNK,  TS3,  UNK,  UNK,  UNK,  UNK,  UNK,  JS0,

								/* K   */  TXT,  UNK,  UNK,  TS4,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,

								/* L   */  TXT,  UNK,  UNK,  TS5,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,

								/* M   */  TXT,  UNK,  UNK,  TS6,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,

								/* T   */  TXT,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,  CS0,

								/* unk */  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,

								/* eof */  FIN,  EOI,  EOI,  EOI,  EOI,  EOI,  EOI,  EOI,  EOI,  EOI,

								/* err */  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,

								};


								// Also for ISO-2022 out.  Build arrays of possible character

								// sets for each type of input character set.  Character sets

								// should appear in order of hit probability (e.g., in 2022-Jp

								// JS3 is the most common set).  Mark the end of array with -1.

								// (Only store these for non-ascii sets).

								//

								// Only the Japanese list has entries at present; the China, Korea

								// and Taiwan lists hold nothing but the -1 terminator.

								//

								// China (icetIso2022Cn)

								static int _rgceCn[] = { -1, };


								// Japan (icetIso2022Jp)

								static int _rgceJp[] = { csJS3, csJS1, csJS5, -1, };


								// Korea (icetIso2022Kr)

								static int _rgceKr[] = { -1, };


								// Taiwan (icetIso2022Tw)

								static int _rgceTw[] = { -1, };


								// Per-encoding table of the candidate lists above.  Slots labelled

								// with a non-ISO-2022 encoding hold 0 (no list).

								// NOTE(review): the entries are positional -- presumably they must

								// stay in the same order as the ICET enumeration; confirm against

								// its declaration before adding or reordering encodings.

								static int *_mpicetrgce[icetCount] =

								    {

								    0,              // icetEucCn

								    0,              // icetEucJp

								    0,              // icetEucKr

								    0,              // icetEucTw

								    _rgceCn,        // icetIso2022Cn

								    _rgceJp,        // icetIso2022Jp

								    _rgceKr,        // icetIso2022Kr

								    _rgceTw,        // icetIso2022Tw

								    0,              // icetBig5

								    0,              // icetGbk

								    0,              // icetShiftJis

								    0,              // icetWansung

								    0,              // icetUtf8

								    };


								/* _ J T K  G E T  N E X T */
								/*----------------------------------------------------------------------------
								    %%Function: _JtkGetNext
								    %%Contact: jpick

								    Read one byte from pstmIn into *puch and classify it.

								    Returns:
								        err  -- the stream reported a failure HRESULT
								        eof  -- the read succeeded but delivered no byte
								        otherwise the token for the byte, from _rgjtkCharClass
								----------------------------------------------------------------------------*/
								static JTK __inline _JtkGetNext(IStream *pstmIn, PUCHAR puch)
								{
								    // Start from zero so the count is well defined even if the
								    // stream does not write it.
								    ULONG cbRead = 0;
								    HRESULT hr = pstmIn->Read(puch, 1, &cbRead);

								    // Only a failure code is a read error.  A stream may signal that
								    // it ran out of data with a success code other than S_OK
								    // (S_FALSE) and a zero byte count; that is end-of-input and must
								    // not be reported as err.
								    if (FAILED(hr))
								        return err;

								    if (cbRead == 0)
								        return eof;

								    return _rgjtkCharClass[*puch];
								}


								/* C C E  R E A D  E S C  S E Q */
								/*----------------------------------------------------------------------------
								    %%Function: CceReadEscSeq
								    %%Contact: jpick

								    Read pointer is positioned at an escape sequence, figure out
								    which escape sequence it is.

								    Bytes are read from pstmIn and run through the state table,
								    starting at ST0, until a final state is reached.

								    Returns:
								        cceSuccess           -- a Japanese, Chinese (PRC), Chinese (Taiwan)
								                                or Korean designator was read; *lpicet is
								                                set to the matching icetIso2022* value
								        cceMayBeAscii        -- ASCII designator, shift, plain/extended
								                                text or clean end of input; *lpicet is
								                                left untouched
								        cceRead              -- read error
								        cceUnknownInput      -- unexpected character or unexpected
								                                end of input
								        cceInvalidParameter  -- pstmIn or lpicet is NULL
								----------------------------------------------------------------------------*/
								CCE CceReadEscSeq(IStream *pstmIn, ICET *lpicet)
								{
								    UCHAR uch;
								    TST tstCurr = ST0;
								    JTK jtk;
								    CCE cceRet;
								#ifdef DEBUG
								    // Never read by the code; the original comment says it is kept
								    // for debugging purposes only.
								    TST tstPrev;
								#endif

								    // Validate in every build flavour: both pointers are dereferenced
								    // below, so a NULL must be rejected rather than crash.
								    //
								    if (!pstmIn || !lpicet)
								        return cceInvalidParameter;

								    // Advance to the first final state.  Every final state ends the
								    // scan, so a single pass is all that is needed.
								    //
								    do
								        {
								        // Get the next character and classify it.
								        //
								        jtk = _JtkGetNext(pstmIn, &uch);

								#ifdef DEBUG
								        tstPrev = tstCurr;
								#endif
								        // Transition -- note that order is different than
								        // "normal" transition tables (token first, then state).
								        // The cast keeps the value in the unsigned state domain.
								        //
								        tstCurr = (TST) _rgchNextState[jtk][tstCurr];

								        } while (!IsFinal(tstCurr));

								    switch (tstCurr)
								        {
								        case JS0:           // JIS-Roman
								        case JS1:           // Half-Width Katakana
								        case JS2:           // JIS C 6226-1978
								        case JS3:           // JIS X 0208-1983
								        case JS4:           // JIS X 0208-1990
								        case JS5:           // JIS X 0212-1990
								            *lpicet = icetIso2022Jp;
								            cceRet = cceSuccess;
								            break;

								        case CS0:           // GB 1988-89 Roman
								        case CS1:           // GB 2312-80
								            *lpicet = icetIso2022Cn;
								            cceRet = cceSuccess;
								            break;

								        case TS0:           // CNS 11643-1992 Plane 1
								        case TS1:           // CNS 11643-1992 Plane 2
								        case TS2:           // CNS 11643-1992 Plane 3
								        case TS3:           // CNS 11643-1992 Plane 4
								        case TS4:           // CNS 11643-1992 Plane 5
								        case TS5:           // CNS 11643-1992 Plane 6
								        case TS6:           // CNS 11643-1992 Plane 7
								            *lpicet = icetIso2022Tw;
								            cceRet = cceSuccess;
								            break;

								        case KS0:           // KS C 5601-1992
								        case KSD:           // ISO-2022-KR Document Signal
								            *lpicet = icetIso2022Kr;
								            cceRet = cceSuccess;
								            break;

								        case ASC:           // Ascii
								        case LSO:
								        case LSI:
								        case TXT:
								        case EXT:
								        case FIN:
								            // Insufficient information to choose a flavor ...
								            cceRet = cceMayBeAscii;
								            break;

								        case ERR:
								            cceRet = cceRead;
								            break;

								        default:            // UNK, EOI
								            cceRet = cceUnknownInput;
								            break;
								        }

								    return cceRet;
								}