windows-server-2003/shell/ext/mlang/jislex.cpp

/*----------------------------------------------------------------------------
    %%File: jislex.c
    %%Unit: fechmap
    %%Contact: jpick

    Simple converter for decoding a subset of possible ISO-2022-7 encoded
    files (ISO-2022).  Data is translated to and from Unicode.  Converter
    operates according to user options.
    
    Module currently handles ISO-2022-JP (and JIS) and ISO-2022-KR.  
    
    Converter is set up to handle ISO-2022-TW and ISO-2022-CN, but there
    are as yet no conversion tables for these.
----------------------------------------------------------------------------*/

#include <stdio.h>
#include <stddef.h>

#include "private.h"
#include "fechmap_.h"
#include "lexint_.h"


// State table for reading ISO-2022-7 encoded text
//
// Lexer recognizes the following designator sequences, used 
// to select a one or two byte character set:
//
//    <esc> $ @             -- JIS C 6226-1978  (synonym of <esc> $ ( @)
//    <esc> $ A             -- GB 2312-80       (synonym of <esc> $ ( A)
//    <esc> $ B             -- JIS X 0208-1983  (synonym of <esc> $ ( B)
//
//    <esc> $ ( @           -- JIS C 6226-1978
//    <esc> $ ( A           -- GB 2312-80
//    <esc> $ ( B           -- JIS X 0208-1983
//    <esc> $ ( C           -- KS C 5601-1992
//    <esc> $ ( D           -- JIS X 0212-1990
//    <esc> $ ( E           -- ??? (ISO-IR-165:1992) ???
//    <esc> $ ( G           -- CNS 11643-1992 Plane 1
//    <esc> $ ( H           -- CNS 11643-1992 Plane 2
//    <esc> $ ( I           -- CNS 11643-1992 Plane 3
//    <esc> $ ( J           -- CNS 11643-1992 Plane 4
//    <esc> $ ( K           -- CNS 11643-1992 Plane 5
//    <esc> $ ( L           -- CNS 11643-1992 Plane 6
//    <esc> $ ( M           -- CNS 11643-1992 Plane 7
//
//    <esc> $ ) C           -- KSC 5601-1987 (Implies ISO-2022-KR ??)
//
//    <esc> & @ <esc> $ B   -- JIS X 0208-1990
//
//    <esc> ( B             -- Ascii
//    <esc> ( H             -- Deprecated variant of JIS-Roman
//    <esc> ( I             -- Half-Width Katakana
//    <esc> ( J             -- JIS-Roman
//    <esc> ( T             -- GB 1988-89 Roman
//
// Lexer recognizes the following shift sequences, used to allow
// interpretation of a given byte or bytes:
//
//    <si>                  -- locking shift, interpret bytes as G0
//    <so>                  -- locking shift, interpret bytes as G1
//    <esc> n               -- locking shift, interpret bytes as G2
//    <esc> o               -- locking shift, interpret bytes as G3
//    <esc> N               -- single shift, interpret bytes as G2
//    <esc> O               -- single shift, interpret bytes as G3
//
// REVIEW (jpick): don't currently need the final four shift
//   sequences.  If we support ISO-2022-CN, we'll need to use
//   G2 and G3 and potentially, then, the last four shifts.
//

/*----------------------------------------------------------------------------
    Character Classification Table
----------------------------------------------------------------------------*/

// Tokens
//
// Every input byte is classified into one of these tokens through
// _rgjtkCharClass; eof and err are produced directly by _JtkGetNext
// when no byte could be delivered.  A token's value is used as the
// row index into _rgchNextState, so the values must stay dense
// (0 .. nTokens - 1) and in the same order as the rows of that table.
//
#define txt         (JTK) 0
#define ext         (JTK) 1     // extended characters that are legal under certain circumstances
#define esc         (JTK) 2
#define si          (JTK) 3
#define so          (JTK) 4
#define dlr         (JTK) 5
#define at          (JTK) 6
#define amp         (JTK) 7
#define opr         (JTK) 8
#define cpr         (JTK) 9
#define tkA         (JTK) 10
#define tkB         (JTK) 11
#define tkC         (JTK) 12
#define tkD         (JTK) 13
#define tkE         (JTK) 14
#define tkG         (JTK) 15
#define tkH         (JTK) 16
#define tkI         (JTK) 17
#define tkJ         (JTK) 18
#define tkK         (JTK) 19
#define tkL         (JTK) 20
#define tkM         (JTK) 21
#define tkT         (JTK) 22
#define unk         (JTK) 23    // Unexpected character
#define eof         (JTK) 24    // end-of-file
#define err         (JTK) 25    // read error

// Number of tokens defined above; also the first dimension of
// _rgchNextState.
#define nTokens     26

// Lookup table for ISO-2022-7 encoded files
//
// Indexed by the raw byte value (0x00 - 0xff) and yields the token for
// that byte.  Each initializer row below covers one high nibble (named
// in the right-hand margin); the column is the low nibble.
//
//   0x00 - 0x7f   txt, except for the bytes that can take part in an
//                 escape or shift sequence (so, si, esc, $ & ( ) @ and
//                 the letters A B C D E G H I J K L M T)
//   0x80 - 0xa0   unk
//   0xa1 - 0xdf   ext
//   0xe0 - 0xff   unk
//
static JTK _rgjtkCharClass[256] =
//  0    1    2    3    4    5    6    7    8    9    a    b    c    d    e    f
    {
//  nul  soh  stx  etx  eot  enq  ack  bel  bs   tab  lf   vt   np   cr   so   si       0
    txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, so,  si, 

//  dle  dc1  dc2  dc3  dc4  nak  syn  etb  can  em   eof  esc  fs   gs   rs   us       1
    txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, esc, txt, txt, txt, txt, 

//  sp   !    "    #    $    %    &    '    (    )    *    +    ,    -    .    /        2
    txt, txt, txt, txt, dlr, txt, amp, txt, opr, cpr, txt, txt, txt, txt, txt, txt, 

//  0    1    2    3    4    5    6    7    8    9    :    ;    <    =    >    ?        3
    txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, 

//  @    A    B    C    D    E    F    G    H    I    J    K    L    M    N    O        4
    at,  tkA, tkB, tkC, tkD, tkE, txt, tkG, tkH, tkI, tkJ, tkK, tkL, tkM, txt, txt, 

//  P    Q    R    S    T    U    V    W    X    Y    Z    [    \    ]    ^    _        5
    txt, txt, txt, txt, tkT, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, 

//  `    a    b    c    d    e    f    g    h    i    j    k    l    m    n    o        6
    txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, 

//  p    q    r    s    t    u    v    w    x    y    z    {    |    }    ~    del      7
    txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, 

//                                                                                      8
    unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, 

//                                                                                      9
    unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, 

//                                                                                      a
    unk, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, 

//                                                                                      b
    ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, 

//                                                                                      c
    ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, 

//                                                                                      d
    ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, 

//                                                                                      e
    unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, 

//                                                                                      f
    unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, 

//  0    1    2    3    4    5    6    7    8    9    a    b    c    d    e    f
};


/*----------------------------------------------------------------------------
    State Table
----------------------------------------------------------------------------*/

// Final states have the high-bit set.  States that represent the reading
// of a valid character escape sequence also encode the character set
// "name" (moniker??) -- the state with the high bit masked off.
//
// Table State
//
// A TST is a single byte: bit 7 (grfFinal) marks a final state and the
// low seven bits hold the state number or escape-sequence type.
//
typedef unsigned char TST;

// Final State Mask, Related
//
//   grfFinal                   -- the bit that is set in every final state
//   _NEscTypeFromState(nState) -- nState with grfFinal masked off, as an int
//
// Neither macro parenthesizes its whole expansion, so wrap uses in
// parentheses when combining them with other operators.
//
#define grfFinal                            (TST) 0x80
#define _NEscTypeFromState(nState)          (int) ((nState) & 0x7f)

// Final states that name a character set occupy the low-seven-bit values
// 0x00 (ASC) through 0x11 (KSD); every one of them has grfFinal set.

// ASCII Escape Sequence (Final State)
#define ASC     (TST) (grfFinal | 0x00)     // Ascii

// Japanese Escape Sequences (Final States)
#define JS0     (TST) (grfFinal | 0x01)     // JIS-Roman
#define JS1     (TST) (grfFinal | 0x02)     // Half-Width Katakana
#define JS2     (TST) (grfFinal | 0x03)     // JIS C 6226-1978
#define JS3     (TST) (grfFinal | 0x04)     // JIS X 0208-1983
#define JS4     (TST) (grfFinal | 0x05)     // JIS X 0208-1990
#define JS5     (TST) (grfFinal | 0x06)     // JIS X 0212-1990

// Chinese (PRC) Escape Sequences (Final States)
#define CS0     (TST) (grfFinal | 0x07)     // GB 1988-89 Roman
#define CS1     (TST) (grfFinal | 0x08)     // GB 2312-80

// Chinese (Taiwan) Escape Sequences (Final States)
#define TS0     (TST) (grfFinal | 0x09)     // CNS 11643-1992 Plane 1
#define TS1     (TST) (grfFinal | 0x0a)     // CNS 11643-1992 Plane 2
#define TS2     (TST) (grfFinal | 0x0b)     // CNS 11643-1992 Plane 3
#define TS3     (TST) (grfFinal | 0x0c)     // CNS 11643-1992 Plane 4
#define TS4     (TST) (grfFinal | 0x0d)     // CNS 11643-1992 Plane 5
#define TS5     (TST) (grfFinal | 0x0e)     // CNS 11643-1992 Plane 6
#define TS6     (TST) (grfFinal | 0x0f)     // CNS 11643-1992 Plane 7

// Korean Escape Sequences (Final State)
#define KS0     (TST) (grfFinal | 0x10)     // KS C 5601-1992

// Document "Signal" for ISO-2022-KR (Doc needs special processing)
#define KSD     (TST) (grfFinal | 0x11)     // ISO-2022-KR Document Signal

// Number of unique *character set* escape sequences
// (ASC through KSD above, i.e. values 0x00 .. 0x11).
//
#define cCsEsc  18

// Special States (not escape sequence) (Final States)
//
// Numbering starts at cCsEsc + 1, so the value cCsEsc itself (0x12)
// is not assigned to any state.
//
#define TXT     (TST) (grfFinal | (cCsEsc + 1))     // Process Text
#define EXT     (TST) (grfFinal | (cCsEsc + 2))     // Process (Possibly Illegal) Extended Chars
#define FIN     (TST) (grfFinal | (cCsEsc + 3))     // Finish
#define EOI     (TST) (grfFinal | (cCsEsc + 4))     // Unexpected End-Of-Input
#define UNK     (TST) (grfFinal | (cCsEsc + 5))     // Unknown State (Unexpected Character)
#define ERR     (TST) (grfFinal | (cCsEsc + 6))     // Read Error

// Shift Sequences (do not specify character set) (Final States)
//
#define LSO     (TST) (grfFinal | (cCsEsc + 7))     // Locking shift out (g1 into GL)
#define LSI     (TST) (grfFinal | (cCsEsc + 8))     // Locking shift in (g0 into GL)

// For convenience, also define constants for the sets
// that the states represent.
//
// Each csXXX is the matching final state with grfFinal stripped by
// _NEscTypeFromState, i.e. a small non-negative int.  csNIL is -1,
// which none of those values can equal.
//
#define csNIL       (-1)                            // Invalid Designator
#define csASC       (_NEscTypeFromState(ASC))       // Ascii
#define csJS0       (_NEscTypeFromState(JS0))       // JIS-Roman
#define csJS1       (_NEscTypeFromState(JS1))       // Half-Width Katakana
#define csJS2       (_NEscTypeFromState(JS2))       // JIS C 6226-1978
#define csJS3       (_NEscTypeFromState(JS3))       // JIS X 0208-1983
#define csJS4       (_NEscTypeFromState(JS4))       // JIS X 0208-1990
#define csJS5       (_NEscTypeFromState(JS5))       // JIS X 0212-1990
#define csCS0       (_NEscTypeFromState(CS0))       // GB 1988-89 Roman
#define csCS1       (_NEscTypeFromState(CS1))       // GB 2312-80
#define csTS0       (_NEscTypeFromState(TS0))       // CNS 11643-1992 Plane 1
#define csTS1       (_NEscTypeFromState(TS1))       // CNS 11643-1992 Plane 2
#define csTS2       (_NEscTypeFromState(TS2))       // CNS 11643-1992 Plane 3
#define csTS3       (_NEscTypeFromState(TS3))       // CNS 11643-1992 Plane 4
#define csTS4       (_NEscTypeFromState(TS4))       // CNS 11643-1992 Plane 5
#define csTS5       (_NEscTypeFromState(TS5))       // CNS 11643-1992 Plane 6
#define csTS6       (_NEscTypeFromState(TS6))       // CNS 11643-1992 Plane 7
#define csKS0       (_NEscTypeFromState(KS0))       // KS C 5601-1992 (into G0)
#define csKSD       (_NEscTypeFromState(KSD))       // KS C 5601-1992 (into G1)

// Table States (Intermediate States)
//
// These have grfFinal clear.  Their values are used as the column index
// into _rgchNextState, so they must stay dense (0 .. nStates - 1).
//
#define ST0     (TST)  0
#define ST1     (TST)  1
#define ST2     (TST)  2
#define ST3     (TST)  3
#define ST4     (TST)  4
#define ST5     (TST)  5
#define ST6     (TST)  6
#define ST7     (TST)  7
#define ST8     (TST)  8
#define ST9     (TST)  9

// Number of "real" (table) states
// (ST0 .. ST9 above; also the second dimension of _rgchNextState).
//
#define nStates     10

// Non-zero (the grfFinal bit itself, not necessarily 1) when the state
// is a final state.
#define IsFinal(state)  ((state) & grfFinal)


// State    Have Seen               Looking For
// ----------------------------------------------------------
// ST0      -- Start State --       <ESC> Text
// ST1      <ESC>                   $ & (
// ST2      <ESC> $                 ( ) @ A B   (**)
// ST3      <ESC> $ (               @ A B C D E G H I J K L M
// ST4      <ESC> $ )               C
// ST5      <ESC> &                 @
// ST6      <ESC> & @               <ESC>
// ST7      <ESC> & @ <ESC>         $
// ST8      <ESC> & @ <ESC> $       B
// ST9      <ESC> (                 B H I J T
//
// (**)  "<ESC> $ ID" is a synonym of "<ESC> $ ( ID" for ID=(@, A, B)
//
// Because of the large number of tokens, this table is
// inverted (tokens x states).
//
static signed char _rgchNextState[nTokens][nStates] =
{
//
//           S     S     S     S     S     S     S     S     S     S 
//           T     T     T     T     T     T     T     T     T     T  
//           0     1     2     3     4     5     6     7     8     9   
//--------------------------------------------------------------------
//
/* txt */  TXT,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,
/* ext */  EXT,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,
/* esc */  ST1,  UNK,  UNK,  UNK,  UNK,  UNK,  ST7,  UNK,  UNK,  UNK,
/* si  */  LSI,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,
/* so  */  LSO,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,
/* $   */  TXT,  ST2,  UNK,  UNK,  UNK,  UNK,  UNK,  ST8,  UNK,  UNK,
/* @   */  TXT,  UNK,  JS2,  JS2,  UNK,  ST6,  UNK,  UNK,  UNK,  UNK,
/* &   */  TXT,  ST5,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,
/* (   */  TXT,  ST9,  ST3,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,
/* )   */  TXT,  UNK,  ST4,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,
/* A   */  TXT,  UNK,  CS1,  CS1,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,
/* B   */  TXT,  UNK,  JS3,  JS3,  UNK,  UNK,  UNK,  UNK,  JS4,  ASC,
/* C   */  TXT,  UNK,  UNK,  KS0,  KSD,  UNK,  UNK,  UNK,  UNK,  UNK,
/* D   */  TXT,  UNK,  UNK,  JS5,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,
/* E   */  TXT,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,
/* G   */  TXT,  UNK,  UNK,  TS0,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,
/* H   */  TXT,  UNK,  UNK,  TS1,  UNK,  UNK,  UNK,  UNK,  UNK,  JS0,
/* I   */  TXT,  UNK,  UNK,  TS2,  UNK,  UNK,  UNK,  UNK,  UNK,  JS1,
/* J   */  TXT,  UNK,  UNK,  TS3,  UNK,  UNK,  UNK,  UNK,  UNK,  JS0,
/* K   */  TXT,  UNK,  UNK,  TS4,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,
/* L   */  TXT,  UNK,  UNK,  TS5,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,
/* M   */  TXT,  UNK,  UNK,  TS6,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,
/* T   */  TXT,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,  CS0,
/* unk */  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,
/* eof */  FIN,  EOI,  EOI,  EOI,  EOI,  EOI,  EOI,  EOI,  EOI,  EOI,
/* err */  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,
};


// Also for ISO-2022 out.  Build arrays of possible character
// sets for each type of input character set.  Character sets
// should appear in order of hit probability (e.g., in 2022-Jp
// JS3 is the most common set).  Mark the end of array with -1.
// (Only store these for non-ascii sets).
//
// Only the Japanese list is populated; the other three hold just
// the -1 terminator.
//
// China (icetIso2022Cn)
static int _rgceCn[] = { -1, };

// Japan (icetIso2022Jp)
static int _rgceJp[] = { csJS3, csJS1, csJS5, -1, };

// Korea (icetIso2022Kr)
static int _rgceKr[] = { -1, };

// Taiwan (icetIso2022Tw)
static int _rgceTw[] = { -1, };

// Maps an ICET value to its character-set list above; encodings that
// are not ISO-2022 flavors map to a null pointer.
// NOTE(review): the slot order relies on the ICET enumeration, which is
// declared outside this file -- keep the per-slot comments in sync with it.
static int *_mpicetrgce[icetCount] =
    {
    0,              // icetEucCn
    0,              // icetEucJp
    0,              // icetEucKr
    0,              // icetEucTw
    _rgceCn,        // icetIso2022Cn
    _rgceJp,        // icetIso2022Jp
    _rgceKr,        // icetIso2022Kr
    _rgceTw,        // icetIso2022Tw
    0,              // icetBig5
    0,              // icetGbk
    0,              // icetShiftJis
    0,              // icetWansung
    0,              // icetUtf8
    };

/* _ J T K  G E T  N E X T */
/*----------------------------------------------------------------------------
    %%Function: _JtkGetNext
    %%Contact: jpick

    Read one byte from pstmIn into *puch and classify it.

    Returns:
        err   -- the stream reported a failure HRESULT
        eof   -- the read did not fail but delivered no byte
        other -- the token class of the byte, taken from _rgjtkCharClass

    *puch is meaningful only when a character-class token is returned.
----------------------------------------------------------------------------*/
static JTK __inline _JtkGetNext(IStream *pstmIn, PUCHAR puch)
{
    // Start from zero so that a stream which leaves the count untouched
    // is seen as having delivered nothing.
    ULONG cbRead = 0;
    HRESULT hr = pstmIn->Read(puch, 1, &cbRead);

    // Only genuine failures are read errors.  A stream that reaches its
    // end may answer with S_FALSE (a success code) and zero bytes read;
    // that is end-of-input, not an error, and must reach the eof test.
    if (FAILED(hr))
        return err;

    if (cbRead == 0)
        return eof;

    return _rgjtkCharClass[*puch];
}

/* C C E  R E A D  E S C  S E Q */
/*----------------------------------------------------------------------------
    %%Function: CceReadEscSeq
    %%Contact: jpick

    Feeds bytes from pstmIn through the escape-sequence state machine
    (_rgchNextState), starting in ST0, until a final state is reached,
    and reports which ISO-2022 flavor that final state identifies.

    Returns:
        cceSuccess          -- a designator was recognized; *lpicet is set
                               to icetIso2022Jp, icetIso2022Cn,
                               icetIso2022Tw or icetIso2022Kr
        cceMayBeAscii       -- text, extended char, ASCII designator, shift
                               or end of input in the start state; *lpicet
                               is not written
        cceRead             -- the lexer reported a read error
        cceUnknownInput     -- unexpected character, or end of input in the
                               middle of a sequence
        cceInvalidParameter -- (DEBUG builds only) a null argument
----------------------------------------------------------------------------*/
CCE CceReadEscSeq(IStream *pstmIn, ICET *lpicet)
{
    UCHAR uchIn;
    JTK jtkIn;
    CCE cceResult;
    TST tstNow = ST0;
#ifdef DEBUG
    TST tstBefore;
#endif

#ifdef DEBUG
    // Argument validation is compiled into debug builds only.
    //
    if (!pstmIn || !lpicet)
        return cceInvalidParameter;
#endif

    // Consume input until the machine lands on a final state.
    //
    for (;;)
        {
        // Get the next character and classify it.
        //
        jtkIn = _JtkGetNext(pstmIn, &uchIn);

#ifdef DEBUG
        // Kept only so the previous state can be inspected in a debugger.
        //
        tstBefore = tstNow;
#endif
        // The table is inverted: token first, then current state.
        //
        tstNow = _rgchNextState[jtkIn][tstNow];

        if (IsFinal(tstNow))
            break;
        }

    // Translate the final state into a result code and, for a
    // recognized designator, the encoding flavor.
    //
    switch (tstNow)
        {
        case JS0:           // JIS-Roman
        case JS1:           // Half-Width Katakana
        case JS2:           // JIS C 6226-1978
        case JS3:           // JIS X 0208-1983
        case JS4:           // JIS X 0208-1990
        case JS5:           // JIS X 0212-1990
            *lpicet = icetIso2022Jp;
            cceResult = cceSuccess;
            break;

        case CS0:           // GB 1988-89 Roman
        case CS1:           // GB 2312-80
            *lpicet = icetIso2022Cn;
            cceResult = cceSuccess;
            break;

        case TS0:           // CNS 11643-1992 Plane 1
        case TS1:           // CNS 11643-1992 Plane 2
        case TS2:           // CNS 11643-1992 Plane 3
        case TS3:           // CNS 11643-1992 Plane 4
        case TS4:           // CNS 11643-1992 Plane 5
        case TS5:           // CNS 11643-1992 Plane 6
        case TS6:           // CNS 11643-1992 Plane 7
            *lpicet = icetIso2022Tw;
            cceResult = cceSuccess;
            break;

        case KS0:           // KS C 5601-1992
        case KSD:           // ISO-2022-KR Document Signal
            *lpicet = icetIso2022Kr;
            cceResult = cceSuccess;
            break;

        case ASC:           // Ascii
        case LSO:
        case LSI:
        case TXT:
        case EXT:
        case FIN:
            // Insufficient information to choose a flavor ...
            cceResult = cceMayBeAscii;
            break;

        case ERR:
            cceResult = cceRead;
            break;

        default:            // UNK, EOI
            cceResult = cceUnknownInput;
            break;
        }

    return cceResult;
}