windows-server-2003/base/win32/winnls/data/dlls/tools/gb18030/gbunicnv/rtfparser.cpp

// ==========================================================================================
//  RtfParser.cpp
//
//  Implementation of the RTF parser
//
//  History:
//      first created   
// ==========================================================================================
//#include <windows.h>

#include "stdafx.h"
#include <stdio.h>
#include <assert.h>

#include "rtfparser.h"
#include "ConvEng.h"

//extern BOOL MapFunc(PBYTE, UINT, PBYTE, UINT*);

// Byte sequence that fRTFFile() compares against the start of the input buffer.
const char szRTFSignature[] = "{\\rtf";

// Keyword descriptions
// One row per control word the parser treats specially: the keyword text,
// its class (kwd) and the class-specific index (idx) that TranslateKeyword()
// dispatches on.  Control words that are not listed are simply echoed.
SYM g_rgSymRtf[] = {
//    keyword       kwd         idx
    { "*",          kwdSpec,    ipfnSkipDest },
    { "'",          kwdSpec,    ipfnHex      },
    { "bin",        kwdSpec,    ipfnBin      },
    { "upr",        kwdDest,    idestSkip    },
    { "fonttbl",    kwdDest,    idestSkip    },
/*
// we will search through following destinations
    "author",   kwdDest,    idestSkip,
    "buptim",   kwdDest,    idestSkip,
    "colortbl", kwdDest,    idestSkip,
    "comment",  kwdDest,    idestSkip,
    "creatim",  kwdDest,    idestSkip,
    "doccomm",  kwdDest,    idestSkip,
    "fonttbl",  kwdDest,    idestSkip,
    "footer",   kwdDest,    idestSkip,
    "footerf",  kwdDest,    idestSkip,
    "footerl",  kwdDest,    idestSkip,
    "footerr",  kwdDest,    idestSkip,
    "footnote", kwdDest,    idestSkip,
    "ftncn",    kwdDest,    idestSkip,
    "ftnsep",   kwdDest,    idestSkip,
    "ftnsepc",  kwdDest,    idestSkip,
    "header",   kwdDest,    idestSkip,
    "headerf",  kwdDest,    idestSkip,
    "headerl",  kwdDest,    idestSkip,
    "headerr",  kwdDest,    idestSkip,
    "info",     kwdDest,    idestSkip,
    "keywords", kwdDest,    idestSkip,
    "operator", kwdDest,    idestSkip,
    "pict",     kwdDest,    idestSkip,
    "printim",  kwdDest,    idestSkip,
    "private1", kwdDest,    idestSkip,
    "revtim",   kwdDest,    idestSkip,
    "rxe",      kwdDest,    idestSkip,
    "stylesheet",    kwdDest,    idestSkip,
    "subject",  kwdDest,    idestSkip,
    "tc",       kwdDest,    idestSkip,
    "title",    kwdDest,    idestSkip,
    "txe",      kwdDest,    idestSkip,
    "xe",       kwdDest,    idestSkip,
*/
};

// Number of rows in g_rgSymRtf.
int g_iSymMax = sizeof(g_rgSymRtf) / sizeof(g_rgSymRtf[0]);

// ctor
// Remembers the caller-supplied input and output buffers, resets the parser
// state, and marks the parser as initialized only when both buffers are
// non-NULL and both lengths are non-zero.
CRtfParser::CRtfParser( BYTE* pchInput, UINT cchInput, 
                        BYTE* pchOutput, UINT cchOutput)
{
    m_pchInput  = pchInput;
    m_cchInput  = cchInput;
    m_pchOutput = pchOutput;
    m_cchOutput = cchOutput;

    Reset();

    m_fInit = (pchInput && pchOutput && cchInput && cchOutput) ? TRUE : FALSE;
}

// Reset
// Puts every piece of per-run parser state back to its starting value so a
// new pass over the input can begin.  The buffers and m_fInit are untouched.
// Note: the state-stack head is only set to NULL here, nothing is freed.
void CRtfParser::Reset(void)
{
    // keyword search request / result
    memset(&m_sKeyword, 0, sizeof(SKeyword));

    // input / output positions and the pending conversion range
    m_uCursor    = 0;
    m_uOutPos    = 0;
    m_uConvStart = 0;
    m_cchConvLen = 0;
    m_bsStatus   = bsDefault;

    // group stack and parse state
    m_psave          = NULL;
    m_cGroup         = 0;
    m_cbBin          = 0;
    m_rds            = rdsNorm;
    m_ris            = risNorm;
    m_fSkipDestIfUnk = FALSE;
}

// check signature
// Returns TRUE when the parser was initialized and the input buffer starts
// with szRTFSignature; FALSE otherwise.  Input shorter than the signature
// is rejected up front so memcmp never reads past the end of the buffer.
BOOL CRtfParser::fRTFFile()
{
    const UINT cchSignature = (UINT) strlen(szRTFSignature);

    if (!m_fInit || m_cchInput < cchSignature)
    {
        return FALSE;
    }

    if (0 == memcmp(m_pchInput, szRTFSignature, cchSignature)) 
    {
        return TRUE;
    }

    return FALSE;
}

// Get major version
// Runs the parser in keyword-search mode looking for the \rtf control word
// and stores its numeric parameter in *pdwVersion.  *pdwVersion is 1 when
// the keyword or its parameter is not found.  Returns the code from Do().
// The parser state is reset before returning.
int
CRtfParser::GetVersion(PDWORD pdwVersion)
{
    int ec;

    *pdwVersion = 1;

    // set keyword to get
    m_sKeyword.wStatus |= KW_ENABLE;
    strcpy(m_sKeyword.szKeyword, "rtf");
    
    ec = Do();

    if (ec == ecOK && 
        (m_sKeyword.wStatus & KW_FOUND) && 
        (m_sKeyword.wStatus & KW_PARAM)) 
    {
        *pdwVersion = (DWORD) atoi(m_sKeyword.szParameter);
    }

    // Do() can return with groups still open (an error return or an
    // unmatched brace).  Free those SAVE nodes first: Reset() only clears
    // the list head and would otherwise leak them.
    ReleaseRtfState();
    Reset();

    return ec;
}

// GetCodepage
// Runs the parser in keyword-search mode looking for the \ansicpg control
// word and stores its numeric parameter in *pdwCodepage.  *pdwCodepage is 0
// when the keyword or its parameter is not found.  Returns the code from
// Do().  The parser state is reset before returning.
int
CRtfParser::GetCodepage(PDWORD pdwCodepage)
{
    int ec;
    
    *pdwCodepage = 0;

    // set keyword to get
    m_sKeyword.wStatus |= KW_ENABLE;
    strcpy(m_sKeyword.szKeyword, "ansicpg");
    
    ec = Do();

    if (ec == ecOK && 
        (m_sKeyword.wStatus & KW_FOUND) && 
        (m_sKeyword.wStatus & KW_PARAM)) 
    {
        *pdwCodepage = atoi(m_sKeyword.szParameter);
    }

    // Do() can return with groups still open (an error return or an
    // unmatched brace).  Free those SAVE nodes first: Reset() only clears
    // the list head and would otherwise leak them.
    ReleaseRtfState();
    Reset();

    return ec;
}

// do
// main parser function
int 
CRtfParser::Do()
{
    int ec;
    int cNibble = 2;
    BYTE ch;

    BSTATUS bsStatus;

    while ((ec = GetByte(&ch)) == ecOK)
    {
        if (m_cGroup < 0)
            return ecStackUnderflow;

        // check if search specific keyword
        if (m_sKeyword.wStatus & KW_ENABLE) {
            if (m_sKeyword.wStatus & KW_FOUND) {
                ReleaseRtfState();
                break;
            }
        }
        // set buf status
        bsStatus = bsDefault;

        if (m_ris == risBin)                      // if we're parsing binary data, handle it directly
        {
            // fall through
        }
        else
        {   
            switch (ch)
            {
            case '{':
                if ((ec = PushRtfState()) != ecOK)
                    return ec;
                break;
            case '}':
                if ((ec = PopRtfState()) != ecOK)
                    return ec;
                break;
            case '\\':
                if ((ec = ParseRtfKeyword()) != ecOK)
                    return ec;
                continue;  // all keyword is processed in ParseRtfKeyword
            case 0x0d:
            case 0x0a:          // cr and lf are noise characters...
                break;
            default:
                if (m_ris == risNorm )
                {
                    bsStatus = bsText;
                } else if (m_ris == risHex)
                {
                    cNibble--;
                    if (!cNibble) {
                        cNibble = 2;
                        m_ris = risNorm;
                    }
                    bsStatus = bsHex;
                } else {
                    return ecAssertion;
                }
                break;
            }       // switch
        }           // else (ris != risBin)

        if ((ec = ParseChar(ch, bsStatus)) != ecOK)
            return ec;
    }               // while
    if (m_cGroup < 0)
        return ecStackUnderflow;
    if (m_cGroup > 0)
        return ecUnmatchedBrace;
    return ecOK;
}


//
// PushRtfState
//
// Save relevant info on a linked list of SAVE structures.
//

int
CRtfParser::PushRtfState(void)
{
    SAVE *psaveNew = new SAVE;
    if (!psaveNew)
        return ecStackOverflow;

    psaveNew -> pNext = m_psave;
    psaveNew -> rds = m_rds;
    psaveNew -> ris = m_ris;
    m_ris = risNorm;
    // do not save rds, rds status spread to sub destination until this destination
    //  terminated
    m_psave = psaveNew;
    m_cGroup++;
    return ecOK;
}

//
// PopRtfState
//
// If we're ending a destination (that is, the destination is changing),
// call ecEndGroupAction.
// Always restore relevant info from the top of the SAVE list.
//

int
CRtfParser::PopRtfState(void)
{
    SAVE *psaveOld;

    if (!m_psave)
        return ecStackUnderflow;

    if (m_rds != m_psave->rds)
    {  // todo:
//        if ((ec = EndGroupAction(rds)) != ecOK)
//            return ec;
    }

    m_rds = m_psave->rds;
    m_ris = m_psave->ris;

    psaveOld = m_psave;
    m_psave = m_psave->pNext;
    m_cGroup--;
    delete psaveOld;
    return ecOK;
}

//
// ReleaseRtfState
// when find specific keyword and want to abort the parser abnormally
// call this function to flash the state stack
//

int CRtfParser::ReleaseRtfState(void)
{
    SAVE *psaveOld;

    while(psaveOld = m_psave)
    {
        assert(m_cGroup);
        m_psave = m_psave->pNext;
        m_cGroup--;
        delete psaveOld;
    }

    return ecOK;
}


//
// ParseChar
//
// Stores one byte in the output buffer.  While in binary mode each call
// counts one byte off m_cbBin and leaves binary mode when the count runs
// out.  Inside a skipped destination the byte is always tagged bsDefault.
// SetStatus() is called first so a change of buffer status can trigger a
// conversion.  Returns the first error from SetStatus() or SaveByte().
//

int
CRtfParser::ParseChar(BYTE ch, BSTATUS bsStatus)
{
    int ec;

    if (m_ris == risBin)
    {
        --m_cbBin;
        if (m_cbBin <= 0)
            m_ris = risNorm;
    }

    // bytes of a skipped destination never count as text or hex
    if (m_rds == rdsSkip)
        bsStatus = bsDefault;

    ec = SetStatus(bsStatus);
    if (ec == ecOK)
        ec = SaveByte(ch);

    return ec;
}

//
// ParseRtfKeyword
//
// get a control word (and its associated value) and
// call TranslateKeyword to dispatch the control.
//

int
CRtfParser::ParseRtfKeyword()
{
    BOOL fNeg = FALSE;
    char *pch;
    char szKeyword[30];
    char szParameter[20];
    BYTE ch;

    szKeyword[0] = '\0';
    szParameter[0] = '\0';

    if (GetByte(&ch) != ecOK)
        return ecEndOfFile;

    if (!isalpha(ch))           // a control symbol; no delimiter.
    {
        szKeyword[0] = (char) ch;
        szKeyword[1] = '\0';
        return TranslateKeyword(szKeyword, szParameter);
    }
    for (pch = szKeyword; isalpha(ch); GetByte(&ch))
        *pch++ = (char) ch;
    *pch = '\0';
    if (ch == '-')
    {
        fNeg  = TRUE;
        if (GetByte(&ch) != ecOK)
            return ecEndOfFile;
    }
    if (isdigit(ch))
    {
        pch = szParameter;
        if (fNeg) *pch++ = '-';
        for (; isdigit(ch); GetByte(&ch))
            *pch++ = (char) ch;
        *pch = '\0';
    }
    if (ch != ' ') {
        unGetByte(ch);
    } else {
        strcat(szParameter, " ");  // append the space to keyword
    }

    return TranslateKeyword(szKeyword, szParameter);
}

//
// TranslateKeyword.
// Inputs:
// szKeyword:   The RTF control to evaluate.

int
CRtfParser::TranslateKeyword(char *szKeyword, char* szParameter)
{
    BSTATUS bsStatus;
    int     isym;
    int     ec;
    BYTE    ch;

    // check specific keyword first
    if (m_sKeyword.wStatus & KW_ENABLE) 
    {
        if (strcmp(szKeyword, m_sKeyword.szKeyword) == 0) 
        {
            strcpy(m_sKeyword.szParameter, szParameter);
            if (szParameter[0] != '\0' && szParameter[0] != ' ')
                m_sKeyword.wStatus |= KW_PARAM;
            m_sKeyword.wStatus |= KW_FOUND;
            return ecOK;
        }
    }

    // search for szKeyword in rgsymRtf
    for (isym = 0; isym < g_iSymMax; isym++) {
        if (strcmp(szKeyword, g_rgSymRtf[isym].szKeyword) == 0)
            break;
    }

    if (isym == g_iSymMax)            // control word not found
    {
        if (m_fSkipDestIfUnk)         // if this is a new destination
            m_rds = rdsSkip;          // skip the destination
                                    // else just discard it
        m_fSkipDestIfUnk = FALSE;
        ec =  ecOK;
        goto gotoExit;
    }

    // found it!  use kwd and idx to determine what to do with it.

    m_fSkipDestIfUnk = FALSE;
    switch (g_rgSymRtf[isym].kwd)
    {
        case kwdChar:
            break;
        case kwdDest:
            ec = ChangeDest((IDEST)g_rgSymRtf[isym].idx);
            break;
        case kwdSpec:
            ec = ParseSpecialKeyword((IPFN)g_rgSymRtf[isym].idx, szParameter);
            break;
        default:
            ec = ecBadTable;
    }

gotoExit:
    // save keyword and parameter
    if (m_ris == risHex) {
        bsStatus = bsHex;
    } else {
        bsStatus =bsDefault;
    }
    ParseChar('\\', bsStatus);
    while (ch = *szKeyword++) ParseChar(ch, bsStatus);
    while (ch = *szParameter++) ParseChar(ch, bsStatus);

    return ec;
}

//
// ParseSpecialKeyword
//
// Handles the control words that change parser state:
//   ipfnBin      - enter binary mode; byte count is the keyword parameter
//   ipfnSkipDest - remember that an unknown keyword following it marks a
//                  destination to skip
//   ipfnHex      - enter hex mode
// Inside a skipped destination everything except ipfnBin is ignored.
// Returns ecBadTable for any other index.
//

int
CRtfParser::ParseSpecialKeyword(IPFN ipfn, char* szParameter)
{
    // if we're skipping, only the \bin keyword still matters
    if (ipfn != ipfnBin && m_rds == rdsSkip)
        return ecOK;

    if (ipfn == ipfnBin)
    {
        m_cbBin = atol(szParameter);
        m_ris = risBin;
        return ecOK;
    }

    if (ipfn == ipfnSkipDest)
    {
        m_fSkipDestIfUnk = TRUE;
        return ecOK;
    }

    if (ipfn == ipfnHex)
    {
        m_ris = risHex;
        return ecOK;
    }

    return ecBadTable;
}

//
// ChangeDest
//
// Switches to the destination identified by idest.  Every destination is
// currently mapped to "skip"; nothing happens when a skip is already in
// progress.  Always returns ecOK.
//

int
CRtfParser::ChangeDest(IDEST idest)
{
    if (m_rds != rdsSkip)
    {
        switch (idest)
        {
            case idestPict:
            case idestSkip:
            default:
                // when in doubt, skip it...
                m_rds = rdsSkip;
                break;
        }
    }

    return ecOK;
}


//
// GetByte
//
// Copies the byte at the input cursor into *pch and advances the cursor.
// Returns ecEndOfFile, leaving *pch untouched, once the cursor has reached
// the end of the input buffer.
//

int
CRtfParser::GetByte(BYTE* pch)
{
    if (m_uCursor < m_cchInput)
    {
        *pch = m_pchInput[m_uCursor];
        ++m_uCursor;
        return ecOK;
    }

    return ecEndOfFile;
}

//
// unGetByte
//
// Steps the input cursor back by one position unless it is already at the
// start.  The ch argument is not used.  Always returns ecOK.
//

int
CRtfParser::unGetByte(BYTE ch)
{
    if (m_uCursor != 0)
        --m_uCursor;

    return ecOK;
}


//
// SaveByte
//
// Appends one byte to the output buffer and extends the pending conversion
// range by one.  Returns ecBufTooSmall when the output buffer is full.
//

int
CRtfParser::SaveByte(BYTE ch)
{
    if (m_uOutPos < m_cchOutput)
    {
        m_pchOutput[m_uOutPos] = ch;
        ++m_uOutPos;        // output buffer ++
        ++m_cchConvLen;     // mapping range also ++
        return ecOK;
    }

    return ecBufTooSmall;
}


//
// SetStatus
//
// Sets the buffer status.  When the status changes, the bytes gathered
// under the previous status (m_cchConvLen bytes starting at m_uConvStart in
// the output buffer) are finished off:
//   bsDefault / bsText - left as they are
//   bsHex              - the "\'xx\'xx" run is replaced in place by
//                        {\upr{\'xx\'xx}{\*\ud{\uc0 \uN\uN...}}}
// Returns ecOutOfMemory when a work buffer cannot be allocated,
// ecBufTooSmall when the replacement does not fit into the output buffer,
// ecAssertion for an unknown previous status, otherwise ecOK.
//

int
CRtfParser::SetStatus(BSTATUS bsStatus)
{
    PBYTE pchDBCS, pchWCHAR, pchUniDes;
    UINT  cchLen;

    assert(m_uOutPos == m_uConvStart + m_cchConvLen);

    if (bsStatus != m_bsStatus) 
    {
        switch(m_bsStatus) 
        {
            case bsDefault:
                // control symbol, keyword, group char...
                break;

            case bsText:
                // here we got Ansi text
                // we do not do conversion for ansi text
                break;

            case bsHex:
                // when we are here, 
                // the rtf contains DBCS chars like "\'xx\'xx"
                // we only need to do DBCS->Unicode conversion, since we can not get
                // \upr keyword here (\upr is skipped, see keyword table)

                // we will map DBCS string "\'xx\'xx" to
                // "{\upr{"\'xx\'xx"}{\*\ud{\uc0 "Unicode string"}}}
                // in which Unicode string is like this:
                // \u12345\u-12345....
                // rtf treat unicode value as signed 16-bit decimal
                // so we don't distinguish 16-bit or 32-bit wide char, all
                // processed as 2-byte WCHAR

                if (m_cchConvLen == 0) {
                    break;
                }

                // one allocation holds both work areas:
                //   pchDBCS  : m_cchConvLen bytes
                //   pchWCHAR : m_cchConvLen * 2 + 8 bytes
                pchDBCS = new BYTE[m_cchConvLen * 3 + 8];
                if (!pchDBCS) return ecOutOfMemory;
                
                pchWCHAR = pchDBCS + m_cchConvLen; 

                // map Hex string to DBCS string
                // return cchLen in Byte (0 when the hex run is malformed)
                Hex2Char(m_pchOutput + m_uConvStart, m_cchConvLen, pchDBCS, m_cchConvLen, &cchLen);
                
                // map DBCS string to Unicode string
                // return cchLen in WCHAR
                cchLen = AnsiStrToUnicodeStr(pchDBCS, cchLen, (PWCH)pchWCHAR, cchLen+4);

                // allocate a buffer for unicode destination
                // since one WCHAR map to max \u-xxxxx, that's 8 bytes
                // adding other 20 bytes for surrounding keywords and group chars
                // adding DBCS strings
                pchUniDes = new BYTE[cchLen * 8 + 32 + m_cchConvLen];
                if (!pchUniDes) {
                    delete [] pchDBCS;
                    return ecOutOfMemory;
                }

                // map to unicode destination
                GetUnicodeDestination(pchUniDes, (LPWSTR)pchWCHAR, cchLen, &cchLen);

                // The destination is longer than the hex run it replaces.
                // Make sure it fits before writing it over the old data.
                if (cchLen > m_cchOutput - m_uConvStart) {
                    delete [] pchDBCS;
                    delete [] pchUniDes;
                    return ecBufTooSmall;
                }

                // replace old hex with new hex
                memcpy(m_pchOutput + m_uConvStart, pchUniDes, cchLen);
                m_uConvStart += cchLen;

                // set new output position
                m_uOutPos = m_uConvStart;

                delete [] pchDBCS;
                delete [] pchUniDes;
                break;

            default:
                assert(0);
                return ecAssertion;
        }

        // clean map buffer
        m_uConvStart = m_uOutPos;
        m_cchConvLen = 0;

        // set status
        m_bsStatus = bsStatus;
    }

    return ecOK;
}

//
// Hex2Char
//
// Converts an RTF hex run of the form \'xx\'xx... into raw bytes.
//   pchSrc / cchSrc : the hex run and its length in bytes
//   pchDes / cchDes : destination buffer and its capacity
//   pcchLen         : receives the number of bytes written (0 on failure)
// Returns ecOK, or ecInvalidHex when the destination is too small, a
// backslash is not followed by an apostrophe, or a character is not a hex
// digit.
//

int
CRtfParser::Hex2Char(BYTE* pchSrc, UINT cchSrc, BYTE* pchDes, UINT cchDes, UINT* pcchLen)
{
    BYTE* pchTmp = pchDes;
    BYTE ch;
    BYTE b = 0;
    BYTE cNibble = 2;

    // should be \'xx\'xx\'xx
    assert (cchSrc % 4 == 0);
    *pcchLen = 0;
    if (cchDes < cchSrc/4) {
        goto gotoError;
    }

    while (cchSrc--) 
    {
        ch = *pchSrc++;
        if (ch == '\\') {
            // cchSrc now counts the bytes that are left; a trailing
            // backslash has no successor to look at
            if (cchSrc == 0 || *pchSrc != '\'') {
                goto gotoError;
            }
        } else if (ch == '\'') { 
            // separator, nothing to do
        } 
        else 
        {
            b = (BYTE)(b << 4);
            if (ch >= '0' && ch <= '9')
                b = (BYTE)(b + (ch - '0'));
            else if (ch >= 'a' && ch <= 'f')
                b = (BYTE)(b + (ch - 'a' + 10));
            else if (ch >= 'A' && ch <= 'F')
                b = (BYTE)(b + (ch - 'A' + 10));
            else
                goto gotoError;

            cNibble--;
            if (!cNibble)
            {
                // input without the \' separators yields more bytes than
                // cchSrc/4; never write past the destination
                if ((UINT)(pchDes - pchTmp) >= cchDes)
                    goto gotoError;
                *pchDes++ = b;
                cNibble = 2;
                b = 0;
            }
        }
    }

    *pcchLen = (UINT)(pchDes - pchTmp);
    return ecOK;

gotoError:
    assert(0);
    return ecInvalidHex;
}


// Low / high four bits of a byte value.  The argument is parenthesized so
// that an expression argument such as LONIBBLE(a | b) expands correctly.
#define LONIBBLE(c) ((c) & 0x0f)
#define HINIBBLE(c) (((c) & 0xf0) >> 4)

//
// Char2Hex
//
// Converts raw bytes into an RTF hex run: every source byte becomes the
// four characters \'xx with lower-case hex digits.
//   pchSrc / cchSrc : source bytes and their count
//   pchDes / cchDes : destination buffer and its capacity
//   pcchLen         : receives the number of bytes written (0 on failure)
// Returns ecOK, or ecInvalidHex when the destination holds fewer than
// cchSrc * 4 bytes.
//

int  
CRtfParser::Char2Hex(BYTE* pchSrc, UINT cchSrc, BYTE* pchDes, UINT cchDes, UINT* pcchLen)
{
    static const char rgchHexDigit[] = "0123456789abcdef";

    BYTE* pchStart = pchDes;
    BYTE  bCur;

    *pcchLen = 0;
    if (cchDes < cchSrc * 4) {
        goto gotoError;
    }

    for (; cchSrc != 0; cchSrc--)
    {
        bCur = *pchSrc++;

        *pchDes++ = '\\';
        *pchDes++ = '\'';
        *pchDes++ = (BYTE) rgchHexDigit[HINIBBLE(bCur)];
        *pchDes++ = (BYTE) rgchHexDigit[LONIBBLE(bCur)];
    }

    *pcchLen = (UINT)(pchDes - pchStart);
    return ecOK;

gotoError:
    assert(0);
    return ecInvalidHex;
}


//
// GetUnicodeDestination
//
// Builds the RTF unicode destination for the pending hex run:
//     {\upr{\'xx\'xx}{\*\ud{\uc0 \u12345\u-12345}}}
// The \'xx part is copied from the pending conversion range of the output
// buffer (m_uConvStart, m_cchConvLen); the \u part is generated from the
// wchLen wide chars in pwchStr.  The result is written to pchUniDes, which
// the caller must have sized, and its length is stored in *pcchLen.
// Always returns ecOK.
//

int
CRtfParser::GetUnicodeDestination(BYTE* pchUniDes, LPWSTR pwchStr, UINT wchLen, UINT* pcchLen)
{
    static char szOpen[]   = "{\\upr{";
    static char szMiddle[] = "}{\\*\\ud{\\uc0 ";
    static char szClose[]  = "}}}";

    BYTE* pchCur = pchUniDes;       // next write position
    UINT  cchPiece;

    // opening \upr group
    cchPiece = (UINT) strlen(szOpen);
    memcpy(pchCur, szOpen, cchPiece);
    pchCur += cchPiece;

    // the original DBCS hex run
    memcpy(pchCur, m_pchOutput + m_uConvStart, m_cchConvLen);
    pchCur += m_cchConvLen;

    // close the ANSI part, open the unicode part
    cchPiece = (UINT) strlen(szMiddle);
    memcpy(pchCur, szMiddle, cchPiece);
    pchCur += cchPiece;

    // one \u keyword per wide char
    for (UINT iwch = 0; iwch < wchLen; iwch++)
    {
        WideCharToKeyword(pwchStr[iwch], pchCur, &cchPiece);
        pchCur += cchPiece;
    }

    // closing braces
    cchPiece = (UINT) strlen(szClose);
    memcpy(pchCur, szClose, cchPiece);
    pchCur += cchPiece;

    *pcchLen = (UINT)(pchCur - pchUniDes);

    return ecOK;
}


//
// WideCharToKeyword
//
// Writes one wide char to pchDes as the RTF keyword \uN, where N is the
// character value printed as a signed 16-bit decimal.  A terminating NUL
// is written as well; *pcchLen receives the length without the NUL.
// Always returns ecOK.
//

int 
CRtfParser::WideCharToKeyword(WCHAR wch, BYTE* pchDes, UINT* pcchLen)
{
    char* szOut   = (char*) pchDes;
    short nSigned = (short) wch;

    int cchWritten = sprintf(szOut, "\\u%d", nSigned);

    *pcchLen = (UINT) cchWritten;

    return ecOK;
}