// ========================================================================================== // RtfParser.cpp // // Impl RTF parser // // History: // first created // ========================================================================================== //#include #include "stdafx.h" #include #include #include "rtfparser.h" #include "ConvEng.h" //extern BOOL MapFunc(PBYTE, UINT, PBYTE, UINT*); const char szRTFSignature[] = "{\\rtf"; // Keyword descriptions SYM g_rgSymRtf[] = { // keyword kwd idx "*", kwdSpec, ipfnSkipDest, "'", kwdSpec, ipfnHex, "bin", kwdSpec, ipfnBin, "upr", kwdDest, idestSkip, "fonttbl", kwdDest, idestSkip, /* // we will search through following destinations "author", kwdDest, idestSkip, "buptim", kwdDest, idestSkip, "colortbl", kwdDest, idestSkip, "comment", kwdDest, idestSkip, "creatim", kwdDest, idestSkip, "doccomm", kwdDest, idestSkip, "fonttbl", kwdDest, idestSkip, "footer", kwdDest, idestSkip, "footerf", kwdDest, idestSkip, "footerl", kwdDest, idestSkip, "footerr", kwdDest, idestSkip, "footnote", kwdDest, idestSkip, "ftncn", kwdDest, idestSkip, "ftnsep", kwdDest, idestSkip, "ftnsepc", kwdDest, idestSkip, "header", kwdDest, idestSkip, "headerf", kwdDest, idestSkip, "headerl", kwdDest, idestSkip, "headerr", kwdDest, idestSkip, "info", kwdDest, idestSkip, "keywords", kwdDest, idestSkip, "operator", kwdDest, idestSkip, "pict", kwdDest, idestSkip, "printim", kwdDest, idestSkip, "private1", kwdDest, idestSkip, "revtim", kwdDest, idestSkip, "rxe", kwdDest, idestSkip, "stylesheet", kwdDest, idestSkip, "subject", kwdDest, idestSkip, "tc", kwdDest, idestSkip, "title", kwdDest, idestSkip, "txe", kwdDest, idestSkip, "xe", kwdDest, idestSkip, */ }; int g_iSymMax = sizeof(g_rgSymRtf) / sizeof(SYM); // ctor CRtfParser::CRtfParser( BYTE* pchInput, UINT cchInput, BYTE* pchOutput, UINT cchOutput) { m_fInit = FALSE; m_pchInput = pchInput; m_cchInput = cchInput; m_pchOutput = pchOutput; m_cchOutput = cchOutput; Reset(); if (pchInput && pchOutput && cchInput && cchOutput) { m_fInit = TRUE; } } // Reset // clean internal status before start the parser void CRtfParser::Reset(void) { m_cGroup = 0; m_cbBin = 0; m_fSkipDestIfUnk = FALSE; m_ris = risNorm; m_rds = rdsNorm; m_psave = NULL; m_uCursor = 0; m_uOutPos = 0; m_bsStatus = bsDefault; m_uConvStart = 0; m_cchConvLen = 0; memset(&m_sKeyword,0, sizeof(SKeyword)); } // check signature BOOL CRtfParser::fRTFFile() { if (m_fInit && 0 == memcmp(m_pchInput, szRTFSignature, strlen(szRTFSignature))) { return TRUE; } return FALSE; } // Get major version int CRtfParser::GetVersion(PDWORD pdwVersion) { int ec; *pdwVersion = 1; // set keyword to get m_sKeyword.wStatus |= KW_ENABLE; strcpy(m_sKeyword.szKeyword, "rtf"); ec = Do(); if (ec == ecOK && (m_sKeyword.wStatus & KW_FOUND) && (m_sKeyword.wStatus & KW_PARAM)) { *pdwVersion = (DWORD) atoi(m_sKeyword.szParameter); } Reset(); return ec; } // GetCodepage int CRtfParser::GetCodepage(PDWORD pdwCodepage) { int ec; *pdwCodepage = 0; // set keyword to get m_sKeyword.wStatus |= KW_ENABLE; strcpy(m_sKeyword.szKeyword, "ansicpg"); ec = Do(); if (ec == ecOK && (m_sKeyword.wStatus & KW_FOUND) && (m_sKeyword.wStatus & KW_PARAM)) { *pdwCodepage = atoi(m_sKeyword.szParameter); } Reset(); return ec; } // do // main parser function int CRtfParser::Do() { int ec; int cNibble = 2; BYTE ch; BSTATUS bsStatus; while ((ec = GetByte(&ch)) == ecOK) { if (m_cGroup < 0) return ecStackUnderflow; // check if search specific keyword if (m_sKeyword.wStatus & KW_ENABLE) { if (m_sKeyword.wStatus & KW_FOUND) { ReleaseRtfState(); break; } } // set buf status bsStatus = bsDefault; if (m_ris == risBin) // if we're parsing binary data, handle it directly { // fall through } else { switch (ch) { case '{': if ((ec = PushRtfState()) != ecOK) return ec; break; case '}': if ((ec = PopRtfState()) != ecOK) return ec; break; case '\\': if ((ec = ParseRtfKeyword()) != ecOK) return ec; continue; // all keyword is processed in ParseRtfKeyword case 0x0d: case 0x0a: // cr and lf are noise characters... break; default: if (m_ris == risNorm ) { bsStatus = bsText; } else if (m_ris == risHex) { cNibble--; if (!cNibble) { cNibble = 2; m_ris = risNorm; } bsStatus = bsHex; } else { return ecAssertion; } break; } // switch } // else (ris != risBin) if ((ec = ParseChar(ch, bsStatus)) != ecOK) return ec; } // while if (m_cGroup < 0) return ecStackUnderflow; if (m_cGroup > 0) return ecUnmatchedBrace; return ecOK; } // // PushRtfState // // Save relevant info on a linked list of SAVE structures. // int CRtfParser::PushRtfState(void) { SAVE *psaveNew = new SAVE; if (!psaveNew) return ecStackOverflow; psaveNew -> pNext = m_psave; psaveNew -> rds = m_rds; psaveNew -> ris = m_ris; m_ris = risNorm; // do not save rds, rds status spread to sub destination until this destination // terminated m_psave = psaveNew; m_cGroup++; return ecOK; } // // PopRtfState // // If we're ending a destination (that is, the destination is changing), // call ecEndGroupAction. // Always restore relevant info from the top of the SAVE list. // int CRtfParser::PopRtfState(void) { SAVE *psaveOld; if (!m_psave) return ecStackUnderflow; if (m_rds != m_psave->rds) { // todo: // if ((ec = EndGroupAction(rds)) != ecOK) // return ec; } m_rds = m_psave->rds; m_ris = m_psave->ris; psaveOld = m_psave; m_psave = m_psave->pNext; m_cGroup--; delete psaveOld; return ecOK; } // // ReleaseRtfState // when find specific keyword and want to abort the parser abnormally // call this function to flash the state stack // int CRtfParser::ReleaseRtfState(void) { SAVE *psaveOld; while(psaveOld = m_psave) { assert(m_cGroup); m_psave = m_psave->pNext; m_cGroup--; delete psaveOld; } return ecOK; } // // ParseChar // // Route the character to the appropriate destination stream. // int CRtfParser::ParseChar(BYTE ch, BSTATUS bsStatus) { int ec; if (m_ris == risBin && --m_cbBin <= 0) m_ris = risNorm; switch (m_rds) { case rdsSkip: // Toss this character. bsStatus = bsDefault; break; case rdsNorm: // Output a character. Properties are valid at this point. break; default: // handle other destinations.... break; } // set status, trigger the conversion if any if ((ec = SetStatus(bsStatus)) != ecOK) { return ec; } // save the char if ((ec = SaveByte(ch)) != ecOK) { return ec; } return ec; } // // ParseRtfKeyword // // get a control word (and its associated value) and // call TranslateKeyword to dispatch the control. // int CRtfParser::ParseRtfKeyword() { BOOL fNeg = FALSE; char *pch; char szKeyword[30]; char szParameter[20]; BYTE ch; szKeyword[0] = '\0'; szParameter[0] = '\0'; if (GetByte(&ch) != ecOK) return ecEndOfFile; if (!isalpha(ch)) // a control symbol; no delimiter. { szKeyword[0] = (char) ch; szKeyword[1] = '\0'; return TranslateKeyword(szKeyword, szParameter); } for (pch = szKeyword; isalpha(ch); GetByte(&ch)) *pch++ = (char) ch; *pch = '\0'; if (ch == '-') { fNeg = TRUE; if (GetByte(&ch) != ecOK) return ecEndOfFile; } if (isdigit(ch)) { pch = szParameter; if (fNeg) *pch++ = '-'; for (; isdigit(ch); GetByte(&ch)) *pch++ = (char) ch; *pch = '\0'; } if (ch != ' ') { unGetByte(ch); } else { strcat(szParameter, " "); // append the space to keyword } return TranslateKeyword(szKeyword, szParameter); } // // TranslateKeyword. // Inputs: // szKeyword: The RTF control to evaluate. int CRtfParser::TranslateKeyword(char *szKeyword, char* szParameter) { BSTATUS bsStatus; int isym; int ec; BYTE ch; // check specific keyword first if (m_sKeyword.wStatus & KW_ENABLE) { if (strcmp(szKeyword, m_sKeyword.szKeyword) == 0) { strcpy(m_sKeyword.szParameter, szParameter); if (szParameter[0] != '\0' && szParameter[0] != ' ') m_sKeyword.wStatus |= KW_PARAM; m_sKeyword.wStatus |= KW_FOUND; return ecOK; } } // search for szKeyword in rgsymRtf for (isym = 0; isym < g_iSymMax; isym++) { if (strcmp(szKeyword, g_rgSymRtf[isym].szKeyword) == 0) break; } if (isym == g_iSymMax) // control word not found { if (m_fSkipDestIfUnk) // if this is a new destination m_rds = rdsSkip; // skip the destination // else just discard it m_fSkipDestIfUnk = FALSE; ec = ecOK; goto gotoExit; } // found it! use kwd and idx to determine what to do with it. m_fSkipDestIfUnk = FALSE; switch (g_rgSymRtf[isym].kwd) { case kwdChar: break; case kwdDest: ec = ChangeDest((IDEST)g_rgSymRtf[isym].idx); break; case kwdSpec: ec = ParseSpecialKeyword((IPFN)g_rgSymRtf[isym].idx, szParameter); break; default: ec = ecBadTable; } gotoExit: // save keyword and parameter if (m_ris == risHex) { bsStatus = bsHex; } else { bsStatus =bsDefault; } ParseChar('\\', bsStatus); while (ch = *szKeyword++) ParseChar(ch, bsStatus); while (ch = *szParameter++) ParseChar(ch, bsStatus); return ec; } // // ParseSpecialKeyword // // Evaluate an RTF control that needs special processing. // int CRtfParser::ParseSpecialKeyword(IPFN ipfn, char* szParameter) { if (m_rds == rdsSkip && ipfn != ipfnBin) // if we're skipping, and it's not return ecOK; // the \bin keyword, ignore it. switch (ipfn) { case ipfnBin: m_ris = risBin; m_cbBin = atol(szParameter); break; case ipfnSkipDest: m_fSkipDestIfUnk = TRUE; break; case ipfnHex: m_ris = risHex; break; default: return ecBadTable; } return ecOK; } // // ChangeDest // // Change to the destination specified by idest. // There's usually more to do here than this... // int CRtfParser::ChangeDest(IDEST idest) { if (m_rds == rdsSkip) // if we're skipping text, return ecOK; // don't do anything switch (idest) { case idestPict: case idestSkip: default: m_rds = rdsSkip; // when in doubt, skip it... break; } return ecOK; } // // GetByte // // Get one char from input buffer // int CRtfParser::GetByte(BYTE* pch) { if (m_uCursor >= m_cchInput) { return ecEndOfFile; } *pch = *(m_pchInput + m_uCursor); m_uCursor ++; return ecOK; } // // unGetByte // // adjust the cursor, return one char // int CRtfParser::unGetByte(BYTE ch) { if (m_uCursor) { m_uCursor--; } return ecOK; } // // SaveByte // // Save one char to output buffer // int CRtfParser::SaveByte(BYTE ch) { if (m_uOutPos >= m_cchOutput) { return ecBufTooSmall; } *(m_pchOutput + m_uOutPos) = ch; m_uOutPos++; // output buffer ++ m_cchConvLen++; // mapping range also ++ return ecOK; } // // SetStatus // // set the buffer status, if buffer status changed then start convert // int CRtfParser::SetStatus(BSTATUS bsStatus) { PBYTE pchDBCS, pchWCHAR, pchUniDes; UINT i, cchLen; assert(m_uOutPos == m_uConvStart + m_cchConvLen); if (bsStatus != m_bsStatus) { switch(m_bsStatus) { case bsDefault: // control symbol, keyword, group char... break; case bsText: // here we got Ansi text // we do not do conversion for ansi text /* pchWCHAR = new BYTE[m_cchConvLen*2 + 8]; if (!pchWCHAR) return ecOutOfMemory; MapFunc(m_pchOutput + m_uConvStart, m_cchConvLen, pchWCHAR, &cchLen); // replace old buffer with mapped buffer for (i=0; iUnicode conversion, since we can not get // \upr keyword here (\upr is skipped, see keyword table) // so the MapFunc can be only (ANSI->Unicode) converter // we will map DBCS string "\'xx\'xx" to // "{\upr{"\'xx\'xx"}{\*\ud{\uc0 "Unicode string"}}} // in which Unicode string is like this: // \u12345\u-12345.... // rtf treat unicode value as signed 16-bit decimal // so we don't distinquish 16-bit or 32-bit wide char, all // processed as 2-byte WCHAR if (m_cchConvLen == 0) { break; } pchDBCS = new BYTE[m_cchConvLen * 3 + 8]; if (!pchDBCS) return ecOutOfMemory; pchWCHAR = pchDBCS + m_cchConvLen; // length: pchDBCS = m_cchConvLen // pchWCHAR = m_cchConvLen * 2 + 8 // map Hex string to DBCS string // return cchLen in Byte Hex2Char(m_pchOutput + m_uConvStart, m_cchConvLen, pchDBCS, m_cchConvLen, &cchLen); // map DBCS string to Unicode string // return cchLen in WCHAR cchLen = AnsiStrToUnicodeStr(pchDBCS, cchLen, (PWCH)pchWCHAR, cchLen+4); // MapFunc(pchDBCS, cchLen, pchWCHAR, &cchLen); // allocate a buffer for unicode destination // since one WCHAR map to max \u-xxxxx, that's 8 bytes // adding other 20 bytes for surrounding keywords and group chars // adding DBCS strings pchUniDes = new BYTE[cchLen * 8 + 32 + m_cchConvLen]; if (!pchUniDes) { delete [] pchDBCS; return ecOutOfMemory; } // map to unicode destination GetUnicodeDestination(pchUniDes, (LPWSTR)pchWCHAR, cchLen, &cchLen); // replace old hex with new hex for (i=0; i 'f') goto gotoError; b += (char) ch - 'a' + 10; } else { if (ch < 'A' || ch > 'F') goto gotoError; b += (char) ch - 'A' + 10; } } cNibble--; if (!cNibble) { *pchDes++ = b; cNibble = 2; b = 0; } } } *pcchLen = (UINT)(pchDes - pchTmp); return ecOK; gotoError: assert(0); return ecInvalidHex; } #define LONIBBLE(c) (c&0x0f) #define HINIBBLE(c) ((c&0xf0)>>4) // // Char2Hex // // convert char string to hex string // int CRtfParser::Char2Hex(BYTE* pchSrc, UINT cchSrc, BYTE* pchDes, UINT cchDes, UINT* pcchLen) { BYTE* pchTmp = pchDes; BYTE ch,c; *pcchLen = 0; if (cchDes < cchSrc * 4) { goto gotoError; } while(cchSrc--) { *pchDes++ = '\\'; *pchDes++ = '\''; ch = *pchSrc++; c = HINIBBLE(ch); if(c>9 && c<=0xF) { c += 'a'-10; } else if (c<=9) { c += '0'; } else { goto gotoError; } *pchDes++ = c; c = LONIBBLE(ch); if(c>9 && c<=0xF) { c += 'a'-10; } else if (c<=9) { c += '0'; } else { goto gotoError; } *pchDes++ = c; } *pcchLen = (UINT)(pchDes - pchTmp); return ecOK; gotoError: assert(0); return ecInvalidHex; } // // GetUnicodeDestination // // convert unicode string to unicode destination in RTF // the format is: // "{\upr{\'xx\'xx}{\*\ud{\uc0 \u12345\u-12345}} // int CRtfParser::GetUnicodeDestination(BYTE* pchUniDes, LPWSTR pwchStr, UINT wchLen, UINT* pcchLen) { static char pch1[] = "{\\upr{"; static char pch2[] = "}{\\*\\ud{\\uc0 "; static char pch3[] = "}}}"; UINT cchLen, cchDone; // copy \upr cchLen = strlen(pch1); memcpy(pchUniDes, pch1, cchLen); // copy DBCS string memcpy(pchUniDes + cchLen, m_pchOutput+m_uConvStart, m_cchConvLen); cchLen += m_cchConvLen; // copy middle part memcpy(pchUniDes + cchLen, pch2, strlen(pch2)); cchLen += strlen(pch2); // copy unicode string for (UINT i=0; i