Source code of Windows XP (NT5)
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1033 lines
24 KiB

  1. /*
  2. * @doc INTERNAL
  3. *
  4. * @module RTFLEX.CPP - RichEdit RTF reader lexical analyzer |
  5. *
  6. * This file contains the implementation of the lexical analyzer part of
  7. * the RTF reader.
  8. *
  9. * Authors: <nl>
  10. * Original RichEdit 1.0 RTF converter: Anthony Francisco <nl>
  11. * Conversion to C++ and RichEdit 2.0: Murray Sargent <nl>
  12. *
  13. * @devnote
  14. * All sz's in the RTF*.? files refer to a LPSTRs, not LPWSTRs, unless
  15. * noted as a szUnicode.
  16. *
  17. * Copyright (c) 1995-2000, Microsoft Corporation. All rights reserved.
  18. */
  19. #include "_common.h"
  20. #include "_rtfread.h"
  21. #include "hash.h"
  22. #include "tokens.cpp"
  23. ASSERTDATA
  24. // Array used by character classification macros to speed classification
  25. // of chars residing in two or more discontiguous ranges, e.g., alphanumeric
  26. // or hex. The alphabetics used in RTF control words are lower-case ASCII.
  27. // *** DO NOT DBCS rgbCharClass[] ***
  28. #define fCS fCT + fSP
  29. #define fSB fBL + fSP
  30. #define fHD fHX + fDG
  31. #define fHU fHX + fUC
  32. #define fHL fHX + fLC
  33. const BYTE rgbCharClass[256] =
  34. {
  35. fCT,fCT,fCT,fCT,fCT,fCT,fCT,fCT, fCT,fCS,fCS,fCS,fCS,fCS,fCT,fCT,
  36. fCT,fCT,fCT,fCT,fCT,fCT,fCT,fCT, fCT,fCT,fCT,fCT,fCT,fCT,fCT,fCT,
  37. fSB,fPN,fPN,fPN,fPN,fPN,fPN,fPN, fPN,fPN,fPN,fPN,fPN,fPN,fPN,fPN,
  38. fHD,fHD,fHD,fHD,fHD,fHD,fHD,fHD, fHD,fHD,fPN,fPN,fPN,fPN,fPN,fPN,
  39. fPN,fHU,fHU,fHU,fHU,fHU,fHU,fUC, fUC,fUC,fUC,fUC,fUC,fUC,fUC,fUC,
  40. fUC,fUC,fUC,fUC,fUC,fUC,fUC,fUC, fUC,fUC,fUC,fPN,fPN,fPN,fPN,fPN,
  41. fPN,fHL,fHL,fHL,fHL,fHL,fHL,fLC, fLC,fLC,fLC,fLC,fLC,fLC,fLC,fLC,
  42. fLC,fLC,fLC,fLC,fLC,fLC,fLC,fLC, fLC,fLC,fLC,fPN,fPN,fPN,fPN,fPN,
  43. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  44. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  45. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  46. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  47. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  48. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  49. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  50. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  51. };
  // Upper bound on the number of bytes that can safely be handed back with
  // UngetChar() before the RTF buffer might underflow. FillBuffer() keeps
  // this many bytes in front of the freshly read data as the backup area.
  int const cbBackupMax = 4;
  55. // Bug2298 - I found an RTF writer which emits uppercase RTF keywords,
  56. // so I had to change IsLCAscii to IsAlphaChar for use in scanning
  57. // for RTF keywords.
  58. inline BOOL IsAlphaChar(BYTE b)
  59. {
  60. return IN_RANGE('a', b | 0x20, 'z');
  61. }
  62. /*
  63. * IsRTF(pstr, cb)
  64. *
  65. * @func
  66. * Return FALSE if cb < 7 or pstr is NULL or if pstr doesn't start
  67. * with "{\rtf"N or "{\urtf"N, where N is an ASCII number. cb gives
  68. * the minimum length of pstr unless pstr is NULL-terminated, in which
  69. * case the null terminator marks the end of the string.
  70. *
  71. * @rdesc
  72. * TRUE if pstr points at a valid start of RTF data
  73. */
  74. BOOL IsRTF(
  75. char *pstr, //@parm String to check
  76. LONG cb) //@parm Min byte count if string isn't null terminated
  77. {
  78. if(!pstr || cb < 7 || *pstr++ != '{' || *pstr++ != '\\')
  79. return FALSE; // Quick out for most common cases
  80. if(*pstr == 'u') // Bypass u of possible urtf
  81. pstr++;
  82. return !CompareMemory("rtf", pstr, 3) && IsASCIIDigit((BYTE)pstr[3]);
  83. }
  84. /*
  85. * CRTFRead::InitLex()
  86. *
  87. * @mfunc
  88. * Initialize the lexical analyzer. Reset the variables. if reading in
  89. * from resource file, sort the keyword list (). Uses global hinstRE
  90. * from the RichEdit to find out where its resources are. Note: in
  91. * RichEdit 2.0, currently the resource option is not supported.
  92. *
  93. * @rdesc
  94. * TRUE If lexical analyzer was initialized
  95. */
  96. BOOL CRTFRead::InitLex()
  97. {
  98. TRACEBEGIN(TRCSUBSYSRTFR, TRCSCOPEINTERN, "CRTFRead::InitLex");
  99. AssertSz(cKeywords == i_TokenIndexMax,
  100. "Keyword index enumeration is incompatible with rgKeyword[]");
  101. Assert(!_szText && !_pchRTFBuffer);
  102. // Allocate our buffers with an extra byte for szText so that hex
  103. // conversion doesn't have to worry about running off the end if the
  104. // first char is NULL
  105. if ((_szText = (BYTE *)PvAlloc(cachTextMax + 1, GMEM_ZEROINIT)) &&
  106. (_pchRTFBuffer = (BYTE *)PvAlloc(cachBufferMost, GMEM_ZEROINIT)))
  107. {
  108. return TRUE; // Signal that lexer is initialized
  109. }
  110. _ped->GetCallMgr()->SetOutOfMemory();
  111. _ecParseError = ecLexInitFailed;
  112. return FALSE;
  113. }
  114. /*
  115. * CRTFRead::DeinitLex()
  116. *
  117. * @mfunc
  118. * Shut down lexical analyzer
  119. */
  120. void CRTFRead::DeinitLex()
  121. {
  122. TRACEBEGIN(TRCSUBSYSRTFR, TRCSCOPEINTERN, "CRTFRead::DeinitLex");
  123. #ifdef KEYWORD_RESOURCE
  124. if (hglbKeywords)
  125. {
  126. FreeResource(hglbKeywords);
  127. hglbKeywords = NULL;
  128. rgKeyword = NULL;
  129. }
  130. #endif
  131. FreePv(_szText);
  132. FreePv(_pchRTFBuffer);
  133. }
  134. /*
  135. * CRTFRead::GetChar()
  136. *
  137. * @mfunc
  138. * Get next char, filling buffer as needed
  139. *
  140. * @rdesc
  141. * BYTE nonzero char value if success; else 0
  142. */
  143. BYTE CRTFRead::GetChar()
  144. {
  145. TRACEBEGIN(TRCSUBSYSRTFR, TRCSCOPEINTERN, "CRTFRead::GetChar");
  146. if (_pchRTFCurrent == _pchRTFEnd && !FillBuffer())
  147. {
  148. _ecParseError = ecUnexpectedEOF;
  149. return 0;
  150. }
  151. return *_pchRTFCurrent++;
  152. }
  153. /*
  154. * CRTFRead::GetCharEx()
  155. *
  156. * @mfunc
  157. * Get next char including escaped chars of form \'xx
  158. *
  159. * @rdesc
  160. * BYTE nonzero char value if success; else 0
  161. */
  162. BYTE CRTFRead::GetCharEx()
  163. {
  164. TRACEBEGIN(TRCSUBSYSRTFR, TRCSCOPEINTERN, "CRTFRead::GetCharEx");
  165. BYTE ach;
  166. do
  167. ach = GetChar();
  168. while (ach == CR || ach == LF); // Ignore CRLFs
  169. if(ach == BSLASH)
  170. {
  171. if(GetChar() == '\'')
  172. {
  173. // Convert hex to char and store result in _token
  174. if(TokenGetHex() != tokenError)
  175. return (BYTE)_token;
  176. _ecParseError = ecUnexpectedChar;
  177. }
  178. UngetChar();
  179. }
  180. return ach;
  181. }
  182. /*
  183. * CRTFRead::FillBuffer()
  184. *
  185. * @mfunc
  186. * Fill RTF buffer & return != 0 if successful
  187. *
  188. * @rdesc
  189. * LONG # chars read
  190. *
  191. * @comm
  192. * This routine doesn't bother copying anything down if
  193. * pchRTFCurrent <lt> pchRTFEnd so anything not read yet is lost.
  194. * The only exception to this is that it always copies down the
  195. * last two bytes read so that UngetChar() will work. ReadData()
  196. * actually counts on this behavior, so if you change it, change
  197. * ReadData() accordingly.
  198. */
  199. LONG CRTFRead::FillBuffer()
  200. {
  201. TRACEBEGIN(TRCSUBSYSRTFR, TRCSCOPEINTERN, "CRTFRead::FillBuffer");
  202. LONG cchRead;
  203. if (!_pchRTFCurrent)
  204. {
  205. // No data yet, nothing for backup
  206. // Leave cbBackupMax NULL chars so backup
  207. // area of buffer doesn't contain garbage.
  208. for(int i = 0; i < cbBackupMax; i++)
  209. {
  210. _pchRTFBuffer[i] = 0;
  211. }
  212. }
  213. else
  214. {
  215. Assert(_pchRTFCurrent == _pchRTFEnd);
  216. // Copy most recently read chars in case
  217. // we need to back up
  218. int cbBackup = min((UINT) cbBackupMax, DiffPtrs(_pchRTFCurrent, &_pchRTFBuffer[cbBackupMax]));
  219. int i;
  220. for(i = -1; i >= -cbBackup; i--)
  221. _pchRTFBuffer[cbBackupMax + i] = _pchRTFCurrent[i];
  222. if(cbBackup < cbBackupMax)
  223. {
  224. // NULL before the first valid character in the backup buffer
  225. _pchRTFBuffer[cbBackupMax + i] = 0;
  226. }
  227. }
  228. _pchRTFCurrent = &_pchRTFBuffer[cbBackupMax];
  229. // Fill buffer with as much as we can take given our starting offset
  230. _pes->dwError = _pes->pfnCallback(_pes->dwCookie,
  231. _pchRTFCurrent,
  232. cachBufferMost - cbBackupMax,
  233. &cchRead);
  234. if (_pes->dwError)
  235. {
  236. TRACEERRSZSC("RTFLEX: GetChar()", _pes->dwError);
  237. _ecParseError = ecGeneralFailure;
  238. return 0;
  239. }
  240. _pchRTFEnd = &_pchRTFBuffer[cbBackupMax + cchRead]; // Point the end
  241. #if defined(DEBUG)
  242. if(_hfileCapture)
  243. {
  244. DWORD cbLeftToWrite = cchRead;
  245. DWORD cbWritten = 0;
  246. BYTE *pbToWrite = (BYTE *)_pchRTFCurrent;
  247. while(WriteFile(_hfileCapture,
  248. pbToWrite,
  249. cbLeftToWrite,
  250. &cbWritten,
  251. NULL) &&
  252. (pbToWrite += cbWritten,
  253. (cbLeftToWrite -= cbWritten)));
  254. }
  255. #endif
  256. return cchRead;
  257. }
  258. /*
  259. * CRTFRead::UngetChar()
  260. *
  261. * @mfunc
  262. * Bump our file pointer back one char
  263. *
  264. * @rdesc
  265. * BOOL TRUE on success
  266. *
  267. * @comm
  268. * You can safely UngetChar _at most_ cbBackupMax times without
  269. * error.
  270. */
  271. BOOL CRTFRead::UngetChar()
  272. {
  273. TRACEBEGIN(TRCSUBSYSRTFR, TRCSCOPEINTERN, "CRTFRead::UngetChar");
  274. if (_pchRTFCurrent == _pchRTFBuffer || !_pchRTFCurrent)
  275. {
  276. Assert(0);
  277. _ecParseError = ecUnGetCharFailed;
  278. return FALSE;
  279. }
  280. --_pchRTFCurrent;
  281. return TRUE;
  282. }
  283. /*
  284. * CRTFRead::UngetChar(cch)
  285. *
  286. * @mfunc
  287. * Bump our file pointer back 'cch' chars
  288. *
  289. * @rdesc
  290. * BOOL TRUE on success
  291. *
  292. * @comm
  293. * You can safely UngetChar _at most_ cbBackupMax times without
  294. * error.
  295. */
  296. BOOL CRTFRead::UngetChar(
  297. UINT cch) //@parm cch to put back in buffer
  298. {
  299. TRACEBEGIN(TRCSUBSYSRTFR, TRCSCOPEINTERN, "CRTFRead::UngetChar");
  300. AssertSz(cch <= cbBackupMax, "CRTFRead::UngetChar(): Number of UngetChar's "
  301. "exceeds size of backup buffer.");
  302. while(cch-- > 0)
  303. {
  304. if(!UngetChar())
  305. return FALSE;
  306. }
  307. return TRUE;
  308. }
  309. /*
  310. * CRTFRead::GetHex()
  311. *
  312. * @mfunc
  313. * Get next char if hex and return hex value
  314. * If not hex, leave char in buffer and return 255
  315. *
  316. * @rdesc
  317. * BYTE hex value of GetChar() if hex; else 255
  318. */
  319. BYTE CRTFRead::GetHex()
  320. {
  321. TRACEBEGIN(TRCSUBSYSRTFR, TRCSCOPEINTERN, "CRTFRead::GetHex");
  322. BYTE ch = GetChar();
  323. if(IsXDigit(ch))
  324. return (BYTE)(ch <= '9' ? ch - '0' : (ch & 0x4f) - 'A' + 10);
  325. if(ch)
  326. UngetChar();
  327. return 255;
  328. }
  329. /*
  330. * CRTFRead::GetHexSkipCRLF()
  331. *
  332. * @mfunc
  333. * Get next char if hex and return hex value
  334. * If not hex, leave char in buffer and return 255
  335. *
  336. * @rdesc
  337. * BYTE hex value of GetChar() if hex; else 255
  338. *
  339. * @devnote
  340. * Keep this in sync with GetHex above.
  341. */
  342. BYTE CRTFRead::GetHexSkipCRLF()
  343. {
  344. TRACEBEGIN(TRCSUBSYSRTFR, TRCSCOPEINTERN, "CRTFRead::GetHexSkipCRLF");
  345. BYTE ch = GetChar();
  346. // Skip \r \n
  347. while(ch == CR || ch == LF)
  348. ch = GetChar();
  349. // Rest is same as CRTFRead::GetHex()
  350. if(IsXDigit(ch))
  351. return (BYTE)(ch <= '9' ? ch - '0' : (ch & 0x4f) - 'A' + 10);
  352. if(ch)
  353. UngetChar();
  354. return 255;
  355. }
  356. /*
  357. * CRTFRead::TokenGetHex()
  358. *
  359. * @mfunc
  360. * Get an 8 bit character saved as a 2 hex digit value
  361. *
  362. * @rdesc
  363. * TOKEN value of hex number read in
  364. */
  365. TOKEN CRTFRead::TokenGetHex()
  366. {
  367. TRACEBEGIN(TRCSUBSYSRTFR, TRCSCOPEINTERN, "CRTFRead::TokenGetHex");
  368. _token = tokenError; // Default error
  369. BYTE bChar0 = GetHex(); // Get hexadigit
  370. if(bChar0 < 16) // It's valid
  371. {
  372. BYTE bChar1 = GetHex(); // Get next hexadigit
  373. if(bChar1 < 16) // It's valid too
  374. _token = (WORD)(bChar0 << 4 | bChar1);
  375. else
  376. UngetChar(); // Invalid: put back 1st hexadigit
  377. }
  378. return _token;
  379. }
  380. /*
  381. * CRTFRead::SkipToEndOfGroup()
  382. *
  383. * @mfunc
  384. * Skip to end of current group
  385. *
  386. * @rdesc
  387. * EC An error code
  388. */
  389. EC CRTFRead::SkipToEndOfGroup()
  390. {
  391. TRACEBEGIN(TRCSUBSYSRTFR, TRCSCOPEINTERN, "CRTFRead::SkipToEndOfGroup");
  392. INT nDepth = 1;
  393. BYTE ach;
  394. while(TRUE)
  395. {
  396. ach = GetChar();
  397. switch(ach)
  398. {
  399. case BSLASH:
  400. {
  401. BYTE achNext = GetChar();
  402. // EOF: goto done; else ignore NULLs
  403. if(!achNext && _ecParseError == ecUnexpectedEOF)
  404. goto done;
  405. if(achNext == 'b' && UngetChar() &&
  406. TokenGetKeyword() == tokenBinaryData)
  407. {
  408. // We've encountered the \binN tag in the RTF we want
  409. // to skip. _iParam contains N from \binN once the
  410. // tag is parsed by TokenGetKeyword()
  411. SkipBinaryData(_iParam);
  412. }
  413. break;
  414. }
  415. case LBRACE:
  416. nDepth++;
  417. break;
  418. case RBRACE:
  419. if (--nDepth <= 0)
  420. goto done;
  421. break;
  422. case 0:
  423. if(_ecParseError == ecUnexpectedEOF)
  424. goto done;
  425. default:
  426. // Detect Lead bytes here.
  427. int cTrailBytes = GetTrailBytesCount(ach, _nCodePage);
  428. if (cTrailBytes)
  429. {
  430. for (int i = 0; i < cTrailBytes; i++)
  431. {
  432. ach = GetChar();
  433. if(ach == 0 && _ecParseError == ecUnexpectedEOF)
  434. goto done;
  435. }
  436. }
  437. break;
  438. }
  439. }
  440. Assert(!_ecParseError);
  441. _ecParseError = ecUnexpectedEOF;
  442. done:
  443. return _ecParseError;
  444. }
  445. /*
  446. * CRTFRead::TokenFindKeyword(szKeyword, prgKeyword, cKeyword)
  447. *
  448. * @mfunc
  449. * Find keyword <p szKeyword> and return its token value
  450. *
  451. * @rdesc
  452. * TOKEN token number of keyword
  453. */
  454. TOKEN CRTFRead::TokenFindKeyword(
  455. BYTE * szKeyword, //@parm Keyword to find
  456. const KEYWORD *prgKeyword, //@parm Keyword array to use
  457. LONG cKeyword) //@parm Count of keywords
  458. {
  459. TRACEBEGIN(TRCSUBSYSRTFR, TRCSCOPEINTERN, "CRTFRead::TokenFindKeyword");
  460. INT iMax;
  461. INT iMid;
  462. INT iMin;
  463. INT nComp;
  464. BYTE * pchCandidate;
  465. BYTE * pchKeyword;
  466. const KEYWORD * pk;
  467. AssertSz(szKeyword[0],
  468. "CRTFRead::TokenFindKeyword: null keyword");
  469. _iKeyword = 0;
  470. #ifdef RTF_HASHCACHE
  471. if ( _rtfHashInited )
  472. {
  473. // Hash is 23% faster than the following binary search on finds
  474. // and 55% faster on misses: For 97 words stored in a 257 cache.
  475. // Performance numbers will change when the total stored goes up.
  476. pk = HashKeyword_Fetch ( (CHAR *) szKeyword );
  477. }
  478. else
  479. #endif
  480. {
  481. iMin = 0;
  482. iMax = cKeyword - 1;
  483. pk = NULL;
  484. do
  485. {
  486. iMid = (iMin + iMax) / 2;
  487. pchCandidate = (BYTE *)prgKeyword[iMid].szKeyword;
  488. pchKeyword = szKeyword;
  489. while (!(nComp = (*pchKeyword | 0x20) - (*pchCandidate | 0x20)) // Be sure to match
  490. && *pchKeyword) // terminating 0's
  491. {
  492. pchKeyword++;
  493. pchCandidate++;
  494. }
  495. if (nComp < 0)
  496. iMax = iMid - 1;
  497. else if (nComp)
  498. iMin = iMid + 1;
  499. else
  500. {
  501. pk = &prgKeyword[iMid];
  502. _iKeyword = iMid; // Save keyword index
  503. break;
  504. }
  505. } while (iMin <= iMax);
  506. }
  507. if(pk)
  508. {
  509. _token = pk->token;
  510. // Log the RTF keyword scan to aid in tracking RTF tag coverage
  511. // TODO: Implement RTF tag logging for the Mac and WinCE
  512. #if defined(DEBUG) && !defined(NOFULLDEBUG)
  513. if(_prtflg)
  514. {
  515. #ifdef RTF_HASCACHE
  516. _prtflg->AddAt(szKeyword);
  517. #else
  518. _prtflg->AddAt((size_t)iMid);
  519. #endif
  520. }
  521. #endif
  522. }
  523. else
  524. { // No match: place to take
  525. _token = tokenUnknownKeyword; // care of unrecognized RTF
  526. if(_fNotifyLowFiRTF)
  527. {
  528. iMin = 0; // Use binary search as above
  529. iMax = crgszUnrecognizedRTF - 1;
  530. do
  531. {
  532. iMid = (iMin + iMax) / 2;
  533. pchCandidate = (BYTE *)rgszUnrecognizedRTF[iMid];
  534. pchKeyword = szKeyword;
  535. while (!(nComp = (*pchKeyword | 0x20) - (*pchCandidate | 0x20))
  536. && *pchKeyword)
  537. {
  538. pchKeyword++;
  539. pchCandidate++;
  540. }
  541. if (nComp < 0)
  542. iMax = iMid - 1;
  543. else if (nComp && *pchCandidate)
  544. iMin = iMid + 1;
  545. else // Found keyword
  546. {
  547. _iKeyword = -iMid - 1;
  548. CheckNotifyLowFiRTF();
  549. break;
  550. }
  551. } while (iMin <= iMax);
  552. }
  553. }
  554. return _token;
  555. }
  556. /*
  557. * CRTFRead::CheckNotifyLowFiRTF()
  558. *
  559. * @mfunc
  560. * If LowFi RTF notifications are enabled, send notification for the
  561. * keyword with index _iKeyword to client and turn off the notifications
  562. * for the rest of this read.
  563. */
  564. void CRTFRead::CheckNotifyLowFiRTF(
  565. BOOL fEnable)
  566. {
  567. if(_fNotifyLowFiRTF && (_fBody || fEnable))
  568. {
  569. char *pach = _iKeyword >= 0
  570. ? (char *)rgKeyword[_iKeyword].szKeyword
  571. : (char *)rgszUnrecognizedRTF[-_iKeyword - 1];
  572. _ped->HandleLowFiRTF(pach);
  573. _fNotifyLowFiRTF = FALSE;
  574. }
  575. }
  576. /*
  577. * CRTFRead::TokenGetKeyword()
  578. *
  579. * @mfunc
  580. * Collect a keyword and its parameter. Return token's keyword
  581. *
  582. * @rdesc
  583. * TOKEN token number of keyword
  584. *
  585. * @comm
  586. * Most RTF control words (keywords) consist of a span of lower-case
  587. * ASCII letters possibly followed by a span of decimal digits. Other
  588. * control words consist of a single character that isn't LC ASCII. No
  589. * control words contain upper-case characters.
  590. */
  591. TOKEN CRTFRead::TokenGetKeyword()
  592. {
  593. TRACEBEGIN(TRCSUBSYSRTFR, TRCSCOPEINTERN, "CRTFRead::TokenGetKeyword");
  594. BYTE ach = GetChar();
  595. BYTE *pach;
  596. BYTE szKeyword[cachKeywordMax];
  597. BYTE *pachEnd = szKeyword + cachKeywordMax - 1;
  598. if(!IsAlphaChar(ach)) // Not alpha, i.e.,
  599. { // single char
  600. if (ach == '\'') // Most common case needs
  601. { // special treatment
  602. // Convert hex to char and store result in _token
  603. if(TokenGetHex() == tokenError)
  604. {
  605. _ecParseError = ecUnexpectedChar;
  606. goto TokenError;
  607. }
  608. if((_token == CR || _token == LF) && FInDocTextDest())
  609. {
  610. // Add raw CR or LF in the byte stream as a \par
  611. return tokenEndParagraph;
  612. }
  613. }
  614. else
  615. {
  616. // Check for other known symbols
  617. const BYTE *pachSym = szSymbolKeywords;
  618. while(ach != *pachSym && *pachSym)
  619. pachSym++;
  620. if(*pachSym) // Found one
  621. {
  622. _token = tokenSymbol[pachSym - szSymbolKeywords];
  623. if(_token > 0x7F) // Token or larger Unicode
  624. return _token; // value
  625. }
  626. else if (!ach) // No more input chars
  627. goto TokenError;
  628. else // Code for unrecognized RTF
  629. _token = ach; // We'll just insert it for now
  630. }
  631. _token = TokenGetText((BYTE)_token);
  632. return _token;
  633. }
  634. szKeyword[0] = ach; // Collect keyword that starts
  635. pach = szKeyword + 1; // with Alpha
  636. while (IsAlphaChar(ach = GetChar()))
  637. {
  638. if (pach < pachEnd)
  639. *pach++ = ach;
  640. }
  641. *pach = '\0'; // Terminate keyword
  642. GetParam(ach); // Get keyword N in _iParam
  643. if (!_ecParseError) // Find and return keyword
  644. return TokenFindKeyword(szKeyword, rgKeyword, cKeywords);
  645. TokenError:
  646. TRACEERRSZSC("TokenGetKeyword()", _ecParseError);
  647. return _token = tokenError;
  648. }
  649. /*
  650. * CRTFRead::GetParam(ach)
  651. *
  652. * @mfunc
  653. * Get any numeric parameter following a keyword, storing the result
  654. * in _iParam and setting _fParam = TRUE iff a number is found.
  655. */
  656. void CRTFRead::GetParam(
  657. char ach) // @parm First char of 8-bit text string
  658. {
  659. TRACEBEGIN(TRCSUBSYSRTFR, TRCSCOPEINTERN, "CRTFRead::GetText");
  660. _fParam = FALSE; // Clear parameter
  661. _iParam = 0;
  662. if(IsDigit(ach) || ach == '-') // Collect parameter
  663. {
  664. BOOL fNegativeParam = TRUE;
  665. _fParam = TRUE;
  666. if(ach != '-')
  667. {
  668. _iParam = ach - '0'; // Get parameter value
  669. fNegativeParam = FALSE;
  670. }
  671. while (IsDigit(ach = GetChar()))
  672. _iParam = _iParam*10 + ach - '0';
  673. if (fNegativeParam)
  674. _iParam = -_iParam;
  675. }
  676. if(ach != ' ')
  677. UngetChar(); // If not ' ', unget char
  678. }
  679. /*
  680. * CRTFRead::TokenGetText(ach)
  681. *
  682. * @mfunc
  683. * Collect a string of text starting with the char <p ach> and treat as a
  684. * single token. The string ends when a LBRACE, RBRACE, or single '\\' is found.
  685. *
  686. * @devnote
  687. * We peek past the '\\' for \\'xx, which we decode and keep on going;
  688. * else we return in a state where the next character is the '\\'.
  689. *
  690. * @rdesc
  691. * TOKEN Token number of next token (tokenText or tokenError)
  692. */
  693. TOKEN CRTFRead::TokenGetText(
  694. BYTE ach) // @parm First char of 8-bit text string
  695. {
  696. TRACEBEGIN(TRCSUBSYSRTFR, TRCSCOPEINTERN, "CRTFRead::TokenGetText");
  697. BYTE * pach = _szText;
  698. SHORT cachText = 0;
  699. LONG CodePage = _pstateStackTop->nCodePage;
  700. BOOL fAllASCII = TRUE;
  701. int cTrailBytesNeeded = 0;
  702. _token = tokenError; // Default error
  703. // FUTURE(BradO): This 'goto' into a while loop is pretty weak.
  704. // Restructure this 'while' loop such that the 'goto' is removed.
  705. // Add character passed into routine
  706. goto add;
  707. // If cTrailBytesNeeded is non-zero, we need to get all the trail bytes. Otherwise,
  708. // a string end in the middle of a DBC or UTF-8 will cause bad display/print problem
  709. // - 5 to allow extra space for up to 4 bytes for UTF-8 and Null char
  710. while (cachText < cachTextMax - 5 || cTrailBytesNeeded)
  711. {
  712. ach = GetChar();
  713. switch (ach)
  714. {
  715. case BSLASH:
  716. {
  717. // FUTURE(BradO): This code looks ALOT like TokenGetKeyword.
  718. // We should combine the two into a common routine.
  719. BYTE achNext;
  720. // Get char after BSLASH
  721. achNext = GetChar();
  722. if(!achNext)
  723. goto error;
  724. if(achNext == '\'') // Handle most frequent
  725. { // case here
  726. if(TokenGetHex() == tokenError)
  727. {
  728. if(cTrailBytesNeeded)
  729. {
  730. // The trail-byte must be a raw BSLASH.
  731. // Unget the single-quote.
  732. if(!UngetChar())
  733. goto error;
  734. // fall through to add BSLASH
  735. }
  736. else
  737. {
  738. _ecParseError = ecUnexpectedChar;
  739. goto error;
  740. }
  741. }
  742. else
  743. {
  744. ach = (BYTE)_token;
  745. if (cTrailBytesNeeded == 0 && (ach == CR || ach == LF) &&
  746. FInDocTextDest())
  747. {
  748. // Here, we have a raw CR or LF in document text.
  749. // Unget the whole lot of characters and bail out.
  750. // TokenGetKeyword will convert this CR or LF into
  751. // a \par.
  752. if(!UngetChar(4))
  753. goto error;
  754. goto done;
  755. }
  756. }
  757. goto add;
  758. }
  759. // Check next byte against list of RTF symbol
  760. // NOTE:- we need to check for RTF symbol even if we
  761. // are expecting a trail byte. According to the rtf spec,
  762. // we cannot just take this backslash as trail byte.
  763. // HWC 9/97
  764. const BYTE *pachSymbol = szSymbolKeywords;
  765. while(achNext != *pachSymbol && *pachSymbol)
  766. pachSymbol++;
  767. TOKEN tokenTmp;
  768. if (*pachSymbol &&
  769. (tokenTmp = tokenSymbol[pachSymbol - szSymbolKeywords])
  770. <= 0x7F)
  771. {
  772. ach = (BYTE)tokenTmp;
  773. goto add;
  774. }
  775. // In either of the last two cases below, we will want
  776. // to unget the byte following the BSLASH
  777. if(!UngetChar())
  778. goto error;
  779. if(cTrailBytesNeeded && !IsAlphaChar(achNext))
  780. {
  781. // In this situation, either this BSLASH begins the next
  782. // RTF keyword or it is a raw BSLASH which is the trail
  783. // byte for a DBCS character.
  784. // I think a fair assumption here is that if an alphanum
  785. // follows the BSLASH, that the BSLASH begins the next
  786. // RTF keyword.
  787. // add the raw BSLASH
  788. goto add;
  789. }
  790. // Here, my guess is that the BSLASH begins the next RTF
  791. // keyword, so unget the BSLASH
  792. if(!UngetChar())
  793. goto error;
  794. goto done;
  795. }
  796. case LBRACE: // End of text string
  797. case RBRACE:
  798. if(cTrailBytesNeeded)
  799. {
  800. // Previous char was a lead-byte of a DBCS pair or UTF-8, which
  801. // makes this char a raw trail-byte.
  802. goto add;
  803. }
  804. if(!UngetChar()) // Unget delimeter
  805. goto error;
  806. goto done;
  807. case LF: // Throw away noise chars
  808. case CR:
  809. break;
  810. case 0:
  811. if(_ecParseError == ecUnexpectedEOF)
  812. goto done;
  813. ach = ' '; // Replace NULL by blank
  814. default: // Collect chars
  815. add:
  816. *pach++ = ach;
  817. ++cachText;
  818. if(ach > 0x7F)
  819. fAllASCII = FALSE;
  820. // Check if we are expecting more trail bytes
  821. if (cTrailBytesNeeded)
  822. cTrailBytesNeeded--;
  823. else
  824. cTrailBytesNeeded = GetTrailBytesCount(ach, CodePage);
  825. Assert(cTrailBytesNeeded >= 0);
  826. }
  827. }
  828. done:
  829. _token = (WORD)(fAllASCII ? tokenASCIIText : tokenText);
  830. *pach = '\0'; // Terminate token string
  831. error:
  832. return _token;
  833. }
  834. /*
  835. * CRTFRead::TokenGetToken()
  836. *
  837. * @mfunc
  838. * This function reads in next token from input stream
  839. *
  840. * @rdesc
  841. * TOKEN token number of next token
  842. */
  843. TOKEN CRTFRead::TokenGetToken()
  844. {
  845. TRACEBEGIN(TRCSUBSYSRTFR, TRCSCOPEINTERN, "CRTFRead::TokenGetToken");
  846. BYTE ach;
  847. _tokenLast = _token; // Used by \* destinations and FE
  848. _token = tokenEOF; // Default end-of-file
  849. SkipNoise:
  850. ach = GetChar();
  851. switch (ach)
  852. {
  853. case CR:
  854. case LF:
  855. goto SkipNoise;
  856. case LBRACE:
  857. _token = tokenStartGroup;
  858. break;
  859. case RBRACE:
  860. _token = tokenEndGroup;
  861. break;
  862. case BSLASH:
  863. _token = TokenGetKeyword();
  864. break;
  865. case 0:
  866. if(_ecParseError == ecUnexpectedEOF)
  867. break;
  868. ach = ' '; // Replace NULL by blank
  869. // Fall thru to default
  870. default:
  871. if( !_pstateStackTop )
  872. {
  873. TRACEWARNSZ("Unexpected token in rtf file");
  874. Assert(_token == tokenEOF);
  875. if (_ped->Get10Mode())
  876. _ecParseError = ecUnexpectedToken; // Signal bad file
  877. }
  878. else if (_pstateStackTop->sDest == destObjectData ||
  879. _pstateStackTop->sDest == destPicture )
  880. // not text but data
  881. {
  882. _token = (WORD)(tokenObjectDataValue + _pstateStackTop->sDest
  883. - destObjectData);
  884. UngetChar();
  885. }
  886. else
  887. _token = TokenGetText(ach);
  888. }
  889. return _token;
  890. }
  891. #define FINDOCTEXTDEST ((1 << destRTF) | \
  892. (1 << destField) | \
  893. (1 << destFieldResult) | (1 << destFieldInstruction) | \
  894. (1 << destParaNumText) | (1 << destParaNumbering) | \
  895. (1 << destNULL))
  896. /*
  897. * CRTFRead::FInDocTextDest()
  898. *
  899. * @mfunc
  900. * Returns a BOOL indicating if the current destination is one in which
  901. * we would encounter document text.
  902. *
  903. * @rdesc
  904. * BOOL indicates the current destination may contain document text.
  905. */
  906. BOOL CRTFRead::FInDocTextDest() const
  907. {
  908. AssertSz(_pstateStackTop->sDest < destMAX,
  909. "CRTFRead::FInDocTextDest(): New destination encountered - update enum in _rtfread.h");
  910. return (FINDOCTEXTDEST & (1 << _pstateStackTop->sDest)) != 0;
  911. }