Source code of Windows XP (NT5)
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

969 lines
22 KiB

/*
 *	@doc INTERNAL
 *
 *	@module	RTFLEX.CPP - RichEdit RTF reader lexical analyzer |
 *
 *		This file contains the implementation of the lexical analyzer part of
 *		the RTF reader.
 *
 *	Authors: <nl>
 *		Original RichEdit 1.0 RTF converter: Anthony Francisco <nl>
 *		Conversion to C++ and RichEdit 2.0:  Murray Sargent <nl>
 *
 *	@devnote
 *		All sz's in the RTF*.? files refer to LPSTRs, not LPTSTRs, unless
 *		noted as a szUnicode.
 *
 *	Copyright (c) 1995-1997, Microsoft Corporation. All rights reserved.
 */
  19. #include "_common.h"
  20. #include "_rtfread.h"
  21. #include "hash.h"
  22. ASSERTDATA
  23. #include "tokens.cpp"
  24. // Array used by character classification macros to speed classification
  25. // of chars residing in two or more discontiguous ranges, e.g., alphanumeric
  26. // or hex. The alphabetics used in RTF control words are lower-case ASCII.
  27. // *** DO NOT DBCS rgbCharClass[] ***
  28. #define fCS fCT + fSP
  29. #define fSB fBL + fSP
  30. #define fHD fHX + fDG
  31. #define fHU fHX + fUC
  32. #define fHL fHX + fLC
  33. const BYTE rgbCharClass[256] =
  34. {
  35. fCT,fCT,fCT,fCT,fCT,fCT,fCT,fCT, fCT,fCS,fCS,fCS,fCS,fCS,fCT,fCT,
  36. fCT,fCT,fCT,fCT,fCT,fCT,fCT,fCT, fCT,fCT,fCT,fCT,fCT,fCT,fCT,fCT,
  37. fSB,fPN,fPN,fPN,fPN,fPN,fPN,fPN, fPN,fPN,fPN,fPN,fPN,fPN,fPN,fPN,
  38. fHD,fHD,fHD,fHD,fHD,fHD,fHD,fHD, fHD,fHD,fPN,fPN,fPN,fPN,fPN,fPN,
  39. fPN,fHU,fHU,fHU,fHU,fHU,fHU,fUC, fUC,fUC,fUC,fUC,fUC,fUC,fUC,fUC,
  40. fUC,fUC,fUC,fUC,fUC,fUC,fUC,fUC, fUC,fUC,fUC,fPN,fPN,fPN,fPN,fPN,
  41. fPN,fHL,fHL,fHL,fHL,fHL,fHL,fLC, fLC,fLC,fLC,fLC,fLC,fLC,fLC,fLC,
  42. fLC,fLC,fLC,fLC,fLC,fLC,fLC,fLC, fLC,fLC,fLC,fPN,fPN,fPN,fPN,fPN,
  43. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  44. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  45. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  46. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  47. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  48. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  49. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  50. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  51. };
  52. const char szRTFSig[] = "rtf";
  53. #define cchRTFSig 3
  54. #define cbRTFSig (cchRTFSig * sizeof(char))
  55. // Specifies the number of bytes we can safely "UngetChar"
  56. // before possibly underflowing the buffer.
  57. const int cbBackupMax = 4;
  58. // Bug2298 - I found an RTF writer which emits uppercase RTF keywords,
  59. // so I had to change IsLCAscii to IsAlphaChar for use in scanning
  60. // for RTF keywords.
  61. inline BOOL IsAlphaChar(BYTE b)
  62. {
  63. return IN_RANGE('a', b, 'z') || IN_RANGE('A', b, 'Z');
  64. }
  65. // Quick and dirty tolower(b)
  66. inline BYTE REToLower(BYTE b)
  67. {
  68. Assert(!b || IsAlphaChar(b));
  69. return b ? (BYTE)(b | 0x20) : 0;
  70. }
  71. extern BOOL IsRTF(char *pstr);
  72. BOOL IsRTF(
  73. char *pstr)
  74. {
  75. if(!pstr || *pstr++ != '{' || *pstr++ != '\\')
  76. return FALSE; // Quick out for most common cases
  77. if(*pstr == 'u') // Bypass u of possible urtf
  78. pstr++;
  79. return !CompareMemory(szRTFSig, pstr, cbRTFSig);
  80. }
  81. /*
  82. * CRTFRead::InitLex()
  83. *
  84. * @mfunc
  85. * Initialize the lexical analyzer. Reset the variables. if reading in
  86. * from resource file, sort the keyword list (). Uses global hinstRE
  87. * from the RichEdit to find out where its resources are. Note: in
  88. * RichEdit 2.0, currently the resource option is not supported.
  89. *
  90. * @rdesc
  91. * TRUE If lexical analyzer was initialized
  92. */
  93. BOOL CRTFRead::InitLex()
  94. {
  95. TRACEBEGIN(TRCSUBSYSRTFR, TRCSCOPEINTERN, "CRTFRead::InitLex");
  96. AssertSz(cKeywords == i_TokenIndexMax,
  97. "Keyword index enumeration is incompatible with rgKeyword[]");
  98. Assert(!_szText && !_pchRTFBuffer);
  99. // Allocate our buffers with an extra byte for szText so that hex
  100. // conversion doesn't have to worry about running off the end if the
  101. // first char is NULL
  102. if ((_szText = (BYTE *)PvAlloc(cachTextMax + 1, GMEM_ZEROINIT)) &&
  103. (_pchRTFBuffer = (BYTE *)PvAlloc(cachBufferMost, GMEM_ZEROINIT)))
  104. {
  105. return TRUE; // Signal that lexer is initialized
  106. }
  107. _ped->GetCallMgr()->SetOutOfMemory();
  108. _ecParseError = ecLexInitFailed;
  109. return FALSE;
  110. }
  111. /*
  112. * CRTFRead::DeinitLex()
  113. *
  114. * @mfunc
  115. * Shut down lexical analyzer
  116. */
  117. void CRTFRead::DeinitLex()
  118. {
  119. TRACEBEGIN(TRCSUBSYSRTFR, TRCSCOPEINTERN, "CRTFRead::DeinitLex");
  120. #ifdef KEYWORD_RESOURCE
  121. if (hglbKeywords)
  122. {
  123. FreeResource(hglbKeywords);
  124. hglbKeywords = NULL;
  125. rgKeyword = NULL;
  126. }
  127. #endif
  128. FreePv(_szText);
  129. FreePv(_pchRTFBuffer);
  130. }
  131. /*
  132. * CRTFRead::GetChar()
  133. *
  134. * @mfunc
  135. * Get next char, filling buffer as needed
  136. *
  137. * @rdesc
  138. * BYTE nonzero char value if success; else 0
  139. */
  140. BYTE CRTFRead::GetChar()
  141. {
  142. TRACEBEGIN(TRCSUBSYSRTFR, TRCSCOPEINTERN, "CRTFRead::GetChar");
  143. if (_pchRTFCurrent == _pchRTFEnd && !FillBuffer())
  144. {
  145. _ecParseError = ecUnexpectedEOF;
  146. return 0;
  147. }
  148. return *_pchRTFCurrent++;
  149. }
  150. /*
  151. * CRTFRead::FillBuffer()
  152. *
  153. * @mfunc
  154. * Fill RTF buffer & return != 0 if successful
  155. *
  156. * @rdesc
  157. * LONG # chars read
  158. *
  159. * @comm
  160. * This routine doesn't bother copying anything down if
  161. * pchRTFCurrent <lt> pchRTFEnd so anything not read yet is lost.
  162. * The only exception to this is that it always copies down the
  163. * last two bytes read so that UngetChar() will work. ReadData()
  164. * actually counts on this behavior, so if you change it, change
  165. * ReadData() accordingly.
  166. */
  167. LONG CRTFRead::FillBuffer()
  168. {
  169. TRACEBEGIN(TRCSUBSYSRTFR, TRCSCOPEINTERN, "CRTFRead::FillBuffer");
  170. LONG cchRead;
  171. if (!_pchRTFCurrent)
  172. {
  173. // No data yet, nothing for backup
  174. // Leave cbBackupMax NULL chars so backup
  175. // area of buffer doesn't contain garbage.
  176. for(int i = 0; i < cbBackupMax; i++)
  177. {
  178. _pchRTFBuffer[i] = 0;
  179. }
  180. }
  181. else
  182. {
  183. Assert(_pchRTFCurrent == _pchRTFEnd);
  184. // Copy most recently read chars in case
  185. // we need to back up
  186. int cbBackup = min((UINT) cbBackupMax, DiffPtrs(_pchRTFCurrent, &_pchRTFBuffer[cbBackupMax]));
  187. int i;
  188. for(i = -1; i >= -cbBackup; i--)
  189. _pchRTFBuffer[cbBackupMax + i] = _pchRTFCurrent[i];
  190. if(cbBackup < cbBackupMax)
  191. {
  192. // NULL before the first valid character in the backup buffer
  193. _pchRTFBuffer[cbBackupMax + i] = 0;
  194. }
  195. }
  196. _pchRTFCurrent = &_pchRTFBuffer[cbBackupMax];
  197. // Fill buffer with as much as we can take given our starting offset
  198. _pes->dwError = _pes->pfnCallback(_pes->dwCookie,
  199. _pchRTFCurrent,
  200. cachBufferMost - cbBackupMax,
  201. &cchRead);
  202. if (_pes->dwError)
  203. {
  204. TRACEERRSZSC("RTFLEX: GetChar()", _pes->dwError);
  205. _ecParseError = ecGeneralFailure;
  206. return 0;
  207. }
  208. _pchRTFEnd = &_pchRTFBuffer[cbBackupMax + cchRead]; // Point the end
  209. #if defined(DEBUG) && !defined(MACPORT)
  210. if(_hfileCapture)
  211. {
  212. DWORD cbLeftToWrite = cchRead;
  213. DWORD cbWritten = 0;
  214. BYTE *pbToWrite = (BYTE *)_pchRTFCurrent;
  215. while(WriteFile(_hfileCapture,
  216. pbToWrite,
  217. cbLeftToWrite,
  218. &cbWritten,
  219. NULL) &&
  220. (pbToWrite += cbWritten,
  221. (cbLeftToWrite -= cbWritten)));
  222. }
  223. #endif
  224. return cchRead;
  225. }
  226. /*
  227. * CRTFRead::UngetChar()
  228. *
  229. * @mfunc
  230. * Bump our file pointer back one char
  231. *
  232. * @rdesc
  233. * BOOL TRUE on success
  234. *
  235. * @comm
  236. * You can safely UngetChar _at most_ cbBackupMax times without
  237. * error.
  238. */
  239. BOOL CRTFRead::UngetChar()
  240. {
  241. TRACEBEGIN(TRCSUBSYSRTFR, TRCSCOPEINTERN, "CRTFRead::UngetChar");
  242. if (_pchRTFCurrent == _pchRTFBuffer || !_pchRTFCurrent)
  243. {
  244. Assert(0);
  245. _ecParseError = ecUnGetCharFailed;
  246. return FALSE;
  247. }
  248. --_pchRTFCurrent;
  249. return TRUE;
  250. }
  251. /*
  252. * CRTFRead::UngetChar(cch)
  253. *
  254. * @mfunc
  255. * Bump our file pointer back 'cch' chars
  256. *
  257. * @rdesc
  258. * BOOL TRUE on success
  259. *
  260. * @comm
  261. * You can safely UngetChar _at most_ cbBackupMax times without
  262. * error.
  263. */
  264. BOOL CRTFRead::UngetChar(UINT cch)
  265. {
  266. TRACEBEGIN(TRCSUBSYSRTFR, TRCSCOPEINTERN, "CRTFRead::UngetChar");
  267. AssertSz(cch <= cbBackupMax, "CRTFRead::UngetChar(): Number of UngetChar's "
  268. "exceeds size of backup buffer.");
  269. while(cch-- > 0)
  270. {
  271. if(!UngetChar())
  272. return FALSE;
  273. }
  274. return TRUE;
  275. }
  276. /*
  277. * CRTFRead::GetHex()
  278. *
  279. * @mfunc
  280. * Get next char if hex and return hex value
  281. * If not hex, leave char in buffer and return 255
  282. *
  283. * @rdesc
  284. * BYTE hex value of GetChar() if hex; else 255
  285. */
  286. BYTE CRTFRead::GetHex()
  287. {
  288. TRACEBEGIN(TRCSUBSYSRTFR, TRCSCOPEINTERN, "CRTFRead::GetHex");
  289. BYTE ch = GetChar();
  290. if(IsXDigit(ch))
  291. return (BYTE)(ch <= '9' ? ch - '0' : (ch & 0x4f) - 'A' + 10);
  292. if(ch)
  293. UngetChar();
  294. return 255;
  295. }
  296. /*
  297. * CRTFRead::GetHexSkipCRLF()
  298. *
  299. * @mfunc
  300. * Get next char if hex and return hex value
  301. * If not hex, leave char in buffer and return 255
  302. *
  303. * @rdesc
  304. * BYTE hex value of GetChar() if hex; else 255
  305. *
  306. * @devnote
  307. * Keep this in sync with GetHex above.
  308. */
  309. BYTE CRTFRead::GetHexSkipCRLF()
  310. {
  311. TRACEBEGIN(TRCSUBSYSRTFR, TRCSCOPEINTERN, "CRTFRead::GetHexSkipCRLF");
  312. BYTE ch = GetChar();
  313. // Skip \r \n
  314. while(ch == CR || ch == LF)
  315. ch = GetChar();
  316. // Rest is same as CRTFRead::GetHex()
  317. if(IsXDigit(ch))
  318. return (BYTE)(ch <= '9' ? ch - '0' : (ch & 0x4f) - 'A' + 10);
  319. if(ch)
  320. UngetChar();
  321. return 255;
  322. }
  323. /*
  324. * CRTFRead::TokenGetHex()
  325. *
  326. * @mfunc
  327. * Get an 8 bit character saved as a 2 hex digit value
  328. *
  329. * @rdesc
  330. * TOKEN value of hex number read in
  331. */
  332. TOKEN CRTFRead::TokenGetHex()
  333. {
  334. TRACEBEGIN(TRCSUBSYSRTFR, TRCSCOPEINTERN, "CRTFRead::TokenGetHex");
  335. BYTE bChar0 = GetHex();
  336. BYTE bChar1;
  337. if(bChar0 < 16 && (bChar1 = GetHex()) < 16)
  338. _token = (WORD)(bChar0 << 4 | bChar1);
  339. else
  340. _token = tokenError;
  341. return _token;
  342. }
  343. /*
  344. * CRTFRead::SkipToEndOfGroup()
  345. *
  346. * @mfunc
  347. * Skip to end of current group
  348. *
  349. * @rdesc
  350. * EC An error code
  351. */
  352. EC CRTFRead::SkipToEndOfGroup()
  353. {
  354. TRACEBEGIN(TRCSUBSYSRTFR, TRCSCOPEINTERN, "CRTFRead::SkipToEndOfGroup");
  355. INT nDepth = 1;
  356. BYTE ach;
  357. while(TRUE)
  358. {
  359. ach = GetChar();
  360. switch(ach)
  361. {
  362. case BSLASH:
  363. {
  364. BYTE achNext = GetChar();
  365. // EOF: goto done; else ignore NULLs
  366. if(!achNext && _ecParseError == ecUnexpectedEOF)
  367. goto done;
  368. if(achNext == 'b' && UngetChar() &&
  369. TokenGetKeyword() == tokenBinaryData)
  370. {
  371. // We've encountered the \binN tag in the RTF we want
  372. // to skip. _iParam contains N from \binN once the
  373. // tag is parsed by TokenGetKeyword()
  374. SkipBinaryData(_iParam);
  375. }
  376. break;
  377. }
  378. case LBRACE:
  379. nDepth++;
  380. break;
  381. case RBRACE:
  382. if (--nDepth <= 0)
  383. goto done;
  384. break;
  385. case 0:
  386. if(_ecParseError == ecUnexpectedEOF)
  387. goto done;
  388. default:
  389. // Detect Lead bytes here.
  390. int cTrailBytes = GetTrailBytesCount(ach, _nCodePage);
  391. if (cTrailBytes)
  392. {
  393. for (int i = 0; i < cTrailBytes; i++)
  394. {
  395. ach = GetChar();
  396. if(ach == 0 && _ecParseError == ecUnexpectedEOF)
  397. goto done;
  398. }
  399. }
  400. break;
  401. }
  402. }
  403. Assert(!_ecParseError);
  404. _ecParseError = ecUnexpectedEOF;
  405. done:
  406. return _ecParseError;
  407. }
  408. /*
  409. * CRTFRead::TokenFindKeyword(szKeyword)
  410. *
  411. * @mfunc
  412. * Find keyword <p szKeyword> and return its token value
  413. *
  414. * @rdesc
  415. * TOKEN token number of keyword
  416. */
  417. TOKEN CRTFRead::TokenFindKeyword(
  418. BYTE * szKeyword) // @parm Keyword to find
  419. {
  420. TRACEBEGIN(TRCSUBSYSRTFR, TRCSCOPEINTERN, "CRTFRead::TokenFindKeyword");
  421. INT iMin;
  422. INT iMax;
  423. INT iMid;
  424. INT nComp;
  425. BYTE * pchCandidate;
  426. BYTE * pchKeyword;
  427. const KEYWORD * pk;
  428. AssertSz(szKeyword[0],
  429. "CRTFRead::TokenFindKeyword: null keyword");
  430. #ifdef RTF_HASHCACHE
  431. if ( _rtfHashInited )
  432. {
  433. // Hash is 23% faster than the following binary search on finds
  434. // and 55% faster on misses: For 97 words stored in a 257 cache.
  435. // Performance numbers will change when the total stored goes up.
  436. pk = HashKeyword_Fetch ( (CHAR *) szKeyword );
  437. }
  438. else
  439. #endif
  440. {
  441. iMin = 0;
  442. iMax = cKeywords - 1;
  443. pk = NULL;
  444. do // Note (MS3): Hash would be quicker than binary search
  445. {
  446. iMid = (iMin + iMax) / 2;
  447. pchCandidate = (BYTE *)rgKeyword[iMid].szKeyword;
  448. pchKeyword = szKeyword;
  449. while (!(nComp = REToLower(*pchKeyword) - *pchCandidate) // Be sure to match
  450. && *pchKeyword) // terminating 0's
  451. {
  452. pchKeyword++;
  453. pchCandidate++;
  454. }
  455. if (nComp < 0)
  456. iMax = iMid - 1;
  457. else if (nComp)
  458. iMin = iMid + 1;
  459. else
  460. {
  461. pk = &rgKeyword[iMid];
  462. break;
  463. }
  464. } while (iMin <= iMax);
  465. }
  466. if(pk)
  467. {
  468. _token = pk->token;
  469. // here, we log the RTF keyword scan to aid in tracking RTF tag ocverage
  470. // TODO: Implement RTF tag logging for the Mac and WinCE
  471. #if defined(DEBUG) && !defined(MACPORT) && !defined(PEGASUS)
  472. if(_prtflg)
  473. {
  474. #ifdef RTF_HASCACHE
  475. _prtflg->AddAt(szKeyword);
  476. #else
  477. _prtflg->AddAt((size_t)iMid);
  478. #endif
  479. }
  480. #endif
  481. }
  482. else
  483. _token = tokenUnknownKeyword; // No match: TODO: place to take
  484. return _token; // care of unrecognized RTF
  485. }
  486. /*
  487. * CRTFRead::TokenGetKeyword()
  488. *
  489. * @mfunc
  490. * Collect a keyword and its parameter. Return token's keyword
  491. *
  492. * @rdesc
  493. * TOKEN token number of keyword
  494. *
  495. * @comm
  496. * Most RTF control words (keywords) consist of a span of lower-case
  497. * ASCII letters possibly followed by a span of decimal digits. Other
  498. * control words consist of a single character that isn't LC ASCII. No
  499. * control words contain upper-case characters.
  500. */
  501. TOKEN CRTFRead::TokenGetKeyword()
  502. {
  503. TRACEBEGIN(TRCSUBSYSRTFR, TRCSCOPEINTERN, "CRTFRead::TokenGetKeyword");
  504. BYTE ach = GetChar();
  505. BYTE *pach;
  506. SHORT cachKeyword = 1;
  507. BYTE szKeyword[cachKeywordMax];
  508. _szParam[0] = '\0'; // Clear parameter
  509. _iParam = 0;
  510. if(!IsAlphaChar(ach)) // Not alpha, i.e.,
  511. { // single char
  512. if (ach == '\'') // Most common case needs
  513. { // special treatment
  514. // Convert hex to char and store result in _token
  515. if(TokenGetHex() == tokenError)
  516. {
  517. _ecParseError = ecUnexpectedChar;
  518. goto TokenError;
  519. }
  520. if((_token == CR || _token == LF) && FInDocTextDest())
  521. {
  522. // Add raw CR or LF in the byte stream as a \par
  523. return tokenEndParagraph;
  524. }
  525. }
  526. else
  527. {
  528. // Check for other known symbols
  529. const BYTE *pachSym = szSymbolKeywords;
  530. while(ach != *pachSym && *pachSym)
  531. pachSym++;
  532. if(*pachSym) // Found one
  533. {
  534. _token = tokenSymbol[pachSym - szSymbolKeywords];
  535. if(_token > 0x7F) // Token or larger Unicode
  536. return _token; // value
  537. }
  538. else if (!ach) // No more input chars
  539. goto TokenError;
  540. else // Code for unrecognized RTF
  541. _token = ach; // We'll just insert it for now
  542. }
  543. _token = TokenGetText((BYTE)_token);
  544. return _token;
  545. }
  546. szKeyword[0] = ach; // Collect keyword that starts
  547. pach = szKeyword + 1; // with ASCII
  548. while (cachKeyword < cachKeywordMax &&
  549. IsAlphaChar(ach = GetChar()))
  550. {
  551. cachKeyword++;
  552. *pach++ = ach;
  553. }
  554. if (cachKeyword == cachKeywordMax)
  555. {
  556. _ecParseError = ecKeywordTooLong;
  557. goto TokenError;
  558. }
  559. *pach = '\0'; // Terminate keyword
  560. if (IsDigit(ach) || ach == '-') // Collect parameter
  561. {
  562. pach = _szParam;
  563. *pach++ = ach;
  564. if(ach != '-')
  565. _iParam = ach - '0'; // Get parameter value
  566. while (IsDigit(ach = GetChar()))
  567. {
  568. _iParam = _iParam*10 + ach - '0';
  569. *pach++ = ach;
  570. }
  571. *pach = '\0'; // Terminate parameter string
  572. if (_szParam[0] == '-')
  573. _iParam = -_iParam;
  574. }
  575. if (!_ecParseError && // We overshot:
  576. (ach == ' ' || UngetChar())) // if not ' ', unget char
  577. return TokenFindKeyword(szKeyword); // Find and return keyword
  578. TokenError:
  579. TRACEERRSZSC("TokenGetKeyword()", _ecParseError);
  580. return _token = tokenError;
  581. }
  582. /*
  583. * CRTFRead::TokenGetText(ach)
  584. *
  585. * @mfunc
  586. * Collect a string of text starting with the char <p ach> and treat as a
  587. * single token. The string ends when a LBRACE, RBRACE, or single '\\' is found.
  588. *
  589. * @devnote
  590. * We peek past the '\\' for \\'xx, which we decode and keep on going;
  591. * else we return in a state where the next character is the '\\'.
  592. *
  593. * @rdesc
  594. * TOKEN Token number of next token (tokenText or tokenError)
  595. */
  596. TOKEN CRTFRead::TokenGetText(
  597. BYTE ach) // @parm First char of 8-bit text string
  598. {
  599. TRACEBEGIN(TRCSUBSYSRTFR, TRCSCOPEINTERN, "CRTFRead::TokenGetText");
  600. BYTE * pach = _szText;
  601. SHORT cachText = 0;
  602. LONG CodePage = _pstateStackTop->nCodePage;
  603. BOOL fAllASCII = TRUE;
  604. int cTrailBytesNeeded = 0;
  605. _token = tokenError; // Default error
  606. // FUTURE(BradO): This 'goto' into a while loop is pretty weak.
  607. // Restructure this 'while' loop such that the 'goto' is removed.
  608. // Add character passed into routine
  609. goto add;
  610. // If cTrailBytesNeeded is non-zero, we need to get all the trail bytes. Otherwise,
  611. // a string end in the middle of a DBC or UTF-8 will cause bad display/print problem
  612. // - 5 to allow extra space for up to 4 bytes for UTF-8 and Null char
  613. while (cachText < cachTextMax - 5 || cTrailBytesNeeded)
  614. {
  615. ach = GetChar();
  616. switch (ach)
  617. {
  618. case BSLASH:
  619. {
  620. // FUTURE(BradO): This code looks ALOT like TokenGetKeyword.
  621. // We should combine the two into a common routine.
  622. BYTE achNext;
  623. // Get char after BSLASH
  624. achNext = GetChar();
  625. if(!achNext)
  626. goto error;
  627. if(achNext == '\'') // Handle most frequent
  628. { // case here
  629. if(TokenGetHex() == tokenError)
  630. {
  631. if(cTrailBytesNeeded)
  632. {
  633. // The trail-byte must be a raw BSLASH.
  634. // Unget the single-quote.
  635. if(!UngetChar())
  636. goto error;
  637. // fall through to add BSLASH
  638. }
  639. else
  640. {
  641. _ecParseError = ecUnexpectedChar;
  642. goto error;
  643. }
  644. }
  645. else
  646. {
  647. ach = (BYTE)_token;
  648. if (cTrailBytesNeeded == 0 && (ach == CR || ach == LF) &&
  649. FInDocTextDest())
  650. {
  651. // Here, we have a raw CR or LF in document text.
  652. // Unget the whole lot of characters and bail out.
  653. // TokenGetKeyword will convert this CR or LF into
  654. // a \par.
  655. if(!UngetChar(4))
  656. goto error;
  657. goto done;
  658. }
  659. }
  660. goto add;
  661. }
  662. // Check next byte against list of RTF symbol
  663. // NOTE:- we need to check for RTF symbol even if we
  664. // are expecting a trail byte. According to the rtf spec,
  665. // we cannot just take this backslash as trail byte.
  666. // HWC 9/97
  667. const BYTE *pachSymbol = szSymbolKeywords;
  668. while(achNext != *pachSymbol && *pachSymbol)
  669. pachSymbol++;
  670. TOKEN tokenTmp;
  671. if (*pachSymbol &&
  672. (tokenTmp = tokenSymbol[pachSymbol - szSymbolKeywords])
  673. <= 0x7F)
  674. {
  675. ach = (BYTE)tokenTmp;
  676. goto add;
  677. }
  678. // In either of the last two cases below, we will want
  679. // to unget the byte following the BSLASH
  680. if(!UngetChar())
  681. goto error;
  682. if(cTrailBytesNeeded && !IsAlphaChar(achNext))
  683. {
  684. // In this situation, either this BSLASH begins the next
  685. // RTF keyword or it is a raw BSLASH which is the trail
  686. // byte for a DBCS character.
  687. // I think a fair assumption here is that if an alphanum
  688. // follows the BSLASH, that the BSLASH begins the next
  689. // RTF keyword.
  690. // add the raw BSLASH
  691. goto add;
  692. }
  693. // Here, my guess is that the BSLASH begins the next RTF
  694. // keyword, so unget the BSLASH
  695. if(!UngetChar())
  696. goto error;
  697. goto done;
  698. }
  699. case LBRACE: // End of text string
  700. case RBRACE:
  701. if(cTrailBytesNeeded)
  702. {
  703. // Previous char was a lead-byte of a DBCS pair or UTF-8, which
  704. // makes this char a raw trail-byte.
  705. goto add;
  706. }
  707. if(!UngetChar()) // Unget delimeter
  708. goto error;
  709. goto done;
  710. case LF: // Throw away noise chars
  711. case CR:
  712. break;
  713. case 0:
  714. if(_ecParseError == ecUnexpectedEOF)
  715. goto done;
  716. ach = ' '; // Replace NULL by blank
  717. default: // Collect chars
  718. add:
  719. // Outstanding chars to be skipped after \uN tag
  720. if(_cbSkipForUnicode)
  721. {
  722. _cbSkipForUnicode--;
  723. continue;
  724. }
  725. *pach++ = ach;
  726. ++cachText;
  727. if(ach > 0x7F)
  728. fAllASCII = FALSE;
  729. // Check if we are expecting more trail bytes
  730. if (cTrailBytesNeeded)
  731. cTrailBytesNeeded--;
  732. else
  733. cTrailBytesNeeded = GetTrailBytesCount(ach, CodePage);
  734. Assert(cTrailBytesNeeded >= 0);
  735. }
  736. }
  737. done:
  738. _token = (WORD)(fAllASCII ? tokenASCIIText : tokenText);
  739. *pach = '\0'; // Terminate token string
  740. error:
  741. return _token;
  742. }
  743. /*
  744. * CRTFRead::TokenGetToken()
  745. *
  746. * @mfunc
  747. * This function reads in next token from input stream
  748. *
  749. * @rdesc
  750. * TOKEN token number of next token
  751. */
  752. TOKEN CRTFRead::TokenGetToken()
  753. {
  754. TRACEBEGIN(TRCSUBSYSRTFR, TRCSCOPEINTERN, "CRTFRead::TokenGetToken");
  755. BYTE ach;
  756. _tokenLast = _token; // Used by \* destinations and FE
  757. _token = tokenEOF; // Default end-of-file
  758. SkipNoise:
  759. ach = GetChar();
  760. switch (ach)
  761. {
  762. case CR:
  763. case LF:
  764. goto SkipNoise;
  765. case LBRACE:
  766. _token = tokenStartGroup;
  767. break;
  768. case RBRACE:
  769. _token = tokenEndGroup;
  770. break;
  771. case BSLASH:
  772. _token = TokenGetKeyword();
  773. break;
  774. case 0:
  775. if(_ecParseError == ecUnexpectedEOF)
  776. break;
  777. ach = ' '; // Replace NULL by blank
  778. // Fall thru to default
  779. default:
  780. if( !_pstateStackTop )
  781. {
  782. TRACEWARNSZ("Unexpected token in rtf file");
  783. Assert(_token == tokenEOF);
  784. if (_ped->Get10Mode())
  785. _ecParseError = ecUnexpectedToken; // Signal bad file
  786. }
  787. else if (_pstateStackTop->sDest == destObjectData ||
  788. _pstateStackTop->sDest == destPicture )
  789. // not text but data
  790. {
  791. _token = (WORD)(tokenObjectDataValue + _pstateStackTop->sDest
  792. - destObjectData);
  793. UngetChar();
  794. }
  795. else
  796. _token = TokenGetText(ach);
  797. }
  798. return _token;
  799. }
  800. /*
  801. * CRTFRead::FInDocTextDest()
  802. *
  803. * @mfunc
  804. * Returns a BOOL indicating if the current destination is one in which
  805. * we would encounter document text.
  806. *
  807. * @rdesc
  808. * BOOL indicates the current destination may contain document text.
  809. */
  810. BOOL CRTFRead::FInDocTextDest() const
  811. {
  812. switch(_pstateStackTop->sDest)
  813. {
  814. case destRTF:
  815. case destField:
  816. case destFieldResult:
  817. case destFieldInstruction:
  818. case destParaNumbering:
  819. case destParaNumText:
  820. case destNULL:
  821. return TRUE;
  822. case destFontTable:
  823. case destRealFontName:
  824. case destObjectClass:
  825. case destObjectName:
  826. case destFollowingPunct:
  827. case destLeadingPunct:
  828. case destColorTable:
  829. case destBinary:
  830. case destObject:
  831. case destObjectData:
  832. case destPicture:
  833. case destDocumentArea:
  834. return FALSE;
  835. default:
  836. AssertSz(0, "CRTFRead::FInDocTextDest(): New destination "
  837. "encountered - update enum in _rtfread.h");
  838. return TRUE;
  839. }
  840. }