Leaked source code of windows server 2003
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

975 lines
23 KiB

  1. /*
  2. * @doc INTERNAL
  3. *
  4. * @module RTFLEX.CPP - RichEdit RTF reader lexical analyzer |
  5. *
  6. * This file contains the implementation of the lexical analyzer part of
  7. * the RTF reader.
  8. *
  9. * Authors: <nl>
  10. * Original RichEdit 1.0 RTF converter: Anthony Francisco <nl>
  11. * Conversion to C++ and RichEdit 2.0: Murray Sargent <nl>
  12. *
  13. * @devnote
 * All sz's in the RTF*.? files refer to LPSTRs, not LPTSTRs, unless
 * noted as szUnicode.
  16. *
  17. * Copyright (c) 1995-1997, Microsoft Corporation. All rights reserved.
  18. */
  19. #include "_common.h"
  20. #include "_rtfread.h"
  21. #include "hash.h"
  22. ASSERTDATA
  23. #include "tokens.cpp"
  24. // Array used by character classification macros to speed classification
  25. // of chars residing in two or more discontiguous ranges, e.g., alphanumeric
  26. // or hex. The alphabetics used in RTF control words are lower-case ASCII.
  27. // *** DO NOT DBCS rgbCharClass[] ***
  28. #define fCS fCT + fSP
  29. #define fSB fBL + fSP
  30. #define fHD fHX + fDG
  31. #define fHU fHX + fUC
  32. #define fHL fHX + fLC
  33. const BYTE rgbCharClass[256] =
  34. {
  35. fCT,fCT,fCT,fCT,fCT,fCT,fCT,fCT, fCT,fCS,fCS,fCS,fCS,fCS,fCT,fCT,
  36. fCT,fCT,fCT,fCT,fCT,fCT,fCT,fCT, fCT,fCT,fCT,fCT,fCT,fCT,fCT,fCT,
  37. fSB,fPN,fPN,fPN,fPN,fPN,fPN,fPN, fPN,fPN,fPN,fPN,fPN,fPN,fPN,fPN,
  38. fHD,fHD,fHD,fHD,fHD,fHD,fHD,fHD, fHD,fHD,fPN,fPN,fPN,fPN,fPN,fPN,
  39. fPN,fHU,fHU,fHU,fHU,fHU,fHU,fUC, fUC,fUC,fUC,fUC,fUC,fUC,fUC,fUC,
  40. fUC,fUC,fUC,fUC,fUC,fUC,fUC,fUC, fUC,fUC,fUC,fPN,fPN,fPN,fPN,fPN,
  41. fPN,fHL,fHL,fHL,fHL,fHL,fHL,fLC, fLC,fLC,fLC,fLC,fLC,fLC,fLC,fLC,
  42. fLC,fLC,fLC,fLC,fLC,fLC,fLC,fLC, fLC,fLC,fLC,fPN,fPN,fPN,fPN,fPN,
  43. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  44. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  45. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  46. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  47. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  48. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  49. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  50. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  51. };
  52. const char szRTFSig[] = "rtf";
  53. #define cchRTFSig 3
  54. #define cbRTFSig (cchRTFSig * sizeof(char))
  55. // Specifies the number of bytes we can safely "UngetChar"
  56. // before possibly underflowing the buffer.
  57. const int cbBackupMax = 4;
  58. // Bug2298 - I found an RTF writer which emits uppercase RTF keywords,
  59. // so I had to change IsLCAscii to IsAlphaChar for use in scanning
  60. // for RTF keywords.
  61. inline BOOL IsAlphaChar(BYTE b)
  62. {
  63. return IN_RANGE('a', b, 'z') || IN_RANGE('A', b, 'Z');
  64. }
  65. // Quick and dirty tolower(b)
  66. inline BYTE REToLower(BYTE b)
  67. {
  68. Assert(!b || IsAlphaChar(b));
  69. return b ? (BYTE)(b | 0x20) : 0;
  70. }
  71. extern BOOL IsRTF(char *pstr);
  72. BOOL IsRTF(
  73. char *pstr)
  74. {
  75. if(!pstr || *pstr++ != '{' || *pstr++ != '\\')
  76. return FALSE; // Quick out for most common cases
  77. if(*pstr == 'u') // Bypass u of possible urtf
  78. pstr++;
  79. return !CompareMemory(szRTFSig, pstr, cbRTFSig);
  80. }
  81. /*
  82. * CRTFRead::InitLex()
  83. *
  84. * @mfunc
  85. * Initialize the lexical analyzer. Reset the variables. if reading in
  86. * from resource file, sort the keyword list (). Uses global hinstRE
  87. * from the RichEdit to find out where its resources are. Note: in
  88. * RichEdit 2.0, currently the resource option is not supported.
  89. *
  90. * @rdesc
  91. * TRUE If lexical analyzer was initialized
  92. */
  93. BOOL CRTFRead::InitLex()
  94. {
  95. TRACEBEGIN(TRCSUBSYSRTFR, TRCSCOPEINTERN, "CRTFRead::InitLex");
  96. AssertSz(cKeywords == i_TokenIndexMax,
  97. "Keyword index enumeration is incompatible with rgKeyword[]");
  98. Assert(!_szText && !_pchRTFBuffer);
  99. // Allocate our buffers with an extra byte for szText so that hex
  100. // conversion doesn't have to worry about running off the end if the
  101. // first char is NULL
  102. if ((_szText = (BYTE *)PvAlloc(cachTextMax + 1, GMEM_ZEROINIT)) &&
  103. (_pchRTFBuffer = (BYTE *)PvAlloc(cachBufferMost, GMEM_ZEROINIT)))
  104. {
  105. return TRUE; // Signal that lexer is initialized
  106. }
  107. _ped->GetCallMgr()->SetOutOfMemory();
  108. _ecParseError = ecLexInitFailed;
  109. return FALSE;
  110. }
  111. /*
  112. * CRTFRead::DeinitLex()
  113. *
  114. * @mfunc
  115. * Shut down lexical analyzer
  116. */
  117. void CRTFRead::DeinitLex()
  118. {
  119. TRACEBEGIN(TRCSUBSYSRTFR, TRCSCOPEINTERN, "CRTFRead::DeinitLex");
  120. #ifdef KEYWORD_RESOURCE
  121. if (hglbKeywords)
  122. {
  123. FreeResource(hglbKeywords);
  124. hglbKeywords = NULL;
  125. rgKeyword = NULL;
  126. }
  127. #endif
  128. FreePv(_szText);
  129. FreePv(_pchRTFBuffer);
  130. }
  131. /*
  132. * CRTFRead::GetChar()
  133. *
  134. * @mfunc
  135. * Get next char, filling buffer as needed
  136. *
  137. * @rdesc
  138. * BYTE nonzero char value if success; else 0
  139. */
  140. BYTE CRTFRead::GetChar()
  141. {
  142. TRACEBEGIN(TRCSUBSYSRTFR, TRCSCOPEINTERN, "CRTFRead::GetChar");
  143. if (_pchRTFCurrent == _pchRTFEnd && !FillBuffer())
  144. {
  145. _ecParseError = ecUnexpectedEOF;
  146. return 0;
  147. }
  148. return *_pchRTFCurrent++;
  149. }
  150. /*
  151. * CRTFRead::FillBuffer()
  152. *
  153. * @mfunc
  154. * Fill RTF buffer & return != 0 if successful
  155. *
  156. * @rdesc
  157. * LONG # chars read
  158. *
  159. * @comm
  160. * This routine doesn't bother copying anything down if
  161. * pchRTFCurrent <lt> pchRTFEnd so anything not read yet is lost.
  162. * The only exception to this is that it always copies down the
  163. * last two bytes read so that UngetChar() will work. ReadData()
  164. * actually counts on this behavior, so if you change it, change
  165. * ReadData() accordingly.
  166. */
  167. LONG CRTFRead::FillBuffer()
  168. {
  169. TRACEBEGIN(TRCSUBSYSRTFR, TRCSCOPEINTERN, "CRTFRead::FillBuffer");
  170. LONG cchRead;
  171. if (!_pchRTFCurrent)
  172. {
  173. // No data yet, nothing for backup
  174. // Leave cbBackupMax NULL chars so backup
  175. // area of buffer doesn't contain garbage.
  176. for(int i = 0; i < cbBackupMax; i++)
  177. {
  178. _pchRTFBuffer[i] = 0;
  179. }
  180. }
  181. else
  182. {
  183. Assert(_pchRTFCurrent == _pchRTFEnd);
  184. // Copy most recently read chars in case
  185. // we need to back up
  186. int cbBackup = min((UINT) cbBackupMax, DiffPtrs(_pchRTFCurrent, &_pchRTFBuffer[cbBackupMax]));
  187. int i;
  188. for(i = -1; i >= -cbBackup; i--)
  189. _pchRTFBuffer[cbBackupMax + i] = _pchRTFCurrent[i];
  190. if(cbBackup < cbBackupMax)
  191. {
  192. // NULL before the first valid character in the backup buffer
  193. _pchRTFBuffer[cbBackupMax + i] = 0;
  194. }
  195. }
  196. _pchRTFCurrent = &_pchRTFBuffer[cbBackupMax];
  197. // Fill buffer with as much as we can take given our starting offset
  198. _pes->dwError = _pes->pfnCallback(_pes->dwCookie,
  199. _pchRTFCurrent,
  200. cachBufferMost - cbBackupMax,
  201. &cchRead);
  202. if (_pes->dwError)
  203. {
  204. TRACEERRSZSC("RTFLEX: GetChar()", _pes->dwError);
  205. _ecParseError = ecGeneralFailure;
  206. return 0;
  207. }
  208. _pchRTFEnd = &_pchRTFBuffer[cbBackupMax + cchRead]; // Point the end
  209. #if defined(DEBUG) && !defined(MACPORT)
  210. if(_hfileCapture)
  211. {
  212. DWORD cbLeftToWrite = cchRead;
  213. DWORD cbWritten = 0;
  214. BYTE *pbToWrite = (BYTE *)_pchRTFCurrent;
  215. while(WriteFile(_hfileCapture,
  216. pbToWrite,
  217. cbLeftToWrite,
  218. &cbWritten,
  219. NULL) &&
  220. (pbToWrite += cbWritten,
  221. (cbLeftToWrite -= cbWritten)));
  222. }
  223. #endif
  224. return cchRead;
  225. }
  226. /*
  227. * CRTFRead::UngetChar()
  228. *
  229. * @mfunc
  230. * Bump our file pointer back one char
  231. *
  232. * @rdesc
  233. * BOOL TRUE on success
  234. *
  235. * @comm
  236. * You can safely UngetChar _at most_ cbBackupMax times without
  237. * error.
  238. */
  239. BOOL CRTFRead::UngetChar()
  240. {
  241. TRACEBEGIN(TRCSUBSYSRTFR, TRCSCOPEINTERN, "CRTFRead::UngetChar");
  242. if (_pchRTFCurrent == _pchRTFBuffer || !_pchRTFCurrent)
  243. {
  244. Assert(0);
  245. _ecParseError = ecUnGetCharFailed;
  246. return FALSE;
  247. }
  248. --_pchRTFCurrent;
  249. return TRUE;
  250. }
  251. /*
  252. * CRTFRead::UngetChar(cch)
  253. *
  254. * @mfunc
  255. * Bump our file pointer back 'cch' chars
  256. *
  257. * @rdesc
  258. * BOOL TRUE on success
  259. *
  260. * @comm
  261. * You can safely UngetChar _at most_ cbBackupMax times without
  262. * error.
  263. */
  264. BOOL CRTFRead::UngetChar(UINT cch)
  265. {
  266. TRACEBEGIN(TRCSUBSYSRTFR, TRCSCOPEINTERN, "CRTFRead::UngetChar");
  267. AssertSz(cch <= cbBackupMax, "CRTFRead::UngetChar(): Number of UngetChar's "
  268. "exceeds size of backup buffer.");
  269. while(cch-- > 0)
  270. {
  271. if(!UngetChar())
  272. return FALSE;
  273. }
  274. return TRUE;
  275. }
  276. /*
  277. * CRTFRead::GetHex()
  278. *
  279. * @mfunc
  280. * Get next char if hex and return hex value
  281. * If not hex, leave char in buffer and return 255
  282. *
  283. * @rdesc
  284. * BYTE hex value of GetChar() if hex; else 255
  285. */
  286. BYTE CRTFRead::GetHex()
  287. {
  288. TRACEBEGIN(TRCSUBSYSRTFR, TRCSCOPEINTERN, "CRTFRead::GetHex");
  289. BYTE ch = GetChar();
  290. if(IsXDigit(ch))
  291. return (BYTE)(ch <= '9' ? ch - '0' : (ch & 0x4f) - 'A' + 10);
  292. if(ch)
  293. UngetChar();
  294. return 255;
  295. }
  296. /*
  297. * CRTFRead::GetHexSkipCRLF()
  298. *
  299. * @mfunc
  300. * Get next char if hex and return hex value
  301. * If not hex, leave char in buffer and return 255
  302. *
  303. * @rdesc
  304. * BYTE hex value of GetChar() if hex; else 255
  305. *
  306. * @devnote
  307. * Keep this in sync with GetHex above.
  308. */
  309. BYTE CRTFRead::GetHexSkipCRLF()
  310. {
  311. TRACEBEGIN(TRCSUBSYSRTFR, TRCSCOPEINTERN, "CRTFRead::GetHexSkipCRLF");
  312. BYTE ch = GetChar();
  313. // Skip \r \n
  314. while(ch == CR || ch == LF)
  315. ch = GetChar();
  316. // Rest is same as CRTFRead::GetHex()
  317. if(IsXDigit(ch))
  318. return (BYTE)(ch <= '9' ? ch - '0' : (ch & 0x4f) - 'A' + 10);
  319. if(ch)
  320. UngetChar();
  321. return 255;
  322. }
  323. /*
  324. * CRTFRead::TokenGetHex()
  325. *
  326. * @mfunc
  327. * Get an 8 bit character saved as a 2 hex digit value
  328. *
  329. * @rdesc
  330. * TOKEN value of hex number read in
  331. */
  332. TOKEN CRTFRead::TokenGetHex()
  333. {
  334. TRACEBEGIN(TRCSUBSYSRTFR, TRCSCOPEINTERN, "CRTFRead::TokenGetHex");
  335. BYTE bChar0 = GetHex();
  336. BYTE bChar1;
  337. if(bChar0 < 16 && (bChar1 = GetHex()) < 16)
  338. _token = (WORD)(bChar0 << 4 | bChar1);
  339. else
  340. _token = tokenError;
  341. return _token;
  342. }
  343. /*
  344. * CRTFRead::SkipToEndOfGroup()
  345. *
  346. * @mfunc
  347. * Skip to end of current group
  348. *
  349. * @rdesc
  350. * EC An error code
  351. */
  352. EC CRTFRead::SkipToEndOfGroup()
  353. {
  354. TRACEBEGIN(TRCSUBSYSRTFR, TRCSCOPEINTERN, "CRTFRead::SkipToEndOfGroup");
  355. INT nDepth = 1;
  356. BYTE ach;
  357. while(TRUE)
  358. {
  359. ach = GetChar();
  360. switch(ach)
  361. {
  362. case BSLASH:
  363. {
  364. BYTE achNext = GetChar();
  365. // EOF: goto done; else ignore NULLs
  366. if(!achNext && _ecParseError == ecUnexpectedEOF)
  367. goto done;
  368. if(achNext == 'b' && UngetChar() &&
  369. TokenGetKeyword() == tokenBinaryData)
  370. {
  371. // We've encountered the \binN tag in the RTF we want
  372. // to skip. _iParam contains N from \binN once the
  373. // tag is parsed by TokenGetKeyword()
  374. SkipBinaryData(_iParam);
  375. }
  376. break;
  377. }
  378. case LBRACE:
  379. nDepth++;
  380. break;
  381. case RBRACE:
  382. if (--nDepth <= 0)
  383. goto done;
  384. break;
  385. case 0:
  386. if(_ecParseError == ecUnexpectedEOF)
  387. goto done;
  388. default:
  389. // Detect Lead bytes here.
  390. int cTrailBytes = GetTrailBytesCount(ach, _nCodePage);
  391. if (cTrailBytes)
  392. {
  393. for (int i = 0; i < cTrailBytes; i++)
  394. {
  395. ach = GetChar();
  396. if(ach == 0 && _ecParseError == ecUnexpectedEOF)
  397. goto done;
  398. }
  399. }
  400. break;
  401. }
  402. }
  403. Assert(!_ecParseError);
  404. _ecParseError = ecUnexpectedEOF;
  405. done:
  406. return _ecParseError;
  407. }
  408. /*
  409. * CRTFRead::TokenFindKeyword(szKeyword)
  410. *
  411. * @mfunc
  412. * Find keyword <p szKeyword> and return its token value
  413. *
  414. * @rdesc
  415. * TOKEN token number of keyword
  416. */
  417. TOKEN CRTFRead::TokenFindKeyword(
  418. BYTE * szKeyword) // @parm Keyword to find
  419. {
  420. TRACEBEGIN(TRCSUBSYSRTFR, TRCSCOPEINTERN, "CRTFRead::TokenFindKeyword");
  421. INT iMin;
  422. INT iMax;
  423. INT iMid;
  424. INT nComp;
  425. BYTE * pchCandidate;
  426. BYTE * pchKeyword;
  427. const KEYWORD * pk;
  428. AssertSz(szKeyword[0],
  429. "CRTFRead::TokenFindKeyword: null keyword");
  430. #ifdef RTF_HASHCACHE
  431. if ( _rtfHashInited )
  432. {
  433. // Hash is 23% faster than the following binary search on finds
  434. // and 55% faster on misses: For 97 words stored in a 257 cache.
  435. // Performance numbers will change when the total stored goes up.
  436. pk = HashKeyword_Fetch ( (CHAR *) szKeyword );
  437. }
  438. else
  439. #endif
  440. {
  441. iMin = 0;
  442. iMax = cKeywords - 1;
  443. pk = NULL;
  444. do // Note (MS3): Hash would be quicker than binary search
  445. {
  446. iMid = (iMin + iMax) / 2;
  447. pchCandidate = (BYTE *)rgKeyword[iMid].szKeyword;
  448. pchKeyword = szKeyword;
  449. while (!(nComp = REToLower(*pchKeyword) - *pchCandidate) // Be sure to match
  450. && *pchKeyword) // terminating 0's
  451. {
  452. pchKeyword++;
  453. pchCandidate++;
  454. }
  455. if (nComp < 0)
  456. iMax = iMid - 1;
  457. else if (nComp)
  458. iMin = iMid + 1;
  459. else
  460. {
  461. pk = &rgKeyword[iMid];
  462. break;
  463. }
  464. } while (iMin <= iMax);
  465. }
  466. if(pk)
  467. {
  468. _token = pk->token;
  469. // here, we log the RTF keyword scan to aid in tracking RTF tag ocverage
  470. // TODO: Implement RTF tag logging for the Mac and WinCE
  471. #if defined(DEBUG) && !defined(MACPORT) && !defined(PEGASUS)
  472. if(_prtflg)
  473. {
  474. #ifdef RTF_HASCACHE
  475. _prtflg->AddAt(szKeyword);
  476. #else
  477. _prtflg->AddAt((size_t)iMid);
  478. #endif
  479. }
  480. #endif
  481. }
  482. else
  483. _token = tokenUnknownKeyword; // No match: TODO: place to take
  484. return _token; // care of unrecognized RTF
  485. }
  486. /*
  487. * CRTFRead::TokenGetKeyword()
  488. *
  489. * @mfunc
  490. * Collect a keyword and its parameter. Return token's keyword
  491. *
  492. * @rdesc
  493. * TOKEN token number of keyword
  494. *
  495. * @comm
  496. * Most RTF control words (keywords) consist of a span of lower-case
  497. * ASCII letters possibly followed by a span of decimal digits. Other
  498. * control words consist of a single character that isn't LC ASCII. No
  499. * control words contain upper-case characters.
  500. */
  501. TOKEN CRTFRead::TokenGetKeyword()
  502. {
  503. TRACEBEGIN(TRCSUBSYSRTFR, TRCSCOPEINTERN, "CRTFRead::TokenGetKeyword");
  504. BYTE ach = GetChar();
  505. BYTE *pach;
  506. SHORT cachKeyword = 1;
  507. BYTE szKeyword[cachKeywordMax];
  508. _szParam[0] = '\0'; // Clear parameter
  509. _iParam = 0;
  510. if(!IsAlphaChar(ach)) // Not alpha, i.e.,
  511. { // single char
  512. if (ach == '\'') // Most common case needs
  513. { // special treatment
  514. // Convert hex to char and store result in _token
  515. if(TokenGetHex() == tokenError)
  516. {
  517. _ecParseError = ecUnexpectedChar;
  518. goto TokenError;
  519. }
  520. if((_token == CR || _token == LF) && FInDocTextDest())
  521. {
  522. // Add raw CR or LF in the byte stream as a \par
  523. return tokenEndParagraph;
  524. }
  525. }
  526. else
  527. {
  528. // Check for other known symbols
  529. const BYTE *pachSym = szSymbolKeywords;
  530. while(ach != *pachSym && *pachSym)
  531. pachSym++;
  532. if(*pachSym) // Found one
  533. {
  534. _token = tokenSymbol[pachSym - szSymbolKeywords];
  535. if(_token > 0x7F) // Token or larger Unicode
  536. return _token; // value
  537. }
  538. else if (!ach) // No more input chars
  539. goto TokenError;
  540. else // Code for unrecognized RTF
  541. _token = ach; // We'll just insert it for now
  542. }
  543. _token = TokenGetText((BYTE)_token);
  544. return _token;
  545. }
  546. szKeyword[0] = ach; // Collect keyword that starts
  547. pach = szKeyword + 1; // with ASCII
  548. while (cachKeyword < cachKeywordMax &&
  549. IsAlphaChar(ach = GetChar()))
  550. {
  551. cachKeyword++;
  552. *pach++ = ach;
  553. }
  554. if (cachKeyword == cachKeywordMax)
  555. {
  556. _ecParseError = ecKeywordTooLong;
  557. goto TokenError;
  558. }
  559. *pach = '\0'; // Terminate keyword
  560. if (IsDigit(ach) || ach == '-') // Collect parameter
  561. {
  562. BYTE *pachEnd = _szParam + sizeof(_szParam);
  563. pach = _szParam;
  564. *pach++ = ach;
  565. if(ach != '-')
  566. _iParam = ach - '0'; // Get parameter value
  567. while (IsDigit(ach = GetChar()))
  568. {
  569. _iParam = _iParam*10 + ach - '0';
  570. *pach++ = ach;
  571. if (pach >= pachEnd)
  572. {
  573. _ecParseError = ecKeywordTooLong;
  574. goto TokenError;
  575. }
  576. }
  577. *pach = '\0'; // Terminate parameter string
  578. if (_szParam[0] == '-')
  579. _iParam = -_iParam;
  580. }
  581. if (!_ecParseError && // We overshot:
  582. (ach == ' ' || UngetChar())) // if not ' ', unget char
  583. return TokenFindKeyword(szKeyword); // Find and return keyword
  584. TokenError:
  585. TRACEERRSZSC("TokenGetKeyword()", _ecParseError);
  586. return _token = tokenError;
  587. }
  588. /*
  589. * CRTFRead::TokenGetText(ach)
  590. *
  591. * @mfunc
  592. * Collect a string of text starting with the char <p ach> and treat as a
  593. * single token. The string ends when a LBRACE, RBRACE, or single '\\' is found.
  594. *
  595. * @devnote
  596. * We peek past the '\\' for \\'xx, which we decode and keep on going;
  597. * else we return in a state where the next character is the '\\'.
  598. *
  599. * @rdesc
  600. * TOKEN Token number of next token (tokenText or tokenError)
  601. */
  602. TOKEN CRTFRead::TokenGetText(
  603. BYTE ach) // @parm First char of 8-bit text string
  604. {
  605. TRACEBEGIN(TRCSUBSYSRTFR, TRCSCOPEINTERN, "CRTFRead::TokenGetText");
  606. BYTE * pach = _szText;
  607. SHORT cachText = 0;
  608. LONG CodePage = _pstateStackTop->nCodePage;
  609. BOOL fAllASCII = TRUE;
  610. int cTrailBytesNeeded = 0;
  611. _token = tokenError; // Default error
  612. // FUTURE(BradO): This 'goto' into a while loop is pretty weak.
  613. // Restructure this 'while' loop such that the 'goto' is removed.
  614. // Add character passed into routine
  615. goto add;
  616. // If cTrailBytesNeeded is non-zero, we need to get all the trail bytes. Otherwise,
  617. // a string end in the middle of a DBC or UTF-8 will cause bad display/print problem
  618. // - 5 to allow extra space for up to 4 bytes for UTF-8 and Null char
  619. while (cachText < cachTextMax - 5 || cTrailBytesNeeded)
  620. {
  621. ach = GetChar();
  622. switch (ach)
  623. {
  624. case BSLASH:
  625. {
  626. // FUTURE(BradO): This code looks ALOT like TokenGetKeyword.
  627. // We should combine the two into a common routine.
  628. BYTE achNext;
  629. // Get char after BSLASH
  630. achNext = GetChar();
  631. if(!achNext)
  632. goto error;
  633. if(achNext == '\'') // Handle most frequent
  634. { // case here
  635. if(TokenGetHex() == tokenError)
  636. {
  637. if(cTrailBytesNeeded)
  638. {
  639. // The trail-byte must be a raw BSLASH.
  640. // Unget the single-quote.
  641. if(!UngetChar())
  642. goto error;
  643. // fall through to add BSLASH
  644. }
  645. else
  646. {
  647. _ecParseError = ecUnexpectedChar;
  648. goto error;
  649. }
  650. }
  651. else
  652. {
  653. ach = (BYTE)_token;
  654. if (cTrailBytesNeeded == 0 && (ach == CR || ach == LF) &&
  655. FInDocTextDest())
  656. {
  657. // Here, we have a raw CR or LF in document text.
  658. // Unget the whole lot of characters and bail out.
  659. // TokenGetKeyword will convert this CR or LF into
  660. // a \par.
  661. if(!UngetChar(4))
  662. goto error;
  663. goto done;
  664. }
  665. }
  666. goto add;
  667. }
  668. // Check next byte against list of RTF symbol
  669. // NOTE:- we need to check for RTF symbol even if we
  670. // are expecting a trail byte. According to the rtf spec,
  671. // we cannot just take this backslash as trail byte.
  672. // HWC 9/97
  673. const BYTE *pachSymbol = szSymbolKeywords;
  674. while(achNext != *pachSymbol && *pachSymbol)
  675. pachSymbol++;
  676. TOKEN tokenTmp;
  677. if (*pachSymbol &&
  678. (tokenTmp = tokenSymbol[pachSymbol - szSymbolKeywords])
  679. <= 0x7F)
  680. {
  681. ach = (BYTE)tokenTmp;
  682. goto add;
  683. }
  684. // In either of the last two cases below, we will want
  685. // to unget the byte following the BSLASH
  686. if(!UngetChar())
  687. goto error;
  688. if(cTrailBytesNeeded && !IsAlphaChar(achNext))
  689. {
  690. // In this situation, either this BSLASH begins the next
  691. // RTF keyword or it is a raw BSLASH which is the trail
  692. // byte for a DBCS character.
  693. // I think a fair assumption here is that if an alphanum
  694. // follows the BSLASH, that the BSLASH begins the next
  695. // RTF keyword.
  696. // add the raw BSLASH
  697. goto add;
  698. }
  699. // Here, my guess is that the BSLASH begins the next RTF
  700. // keyword, so unget the BSLASH
  701. if(!UngetChar())
  702. goto error;
  703. goto done;
  704. }
  705. case LBRACE: // End of text string
  706. case RBRACE:
  707. if(cTrailBytesNeeded)
  708. {
  709. // Previous char was a lead-byte of a DBCS pair or UTF-8, which
  710. // makes this char a raw trail-byte.
  711. goto add;
  712. }
  713. if(!UngetChar()) // Unget delimeter
  714. goto error;
  715. goto done;
  716. case LF: // Throw away noise chars
  717. case CR:
  718. break;
  719. case 0:
  720. if(_ecParseError == ecUnexpectedEOF)
  721. goto done;
  722. ach = ' '; // Replace NULL by blank
  723. default: // Collect chars
  724. add:
  725. // Outstanding chars to be skipped after \uN tag
  726. if(_cbSkipForUnicode)
  727. {
  728. _cbSkipForUnicode--;
  729. continue;
  730. }
  731. *pach++ = ach;
  732. ++cachText;
  733. if(ach > 0x7F)
  734. fAllASCII = FALSE;
  735. // Check if we are expecting more trail bytes
  736. if (cTrailBytesNeeded)
  737. cTrailBytesNeeded--;
  738. else
  739. cTrailBytesNeeded = GetTrailBytesCount(ach, CodePage);
  740. Assert(cTrailBytesNeeded >= 0);
  741. }
  742. }
  743. done:
  744. _token = (WORD)(fAllASCII ? tokenASCIIText : tokenText);
  745. *pach = '\0'; // Terminate token string
  746. error:
  747. return _token;
  748. }
  749. /*
  750. * CRTFRead::TokenGetToken()
  751. *
  752. * @mfunc
  753. * This function reads in next token from input stream
  754. *
  755. * @rdesc
  756. * TOKEN token number of next token
  757. */
  758. TOKEN CRTFRead::TokenGetToken()
  759. {
  760. TRACEBEGIN(TRCSUBSYSRTFR, TRCSCOPEINTERN, "CRTFRead::TokenGetToken");
  761. BYTE ach;
  762. _tokenLast = _token; // Used by \* destinations and FE
  763. _token = tokenEOF; // Default end-of-file
  764. SkipNoise:
  765. ach = GetChar();
  766. switch (ach)
  767. {
  768. case CR:
  769. case LF:
  770. goto SkipNoise;
  771. case LBRACE:
  772. _token = tokenStartGroup;
  773. break;
  774. case RBRACE:
  775. _token = tokenEndGroup;
  776. break;
  777. case BSLASH:
  778. _token = TokenGetKeyword();
  779. break;
  780. case 0:
  781. if(_ecParseError == ecUnexpectedEOF)
  782. break;
  783. ach = ' '; // Replace NULL by blank
  784. // Fall thru to default
  785. default:
  786. if( !_pstateStackTop )
  787. {
  788. TRACEWARNSZ("Unexpected token in rtf file");
  789. Assert(_token == tokenEOF);
  790. if (_ped->Get10Mode())
  791. _ecParseError = ecUnexpectedToken; // Signal bad file
  792. }
  793. else if (_pstateStackTop->sDest == destObjectData ||
  794. _pstateStackTop->sDest == destPicture )
  795. // not text but data
  796. {
  797. _token = (WORD)(tokenObjectDataValue + _pstateStackTop->sDest
  798. - destObjectData);
  799. UngetChar();
  800. }
  801. else
  802. _token = TokenGetText(ach);
  803. }
  804. return _token;
  805. }
  806. /*
  807. * CRTFRead::FInDocTextDest()
  808. *
  809. * @mfunc
  810. * Returns a BOOL indicating if the current destination is one in which
  811. * we would encounter document text.
  812. *
  813. * @rdesc
  814. * BOOL indicates the current destination may contain document text.
  815. */
  816. BOOL CRTFRead::FInDocTextDest() const
  817. {
  818. switch(_pstateStackTop->sDest)
  819. {
  820. case destRTF:
  821. case destField:
  822. case destFieldResult:
  823. case destFieldInstruction:
  824. case destParaNumbering:
  825. case destParaNumText:
  826. case destNULL:
  827. return TRUE;
  828. case destFontTable:
  829. case destRealFontName:
  830. case destObjectClass:
  831. case destObjectName:
  832. case destFollowingPunct:
  833. case destLeadingPunct:
  834. case destColorTable:
  835. case destBinary:
  836. case destObject:
  837. case destObjectData:
  838. case destPicture:
  839. case destDocumentArea:
  840. return FALSE;
  841. default:
  842. AssertSz(0, "CRTFRead::FInDocTextDest(): New destination "
  843. "encountered - update enum in _rtfread.h");
  844. return TRUE;
  845. }
  846. }