Leaked source code of windows server 2003
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1393 lines
39 KiB

  1. // Copyright (c)1997-1999 Microsoft Corporation, All Rights Reserved
  2. /* copied from ..\htmed\lexer.cpp */
  3. /*++
  4. Copyright (c) 1995 Microsoft Corporation
  5. File: lexer.cpp
  6. Abstract:
  7. Nitty Gritty Lexer stuff
  8. Contents:
  9. SetValueSeen()
  10. IsSingleOp()
  11. IsWhiteSpace()
  12. MapToken()
  13. FindEndTag()
  14. MakeSublang()
  15. SetLanguage()
  16. FindTable()
  17. FindTable()
  18. RemoveTable()
  19. MakeTableSet()
  20. GetToken()
  21. IfHackComment()
  22. FindServerScript()
  23. FindEndComment()
  24. FindEndEntity()
  25. FindEntityRef()
  26. FindValue()
  27. FindEndString()
  28. FindTagOpen()
  29. FindText()
  30. FindNextToken()
  31. GetTextHint()
  32. GetHint()
  33. GetTokenLength()
  34. GetValueTokenLength()
  35. IsElementName()
  36. IsAttributeName()
  37. IsIdentifier()
  38. IsUnknownID()
  39. IsNumber()
  40. CColorHtml::SetTable()
  41. CColorHtml::InitSublanguages()
  42. History:
  43. 2/14/97 cgomes: Created
  44. --*/
  45. #include "stdafx.h"
  46. #include "resource.h"
  47. #include "guids.h"
  48. #include "token.h"
  49. #include "table.h"
  50. #include "lexer.h"
  // Forward declaration: FindClientScriptEnd() is defined later in this file
  // but is called from GetToken() before its definition.
  51. UINT FindClientScriptEnd(LPCTSTR pchLine, UINT cbLen, UINT cbCur, DWORD * plxs, TXTB & token);
  // Replace whatever ASSERT the headers above defined with _ASSERTE.
  52. #undef ASSERT
  53. #define ASSERT(b) _ASSERTE(b)
  54. // HACK: we keep a copy of a ptr to the ASP table and sublang
  55. // so we can do special behavior for ASP files
  56. CTableSet* g_ptabASP = 0;
  57. PSUBLANG g_psublangASP = 0;
  // Table sets indexed by sublanguage slot; SetLanguage() stores the default
  // table set in slot 0 and the others in the following slots.
  58. PTABLESET g_arpTables[CV_MAX+1];
  59. // NOTE: added to handle value tokens properly.
  60. UINT GetValueTokenLength(LPCTSTR pchLine, UINT cbLen, UINT cbCur);
  61. // mark state transition from value -> next attribute
  62. inline int SetValueSeen(DWORD *plxs)
  63. {
  64. if (*plxs & inValue)
  65. {
  66. *plxs &= ~inValue;
  67. *plxs |= inAttribute;
  68. return TRUE;
  69. }
  70. else
  71. return FALSE;
  72. }
  73. // REVIEW (walts) - need better way
  74. inline void SetScriptLanguage(LPCTSTR pchLine, DWORD *plxs)
  75. {
  76. LPCTSTR strJavaScript = _T("javascript");
  77. LPCTSTR strVBScript = _T("vbscript");
  78. // triedit's special language. Its set when we convert server-side scripts into
  79. // client-side scripts. Its a dummy language. if we find that as language, we
  80. // set in ServerASP. It is reset(removed) in FindNextToken().
  81. LPCTSTR strServerAsp = _T("serverasp");
  82. // language attribute may have quotes around it.
  83. // if it does then advance past the first quote.
  84. // ex. <SCRIPT LANGUAGE="VBScript">
  85. if(*pchLine == L'\"')
  86. pchLine++;
  87. if (_tcsnicmp(pchLine, strJavaScript, lstrlen(strJavaScript)) == 0)
  88. {
  89. *plxs &= ~inVBScript;
  90. *plxs &= ~inServerASP;
  91. *plxs |= inJavaScript;
  92. }
  93. else if (_tcsnicmp(pchLine, strVBScript, lstrlen(strVBScript)) == 0)
  94. {
  95. *plxs &= ~inJavaScript;
  96. *plxs &= ~inServerASP;
  97. *plxs |= inVBScript;
  98. }
  99. else if (_tcsnicmp(pchLine, strServerAsp, lstrlen(strServerAsp)) == 0)
  100. {
  101. *plxs &= ~inJavaScript;
  102. *plxs &= ~inVBScript;
  103. *plxs |= inServerASP;
  104. }
  105. }
  106. inline BOOL IsSingleOp(HINT hint)
  107. {
  108. return ((hint >= tokOP_SINGLE) && (hint < tokOP_MAX));
  109. };
  110. inline BOOL IsWhiteSpace(TCHAR c)
  111. {
  112. return _istspace(c);
  113. };
  114. // NOTE: Added to handle value tokens properly
  115. inline IsValueChar(TCHAR ch)
  116. {
  117. // REVIEW(cgomes): specify all the invalid value characters
  118. return ch != _T('<') && ch != _T('>');
  119. };
  120. ////////////////////////////////////////////////////////////////////////////
  121. //
  122. // map parsed token to returned token
  123. // left column must be in ascending order
  124. static TOKEN _rgTokenMap[] =
  125. {
  126. tokName, tokSpace,
  127. tokNum, tokSpace,
  128. tokParEnt, tokSpace,
  129. tokResName, tokSpace,
  130. 0, 0
  131. };
  132. static TOKEN MapToken(TOKEN tokClass, DWORD lxs)
  133. {
  134. if (IsSingleOp((HINT)tokClass))
  135. return tokOp;
  136. else if ((tokClass == tokTag) && (lxs & inHTXTag))
  137. return tokSSS;
  138. for (int i = 0; (_rgTokenMap[i] != 0) && (_rgTokenMap[i] >= tokClass); i += 2)
  139. {
  140. if (_rgTokenMap[i] == tokClass)
  141. return _rgTokenMap[i + 1];
  142. }
  143. return tokClass;
  144. }
  145. ////////////////////////////////////////////////////////////////////////////
  146. UINT FindEndTag(LPCTSTR pchLine, UINT cbLen, UINT cbCur, DWORD *plxs, TXTB & token)
  147. {
  148. ASSERT(pchLine);
  149. TCHAR szEnd[16];
  150. ELLEX * pellex = pellexFromTextState(*plxs);
  151. ASSERT(0 != pellex); // shouldn't be called with something other than special text state
  152. UINT cbCmp = 3 + pellex->cb; // length of end tag
  153. ASSERT(cbCmp < sizeof szEnd);
  154. _tcscpy(szEnd, _T("</"));
  155. _tcscat(szEnd, pellex->sz);
  156. _tcscat(szEnd, _T(">"));
  157. while (cbCur < cbLen)
  158. {
  159. if (_T('<') == pchLine[cbCur])
  160. {
  161. if ((cbLen - cbCur >= cbCmp) && (0 == _tcsnicmp(szEnd, &pchLine[cbCur], cbCmp)))
  162. {
  163. *plxs &= ~TEXTMASK; // special text modes are exclusive
  164. token.ibTokMac = cbCur;
  165. return cbCur;
  166. }
  167. else if ((cbCur + 1 < cbLen) && (_T('%') == pchLine[cbCur+1]))
  168. {
  169. *plxs |= inHTXTag;
  170. token.ibTokMac = cbCur;
  171. break;
  172. }
  173. else
  174. cbCur++;
  175. }
  176. else
  177. cbCur += _tclen(&pchLine[cbCur]);
  178. }
  179. token.ibTokMac = cbCur;
  180. return cbCur;
  181. }
  182. ////////////////////////////////////////////////////////////////////////////
  183. BOOL MakeSublang(PSUBLANG ps, UINT id, const TCHAR *strName, UINT nIdTemplate, CLSID clsid)
  184. {
  185. int len;
  186. ASSERT( NULL != ps );
  187. ps->szSubLang = NULL;
  188. ps->lxsInitial = LxsFromSubLangIndex(id);
  189. ps->nIdTemplate = nIdTemplate;
  190. ps->clsidTemplate = clsid;
  191. if ((len = lstrlen(strName)) != 0)
  192. {
  193. LPTSTR szNew = new TCHAR [len+1];
  194. if (NULL != szNew)
  195. {
  196. _tcscpy(szNew,strName);
  197. ps->szSubLang = szNew;
  198. return TRUE;
  199. }
  200. }
  201. return FALSE;
  202. }
  203. // Set sublang and tableset array members,
  204. // putting the default one in 0th position.
  205. //
  206. void SetLanguage(TCHAR * strDefault, PSUBLANG rgSublang,
  207. PTABLESET pTab, UINT & index, UINT nIdTemplate, CLSID clsid)
  208. {
  209. if (pTab != NULL)
  210. {
  211. int i;
  212. if (lstrcmp(strDefault, pTab->Name()) == 0)
  213. i = 0;
  214. else
  215. i = index;
  216. if (MakeSublang(rgSublang+i, i, pTab->Name(), nIdTemplate, clsid))
  217. {
  218. g_arpTables[i] = pTab;
  219. if (i)
  220. index++;
  221. else
  222. g_pTable = pTab;
  223. }
  224. else
  225. delete pTab;
  226. }
  227. }
  228. CTableSet * FindTable(CTableSet ** rgpts, TCHAR *strName)
  229. {
  230. for (int n = 0; rgpts[n]; n++)
  231. {
  232. if (rgpts[n]->Name() == strName)
  233. //if (strcmp(rgpts[n]->Name(), strName) == 0)
  234. return rgpts[n];
  235. }
  236. return NULL;
  237. }
  238. CTableSet * FindTable(CTableSet ** rgpts, CTableSet * pts)
  239. {
  240. for (int n = 0; rgpts[n]; n++)
  241. {
  242. if (rgpts[n] == pts)
  243. return rgpts[n];
  244. }
  245. return NULL;
  246. }
  247. void RemoveTable(CTableSet ** rgpts, CTableSet *pts)
  248. {
  249. int n;
  250. for (n = 0; rgpts[n]; n++)
  251. {
  252. if (rgpts[n] == pts)
  253. {
  254. for(; rgpts[n]; n++)
  255. rgpts[n] = rgpts[n+1];
  256. return;
  257. }
  258. }
  259. }
  260. CTableSet * MakeTableSet(CTableSet ** /*rgpts*/, RWATT_T att, UINT nIdName)
  261. {
  262. return new CStaticTableSet(att, nIdName);
  263. }
  264. ////////////////////////////////////////////////////////////////////////
  265. // GetToken()
  266. //
  267. UINT GetToken(LPCTSTR pchLine, UINT cbLen, UINT cbCur, DWORD * plxs, TXTB & token)
  268. {
  269. ASSERT (cbCur < cbLen);
  270. if(cbCur > cbLen)
  271. return cbCur;
  272. UINT cbCount = 0;
  273. // init token
  274. token.tok = 0;
  275. // initialize location where token starts
  276. token.ibTokMin = cbCur;
  277. if (*plxs & inHTXTag)
  278. cbCount = FindServerScript(pchLine, cbLen, cbCur, plxs, token);
  279. else if (*plxs & inSCRIPT && !(*plxs & inTag) && !(*plxs & inServerASP))
  280. {
  281. // NOTE that we want to skip tokenizing scripts that are special to triedit
  282. // when we wrap server-side scripts in client-side scripts, we set a dummy
  283. // language as 'serverasp'. inServerASP is set in that case.
  284. cbCount = FindClientScriptEnd(pchLine, cbLen, cbCur, plxs, token);
  285. }
  286. else if (*plxs & inComment) // in a comment
  287. {
  288. if (*plxs & inSCRIPT)
  289. *plxs |= inScriptText;
  290. COMMENTTYPE ct = IfHackComment(pchLine, cbLen, cbCur, plxs, token);
  291. if (ct == CT_METADATA)
  292. {
  293. // Treat as an element
  294. cbCount = FindNextToken(pchLine, cbLen, cbCur, plxs, token);
  295. // Remove inBangTag
  296. *plxs &= ~inBangTag;
  297. }
  298. else if (ct == CT_IECOMMENT)
  299. cbCount = token.ibTokMac;
  300. else
  301. cbCount = FindEndComment(pchLine, cbLen, cbCur, plxs, token);
  302. }
  303. else if (*plxs & INSTRING) // in a string
  304. cbCount = FindEndString(pchLine, cbLen, cbCur, plxs, token);
  305. else
  306. cbCount = FindNextToken(pchLine, cbLen, cbCur, plxs, token);
  307. token.tokClass = MapToken(token.tokClass, *plxs);
  308. return cbCount;
  309. }
  310. ///////////////////////////////////////////////////////////////////////////////////
  311. // IfHackComment
  312. //
  313. // Probe ahead in the current line to see if we have what IE recognizes
  314. // as the end of a comment ("->"). This does not conform to RFC 1866 or SGML,
  315. // but suppports browser behavior. This lets us tolerate comments of the
  316. // form: "<!--- whatever ->"
  317. // (note how it ends)
  318. //
  319. // Returns a COMMENTTYPE enum.
  320. // 0 if norma comment
  321. // 1 if IE comment
  322. // -1 if METADATA comment
  323. //
  324. // Proper comments are scanned using FindEndComment().
  325. //
  326. COMMENTTYPE IfHackComment(LPCTSTR pchLine, UINT cbLen, UINT cbCur, DWORD * plxs, TXTB & token)
  327. {
  328. token.tokClass = tokComment;
  329. while (cbCur+1 < cbLen)
  330. {
  331. if(_tcsnicmp(&pchLine[cbCur], _T("METADATA"), lstrlen(_T("METADATA"))) == 0)
  332. {
  333. token.ibTokMac = cbCur + 1; // include second dash??
  334. *plxs &= ~inComment;
  335. // Remove inBangTag
  336. *plxs &= ~inBangTag;
  337. *plxs |= inTag;
  338. return CT_METADATA; // METADATA
  339. }
  340. else if (pchLine[cbCur] == '-' && pchLine[cbCur + 1] == '>')
  341. {
  342. token.ibTokMac = cbCur + 1;
  343. *plxs &= ~inComment;
  344. *plxs &= ~inScriptText;
  345. return CT_IECOMMENT;
  346. }
  347. else
  348. {
  349. cbCur += _tclen(&pchLine[cbCur]);
  350. }
  351. }
  352. return CT_NORMAL;
  353. }
  354. UINT FindServerScript(LPCTSTR pchLine, UINT cbLen, UINT cbCur, DWORD * plxs, TXTB & token)
  355. {
  356. LPCTSTR pCurrent = &pchLine[cbCur];
  357. int cb;
  358. // parse HTX start tag
  359. if (*pCurrent == _T('<') && (cbCur+1 < cbLen) && *(pCurrent+1) == '%')
  360. {
  361. token.tokClass = tokTag;
  362. token.tok = TokTag_SSSOPEN;
  363. token.ibTokMac = cbCur + 2;
  364. *plxs |= inHTXTag;
  365. return token.ibTokMac;
  366. }
  367. ASSERT(*plxs & inHTXTag); // should be in HTXTag state here
  368. if (*pCurrent == _T('%') && (cbCur+1 < cbLen) && *(pCurrent+1) == '>')
  369. {
  370. token.tok = TokTag_SSSCLOSE;
  371. token.tokClass = tokSSS; //tokTag;
  372. token.ibTokMac = cbCur + 2;
  373. *plxs &= ~inHTXTag;
  374. if (*plxs & inNestedQuoteinSSS)
  375. *plxs &= ~inNestedQuoteinSSS;
  376. return token.ibTokMac;
  377. }
  378. token.tokClass = tokSSS;
  379. while (cbCur < cbLen)
  380. {
  381. if (*pCurrent == _T('%') && (cbCur+1 < cbLen) && (*(pCurrent+1) == _T('>')))
  382. break;
  383. if ( *pCurrent == _T('"')
  384. && *plxs&inTag
  385. && *plxs&inHTXTag
  386. && *plxs&inAttribute
  387. && *plxs&inString
  388. )
  389. *plxs |= inNestedQuoteinSSS;
  390. cb = _tclen(pCurrent);
  391. cbCur += cb;
  392. pCurrent += cb;
  393. }
  394. token.ibTokMac = cbCur;
  395. return cbCur;
  396. }
  397. ///////////////////////////////////////////////////////////////////////////////////
  398. // FindClientScriptEnd()
  399. //
  400. // HTMED CHANGE: Find the end of client script block
  401. //
  402. UINT FindClientScriptEnd(LPCTSTR pchLine, UINT cbLen, UINT cbCur, DWORD * plxs, TXTB & token)
  403. {
  404. LPCTSTR pCurrent = &pchLine[cbCur];
  405. int cb;
  406. TCHAR rgEndScript[] = _T("</SCRIPT");
  407. int cchEndScript = (wcslen(rgEndScript) - 1);
  408. if( cbCur + cchEndScript < cbLen &&
  409. 0 == _tcsnicmp(pCurrent, rgEndScript, cchEndScript))
  410. {
  411. token.tokClass = tokTag;
  412. token.tok = TokTag_END;
  413. *plxs &= ~inSCRIPT;
  414. *plxs |= inEndTag;
  415. token.ibTokMac = cbCur + 2;
  416. return token.ibTokMac;
  417. }
  418. token.tokClass = tokSpace;
  419. while (cbCur < cbLen)
  420. {
  421. if (*pCurrent == _T('<') && (cbCur+1 < cbLen) && (*(pCurrent+1) == _T('/')))
  422. {
  423. // Check if found end </SCRIPT
  424. if( cbCur + cchEndScript < cbLen &&
  425. 0 == _tcsnicmp(pCurrent, rgEndScript, cchEndScript))
  426. {
  427. // Check if found end </SCRIPT
  428. break;
  429. }
  430. }
  431. cb = _tclen(pCurrent);
  432. cbCur += cb;
  433. pCurrent += cb;
  434. }
  435. token.ibTokMac = cbCur;
  436. return cbCur;
  437. }
  438. ///////////////////////////////////////////////////////////////////////////////////
  439. // FindEndComment()
  440. //
  441. // Find the end of comment ("--").
  442. //
  443. UINT FindEndComment(LPCTSTR pchLine, UINT cbLen, UINT cbCur, DWORD * plxs, TXTB & token)
  444. {
  445. LPCTSTR pCurrent = &pchLine[cbCur];
  446. BOOL bEndComment = FALSE;
  447. int cb;
  448. ASSERT(*plxs & inComment); // must be in a comment now
  449. token.tokClass = tokComment;
  450. while (!bEndComment && cbCur < cbLen)
  451. {
  452. if (*pCurrent == _T('-')) // check the character to see if it's the first "-" in "--"
  453. {
  454. pCurrent++;
  455. cbCur++;
  456. if ((cbCur < cbLen) &&
  457. (*pCurrent == _T('-'))) // we're possibly at the end, so search for the final "--" pair
  458. {
  459. bEndComment = TRUE;
  460. }
  461. }
  462. else
  463. {
  464. cb = _tclen(pCurrent);
  465. cbCur += cb;
  466. pCurrent += cb;
  467. }
  468. }
  469. if (cbCur < cbLen)
  470. {
  471. cb = _tclen(pCurrent);
  472. cbCur += cb;
  473. pCurrent += cb;
  474. }
  475. token.ibTokMac = cbCur;
  476. // reset state if we reach end of comment
  477. if (bEndComment)
  478. *plxs &= ~inComment;
  479. return cbCur;
  480. }
  481. /////////////////////////////////////////////////////////////
  482. // FindEndEntity()
  483. //
  484. // Find the end of the special character sequence (ends with ; or whitespace).
  485. //
  486. UINT FindEndEntity(LPCTSTR pchLine, UINT cbLen, UINT cbCur, DWORD * /*plxs*/, TXTB & token)
  487. {
  488. token.tokClass = tokEntity;
  489. int cb = GetTokenLength(pchLine, cbLen, cbCur);
  490. if (pchLine[cbCur + cb] == ';')
  491. cb++;
  492. token.ibTokMac = cbCur + cb;
  493. return token.ibTokMac;
  494. }
  495. /////////////////////////////////////////////////////////////
  496. // Find an entity reference or non-entity ref, literal "&..."
  497. //
  498. UINT FindEntityRef(LPCTSTR pchLine, UINT cbLen, UINT cbCur, DWORD * /*plxs*/, TXTB & token)
  499. {
  500. ASSERT(cbCur < cbLen);
  501. ASSERT(pchLine[cbCur] == '&'); // must be on ERO
  502. cbCur++;
  503. if (cbCur == cbLen)
  504. {
  505. NotEntity:
  506. token.tokClass = tokIDENTIFIER; // plain text
  507. token.ibTokMac = cbCur;
  508. return cbCur;
  509. }
  510. if (pchLine[cbCur] == '#')
  511. {
  512. // parse and check valid number
  513. if (!IsNumber(pchLine, cbLen, cbCur + 1, token))
  514. goto NotEntity;
  515. // must be <= 3 digits
  516. if (token.ibTokMac - (cbCur + 1) > 3)
  517. goto NotEntity;
  518. // validate range
  519. TCHAR szNum[4];
  520. _tcsncpy(szNum, &pchLine[cbCur + 1], 3);
  521. if (_tcstoul(szNum, 0, 10) > 255)
  522. goto NotEntity;
  523. // we now have a valid numeric entity ref
  524. token.tokClass = tokEntity;
  525. cbCur = token.ibTokMac;
  526. // scan for end of entity ref
  527. // scan rest of alphanumeric token
  528. // REVIEW: Is this correct? IE 4.40.308 behaves this way
  529. while ((cbCur < cbLen) && IsCharAlphaNumeric(pchLine[cbCur]))
  530. cbCur++;
  531. // scan delimiter
  532. if (cbCur < cbLen)
  533. cbCur++;
  534. token.ibTokMac = cbCur;
  535. return cbCur;
  536. }
  537. else if (!IsCharAlpha(pchLine[cbCur]))
  538. {
  539. goto NotEntity;
  540. }
  541. else
  542. {
  543. // parse and check entity name
  544. UINT nLen = GetTokenLength(pchLine, cbLen, cbCur);
  545. if (!g_pTable->FindEntity(&pchLine[cbCur], nLen))
  546. goto NotEntity;
  547. cbCur += nLen;
  548. // eat delimiter if necessary
  549. if ((cbCur < cbLen) &&
  550. (pchLine[cbCur] == ';' || IsWhiteSpace(pchLine[cbCur])))
  551. cbCur++;
  552. token.tokClass = tokEntity;
  553. token.ibTokMac = cbCur;
  554. return cbCur;
  555. }
  556. }
  557. /////////////////////////////////////////////////////////////
  558. // FindEndValue
  559. // Find the end of an unquoted value.
  560. //
  561. // Scan for whitespace or end if tag
  562. //
  563. UINT FindValue(LPCTSTR pchLine, UINT cbLen, UINT cbCur, DWORD * plxs, TXTB & token)
  564. {
  565. ASSERT(cbCur < cbLen);
  566. do
  567. {
  568. cbCur++;
  569. } while ( cbCur < cbLen &&
  570. !IsWhiteSpace(pchLine[cbCur]) &&
  571. pchLine[cbCur] != '>' );
  572. token.tokClass = tokValue;
  573. token.ibTokMac = cbCur;
  574. // switch from value to attribute
  575. *plxs &= ~inValue;
  576. *plxs |= inAttribute;
  577. return cbCur;
  578. }
  579. /////////////////////////////////////////////////////////////
  580. // FindEndString()
  581. // Find the end of the string.
  582. // Should only be called when we are in the string mode already.
  583. //
  584. UINT FindEndString (LPCTSTR pchLine, UINT cbLen, UINT cbCur, DWORD * plxs, TXTB & token)
  585. {
  586. LPCTSTR pCurrent = &pchLine[cbCur];
  587. int cb;
  588. BOOL bInString = TRUE;
  589. TCHAR chDelim;
  590. ASSERT (*plxs & INSTRING); // must be in a string now
  591. token.tokClass = tokString;
  592. chDelim = (*plxs & inStringA) ? _T('\'') : _T('"');
  593. while (bInString && cbCur < cbLen)
  594. {
  595. if (*pCurrent == chDelim)
  596. {
  597. *plxs &= ~INSTRING;
  598. bInString = FALSE;
  599. SetValueSeen(plxs);
  600. }
  601. else if (*pCurrent == _T('<') &&
  602. cbCur+1 < cbLen &&
  603. *(pCurrent+1) == _T('%'))
  604. {
  605. *plxs |= inHTXTag;
  606. break;
  607. }
  608. cb = _tclen(pCurrent);
  609. cbCur += cb;
  610. pCurrent += cb;
  611. }
  612. token.ibTokMac = cbCur;
  613. return cbCur;
  614. }
  615. //////////////////////////////////////////////////////////////////
  616. //
  617. UINT FindTagOpen(LPCTSTR pchLine, UINT cbLen, UINT cbCur, DWORD * plxs, TXTB & token)
  618. {
  619. ASSERT(pchLine[cbCur] == '<');
  620. token.tokClass = tokTag;
  621. *plxs &= ~inScriptText; // turn off script coloring when inside tags
  622. cbCur++;
  623. if (cbCur == cbLen)
  624. {
  625. *plxs |= inTag;
  626. }
  627. else
  628. {
  629. #ifdef NEEDED // copied from htmed\lexer.cpp
  630. //
  631. // HTMED CHANGE:
  632. // REVIEW(cgomes): Figure out if I should turn off inSCRIPT in any of the
  633. // following cases. Right now I only do it for the </ case.
  634. //
  635. #endif //NEEDED
  636. switch (pchLine[cbCur])
  637. {
  638. case '!': // MDO - Markup Declaration Open
  639. cbCur++;
  640. *plxs |= inBangTag;
  641. token.tok = TokTag_BANG;
  642. break;
  643. case '/': // End tag
  644. cbCur++;
  645. *plxs |= inEndTag;
  646. token.tok = TokTag_END;
  647. #ifdef NEEDED // copied from htmed\lexer.cpp
  648. // HTMED CHANGE:
  649. // REVIEW(cgomes): Colorizer bug: it never removes the inSCRIPT state
  650. // This removes the inSCRIPT in the case <SCRIPT <BODY>
  651. // in this case <BODY is in error.
  652. //
  653. *plxs &= ~inSCRIPT;
  654. #endif //NEEDED
  655. break;
  656. // REVIEW: PI is SGML -- not in HTML, but might be added
  657. case '?': // PI - Processing Instruction
  658. cbCur++;
  659. *plxs |= inPITag;
  660. token.tok = TokTag_PI;
  661. break;
  662. case '%': // HTX -- ODBC server HTML extension
  663. cbCur++;
  664. *plxs |= inHTXTag;
  665. token.tok = TokTag_SSSOPEN;
  666. break;
  667. default: // Tag
  668. if (IsCharAlpha(pchLine[cbCur]))
  669. {
  670. *plxs |= inTag;
  671. token.tok = TokTag_START;
  672. }
  673. else
  674. token.tokClass = tokIDENTIFIER; // NOT a TAG
  675. break;
  676. }
  677. }
  678. token.ibTokMac = cbCur;
  679. return cbCur;
  680. }
  681. //////////////////////////////////////////////////////////////////
  682. // FindText
  683. // Scan a token of text
  684. // NOTE DO NOT MODIFY this function, mainly b/c the side effects
  685. // will be hard to find, and will break the way
  686. // that everything works.
  687. //
  688. UINT FindText(LPCTSTR pchLine, UINT cbLen, UINT cbCur, TXTB & token)
  689. {
  690. //BOOL fExtraSpace = FALSE;
  691. //int cSpace = 0;
  692. ASSERT (cbCur < cbLen);
  693. token.tokClass = tokIDENTIFIER;
  694. //if (pchLine[cbCur] == ' ' && !fExtraSpace)
  695. // fExtraSpace = TRUE;
  696. cbCur += _tclen(&pchLine[cbCur]);
  697. while (cbCur < cbLen)
  698. {
  699. switch (pchLine[cbCur])
  700. {
  701. case _T('\0'):
  702. case _T('\n'):
  703. case _T('<'):
  704. case _T('&'):
  705. //if (cSpace > 0) // found extra spaces so remember them somewhere
  706. goto ret;
  707. break;
  708. //case _T(' '):
  709. // if (!fExtraSpace)
  710. // fExtraSpace = TRUE;
  711. // else
  712. // cSpace++;
  713. // break;
  714. default:
  715. //if (cSpace > 0) // found extra spaces so remember them somewhere
  716. //cSpace = 0;
  717. //fExtraSpace = FALSE;
  718. break;
  719. }
  720. cbCur += _tclen(&pchLine[cbCur]);
  721. }
  722. ret:
  723. token.ibTokMac = cbCur;
  724. return cbCur;
  725. }
  726. //////////////////////////////////////////////////////////////////
  727. // FindNextToken()
  728. // Find the next token in the line
  //
  // Scans one token starting at pchLine[cbCur], records its class and end
  // offset in 'token' and updates the lexer state *plxs.
  //  - When no INTAG bit is set in *plxs the input is treated as text and
  //    dispatched on GetTextHint().
  //  - Otherwise the input is inside a tag and dispatched on GetHint().
  // Returns the offset just past the token.
  729. //
  730. UINT FindNextToken(LPCTSTR pchLine, UINT cbLen, UINT cbCur, DWORD * plxs, TXTB & token)
  731. {
  732. ASSERT (cbCur < cbLen);
  733. HINT hint;
  734. if (!(*plxs & INTAG)) // scanning text
  735. {
  // Inside a special text element (TEXTMASK): let FindEndTag() consume the
  // content up to the matching end tag or a "<%" first.
  736. if (*plxs & TEXTMASK)
  737. {
  738. if (*plxs & inCOMMENT)
  739. token.tokClass = tokComment;
  740. else
  741. token.tokClass = tokIDENTIFIER;
  742. // probe for end tag </comment>
  743. UINT cbEnd = FindEndTag(pchLine, cbLen, cbCur, plxs, token);
  744. if (cbEnd > cbCur) // parsed a nonzero-length token
  745. {
  746. return cbEnd;
  747. }
  748. //else fall through to normal processing
  749. }
  750. hint = GetTextHint(pchLine, cbLen, cbCur, plxs, token);
  751. switch (hint)
  752. {
  753. case HTA:
  754. // begin a tag
  755. return FindTagOpen(pchLine, cbLen, cbCur, plxs, token);
  756. case HEN:
  757. // scan an entity reference
  758. token.ibTokMac = FindEntityRef(pchLine, cbLen, cbCur, plxs, token);
  759. return token.ibTokMac;
  // For EOS/ONL, GetTextHint() has already filled in the token class and end.
  760. case EOS:
  761. case ONL:
  762. return token.ibTokMac;
  763. case ERR:
  764. default:
  765. // scan text as a single token
  766. // If the editor uses token info for more than coloring
  767. // (e.g. extended selections), then this will need to
  768. // return smaller chunks.
  769. if (*plxs & inSCRIPT)
  770. *plxs |= inScriptText;
  771. return FindText(pchLine, cbLen, cbCur, token);
  772. break;
  773. }
  // Not reached: every case of the switch above returns.
  774. return cbCur;
  775. }
  776. ASSERT(*plxs & INTAG); // must be in a tag here
  777. BOOL bError = FALSE;
  778. hint = GetHint(pchLine, cbLen, cbCur, plxs, token);
  779. switch (hint)
  780. {
  781. case HTE:
  782. // Tag end: remove all tag state bits
  783. *plxs &= ~TAGMASK;
  784. cbCur++;
  785. token.tokClass = tokTag;
  786. token.tok = TokTag_CLOSE;
  787. token.ibTokMac = cbCur;
  788. break;
  // The "#if 0" branch below is compiled out, so HNU falls through to HRN.
  789. case HNU:
  790. #if 0 // lexing HTML instance, not a DTD!
  791. if (!IsNumber(pchLine, cbLen, cbCur, token))
  792. bError = TRUE;
  793. if (SetValueSeen(plxs))
  794. token.tokClass = tokValue;
  795. break;
  796. #else
  797. // fall through
  798. #endif
  799. case HRN: // reserved name start: #
  800. #if 1 // lexing HTML instance, not a DTD!
  801. // simple nonwhitespace stream
  // Outside a value this is an error: the run is still scanned with
  // FindValue() but reported as tokSpace, and bError is cleared again.
  802. if (!(*plxs & inValue))
  803. bError = TRUE;
  804. FindValue(pchLine, cbLen, cbCur, plxs, token);
  805. if (bError)
  806. {
  807. token.tokClass = tokSpace;
  808. bError = FALSE; //"corrected" the error
  809. }
  810. #else
  811. cbCur++;
  812. if (cbCur == cbLen)
  813. token.tokClass = tokOp;
  814. else
  815. {
  816. if (IsIdChar(pchLine[cbCur]))
  817. {
  818. cbCur++;
  819. while (cbCur < cbLen && IsIdChar(pchLine[cbCur]))
  820. cbCur++;
  821. token.tokClass = tokResName;
  822. }
  823. else
  824. token.tokClass = tokOp;
  825. }
  826. token.ibTokMac = cbCur;
  827. if (SetValueSeen(plxs))
  828. token.tokClass = tokValue;
  829. #endif
  830. break;
  // In the active "#if 1" branch '%' is simply handled as a bad character.
  831. case HEP: // parameter entity: %
  832. #if 1 // lexing HTML instance, not a DTD!
  833. goto BadChar;
  834. #else
  835. cbCur++;
  836. if (cbCur == cbLen)
  837. {
  838. token.tokClass = tokOp;
  839. token.ibTokMac = cbCur;
  840. }
  841. else
  842. {
  843. if (IsIdChar(pchLine[cbCur]))
  844. {
  845. token.ibTokMac = FindEndEntity(pchLine, cbLen, cbCur, plxs, token);
  846. token.tokClass = tokParEnt;
  847. }
  848. else
  849. {
  850. token.ibTokMac = cbCur;
  851. token.tokClass = tokOp;
  852. }
  853. }
  854. if (SetValueSeen(plxs))
  855. token.tokClass = tokValue;
  856. #endif
  857. break;
  858. // ported HTMED change (walts) -- handle some chars as valid start char for attribute values.
  859. case HAV:
  860. {
  // Accepted only inside a tag while a value is expected; SetValueSeen()
  // performs the value -> attribute transition as a side effect of the test.
  861. if (!(*plxs & inTag) || !SetValueSeen(plxs))
  862. goto BadChar; // not in tag or attribute value.
  863. int iTokenLength = GetValueTokenLength(pchLine, cbLen, cbCur);
  864. token.ibTokMac = token.ibTokMin + iTokenLength;
  865. token.tokClass = tokValue;
  866. break;
  867. }
  868. // ported HTMED change (walts) -- handle some chars as valid start char for attribute values.
  869. case HKW: // identifier
  870. {
  871. int iTokenLength = GetTokenLength(pchLine, cbLen, cbCur);
  872. token.ibTokMac = token.ibTokMin + iTokenLength;
  873. token.tokClass = tokName;
  874. //FUTURE: Don't scan attributes in an end tag
  875. if (*plxs & (inTag|inEndTag))
  876. {
  // Three cases inside a start or end tag: an attribute name is expected,
  // a value is expected, or (neither) this is the element name.
  877. if (*plxs & inAttribute)
  878. {
  879. IsAttributeName(pchLine, cbCur, iTokenLength, token);
  880. // don't change attribute/value state here
  881. // we only look for values after we've seen "=" in case OEQ below
  882. // REVIEW(cgomes): what if more attributes follow
  883. // the SPAN??
  884. // if found STARTSPAN then pretend I am not in a tag
  885. if(token.tok == TokAttrib_STARTSPAN)
  886. *plxs &= ~(inTag | inAttribute);
  887. // if found ENDSPAN then goback to comment state
  888. else if(token.tok == TokAttrib_ENDSPAN)
  889. {
  890. *plxs &= ~(inTag | inAttribute);
  891. *plxs |= inBangTag | inComment;
  892. }
  893. }
  894. else if (SetValueSeen(plxs))
  895. {
  896. // REVIEW (walts)
  897. // Handle the client side script language detection here for the
  898. // following case (language attribute value is NOT wrapped by quotes.)
  899. // <SCRIPT LANGUAGE=VBScript>
  900. if (*plxs & inSCRIPT)
  901. {
  902. SetScriptLanguage(&pchLine[cbCur], plxs);
  903. }
  904. //
  905. // REVIEW(cgomes): It seems that any non-white space character
  906. // is valid for non-quoted attribute values.
  907. // Problem is that GetTokenLength is used to determine
  908. // the token length, which works great non-values,
  909. // but pulls egss for values.
  910. // I use GetValueTokenLength here to get the length
  911. // of value token. GetValueTokenLength will not
  912. // stop till it hits a white space character.
  913. //
  914. iTokenLength = GetValueTokenLength(pchLine, cbLen, cbCur);
  915. token.ibTokMac = token.ibTokMin + iTokenLength;
  // The tokName assignment is immediately overwritten; the effective class
  // is tokValue.
  916. token.tokClass = tokName;
  917. token.tokClass = tokValue;
  918. }
  919. else
  920. {
  921. IsElementName(pchLine, cbCur, iTokenLength, token);
  922. // look for attributes
  923. *plxs |= inAttribute;
  924. // set content state
  925. if (*plxs & inTag)
  926. *plxs |= TextStateFromElement(&pchLine[token.ibTokMin], iTokenLength);
  927. else if ((*plxs & inEndTag) && (*plxs & TEXTMASK))
  928. *plxs &= ~TextStateFromElement(&pchLine[token.ibTokMin], iTokenLength);
  929. else if ((*plxs & inEndTag) && (*plxs & inSCRIPT))
  930. *plxs &= ~(inSCRIPT | inScriptText | inServerASP/* | inVBScript | inJavaScript*/);
  931. }
  932. }
  933. else if (*plxs & inBangTag)
  934. {
  935. // FUTURE: other <!...> items like "HTML", "PUBLIC"? -- nice for DTDs
  936. // Use a RW table for it if we do
  937. // recognize <!DOCTYPE ...> as 'element'
  938. if ((iTokenLength == 7) &&
  939. (0 == _tcsnicmp(&pchLine[cbCur], _T("doctype"), 7)))
  940. token.tokClass = tokElem;
  941. }
  942. break;
  943. }
  944. case HST: // string "..."
  945. *plxs |= inString;
  946. goto String;
  947. case HSL: // string alternate '...'
  948. *plxs |= inStringA;
  949. String:
  // Step over the opening quote; cbCur then indexes the first character of
  // the string body, which is also what SetScriptLanguage() is given below.
  950. cbCur++;
  951. token.ibTokMac = FindEndString(pchLine, cbLen, cbCur, plxs, token);
  952. SetValueSeen(plxs);
  953. // Handle the client side script language detection here for the
  954. // following case (language attribute value is wrapped by quotes.)
  955. // <SCRIPT LANGUAGE="VBScript">
  956. if((*plxs & inSCRIPT) && (*plxs & inAttribute))
  957. {
  958. SetScriptLanguage(&pchLine[cbCur], plxs);
  959. }
  960. break;
  961. case HWS: // tag whitespace
  962. do
  963. {
  964. cbCur++;
  965. } while (cbCur < cbLen && IsWhiteSpace(pchLine[cbCur]));
  966. token.tokClass = tokSpace;
  967. token.ibTokMac = cbCur;
  968. break;
  969. case OEQ:
  970. // GetHint has set token info
  971. if (*plxs & inAttribute)
  972. {
  973. // start looking for values
  974. *plxs &= ~inAttribute;
  975. *plxs |= inValue;
  976. }
  977. else
  978. goto BadChar;
  979. break;
  // A '<' inside a tag is only accepted when it opens a server script ("<%");
  // any other '<' falls through and is handled as a bad character.
  980. case HTA:
  981. if (cbCur+1 < cbLen && '%' == pchLine[cbCur+1])
  982. {
  983. SetValueSeen(plxs);
  984. return FindTagOpen(pchLine, cbLen, cbCur, plxs, token);
  985. }
  986. // else fall through
  987. case ERR:
  988. case HEN:
  989. BadChar:
  990. token.tokClass = tokSpace;
  991. // DS96# 10116 [CFlaat]: we can be in DBCS here, and so we need
  992. // to make sure that our increment is double-byte aware
  993. cbCur += _tcsnbcnt(pchLine + cbCur, 1); // byte count for current char
  994. ASSERT(cbCur <= cbLen);
  995. token.ibTokMac = cbCur;
  996. break;
  997. // ported HTMED CHANGE (walts) - added this case to handle dbcs attribute values.
  998. case HDB:
  999. {
  1000. // DBCS char. Handle for attribute values within tag.
  1001. if (!SetValueSeen(plxs))
  1002. goto BadChar;
  1003. int iTokenLength = GetValueTokenLength(pchLine, cbLen, cbCur);
  1004. token.ibTokMac = token.ibTokMin + iTokenLength;
  1005. token.tokClass = tokValue;
  1006. }
  1007. break;
  1008. // ported HTMED CHANGE END
  1009. default:
  1010. // GetHint has set token info
  1011. if (token.tokClass != tokComment && (*plxs & inValue))
  1012. FindValue(pchLine, cbLen, cbCur, plxs, token);
  1013. break;
  1014. }
  // NOTE(review): with the preprocessor branches as selected above, the only
  // place that sets bError (case HRN) clears it again before breaking, so
  // this call looks unreachable -- confirm before relying on it.
  1015. if (bError)
  1016. IsUnknownID(pchLine, cbLen, cbCur, token);
  1017. return token.ibTokMac;
  1018. }
  1019. ////////////////////////////////////////////////////////////////////
  1020. // GetTextHint()
  1021. // Like GetHint when scanning text -- look only for tags and entities
  1022. //
  1023. HINT GetTextHint(LPCTSTR pchLine, UINT /*cbLen*/, UINT cbCur, DWORD * /*plxs*/, TXTB & token)
  1024. {
  1025. // if the character is bigger than 128 (dbcs) then return error
  1026. if (pchLine[cbCur] & ~0x7F)
  1027. return HDB;
  1028. HINT hint = g_hintTable[pchLine[cbCur]];
  1029. if (IsSingleOp(hint))
  1030. {
  1031. hint = ERR;
  1032. }
  1033. else if (hint == ONL || hint == EOS)
  1034. {
  1035. token.tokClass = tokOp;
  1036. token.ibTokMac = cbCur + 1;
  1037. }
  1038. return hint;
  1039. }
  1040. ////////////////////////////////////////////////////////////////////
  1041. // GetHint()
  1042. // Use hint table to guess what the next token going to be
  1043. // If it is a single operator, it will fill in the token info
  1044. // as well
  1045. //
  1046. HINT GetHint(LPCTSTR pchLine, UINT cbLen, UINT cbCur, DWORD * plxs, TXTB & token)
  1047. {
  1048. // if the character is bigger than 128 (dbcs) then return error
  1049. if (pchLine[cbCur] & ~0x7F)
  1050. return HDB;
  1051. HINT hint = g_hintTable[pchLine[cbCur]];
  1052. // check if it is a single op, new line or end of stream
  1053. if (IsSingleOp(hint) || hint == ONL || hint == EOS)
  1054. {
  1055. token.tokClass = hint;
  1056. token.ibTokMac = cbCur + 1;
  1057. }
  1058. else if (hint == ODA)
  1059. {
  1060. if ((cbCur + 1 < cbLen) &&
  1061. (g_hintTable[pchLine[cbCur + 1]] == ODA) &&
  1062. (*plxs & inBangTag))
  1063. {
  1064. cbCur += 2;
  1065. *plxs |= inComment;
  1066. COMMENTTYPE ct = IfHackComment(pchLine, cbLen, cbCur, plxs, token);
  1067. if (ct == 0)
  1068. {
  1069. token.tokClass = tokComment;
  1070. token.ibTokMac = cbCur;
  1071. }
  1072. else if(ct == CT_METADATA)
  1073. hint = HTA; // tag open
  1074. }
  1075. else
  1076. {
  1077. // single -
  1078. token.tokClass = tokOp;
  1079. token.ibTokMac = cbCur + 1;
  1080. }
  1081. }
  1082. return hint;
  1083. }
  1084. ///////////////////////////////////////////////////////////////////
  1085. // GetTokenLength ()
  1086. // return the length of a token identifier/keyword
  1087. //
  1088. UINT GetTokenLength(LPCTSTR pchLine, UINT cbLen, UINT cbCur)
  1089. {
  1090. LPCTSTR pCurrent = &pchLine[cbCur];
  1091. UINT cb;
  1092. UINT cbOld = cbCur;
  1093. if (IsCharAlphaNumeric(*pCurrent))
  1094. {
  1095. while (cbCur < cbLen && IsIdChar(*pCurrent))
  1096. {
  1097. cb = _tclen(pCurrent);
  1098. cbCur += cb;
  1099. pCurrent += cb;
  1100. }
  1101. }
  1102. return (int) max((cbCur - cbOld), 1);
  1103. }
  1104. /*
  1105. UINT GetValueTokenLength
  1106. Description:
  1107. Gets the length of the token.
  1108. This version will accept any non whitespace character
  1109. in the token.
  1110. */
  1111. UINT GetValueTokenLength(LPCTSTR pchLine, UINT cbLen, UINT cbCur)
  1112. {
  1113. LPCTSTR pCurrent = &pchLine[cbCur];
  1114. UINT cb;
  1115. UINT cbOld = cbCur;
  1116. while (cbCur < cbLen && !_istspace(*pCurrent) && IsValueChar(*pCurrent))
  1117. {
  1118. cb = _tclen(pCurrent);
  1119. cbCur += cb;
  1120. pCurrent += cb;
  1121. }
  1122. return (int) max((cbCur - cbOld), 1);
  1123. }
  1124. ////////////////////////////////////////////////////////////////
  1125. // IsElementName ()
  1126. // lookup the keyword table to determine if it is a keyword or not
  1127. //
  1128. BOOL IsElementName(LPCTSTR pchLine, UINT cbCur, int iTokenLength, TXTB & token)
  1129. {
  1130. LPCTSTR pCurrent = &pchLine[cbCur];
  1131. int iFound = NOT_FOUND;
  1132. if (NOT_FOUND != (iFound = g_pTable->FindElement(pCurrent, iTokenLength)))
  1133. {
  1134. token.tokClass = tokElem;
  1135. token.ibTokMac = cbCur + iTokenLength;
  1136. token.tok = iFound; // set token
  1137. }
  1138. return (iFound != NOT_FOUND);
  1139. }
  1140. int IndexFromElementName(LPCTSTR pszName)
  1141. {
  1142. return g_pTable->FindElement(pszName, lstrlen(pszName));
  1143. }
  1144. ////////////////////////////////////////////////////////////////
  1145. // IsAttributeName ()
  1146. // lookup the keyword table to determine if it is a keyword or not
  1147. //
  1148. BOOL IsAttributeName(LPCTSTR pchLine, UINT cbCur, int iTokenLength, TXTB & token)
  1149. {
  1150. LPCTSTR pCurrent = &pchLine[cbCur];
  1151. int iFound = NOT_FOUND;
  1152. if (NOT_FOUND != (iFound = g_pTable->FindAttribute(pCurrent, iTokenLength)))
  1153. {
  1154. token.tokClass = tokAttr;
  1155. // ENDSPAN__ is needed b/c the lexer does not recognize the
  1156. // endspan-- as 2 seperate tokens.
  1157. if(iFound == TokAttrib_ENDSPAN__)
  1158. {
  1159. // endspan-- found. return TokAttrib_ENDSPAN
  1160. // set ibTokMac to not include --.
  1161. token.tok = TokAttrib_ENDSPAN;
  1162. token.ibTokMac = cbCur + iTokenLength - 2;
  1163. }
  1164. else
  1165. {
  1166. token.ibTokMac = cbCur + iTokenLength;
  1167. token.tok = iFound; // set token
  1168. }
  1169. }
  1170. return (iFound != NOT_FOUND);
  1171. }
  1172. //////////////////////////////////////////////////////////////////////////
  1173. // IsIdentifier()
  1174. // check if it is an identifier
  1175. //
  1176. BOOL IsIdentifier (int iTokenLength, TXTB & token)
  1177. {
  1178. if (iTokenLength > 0)
  1179. {
  1180. token.tokClass = tokName;
  1181. token.ibTokMac = token.ibTokMin + iTokenLength;
  1182. return TRUE;
  1183. }
  1184. else
  1185. return FALSE;
  1186. }
  1187. ////////////////////////////////////////////////////////////////////
  1188. // IsUnknownID ()
  1189. // Mark the next token as an ID
  1190. //
  1191. BOOL IsUnknownID (LPCTSTR pchLine, UINT cbLen, UINT cbCur, TXTB & token)
  1192. {
  1193. ASSERT(cbCur < cbLen);
  1194. UINT cb;
  1195. LPCTSTR pCurrent = &pchLine[cbCur];
  1196. cb = _tclen(pCurrent);
  1197. cbCur += cb;
  1198. pCurrent += cb;
  1199. while ((cbCur < cbLen) && IsIdChar(*pCurrent))
  1200. {
  1201. cb = _tclen(pCurrent);
  1202. cbCur += cb;
  1203. pCurrent += cb;
  1204. }
  1205. token.tokClass = tokSpace;
  1206. token.ibTokMac = cbCur;
  1207. return TRUE;
  1208. }
  1209. /////////////////////////////////////////////////////////////////////////
  1210. // IsNumber()
  1211. // Check whether the next token is an SGML NUMTOKEN
  1212. //
  1213. BOOL IsNumber(LPCTSTR pchLine, UINT cbLen, UINT cbCur, TXTB & token)
  1214. {
  1215. if (cbCur >= cbLen)
  1216. return FALSE;
  1217. if (!_istdigit(pchLine[cbCur]))
  1218. return FALSE;
  1219. token.tokClass = tokNum;
  1220. // assume all digits are one byte
  1221. ASSERT(1 == _tclen(&pchLine[cbCur]));
  1222. cbCur++;
  1223. while (cbCur < cbLen && _istdigit(pchLine[cbCur]))
  1224. {
  1225. // assume all digits are one byte
  1226. ASSERT(1 == _tclen(&pchLine[cbCur]));
  1227. cbCur++;
  1228. }
  1229. token.ibTokMac = cbCur;
  1230. return TRUE;
  1231. }
  1232. /* end of file */