Source code of Windows XP (NT5)

// Copyright (c) 1999 Microsoft Corporation. All rights reserved.
//
// Declaration of Lexer.
//
//#define LIMITEDVBSCRIPT_LOGLEXER
#include "stdinc.h"
#include "enginc.h"
#include "englex.h"
#include "limits"
#ifdef LIMITEDVBSCRIPT_LOGLEXER
#include "englog.h"
#endif
//////////////////////////////////////////////////////////////////////
// Unicode/ASCII character classification
inline bool iswasciialpha(WCHAR c) { return (c >= L'a' && c <= L'z') || (c >= L'A' && c <= L'Z'); }
inline bool iswasciidigit(WCHAR c) { return c >= L'0' && c <= L'9'; }
inline bool iswasciialnum(WCHAR c) { return iswasciialpha(c) || iswasciidigit(c); }
inline WCHAR towasciilower(WCHAR c) { return (c >= L'A' && c <= L'Z') ? c + (L'a' - L'A') : c; }
//////////////////////////////////////////////////////////////////////
// token tables
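// Note: both tables end with a sentinel entry (L'\0' for the keysym table,
// NULL for the keyword table) that terminates the linear scans in ScanMain.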
const TokenKeysym g_TokenKeysyms[] =
{
    { L'(', TOKEN_lparen },
    { L')', TOKEN_rparen },
    { L',', TOKEN_comma },
    { L'-', TOKEN_op_minus },
    { L'^', TOKEN_op_pow },
    { L'*', TOKEN_op_mult },
    { L'\\', TOKEN_op_div },
    { L'+', TOKEN_op_plus },
    { L'<', TOKEN_op_lt },
    { L'>', TOKEN_op_gt },
    { L'=', TOKEN_op_eq },
    { L'\0', TOKEN_eof }
};
const TokenKeyword g_TokenKeywords[] =
{
    { L"sub", TOKEN_sub },
    { L"dim", TOKEN_dim },
    { L"if", TOKEN_if },
    { L"then", TOKEN_then },
    { L"end", TOKEN_end },
    { L"elseif", TOKEN_elseif },
    { L"else", TOKEN_else },
    { L"set", TOKEN_set },
    { L"call", TOKEN_call },
    { L"not", TOKEN_op_not },
    { L"mod", TOKEN_op_mod },
    { L"is", TOKEN_is },
    { L"and", TOKEN_and },
    { L"or", TOKEN_or },
    { NULL, TOKEN_eof }
};
//////////////////////////////////////////////////////////////////////
// helper functions
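// Returns true if token t is acceptable as an operator of the requested kind:
// parentheses, a unary operator, a binary operator, or one of the tokens
// (Set/Sub) that the caller may accept as overloaded assignment tokens.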
bool
CheckOperatorType(Token t, bool fAcceptParens, bool fAcceptUnary, bool fAcceptBinary, bool fAcceptOverloadedAssignmentTokens)
{
    switch (t)
    {
    case TOKEN_set:
    case TOKEN_sub:
        return fAcceptOverloadedAssignmentTokens;
    case TOKEN_lparen:
    case TOKEN_rparen:
        return fAcceptParens;
    case TOKEN_op_minus:
        return fAcceptUnary || fAcceptBinary;
    case TOKEN_op_not:
        return fAcceptUnary;
    case TOKEN_op_pow:
    case TOKEN_op_mult:
    case TOKEN_op_div:
    case TOKEN_op_mod:
    case TOKEN_op_plus:
    case TOKEN_op_lt:
    case TOKEN_op_leq:
    case TOKEN_op_gt:
    case TOKEN_op_geq:
    case TOKEN_op_eq:
    case TOKEN_op_neq:
    case TOKEN_is:
    case TOKEN_and:
    case TOKEN_or:
        return fAcceptBinary;
    }
    return false;
}
//////////////////////////////////////////////////////////////////////
// Lexer
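// The constructor primes the lexer: m_t starts as TOKEN_sub (any value other
// than TOKEN_linebreak works here, so the first Scan doesn't bump the line
// counter) and Scan() is called to read the first token of the script.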
Lexer::Lexer(const WCHAR *pwszSource)
  : m_p(pwszSource),
    m_pNext(NULL),
    m_iLine(1),
    m_iColumn(1),
    m_t(TOKEN_sub)
{
    this->Scan();
}
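// Next advances past the current token.  Tokens longer than one character
// record their end position in m_pNext while being scanned; single-character
// tokens simply step forward by one.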
void
Lexer::Next()
{
    assert(m_t != TOKEN_eof);
    if (m_pNext)
    {
        m_iColumn += (int)(m_pNext - m_p);
        m_p = m_pNext;
        m_pNext = NULL;
    }
    else
    {
        ++m_p;
        ++m_iColumn;
    }
}
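// Scan reads the next token into m_t.  When the token just consumed was a line
// break, the line counter is advanced first, and any further consecutive line
// breaks are skipped (still advancing m_iLine) so a run of blank lines yields
// only a single TOKEN_linebreak.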
void
Lexer::Scan()
{
    m_szStr[0] = L'\0';
    m_iNum = 0;
    bool fLineBreak = m_t == TOKEN_linebreak;
    for (;;)
    {
        if (fLineBreak)
        {
            // Line break tokens are reported on the line/column where they occur,
            // so the position isn't adjusted until the next pass.
            ++m_iLine;
            m_iColumn = 1;
        }
        ScanMain();
        if (!fLineBreak || m_t != TOKEN_linebreak)
            break;
        Next();
    }
#ifdef LIMITEDVBSCRIPT_LOGLEXER
    LogToken(*this);
#endif
}
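// ScanMain does the actual tokenizing: it skips whitespace and comments, then
// dispatches on the current character to scan a string literal, numeric
// literal, symbol, keyword, or identifier.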
void
Lexer::ScanMain()
{
    for (;; this->Next())
    {
        switch (*m_p)
        {
        case L'\0':
            // end of script
            m_t = TOKEN_eof;
            return;
        case L'\'':
            // comment till end of line
            for (; *m_p && *m_p != L'\n'; ++m_p)
            {}
            --m_p; // put one char back so the next loop can process it
            break;
        case L'\t': case L' ':
            // ignore horizontal white space
            break;
        case L'\r':
            // ignore carriage returns
            --m_iColumn; // in fact, they don't even count as characters
            break;
        case L'\n':
            // line break
            m_t = TOKEN_linebreak;
            return;
        default:
            if (*m_p == L'\"')
            {
                // string literal
                m_pNext = m_p + 1;
                char *pszDest = m_szStr;
                const char *pszMax = m_szStr + g_iMaxBuffer - 1;
                do
                {
                    if (!iswascii(*m_pNext))
                    {
                        this->Next(); // this will update the current position to the offending character -- indicating the correct column of the error
                        this->err(LEXERR_NonAsciiCharacterInStringLiteral);
                        return;
                    }
                    if (*m_pNext == L'\n' || *m_pNext == L'\r')
                    {
                        this->err(LEXERR_StringLiteralUnterminated);
                        return;
                    }
                    if (*m_pNext == L'\"')
                    {
                        if (*++m_pNext != L'\"')
                            break; // found terminating quote
                        // There were two quotes, the escape sequence for a single quote.
                        // The first was skipped and we're all ready to append the second.
                    }
                    *pszDest++ = *m_pNext++; // we know this works because the character is ascii and those codes correspond to the same numbers in Unicode
                } while (pszDest <= pszMax);
                if (pszDest > pszMax)
                {
                    this->err(LEXERR_StringLiteralTooLong);
                }
                else
                {
                    *pszDest = L'\0';
                    m_t = TOKEN_stringliteral;
                }
                return;
            }
            if (iswasciidigit(*m_p))
            {
                // numeric literal
                // Can't find a _wtoi-like function that handles overflow, so do the conversion myself.
                // Look at the runtime version to be sure these aren't constantly recomputed.
                const int iMaxChop = std::numeric_limits<int>::max() / 10; // if number gets bigger than this and there's another digit then we're going to overflow
                const WCHAR wchMaxLast = std::numeric_limits<int>::max() % 10 + L'0'; // if number equals iMaxChop and the next digit is bigger than this then we're going to overflow
                m_pNext = m_p;
                m_iNum = 0;
                do
                {
                    m_iNum *= 10;
                    m_iNum += *m_pNext++ - L'0';
                } while (iswasciidigit(*m_pNext) && (m_iNum < iMaxChop || (m_iNum == iMaxChop && *m_pNext <= wchMaxLast)));
                if (iswasciidigit(*m_pNext))
                    this->err(LEXERR_NumericLiteralTooLarge);
                else
                    m_t = TOKEN_numericliteral;
                return;
            }
            if (!iswasciialpha(*m_p) && !(*m_p == L'_'))
            {
                // look for a token in the table of symbols
                for (int i = 0; g_TokenKeysyms[i].c; ++i)
                {
                    if (*m_p == g_TokenKeysyms[i].c)
                    {
                        // we have a match
                        m_t = g_TokenKeysyms[i].t;
                        // check for the two-character symbols (>=, <=, <>)
                        if (m_t == TOKEN_op_lt)
                        {
                            WCHAR wchNext = *(m_p + 1);
                            if (wchNext == L'=')
                            {
                                m_t = TOKEN_op_leq;
                                m_pNext = m_p + 2;
                            }
                            else if (wchNext == L'>')
                            {
                                m_t = TOKEN_op_neq;
                                m_pNext = m_p + 2;
                            }
                        }
                        else if (m_t == TOKEN_op_gt)
                        {
                            if (*(m_p + 1) == L'=')
                            {
                                m_t = TOKEN_op_geq;
                                m_pNext = m_p + 2;
                            }
                        }
                        return;
                    }
                }
                // the symbol was not recognized
                this->err(LEXERR_InvalidCharacter);
                return;
            }
            // look for a token in the table of keywords
            for (int i = 0; g_TokenKeywords[i].s; ++i)
            {
                const WCHAR *pwchToken = g_TokenKeywords[i].s;
                const WCHAR *pwchSource = m_p;
                while (*pwchToken && *pwchSource && towasciilower(*pwchToken) == towasciilower(*pwchSource))
                {
                    ++pwchToken;
                    ++pwchSource;
                }
                if (!*pwchToken && !iswasciialnum(*pwchSource))
                {
                    // made it to the end of the keyword and the end of the word in the source
                    m_t = g_TokenKeywords[i].t;
                    m_pNext = pwchSource;
                    return;
                }
            }
            // must be an identifier
            for (m_pNext = m_p + 1; iswasciialnum(*m_pNext) || *m_pNext == L'_'; ++m_pNext)
            {}
            if (m_pNext - m_p > g_iMaxBuffer - 1)
            {
                this->err(LEXERR_IdentifierTooLong);
                return;
            }
            char *psz = m_szStr;
            for (const WCHAR *pwsz = m_p; pwsz < m_pNext; ++psz, ++pwsz)
            {
                *psz = *pwsz;
            }
            *psz = '\0';
            if (*m_pNext == L'.')
            {
                ++m_pNext;
                m_t = TOKEN_identifierdot;
            }
            else
            {
                m_t = TOKEN_identifier;
            }
            return;
        }
    }
}
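// err records a lexer error: the token is forced to TOKEN_eof, the error code
// is stashed in m_iNum, and the corresponding message text is copied into m_szStr.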
void Lexer::err(LexErr iErr)
{
    static const char *s_rgpszErrorText[] =
    {
        "Unexpected error!", // shouldn't ever get this error
        "Invalid character",
        "Identifier too long",
        "String too long",
        "Unterminated string constant",
        "Number too large"
    };
    assert(ARRAY_SIZE(s_rgpszErrorText) == LEXERR_Max);
    if (iErr <= 0 || iErr >= LEXERR_Max)
    {
        assert(false);
        iErr = LEXERR_NoError;
    }
    m_t = TOKEN_eof;
    m_iNum = iErr;
    // copy error into the buffer
    const char *psz = s_rgpszErrorText[iErr];
    const char *pszMax = m_szStr + g_iMaxBuffer - 1;
    char *pszDest = m_szStr; // declared outside the loop so the terminator can be written after it
    for (; pszDest < pszMax && *psz; *pszDest++ = *psz++)
    {}
    assert(!*psz); // since this function is used with hard-coded strings we shouldn't ever get one too long
    *pszDest = '\0';
}
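// Example (illustrative note, not part of the original file): given the source
// line `if x >= 10 then`, successive scans produce TOKEN_if, TOKEN_identifier
// ("x" in m_szStr), TOKEN_op_geq, TOKEN_numericliteral (m_iNum == 10),
// TOKEN_then, and finally TOKEN_linebreak at the newline.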