|
|
// Copyright (c) 1999 Microsoft Corporation. All rights reserved.
//
// Implementation of Lexer.
//
//#define LIMITEDVBSCRIPT_LOGLEXER // uncomment to log each token as it is scanned
#include "stdinc.h"
#include "enginc.h"
#include "englex.h"
#include "limits"
#ifdef LIMITEDVBSCRIPT_LOGLEXER
#include "englog.h"
#endif
//////////////////////////////////////////////////////////////////////
// Unicode/ASCII character classification
inline bool iswasciialpha(WCHAR c) { return (c >= L'a' && c <= L'z') || (c >= L'A' && c <= L'Z'); } inline bool iswasciidigit(WCHAR c) { return c >= L'0' && c <= L'9'; } inline bool iswasciialnum(WCHAR c) { return iswasciialpha(c) || iswasciidigit(c); } inline WCHAR towasciilower(WCHAR c) { return (c >= L'A' && c <= L'Z') ? c + (L'a' - L'A') : c; }
//////////////////////////////////////////////////////////////////////
// token tables
const TokenKeysym g_TokenKeysyms[] = { { L'(', TOKEN_lparen }, { L')', TOKEN_rparen }, { L',', TOKEN_comma }, { L'-', TOKEN_op_minus }, { L'^', TOKEN_op_pow }, { L'*', TOKEN_op_mult }, { L'\\', TOKEN_op_div }, { L'+', TOKEN_op_plus }, { L'<', TOKEN_op_lt }, { L'>', TOKEN_op_gt }, { L'=', TOKEN_op_eq }, { L'\0', TOKEN_eof } };
const TokenKeyword g_TokenKeywords[] = { { L"sub", TOKEN_sub }, { L"dim", TOKEN_dim }, { L"if", TOKEN_if }, { L"then", TOKEN_then }, { L"end", TOKEN_end }, { L"elseif", TOKEN_elseif }, { L"else", TOKEN_else }, { L"set", TOKEN_set }, { L"call", TOKEN_call }, { L"not", TOKEN_op_not }, { L"mod", TOKEN_op_mod }, { L"is", TOKEN_is }, { L"and", TOKEN_and }, { L"or", TOKEN_or }, { NULL, TOKEN_eof } };
//////////////////////////////////////////////////////////////////////
// helper functions
bool CheckOperatorType(Token t, bool fAcceptParens, bool fAcceptUnary, bool fAcceptBinary, bool fAcceptOverloadedAssignmentTokens) { switch (t) { case TOKEN_set: case TOKEN_sub: return fAcceptOverloadedAssignmentTokens;
case TOKEN_lparen: case TOKEN_rparen: return fAcceptParens;
case TOKEN_op_minus: return fAcceptUnary || fAcceptBinary;
case TOKEN_op_not: return fAcceptUnary;
case TOKEN_op_pow: case TOKEN_op_mult: case TOKEN_op_div: case TOKEN_op_mod: case TOKEN_op_plus: case TOKEN_op_lt: case TOKEN_op_leq: case TOKEN_op_gt: case TOKEN_op_geq: case TOKEN_op_eq: case TOKEN_op_neq: case TOKEN_is: case TOKEN_and: case TOKEN_or: return fAcceptBinary; }
return false; }
//////////////////////////////////////////////////////////////////////
// Lexer
Lexer::Lexer(const WCHAR *pwszSource) : m_p(pwszSource), m_pNext(NULL), m_iLine(1), m_iColumn(1), m_t(TOKEN_sub) { this->Scan(); }
void Lexer::Next() { assert(m_t != TOKEN_eof); if (m_pNext) { m_iColumn += (int)(m_pNext - m_p); m_p = m_pNext; m_pNext = NULL; } else { ++m_p; ++m_iColumn; } }
void Lexer::Scan() { m_szStr[0] = L'\0'; m_iNum = 0; bool fLineBreak = m_t == TOKEN_linebreak; for (;;) { if (fLineBreak) { // line breaks tokens are reported on the line/column that they occur so this isn't isn't adjusted until the next pass
++m_iLine; m_iColumn = 1; }
ScanMain(); if (!fLineBreak || m_t != TOKEN_linebreak) break;
Next(); }
#ifdef LIMITEDVBSCRIPT_LOGLEXER
LogToken(*this); #endif
}
void Lexer::ScanMain() { for (;; this->Next()) { switch (*m_p) { case L'\0': // end of script
m_t = TOKEN_eof; return;
case L'\'': // comment till end of line
for (; *m_p && *m_p != L'\n'; ++m_p) {}
--m_p; // put one char back so the next loop can process it
break;
case L'\t': case L' ': // ignore horizontal white space
break;
case L'\r': // ignore carriage returns
--m_iColumn; // in fact, they don't even count as characters
break;
case L'\n': // line break
m_t = TOKEN_linebreak; return;
default: if (*m_p == L'\"') { // string literal
m_pNext = m_p + 1; char *pszDest = m_szStr; const char *pszMax = m_szStr + g_iMaxBuffer - 1; do { if (!iswascii(*m_pNext)) { this->Next(); // this will update the current position to the offending character -- indicating the correct column of the error
this->err(LEXERR_NonAsciiCharacterInStringLiteral); return; }
if (*m_pNext == L'\n' || *m_pNext == L'\r') { this->err(LEXERR_StringLiteralUnterminated); return; }
if (*m_pNext == L'\"') { if (*++m_pNext != L'\"') break; // found terminating quote
// There were two quotes, the escape sequence for a single quote. The first was skipped and we're all ready to append the second.
} *pszDest++ = *m_pNext++; // we know this works because the character is ascii and those codes correspond to the same numbers in Unicode
} while (pszDest <= pszMax);
if (pszDest > pszMax) { this->err(LEXERR_StringLiteralTooLong); } else { *pszDest = L'\0'; m_t = TOKEN_stringliteral; } return; }
if (iswasciidigit(*m_p)) { // numeric literal
// Cant find a _wtoi like function that handles overflow so do the conversion myself.
// �� Look at runtime version to be sure these aren't constantly recomputed
const int iMaxChop = std::numeric_limits<int>::max() / 10; // if number gets bigger than this and there's another digit then we're going to overflow
const WCHAR wchMaxLast = std::numeric_limits<int>::max() % 10 + L'0'; // if number equals iMaxChop and the next digit is bigger than this then we're going to overflow
m_pNext = m_p; m_iNum = 0; do { m_iNum *= 10; m_iNum += *m_pNext++ - L'0'; } while (iswasciidigit(*m_pNext) && (m_iNum < iMaxChop || (m_iNum == iMaxChop && *m_pNext <= wchMaxLast)));
if (iswasciidigit(*m_pNext)) this->err(LEXERR_NumericLiteralTooLarge); else m_t = TOKEN_numericliteral; return; }
if (!iswasciialpha(*m_p) && !(*m_p == L'_')) { // look for a token in the table of symbols
for (int i = 0; g_TokenKeysyms[i].c; ++i) { if (*m_p == g_TokenKeysyms[i].c) { // we have a match
m_t = g_TokenKeysyms[i].t;
// check for the two-character symbols (>=, <=, <>)
if (m_t == TOKEN_op_lt) { WCHAR wchNext = *(m_p + 1); if (wchNext == L'=') { m_t = TOKEN_op_leq; m_pNext = m_p + 2; } else if (wchNext == L'>') { m_t = TOKEN_op_neq; m_pNext = m_p + 2; } } else if (m_t == TOKEN_op_gt) { if (*(m_p + 1) == L'=') { m_t = TOKEN_op_geq; m_pNext = m_p + 2; } }
return; } }
// the symbol was not recognized
this->err(LEXERR_InvalidCharacter); return; }
// look for a token in the table of keywords
for (int i = 0; g_TokenKeywords[i].s; ++i) { const WCHAR *pwchToken = g_TokenKeywords[i].s; const WCHAR *pwchSource = m_p; while (*pwchToken && *pwchSource && towasciilower(*pwchToken) == towasciilower(*pwchSource)) { ++pwchToken; ++pwchSource; }
if (!*pwchToken && !iswasciialnum(*pwchSource)) { // made it to the end of Token and source word
m_t = g_TokenKeywords[i].t; m_pNext = pwchSource; return; } }
// must be an identifier
for (m_pNext = m_p + 1; iswasciialnum(*m_pNext) || *m_pNext == L'_'; ++m_pNext) {}
if (m_pNext - m_p > g_iMaxBuffer - 1) { this->err(LEXERR_IdentifierTooLong); return; }
char *psz = m_szStr; for (const WCHAR *pwsz = m_p; pwsz < m_pNext; ++psz, ++pwsz) { *psz = *pwsz; }
*psz = '\0';
if (*m_pNext == L'.') { ++m_pNext; m_t = TOKEN_identifierdot; } else { m_t = TOKEN_identifier; } return; } } }
void Lexer::err(LexErr iErr) { static const char *s_rgpszErrorText[] = { "Unexpected error!", // shouldn't ever get this error
"Invalid character", "Identifier too long", "String too long", "Unterminated string constant", "Number too large" };
assert(ARRAY_SIZE(s_rgpszErrorText) == LEXERR_Max); if (iErr <= 0 || iErr >= LEXERR_Max) { assert(false); iErr = LEXERR_NoError; }
m_t = TOKEN_eof; m_iNum = iErr;
// copy error into the buffer
const char *psz = s_rgpszErrorText[iErr]; const char *pszMax = m_szStr + g_iMaxBuffer - 1; for (char *pszDest = m_szStr; pszDest < pszMax && *psz; *pszDest++ = *psz++) {}
assert(!*psz); // since this function is used with hard-coded strings we shouldn't ever get one too long
*pszDest = '\0'; }
|