Source code of Windows XP (NT5)

// Copyright (c) 1999 Microsoft Corporation. All rights reserved.
//
// Implementation of Lexer.
//
//#define LIMITEDVBSCRIPT_LOGLEXER // §§
#include "stdinc.h"
#include "enginc.h"
#include "englex.h"
#include "limits"
#ifdef LIMITEDVBSCRIPT_LOGLEXER
#include "englog.h"
#endif
//////////////////////////////////////////////////////////////////////
// Unicode/ASCII character classification
inline bool iswasciialpha(WCHAR c) { return (c >= L'a' && c <= L'z') || (c >= L'A' && c <= L'Z'); }
inline bool iswasciidigit(WCHAR c) { return c >= L'0' && c <= L'9'; }
inline bool iswasciialnum(WCHAR c) { return iswasciialpha(c) || iswasciidigit(c); }
inline WCHAR towasciilower(WCHAR c) { return (c >= L'A' && c <= L'Z') ? c + (L'a' - L'A') : c; }
//////////////////////////////////////////////////////////////////////
// token tables
const TokenKeysym g_TokenKeysyms[] =
{
    { L'(', TOKEN_lparen },
    { L')', TOKEN_rparen },
    { L',', TOKEN_comma },
    { L'-', TOKEN_op_minus },
    { L'^', TOKEN_op_pow },
    { L'*', TOKEN_op_mult },
    { L'\\', TOKEN_op_div },
    { L'+', TOKEN_op_plus },
    { L'<', TOKEN_op_lt },
    { L'>', TOKEN_op_gt },
    { L'=', TOKEN_op_eq },
    { L'\0', TOKEN_eof }
};
const TokenKeyword g_TokenKeywords[] =
{
{ L"sub", TOKEN_sub },
{ L"dim", TOKEN_dim },
{ L"if", TOKEN_if },
{ L"then", TOKEN_then },
{ L"end", TOKEN_end },
{ L"elseif", TOKEN_elseif },
{ L"else", TOKEN_else },
{ L"set", TOKEN_set },
{ L"call", TOKEN_call },
{ L"not", TOKEN_op_not },
{ L"mod", TOKEN_op_mod },
{ L"is", TOKEN_is },
{ L"and", TOKEN_and },
{ L"or", TOKEN_or },
{ NULL, TOKEN_eof }
};
//////////////////////////////////////////////////////////////////////
// helper functions
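// Returns true if token t is an operator of an accepted kind: parentheses,
// unary, binary, and/or the set/sub tokens used for overloaded assignment.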
bool
CheckOperatorType(Token t, bool fAcceptParens, bool fAcceptUnary, bool fAcceptBinary, bool fAcceptOverloadedAssignmentTokens)
{
    switch (t)
    {
    case TOKEN_set:
    case TOKEN_sub:
        return fAcceptOverloadedAssignmentTokens;
    case TOKEN_lparen:
    case TOKEN_rparen:
        return fAcceptParens;
    case TOKEN_op_minus:
        return fAcceptUnary || fAcceptBinary;
    case TOKEN_op_not:
        return fAcceptUnary;
    case TOKEN_op_pow:
    case TOKEN_op_mult:
    case TOKEN_op_div:
    case TOKEN_op_mod:
    case TOKEN_op_plus:
    case TOKEN_op_lt:
    case TOKEN_op_leq:
    case TOKEN_op_gt:
    case TOKEN_op_geq:
    case TOKEN_op_eq:
    case TOKEN_op_neq:
    case TOKEN_is:
    case TOKEN_and:
    case TOKEN_or:
        return fAcceptBinary;
    }
    return false;
}
//////////////////////////////////////////////////////////////////////
// Lexer
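// Construct a lexer over the NUL-terminated source string and scan the first token.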
Lexer::Lexer(const WCHAR *pwszSource)
    : m_p(pwszSource),
      m_pNext(NULL),
      m_iLine(1),
      m_iColumn(1),
      m_t(TOKEN_sub)
{
    this->Scan();
}
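// Advance the current position past the text of the current token,
// keeping the column counter in sync.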
void
Lexer::Next()
{
    assert(m_t != TOKEN_eof);
    if (m_pNext)
    {
        m_iColumn += (int)(m_pNext - m_p);
        m_p = m_pNext;
        m_pNext = NULL;
    }
    else
    {
        ++m_p;
        ++m_iColumn;
    }
}
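// Scan the next token. If the previous token was a line break, the line counter is
// advanced and any immediately following line breaks are consumed, so a run of blank
// lines produces only a single TOKEN_linebreak.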
void
Lexer::Scan()
{
    m_szStr[0] = L'\0';
    m_iNum = 0;
    bool fLineBreak = m_t == TOKEN_linebreak;
    for (;;)
    {
        if (fLineBreak)
        {
            // line break tokens are reported on the line/column where they occur, so this isn't adjusted until the next pass
            ++m_iLine;
            m_iColumn = 1;
        }
        ScanMain();
        if (!fLineBreak || m_t != TOKEN_linebreak)
            break;
        Next();
    }
#ifdef LIMITEDVBSCRIPT_LOGLEXER
    LogToken(*this);
#endif
}
void
Lexer::ScanMain()
{
    for (;; this->Next())
    {
        switch (*m_p)
        {
        case L'\0':
            // end of script
            m_t = TOKEN_eof;
            return;
        case L'\'':
            // comment till end of line
            for (; *m_p && *m_p != L'\n'; ++m_p)
            {}
            --m_p; // put one char back so the next loop can process it
            break;
        case L'\t': case L' ':
            // ignore horizontal white space
            break;
        case L'\r':
            // ignore carriage returns
            --m_iColumn; // in fact, they don't even count as characters
            break;
        case L'\n':
            // line break
            m_t = TOKEN_linebreak;
            return;
        default:
            if (*m_p == L'\"')
            {
                // string literal
                m_pNext = m_p + 1;
                char *pszDest = m_szStr;
                const char *pszMax = m_szStr + g_iMaxBuffer - 1;
                do
                {
                    if (!iswascii(*m_pNext))
                    {
                        this->Next(); // this will update the current position to the offending character -- indicating the correct column of the error
                        this->err(LEXERR_NonAsciiCharacterInStringLiteral);
                        return;
                    }
                    if (*m_pNext == L'\n' || *m_pNext == L'\r')
                    {
                        this->err(LEXERR_StringLiteralUnterminated);
                        return;
                    }
                    if (*m_pNext == L'\"')
                    {
                        if (*++m_pNext != L'\"')
                            break; // found terminating quote
                        // There were two quotes, the escape sequence for a single quote. The first was skipped and we're all ready to append the second.
                    }
                    *pszDest++ = *m_pNext++; // we know this works because the character is ascii and those codes correspond to the same numbers in Unicode
                } while (pszDest <= pszMax);
                if (pszDest > pszMax)
                {
                    this->err(LEXERR_StringLiteralTooLong);
                }
                else
                {
                    *pszDest = L'\0';
                    m_t = TOKEN_stringliteral;
                }
                return;
            }
            if (iswasciidigit(*m_p))
            {
                // numeric literal
                // Can't find a _wtoi-like function that handles overflow, so do the conversion myself.
                // §§ Look at runtime version to be sure these aren't constantly recomputed
                const int iMaxChop = std::numeric_limits<int>::max() / 10; // if number gets bigger than this and there's another digit then we're going to overflow
                const WCHAR wchMaxLast = std::numeric_limits<int>::max() % 10 + L'0'; // if number equals iMaxChop and the next digit is bigger than this then we're going to overflow
                m_pNext = m_p;
                m_iNum = 0;
                do
                {
                    m_iNum *= 10;
                    m_iNum += *m_pNext++ - L'0';
                } while (iswasciidigit(*m_pNext) && (m_iNum < iMaxChop || (m_iNum == iMaxChop && *m_pNext <= wchMaxLast)));
                if (iswasciidigit(*m_pNext))
                    this->err(LEXERR_NumericLiteralTooLarge);
                else
                    m_t = TOKEN_numericliteral;
                return;
            }
            if (!iswasciialpha(*m_p) && !(*m_p == L'_'))
            {
                // look for a token in the table of symbols
                for (int i = 0; g_TokenKeysyms[i].c; ++i)
                {
                    if (*m_p == g_TokenKeysyms[i].c)
                    {
                        // we have a match
                        m_t = g_TokenKeysyms[i].t;
                        // check for the two-character symbols (>=, <=, <>)
                        if (m_t == TOKEN_op_lt)
                        {
                            WCHAR wchNext = *(m_p + 1);
                            if (wchNext == L'=')
                            {
                                m_t = TOKEN_op_leq;
                                m_pNext = m_p + 2;
                            }
                            else if (wchNext == L'>')
                            {
                                m_t = TOKEN_op_neq;
                                m_pNext = m_p + 2;
                            }
                        }
                        else if (m_t == TOKEN_op_gt)
                        {
                            if (*(m_p + 1) == L'=')
                            {
                                m_t = TOKEN_op_geq;
                                m_pNext = m_p + 2;
                            }
                        }
                        return;
                    }
                }
                // the symbol was not recognized
                this->err(LEXERR_InvalidCharacter);
                return;
            }
            // look for a token in the table of keywords
            for (int i = 0; g_TokenKeywords[i].s; ++i)
            {
                const WCHAR *pwchToken = g_TokenKeywords[i].s;
                const WCHAR *pwchSource = m_p;
                while (*pwchToken && *pwchSource && towasciilower(*pwchToken) == towasciilower(*pwchSource))
                {
                    ++pwchToken;
                    ++pwchSource;
                }
                if (!*pwchToken && !iswasciialnum(*pwchSource))
                {
                    // made it to the end of the keyword and the source word
                    m_t = g_TokenKeywords[i].t;
                    m_pNext = pwchSource;
                    return;
                }
            }
            // must be an identifier
            for (m_pNext = m_p + 1; iswasciialnum(*m_pNext) || *m_pNext == L'_'; ++m_pNext)
            {}
            if (m_pNext - m_p > g_iMaxBuffer - 1)
            {
                this->err(LEXERR_IdentifierTooLong);
                return;
            }
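            // narrow the identifier into the char buffer (safe because identifier characters are limited to ASCII)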
            char *psz = m_szStr;
            for (const WCHAR *pwsz = m_p; pwsz < m_pNext; ++psz, ++pwsz)
            {
                *psz = *pwsz;
            }
            *psz = '\0';
            if (*m_pNext == L'.')
            {
                ++m_pNext;
                m_t = TOKEN_identifierdot;
            }
            else
            {
                m_t = TOKEN_identifier;
            }
            return;
        }
    }
}
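// Record a lexer error: the token stream is terminated with TOKEN_eof, the error
// code is stored in m_iNum, and the corresponding message text is copied into m_szStr.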
void Lexer::err(LexErr iErr)
{
    static const char *s_rgpszErrorText[] =
    {
        "Unexpected error!", // shouldn't ever get this error
        "Invalid character",
        "Identifier too long",
        "String too long",
        "Unterminated string constant",
        "Number too large"
    };
    assert(ARRAY_SIZE(s_rgpszErrorText) == LEXERR_Max);
    if (iErr <= 0 || iErr >= LEXERR_Max)
    {
        assert(false);
        iErr = LEXERR_NoError;
    }
    m_t = TOKEN_eof;
    m_iNum = iErr;
    // copy error into the buffer
    const char *psz = s_rgpszErrorText[iErr];
    const char *pszMax = m_szStr + g_iMaxBuffer - 1;
    char *pszDest = m_szStr; // declared outside the loop because it is needed afterward to terminate the string
    for (; pszDest < pszMax && *psz; *pszDest++ = *psz++)
    {}
    assert(!*psz); // since this function is used with hard-coded strings we shouldn't ever get one too long
    *pszDest = '\0';
}