|
|
/*++
Copyright (c) 2000 Microsoft Corporation
Module Name:
adllexer.cpp
Abstract:
Implementation of the lexer for the ADL language
Author:
t-eugenz - August 2000
Environment:
User mode only.
Revision History:
Created - August 2000
--*/
#include "adl.h"
//
// Constant values outside WCHAR range, for special characters.
//
// Each code below is greater than 65535, so a resolved special-character
// code is not confused with an ordinary character value. The lexer's
// constructor maps the language-specific characters (from the language
// spec) to these codes, and RESOLVE_CHAR substitutes the code for any
// input character found in that map. This lets the DFA switch on fixed
// constants regardless of which actual characters the language uses.
//
#define CHAR_COMMA 65538
#define CHAR_QUOTE 65539
#define CHAR_SEMICOLON 65540
#define CHAR_OPENPAREN 65541
#define CHAR_CLOSEPAREN 65542
#define CHAR_NULL 65543
#define CHAR_NEWLINE 65544
#define CHAR_RETURN 65545
#define CHAR_TAB 65546
#define CHAR_SPACE 65547
#define CHAR_AT 65548
#define CHAR_SLASH 65549
#define CHAR_PERIOD 65550
//
// States of the lexer DFA, as used by AdlLexer::NextToken:
//
//   STATE_WHITESPACE - initial state; skips newline, return, space and tab
//   STATE_BEGIN      - looking at the first character of a token
//   STATE_IDENT      - accumulating an unquoted identifier
//   STATE_QUOTE      - accumulating the contents of a quoted string
//   STATE_DONE       - token complete; the DFA loop terminates
//
#define STATE_WHITESPACE 0
#define STATE_BEGIN 1
#define STATE_IDENT 2
#define STATE_QUOTE 3
#define STATE_DONE 4
//
// Look CHAR up in the special character map MAP. When it is present the
// expression yields the mapped special symbol (>65535); when it is absent
// the expression yields the plain character value.
//
// Side effects: ITER is assigned the result of MAP.find(CHAR) (it equals
// ITEREND on a miss), and CHAR is evaluated twice, so the CHAR argument
// should be free of side effects.
//
#define RESOLVE_CHAR(CHAR, MAP, ITER, ITEREND) \
    ( ( ((ITER) = (MAP).find((CHAR))) != (ITEREND) ) ? (ITER)->second : (CHAR) )
AdlLexer::AdlLexer(IN const WCHAR *input,
                   IN OUT AdlStatement *adlStat,
                   IN const PADL_LANGUAGE_SPEC pLang)
/*++

Routine Description:

    Builds a lexer over the given input string. Records the input, the
    owning statement and the language spec, resets the read position and
    token count, and fills the two lookup tables used by NextToken:
    special character -> CHAR_* code, and keyword string -> TK_* token.

Arguments:

    input   - The string to tokenize

    adlStat - The AdlStatement instance, for token garbage collection

    pLang   - The ADL language description supplying the special
              characters and keyword strings

Return Value:

    none

--*/
{
    _adlStat = adlStat;
    _pLang = pLang;
    _input = input;

    _tokCount = 0;
    _position = 0;

    //
    // Special character mapping. The assignment order is kept fixed: if
    // the language spec ever used one character for two roles, the later
    // assignment would win.
    //

    _mapCharCode[_pLang->CH_NULL] = CHAR_NULL;
    _mapCharCode[_pLang->CH_SPACE] = CHAR_SPACE;
    _mapCharCode[_pLang->CH_TAB] = CHAR_TAB;
    _mapCharCode[_pLang->CH_NEWLINE] = CHAR_NEWLINE;
    _mapCharCode[_pLang->CH_RETURN] = CHAR_RETURN;
    _mapCharCode[_pLang->CH_QUOTE] = CHAR_QUOTE;
    _mapCharCode[_pLang->CH_COMMA] = CHAR_COMMA;
    _mapCharCode[_pLang->CH_SEMICOLON] = CHAR_SEMICOLON;
    _mapCharCode[_pLang->CH_OPENPAREN] = CHAR_OPENPAREN;
    _mapCharCode[_pLang->CH_CLOSEPAREN] = CHAR_CLOSEPAREN;
    _mapCharCode[_pLang->CH_AT] = CHAR_AT;
    _mapCharCode[_pLang->CH_SLASH] = CHAR_SLASH;
    _mapCharCode[_pLang->CH_PERIOD] = CHAR_PERIOD;

    //
    // Cache the end iterator so RESOLVE_CHAR does not recompute it on
    // every lookup. The map is fully populated above.
    //

    _iterEnd = _mapCharCode.end();

    //
    // Keyword table: map each special token string to its token type,
    // for O(log n) string searches
    //

    _mapStringToken[_pLang->SZ_TK_AND] = TK_AND;
    _mapStringToken[_pLang->SZ_TK_EXCEPT] = TK_EXCEPT;
    _mapStringToken[_pLang->SZ_TK_ON] = TK_ON;
    _mapStringToken[_pLang->SZ_TK_ALLOWED] = TK_ALLOWED;
    _mapStringToken[_pLang->SZ_TK_AS] = TK_AS;
    _mapStringToken[_pLang->SZ_TK_THIS_OBJECT] = TK_THIS_OBJECT;
    _mapStringToken[_pLang->SZ_TK_CONTAINERS] = TK_CONTAINERS;
    _mapStringToken[_pLang->SZ_TK_OBJECTS] = TK_OBJECTS;
    _mapStringToken[_pLang->SZ_TK_CONTAINERS_OBJECTS] = TK_CONTAINERS_OBJECTS;
    _mapStringToken[_pLang->SZ_TK_NO_PROPAGATE] = TK_NO_PROPAGATE;
}
DWORD AdlLexer::NextToken(OUT AdlToken **value)
/*++

Routine Description:

    Retrieves the next token from the input string. The scanner is a small
    DFA that starts in STATE_WHITESPACE and runs until it reaches
    STATE_DONE, at which point a token is produced.

    The very first call does not scan anything: it returns the grammar
    type from the language spec and leaves *value untouched.

    On every later call a new AdlToken is allocated, handed to the
    AdlStatement through AddToken, and recorded as the current error
    token. At end of input the token has empty text and the type is 0,
    unless the keyword lookup below overrides it.

    The keyword lookup is applied to every produced token, including text
    that came from a quoted string.

Arguments:

    value - Receives a pointer to the new token holding the string value
            (not written on the first call)

Return Value:

    DWORD - The token type, as #define'd by YACC in tokens.h

    Throws AdlStatement::ERROR_UNTERMINATED_STRING when a quoted string
    runs into a null, tab, newline or return before its closing quote.

--*/
{
    DWORD dfaState = STATE_WHITESPACE;
    DWORD tokenType = TK_ERROR;
    DWORD symbol;
    DWORD startPos = 0;
    wstring text;

    //
    // First token should be the grammar type
    //

    if( _tokCount == 0 )
    {
        _tokCount++;
        return _pLang->dwLanguageType;
    }

    symbol = RESOLVE_CHAR(_input[_position], _mapCharCode, _iter, _iterEnd);

    while( dfaState != STATE_DONE )
    {
        switch( dfaState )
        {
        case STATE_WHITESPACE:

            switch( symbol )
            {
            case CHAR_NULL:
                //
                // End of input
                //
                tokenType = 0;
                dfaState = STATE_DONE;
                break;

            case CHAR_NEWLINE:
            case CHAR_RETURN:
            case CHAR_SPACE:
            case CHAR_TAB:
                //
                // Skip one whitespace character
                //
                _position++;
                symbol = RESOLVE_CHAR(_input[_position], _mapCharCode, _iter, _iterEnd);
                break;

            default:
                dfaState = STATE_BEGIN;
                break;
            }
            break;

        case STATE_BEGIN:

            startPos = _position;
            tokenType = TK_ERROR;

            switch( symbol )
            {
            case CHAR_NULL:
                dfaState = STATE_DONE;
                break;

            case CHAR_COMMA:
            case CHAR_OPENPAREN:
            case CHAR_CLOSEPAREN:
            case CHAR_SEMICOLON:
            case CHAR_AT:
            case CHAR_SLASH:
            case CHAR_PERIOD:

                //
                // Pick the token type for this punctuation character
                //

                switch( symbol )
                {
                case CHAR_COMMA:
                    tokenType = TK_COMMA;
                    break;
                case CHAR_OPENPAREN:
                    tokenType = TK_OPENPAREN;
                    break;
                case CHAR_CLOSEPAREN:
                    tokenType = TK_CLOSEPAREN;
                    break;
                case CHAR_SEMICOLON:
                    tokenType = TK_SEMICOLON;
                    break;
                case CHAR_AT:
                    tokenType = TK_AT;
                    break;
                case CHAR_SLASH:
                    tokenType = TK_SLASH;
                    break;
                case CHAR_PERIOD:
                    tokenType = TK_PERIOD;
                    break;
                }

                //
                // Same action for all special single-char tokens:
                // the token text is exactly that one character
                //

                text += _input[_position];
                _position++;
                symbol = RESOLVE_CHAR(_input[_position], _mapCharCode, _iter, _iterEnd);
                dfaState = STATE_DONE;
                break;

            case CHAR_QUOTE:
                //
                // Consume the opening quote; it is not part of the text
                //
                _position++;
                symbol = RESOLVE_CHAR(_input[_position], _mapCharCode, _iter, _iterEnd);
                tokenType = TK_IDENT;
                dfaState = STATE_QUOTE;
                break;

            default:
                tokenType = TK_IDENT;
                dfaState = STATE_IDENT;
                break;
            }
            break;

        case STATE_IDENT:

            switch( symbol )
            {
            case CHAR_NULL:
            case CHAR_COMMA:
            case CHAR_OPENPAREN:
            case CHAR_CLOSEPAREN:
            case CHAR_SEMICOLON:
            case CHAR_NEWLINE:
            case CHAR_RETURN:
            case CHAR_TAB:
            case CHAR_SPACE:
            case CHAR_AT:
            case CHAR_SLASH:
            case CHAR_PERIOD:
            case CHAR_QUOTE:
                //
                // Any special character ends the identifier and is left
                // in the input for the next call
                //
                dfaState = STATE_DONE;
                break;

            default:
                text += _input[_position];
                _position++;
                symbol = RESOLVE_CHAR(_input[_position], _mapCharCode, _iter, _iterEnd);
                break;
            }
            break;

        case STATE_QUOTE:

            switch( symbol )
            {
            case CHAR_NULL:
            case CHAR_TAB:
            case CHAR_NEWLINE:
            case CHAR_RETURN:
                throw AdlStatement::ERROR_UNTERMINATED_STRING;

            case CHAR_QUOTE:
                //
                // Consume the closing quote and finish the token
                //
                _position++;
                symbol = RESOLVE_CHAR(_input[_position], _mapCharCode, _iter, _iterEnd);
                dfaState = STATE_DONE;
                break;

            default:
                text += _input[_position];
                _position++;
                symbol = RESOLVE_CHAR(_input[_position], _mapCharCode, _iter, _iterEnd);
                break;
            }
            break;

        default:

            //
            // Should never get here, well-defined states
            //

            assert(FALSE);
            break;
        }
    }

    //
    // Done state was reached
    // Export the string and column/row info in YACC-form here
    //

    AdlToken *token = new AdlToken(text.c_str(), startPos, _position - 1);
    _adlStat->AddToken(token);

    //
    // Check if the string is a special token. The original comment calls
    // this lookup case-insensitive; that depends on the comparator of
    // _mapStringToken, which is declared elsewhere.
    //

    if( _mapStringToken.find(token->GetValue()) != _mapStringToken.end() )
    {
        tokenType = _mapStringToken[token->GetValue()];
    }

    *value = token;

    //
    // Set this token to be the error token. This way, if the string is
    // not accepted by the parser, we know at which token the parser failed
    // If another error occurs later, this value will be overwritten
    //

    _adlStat->SetErrorToken(token);

    _tokCount++;

    return tokenType;
}
|