/*++

Copyright (c) 2000  Microsoft Corporation

Module Name:

    adllexer.cpp

Abstract:

    Implementation of the lexer for the ADL language

Author:

    t-eugenz - August 2000

Environment:

    User mode only.

Revision History:

    Created - August 2000

--*/

#include "adl.h"

//
// Constant values outside WCHAR range, for special characters
//

#define CHAR_COMMA          65538
#define CHAR_QUOTE          65539
#define CHAR_SEMICOLON      65540
#define CHAR_OPENPAREN      65541
#define CHAR_CLOSEPAREN     65542
#define CHAR_NULL           65543
#define CHAR_NEWLINE        65544
#define CHAR_RETURN         65545
#define CHAR_TAB            65546
#define CHAR_SPACE          65547
#define CHAR_AT             65548
#define CHAR_SLASH          65549
#define CHAR_PERIOD         65550

//
// States of the lexer DFA
//

#define STATE_WHITESPACE    0
#define STATE_BEGIN         1
#define STATE_IDENT         2
#define STATE_QUOTE         3
#define STATE_DONE          4

//
// If the character is found in the special character map, use the special
// symbol (>65535), otherwise use the regular character value.
//
// Note that the macro assigns the lookup result to ITER as a side effect and
// evaluates CHAR twice.
//

#define RESOLVE_CHAR(CHAR, MAP, ITER, ITEREND) \
    ((((ITER) = (MAP).find((CHAR)) ) == (ITEREND) ) ? (CHAR) : (*(ITER)).second)


static
DWORD
SingleCharTokenType(IN DWORD dwCharCode)
/*++

Routine Description:

    Maps a resolved special-character code to the token type of the
    corresponding single-character token.

Arguments:

    dwCharCode  -   A character value as produced by RESOLVE_CHAR

Return Value:

    DWORD   -   TK_COMMA, TK_OPENPAREN, TK_CLOSEPAREN, TK_SEMICOLON, TK_AT,
                TK_SLASH or TK_PERIOD for the matching CHAR_* code, and
                TK_ERROR for every other value

--*/
{
    switch( dwCharCode )
    {
    case CHAR_COMMA:
        return TK_COMMA;

    case CHAR_OPENPAREN:
        return TK_OPENPAREN;

    case CHAR_CLOSEPAREN:
        return TK_CLOSEPAREN;

    case CHAR_SEMICOLON:
        return TK_SEMICOLON;

    case CHAR_AT:
        return TK_AT;

    case CHAR_SLASH:
        return TK_SLASH;

    case CHAR_PERIOD:
        return TK_PERIOD;

    default:
        return TK_ERROR;
    }
}


AdlLexer::AdlLexer(IN      const WCHAR *input,
                   IN OUT  AdlStatement *adlStat,
                   IN      const PADL_LANGUAGE_SPEC pLang)
/*++

Routine Description:

    Constructor for the AdlLexer. Stores the input and collaborators, resets
    the read position and token count, and fills the two lookup maps from the
    language specification: one from the language's special characters to the
    CHAR_* codes, one from the language's keyword strings to the TK_* types.

Arguments:

    input   -   The input string

    adlStat -   The AdlStatement instance; NextToken registers every token it
                allocates with it (AddToken) and reports the most recent
                token to it (SetErrorToken)

    pLang   -   The ADL language description

Return Value:

    none

--*/
{
    _input = input;
    _adlStat = adlStat;
    _pLang = pLang;

    _position = 0;
    _tokCount = 0;

    //
    // Special character mapping. The insertion order is kept as-is: should
    // two entries of the language spec share a character, the later
    // assignment is the one that stays in the map.
    //

    _mapCharCode[pLang->CH_NULL]        = CHAR_NULL;
    _mapCharCode[pLang->CH_SPACE]       = CHAR_SPACE;
    _mapCharCode[pLang->CH_TAB]         = CHAR_TAB;
    _mapCharCode[pLang->CH_NEWLINE]     = CHAR_NEWLINE;
    _mapCharCode[pLang->CH_RETURN]      = CHAR_RETURN;
    _mapCharCode[pLang->CH_QUOTE]       = CHAR_QUOTE;
    _mapCharCode[pLang->CH_COMMA]       = CHAR_COMMA;
    _mapCharCode[pLang->CH_SEMICOLON]   = CHAR_SEMICOLON;
    _mapCharCode[pLang->CH_OPENPAREN]   = CHAR_OPENPAREN;
    _mapCharCode[pLang->CH_CLOSEPAREN]  = CHAR_CLOSEPAREN;
    _mapCharCode[pLang->CH_AT]          = CHAR_AT;
    _mapCharCode[pLang->CH_SLASH]       = CHAR_SLASH;
    _mapCharCode[pLang->CH_PERIOD]      = CHAR_PERIOD;

    //
    // Cache the end iterator so RESOLVE_CHAR does not fetch it per character
    //

    _iterEnd = _mapCharCode.end();

    //
    // Keyword strings go into a map as well, so that NextToken can recognize
    // them with a single keyed lookup
    //

    _mapStringToken[pLang->SZ_TK_AND]                = TK_AND;
    _mapStringToken[pLang->SZ_TK_EXCEPT]             = TK_EXCEPT;
    _mapStringToken[pLang->SZ_TK_ON]                 = TK_ON;
    _mapStringToken[pLang->SZ_TK_ALLOWED]            = TK_ALLOWED;
    _mapStringToken[pLang->SZ_TK_AS]                 = TK_AS;
    _mapStringToken[pLang->SZ_TK_THIS_OBJECT]        = TK_THIS_OBJECT;
    _mapStringToken[pLang->SZ_TK_CONTAINERS]         = TK_CONTAINERS;
    _mapStringToken[pLang->SZ_TK_OBJECTS]            = TK_OBJECTS;
    _mapStringToken[pLang->SZ_TK_CONTAINERS_OBJECTS] = TK_CONTAINERS_OBJECTS;
    _mapStringToken[pLang->SZ_TK_NO_PROPAGATE]       = TK_NO_PROPAGATE;
}


DWORD
AdlLexer::NextToken(OUT AdlToken **value)
/*++

Routine Description:

    This retrieves the next token from the input string. This is basically a
    DFA which begins in the WHITESPACE state, and runs until it reaches the
    DONE state, at which point it returns a token.

    The very first call does not read the input at all: it returns the
    language type from the language specification and leaves *value
    untouched.

    Throws AdlStatement::ERROR_UNTERMINATED_STRING if a quoted string runs
    into a NULL, TAB, NEWLINE or RETURN character before its closing quote.

Arguments:

    value   -   Pointer to a new token containing the string value is stored
                in *value (not written on the first call, see above)

Return Value:

    DWORD   -   The token type, as #define'd by YACC in tokens.h; 0 once the
                end of the input has been reached

--*/
{
    DWORD lexState = STATE_WHITESPACE;
    DWORD tokType = TK_ERROR;
    DWORD dwTokStart = 0;
    DWORD dwInput;
    wstring tokenText;

    //
    // First token should be the grammar type
    //

    if( _tokCount == 0 )
    {
        _tokCount++;
        return _pLang->dwLanguageType;
    }

    dwInput = RESOLVE_CHAR(_input[_position], _mapCharCode, _iter, _iterEnd);

    while( lexState != STATE_DONE )
    {
        //
        // Each state only decides what should happen to the current
        // character; the shared actions are carried out once, at the bottom
        // of the loop.
        //

        bool fKeepChar = false;     // append current character to the token
        bool fAdvance = false;      // step to and resolve the next character

        switch( lexState )
        {
        case STATE_WHITESPACE:

            //
            // Skip separators until something else shows up
            //

            switch( dwInput )
            {
            case CHAR_NULL:
                tokType = 0;
                lexState = STATE_DONE;
                break;

            case CHAR_NEWLINE:
            case CHAR_RETURN:
            case CHAR_SPACE:
            case CHAR_TAB:
                fAdvance = true;
                break;

            default:
                lexState = STATE_BEGIN;
                break;
            }
            break;

        case STATE_BEGIN:

            //
            // First character of a token; remember where it starts
            //

            dwTokStart = _position;
            tokType = SingleCharTokenType(dwInput);

            if( tokType != TK_ERROR )
            {
                //
                // Same action for all special single-char tokens
                //

                fKeepChar = true;
                fAdvance = true;
                lexState = STATE_DONE;
            }
            else if( dwInput == CHAR_NULL )
            {
                //
                // Token type stays TK_ERROR
                //

                lexState = STATE_DONE;
            }
            else if( dwInput == CHAR_QUOTE )
            {
                //
                // The opening quote itself is not part of the token text
                //

                fAdvance = true;
                lexState = STATE_QUOTE;
                tokType = TK_IDENT;
            }
            else
            {
                //
                // Leave the character in place, STATE_IDENT consumes it
                //

                lexState = STATE_IDENT;
                tokType = TK_IDENT;
            }
            break;

        case STATE_IDENT:

            //
            // An unquoted identifier ends, without consuming it, at any
            // special character
            //

            switch( dwInput )
            {
            case CHAR_NULL:
            case CHAR_COMMA:
            case CHAR_OPENPAREN:
            case CHAR_CLOSEPAREN:
            case CHAR_SEMICOLON:
            case CHAR_NEWLINE:
            case CHAR_RETURN:
            case CHAR_TAB:
            case CHAR_SPACE:
            case CHAR_AT:
            case CHAR_SLASH:
            case CHAR_PERIOD:
            case CHAR_QUOTE:
                lexState = STATE_DONE;
                break;

            default:
                fKeepChar = true;
                fAdvance = true;
                break;
            }
            break;

        case STATE_QUOTE:

            //
            // Inside a quoted string everything is taken literally up to the
            // closing quote, except the characters that abort the string
            //

            switch( dwInput )
            {
            case CHAR_NULL:
            case CHAR_TAB:
            case CHAR_NEWLINE:
            case CHAR_RETURN:
                throw AdlStatement::ERROR_UNTERMINATED_STRING;

            case CHAR_QUOTE:
                fAdvance = true;
                lexState = STATE_DONE;
                break;

            default:
                fKeepChar = true;
                fAdvance = true;
                break;
            }
            break;

        default:

            //
            // Should never get here, well-defined states
            //

            assert(FALSE);
            break;
        }

        if( fKeepChar )
        {
            tokenText.append( &(_input[_position]), 1 );
        }

        if( fAdvance )
        {
            _position++;
            dwInput = RESOLVE_CHAR(_input[_position],
                                   _mapCharCode,
                                   _iter,
                                   _iterEnd);
        }
    }

    //
    // Done state was reached
    // Export the string and column/row info in YACC-form here
    //
    // NOTE(review): when the end of the input is hit in STATE_WHITESPACE,
    // dwTokStart still holds its initial value 0, and for an empty input
    // the end position below is computed from _position == 0. Confirm that
    // consumers of the end-of-input token expect this span.
    //

    AdlToken *pToken = new AdlToken(tokenText.c_str(),
                                    dwTokStart,
                                    _position - 1);

    _adlStat->AddToken(pToken);

    //
    // Check if the string is a special token. The lookup is performed for
    // every token, including quoted ones. The original comment describes it
    // as case-insensitive; that depends on the comparator of
    // _mapStringToken, which is declared outside this file.
    //

    if( _mapStringToken.find(pToken->GetValue()) != _mapStringToken.end() )
    {
        tokType = _mapStringToken[pToken->GetValue()];
    }

    *value = pToken;

    //
    // Set this token to be the error token. This way, if the string is
    // not accepted by the parser, we know at which token the parser failed
    // If another error occurs later, this value will be overwritten
    //

    _adlStat->SetErrorToken(pToken);

    _tokCount++;

    return tokType;
}