/************************************************************************/
/*									*/
/* RCPP - Resource Compiler Pre-Processor for NT system			*/
/*									*/
/* P0GETTOK.C - Tokenization routines					*/
/*									*/
/* 29-Nov-90 w-BrianM  Update for NT from PM SDK RCPP			*/
/*									*/
/************************************************************************/

#include <stdio.h>
#include "rcpptype.h"
#include "rcppdecl.h"
#include "rcppext.h"
#include "grammar.h"
#include "p0defs.h"
#include "charmap.h"

/************************************************************************
**	MAP_TOKEN : a token has two representations and additional information.
**	(ex : const, has basic token of L_CONST,
**				mapped token of [L_TYPE | L_MODIFIER]
**				and info based on what the map token is)
**	MAP_AND_FILL : has two representations, but none of the extra info.
**	(ex : '<', has basic of L_LT, and map of L_RELOP)
**  NOMAP_TOKEN : has 1 representation and additional info.
**	(ex: a string, basic and 'map' type L_STRING and ptrs to the actual str)
**	NOMAP_AND_FILL : has 1 representation and no additional info.
**	(ex : 'while', has basic and 'map' of L_WHILE)
**  the FILL versions fill the token with the basic token type.
************************************************************************/
#define	MAP_TOKEN(otok)\
	(Basic_token = (otok), TS_VALUE(Basic_token))
#define	MAP_AND_FILL(otok)\
	(yylval.yy_token = Basic_token = (otok), TS_VALUE(Basic_token))
#define	NOMAP_TOKEN(otok)\
	(Basic_token = (otok))
#define	NOMAP_AND_FILL(otok)\
	(yylval.yy_token = Basic_token = (otok))



/************************************************************************/
/* yylex - main tokenization routine					*/
/************************************************************************/

token_t yylex(void)
{
    REG	UCHAR		last_mapped;
    UCHAR		mapped_c;
    REG	token_t		lex_token;

    for(;;) {
	last_mapped = mapped_c = CHARMAP(GETCH());
first_switch:
	switch(mapped_c) {
	case LX_EACH:
	case LX_ASCII:
	    Msg_Temp = GET_MSG(2018);
	    SET_MSG (Msg_Text, Msg_Temp, PREVCH());
	    error(2018);
	    continue;
	    break;
	case LX_OBRACE:
	    return(NOMAP_AND_FILL(L_LCURLY));
	    break;
	case LX_CBRACE:
	    return(NOMAP_AND_FILL(L_RCURLY));
	    break;
	case LX_OBRACK:
	    return(NOMAP_AND_FILL(L_LBRACK));
	    break;
	case LX_CBRACK:
	    return(NOMAP_AND_FILL(L_RBRACK));
	    break;
	case LX_OPAREN:
	    return(NOMAP_AND_FILL(L_LPAREN));
	    break;
	case LX_CPAREN:
	    return(NOMAP_AND_FILL(L_RPAREN));
	    break;
	case LX_COMMA:
	    return(NOMAP_AND_FILL(L_COMMA));
	    break;
	case LX_QUEST:
	    return(NOMAP_AND_FILL(L_QUEST));
	    break;
	case LX_SEMI:
	    return(NOMAP_AND_FILL(L_SEMI));
	    break;
	case LX_TILDE:
	    return(NOMAP_AND_FILL(L_TILDE));
	    break;
	case LX_NUMBER:
	    return(MAP_TOKEN(getnum(PREVCH())));
	    break;


	case LX_MINUS:
	    switch(last_mapped = CHARMAP(GETCH())) {
	    case LX_EQ:
		return(MAP_AND_FILL(L_MINUSEQ));
		break;
	    case LX_GT:
		return(MAP_AND_FILL(L_POINTSTO));
		break;
	    case LX_MINUS:
		return(MAP_AND_FILL(L_DECR));
		break;
	    default:
		lex_token = L_MINUS;
		break;
	    }
	    break;
	case LX_PLUS:
	    switch(last_mapped = CHARMAP(GETCH())) {
	    case LX_EQ:
		return(MAP_AND_FILL(L_PLUSEQ));
		break;
	    case LX_PLUS:
		return(MAP_AND_FILL(L_INCR));
		break;
	    default:
		lex_token = L_PLUS;
		break;
	    }
	    break;
	case LX_AND:
	    switch(last_mapped = CHARMAP(GETCH())) {
	    case LX_EQ:
		return(MAP_AND_FILL(L_ANDEQ));
		break;
	    case LX_AND:
		return(MAP_AND_FILL(L_ANDAND));
		break;
	    default:
		lex_token = L_AND;
		break;
	    }
	    break;
	case LX_OR:
	    switch(last_mapped = CHARMAP(GETCH())) {
	    case LX_EQ:
		return(MAP_AND_FILL(L_OREQ));
		break;
	    case LX_OR:
		return(MAP_AND_FILL(L_OROR));
		break;
	    default:
		lex_token = L_OR;
		break;
	    }
	    break;
	case LX_COLON:
	    return(NOMAP_AND_FILL(L_COLON));
	    break;
	case LX_HAT:
	    if((last_mapped = CHARMAP(GETCH())) == LX_EQ) {
		return(MAP_AND_FILL(L_XOREQ));
	    }
	    lex_token = L_XOR;
	    break;
	case LX_PERCENT:
	    if((last_mapped = CHARMAP(GETCH())) == LX_EQ) {
		return(MAP_AND_FILL(L_MODEQ));
	    }
	    lex_token = L_MOD;
	    break;
	case LX_EQ:
	    if((last_mapped = CHARMAP(GETCH())) == LX_EQ) {
		return(MAP_AND_FILL(L_EQUALS));
	    }
	    lex_token = L_ASSIGN;
	    break;
	case LX_BANG:
	    if((last_mapped = CHARMAP(GETCH())) == LX_EQ) {
		return(MAP_AND_FILL(L_NOTEQ));
	    }
	    lex_token = L_EXCLAIM;
	    break;
	case LX_SLASH:
	    switch(last_mapped = CHARMAP(GETCH())) {
	    case LX_STAR:
		dump_comment();
		continue;
		break;
	    case LX_SLASH:
		DumpSlashComment();
		continue;
		break;
	    case LX_EQ:
		return(MAP_AND_FILL(L_DIVEQ));
		break;
	    default:
		lex_token = L_DIV;
		break;
	    }
	    break;
	case LX_STAR:
	    switch(last_mapped = CHARMAP(GETCH())) {
	    case LX_SLASH:
		if( ! Prep ) {
		    Msg_Temp = GET_MSG(2138);
		    SET_MSG (Msg_Text, Msg_Temp);
		    error(2138); /* (nested comments) */
		}
		else {
		    fwrite("*/", 2, 1, OUTPUTFILE);
		}
		continue;
	    case LX_EQ:
		return(MAP_AND_FILL(L_MULTEQ));
		break;
	    default:
		lex_token = L_MULT;
		break;
	    }
	    break;
	case LX_LT:
	    switch(last_mapped = CHARMAP(GETCH())) {
	    case LX_LT:
		if((last_mapped = CHARMAP(GETCH())) == LX_EQ) {
		    return(MAP_AND_FILL(L_LSHFTEQ));
		}
		mapped_c = LX_LSHIFT;
		lex_token = L_LSHIFT;
		break;
	    case LX_EQ:
		return(MAP_AND_FILL(L_LTEQ));
		break;
	    default:
		lex_token = L_LT;
		break;
	    }
	    break;
	case LX_LSHIFT:
	    /*
			**  if the next char is not an =, then we unget and return,
			**  since the only way in here is if we broke on the char
			**  following '<<'. since we'll have already worked the handle_eos()
			**  code prior to getting here, we'll not see another eos,
			**  UNLESS i/o buffering is char by char. ???
			**  see also, LX_RSHIFT
			*/
	    if((last_mapped = CHARMAP(GETCH())) == LX_EQ) {
		return(MAP_AND_FILL(L_LSHFTEQ));
	    }
	    UNGETCH();
	    return(MAP_AND_FILL(L_LSHIFT));
	    break;
	case LX_GT:
	    switch(last_mapped = CHARMAP(GETCH())) {
	    case LX_EQ:
		return(MAP_AND_FILL(L_GTEQ));
	    case LX_GT:
		if((last_mapped = CHARMAP(GETCH())) == LX_EQ) {
		    return(MAP_AND_FILL(L_RSHFTEQ));
		}
		mapped_c = LX_RSHIFT;
		lex_token = L_RSHIFT;
		break;
	    default:
		lex_token = L_GT;
		break;
	    }
	    break;
	case LX_RSHIFT:
	    if((last_mapped = CHARMAP(GETCH())) == LX_EQ) {
		return(MAP_AND_FILL(L_RSHFTEQ));
	    }
	    UNGETCH();
	    return(MAP_AND_FILL(L_RSHIFT));
	    break;
	case LX_POUND:
	    if( ! Prep ) {
		Msg_Temp = GET_MSG(2014);
		SET_MSG (Msg_Text, Msg_Temp);
		error(2014);/* # sign must be first non-whitespace */
		UNGETCH();		/* replace it */
		Linenumber--;	/* do_newline counts a newline */
		do_newline();	/* may be a 'real' prepro line */
	    }
	    else {
		fwrite("#", 1, 1, OUTPUTFILE);
	    }
	    continue;
	    break;
	case LX_EOS:
	    if(PREVCH() == '\\') {
		if( ! Prep ) {
		    if( ! checknl()) {	/* ignore the new line */
			Msg_Temp = GET_MSG(2017);
			SET_MSG (Msg_Text, Msg_Temp);
			error(2017);/* illegal escape sequence */
		    }
		}
		else {
		    fputc('\\', OUTPUTFILE);
		    fputc(get_non_eof(), OUTPUTFILE);
		}
		continue;
	    }
	    if(Macro_depth == 0) {
		if( ! io_eob()) {	/* not the end of the buffer */
		    continue;
		}
		if(fpop()) {		/* have more files to read */
		    continue;
		}
		return(MAP_AND_FILL(L_EOF));	/* all gone . . . */
	    }
	    handle_eos();			/* found end of macro */
	    continue;
	    break;
	case LX_DQUOTE:
	    if( ! Prep ) {
		str_const();
		return(NOMAP_TOKEN(L_STRING));
	    }
	    prep_string('\"');
	    continue;
	    break;
	case LX_SQUOTE:
	    if( ! Prep ) {
		return(MAP_TOKEN(char_const()));
	    }
	    prep_string('\'');
	    continue;
	    break;
	case LX_CR:		/*  ??? check for nl next  */
	    continue;
	    break;
	case LX_NL:
	    if(On_pound_line) {
		UNGETCH();
		return(NOMAP_TOKEN(L_NOTOKEN));
	    }
	    if(Prep) {
		fputc('\n', OUTPUTFILE);
	    }
	    do_newline();
	    continue;
	    break;
	case LX_WHITE:		/* skip all white space */
	    if( ! Prep ) {	/* check only once */
		do {
		    ;
		} while(LXC_IS_WHITE(GETCH()));
	    }
	    else {
		UCHAR	c;

		c = PREVCH();
		do {
		    fputc(c, OUTPUTFILE);
		} while(LXC_IS_WHITE(c = GETCH()));
	    }
	    UNGETCH();
	    continue;
	    break;
	    /* Note:
                 * RCPP.EXE does not support DBCS code.
                 * Therefore, we should be displaied error message.
                 * IBM-J PTR 12JP-0092
                 * MSHQ  PTR xxxxx
	                     */
	case LX_LEADBYTE:
	    if( ! Prep ) {       /* check only once */
		Msg_Temp = GET_MSG(2018);
		SET_MSG (Msg_Text, Msg_Temp, PREVCH());
		error(2018);

		Msg_Temp = GET_MSG(2018);
		SET_MSG (Msg_Text, Msg_Temp, GETCH());
		error(2018);
	    }
	    else {
		fputc(PREVCH(), OUTPUTFILE);
#ifdef DBCS // token_t yylex(void)
		fputc(get_non_eof(), OUTPUTFILE);
#else
		fputc(GETCH(), OUTPUTFILE);
#endif // DBCS
	    }
	    continue;
	    break;
	case LX_ILL:
	    if( ! Prep ) {
		Msg_Temp = GET_MSG(2018);
		SET_MSG (Msg_Text, Msg_Temp, PREVCH());
		error(2018);/* unknown character */
	    } else {
		fputc(PREVCH(), OUTPUTFILE);
	    }
	    continue;
	    break;
	case LX_BACKSLASH:
	    if( ! Prep ) {
		if( ! checknl()) {	/* ignore the new line */
		    Msg_Temp = GET_MSG(2017);
		    SET_MSG (Msg_Text, Msg_Temp);
		    error(2017);/* illegal escape sequence */
		}
	    }
	    else {
		fputc('\\', OUTPUTFILE);
		fputc(get_non_eof(), OUTPUTFILE);
	    }
	    continue;
	    break;
	case LX_DOT:
dot_switch:
	    switch(last_mapped = CHARMAP(GETCH())) {
	    case LX_BACKSLASH:
		if(checknl()) {
		    goto dot_switch;
		}
		UNGETCH();
		break;
	    case LX_EOS:
		if(handle_eos() == BACKSLASH_EOS) {
		    break;
		}
		goto dot_switch;
		break;
	    case LX_DOT:
		if( ! checkop('.') ) {
		    Msg_Temp = GET_MSG(2142);
		    SET_MSG (Msg_Text, Msg_Temp);
		    error(2142);/* ellipsis requires three '.'s */
		}
		return(NOMAP_AND_FILL(L_ELLIPSIS));
		break;
	    case LX_NUMBER:
		/*
		**	don't worry about getting correct hash value.
		**	The text equivalent of a real number is never
		**	hashed
		*/
		Reuse_1[0] = '.';
		Reuse_1[1] = PREVCH();
		return(MAP_TOKEN(get_real(&Reuse_1[2])));
		break;
	    }
	    UNGETCH();
	    return(MAP_AND_FILL(L_PERIOD));
	    break;
	case LX_NOEXPAND:
	    SKIPCH();			/* just skip length */
	    continue;
	case LX_ID:
	    {
		pdefn_t	pdef;

		if(Macro_depth > 0) {
		    if( ! lex_getid(PREVCH())) {
			goto avoid_expand;
		    }
		}
		else {
		    getid(PREVCH());
		}

		if( ((pdef = get_defined()) != 0)
		    &&
		    ( ! DEFN_EXPANDING(pdef))
		    &&
		    ( can_expand(pdef))
		    ) {
		    continue;
		}

avoid_expand:
		if( ! Prep ) {
		    /* M00BUG get near copy of identifier???? */
		    HLN_NAME(yylval.yy_ident) = Reuse_1;
		    HLN_HASH(yylval.yy_ident) = Reuse_1_hash;
		    HLN_LENGTH(yylval.yy_ident) = (UCHAR)Reuse_1_length;
		    return(L_IDENT);
		}
		else {
		    fwrite(Reuse_1, Reuse_1_length - 1, 1, OUTPUTFILE);
		    return(NOMAP_TOKEN(L_NOTOKEN));
		}
	    }
	    continue;
	    break;
	}
	/*
	**  all the multichar ( -> -- -= etc ) operands
	**  must come through here. we've gotten the next char,
	**  and not matched one of the possiblities, but we have to check
	**  for the end of the buffer character and act accordingly
	**  if it is the eob, then we handle it and go back for another try.
	**  otherwise, we unget the char we got, and return the base token.
	*/
	if(last_mapped == LX_EOS) {
	    if(handle_eos() != BACKSLASH_EOS) {
		goto first_switch;
	    }
	}
	UNGETCH();	/* cause we got an extra one to check */
	return(MAP_AND_FILL(lex_token));
    }
}


/************************************************************************
**
**	lex_getid: reads an identifier for the main lexer.  The
**		identifier is read into Reuse_1. This function should not handle
**		an end of string if it is rescanning a macro expansion, because
**		this could switch the context with regards to whether the macro
**	      is expandable or not.  Similarly, the noexpand marker must only be
**	     allowed if a macro is being rescanned, otherwise let this character
**		be caught as an illegal character in text
************************************************************************/
int lex_getid(UCHAR c)
{
    REG	UCHAR	*p;
    int		length = 0;

    p = Reuse_1;
    *p++ = c;
    c &= HASH_MASK;
    for(;;) {
	while(LXC_IS_IDENT(*p = GETCH())) { /* collect character */
	    c += (*p & HASH_MASK);			/* hash it */
	    p++;
	}
	if(CHARMAP(*p) == LX_NOEXPAND ) {
	    length = (int)GETCH();
	    continue;
	}
	UNGETCH();
	break;				/* out of for loop  -  only way out */
    }
    if(p >= LIMIT(Reuse_1)) {	/* is this error # correct? */
	Msg_Temp = GET_MSG(1067);
	SET_MSG (Msg_Text, Msg_Temp);
	fatal(1067);
    }
    if(((p - Reuse_1) > LIMIT_ID_LENGTH) && ( ! Prep )) {
	p = Reuse_1 + LIMIT_ID_LENGTH;
	*p = '\0';
	c = local_c_hash(Reuse_1);
	Msg_Temp = GET_MSG(4011);
	SET_MSG (Msg_Text, Msg_Temp, Reuse_1);
	warning(4011);	/* id truncated */
    }
    else {
	*p = '\0';		/* terminates identifier for expandable check */
    }
    Reuse_1_hash = c;
    Reuse_1_length = (UCHAR)((p - Reuse_1) + 1);
    return(length != (p - Reuse_1));
}