/************************************************************************/ /* */ /* RCPP - Resource Compiler Pre-Processor for NT system */ /* */ /* P0GETTOK.C - Tokenization routines */ /* */ /* 29-Nov-90 w-BrianM Update for NT from PM SDK RCPP */ /* */ /************************************************************************/ #include #include "rcpptype.h" #include "rcppdecl.h" #include "rcppext.h" #include "grammar.h" #include "p0defs.h" #include "charmap.h" /************************************************************************ ** MAP_TOKEN : a token has two representations and additional information. ** (ex : const, has basic token of L_CONST, ** mapped token of [L_TYPE | L_MODIFIER] ** and info based on what the map token is) ** MAP_AND_FILL : has two representations, but none of the extra info. ** (ex : '<', has basic of L_LT, and map of L_RELOP) ** NOMAP_TOKEN : has 1 representation and additional info. ** (ex: a string, basic and 'map' type L_STRING and ptrs to the actual str) ** NOMAP_AND_FILL : has 1 representation and no additional info. ** (ex : 'while', has basic and 'map' of L_WHILE) ** the FILL versions fill the token with the basic token type. 
************************************************************************/

/*
** Each macro records the basic token in Basic_token.  The _AND_FILL
** forms also store it in yylval.yy_token.  The MAP_ forms evaluate to
** TS_VALUE(Basic_token); the NOMAP_ forms evaluate to the basic token.
*/
#define MAP_TOKEN(otok)\
    (Basic_token = (otok), TS_VALUE(Basic_token))
#define MAP_AND_FILL(otok)\
    (yylval.yy_token = Basic_token = (otok), TS_VALUE(Basic_token))
#define NOMAP_TOKEN(otok)\
    (Basic_token = (otok))
#define NOMAP_AND_FILL(otok)\
    (yylval.yy_token = Basic_token = (otok))


/************************************************************************/
/* yylex - main tokenization routine                                    */
/*                                                                      */
/* Reads characters with GETCH(), classifies each one with CHARMAP()    */
/* and loops until a token can be returned.                             */
/*                                                                      */
/* Single-character tokens return straight from their case.  Operators  */
/* that need one character of lookahead read it into last_mapped; when  */
/* the lookahead does not extend the operator, the case stores the      */
/* base token in lex_token and breaks out of the switch to the shared   */
/* tail at the bottom of the loop, which deals with an end-of-buffer    */
/* marker or ungets the lookahead character.                            */
/*                                                                      */
/* When Prep is set, several cases write text to OUTPUTFILE and         */
/* continue scanning instead of reporting an error.                     */
/*                                                                      */
/* Returns the token produced by one of the MAP_/NOMAP_ macros above,   */
/* or L_IDENT for an identifier that was not expanded.                  */
/*                                                                      */
/* NOTE(review): the outer switch has no default.  A CHARMAP() class    */
/* that is not listed would reach the shared tail with lex_token never  */
/* assigned - confirm that every class is covered.                      */
/************************************************************************/

token_t yylex(void)
{
    REG UCHAR last_mapped;      /* class of the most recent lookahead char */
    UCHAR mapped_c;             /* class driving the outer switch */
    REG token_t lex_token;      /* base token handed to the shared tail */

    for(;;) {
        last_mapped = mapped_c = CHARMAP(GETCH());
first_switch:
        switch(mapped_c) {
        case LX_EACH:
        case LX_ASCII:
            Msg_Temp = GET_MSG(2018);
            SET_MSG (Msg_Text, Msg_Temp, PREVCH());
            error(2018);
            continue;
            break;

        /* single-character tokens */
        case LX_OBRACE:
            return(NOMAP_AND_FILL(L_LCURLY));
            break;
        case LX_CBRACE:
            return(NOMAP_AND_FILL(L_RCURLY));
            break;
        case LX_OBRACK:
            return(NOMAP_AND_FILL(L_LBRACK));
            break;
        case LX_CBRACK:
            return(NOMAP_AND_FILL(L_RBRACK));
            break;
        case LX_OPAREN:
            return(NOMAP_AND_FILL(L_LPAREN));
            break;
        case LX_CPAREN:
            return(NOMAP_AND_FILL(L_RPAREN));
            break;
        case LX_COMMA:
            return(NOMAP_AND_FILL(L_COMMA));
            break;
        case LX_QUEST:
            return(NOMAP_AND_FILL(L_QUEST));
            break;
        case LX_SEMI:
            return(NOMAP_AND_FILL(L_SEMI));
            break;
        case LX_TILDE:
            return(NOMAP_AND_FILL(L_TILDE));
            break;
        case LX_NUMBER:
            return(MAP_TOKEN(getnum(PREVCH())));
            break;

        /* operators needing one character of lookahead */
        case LX_MINUS:
            switch(last_mapped = CHARMAP(GETCH())) {
            case LX_EQ:
                return(MAP_AND_FILL(L_MINUSEQ));
                break;
            case LX_GT:
                return(MAP_AND_FILL(L_POINTSTO));
                break;
            case LX_MINUS:
                return(MAP_AND_FILL(L_DECR));
                break;
            default:
                lex_token = L_MINUS;
                break;
            }
            break;
        case LX_PLUS:
            switch(last_mapped = CHARMAP(GETCH())) {
            case LX_EQ:
                return(MAP_AND_FILL(L_PLUSEQ));
                break;
            case LX_PLUS:
                return(MAP_AND_FILL(L_INCR));
                break;
            default:
                lex_token = L_PLUS;
                break;
            }
            break;
        case LX_AND:
            switch(last_mapped = CHARMAP(GETCH())) {
            case LX_EQ:
                return(MAP_AND_FILL(L_ANDEQ));
                break;
            case LX_AND:
                return(MAP_AND_FILL(L_ANDAND));
                break;
            default:
                lex_token = L_AND;
                break;
            }
            break;
        case LX_OR:
            switch(last_mapped = CHARMAP(GETCH())) {
            case LX_EQ:
                return(MAP_AND_FILL(L_OREQ));
                break;
            case LX_OR:
                return(MAP_AND_FILL(L_OROR));
                break;
            default:
                lex_token = L_OR;
                break;
            }
            break;
        case LX_COLON:
            return(NOMAP_AND_FILL(L_COLON));
            break;
        case LX_HAT:
            if((last_mapped = CHARMAP(GETCH())) == LX_EQ) {
                return(MAP_AND_FILL(L_XOREQ));
            }
            lex_token = L_XOR;
            break;
        case LX_PERCENT:
            if((last_mapped = CHARMAP(GETCH())) == LX_EQ) {
                return(MAP_AND_FILL(L_MODEQ));
            }
            lex_token = L_MOD;
            break;
        case LX_EQ:
            if((last_mapped = CHARMAP(GETCH())) == LX_EQ) {
                return(MAP_AND_FILL(L_EQUALS));
            }
            lex_token = L_ASSIGN;
            break;
        case LX_BANG:
            if((last_mapped = CHARMAP(GETCH())) == LX_EQ) {
                return(MAP_AND_FILL(L_NOTEQ));
            }
            lex_token = L_EXCLAIM;
            break;
        case LX_SLASH:
            switch(last_mapped = CHARMAP(GETCH())) {
            case LX_STAR:
                dump_comment();
                continue;
                break;
            case LX_SLASH:
                DumpSlashComment();
                continue;
                break;
            case LX_EQ:
                return(MAP_AND_FILL(L_DIVEQ));
                break;
            default:
                lex_token = L_DIV;
                break;
            }
            break;
        case LX_STAR:
            switch(last_mapped = CHARMAP(GETCH())) {
            case LX_SLASH:
                /* a comment terminator with no comment open */
                if( ! Prep ) {
                    Msg_Temp = GET_MSG(2138);
                    SET_MSG (Msg_Text, Msg_Temp);
                    error(2138); /* (nested comments) */
                }
                else {
                    fwrite("*/", 2, 1, OUTPUTFILE);
                }
                continue;
            case LX_EQ:
                return(MAP_AND_FILL(L_MULTEQ));
                break;
            default:
                lex_token = L_MULT;
                break;
            }
            break;
        case LX_LT:
            switch(last_mapped = CHARMAP(GETCH())) {
            case LX_LT:
                if((last_mapped = CHARMAP(GETCH())) == LX_EQ) {
                    return(MAP_AND_FILL(L_LSHFTEQ));
                }
                /*
                ** remember that '<<' has been seen, so that a retry
                ** through first_switch resumes in the LX_LSHIFT case.
                */
                mapped_c = LX_LSHIFT;
                lex_token = L_LSHIFT;
                break;
            case LX_EQ:
                return(MAP_AND_FILL(L_LTEQ));
                break;
            default:
                lex_token = L_LT;
                break;
            }
            break;
        case LX_LSHIFT:
            /*
            **  if the next char is not an =, then we unget and return,
            **  since the only way in here is if we broke on the char
            **  following '<<'. since we'll have already worked the handle_eos()
            **  code prior to getting here, we'll not see another eos,
            **  UNLESS i/o buffering is char by char. ???
            **  see also, LX_RSHIFT
            */
            if((last_mapped = CHARMAP(GETCH())) == LX_EQ) {
                return(MAP_AND_FILL(L_LSHFTEQ));
            }
            UNGETCH();
            return(MAP_AND_FILL(L_LSHIFT));
            break;
        case LX_GT:
            switch(last_mapped = CHARMAP(GETCH())) {
            case LX_EQ:
                return(MAP_AND_FILL(L_GTEQ));
            case LX_GT:
                if((last_mapped = CHARMAP(GETCH())) == LX_EQ) {
                    return(MAP_AND_FILL(L_RSHFTEQ));
                }
                /* same resumption scheme as '<<' above */
                mapped_c = LX_RSHIFT;
                lex_token = L_RSHIFT;
                break;
            default:
                lex_token = L_GT;
                break;
            }
            break;
        case LX_RSHIFT:
            /* reached only by a retry after '>>'; see LX_LSHIFT */
            if((last_mapped = CHARMAP(GETCH())) == LX_EQ) {
                return(MAP_AND_FILL(L_RSHFTEQ));
            }
            UNGETCH();
            return(MAP_AND_FILL(L_RSHIFT));
            break;
        case LX_POUND:
            if( ! Prep ) {
                Msg_Temp = GET_MSG(2014);
                SET_MSG (Msg_Text, Msg_Temp);
                error(2014);/* # sign must be first non-whitespace */
                UNGETCH();              /* replace it */
                Linenumber--;           /* do_newline counts a newline */
                do_newline();           /* may be a 'real' prepro line */
            }
            else {
                fwrite("#", 1, 1, OUTPUTFILE);
            }
            continue;
            break;
        case LX_EOS:
            if(PREVCH() == '\\') {
                /* the character just read was a backslash, not a real end marker */
                if( ! Prep ) {
                    if( ! checknl()) {  /* ignore the new line */
                        Msg_Temp = GET_MSG(2017);
                        SET_MSG (Msg_Text, Msg_Temp);
                        error(2017);/* illegal escape sequence */
                    }
                }
                else {
                    fputc('\\', OUTPUTFILE);
                    fputc(get_non_eof(), OUTPUTFILE);
                }
                continue;
            }
            if(Macro_depth == 0) {
                if( ! io_eob()) {       /* not the end of the buffer */
                    continue;
                }
                if(fpop()) {            /* have more files to read */
                    continue;
                }
                return(MAP_AND_FILL(L_EOF));    /* all gone . . . */
            }
            handle_eos();               /* found end of macro */
            continue;
            break;
        case LX_DQUOTE:
            if( ! Prep ) {
                str_const();
                return(NOMAP_TOKEN(L_STRING));
            }
            prep_string('\"');
            continue;
            break;
        case LX_SQUOTE:
            if( ! Prep ) {
                return(MAP_TOKEN(char_const()));
            }
            prep_string('\'');
            continue;
            break;
        case LX_CR:                     /* ??? check for nl next */
            continue;
            break;
        case LX_NL:
            if(On_pound_line) {
                /* leave the newline in the input and report no token */
                UNGETCH();
                return(NOMAP_TOKEN(L_NOTOKEN));
            }
            if(Prep) {
                fputc('\n', OUTPUTFILE);
            }
            do_newline();
            continue;
            break;
        case LX_WHITE:                  /* skip all white space */
            if( ! Prep ) {              /* check only once */
                do {
                    ;
                } while(LXC_IS_WHITE(GETCH()));
            }
            else {
                /* when Prep is set the white space is copied to the output */
                UCHAR c;

                c = PREVCH();
                do {
                    fputc(c, OUTPUTFILE);
                } while(LXC_IS_WHITE(c = GETCH()));
            }
            UNGETCH();                  /* put back the first non-white char */
            continue;
            break;

        /*
        ** Note:
        **   RCPP.EXE does not support DBCS code, so an error message
        **   is displayed for each byte of the character.
        **     IBM-J PTR 12JP-0092
        **     MSHQ PTR xxxxx
        */
        case LX_LEADBYTE:
            if( ! Prep ) {              /* check only once */
                Msg_Temp = GET_MSG(2018);
                SET_MSG (Msg_Text, Msg_Temp, PREVCH());
                error(2018);
                /* the following byte is consumed and reported as well */
                Msg_Temp = GET_MSG(2018);
                SET_MSG (Msg_Text, Msg_Temp, GETCH());
                error(2018);
            }
            else {
                fputc(PREVCH(), OUTPUTFILE);
#ifdef DBCS // token_t yylex(void)
                fputc(get_non_eof(), OUTPUTFILE);
#else
                fputc(GETCH(), OUTPUTFILE);
#endif // DBCS
            }
            continue;
            break;
        case LX_ILL:
            if( ! Prep ) {
                Msg_Temp = GET_MSG(2018);
                SET_MSG (Msg_Text, Msg_Temp, PREVCH());
                error(2018);/* unknown character */
            }
            else {
                fputc(PREVCH(), OUTPUTFILE);
            }
            continue;
            break;
        case LX_BACKSLASH:
            if( ! Prep ) {
                if( ! checknl()) {      /* ignore the new line */
                    Msg_Temp = GET_MSG(2017);
                    SET_MSG (Msg_Text, Msg_Temp);
                    error(2017);/* illegal escape sequence */
                }
            }
            else {
                fputc('\\', OUTPUTFILE);
                fputc(get_non_eof(), OUTPUTFILE);
            }
            continue;
            break;
        case LX_DOT:
dot_switch:
            switch(last_mapped = CHARMAP(GETCH())) {
            case LX_BACKSLASH:
                if(checknl()) {
                    goto dot_switch;
                }
                /*
                ** NOTE(review): this UNGETCH() is followed by the one
                ** after the switch.  Whether pushing back twice is
                ** intended depends on what checknl() leaves consumed -
                ** confirm.
                */
                UNGETCH();
                break;
            case LX_EOS:
                if(handle_eos() == BACKSLASH_EOS) {
                    break;
                }
                goto dot_switch;
                break;
            case LX_DOT:
                if( ! checkop('.') ) {
                    Msg_Temp = GET_MSG(2142);
                    SET_MSG (Msg_Text, Msg_Temp);
                    error(2142);/* ellipsis requires three '.'s */
                }
                return(NOMAP_AND_FILL(L_ELLIPSIS));
                break;
            case LX_NUMBER:
                /*
                **  don't worry about getting correct hash value.
                **  The text equivalent of a real number is never
                **  hashed
                */
                Reuse_1[0] = '.';
                Reuse_1[1] = PREVCH();
                return(MAP_TOKEN(get_real(&Reuse_1[2])));
                break;
            }
            UNGETCH();
            return(MAP_AND_FILL(L_PERIOD));
            break;
        case LX_NOEXPAND:
            SKIPCH();                   /* just skip length */
            continue;
        case LX_ID:
            {
                pdefn_t pdef;

                if(Macro_depth > 0) {
                    /* a zero return from lex_getid() suppresses expansion */
                    if( ! lex_getid(PREVCH())) {
                        goto avoid_expand;
                    }
                }
                else {
                    getid(PREVCH());
                }

                if( ((pdef = get_defined()) != 0)
                    &&
                    ( ! DEFN_EXPANDING(pdef))
                    &&
                    ( can_expand(pdef))
                    ) {
                    continue;
                }

avoid_expand:
                if( ! Prep ) {
                    /* M00BUG get near copy of identifier???? */
                    HLN_NAME(yylval.yy_ident) = Reuse_1;
                    HLN_HASH(yylval.yy_ident) = Reuse_1_hash;
                    HLN_LENGTH(yylval.yy_ident) = (UCHAR)Reuse_1_length;
                    return(L_IDENT);
                }
                else {
                    /* Reuse_1_length - 1 bytes of the identifier are written */
                    fwrite(Reuse_1, Reuse_1_length - 1, 1, OUTPUTFILE);
                    return(NOMAP_TOKEN(L_NOTOKEN));
                }
            }
            continue;
            break;
        }
        /*
        **  all the multichar ( -> -- -= etc ) operands
        **  must come through here. we've gotten the next char,
        **  and not matched one of the possibilities, but we have to check
        **  for the end of the buffer character and act accordingly
        **  if it is the eob, then we handle it and go back for another try.
        **  otherwise, we unget the char we got, and return the base token.
        */
        if(last_mapped == LX_EOS) {
            if(handle_eos() != BACKSLASH_EOS) {
                goto first_switch;
            }
        }
        UNGETCH();      /* cause we got an extra one to check */
        return(MAP_AND_FILL(lex_token));
    }
}


/************************************************************************
**
**	lex_getid: reads an identifier for the main lexer.  The
**	identifier is read into Reuse_1. This function should not handle
**	an end of string if it is rescanning a macro expansion, because
**	this could switch the context with regards to whether the macro
**	is expandable or not.
Similarly, the noexpand marker must only be
**	allowed if a macro is being rescanned, otherwise let this character
**	be caught as an illegal character in text
**
**	c       : first character of the identifier, already read by the
**		  caller.
**	returns : zero when the length byte read after the last no-expand
**		  marker equals the length of the stored identifier,
**		  non-zero otherwise (including when no marker was seen).
**		  yylex() skips macro expansion on a zero return.
**
**	Side effects: fills Reuse_1 (NUL terminated), Reuse_1_hash and
**	Reuse_1_length (identifier length plus one).
**
**	The buffer limit is tested before every store, so an identifier
**	that does not fit is reported through fatal(1067) without first
**	being written past LIMIT(Reuse_1).  NOTE(review): this assumes
**	LIMIT() yields the first address that must not be written and that
**	fatal() does not return - confirm against their definitions.
************************************************************************/
int lex_getid(UCHAR c)
{
    REG UCHAR *p;
    int length = 0;     /* length byte that followed the last no-expand marker */

    p = Reuse_1;
    *p++ = c;
    c &= HASH_MASK;     /* c doubles as the running hash from here on */
    for(;;) {
        /* collect identifier characters, never storing at or past the limit */
        while((p < LIMIT(Reuse_1)) && LXC_IS_IDENT(*p = GETCH())) {
            c += (*p & HASH_MASK);      /* hash it */
            p++;
        }
        if(p >= LIMIT(Reuse_1)) {
            /* buffer exhausted: nothing extra was read, report it below */
            break;
        }
        if(CHARMAP(*p) == LX_NOEXPAND ) {
            /* the marker is followed by a length byte; keep scanning */
            length = (int)GETCH();
            continue;
        }
        UNGETCH();      /* the terminating character belongs to the next token */
        break;
    }
    if(p >= LIMIT(Reuse_1)) {   /* is this error # correct? */
        Msg_Temp = GET_MSG(1067);
        SET_MSG (Msg_Text, Msg_Temp);
        fatal(1067);
    }
    if(((p - Reuse_1) > LIMIT_ID_LENGTH) && ( ! Prep )) {
        /* truncate, then rehash because the hash covered the dropped tail */
        p = Reuse_1 + LIMIT_ID_LENGTH;
        *p = '\0';
        c = local_c_hash(Reuse_1);
        Msg_Temp = GET_MSG(4011);
        SET_MSG (Msg_Text, Msg_Temp, Reuse_1);
        warning(4011);  /* id truncated */
    }
    else {
        *p = '\0';      /* terminates identifier for expandable check */
    }
    Reuse_1_hash = c;
    Reuse_1_length = (UCHAR)((p - Reuse_1) + 1);
    return(length != (p - Reuse_1));
}