//+--------------------------------------------------------------------------- // // File: syntax.h // // Contents: syntax modules for regexpr // // Classes: perl_syntax, posix_syntax // // History: 3-29-00 ericne Created // //---------------------------------------------------------------------------- #pragma once #include #include #include #ifndef ARRAYSIZE #define ARRAYSIZE(x) (sizeof(x)/sizeof(*(x))) #endif #ifndef UCHAR_MAX #define UCHAR_MAX 0xff #endif #ifndef WCHAR_MAX #define WCHAR_MAX ((wchar_t)-1) #endif template<> struct std::iterator_traits< const char * > { // get traits from iterator _Iter typedef random_access_iterator_tag iterator_category; typedef char value_type; typedef ptrdiff_t difference_type; typedef difference_type distance_type; // retained typedef char * pointer; typedef char & reference; }; template<> struct std::iterator_traits< const wchar_t * > { // get traits from iterator _Iter typedef random_access_iterator_tag iterator_category; typedef wchar_t value_type; typedef ptrdiff_t difference_type; typedef difference_type distance_type; // retained typedef wchar_t * pointer; typedef wchar_t & reference; }; namespace regex { // // The following are the tokens that can be emitted by the syntax module. // Don't reorder this list!!! // enum TOKEN { NO_TOKEN = 0, // REGULAR TOKENS BEGIN_GROUP, END_GROUP, ALTERNATION, BEGIN_LINE, END_LINE, BEGIN_CHARSET, MATCH_ANY, ESCAPE, // QUANTIFICATION TOKENS ONE_OR_MORE, ZERO_OR_MORE, ZERO_OR_ONE, ONE_OR_MORE_MIN, ZERO_OR_MORE_MIN, ZERO_OR_ONE_MIN, BEGIN_RANGE, RANGE_SEPARATOR, END_RANGE, END_RANGE_MIN, // ESCAPE SEQUENCES ESC_DIGIT, ESC_NOT_DIGIT, ESC_SPACE, ESC_NOT_SPACE, ESC_WORD, ESC_NOT_WORD, ESC_BEGIN_STRING, ESC_END_STRING, ESC_END_STRING_z, ESC_WORD_BOUNDARY, ESC_NOT_WORD_BOUNDARY, ESC_WORD_START, ESC_WORD_STOP, ESC_QUOTE_META_ON, ESC_QUOTE_META_OFF, // SUBSTITUTION TOKENS SUBST_BACKREF, SUBST_PREMATCH, SUBST_POSTMATCH, SUBST_MATCH, SUBST_ESCAPE, SUBST_QUOTE_META_ON, SUBST_UPPER_ON, SUBST_UPPER_NEXT, SUBST_LOWER_ON, SUBST_LOWER_NEXT, SUBST_ALL_OFF, // CHARSET TOKENS CHARSET_NEGATE, CHARSET_ESCAPE, CHARSET_RANGE, CHARSET_BACKSPACE, CHARSET_END, CHARSET_ALNUM, CHARSET_ALPHA, CHARSET_BLANK, CHARSET_CNTRL, CHARSET_DIGIT, CHARSET_GRAPH, CHARSET_LOWER, CHARSET_PRINT, CHARSET_PUNCT, CHARSET_SPACE, CHARSET_UPPER, CHARSET_XDIGIT, // EXTENSION TOKENS EXT_NOBACKREF, EXT_POS_LOOKAHEAD, EXT_NEG_LOOKAHEAD, EXT_POS_LOOKBEHIND, EXT_NEG_LOOKBEHIND, EXT_INDEPENDENT, EXT_UNKNOWN }; struct posix_charset_type { const char * const szcharset; const size_t cchars; posix_charset_type( const char * const sz, const size_t c ) : szcharset(sz), cchars(c) {} }; extern const posix_charset_type g_rgposix_charsets[]; extern const size_t g_cposix_charsets; template< typename const_iterator > bool is_posix_charset( const_iterator icur, const_iterator iend, const char * szcharset ) { for( ; icur != iend && '\0' != *szcharset; ++icur, ++szcharset ) { if( *icur != *szcharset ) return false; } return '\0' == *szcharset; } // // The perl_syntax class encapsulates the Perl 5 regular expression syntax. It is // used as a template parameter to basic_rpattern. To customize regex syntax, create // your own syntax class and use it as a template parameter instead. // class perl_syntax_base { protected: static TOKEN s_rgreg[ UCHAR_MAX + 1 ]; static TOKEN s_rgescape[ UCHAR_MAX + 1 ]; struct init_perl_syntax; friend struct init_perl_syntax; static struct init_perl_syntax { init_perl_syntax(); } s_init_perl_syntax; static inline TOKEN look_up( char ch, TOKEN rg[] ) { return rg[ (unsigned char)ch ]; } static inline TOKEN look_up( wchar_t ch, TOKEN rg[] ) { return UCHAR_MAX < ch ? NO_TOKEN : rg[ (unsigned char)ch ]; } }; template< typename CH > class perl_syntax : protected perl_syntax_base { public: typedef typename std::basic_string::iterator iterator; typedef typename std::basic_string::const_iterator const_iterator; typedef CH char_type; private: static bool min_quant( iterator & icur, const_iterator iend ) { return ( (const_iterator)++icur != iend && CH('?') == *icur ? (++icur,true) : false ); } public: static TOKEN reg_token( iterator & icur, const_iterator iend ) { assert( (const_iterator)icur != iend ); TOKEN tok = look_up( *icur, s_rgreg ); if( tok ) ++icur; if( ESCAPE == tok && (const_iterator)icur != iend ) { tok = look_up( *icur, s_rgescape ); if( tok ) ++icur; else tok = ESCAPE; } return tok; } static TOKEN quant_token( iterator & icur, const_iterator iend ) { assert( (const_iterator)icur != iend ); TOKEN tok = NO_TOKEN; switch( *icur ) { case CH('*'): tok = min_quant( icur, iend ) ? ZERO_OR_MORE_MIN : ZERO_OR_MORE; break; case CH('+'): tok = min_quant( icur, iend ) ? ONE_OR_MORE_MIN : ONE_OR_MORE; break; case CH('?'): tok = min_quant( icur, iend ) ? ZERO_OR_ONE_MIN : ZERO_OR_ONE; break; case CH('}'): tok = min_quant( icur, iend ) ? END_RANGE_MIN : END_RANGE; break; case CH('{'): tok = BEGIN_RANGE; ++icur; break; case CH(','): tok = RANGE_SEPARATOR; ++icur; break; } return tok; } static TOKEN charset_token( iterator & icur, const_iterator iend ) { assert( (const_iterator)icur != iend ); TOKEN tok = NO_TOKEN; switch( *icur ) { case CH('-'): tok = CHARSET_RANGE; ++icur; break; case CH('^'): tok = CHARSET_NEGATE; ++icur; break; case CH(']'): tok = CHARSET_END; ++icur; break; case CH('\\'): tok = CHARSET_ESCAPE; if( (const_iterator)++icur == iend ) break; switch( *icur ) { case CH('b'): tok = CHARSET_BACKSPACE; ++icur; break; case CH('d'): tok = ESC_DIGIT; ++icur; break; case CH('D'): tok = ESC_NOT_DIGIT; ++icur; break; case CH('s'): tok = ESC_SPACE; ++icur; break; case CH('S'): tok = ESC_NOT_SPACE; ++icur; break; case CH('w'): tok = ESC_WORD; ++icur; break; case CH('W'): tok = ESC_NOT_WORD; ++icur; break; } break; case CH('['): for( size_t i=0; !tok && i < g_cposix_charsets; ++i ) { if( is_posix_charset( icur, iend, g_rgposix_charsets[i].szcharset ) ) { tok = TOKEN(CHARSET_ALNUM + i); std::advance( icur, g_rgposix_charsets[i].cchars ); } } break; } return tok; } static TOKEN subst_token( iterator & icur, const_iterator iend ) { assert( (const_iterator)icur != iend ); TOKEN tok = NO_TOKEN; switch( *icur ) { case CH('\\'): tok = SUBST_ESCAPE; if( (const_iterator)++icur != iend ) switch( *icur ) { case CH('Q'): tok = SUBST_QUOTE_META_ON; ++icur; break; case CH('U'): tok = SUBST_UPPER_ON; ++icur; break; case CH('u'): tok = SUBST_UPPER_NEXT; ++icur; break; case CH('L'): tok = SUBST_LOWER_ON; ++icur; break; case CH('l'): tok = SUBST_LOWER_NEXT; ++icur; break; case CH('E'): tok = SUBST_ALL_OFF; ++icur; break; } break; case CH('$'): tok = SUBST_BACKREF; if( (const_iterator)++icur != iend ) switch( *icur ) { case CH('&'): tok = SUBST_MATCH; ++icur; break; case CH('`'): tok = SUBST_PREMATCH; ++icur; break; case CH('\''): tok = SUBST_POSTMATCH; ++icur; break; } break; } return tok; } static TOKEN ext_token( iterator & icur, const_iterator iend, unsigned & flags ) { assert( (const_iterator)icur != iend ); bool finclude; TOKEN tok = NO_TOKEN; if( CH('?') == *icur ) { tok = EXT_UNKNOWN; if( (const_iterator)++icur != iend ) { switch( *icur ) { case CH(':'): tok = EXT_NOBACKREF; ++icur; break; case CH('='): tok = EXT_POS_LOOKAHEAD; ++icur; break; case CH('!'): tok = EXT_NEG_LOOKAHEAD; ++icur; break; case CH('<'): if( (const_iterator)++icur == iend ) break; switch( *icur ) { case CH('='): tok = EXT_POS_LOOKBEHIND; ++icur; break; case CH('!'): tok = EXT_NEG_LOOKBEHIND; ++icur; break; } break; case CH('>'): tok = EXT_INDEPENDENT; ++icur; break; default: finclude = true; do { if( CH(':') == *icur ) { tok = EXT_NOBACKREF; ++icur; break; } if( CH(')') == *icur ) { tok = EXT_NOBACKREF; break; } if( CH('-') == *icur && finclude ) finclude = false; else if( CH('i') == *icur ) flags = finclude ? ( flags | NOCASE ) : ( flags & ~NOCASE ); else if( CH('m') == *icur ) flags = finclude ? ( flags | MULTILINE ) : ( flags & ~MULTILINE ); else if( CH('s') == *icur ) flags = finclude ? ( flags | SINGLELINE ) : ( flags & ~SINGLELINE ); else break; } while( (const_iterator)++icur != iend ); break; } } } return tok; } }; // // Implements the basic POSIX regular expression syntax // template< typename CH > class posix_syntax { public: typedef typename std::basic_string::iterator iterator; typedef typename std::basic_string::const_iterator const_iterator; typedef CH char_type; static TOKEN reg_token( iterator & icur, const_iterator iend ) { TOKEN tok = NO_TOKEN; switch( *icur ) { case '.': tok = MATCH_ANY; ++icur; break; case '^': tok = BEGIN_LINE; ++icur; break; case '$': tok = END_LINE; ++icur; break; case '[': tok = BEGIN_CHARSET; ++icur; break; case '\\': tok = ESCAPE; ++icur; if( (const_iterator)icur != iend ) { switch( *icur ) { case '(': tok = BEGIN_GROUP; ++icur; break; case ')': tok = END_GROUP; ++icur; break; case '|': tok = ALTERNATION; ++icur; break; } } break; } return tok; } static TOKEN quant_token( iterator & icur, const_iterator iend ) { TOKEN tok = NO_TOKEN; switch( *icur ) { case '*': tok = ZERO_OR_MORE; ++icur; break; case ',': tok = RANGE_SEPARATOR; ++icur; break; case '\\': ++icur; if( (const_iterator)icur != iend ) { switch( *icur ) { case '?': tok = ZERO_OR_ONE; ++icur; break; case '+': tok = ONE_OR_MORE; ++icur; break; case '{': tok = BEGIN_RANGE; ++icur; break; case '}': tok = END_RANGE; ++icur; break; default: --icur; break; } } else { --icur; } } return tok; } static TOKEN charset_token( iterator & icur, const_iterator iend ) { TOKEN tok = NO_TOKEN; switch( *icur ) { case '^': tok = CHARSET_NEGATE; ++icur; break; case '-': tok = CHARSET_RANGE; ++icur; break; case ']': tok = CHARSET_END; ++icur; break; case '[': for( size_t i=0; !tok && i < g_cposix_charsets; ++i ) { if( is_posix_charset( icur, iend, g_rgposix_charsets[i].szcharset ) ) { tok = TOKEN(CHARSET_ALNUM + i); std::advance( icur, g_rgposix_charsets[i].cchars ); } } break; } return tok; } static TOKEN subst_token( iterator & icur, const_iterator iend ) { TOKEN tok = NO_TOKEN; if( '\\' == *icur ) { tok = SUBST_ESCAPE; ++icur; if( (const_iterator)icur != iend && '0' <= *icur && '9' >= *icur ) { tok = SUBST_BACKREF; } } return tok; } static TOKEN ext_token( iterator & icur, const_iterator iend, unsigned & flags ) { return NO_TOKEN; } }; } // namespace regex