//+--------------------------------------------------------------------------- // // File: basic_regexpr.cxx // // Contents: // // Classes: // // Functions: // // Coupling: // // Notes: // // History: 1-11-1999 ericne Created // //---------------------------------------------------------------------------- #include "stdafx.h" #pragma hdrstop // unlimited inline expansion (compile with /Ob1 or /Ob2) #pragma inline_depth(255) // C4355 'this' : used in base member initializer list // C4660 template-class specialization 'foo' is already instantiated // C4786 identifier was truncated to '255' characters in the debug information // C4800 'int' : forcing value to bool 'true' or 'false' (performance warning) #pragma warning( disable : 4355 4660 4786 4800 ) #include #include // for _alloca #include #include #include "regexpr.h" using namespace std; namespace regex { #ifdef _MT // Global critical section used to synchronize the creation of static const patterns CRegExCritSect g_objRegExCritSect; #endif // For use while doing uppercase/lowercase conversions: // For use while doing uppercase/lowercase conversions: inline char to_upper( char ch ) { return ( char )toupper(ch); } inline char to_lower( char ch ) { return ( char )tolower(ch); } inline wint_t to_upper( wint_t ch ) { return (wint_t)towupper(ch); } inline wint_t to_lower( wint_t ch ) { return (wint_t)towlower(ch); } template< typename II, typename CI > void to_upper( II ibegin, CI iend ) { for( ; (CI)ibegin != iend; ++ibegin ) *ibegin = to_upper( *ibegin ); } template< typename II, typename CI > void to_lower( II ibegin, CI iend ) { for( ; (CI)ibegin != iend; ++ibegin ) *ibegin = to_lower( *ibegin ); } template< typename II, typename CI > unsigned parse_int( II & istr, CI iend, const unsigned m_max = -1 ) { unsigned retval = 0; while( (CI)istr != iend && '0' <= *istr && '9' >= *istr && m_max > retval ) { retval = retval * 10 + ( (unsigned)*istr++ - (unsigned)'0' ); } if( m_max < retval ) { retval /= 10; --istr; } 
return retval; } // This class is used to speed up character set matching by providing // a bitset that spans the ASCII range. std::bitset is not used because // the range-checking slows it down. // Note: The division and modulus operations are optimized by the compiler // into bit-shift operations. class ascii_bitvector { typedef unsigned __int32 elem_type; // use 32-bit ints on 32-bit platforms //typedef unsigned __int64 elem_type; // use 64-bit ints on 64-bit platforms enum { CBELEM = 8 * sizeof elem_type, // count of bytes per element CELEMS = (UCHAR_MAX+1) / CBELEM }; // number of element in array elem_type m_rg[ CELEMS ]; // Used to inline operations like: bv1 |= ~bv2; without creating temp bit vectors. struct not_ascii_bitvector { const ascii_bitvector & m_ref; not_ascii_bitvector( const ascii_bitvector & ref ) throw() : m_ref(ref) {} }; public: ascii_bitvector() throw() { memset( m_rg, 0, CELEMS * sizeof elem_type ); } inline void set( unsigned char ch ) throw() { m_rg[ ( ch / CBELEM ) ] |= ( (elem_type)1U << ( ch % CBELEM ) ); } inline bool operator[]( unsigned char ch ) const throw() { return 0 != ( m_rg[ ( ch / CBELEM ) ] & ( (elem_type)1U << ( ch % CBELEM ) ) ); } inline not_ascii_bitvector operator~() const throw() { return not_ascii_bitvector(*this); } inline ascii_bitvector & operator|=( const ascii_bitvector & that ) throw() { for( int i=0; i struct op_t : public binary_function,CI,bool> { typedef CI const_iterator; typedef typename iterator_traits::value_type char_type; }; // Evaluates the beginning-of-string condition template< typename CI > struct bos_t : public op_t { inline bool operator()( const match_param & param, CI iter ) const { return param.ibegin == iter; } }; // Find the beginning of a line, either beginning of a string, or the character // immediately following a newline template< typename CI > struct bol_t : public bos_t { inline bool operator()( const match_param & param, CI iter ) const { return bos_t::operator()(param,iter) || 
char_type('\n') == *--iter; } }; // Evaluates end-of-string condition for string's template< typename CI > struct eos_t : public op_t { inline bool operator()( const match_param & param, CI iter ) const { return param.istop == iter; } }; // Evaluates end-of-string condidition for C-style string's when the length is unknown by // looking for the null-terminator. template< typename CI > struct eocs_t : public op_t { inline bool operator()( const match_param & param, CI iter ) const { return char_type('\0') == *iter; } }; // Evaluates end-of-line conditions, either the end of the string, or a // return or newline character. template< typename EOS > struct eol_t_t : public EOS { typedef typename EOS::const_iterator CI; inline bool operator()( const match_param & param, CI iter ) const { return EOS::operator()(param,iter) || char_type('\n') == *iter || char_type('\r') == *iter; } }; template< typename CI > struct eol_t : public eol_t_t > {}; template< typename CI > struct eocl_t : public eol_t_t > {}; // Evaluates perl's end-of-string conditions, either the end of the string, or a // newline character followed by end of string. 
(Only used by $ and /Z assertions) template< typename EOS > struct peos_t_t : public EOS { typedef typename EOS::const_iterator CI; inline bool operator()( const match_param & param, CI iter ) const { return EOS::operator()(param,iter) || ( ( char_type('\n') == *iter ) && EOS::operator()(param,++iter) ); } }; template< typename CI > struct peos_t : public peos_t_t > {}; template< typename CI > struct peocs_t : public peos_t_t > {}; // compare two characters, case-sensitive template< typename CH > struct ch_neq_t : public binary_function { typedef CH char_type; inline bool operator()( register CH ch1, register CH ch2 ) const { return ch1 != ch2; } }; // Compare two characters, disregarding case template< typename CH > struct ch_neq_nocase_t : public binary_function { typedef CH char_type; inline bool operator()( register CH ch1, register CH ch2 ) const { return to_upper(ch1) != to_upper(ch2); } }; // // Helper functions for match and substitute // template< typename CI > size_t string_length( CI iter ) { size_t n = 0; while( 0 != *iter++ ) ++n; return n; } template< typename CI > backref_tag _do_match( const basic_rpattern_base & pat, match_param & param ) throw() { typedef typename iterator_traits::value_type char_type; bool floop = pat.loops(); unsigned flags = pat.flags(); width_type nwidth = pat.get_width(); const sub_expr * pfirst = pat.get_first_subexpression(); try { vector > rgbackrefs; // dummy backref vector if( NULL == param.prgbackrefs ) param.prgbackrefs = & rgbackrefs; param.prgbackrefs->resize( pat._cgroups_total() ); fill( param.prgbackrefs->begin(), param.prgbackrefs->end(), backref_tag() ); // If a pattern is optimized for CSTRINGS, it can save a call // to calculate the length of the string. 
if( CI(0) == param.istop && ( ( RIGHTMOST & flags ) || ( 0 == ( CSTRINGS & flags ) ) ) ) param.istop = param.istart + string_length( param.istart ); if( CI(0) != param.istop ) { // If the minimum width of the pattern exceeds the width of the // string, a succesful match is impossible if( nwidth.m_min <= (size_t)distance( param.istart, param.istop ) ) { CI local_istop = param.istop; advance( local_istop, -int( nwidth.m_min ) ); if( RIGHTMOST & flags ) { // begin trying to match after the last character. // Continue to the beginning for( CI icur = local_istop; icur >= param.istart; --icur ) if( pfirst->domatch( param, icur ) ) break; // m_floop not used for rightmost matches } else { // begin trying to match before the first character. // Continue to the end for( CI icur = param.istart; icur <= local_istop; ++icur ) if( pfirst->domatch( param, icur ) || ! floop ) break; } } } else { // begin trying to match before the first character. // Continue to the end for( CI icur = param.istart; ; ++icur ) { if( pfirst->domatch( param, icur ) || ! floop ) break; if( char_type('\0') == *icur ) break; } } } catch(...) // bad alloc, stack overflow? 
{ fill( param.prgbackrefs->begin(), param.prgbackrefs->end(), backref_tag() ); } // Shrink the backref vector to chop off information about the "invisible" groups param.prgbackrefs->resize( pat.cgroups() ); return (*param.prgbackrefs)[0]; } template< typename CI, typename CH, typename TR, typename AL > size_t _do_subst( basic_regexpr & str, const basic_rpattern_base & pat, size_t strpos, size_t strlen ) throw(bad_alloc) { typedef iterator_traits::value_type char_type; typedef list::const_iterator LCI; enum { UPPER = -1, NIL, LOWER } next = NIL, rest = NIL; bool first = true; size_t old_strpos = strpos; const list & subst_list = pat.get_subst_list(); basic_string::iterator itstrlen = str.begin(); advance( itstrlen, strpos + strlen ); const basic_string & subst = pat.get_subst(); push_new_handler pnh( &my_new_handler ); for( LCI isubst = subst_list.begin(); isubst != subst_list.end(); ++isubst ) { size_t sublen; basic_string::const_iterator itsubpos1; // iter into str basic_string::const_iterator itsublen1; basic_string::const_iterator itsubpos2; // iter into subst string basic_string::const_iterator itsublen2; basic_string::iterator itstrpos = str.begin(); advance( itstrpos, strpos ); switch( isubst->stype ) { case subst_node::SUBST_STRING: itsubpos2 = subst.begin(); advance( itsubpos2, isubst->subst_string.rstart ); itsublen2 = itsubpos2; advance( itsublen2, isubst->subst_string.rlength ); first ? 
str.replace( itstrpos, itstrlen, itsubpos2, itsublen2 ) : str.insert( itstrpos, itsubpos2, itsublen2 ); sublen = distance( itsubpos2, itsublen2 ); break; case subst_node::SUBST_BACKREF: switch( isubst->subst_backref ) { case subst_node::PREMATCH: itsubpos1 = str.backref_str().begin(); itsublen1 = itsubpos1; advance( itsublen1, sublen = str.rstart() ); break; case subst_node::POSTMATCH: itsubpos1 = str.backref_str().begin(); advance( itsubpos1, str.rstart() + str.rlength() ); itsublen1 = str.backref_str().end(); break; default: itsubpos1 = str.backref_str().begin(); advance( itsubpos1, str.rstart( isubst->subst_backref ) ); itsublen1 = itsubpos1; advance( itsublen1, str.rlength( isubst->subst_backref ) ); break; } first ? str.replace( itstrpos, itstrlen, itsubpos1, itsublen1 ) : str.insert( itstrpos, itsubpos1, itsublen1 ); sublen = distance( itsubpos1, itsublen1 ); break; case subst_node::SUBST_OP: switch( isubst->op ) { case subst_node::UPPER_ON: rest = UPPER; break; case subst_node::UPPER_NEXT: next = UPPER; break; case subst_node::LOWER_ON: rest = LOWER; break; case subst_node::LOWER_NEXT: next = LOWER; break; case subst_node::ALL_OFF: rest = NIL; break; default: __assume(0); } continue; // jump to the next item in the list default: __assume(0); } first = false; // Are we upper- or lower-casing this string? if( rest ) { basic_string::iterator istart = str.begin(); advance( istart, strpos ); basic_string::const_iterator istop = istart; advance( istop, sublen ); switch( rest ) { case UPPER: to_upper( istart, istop ); break; case LOWER: to_lower( istart, istop ); break; default: __assume(0); } } // Are we upper- or lower-casing the next character? if( next ) { switch( next ) { case UPPER: str[strpos] = to_upper(str[strpos]); break; case LOWER: str[strpos] = to_lower(str[strpos]); break; default: __assume(0); } next = NIL; } strpos += sublen; } // If *first* is still true, then we never called str.replace, and the substitution // string is empty. 
Erase the part of the string that the pattern matched. if( first ) str.erase( strpos, strlen ); // return length of the substitution return strpos - old_strpos; } // // Implementation of basic_regexpr // template< typename CH, typename TR, typename AL > size_t basic_regexpr::substitute( const basic_rpattern_base::const_iterator> & pat, size_type pos, size_type len ) throw(bad_alloc) { if( pat.flags() & CSTRINGS ) { assert( ! "You can't use a pattern optimized for CSTRINGS with regexpr::substitute" ); return 0; } backref_vector rgbackrefs; // dummy backref vector backref_vector * prgbackrefs = & rgbackrefs; const bool fsave_backrefs = ( pat.uses_backrefs() || !( pat.flags() & NOBACKREFS ) ); if( fsave_backrefs ) { prgbackrefs = & m_rgbackrefs; m_pbackref_str = & ( m_backref_str = *this ); } else { m_backref_str.erase(); m_pbackref_str = this; m_rgbackrefs.resize( 0 ); } backref_type br; size_t csubst = 0; long stop_offset = ( len == npos ? m_pbackref_str->size() : min( pos + len, m_pbackref_str->size() ) ); match_param param( m_pbackref_str->begin(), m_pbackref_str->begin(), prgbackrefs ); advance( param.istart, pos ); advance( param.istop, stop_offset ); param.ibegin = param.istart; if( GLOBAL & pat.flags() ) { const bool fAll = ( ALLBACKREFS == ( ALLBACKREFS & pat.flags() ) ); const bool fFirst = ( FIRSTBACKREFS == ( FIRSTBACKREFS & pat.flags() ) ); backref_vector rgtempbackrefs; // temporary vector used if fsave_backrefs long pos_offset = 0; // keep track of how much the backref_str and // the current string are out of sync while( br = _do_match( pat, param ) ) { ++csubst; size_type match_length = distance( br.first, br.second ); pos = distance( m_pbackref_str->begin(), br.first ); size_type subst_length = _do_subst( *this, pat, pos + pos_offset, match_length ); if( fsave_backrefs ) { pos += match_length; pos_offset += ( subst_length - match_length ); // Handle specially the backref flags if( fFirst ) rgtempbackrefs.push_back( br ); else if( fAll ) 
rgtempbackrefs.insert( rgtempbackrefs.end(), param.prgbackrefs->begin(), param.prgbackrefs->end() ); else rgtempbackrefs.swap( *param.prgbackrefs ); } else { pos += subst_length; stop_offset += ( subst_length - match_length ); // we're not saving backref information, so we don't // need to do any special backref maintenance here } // prevent a pattern that matches 0 characters from matching // again at the same point in the string if( 0 == match_length ) { if( br.first == param.istop ) // We're at the end, so we're done break; ++pos; } param.istart = m_pbackref_str->begin(); advance( param.istart, pos ); // ineffecient for bidirectional iterators. param.istop = m_pbackref_str->begin(); advance( param.istop, stop_offset ); // ineffecient for bidirectional iterators. } // If we did special backref handling, swap the backref vectors if( fsave_backrefs && ( !br || fFirst || fAll ) ) param.prgbackrefs->swap( rgtempbackrefs ); else if( ! (*param.prgbackrefs)[0] ) param.prgbackrefs->clear(); } else if( br = _do_match( pat, param ) ) { ++csubst; _do_subst( *this, pat, distance( m_pbackref_str->begin(), br.first ), distance( br.first, br.second ) ); } if( NOBACKREFS == ( pat.flags() & NOBACKREFS ) ) param.prgbackrefs->clear(); return csubst; } // // Helper functions called from both basic_regexpr match methods // template< typename EOS > backref_tag< typename EOS::const_iterator > _match_helper( const basic_rpattern_base & pat, match_param & param, EOS eos ) { typedef typename EOS::const_iterator CI; if( GLOBAL & pat.flags() ) // do a global find { // The NOBACKREFS flag is ignored in the match method. 
const bool fAll = ( ALLBACKREFS == ( ALLBACKREFS & pat.flags() ) ); const bool fFirst = ( FIRSTBACKREFS == ( FIRSTBACKREFS & pat.flags() ) ); backref_tag br; vector > rgtempbackrefs; while( br = _do_match( pat, param ) ) { // Handle specially the backref flags if( fFirst ) rgtempbackrefs.push_back( br ); else if( fAll ) rgtempbackrefs.insert( rgtempbackrefs.end(), param.prgbackrefs->begin(), param.prgbackrefs->end() ); else rgtempbackrefs.swap( *param.prgbackrefs ); if( br.first == ( param.istart = br.second ) ) { if( eos( param, param.istart ) ) break; ++param.istart; } } // restore the backref vectors if( !br || fFirst || fAll ) param.prgbackrefs->swap( rgtempbackrefs ); else if( ! (*param.prgbackrefs)[0] ) param.prgbackrefs->clear(); return param.prgbackrefs->empty() ? backref_tag() : (*param.prgbackrefs)[0]; } else return _do_match( pat, param ); } template< typename CH, typename TR, typename AL > basic_regexpr::backref_type basic_regexpr::match( const basic_rpattern_base & pat, size_type pos, size_type len ) const throw() { if( pat.flags() & CSTRINGS ) { assert( ! 
"A pattern optimized for CSTRINGS can only be used with the static regexpr::match method" ); return backref_type(); } m_pbackref_str = this; m_backref_str.erase(); // free up unused memory const_iterator istart = begin(); advance( istart, pos ); const_iterator istop; if( len == npos || pos + len >= size() ) istop = end(); else advance( istop = begin(), pos + len ); match_param param( istart, istop, & m_rgbackrefs ); return _match_helper >( pat, param, eos_t() ); } template< typename CH > backref_tag _static_match_helper( const CH * szstr, const basic_rpattern_base & pat, vector< backref_tag< const CH * > > * prgbackrefs ) throw() { vector< backref_tag< const CH * > > rgdummyvector; if( NULL == prgbackrefs ) prgbackrefs = &rgdummyvector; match_param param( szstr, NULL, prgbackrefs ); return _match_helper >( pat, param, eocs_t() ); } // // Helper function called from both basic_regexpr::count methods // template< typename EOS > size_t _count_helper( const basic_rpattern_base & pat, match_param & param, EOS eos ) { typedef typename EOS::const_iterator CI; size_t cmatches = 0; vector > rgbackrefs; // dummy backref vector backref_tag br; param.prgbackrefs = &rgbackrefs; while( br = _do_match( pat, param ) ) { ++cmatches; if( br.first == ( param.istart = br.second ) ) { if( eos( param, param.istart ) ) break; ++param.istart; } } return cmatches; } template< typename CH, typename TR, typename AL > size_t basic_regexpr::count( const basic_rpattern_base::const_iterator> & pat, size_type pos, size_type len ) const throw() { if( pat.flags() & CSTRINGS ) { assert( ! 
"A pattern optimized for CSTRINGS can only be used with the static regexpr::count method" ); return backref_type(); } m_pbackref_str = this; const_iterator istart = begin(); advance( istart, pos ); const_iterator istop; if( len == npos || pos + len >= size() ) istop = end(); else advance( istop = begin(), pos + len ); match_param param( istart, istop, NULL ); return _count_helper >( pat, param, eos_t() ); } template< typename CH > size_t _static_count_helper( const CH * szstr, const basic_rpattern_base & pat ) throw() { match_param param( szstr, NULL, NULL ); return _count_helper >( pat, param, eocs_t() ); } // Base class for sub-expressions which are zero-width // (i.e., assertions eat no characters during matching) // Assertions cannot be quantified. template< typename CI > class assertion : public sub_expr { public: virtual ~assertion() {} virtual bool is_assertion() const throw() { return true; } protected: virtual width_type _width_this() throw() { return width_type(0,0); } }; template< typename OP > class assert_op : public assertion { public: typedef OP op_type; typedef typename OP::const_iterator CI; virtual ~assert_op() {} protected: virtual bool _match_this( match_param & param, CI & icur ) const throw() { return m_op( param, icur ); } op_type m_op; }; template< typename CI > assertion * create_bos( unsigned /*flags*/ ) { return new assert_op >(); } template< typename CI > assertion * create_eos( unsigned flags ) { switch( CSTRINGS & flags ) { case 0: return new assert_op >(); case CSTRINGS: return new assert_op >(); default: __assume(0); // tells the compiler that this is unreachable } } template< typename CI > assertion * create_eoz( unsigned flags ) { switch( CSTRINGS & flags ) { case 0: return new assert_op >(); case CSTRINGS: return new assert_op >(); default: __assume(0); // tells the compiler that this is unreachable } } template< typename CI > assertion * create_bol( unsigned flags ) { switch( MULTILINE & flags ) { case 0: return new assert_op 
>(); case MULTILINE: return new assert_op >(); default: __assume(0); // tells the compiler that this is unreachable } } template< typename CI > assertion * create_eol( unsigned flags ) { switch( ( MULTILINE | CSTRINGS ) & flags ) { case 0: return new assert_op >(); case MULTILINE: return new assert_op >(); case CSTRINGS: return new assert_op >(); case MULTILINE | CSTRINGS: return new assert_op >(); default: __assume(0); // tells the compiler that this is unreachable } } template< typename CI > class match_atom : public sub_expr { public: match_atom( const basic_string::char_type>::iterator istart, basic_string::char_type>::const_iterator istop ) : m_istart( istart ), m_istop( istop ) {} virtual ~match_atom() {} const basic_string::char_type>::iterator m_istart; basic_string::char_type>::const_iterator m_istop; protected: virtual width_type _width_this() throw() { size_t width = distance( (basic_string::char_type>::const_iterator)m_istart, m_istop ); return width_type( width, width ); } }; template< typename EOS > class match_atom_t : public match_atom { public: typedef EOS eos_type; typedef typename EOS::const_iterator CI; match_atom_t( const basic_string::char_type>::iterator istart, basic_string::char_type>::const_iterator istop ) : match_atom( istart, istop ) {} virtual ~match_atom_t() {} protected: virtual bool _match_this( match_param & param, CI & icur ) const throw() { CI icur_tmp = icur; basic_string::char_type>::const_iterator ithis = m_istart; for( ; ithis != m_istop; ++icur_tmp, ++ithis ) { if( m_eos( param, icur_tmp ) || *ithis != *icur_tmp ) return false; } icur = icur_tmp; return true; } eos_type m_eos; }; template< typename EOS > class match_atom_nocase_t : public match_atom { public: typedef EOS eos_type; typedef typename EOS::const_iterator CI; match_atom_nocase_t( const basic_string::char_type>::iterator istart, basic_string::char_type>::const_iterator istop ) : match_atom( istart, istop ), m_strlower( 
(basic_string::char_type>::const_iterator)istart, istop ) { // Store the uppercase version of the atom in [m_istart,m_istop). to_upper( m_istart, m_istop ); // Store the lowercase version of the atom in m_strlower. to_lower( m_strlower.begin(), m_strlower.end() ); } virtual ~match_atom_nocase_t() {} protected: virtual bool _match_this( match_param & param, CI & icur ) const throw() { CI icur_tmp = icur; basic_string::char_type>::const_iterator ithisu = m_istart; // uppercase basic_string::char_type>::const_iterator ithisl = m_strlower.begin(); // lowercase for( ; ithisu != m_istop; ++icur_tmp, ++ithisu, ++ithisl ) { if( m_eos( param, icur_tmp ) || ( *ithisu != *icur_tmp && *ithisl != *icur_tmp ) ) return false; } icur = icur_tmp; return true; } eos_type m_eos; basic_string::char_type> m_strlower; }; template< typename CI > match_atom * create_atom( const basic_string::value_type>::iterator istart, basic_string::value_type>::const_iterator istop, unsigned flags ) { switch( ( NOCASE | CSTRINGS ) & flags ) { case 0: return new match_atom_t >( istart, istop ); case NOCASE: return new match_atom_nocase_t >( istart, istop ); case CSTRINGS: return new match_atom_t >( istart, istop ); case NOCASE | CSTRINGS: return new match_atom_nocase_t >( istart, istop ); default: __assume(0); // tells the compiler that this is unreachable } } template< typename CI > match_atom * create_atom( const basic_string::value_type>::iterator istart, unsigned flags ) { basic_string::value_type>::const_iterator istop = istart; return create_atom( istart, ++istop, flags ); } template< typename CI > class match_any : public sub_expr { public: virtual ~match_any() {} protected: virtual width_type _width_this() throw() { return width_type(1,1); } }; template< typename EOS > class match_any_t : public match_any { public: typedef EOS eos_type; typedef typename EOS::const_iterator CI; virtual ~match_any_t() {} protected: virtual bool _match_this( match_param & param, CI & icur ) const throw() { if( 
m_eos( param, icur ) ) return false; ++icur; return true; } eos_type m_eos; }; template< typename CI > match_any * create_any( unsigned flags ) { switch( ( SINGLELINE | CSTRINGS ) & flags ) { case 0: return new match_any_t >(); case SINGLELINE: return new match_any_t >(); case CSTRINGS: return new match_any_t >(); case SINGLELINE | CSTRINGS: return new match_any_t >(); default: __assume(0); // tells the compiler that this is unreachable } } typedef pair range_type; const vector g_rgranges; // empty template< typename CI > class match_charset : public sub_expr { public: match_charset( bool fcomplement, const ascii_bitvector & bvect ) : m_fcomplement( fcomplement ), m_rgascii( bvect ), m_rgranges( g_rgranges ), m_ncharflags(0) {} // Note that only the references are copied here -- they are not ref counted. // Beware of variable lifetime issues. match_charset( const match_charset & that ) : m_fcomplement( that.m_fcomplement ), m_rgascii( that.m_rgascii ), m_rgranges( that.m_rgranges ), m_ncharflags( that.m_ncharflags ) {} virtual ~match_charset() {} const bool m_fcomplement; const ascii_bitvector & m_rgascii; // bitmap for chars in range 0-255 const vector & m_rgranges; // vector of included character ranges 256-65535 int m_ncharflags; // Parameter to isctype() // The case-sensitivity of a character set is "compiled" into the ascii_bitvector // but not into the range vector because it is too computationally expensive. Instead, // when doing a unicode case-insensitive match on the ranges vector, two lookups // must be performed -- one lowercase and one uppercase. By contrast, only one lookup // is needed for the ascii_bitvector. protected: match_charset( bool fcomplement, const ascii_bitvector & bvect, const vector & rgranges ) : m_fcomplement( fcomplement ), m_rgascii( bvect ), m_rgranges( rgranges ), m_ncharflags(0) {} // this method should never be called. 
match_charset is only a base class // for match_charset_t virtual bool _match_this( match_param & param, CI & icur ) const throw() { assert(false); return true; } template< typename SY > match_charset * get_altern_charset( char_type ch, unsigned flags, SY /*sy*/ ) const throw() { return basic_rpattern::s_charset_map.get( ch, flags ); } virtual width_type _width_this() throw() { return width_type(1,1); } }; // Used as a template parameter to find a unicode character in an array of ranges. class match_range : public unary_function { protected: const vector & m_rgranges; // determines if one range is less then another. // used in binary search of range vector inline static bool _range_less( const range_type & rg1, const range_type & rg2 ) throw() { return rg1.second < rg2.first; } match_range( const vector & rgranges ) : m_rgranges( rgranges ) {} }; class match_range_with_case : public match_range { public: match_range_with_case( const vector & rgranges ) : match_range( rgranges ) {} inline bool operator()( wint_t ch ) const throw() { return binary_search( m_rgranges.begin(), m_rgranges.end(), range_type(ch,ch), _range_less ); } }; class match_range_no_case : public match_range { public: match_range_no_case( const vector & rgranges ) : match_range( rgranges ) {} inline bool operator()( wint_t ch ) const throw() { const wint_t chup = towupper( ch ); if( binary_search( m_rgranges.begin(), m_rgranges.end(), range_type(chup,chup), _range_less ) ) return true; const wint_t chlo = towlower( ch ); if( chup != chlo && binary_search( m_rgranges.begin(), m_rgranges.end(), range_type(chlo,chlo), _range_less ) ) return true; return false; } }; template< typename EOS, typename RGM > class match_charset_t : public match_charset { public: typedef EOS eos_type; typedef RGM range_match_type; typedef typename EOS::const_iterator CI; match_charset_t( const match_charset & that ) : match_charset( that ), m_rgm( m_rgranges ) {} virtual ~match_charset_t() {} inline bool is_in_charset( 
char_type ch ) const throw() { return m_fcomplement != _is_in_charset( ch ); } protected: match_charset_t( bool fcomplement, const ascii_bitvector & bvect, const vector & rgranges ) : match_charset( fcomplement, bvect, rgranges ), m_rgm( m_rgranges ) {} // Note overloading based on parameter inline bool _is_in_charset( char ch ) const throw() { return ( m_rgascii[ unsigned char(ch) ] ) || ( m_ncharflags && ( _pctype[unsigned char(ch)] & m_ncharflags ) ); } // Note overloading based on parameter inline bool _is_in_charset( wint_t ch ) const throw() { if( UCHAR_MAX >= ch ) return _is_in_charset( char(ch) ); // use range_match_type to see if this character is within one of the // ranges stored in m_rgranges. return ( ! m_rgranges.empty() && m_rgm( ch ) ) || ( m_ncharflags && iswctype( ch, (int)m_ncharflags ) ); } virtual bool _match_this( match_param & param, CI & icur ) const throw() { if( m_eos( param, icur ) || ! is_in_charset( *icur ) ) return false; ++icur; return true; } // range_match_type encapsulates the case-sensitivity // issues with doing a unicode lookup on the ranges vector. 
range_match_type m_rgm; eos_type m_eos; }; template< typename EOS, typename RGM > class match_custom_charset_t : public match_charset_t { public: template< typename SY > match_custom_charset_t( bool fcomplement, basic_string::iterator & icur, basic_string::const_iterator istop, unsigned flags, SY /*sy*/ ) throw(bad_regexpr,bad_alloc) : match_charset_t( fcomplement, m_rgasciicustom, m_rgrangescustom ) { _parse_charset( icur, istop, flags, SY() ); _optimize(); } virtual ~match_custom_charset_t() {} // for including one character set in another match_custom_charset_t & operator|=( const match_charset & that ) { assert( 0 == that.m_ncharflags ); if( that.m_fcomplement ) { m_rgasciicustom |= ~ that.m_rgascii; // append the inverse of that.m_rgranges to this->m_rgrangescustom wint_t chlow = UCHAR_MAX; typedef vector::const_iterator VCI; for( VCI prg = that.m_rgranges.begin(); prg != that.m_rgranges.end(); ++prg ) { if( UCHAR_MAX + 1 != prg->first ) m_rgrangescustom.push_back( range_type( chlow + 1, prg->first - 1 ) ); chlow = prg->second; } if( WCHAR_MAX != chlow ) m_rgrangescustom.push_back( range_type( chlow + 1, WCHAR_MAX ) ); } else { m_rgasciicustom |= that.m_rgascii; m_rgrangescustom.insert( m_rgrangescustom.end(), that.m_rgranges.begin(), that.m_rgranges.end() ); } return *this; } protected: template< typename SY > void _parse_charset( basic_string::iterator & icur, basic_string::const_iterator istop, unsigned flags, SY /*sy*/ ) throw(bad_regexpr,bad_alloc) { TOKEN tok; char_type ch_prev = 0; match_charset * pcharset; basic_string::iterator iprev = icur; const bool fnocase = ( NOCASE == ( NOCASE & flags ) ); if( (basic_string::const_iterator)icur == istop ) throw bad_regexpr("expecting end of character set"); // remember the current position and grab the next token tok = SY::charset_token( icur, istop ); do { // If we reached the end of the string before finding the end of the // character set, then this is an ill-formed regex if( 
(basic_string::const_iterator)icur == istop ) throw bad_regexpr("expecting end of character set"); if( CHARSET_RANGE == tok && ch_prev ) { // remember the current position basic_string::iterator iprev2 = icur; char_type old_ch = ch_prev; ch_prev = 0; // old_ch is lower bound of a range switch( SY::charset_token( icur, istop ) ) { case CHARSET_RANGE: case CHARSET_NEGATE: icur = iprev2; // un-get these tokens and fall through case NO_TOKEN: case CHARSET_ESCAPE: // BUGBUG user-defined charset? _set_bit_range( old_ch, *icur++, fnocase ); continue; case CHARSET_BACKSPACE: _set_bit_range( old_ch, char_type(8), fnocase ); // backspace continue; case CHARSET_END: // fall through default: // not a range. icur = iprev; // backup to range token _set_bit( old_ch, fnocase ); _set_bit( *icur++, fnocase ); continue; } } if( ch_prev ) _set_bit( ch_prev, fnocase ); ch_prev = 0; switch( tok ) { // None of the intrinsic charsets are case-sensitive, // so no special handling must be done when the NOCASE // flag is set. 
case CHARSET_RANGE: case CHARSET_NEGATE: case CHARSET_END: icur = iprev; // un-get these tokens ch_prev = *icur++; continue; case CHARSET_BACKSPACE: ch_prev = char_type(8); // backspace continue; case ESC_DIGIT: *this |= match_charset( false, get_digit_vector() ); continue; case ESC_NOT_DIGIT: *this |= match_charset( true, get_digit_vector() ); continue; case ESC_SPACE: *this |= match_charset( false, get_space_vector() ); continue; case ESC_NOT_SPACE: *this |= match_charset( true, get_space_vector() ); continue; case ESC_WORD: *this |= match_charset( false, get_word_vector() ); continue; case ESC_NOT_WORD: *this |= match_charset( true, get_word_vector() ); continue; case CHARSET_ALNUM: m_ncharflags |= (_ALPHA|_DIGIT); continue; case CHARSET_ALPHA: m_ncharflags |= (_ALPHA); continue; case CHARSET_BLANK: m_ncharflags |= (_BLANK); continue; case CHARSET_CNTRL: m_ncharflags |= (_CONTROL); continue; case CHARSET_DIGIT: m_ncharflags |= (_DIGIT); continue; case CHARSET_GRAPH: m_ncharflags |= (_PUNCT|_ALPHA|_DIGIT); continue; case CHARSET_LOWER: m_ncharflags |= (_LOWER); if( NOCASE == ( NOCASE & flags ) ) m_ncharflags |= (_UPPER); continue; case CHARSET_PRINT: m_ncharflags |= (_BLANK|_PUNCT|_ALPHA|_DIGIT); continue; case CHARSET_PUNCT: m_ncharflags |= (_PUNCT); continue; case CHARSET_SPACE: m_ncharflags |= (_SPACE); continue; case CHARSET_UPPER: m_ncharflags |= (_UPPER); if( NOCASE == ( NOCASE & flags ) ) m_ncharflags |= (_LOWER); continue; case CHARSET_XDIGIT: m_ncharflags |= (_HEX); continue; case CHARSET_ESCAPE: // Maybe this is a user-defined intrinsic charset pcharset = get_altern_charset( *icur, flags, SY() ); if( NULL != pcharset ) { *this |= *pcharset; ++icur; continue; } // else fall through default: ch_prev = *icur++; continue; } } while( iprev = icur, CHARSET_END != ( tok = SY::charset_token( icur, istop ) ) ); if( ch_prev ) _set_bit( ch_prev, fnocase ); } void _optimize() { // this sorts on range_type.first (uses operator<() for pair templates) sort( 
m_rgrangescustom.begin(), m_rgrangescustom.end() ); // This merges ranges that overlap for( size_t index = 1; index < m_rgrangescustom.size(); ) { if( m_rgrangescustom[index].first <= m_rgrangescustom[index-1].second + 1 ) { m_rgrangescustom[index-1].second = max( m_rgrangescustom[index-1].second, m_rgrangescustom[index].second ); m_rgrangescustom.erase( m_rgrangescustom.begin() + index ); } else ++index; } } // Note overloading based on second parameter void _set_bit( char ch, const bool fnocase ) throw() { if( fnocase ) { m_rgasciicustom.set( unsigned char(tolower(ch)) ); m_rgasciicustom.set( unsigned char(toupper(ch)) ); } else { m_rgasciicustom.set( unsigned char(ch) ); } } // Note overloading based on second parameter void _set_bit( wint_t ch, const bool fnocase ) throw(bad_alloc) { if( UCHAR_MAX >= ch ) _set_bit( char(ch), fnocase ); else m_rgrangescustom.push_back( range_type( ch, ch ) ); } // Note overloading based on second parameter void _set_bit_range( char ch1, char ch2, const bool fnocase ) throw(bad_regexpr) { if( unsigned char(ch1) > unsigned char(ch2) ) throw bad_regexpr("invalid range specified in character set"); if( fnocase ) { // i is unsigned int to prevent overflow if ch2 is UCHAR_MAX for( unsigned int i = unsigned char(ch1); i <= unsigned char(ch2); ++i ) { m_rgasciicustom.set( unsigned char( toupper(i) ) ); m_rgasciicustom.set( unsigned char( tolower(i) ) ); } } else { // i is unsigned int to prevent overflow if ch2 is UCHAR_MAX for( unsigned int i = unsigned char(ch1); i <= unsigned char(ch2); ++i ) m_rgasciicustom.set( unsigned char(i) ); } } // Note overloading based on second parameter void _set_bit_range( wint_t ch1, wint_t ch2, const bool fnocase ) throw(bad_regexpr,bad_alloc) { if( ch1 > ch2 ) throw bad_regexpr("invalid range specified in character set"); if( UCHAR_MAX >= ch1 ) _set_bit_range( char(ch1), char( min(wint_t(UCHAR_MAX),ch2) ), fnocase ); if( UCHAR_MAX < ch2 ) m_rgrangescustom.push_back( range_type( 
max(wint_t(UCHAR_MAX+1),ch1), ch2 ) ); } ascii_bitvector m_rgasciicustom; vector m_rgrangescustom; }; template< typename CI > match_charset * create_charset( const match_charset & that, unsigned flags ) { switch( ( NOCASE | CSTRINGS ) & flags ) { case 0: return new match_charset_t,match_range_with_case>( that ); case NOCASE: return new match_charset_t,match_range_no_case>( that ); case CSTRINGS: return new match_charset_t,match_range_with_case>( that ); case NOCASE | CSTRINGS: return new match_charset_t,match_range_no_case>( that ); default: __assume(0); // tells the compiler that this is unreachable } } template< typename EOS > class word_assertion_t : public assertion { public: typedef EOS eos_type; typedef typename EOS::const_iterator CI; word_assertion_t() : m_isword( match_charset( false, get_word_vector() ) ) {} virtual ~word_assertion_t() {} protected: bos_t m_bos; eos_type m_eos; match_charset_t m_isword; }; template< typename EOS > class word_boundary_t : public word_assertion_t { public: word_boundary_t( const bool fisboundary ) : m_fisboundary( fisboundary ) {} virtual ~word_boundary_t() {} protected: virtual bool _match_this( match_param & param, CI & icur ) const throw() { CI iprev = icur; --iprev; const bool fprevword = ! m_bos( param, icur ) && m_isword.is_in_charset( *iprev ); const bool fthisword = ! m_eos( param, icur ) && m_isword.is_in_charset( *icur ); return ( m_fisboundary == ( fprevword != fthisword ) ); } const bool m_fisboundary; }; template< typename EOS > class word_start_t : public word_assertion_t { public: word_start_t() {} virtual ~word_start_t() {} protected: virtual bool _match_this( match_param & param, CI & icur ) const throw() { CI iprev = icur; --iprev; const bool fprevword = ! m_bos( param, icur ) && m_isword.is_in_charset( *iprev ); const bool fthisword = ! m_eos( param, icur ) && m_isword.is_in_charset( *icur ); return ! 
fprevword && fthisword; } }; template< typename EOS > class word_stop_t : public word_assertion_t { public: word_stop_t() {} virtual ~word_stop_t() {} protected: virtual bool _match_this( match_param & param, CI & icur ) const throw() { CI iprev = icur; --iprev; const bool fprevword = ! m_bos( param, icur ) && m_isword.is_in_charset( *iprev ); const bool fthisword = ! m_eos( param, icur ) && m_isword.is_in_charset( *icur ); return fprevword && ! fthisword; } }; template< typename CI > assertion * create_word_boundary( const bool fisboundary, unsigned flags ) { switch( CSTRINGS & flags ) { case 0: return new word_boundary_t >( fisboundary ); case CSTRINGS: return new word_boundary_t >( fisboundary ); default: __assume(0); // tells the compiler that this is unreachable } } template< typename CI > assertion * create_word_start( unsigned flags ) { switch( CSTRINGS & flags ) { case 0: return new word_start_t >(); case CSTRINGS: return new word_start_t >(); default: __assume(0); // tells the compiler that this is unreachable } } template< typename CI > assertion * create_word_stop( unsigned flags ) { switch( CSTRINGS & flags ) { case 0: return new word_stop_t >(); case CSTRINGS: return new word_stop_t >(); default: __assume(0); // tells the compiler that this is unreachable } } template< typename CI > class group_quantifier; template< typename CI > class match_group : public sub_expr { public: friend class group_quantifier; match_group( size_t cgroup ) : m_rgalternates(), m_cgroup( cgroup ), m_pptail(NULL), m_end_group( this ), m_nwidth(uninit_width) {} virtual ~match_group() {} virtual bool domatch( match_param & param, CI icur ) const throw() { CI old_istart; if( -1 != m_cgroup ) // could be -1 if this is a lookahead_assertion { old_istart = (*param.prgbackrefs)[ m_cgroup ].first; (*param.prgbackrefs)[ m_cgroup ].first = icur; } typedef vector*>::const_iterator VCI; for( VCI ialt = m_rgalternates.begin(); ialt != m_rgalternates.end(); ++ialt ) { if( (*ialt)->domatch( 
param, icur ) ) return true; } if( -1 != m_cgroup ) (*param.prgbackrefs)[ m_cgroup ].first = old_istart; return false; } virtual void _delete() { typedef vector*>::iterator VI; for( VI ialt = m_rgalternates.begin(); ialt != m_rgalternates.end(); ++ialt ) delete_sub_expr( *ialt ); sub_expr::_delete(); } size_t group_number() const { return m_cgroup; } void group_number( size_t cgroup ) { m_cgroup = cgroup; } void add_item( sub_expr * pitem ) { *m_pptail = pitem; m_pptail = & pitem->next(); } void add_alternate() { m_rgalternates.push_back( NULL ); m_pptail = & m_rgalternates.back(); } void end_alternate() { *m_pptail = & m_end_group; } size_t calternates() const { return m_rgalternates.size(); } width_type group_width() { (void) match_group::_width_this(); return m_nwidth; } protected: virtual bool _call_back( match_param & param, CI icur ) const throw() { CI old_iend; if( -1 != m_cgroup ) { old_iend = (*param.prgbackrefs)[ m_cgroup ].second; (*param.prgbackrefs)[ m_cgroup ].second = icur; } if( match_next( param, icur ) ) return true; if( -1 != m_cgroup ) (*param.prgbackrefs)[ m_cgroup ].second = old_iend; return false; } virtual width_type _width_this() throw() { typedef vector*>::const_iterator VCI; if( uninit_width == m_nwidth ) { m_nwidth = width_type(-1,0); for( VCI ialt = m_rgalternates.begin(); worst_width != m_nwidth && ialt != m_rgalternates.end(); ++ialt ) { width_type temp_width = (*ialt)->get_width(); m_nwidth.m_min = min( m_nwidth.m_min, temp_width.m_min ); m_nwidth.m_max = max( m_nwidth.m_max, temp_width.m_max ); } } return m_nwidth; } class end_group; friend class end_group; class end_group : public sub_expr { void * operator new( size_t ); public: end_group( match_group * pgroup ) : m_pgroup( pgroup ) {} virtual ~end_group() {} virtual void _delete() {} // don't delete this, because it was never alloc'ed virtual bool domatch( match_param & param, CI icur ) const throw() { return m_pgroup->_call_back( param, icur ); } protected: // since m_pnext is 
always NULL for end_groups, get_width() stops recursing here virtual width_type _width_this() throw() { return width_type(0,0); } match_group * m_pgroup; }; vector*> m_rgalternates; sub_expr ** m_pptail; // only used when adding elements size_t m_cgroup; end_group m_end_group; width_type m_nwidth; }; // Behaves like a lookahead assertion if m_cgroup is -1, or like // an independent group otherwise. template< typename CI > class independent_group : public match_group { public: independent_group() : match_group( -1 ), m_fexpected(true) {} virtual ~independent_group() {} virtual bool domatch( match_param & param, CI icur ) const throw() { // Copy the entire backref vector onto the stack backref_tag * prgbr = (backref_tag*)_alloca( param.prgbackrefs->size() * sizeof backref_tag ); copy( param.prgbackrefs->begin(), param.prgbackrefs->end(), raw_storage_iterator*,backref_tag >(prgbr) ); // Match until the end of this group and then return const bool fdomatch = match_group::domatch( param, icur ); if( m_fexpected == fdomatch ) { // If m_cgroup != 1, then this is not a zero-width assertion. 
if( -1 != m_cgroup ) icur = (*param.prgbackrefs)[ m_cgroup ].second; if( match_next( param, icur ) ) return true; } // if match_group::domatch returned true, the backrefs must be restored if( fdomatch ) copy( prgbr, prgbr + param.prgbackrefs->size(), param.prgbackrefs->begin() ); return false; } protected: independent_group( const bool fexpected ) : match_group( -1 ), m_fexpected(fexpected) {} virtual bool _call_back( match_param & param, CI icur ) const throw() { if( -1 != m_cgroup ) (*param.prgbackrefs)[ m_cgroup ].second = icur; return true; } const bool m_fexpected; }; template< typename CI > class lookahead_assertion : public independent_group { public: lookahead_assertion( const bool fexpected ) : independent_group( fexpected ) {} virtual ~lookahead_assertion() {} virtual bool is_assertion() const throw() { return true; } protected: virtual width_type _width_this() throw() { return width_type(0,0); } }; template< typename CI > class lookbehind_assertion : public independent_group { public: lookbehind_assertion( const bool fexpected ) : independent_group( fexpected ) {} virtual ~lookbehind_assertion() {} virtual bool domatch( match_param & param, CI icur ) const throw() { // This is the room in the string from the start to the current position size_t room = distance( param.ibegin, icur ); // If we don't have enough room to match the lookbehind, the match fails. // If we wanted the match to fail, try to match the rest of the pattern. if( m_nwidth.m_min > room ) return m_fexpected ? 
false : match_next( param, icur ); // Copy the entire backref vector onto the stack backref_tag * prgbr = (backref_tag*)_alloca( param.prgbackrefs->size() * sizeof backref_tag ); copy( param.prgbackrefs->begin(), param.prgbackrefs->end(), raw_storage_iterator*,backref_tag >(prgbr) ); CI local_istart = icur; advance( local_istart, -int( min( m_nwidth.m_max, room ) ) ); CI local_istop = icur; advance( local_istop, -int( m_nwidth.m_min ) ); // Create a local param struct that has icur as param.iend match_param local_param(param.ibegin,param.istart,icur,param.prgbackrefs); // Find the rightmost match that ends at icur. for( CI local_icur = local_istart; local_icur <= local_istop; ++local_icur ) { // Match until the end of this group and then return const bool fmatched = match_group::domatch( local_param, local_icur ); // If the match results were what we were expecting, try to match the // rest of the pattern. If that succeeds, return true. if( m_fexpected == fmatched && match_next( param, icur ) ) return true; // if match_group::domatch returned true, the backrefs must be restored if( fmatched ) { copy( prgbr, prgbr + param.prgbackrefs->size(), param.prgbackrefs->begin() ); // Match succeeded. If this is a negative lookbehind, we didn't want it // to succeed, so return false. if( ! m_fexpected ) return false; } } // No variation of the lookbehind was satisfied in a way that permited // the rest of the pattern to match successfully, so return false. return false; } virtual bool is_assertion() const throw() { return true; } protected: virtual bool _call_back( match_param & param, CI icur ) const throw() { return param.istop == icur; } virtual width_type _width_this() throw() { return width_type(0,0); } }; // Corresponds to the (?:foo) extension, which has grouping semantics, but // does not store any backref information. 
template< typename CI > class group_nobackref : public match_group { public: group_nobackref( ) : match_group( -1 ) {} // will be assigned a group number in basic_rpattern::basic_rpattern() virtual ~group_nobackref() {} }; template< typename CI > class match_wrapper : public sub_expr { public: match_wrapper( sub_expr * psub ) : m_psub(psub) {} virtual ~match_wrapper() {} virtual void _delete() { delete_sub_expr( m_psub ); sub_expr::_delete(); } protected: bool _wrapped_match_this( match_param & param, CI & icur ) const throw() { return m_psub->_match_this( param, icur ); } virtual width_type _width_this() throw() { return m_psub->_width_this(); } sub_expr * m_psub; }; template< typename CI > class match_quantifier : public match_wrapper { public: match_quantifier( sub_expr * psub, size_t lbound, size_t ubound ) : match_wrapper( psub ), m_lbound(lbound), m_ubound(ubound) {} virtual ~match_quantifier() {} protected: virtual width_type _width_this() throw() { width_type this_width = match_wrapper::_width_this(); return this_width * width_type( m_lbound, m_ubound ); } const size_t m_lbound; const size_t m_ubound; }; template< typename CI > class max_atom_quantifier : public match_quantifier { public: max_atom_quantifier( sub_expr * psub, size_t lbound, size_t ubound ) : match_quantifier( psub, lbound, ubound ) {} virtual ~max_atom_quantifier() {} virtual bool domatch( match_param & param, CI icur ) const throw() { size_t cmatches = 0; int cdiff = 0; // must be a signed quantity for advance() below if( cmatches < m_ubound ) { CI istart = icur; if( _wrapped_match_this( param, icur ) ) { ++cmatches; cdiff = distance( istart, icur ); if( 0 == cdiff ) return ( match_next( param, icur ) ); while( cmatches < m_ubound && _wrapped_match_this( param, icur ) ) { ++cmatches; } } } if( cmatches >= m_lbound ) { if( ! 
next() ) return true; for(;;) { if( next()->domatch( param, icur ) ) return true; if( cmatches-- <= m_lbound ) break; advance( icur, -cdiff ); } } return false; } }; template< typename CI > class min_atom_quantifier : public match_quantifier { public: min_atom_quantifier( sub_expr * psub, size_t lbound, size_t ubound ) : match_quantifier( psub, lbound, ubound ) {} virtual ~min_atom_quantifier() {} virtual bool domatch( match_param & param, CI icur ) const throw() { size_t cmatches = 0; bool fsuccess = true; CI icur_tmp = icur; if( _wrapped_match_this( param, icur_tmp ) ) { if( icur_tmp == icur ) return ( match_next( param, icur ) ); if( m_lbound ) { icur = icur_tmp; ++cmatches; } while( ( cmatches < m_lbound ) && ( fsuccess = _wrapped_match_this( param, icur ) ) ) { ++cmatches; } } else { fsuccess = ! m_lbound; } if( fsuccess && next() ) { do { if( next()->domatch( param, icur ) ) break; } while( fsuccess = ( cmatches++ < m_ubound && _wrapped_match_this( param, icur ) ) ); } return fsuccess; } }; template< typename CI > class group_quantifier : public match_quantifier { public: group_quantifier( match_group * psub, size_t lbound, size_t ubound ) : match_quantifier( psub, lbound, ubound ), m_group( *psub ), m_end_quantifier( this ) { psub->next() = & m_end_quantifier; } virtual ~group_quantifier() {} virtual bool domatch( match_param & param, CI icur ) const throw() { // group_number is only -1 for assertions, which can't be quantified assert( -1 != group_number() ); backref_tag & br = (*param.prgbackrefs)[ group_number() ]; backref_tag old_backref = br; br = backref_tag( icur, icur ); // sets cmatches (reserved) to 0 if( _recurse( param, icur ) ) return true; br = old_backref; return false; } protected: class end_quantifier; friend class end_quantifier; class end_quantifier : public sub_expr { void * operator new( size_t ); public: end_quantifier( group_quantifier * pquant ) : m_pquant( pquant ) {} virtual ~end_quantifier() {} virtual void _delete() {} // don't 
delete this, since it wasn't alloc'ed virtual bool domatch( match_param & param, CI icur ) const throw() { // group_number is only -1 for assertions, which can't be quantified assert( -1 != m_pquant->group_number() ); // handle special the case where a group matches 0 characters backref_tag & br = (*param.prgbackrefs)[ m_pquant->group_number() ]; if( icur == br.first ) { size_t old_cmatches = br.reserved; br.reserved = m_pquant->m_ubound; if( m_pquant->_recurse( param, icur ) ) return true; br.reserved = old_cmatches; return false; } return m_pquant->_recurse( param, icur ); } protected: virtual width_type _width_this() throw() { return width_type(0,0); } group_quantifier * m_pquant; }; size_t group_number() const { return m_group.group_number(); } size_t & cmatches( match_param & param ) const { return (*param.prgbackrefs)[ group_number() ].reserved; } virtual bool _recurse( match_param & param, CI icur ) const throw() = 0; match_group & m_group; end_quantifier m_end_quantifier; }; template< typename CI > class max_group_quantifier : public group_quantifier { public: max_group_quantifier( match_group * psub, size_t lbound, size_t ubound ) : group_quantifier( psub, lbound, ubound ) {} virtual ~max_group_quantifier() {} protected: virtual bool _recurse( match_param & param, CI icur ) const throw() { if( m_ubound == cmatches( param ) ) return match_next( param, icur ); ++cmatches( param ); if( m_psub->domatch( param, icur ) ) return true; if( --cmatches( param ) < m_lbound ) return false; return match_next( param, icur ); } }; template< typename CI > class min_group_quantifier : public group_quantifier { public: min_group_quantifier( match_group * psub, size_t lbound, size_t ubound ) : group_quantifier( psub, lbound, ubound ) {} virtual ~min_group_quantifier() {} protected: virtual bool _recurse( match_param & param, CI icur ) const throw() { if( m_lbound > cmatches( param ) ) { ++cmatches( param ); return m_psub->domatch( param, icur ); } if( match_next( param, icur 
) ) return true; if( cmatches( param )++ == m_ubound ) return false; return m_psub->domatch( param, icur ); } }; template< typename CI > class match_backref : public sub_expr { public: match_backref( size_t cbackref, const width_type & group_width ) : m_cbackref( cbackref ), m_nwidth(group_width) {} virtual ~match_backref() {} protected: // Return the width specifications of the group to which this backref refers virtual width_type _width_this() throw() { return m_nwidth; } const size_t m_cbackref; const width_type m_nwidth; }; template< typename CMP, typename EOS > class match_backref_t : public match_backref { public: typedef CMP cmp_type; typedef EOS eos_type; typedef typename EOS::const_iterator CI; match_backref_t( size_t cbackref, const width_type & group_width ) : match_backref( cbackref, group_width ) {} virtual ~match_backref_t() {} protected: virtual bool _match_this( match_param & param, CI & icur ) const throw() { CI ithis = (*param.prgbackrefs)[ m_cbackref ].first; CI istop = (*param.prgbackrefs)[ m_cbackref ].second; CI icur_tmp = icur; // Don't match a backref that hasn't match anything if( ! 
(*param.prgbackrefs)[ m_cbackref ] ) return false; for( ; ithis != istop; ++icur_tmp, ++ithis ) { if( m_eos( param, icur_tmp ) || m_cmp( *icur_tmp, *ithis ) ) return false; } icur = icur_tmp; return true; } cmp_type m_cmp; eos_type m_eos; }; template< typename CI > match_backref * create_backref( size_t cbackref, const width_type & group_width, unsigned flags ) { typedef typename iterator_traits::value_type char_type; switch( ( NOCASE | CSTRINGS ) & flags ) { case 0: return new match_backref_t,eos_t >( cbackref, group_width ); case NOCASE: return new match_backref_t,eos_t >( cbackref, group_width ); case CSTRINGS: return new match_backref_t,eocs_t >( cbackref, group_width ); case NOCASE | CSTRINGS: return new match_backref_t,eocs_t >( cbackref, group_width ); default: __assume(0); // tells the compiler that this is unreachable } } // Replace some escape sequences with the actual characters // they represent template< typename CI > void basic_rpattern_base::_normalize_string( basic_string::char_type> & str ) { size_t i = 0; // Don't do pattern normalization if the user didn't ask for it. 
if( NORMALIZE != ( NORMALIZE & m_flags ) ) return; while( basic_string::npos != ( i = str.find( char_type('\\'), i ) ) ) { if( str.size() - 1 == i ) return; switch( str[i+1] ) { case char_type('f'): str.replace( i, 2, 1, char_type('\f') ); break; case char_type('n'): str.replace( i, 2, 1, char_type('\n') ); break; case char_type('r'): str.replace( i, 2, 1, char_type('\r') ); break; case char_type('t'): str.replace( i, 2, 1, char_type('\t') ); break; case char_type('v'): str.replace( i, 2, 1, char_type('\v') ); break; case char_type('\\'): str.replace( i, 2, 1, char_type('\\') ); break; default: ++i; break; } ++i; if( str.size() <= i ) return; } } // // Implementation of basic_rpattern: // template< typename CI, typename SY > basic_rpattern::basic_rpattern() throw() : basic_rpattern_base( 0 ) { } template< typename CI, typename SY > basic_rpattern::basic_rpattern( const basic_string::char_type> & pat, unsigned flags ) throw(bad_regexpr,bad_alloc) : basic_rpattern_base( flags, pat ) { push_new_handler pnh( &my_new_handler ); _normalize_string( m_pat ); _common_init( flags ); } template< typename CI, typename SY > basic_rpattern::basic_rpattern( const basic_string::char_type> & pat, const basic_string::char_type> & subst, unsigned flags ) throw(bad_regexpr,bad_alloc) : basic_rpattern_base( flags, pat, subst ) { push_new_handler pnh( &my_new_handler ); _normalize_string( m_pat ); _common_init( flags ); _normalize_string( m_subst ); _parse_subst(); // must come after _common_init } template< typename CI, typename SY > void basic_rpattern::init( const basic_string::char_type> & pat, unsigned flags ) throw(bad_regexpr,bad_alloc) { push_new_handler pnh( &my_new_handler ); _reset(); m_flags = flags; m_pat = pat; _normalize_string( m_pat ); _common_init( m_flags ); } template< typename CI, typename SY > void basic_rpattern::init( const basic_string::char_type> & pat, const basic_string::char_type> & subst, unsigned flags ) throw(bad_regexpr,bad_alloc) { push_new_handler pnh( 
&my_new_handler ); _reset(); m_flags = flags; m_pat = pat; m_subst = subst; _normalize_string( m_pat ); _common_init( m_flags ); _normalize_string( m_subst ); _parse_subst(); // must come after _common_init } template< typename CI, typename SY > void basic_rpattern::_common_init( unsigned flags ) { m_cgroups = 0; vector*> rggroups; basic_string::iterator ipat = m_pat.begin(); match_group * pgroup = _find_next_group( ipat, flags, rggroups ); m_pfirst = pgroup; m_nwidth = pgroup->group_width(); // Number the invisible groups m_cgroups_visible = m_cgroups; while( ! m_invisible_groups.empty() ) { m_invisible_groups.front()->group_number( _get_next_group_nbr() ); m_invisible_groups.pop_front(); } // // determine if we can get away with only calling m_pfirst->domatch only once // m_floop = true; // Optimization: if first character of pattern string is '^' // and we are not doing a multiline match, then we only // need to try domatch once basic_string::iterator icur = m_pat.begin(); if( MULTILINE != ( MULTILINE & m_flags ) && 1 == pgroup->calternates() && icur != m_pat.end() && BEGIN_LINE == SY::reg_token( icur, m_pat.end() ) ) { m_flags &= ~RIGHTMOST; m_floop = false; } // Optimization: if first 2 characters of pattern string are ".*" or ".+", // then we only need to try domatch once icur = m_pat.begin(); if( RIGHTMOST != ( RIGHTMOST & m_flags ) && SINGLELINE == ( SINGLELINE & m_flags ) && 1 == pgroup->calternates() && icur != m_pat.end() && MATCH_ANY == SY::reg_token( icur, m_pat.end() ) && icur != m_pat.end() ) { switch( SY::quant_token( icur, m_pat.end() ) ) { case ONE_OR_MORE: case ZERO_OR_MORE: case ONE_OR_MORE_MIN: case ZERO_OR_MORE_MIN: m_floop = false; } } } template< typename CI, typename SY > void basic_rpattern::_reset() throw() { basic_rpattern_base::_reset(); m_cgroups = m_cgroups_visible = 0; m_floop = true; m_subst.erase(); m_pat.erase(); m_pfirst.free_ptr(); m_nwidth = uninit_width; m_subst_list.clear(); m_invisible_groups.clear(); } template< typename 
CI, typename SY > void basic_rpattern::set_flags( unsigned flags ) throw(bad_regexpr,bad_alloc) { push_new_handler pnh( &my_new_handler ); m_pfirst.free_ptr(); m_flags = flags; _common_init( m_flags ); } template< typename CI, typename SY > void basic_rpattern::set_substitution( const basic_string::char_type> & subst ) { push_new_handler pnh( &my_new_handler ); m_subst_list.clear(); m_subst = subst; _normalize_string( m_subst ); _parse_subst(); } template< typename CI, typename SY > match_group * basic_rpattern::_find_next_group( basic_string::char_type>::iterator & ipat, unsigned & flags, vector*> & rggroups ) { auto_sub_ptr > pgroup; basic_string::iterator itemp = ipat; unsigned old_flags = flags; TOKEN tok; // Look for group extensions. (This could change the value of the flags variable.) if( ipat != m_pat.end() && NO_TOKEN != ( tok = SY::ext_token( ipat, m_pat.end(), flags ) ) ) { if( itemp == m_pat.begin() || ipat == m_pat.end() ) throw bad_regexpr("ill-formed regular expression"); // Don't process empty groups if( END_GROUP != SY::reg_token( itemp = ipat, m_pat.end() ) ) { switch( tok ) { case EXT_NOBACKREF: // invisible groups are numbered only after all // visible groups have been numbererd pgroup = new match_group( -1 ); m_invisible_groups.push_back( pgroup.get() ); break; case EXT_INDEPENDENT: pgroup = new independent_group(); m_invisible_groups.push_back( pgroup.get() ); break; case EXT_POS_LOOKAHEAD: pgroup = new lookahead_assertion( true ); break; case EXT_NEG_LOOKAHEAD: pgroup = new lookahead_assertion( false ); break; case EXT_POS_LOOKBEHIND: // For look-behind assertions, turn off the CSTRINGs optimization flags &= ~CSTRINGS; pgroup = new lookbehind_assertion( true ); break; case EXT_NEG_LOOKBEHIND: // For look-behind assertions, turn off the CSTRINGs optimization flags &= ~CSTRINGS; pgroup = new lookbehind_assertion( false ); break; default: throw bad_regexpr("bad extension sequence"); } } else { // Skip over the END_GROUP token ipat = itemp; } } 
else { pgroup = new match_group( _get_next_group_nbr() ); } if( NULL != pgroup.get() ) { pgroup->add_alternate(); while( _find_next( ipat, pgroup.get(), flags, rggroups ) ); pgroup->end_alternate(); // Add this group to the rggroups array if( -1 != pgroup->group_number() ) { if( pgroup->group_number() >= rggroups.size() ) rggroups.resize( pgroup->group_number() + 1, NULL ); rggroups[ pgroup->group_number() ] = pgroup.get(); } // The group should calculate its own width now and // save the result for later. pgroup->group_width(); // If this is not a pattern modifier, restore the // flags to their previous settings. This causes // pattern modifiers to have the scope of their // enclosing group. flags = old_flags; } return pgroup.release(); } // // Read ahead through the pattern and treat sequential atoms // as a single atom, making sure to handle quantification // correctly. Warning: dense code ahead. // template< typename CI, typename SY > void basic_rpattern::_find_atom( basic_string::char_type>::iterator & ipat, match_group * pgroup, unsigned flags ) { basic_string::iterator itemp = ipat, istart = ipat; do { switch( SY::quant_token( itemp, m_pat.end() ) ) { // if {,} can't be interpreted as quantifiers, treat them as regular chars case BEGIN_RANGE: if( istart != ipat ) // treat as a quantifier goto quantify; case NO_TOKEN: case END_RANGE: case END_RANGE_MIN: case RANGE_SEPARATOR: break; default: if( istart == ipat ) // must be able to quantify something. throw bad_regexpr("quantifier not expected"); quantify: if( istart != --ipat ) pgroup->add_item( create_atom( istart, ipat, flags ) ); auto_sub_ptr > pnew( create_atom( ipat++, flags ) ); _quantify( pnew, NULL, ipat ); pgroup->add_item( pnew.release() ); return; } } while( m_pat.end() != ++ipat && ! 
SY::reg_token( itemp = ipat, m_pat.end() ) ); assert( ipat != istart ); pgroup->add_item( create_atom( istart, ipat, flags ) ); } template< typename CI, typename SY > bool basic_rpattern::_find_next( basic_string::char_type>::iterator & ipat, match_group * pgroup, unsigned & flags, vector*> & rggroups ) { match_group * pnew_group = NULL; auto_sub_ptr > pnew; basic_string::iterator istart, itemp; bool fdone; if( ipat == m_pat.end() ) { if( 0 != pgroup->group_number() ) throw bad_regexpr( "mismatched parenthesis" ); return false; } switch( SY::reg_token( ipat, m_pat.end() ) ) { case NO_TOKEN: // not a token. Must be an atom _find_atom( ipat, pgroup, flags ); return true; case END_GROUP: if( 0 == pgroup->group_number() ) throw bad_regexpr( "mismatched parenthesis" ); return false; case ALTERNATION: pgroup->end_alternate(); pgroup->add_alternate(); return true; case BEGIN_GROUP: // Find next group could return NULL if the group is really // a pattern modifier, like: (?s-i) pnew = pnew_group = _find_next_group( ipat, flags, rggroups ); break; case BEGIN_LINE: pnew = create_bol( flags ); break; case END_LINE: pnew = create_eol( flags ); break; case BEGIN_CHARSET: pnew = create_charset_helper::create_charset_aux( m_pat, ipat, flags ); break; case MATCH_ANY: pnew = create_any( flags ); break; case ESC_WORD_BOUNDARY: pnew = create_word_boundary( true, flags ); break; case ESC_NOT_WORD_BOUNDARY: pnew = create_word_boundary( false, flags ); break; case ESC_WORD_START: pnew = create_word_start( flags ); break; case ESC_WORD_STOP: pnew = create_word_stop( flags ); break; case ESC_DIGIT: pnew = create_charset( match_charset( false, get_digit_vector() ), flags ); break; case ESC_NOT_DIGIT: pnew = create_charset( match_charset( true, get_digit_vector() ), flags ); break; case ESC_WORD: pnew = create_charset( match_charset( false, get_word_vector() ), flags ); break; case ESC_NOT_WORD: pnew = create_charset( match_charset( true, get_word_vector() ), flags ); break; case ESC_SPACE: 
pnew = create_charset( match_charset( false, get_space_vector() ), flags ); break; case ESC_NOT_SPACE: pnew = create_charset( match_charset( true, get_space_vector() ), flags ); break; case ESC_BEGIN_STRING: pnew = create_bos( flags ); break; case ESC_END_STRING: pnew = create_eos( flags ); break; case ESC_END_STRING_z: pnew = create_eoz( flags ); break; case ESCAPE: if( char_type('0') <= *ipat && char_type('9') >= *ipat ) { // use _cgroups_total here since the invisible groups have not been numbered yet. unsigned nbackref = parse_int( ipat, m_pat.end(), _cgroups_total() - 1 );// always at least 1 group if( 0 == nbackref || rggroups.size() <= nbackref || NULL == rggroups[ nbackref ] ) throw bad_regexpr( "invalid backreference" ); pnew = create_backref( nbackref, rggroups[nbackref]->group_width(), flags ); } else { // Is this a user-defined intrinsic character set? match_charset * pcharset = s_charset_map.get( *ipat, flags ); if( NULL != pcharset ) pnew = create_charset( *pcharset, flags ); else pnew = create_atom( ipat, flags ); ++ipat; } break; // If quotemeta, loop until we find quotemeta off or end of string case ESC_QUOTE_META_ON: for( istart = itemp = ipat, fdone = false; !fdone && ipat != m_pat.end(); ) { switch( SY::reg_token( ipat, m_pat.end() ) ) { case ESC_QUOTE_META_OFF: fdone = true; break; case NO_TOKEN: ++ipat; // fallthrough default: itemp = ipat; break; } } if( itemp != istart ) pgroup->add_item( create_atom( istart, itemp, flags ) ); // skip the quantification code below return true; // Should never get here for valid patterns case ESC_QUOTE_META_OFF: throw bad_regexpr("quotemeta turned off, but was never turned on"); default: assert( ! "Unhandled token type" ); break; } // If pnew is null, then the current subexpression is a no-op. 
if( pnew.get() )
{
    // Look for quantifiers
    _quantify( pnew, pnew_group, ipat );

    // Add the item to the group
    pgroup->add_item( pnew.release() );
}
return true;
}

// Checks whether the pattern text at ipat starts with a quantifier token (as
// reported by SY::quant_token) and, if so, replaces pnew with a quantifier
// node built around the item and moves ipat past the quantifier.
//
//   pnew        - in/out: the item just parsed. Nothing happens if it reports
//                 is_assertion(). When a quantifier is found, pnew is
//                 released and re-pointed at the newly created quantifier.
//   pnew_group  - when non-null, a *_group_quantifier is built from this
//                 pointer instead of a *_atom_quantifier from pnew.get().
//                 NOTE(review): presumably this aliases the object held by
//                 pnew when the item is a group -- confirm against the caller.
//   ipat        - in/out: current position in m_pat.
//
// Throws bad_regexpr if a range quantifier is not terminated or is ill-formed
// (including an upper bound smaller than the lower bound).
//
// NOTE(review): several template argument lists in this part of the file
// (e.g. after basic_rpattern, auto_sub_ptr, match_group, basic_string) look
// truncated -- confirm against the upstream source.
template< typename CI, typename SY >
void basic_rpattern::_quantify(
    auto_sub_ptr > & pnew,
    match_group * pnew_group,
    basic_string::char_type>::iterator & ipat )
{
    if( ipat != m_pat.end() && ! pnew->is_assertion() )
    {
        // Scan ahead with a copy of ipat; the position after the quantifier
        // is written back to ipat at the bottom of this function.
        basic_string::iterator itemp = ipat;

        // Set by the *_MIN token variants; selects the min_* quantifier
        // classes below instead of the max_* ones.
        bool fmin = false;

        // Since size_t is unsigned, -1 is really the largest size_t
        // lbound keeps this value as a "no quantifier seen" sentinel; ubound
        // keeps it whenever no explicit upper limit is supplied.
        size_t lbound = (size_t)-1;
        size_t ubound = (size_t)-1;
        size_t ubound_tmp;

        switch( SY::quant_token( itemp, m_pat.end() ) )
        {
        case ZERO_OR_MORE_MIN:
            fmin = true;
            // fall through
        case ZERO_OR_MORE:
            lbound = 0;
            break;

        case ONE_OR_MORE_MIN:
            fmin = true;
            // fall through
        case ONE_OR_MORE:
            lbound = 1;
            break;

        case ZERO_OR_ONE_MIN:
            fmin = true;
            // fall through
        case ZERO_OR_ONE:
            lbound = 0;
            ubound = 1;
            break;

        case BEGIN_RANGE:
            // parse_int consumes any leading decimal digits from itemp and
            // yields 0 when there are none.
            lbound = parse_int( itemp, m_pat.end() );
            if( itemp == m_pat.end() )
                throw bad_regexpr( "expecting end of range" );

            switch( SY::quant_token( itemp, m_pat.end() ) )
            {
            case END_RANGE_MIN:
                fmin = true;
                // fall through
            case END_RANGE:
                // Only one number was given: exact repeat count.
                ubound = lbound;
                break;

            case RANGE_SEPARATOR:
                // ipat is borrowed as a marker for where the upper bound
                // would start, so "no digits consumed" can be detected. In
                // that case ubound keeps its default (largest size_t).
                ipat = itemp;
                ubound_tmp = parse_int( itemp, m_pat.end() );
                if( itemp != ipat )
                    ubound = ubound_tmp;
                if( itemp == m_pat.end() )
                    throw bad_regexpr( "expecting end of range" );
                switch( SY::quant_token( itemp, m_pat.end() ) )
                {
                case END_RANGE_MIN:
                    fmin = true;
                    // fall through
                case END_RANGE:
                    break;
                default:
                    throw bad_regexpr( "expecting end of range" );
                }
                break;

            default:
                throw bad_regexpr( "ill-formed quantifier" );
            }

            if( ubound < lbound )
                throw bad_regexpr( "ill-formed quantifier" );

            break;
        }

        // lbound still holding the sentinel means no quantifier token was
        // recognized; pnew and the pattern position are then left alone.
        if( (size_t)-1 != lbound )
        {
            auto_sub_ptr > pquant;

            // a group quantifier is less efficient than an atom quantifier
            if( fmin )
            {
                if( pnew_group )
                    pquant = new min_group_quantifier( pnew_group, lbound, ubound );
                else
                    pquant = new min_atom_quantifier( pnew.get(), lbound, ubound );
            }
            else
            {
                if( pnew_group )
                    pquant = new max_group_quantifier( pnew_group, lbound, ubound );
                else
                    pquant = new max_atom_quantifier( pnew.get(), lbound, ubound );
            }

            // The quantifier was constructed with the item's pointer, so pnew
            // lets go of it and takes the quantifier instead.
            // NOTE(review): presumably release() gives up the pointer without
            // deleting it (auto_ptr-style) -- confirm in auto_sub_ptr.
            pnew.release();
            pnew = pquant.release();

            // Commit the position just past the quantifier.
            ipat = itemp;
        }
    }
}

// Appends a back-reference entry to m_subst_list and records that the
// substitution uses back-references (m_fuses_backrefs = true).
//
//   snode    - in/out: the string node currently being accumulated. If its
//              rlength is non-zero it is pushed first; it is then reused to
//              push the back-reference and finally reset to an empty string
//              node starting at rstart.
//   nbackref - value stored in snode.subst_backref (callers in this file pass
//              a group number, subst_node::PREMATCH or subst_node::POSTMATCH).
//   rstart   - offset at which the next string node begins.
template< typename CI, typename SY >
void basic_rpattern::_add_subst_backref( subst_node & snode, size_t nbackref, size_t rstart )
{
    m_fuses_backrefs = true;
    assert( subst_node::SUBST_STRING == snode.stype );

    // Flush any literal text gathered before the back-reference.
    if( snode.subst_string.rlength )
        m_subst_list.push_back( snode );

    snode.stype = subst_node::SUBST_BACKREF;
    snode.subst_backref = nbackref;
    m_subst_list.push_back( snode );

    // re-initialize the subst_node
    snode.stype = subst_node::SUBST_STRING;
    snode.subst_string.rstart = rstart;
    snode.subst_string.rlength = 0;
}

// Tokenizes m_subst with SY::subst_token and fills m_subst_list with nodes:
// string nodes (an rstart/rlength pair of offsets into m_subst),
// back-reference nodes and operation nodes. Resets m_fuses_backrefs to false
// first; _add_subst_backref sets it again if any back-reference is found.
//
// Throws bad_regexpr for a back-reference that parses to 0 and for an escape
// token at the very end of the substitution string.
//
// NOTE(review): from the way icur is used below, SY::subst_token appears to
// advance icur past a recognized token and to leave it untouched when it
// returns NO_TOKEN -- confirm in the syntax class.
template< typename CI, typename SY >
void basic_rpattern::_parse_subst()
{
    TOKEN tok;
    subst_node snode;
    basic_string::iterator icur = m_subst.begin();
    size_t nbackref;
    basic_string::iterator itemp;
    bool fdone;

    m_fuses_backrefs = false;

    // Initialize the subst_node
    snode.stype = subst_node::SUBST_STRING;
    snode.subst_string.rstart = 0;
    snode.subst_string.rlength = 0;

    while( icur != m_subst.end() )
    {
        switch( tok = SY::subst_token( icur, m_subst.end() ) )
        {
        case SUBST_MATCH:
            // Stored as back-reference 0.
            _add_subst_backref( snode, 0, distance( m_subst.begin(), icur ) );
            break;

        case SUBST_PREMATCH:
            _add_subst_backref( snode, subst_node::PREMATCH, distance( m_subst.begin(), icur ) );
            break;

        case SUBST_POSTMATCH:
            _add_subst_backref( snode, subst_node::POSTMATCH, distance( m_subst.begin(), icur ) );
            break;

        case SUBST_BACKREF:
            // parse_int is capped at cgroups() - 1 and stops consuming digits
            // rather than exceed that cap.
            nbackref = parse_int( icur, m_subst.end(), cgroups() - 1 ); // always at least 1 group
            if( 0 == nbackref )
                throw bad_regexpr( "invalid backreference in substitution" );

            _add_subst_backref( snode, nbackref, distance( m_subst.begin(), icur ) );
            break;

        case SUBST_QUOTE_META_ON:
            assert( subst_node::SUBST_STRING == snode.stype );
            if( snode.subst_string.rlength )
                m_subst_list.push_back( snode );

            snode.subst_string.rstart = distance( m_subst.begin(), icur );

            // Gather everything up to a SUBST_ALL_OFF token (or the end of
            // the string) as one literal run. itemp marks the end of that
            // run; it is not moved when SUBST_ALL_OFF is seen, so the
            // terminating token is excluded from the literal text.
            for( itemp = icur, fdone = false; !fdone && icur != m_subst.end(); )
            {
                switch( tok = SY::subst_token( icur, m_subst.end() ) )
                {
                case SUBST_ALL_OFF:
                    fdone = true;
                    break;
                case NO_TOKEN:
                    ++icur; // fall-through
                default:
                    // Any other token is kept as part of the literal run.
                    itemp = icur;
                    break;
                }
            }

            snode.subst_string.rlength = distance( m_subst.begin(), itemp ) - snode.subst_string.rstart;
            if( snode.subst_string.rlength )
                m_subst_list.push_back( snode );

            // tok holds the last token returned inside the loop above (or is
            // still SUBST_QUOTE_META_ON if the loop body never ran).
            if( tok == SUBST_ALL_OFF )
            {
                snode.stype = subst_node::SUBST_OP;
                snode.op = subst_node::ALL_OFF;
                m_subst_list.push_back( snode );
            }

            // re-initialize the subst_node
            snode.stype = subst_node::SUBST_STRING;
            snode.subst_string.rstart = distance( m_subst.begin(), icur );
            snode.subst_string.rlength = 0;
            break;

        case SUBST_UPPER_ON:
        case SUBST_UPPER_NEXT:
        case SUBST_LOWER_ON:
        case SUBST_LOWER_NEXT:
        case SUBST_ALL_OFF:
            assert( subst_node::SUBST_STRING == snode.stype );
            if( snode.subst_string.rlength )
                m_subst_list.push_back( snode );

            snode.stype = subst_node::SUBST_OP;
            // NOTE(review): this cast assumes the subst_node::op_type
            // enumerators share their values with these TOKEN values --
            // confirm in the header.
            snode.op = (subst_node::op_type) tok;
            m_subst_list.push_back( snode );

            // re-initialize the subst_node
            snode.stype = subst_node::SUBST_STRING;
            snode.subst_string.rstart = distance( m_subst.begin(), icur );
            snode.subst_string.rlength = 0;
            break;

        case SUBST_ESCAPE:
            if( icur == m_subst.end() )
                throw bad_regexpr("expecting escape sequence in substitution string");
            assert( subst_node::SUBST_STRING == snode.stype );
            if( snode.subst_string.rlength )
                m_subst_list.push_back( snode );

            // Start a new string node at the character after the escape
            // token and step over it; later ordinary characters extend this
            // node through the NO_TOKEN case.
            snode.subst_string.rstart = distance( m_subst.begin(), icur++ );
            snode.subst_string.rlength = 1;
            break;

        case NO_TOKEN:
        default:
            // Ordinary character: extend the current string node by one.
            ++snode.subst_string.rlength;
            ++icur;
            break;
        }
    }

    // Flush whatever literal text is still pending.
    assert( subst_node::SUBST_STRING == snode.stype );
    if( snode.subst_string.rlength )
        m_subst_list.push_back( snode );
}

// Out-of-class definition of the static data member s_charset_map.
template< typename CI, typename SY >
basic_rpattern::charset_map basic_rpattern::s_charset_map;

// Pass in an iterator to one after the opening bracket of the character set.
// On return, icur points to one character after the closing bracket template< typename CI, typename SY > sub_expr * create_charset_helper::create_charset_aux( basic_string::value_type> & str, basic_string::value_type>::iterator & icur, unsigned flags ) { bool fcomplement = false; match_charset * pnew = NULL; basic_string::value_type>::iterator itemp = icur; if( itemp != str.end() && CHARSET_NEGATE == SY::charset_token( itemp, str.end() ) ) { fcomplement = true; icur = itemp; } switch( ( NOCASE | CSTRINGS ) & flags ) { case 0: pnew = new match_custom_charset_t,match_range_with_case>( fcomplement, icur, str.end(), flags, SY() ); break; case NOCASE: pnew = new match_custom_charset_t,match_range_no_case>( fcomplement, icur, str.end(), flags, SY() ); break; case CSTRINGS: pnew = new match_custom_charset_t,match_range_with_case>( fcomplement, icur, str.end(), flags, SY() ); break; case NOCASE | CSTRINGS: pnew = new match_custom_charset_t,match_range_no_case>( fcomplement, icur, str.end(), flags, SY() ); break; default: __assume(0); // tells the compiler that this is unreachable } return pnew; } #pragma warning( disable : 4660 ) // Explicit instantiation #ifdef REGEX_FORCE_INSTANTIATION template class basic_regexpr; template class basic_regexpr; #else template class basic_regexpr; #endif #ifndef NO_PERL_RE #ifdef REGEX_FORCE_INSTANTIATION template class basic_rpattern; template class basic_rpattern; template class basic_rpattern; template class basic_rpattern; #else template class basic_rpattern; template class basic_rpattern; #endif #endif #ifdef POSIX_RE #ifdef REGEX_FORCE_INSTANTIATION template class basic_rpattern >; template class basic_rpattern >; template class basic_rpattern >; template class basic_rpattern >; #else template class basic_rpattern >; template class basic_rpattern >; #endif #endif } // namespace regex