|
|
//+---------------------------------------------------------------------------
//
// File: basic_regexpr.cxx
//
// Contents:
//
// Classes:
//
// Functions:
//
// Coupling:
//
// Notes:
//
// History: 1-11-1999 ericne Created
//
//----------------------------------------------------------------------------
#include "stdafx.h"
#pragma hdrstop
// unlimited inline expansion (compile with /Ob1 or /Ob2)
#pragma inline_depth(255)
// C4355 'this' : used in base member initializer list
// C4660 template-class specialization 'foo<bar>' is already instantiated
// C4786 identifier was truncated to '255' characters in the debug information
// C4800 'int' : forcing value to bool 'true' or 'false' (performance warning)
#pragma warning( disable : 4355 4660 4786 4800 )
#include <assert.h>
#include <malloc.h> // for _alloca
#include <algorithm>
#include <minmax.h>
#include "regexpr.h"
using namespace std;
namespace regex {
#ifdef _MT
// Global critical section used to synchronize the creation of static const patterns
CRegExCritSect g_objRegExCritSect; #endif
// For use while doing uppercase/lowercase conversions:
// For use while doing uppercase/lowercase conversions:
inline char to_upper( char ch ) { return ( char )toupper(ch); } inline char to_lower( char ch ) { return ( char )tolower(ch); } inline wint_t to_upper( wint_t ch ) { return (wint_t)towupper(ch); } inline wint_t to_lower( wint_t ch ) { return (wint_t)towlower(ch); } template< typename II, typename CI > void to_upper( II ibegin, CI iend ) { for( ; (CI)ibegin != iend; ++ibegin ) *ibegin = to_upper( *ibegin ); } template< typename II, typename CI > void to_lower( II ibegin, CI iend ) { for( ; (CI)ibegin != iend; ++ibegin ) *ibegin = to_lower( *ibegin ); }
template< typename II, typename CI > unsigned parse_int( II & istr, CI iend, const unsigned m_max = -1 ) { unsigned retval = 0; while( (CI)istr != iend && '0' <= *istr && '9' >= *istr && m_max > retval ) { retval = retval * 10 + ( (unsigned)*istr++ - (unsigned)'0' ); } if( m_max < retval ) { retval /= 10; --istr; } return retval; }
// This class is used to speed up character set matching by providing
// a bitset that spans the ASCII range. std::bitset is not used because
// the range-checking slows it down.
// Note: The division and modulus operations are optimized by the compiler
// into bit-shift operations.
class ascii_bitvector { typedef unsigned __int32 elem_type; // use 32-bit ints on 32-bit platforms
//typedef unsigned __int64 elem_type; // use 64-bit ints on 64-bit platforms
enum { CBELEM = 8 * sizeof elem_type, // count of bytes per element
CELEMS = (UCHAR_MAX+1) / CBELEM }; // number of element in array
elem_type m_rg[ CELEMS ];
// Used to inline operations like: bv1 |= ~bv2; without creating temp bit vectors.
struct not_ascii_bitvector { const ascii_bitvector & m_ref; not_ascii_bitvector( const ascii_bitvector & ref ) throw() : m_ref(ref) {} }; public: ascii_bitvector() throw() { memset( m_rg, 0, CELEMS * sizeof elem_type ); } inline void set( unsigned char ch ) throw() { m_rg[ ( ch / CBELEM ) ] |= ( (elem_type)1U << ( ch % CBELEM ) ); } inline bool operator[]( unsigned char ch ) const throw() { return 0 != ( m_rg[ ( ch / CBELEM ) ] & ( (elem_type)1U << ( ch % CBELEM ) ) ); } inline not_ascii_bitvector operator~() const throw() { return not_ascii_bitvector(*this); } inline ascii_bitvector & operator|=( const ascii_bitvector & that ) throw() { for( int i=0; i<CELEMS; ++i ) m_rg[i] |= that.m_rg[i]; return *this; } inline ascii_bitvector & operator|=( const not_ascii_bitvector & that ) throw() { for( int i=0; i<CELEMS; ++i ) m_rg[i] |= ~that.m_ref.m_rg[i]; return *this; } };
const ascii_bitvector & get_digit_vector(void) { // 0-9
class digit_vector : public ascii_bitvector { public: digit_vector() { unsigned char ich; for( ich ='0'; ich <= '9'; ++ich ) set(ich); } };
static const digit_vector s_digit_vector; return s_digit_vector; }
const ascii_bitvector & get_word_vector(void) { // a-zA-Z_0-9
class word_vector : public ascii_bitvector { public: word_vector() { unsigned char ich; for( ich = 'a'; ich <= 'z'; ++ich ) set(ich); for( ich = 'A'; ich <= 'Z'; ++ich ) set(ich); for( ich = '0'; ich <= '9'; ++ich ) set(ich); set('_'); } };
static const word_vector s_word_vector; return s_word_vector; }
const ascii_bitvector & get_space_vector(void) { // " \t\r\n\f"
class space_vector : public ascii_bitvector { public: space_vector() { set(' '); set('\t'); set('\v'); set('\r'); set('\n'); set('\f'); } };
static const space_vector s_space_vector; return s_space_vector; }
//
// Operator implementations
//
// Base type used so that all derived operators share typedefs.
template< typename CI > struct op_t : public binary_function<match_param<CI>,CI,bool> { typedef CI const_iterator; typedef typename iterator_traits<CI>::value_type char_type; };
// Evaluates the beginning-of-string condition
template< typename CI > struct bos_t : public op_t<CI> { inline bool operator()( const match_param<CI> & param, CI iter ) const { return param.ibegin == iter; } };
// Find the beginning of a line, either beginning of a string, or the character
// immediately following a newline
template< typename CI > struct bol_t : public bos_t<CI> { inline bool operator()( const match_param<CI> & param, CI iter ) const { return bos_t<CI>::operator()(param,iter) || char_type('\n') == *--iter; } };
// Evaluates end-of-string condition for string's
template< typename CI > struct eos_t : public op_t<CI> { inline bool operator()( const match_param<CI> & param, CI iter ) const { return param.istop == iter; } };
// Evaluates end-of-string condidition for C-style string's when the length is unknown by
// looking for the null-terminator.
template< typename CI > struct eocs_t : public op_t<CI> { inline bool operator()( const match_param<CI> & param, CI iter ) const { return char_type('\0') == *iter; } };
// Evaluates end-of-line conditions, either the end of the string, or a
// return or newline character.
template< typename EOS > struct eol_t_t : public EOS { typedef typename EOS::const_iterator CI; inline bool operator()( const match_param<CI> & param, CI iter ) const { return EOS::operator()(param,iter) || char_type('\n') == *iter || char_type('\r') == *iter; } };
template< typename CI > struct eol_t : public eol_t_t<eos_t<CI> > {}; template< typename CI > struct eocl_t : public eol_t_t<eocs_t<CI> > {};
// Evaluates perl's end-of-string conditions, either the end of the string, or a
// newline character followed by end of string. (Only used by $ and /Z assertions)
template< typename EOS > struct peos_t_t : public EOS { typedef typename EOS::const_iterator CI; inline bool operator()( const match_param<CI> & param, CI iter ) const { return EOS::operator()(param,iter) || ( ( char_type('\n') == *iter ) && EOS::operator()(param,++iter) ); } };
template< typename CI > struct peos_t : public peos_t_t<eos_t<CI> > {}; template< typename CI > struct peocs_t : public peos_t_t<eocs_t<CI> > {};
// compare two characters, case-sensitive
template< typename CH > struct ch_neq_t : public binary_function<CH, CH, bool> { typedef CH char_type; inline bool operator()( register CH ch1, register CH ch2 ) const { return ch1 != ch2; } };
// Compare two characters, disregarding case
template< typename CH > struct ch_neq_nocase_t : public binary_function<CH, CH, bool> { typedef CH char_type; inline bool operator()( register CH ch1, register CH ch2 ) const { return to_upper(ch1) != to_upper(ch2); } };
//
// Helper functions for match and substitute
//
template< typename CI > size_t string_length( CI iter ) { size_t n = 0; while( 0 != *iter++ ) ++n; return n; }
template< typename CI > backref_tag<CI> _do_match( const basic_rpattern_base<CI> & pat, match_param<CI> & param ) throw() { typedef typename iterator_traits<CI>::value_type char_type; bool floop = pat.loops(); unsigned flags = pat.flags(); width_type nwidth = pat.get_width();
const sub_expr<CI> * pfirst = pat.get_first_subexpression();
try { vector<backref_tag<CI> > rgbackrefs; // dummy backref vector
if( NULL == param.prgbackrefs ) param.prgbackrefs = & rgbackrefs;
param.prgbackrefs->resize( pat._cgroups_total() ); fill( param.prgbackrefs->begin(), param.prgbackrefs->end(), backref_tag<CI>() );
// If a pattern is optimized for CSTRINGS, it can save a call
// to calculate the length of the string.
if( CI(0) == param.istop && ( ( RIGHTMOST & flags ) || ( 0 == ( CSTRINGS & flags ) ) ) ) param.istop = param.istart + string_length( param.istart );
if( CI(0) != param.istop ) { // If the minimum width of the pattern exceeds the width of the
// string, a succesful match is impossible
if( nwidth.m_min <= (size_t)distance( param.istart, param.istop ) ) { CI local_istop = param.istop; advance( local_istop, -int( nwidth.m_min ) );
if( RIGHTMOST & flags ) { // begin trying to match after the last character.
// Continue to the beginning
for( CI icur = local_istop; icur >= param.istart; --icur ) if( pfirst->domatch( param, icur ) ) break; // m_floop not used for rightmost matches
} else { // begin trying to match before the first character.
// Continue to the end
for( CI icur = param.istart; icur <= local_istop; ++icur ) if( pfirst->domatch( param, icur ) || ! floop ) break; } } } else { // begin trying to match before the first character.
// Continue to the end
for( CI icur = param.istart; ; ++icur ) { if( pfirst->domatch( param, icur ) || ! floop ) break; if( char_type('\0') == *icur ) break; } } } catch(...) // bad alloc, stack overflow?
{ fill( param.prgbackrefs->begin(), param.prgbackrefs->end(), backref_tag<CI>() ); }
// Shrink the backref vector to chop off information about the "invisible" groups
param.prgbackrefs->resize( pat.cgroups() );
return (*param.prgbackrefs)[0]; }
template< typename CI, typename CH, typename TR, typename AL > size_t _do_subst( basic_regexpr<CH,TR,AL> & str, const basic_rpattern_base<CI> & pat, size_t strpos, size_t strlen ) throw(bad_alloc) { typedef iterator_traits<CI>::value_type char_type; typedef list<subst_node>::const_iterator LCI; enum { UPPER = -1, NIL, LOWER } next = NIL, rest = NIL; bool first = true; size_t old_strpos = strpos; const list<subst_node> & subst_list = pat.get_subst_list(); basic_string<CH,TR,AL>::iterator itstrlen = str.begin(); advance( itstrlen, strpos + strlen ); const basic_string<char_type> & subst = pat.get_subst(); push_new_handler pnh( &my_new_handler ); for( LCI isubst = subst_list.begin(); isubst != subst_list.end(); ++isubst ) { size_t sublen; basic_string<CH,TR,AL>::const_iterator itsubpos1; // iter into str
basic_string<CH,TR,AL>::const_iterator itsublen1; basic_string<char_type>::const_iterator itsubpos2; // iter into subst string
basic_string<char_type>::const_iterator itsublen2; basic_string<CH,TR,AL>::iterator itstrpos = str.begin(); advance( itstrpos, strpos );
switch( isubst->stype ) { case subst_node::SUBST_STRING: itsubpos2 = subst.begin(); advance( itsubpos2, isubst->subst_string.rstart ); itsublen2 = itsubpos2; advance( itsublen2, isubst->subst_string.rlength );
first ? str.replace( itstrpos, itstrlen, itsubpos2, itsublen2 ) : str.insert( itstrpos, itsubpos2, itsublen2 ); sublen = distance( itsubpos2, itsublen2 ); break;
case subst_node::SUBST_BACKREF: switch( isubst->subst_backref ) { case subst_node::PREMATCH: itsubpos1 = str.backref_str().begin(); itsublen1 = itsubpos1; advance( itsublen1, sublen = str.rstart() ); break; case subst_node::POSTMATCH: itsubpos1 = str.backref_str().begin(); advance( itsubpos1, str.rstart() + str.rlength() ); itsublen1 = str.backref_str().end(); break; default: itsubpos1 = str.backref_str().begin(); advance( itsubpos1, str.rstart( isubst->subst_backref ) ); itsublen1 = itsubpos1; advance( itsublen1, str.rlength( isubst->subst_backref ) ); break; }
first ? str.replace( itstrpos, itstrlen, itsubpos1, itsublen1 ) : str.insert( itstrpos, itsubpos1, itsublen1 ); sublen = distance( itsubpos1, itsublen1 ); break;
case subst_node::SUBST_OP: switch( isubst->op ) { case subst_node::UPPER_ON: rest = UPPER; break; case subst_node::UPPER_NEXT: next = UPPER; break; case subst_node::LOWER_ON: rest = LOWER; break; case subst_node::LOWER_NEXT: next = LOWER; break; case subst_node::ALL_OFF: rest = NIL; break; default: __assume(0); } continue; // jump to the next item in the list
default: __assume(0); }
first = false;
// Are we upper- or lower-casing this string?
if( rest ) { basic_string<CH,TR,AL>::iterator istart = str.begin(); advance( istart, strpos ); basic_string<CH,TR,AL>::const_iterator istop = istart; advance( istop, sublen ); switch( rest ) { case UPPER: to_upper( istart, istop ); break; case LOWER: to_lower( istart, istop ); break; default: __assume(0); } }
// Are we upper- or lower-casing the next character?
if( next ) { switch( next ) { case UPPER: str[strpos] = to_upper(str[strpos]); break; case LOWER: str[strpos] = to_lower(str[strpos]); break; default: __assume(0); } next = NIL; }
strpos += sublen; }
// If *first* is still true, then we never called str.replace, and the substitution
// string is empty. Erase the part of the string that the pattern matched.
if( first ) str.erase( strpos, strlen );
// return length of the substitution
return strpos - old_strpos; }
//
// Implementation of basic_regexpr
//
template< typename CH, typename TR, typename AL > size_t basic_regexpr<CH,TR,AL>::substitute( const basic_rpattern_base<basic_regexpr<CH,TR,AL>::const_iterator> & pat, size_type pos, size_type len ) throw(bad_alloc) { if( pat.flags() & CSTRINGS ) { assert( ! "You can't use a pattern optimized for CSTRINGS with regexpr::substitute" ); return 0; }
backref_vector rgbackrefs; // dummy backref vector
backref_vector * prgbackrefs = & rgbackrefs; const bool fsave_backrefs = ( pat.uses_backrefs() || !( pat.flags() & NOBACKREFS ) );
if( fsave_backrefs ) { prgbackrefs = & m_rgbackrefs; m_pbackref_str = & ( m_backref_str = *this ); } else { m_backref_str.erase(); m_pbackref_str = this; m_rgbackrefs.resize( 0 ); }
backref_type br; size_t csubst = 0; long stop_offset = ( len == npos ? m_pbackref_str->size() : min( pos + len, m_pbackref_str->size() ) );
match_param<const_iterator> param( m_pbackref_str->begin(), m_pbackref_str->begin(), prgbackrefs ); advance( param.istart, pos ); advance( param.istop, stop_offset ); param.ibegin = param.istart;
if( GLOBAL & pat.flags() ) { const bool fAll = ( ALLBACKREFS == ( ALLBACKREFS & pat.flags() ) ); const bool fFirst = ( FIRSTBACKREFS == ( FIRSTBACKREFS & pat.flags() ) ); backref_vector rgtempbackrefs; // temporary vector used if fsave_backrefs
long pos_offset = 0; // keep track of how much the backref_str and
// the current string are out of sync
while( br = _do_match( pat, param ) ) { ++csubst; size_type match_length = distance( br.first, br.second ); pos = distance( m_pbackref_str->begin(), br.first ); size_type subst_length = _do_subst( *this, pat, pos + pos_offset, match_length );
if( fsave_backrefs ) { pos += match_length; pos_offset += ( subst_length - match_length );
// Handle specially the backref flags
if( fFirst ) rgtempbackrefs.push_back( br ); else if( fAll ) rgtempbackrefs.insert( rgtempbackrefs.end(), param.prgbackrefs->begin(), param.prgbackrefs->end() ); else rgtempbackrefs.swap( *param.prgbackrefs ); } else { pos += subst_length; stop_offset += ( subst_length - match_length );
// we're not saving backref information, so we don't
// need to do any special backref maintenance here
} // prevent a pattern that matches 0 characters from matching
// again at the same point in the string
if( 0 == match_length ) { if( br.first == param.istop ) // We're at the end, so we're done
break; ++pos; }
param.istart = m_pbackref_str->begin(); advance( param.istart, pos ); // ineffecient for bidirectional iterators.
param.istop = m_pbackref_str->begin(); advance( param.istop, stop_offset ); // ineffecient for bidirectional iterators.
}
// If we did special backref handling, swap the backref vectors
if( fsave_backrefs && ( !br || fFirst || fAll ) ) param.prgbackrefs->swap( rgtempbackrefs ); else if( ! (*param.prgbackrefs)[0] ) param.prgbackrefs->clear(); } else if( br = _do_match( pat, param ) ) { ++csubst; _do_subst( *this, pat, distance( m_pbackref_str->begin(), br.first ), distance( br.first, br.second ) ); }
if( NOBACKREFS == ( pat.flags() & NOBACKREFS ) ) param.prgbackrefs->clear();
return csubst; }
//
// Helper functions called from both basic_regexpr match methods
//
template< typename EOS > backref_tag< typename EOS::const_iterator > _match_helper( const basic_rpattern_base<typename EOS::const_iterator> & pat, match_param<typename EOS::const_iterator> & param, EOS eos ) { typedef typename EOS::const_iterator CI;
if( GLOBAL & pat.flags() ) // do a global find
{ // The NOBACKREFS flag is ignored in the match method.
const bool fAll = ( ALLBACKREFS == ( ALLBACKREFS & pat.flags() ) ); const bool fFirst = ( FIRSTBACKREFS == ( FIRSTBACKREFS & pat.flags() ) );
backref_tag<CI> br; vector<backref_tag<CI> > rgtempbackrefs; while( br = _do_match( pat, param ) ) { // Handle specially the backref flags
if( fFirst ) rgtempbackrefs.push_back( br ); else if( fAll ) rgtempbackrefs.insert( rgtempbackrefs.end(), param.prgbackrefs->begin(), param.prgbackrefs->end() ); else rgtempbackrefs.swap( *param.prgbackrefs );
if( br.first == ( param.istart = br.second ) ) { if( eos( param, param.istart ) ) break; ++param.istart; } }
// restore the backref vectors
if( !br || fFirst || fAll ) param.prgbackrefs->swap( rgtempbackrefs ); else if( ! (*param.prgbackrefs)[0] ) param.prgbackrefs->clear();
return param.prgbackrefs->empty() ? backref_tag<CI>() : (*param.prgbackrefs)[0]; } else return _do_match( pat, param ); }
template< typename CH, typename TR, typename AL > basic_regexpr<CH,TR,AL>::backref_type basic_regexpr<CH,TR,AL>::match( const basic_rpattern_base<const_iterator> & pat, size_type pos, size_type len ) const throw() { if( pat.flags() & CSTRINGS ) { assert( ! "A pattern optimized for CSTRINGS can only be used with the static regexpr::match method" ); return backref_type(); }
m_pbackref_str = this; m_backref_str.erase(); // free up unused memory
const_iterator istart = begin(); advance( istart, pos );
const_iterator istop; if( len == npos || pos + len >= size() ) istop = end(); else advance( istop = begin(), pos + len );
match_param<const_iterator> param( istart, istop, & m_rgbackrefs ); return _match_helper<eos_t<const_iterator> >( pat, param, eos_t<const_iterator>() ); }
template< typename CH > backref_tag<const CH *> _static_match_helper( const CH * szstr, const basic_rpattern_base<const CH *> & pat, vector< backref_tag< const CH * > > * prgbackrefs ) throw() { vector< backref_tag< const CH * > > rgdummyvector; if( NULL == prgbackrefs ) prgbackrefs = &rgdummyvector;
match_param<const CH *> param( szstr, NULL, prgbackrefs ); return _match_helper<eocs_t<const CH *> >( pat, param, eocs_t<const CH *>() ); }
//
// Helper function called from both basic_regexpr::count methods
//
template< typename EOS > size_t _count_helper( const basic_rpattern_base<typename EOS::const_iterator> & pat, match_param<typename EOS::const_iterator> & param, EOS eos ) { typedef typename EOS::const_iterator CI;
size_t cmatches = 0; vector<backref_tag<CI> > rgbackrefs; // dummy backref vector
backref_tag<CI> br; param.prgbackrefs = &rgbackrefs;
while( br = _do_match( pat, param ) ) { ++cmatches;
if( br.first == ( param.istart = br.second ) ) { if( eos( param, param.istart ) ) break; ++param.istart; } } return cmatches; }
template< typename CH, typename TR, typename AL > size_t basic_regexpr<CH,TR,AL>::count( const basic_rpattern_base<basic_regexpr<CH,TR,AL>::const_iterator> & pat, size_type pos, size_type len ) const throw() { if( pat.flags() & CSTRINGS ) { assert( ! "A pattern optimized for CSTRINGS can only be used with the static regexpr::count method" ); return backref_type(); }
m_pbackref_str = this;
const_iterator istart = begin(); advance( istart, pos );
const_iterator istop; if( len == npos || pos + len >= size() ) istop = end(); else advance( istop = begin(), pos + len );
match_param<const_iterator> param( istart, istop, NULL ); return _count_helper<eos_t<const_iterator> >( pat, param, eos_t<const_iterator>() ); }
template< typename CH > size_t _static_count_helper( const CH * szstr, const basic_rpattern_base<const CH *> & pat ) throw() { match_param<const CH *> param( szstr, NULL, NULL ); return _count_helper<eocs_t<const CH *> >( pat, param, eocs_t<const CH *>() ); }
// Base class for sub-expressions which are zero-width
// (i.e., assertions eat no characters during matching)
// Assertions cannot be quantified.
template< typename CI > class assertion : public sub_expr<CI> { public: virtual ~assertion() {} virtual bool is_assertion() const throw() { return true; } protected: virtual width_type _width_this() throw() { return width_type(0,0); } };
template< typename OP > class assert_op : public assertion<typename OP::const_iterator> { public: typedef OP op_type; typedef typename OP::const_iterator CI; virtual ~assert_op() {} protected: virtual bool _match_this( match_param<CI> & param, CI & icur ) const throw() { return m_op( param, icur ); } op_type m_op; };
template< typename CI > assertion<CI> * create_bos( unsigned /*flags*/ ) { return new assert_op<bos_t<CI> >(); }
template< typename CI > assertion<CI> * create_eos( unsigned flags ) { switch( CSTRINGS & flags ) { case 0: return new assert_op<peos_t<CI> >(); case CSTRINGS: return new assert_op<peocs_t<CI> >(); default: __assume(0); // tells the compiler that this is unreachable
} }
template< typename CI > assertion<CI> * create_eoz( unsigned flags ) { switch( CSTRINGS & flags ) { case 0: return new assert_op<eos_t<CI> >(); case CSTRINGS: return new assert_op<eocs_t<CI> >(); default: __assume(0); // tells the compiler that this is unreachable
} }
template< typename CI > assertion<CI> * create_bol( unsigned flags ) { switch( MULTILINE & flags ) { case 0: return new assert_op<bos_t<CI> >(); case MULTILINE: return new assert_op<bol_t<CI> >(); default: __assume(0); // tells the compiler that this is unreachable
} }
template< typename CI > assertion<CI> * create_eol( unsigned flags ) { switch( ( MULTILINE | CSTRINGS ) & flags ) { case 0: return new assert_op<peos_t<CI> >(); case MULTILINE: return new assert_op<eol_t<CI> >(); case CSTRINGS: return new assert_op<peocs_t<CI> >(); case MULTILINE | CSTRINGS: return new assert_op<eocl_t<CI> >(); default: __assume(0); // tells the compiler that this is unreachable
} }
template< typename CI > class match_atom : public sub_expr<CI> { public: match_atom( const basic_string<sub_expr<CI>::char_type>::iterator istart, basic_string<sub_expr<CI>::char_type>::const_iterator istop ) : m_istart( istart ), m_istop( istop ) {} virtual ~match_atom() {}
const basic_string<sub_expr<CI>::char_type>::iterator m_istart; basic_string<sub_expr<CI>::char_type>::const_iterator m_istop; protected: virtual width_type _width_this() throw() { size_t width = distance( (basic_string<sub_expr<CI>::char_type>::const_iterator)m_istart, m_istop ); return width_type( width, width ); } };
template< typename EOS > class match_atom_t : public match_atom<typename EOS::const_iterator> { public: typedef EOS eos_type; typedef typename EOS::const_iterator CI; match_atom_t( const basic_string<sub_expr<CI>::char_type>::iterator istart, basic_string<sub_expr<CI>::char_type>::const_iterator istop ) : match_atom<CI>( istart, istop ) {} virtual ~match_atom_t() {} protected: virtual bool _match_this( match_param<CI> & param, CI & icur ) const throw() { CI icur_tmp = icur; basic_string<sub_expr<CI>::char_type>::const_iterator ithis = m_istart; for( ; ithis != m_istop; ++icur_tmp, ++ithis ) { if( m_eos( param, icur_tmp ) || *ithis != *icur_tmp ) return false; } icur = icur_tmp; return true; } eos_type m_eos; };
template< typename EOS > class match_atom_nocase_t : public match_atom<typename EOS::const_iterator> { public: typedef EOS eos_type; typedef typename EOS::const_iterator CI; match_atom_nocase_t( const basic_string<sub_expr<CI>::char_type>::iterator istart, basic_string<sub_expr<CI>::char_type>::const_iterator istop ) : match_atom<CI>( istart, istop ), m_strlower( (basic_string<sub_expr<CI>::char_type>::const_iterator)istart, istop ) { // Store the uppercase version of the atom in [m_istart,m_istop).
to_upper( m_istart, m_istop ); // Store the lowercase version of the atom in m_strlower.
to_lower( m_strlower.begin(), m_strlower.end() ); } virtual ~match_atom_nocase_t() {}
protected: virtual bool _match_this( match_param<CI> & param, CI & icur ) const throw() { CI icur_tmp = icur; basic_string<sub_expr<CI>::char_type>::const_iterator ithisu = m_istart; // uppercase
basic_string<sub_expr<CI>::char_type>::const_iterator ithisl = m_strlower.begin(); // lowercase
for( ; ithisu != m_istop; ++icur_tmp, ++ithisu, ++ithisl ) { if( m_eos( param, icur_tmp ) || ( *ithisu != *icur_tmp && *ithisl != *icur_tmp ) ) return false; } icur = icur_tmp; return true; } eos_type m_eos; basic_string<sub_expr<CI>::char_type> m_strlower; };
template< typename CI > match_atom<CI> * create_atom( const basic_string<iterator_traits<CI>::value_type>::iterator istart, basic_string<iterator_traits<CI>::value_type>::const_iterator istop, unsigned flags ) { switch( ( NOCASE | CSTRINGS ) & flags ) { case 0: return new match_atom_t<eos_t<CI> >( istart, istop ); case NOCASE: return new match_atom_nocase_t<eos_t<CI> >( istart, istop ); case CSTRINGS: return new match_atom_t<eocs_t<CI> >( istart, istop ); case NOCASE | CSTRINGS: return new match_atom_nocase_t<eocs_t<CI> >( istart, istop ); default: __assume(0); // tells the compiler that this is unreachable
} }
template< typename CI > match_atom<CI> * create_atom( const basic_string<iterator_traits<CI>::value_type>::iterator istart, unsigned flags ) { basic_string<iterator_traits<CI>::value_type>::const_iterator istop = istart; return create_atom<CI>( istart, ++istop, flags ); }
template< typename CI > class match_any : public sub_expr<CI> { public: virtual ~match_any() {} protected: virtual width_type _width_this() throw() { return width_type(1,1); } };
template< typename EOS > class match_any_t : public match_any<typename EOS::const_iterator> { public: typedef EOS eos_type; typedef typename EOS::const_iterator CI; virtual ~match_any_t() {} protected: virtual bool _match_this( match_param<CI> & param, CI & icur ) const throw() { if( m_eos( param, icur ) ) return false; ++icur; return true; } eos_type m_eos; };
template< typename CI > match_any<CI> * create_any( unsigned flags ) { switch( ( SINGLELINE | CSTRINGS ) & flags ) { case 0: return new match_any_t<eol_t<CI> >(); case SINGLELINE: return new match_any_t<eos_t<CI> >(); case CSTRINGS: return new match_any_t<eocl_t<CI> >(); case SINGLELINE | CSTRINGS: return new match_any_t<eocs_t<CI> >(); default: __assume(0); // tells the compiler that this is unreachable
} }
typedef pair<wint_t,wint_t> range_type; const vector<range_type> g_rgranges; // empty
template< typename CI > class match_charset : public sub_expr<CI> { public: match_charset( bool fcomplement, const ascii_bitvector & bvect ) : m_fcomplement( fcomplement ), m_rgascii( bvect ), m_rgranges( g_rgranges ), m_ncharflags(0) {}
// Note that only the references are copied here -- they are not ref counted.
// Beware of variable lifetime issues.
match_charset( const match_charset<CI> & that ) : m_fcomplement( that.m_fcomplement ), m_rgascii( that.m_rgascii ), m_rgranges( that.m_rgranges ), m_ncharflags( that.m_ncharflags ) {}
virtual ~match_charset() {}
const bool m_fcomplement; const ascii_bitvector & m_rgascii; // bitmap for chars in range 0-255
const vector<range_type> & m_rgranges; // vector of included character ranges 256-65535
int m_ncharflags; // Parameter to isctype()
// The case-sensitivity of a character set is "compiled" into the ascii_bitvector
// but not into the range vector because it is too computationally expensive. Instead,
// when doing a unicode case-insensitive match on the ranges vector, two lookups
// must be performed -- one lowercase and one uppercase. By contrast, only one lookup
// is needed for the ascii_bitvector.
protected:
match_charset( bool fcomplement, const ascii_bitvector & bvect, const vector<range_type> & rgranges ) : m_fcomplement( fcomplement ), m_rgascii( bvect ), m_rgranges( rgranges ), m_ncharflags(0) {}
// this method should never be called. match_charset is only a base class
// for match_charset_t
virtual bool _match_this( match_param<CI> & param, CI & icur ) const throw() { assert(false); return true; }
template< typename SY > match_charset<CI> * get_altern_charset( char_type ch, unsigned flags, SY /*sy*/ ) const throw() { return basic_rpattern<CI,SY>::s_charset_map.get( ch, flags ); } virtual width_type _width_this() throw() { return width_type(1,1); } };
// Used as a template parameter to find a unicode character in an array of ranges.
class match_range : public unary_function<wint_t,bool> { protected: const vector<range_type> & m_rgranges;
// determines if one range is less then another.
// used in binary search of range vector
inline static bool _range_less( const range_type & rg1, const range_type & rg2 ) throw() { return rg1.second < rg2.first; }
match_range( const vector<range_type> & rgranges ) : m_rgranges( rgranges ) {} };
class match_range_with_case : public match_range { public: match_range_with_case( const vector<range_type> & rgranges ) : match_range( rgranges ) {} inline bool operator()( wint_t ch ) const throw() { return binary_search( m_rgranges.begin(), m_rgranges.end(), range_type(ch,ch), _range_less ); } };
class match_range_no_case : public match_range { public: match_range_no_case( const vector<range_type> & rgranges ) : match_range( rgranges ) {} inline bool operator()( wint_t ch ) const throw() { const wint_t chup = towupper( ch ); if( binary_search( m_rgranges.begin(), m_rgranges.end(), range_type(chup,chup), _range_less ) ) return true;
const wint_t chlo = towlower( ch ); if( chup != chlo && binary_search( m_rgranges.begin(), m_rgranges.end(), range_type(chlo,chlo), _range_less ) ) return true;
return false; } };
template< typename EOS, typename RGM > class match_charset_t : public match_charset<typename EOS::const_iterator> { public: typedef EOS eos_type; typedef RGM range_match_type; typedef typename EOS::const_iterator CI;
match_charset_t( const match_charset<CI> & that ) : match_charset<CI>( that ), m_rgm( m_rgranges ) {} virtual ~match_charset_t() {}
inline bool is_in_charset( char_type ch ) const throw() { return m_fcomplement != _is_in_charset( ch ); }
protected: match_charset_t( bool fcomplement, const ascii_bitvector & bvect, const vector<range_type> & rgranges ) : match_charset<CI>( fcomplement, bvect, rgranges ), m_rgm( m_rgranges ) {}
// Note overloading based on parameter
inline bool _is_in_charset( char ch ) const throw() { return ( m_rgascii[ unsigned char(ch) ] ) || ( m_ncharflags && ( _pctype[unsigned char(ch)] & m_ncharflags ) ); }
// Note overloading based on parameter
inline bool _is_in_charset( wint_t ch ) const throw() { if( UCHAR_MAX >= ch ) return _is_in_charset( char(ch) );
// use range_match_type to see if this character is within one of the
// ranges stored in m_rgranges.
return ( ! m_rgranges.empty() && m_rgm( ch ) ) || ( m_ncharflags && iswctype( ch, (int)m_ncharflags ) ); }
virtual bool _match_this( match_param<CI> & param, CI & icur ) const throw() { if( m_eos( param, icur ) || ! is_in_charset( *icur ) ) return false; ++icur; return true; }
// range_match_type encapsulates the case-sensitivity
// issues with doing a unicode lookup on the ranges vector.
range_match_type m_rgm; eos_type m_eos; };
template< typename EOS, typename RGM > class match_custom_charset_t : public match_charset_t<EOS,RGM> { public: template< typename SY > match_custom_charset_t( bool fcomplement, basic_string<char_type>::iterator & icur, basic_string<char_type>::const_iterator istop, unsigned flags, SY /*sy*/ ) throw(bad_regexpr,bad_alloc) : match_charset_t<EOS,RGM>( fcomplement, m_rgasciicustom, m_rgrangescustom ) { _parse_charset( icur, istop, flags, SY() ); _optimize(); }
virtual ~match_custom_charset_t() {} // for including one character set in another
match_custom_charset_t<EOS,RGM> & operator|=( const match_charset<CI> & that ) { assert( 0 == that.m_ncharflags ); if( that.m_fcomplement ) { m_rgasciicustom |= ~ that.m_rgascii; // append the inverse of that.m_rgranges to this->m_rgrangescustom
wint_t chlow = UCHAR_MAX; typedef vector<range_type>::const_iterator VCI; for( VCI prg = that.m_rgranges.begin(); prg != that.m_rgranges.end(); ++prg ) { if( UCHAR_MAX + 1 != prg->first ) m_rgrangescustom.push_back( range_type( chlow + 1, prg->first - 1 ) ); chlow = prg->second; } if( WCHAR_MAX != chlow ) m_rgrangescustom.push_back( range_type( chlow + 1, WCHAR_MAX ) ); } else { m_rgasciicustom |= that.m_rgascii; m_rgrangescustom.insert( m_rgrangescustom.end(), that.m_rgranges.begin(), that.m_rgranges.end() ); } return *this; }
protected:
template< typename SY > void _parse_charset( basic_string<char_type>::iterator & icur, basic_string<char_type>::const_iterator istop, unsigned flags, SY /*sy*/ ) throw(bad_regexpr,bad_alloc) { TOKEN tok; char_type ch_prev = 0; match_charset<CI> * pcharset; basic_string<char_type>::iterator iprev = icur; const bool fnocase = ( NOCASE == ( NOCASE & flags ) );
if( (basic_string<char_type>::const_iterator)icur == istop ) throw bad_regexpr("expecting end of character set");
// remember the current position and grab the next token
tok = SY::charset_token( icur, istop ); do { // If we reached the end of the string before finding the end of the
// character set, then this is an ill-formed regex
if( (basic_string<char_type>::const_iterator)icur == istop ) throw bad_regexpr("expecting end of character set");
if( CHARSET_RANGE == tok && ch_prev ) { // remember the current position
basic_string<char_type>::iterator iprev2 = icur; char_type old_ch = ch_prev; ch_prev = 0;
// old_ch is lower bound of a range
switch( SY::charset_token( icur, istop ) ) { case CHARSET_RANGE: case CHARSET_NEGATE: icur = iprev2; // un-get these tokens and fall through
case NO_TOKEN: case CHARSET_ESCAPE: // BUGBUG user-defined charset?
_set_bit_range( old_ch, *icur++, fnocase ); continue; case CHARSET_BACKSPACE: _set_bit_range( old_ch, char_type(8), fnocase ); // backspace
continue; case CHARSET_END: // fall through
default: // not a range.
icur = iprev; // backup to range token
_set_bit( old_ch, fnocase ); _set_bit( *icur++, fnocase ); continue; } }
if( ch_prev ) _set_bit( ch_prev, fnocase ); ch_prev = 0;
switch( tok ) { // None of the intrinsic charsets are case-sensitive,
// so no special handling must be done when the NOCASE
// flag is set.
case CHARSET_RANGE: case CHARSET_NEGATE: case CHARSET_END: icur = iprev; // un-get these tokens
ch_prev = *icur++; continue; case CHARSET_BACKSPACE: ch_prev = char_type(8); // backspace
continue; case ESC_DIGIT: *this |= match_charset<CI>( false, get_digit_vector() ); continue; case ESC_NOT_DIGIT: *this |= match_charset<CI>( true, get_digit_vector() ); continue; case ESC_SPACE: *this |= match_charset<CI>( false, get_space_vector() ); continue; case ESC_NOT_SPACE: *this |= match_charset<CI>( true, get_space_vector() ); continue; case ESC_WORD: *this |= match_charset<CI>( false, get_word_vector() ); continue; case ESC_NOT_WORD: *this |= match_charset<CI>( true, get_word_vector() ); continue; case CHARSET_ALNUM: m_ncharflags |= (_ALPHA|_DIGIT); continue; case CHARSET_ALPHA: m_ncharflags |= (_ALPHA); continue; case CHARSET_BLANK: m_ncharflags |= (_BLANK); continue; case CHARSET_CNTRL: m_ncharflags |= (_CONTROL); continue; case CHARSET_DIGIT: m_ncharflags |= (_DIGIT); continue; case CHARSET_GRAPH: m_ncharflags |= (_PUNCT|_ALPHA|_DIGIT); continue; case CHARSET_LOWER: m_ncharflags |= (_LOWER); if( NOCASE == ( NOCASE & flags ) ) m_ncharflags |= (_UPPER); continue; case CHARSET_PRINT: m_ncharflags |= (_BLANK|_PUNCT|_ALPHA|_DIGIT); continue; case CHARSET_PUNCT: m_ncharflags |= (_PUNCT); continue; case CHARSET_SPACE: m_ncharflags |= (_SPACE); continue; case CHARSET_UPPER: m_ncharflags |= (_UPPER); if( NOCASE == ( NOCASE & flags ) ) m_ncharflags |= (_LOWER); continue; case CHARSET_XDIGIT: m_ncharflags |= (_HEX); continue; case CHARSET_ESCAPE: // Maybe this is a user-defined intrinsic charset
pcharset = get_altern_charset( *icur, flags, SY() ); if( NULL != pcharset ) { *this |= *pcharset; ++icur; continue; } // else fall through
default: ch_prev = *icur++; continue; } } while( iprev = icur, CHARSET_END != ( tok = SY::charset_token( icur, istop ) ) );
if( ch_prev ) _set_bit( ch_prev, fnocase ); }
void _optimize() { // this sorts on range_type.first (uses operator<() for pair templates)
sort( m_rgrangescustom.begin(), m_rgrangescustom.end() ); // This merges ranges that overlap
for( size_t index = 1; index < m_rgrangescustom.size(); ) { if( m_rgrangescustom[index].first <= m_rgrangescustom[index-1].second + 1 ) { m_rgrangescustom[index-1].second = max( m_rgrangescustom[index-1].second, m_rgrangescustom[index].second ); m_rgrangescustom.erase( m_rgrangescustom.begin() + index ); } else ++index; } }
// Note overloading based on second parameter
void _set_bit( char ch, const bool fnocase ) throw() { if( fnocase ) { m_rgasciicustom.set( unsigned char(tolower(ch)) ); m_rgasciicustom.set( unsigned char(toupper(ch)) ); } else { m_rgasciicustom.set( unsigned char(ch) ); } }
// Note overloading based on second parameter
void _set_bit( wint_t ch, const bool fnocase ) throw(bad_alloc) { if( UCHAR_MAX >= ch ) _set_bit( char(ch), fnocase ); else m_rgrangescustom.push_back( range_type( ch, ch ) ); }
// Note overloading based on second parameter
void _set_bit_range( char ch1, char ch2, const bool fnocase ) throw(bad_regexpr) { if( unsigned char(ch1) > unsigned char(ch2) ) throw bad_regexpr("invalid range specified in character set");
if( fnocase ) { // i is unsigned int to prevent overflow if ch2 is UCHAR_MAX
for( unsigned int i = unsigned char(ch1); i <= unsigned char(ch2); ++i ) { m_rgasciicustom.set( unsigned char( toupper(i) ) ); m_rgasciicustom.set( unsigned char( tolower(i) ) ); } } else { // i is unsigned int to prevent overflow if ch2 is UCHAR_MAX
for( unsigned int i = unsigned char(ch1); i <= unsigned char(ch2); ++i ) m_rgasciicustom.set( unsigned char(i) ); } }
// Note overloading based on second parameter
void _set_bit_range( wint_t ch1, wint_t ch2, const bool fnocase ) throw(bad_regexpr,bad_alloc) { if( ch1 > ch2 ) throw bad_regexpr("invalid range specified in character set");
if( UCHAR_MAX >= ch1 ) _set_bit_range( char(ch1), char( min(wint_t(UCHAR_MAX),ch2) ), fnocase );
if( UCHAR_MAX < ch2 ) m_rgrangescustom.push_back( range_type( max(wint_t(UCHAR_MAX+1),ch1), ch2 ) ); }
ascii_bitvector m_rgasciicustom; vector<range_type> m_rgrangescustom; };
template< typename CI > match_charset<CI> * create_charset( const match_charset<CI> & that, unsigned flags ) { switch( ( NOCASE | CSTRINGS ) & flags ) { case 0: return new match_charset_t<eos_t<CI>,match_range_with_case>( that ); case NOCASE: return new match_charset_t<eos_t<CI>,match_range_no_case>( that ); case CSTRINGS: return new match_charset_t<eocs_t<CI>,match_range_with_case>( that ); case NOCASE | CSTRINGS: return new match_charset_t<eocs_t<CI>,match_range_no_case>( that ); default: __assume(0); // tells the compiler that this is unreachable
} }
template< typename EOS > class word_assertion_t : public assertion<typename EOS::const_iterator> { public: typedef EOS eos_type; typedef typename EOS::const_iterator CI; word_assertion_t() : m_isword( match_charset<CI>( false, get_word_vector() ) ) {} virtual ~word_assertion_t() {} protected: bos_t<CI> m_bos; eos_type m_eos; match_charset_t<eos_type,match_range_with_case> m_isword; };
template< typename EOS > class word_boundary_t : public word_assertion_t<EOS> { public: word_boundary_t( const bool fisboundary ) : m_fisboundary( fisboundary ) {} virtual ~word_boundary_t() {} protected: virtual bool _match_this( match_param<CI> & param, CI & icur ) const throw() { CI iprev = icur; --iprev;
const bool fprevword = ! m_bos( param, icur ) && m_isword.is_in_charset( *iprev ); const bool fthisword = ! m_eos( param, icur ) && m_isword.is_in_charset( *icur );
return ( m_fisboundary == ( fprevword != fthisword ) ); } const bool m_fisboundary; };
template< typename EOS > class word_start_t : public word_assertion_t<EOS> { public: word_start_t() {} virtual ~word_start_t() {} protected: virtual bool _match_this( match_param<CI> & param, CI & icur ) const throw() { CI iprev = icur; --iprev;
const bool fprevword = ! m_bos( param, icur ) && m_isword.is_in_charset( *iprev ); const bool fthisword = ! m_eos( param, icur ) && m_isword.is_in_charset( *icur );
return ! fprevword && fthisword; } };
template< typename EOS > class word_stop_t : public word_assertion_t<EOS> { public: word_stop_t() {} virtual ~word_stop_t() {} protected: virtual bool _match_this( match_param<CI> & param, CI & icur ) const throw() { CI iprev = icur; --iprev;
const bool fprevword = ! m_bos( param, icur ) && m_isword.is_in_charset( *iprev ); const bool fthisword = ! m_eos( param, icur ) && m_isword.is_in_charset( *icur );
return fprevword && ! fthisword; } };
template< typename CI > assertion<CI> * create_word_boundary( const bool fisboundary, unsigned flags ) { switch( CSTRINGS & flags ) { case 0: return new word_boundary_t<eos_t<CI> >( fisboundary ); case CSTRINGS: return new word_boundary_t<eocs_t<CI> >( fisboundary ); default: __assume(0); // tells the compiler that this is unreachable
} }
template< typename CI > assertion<CI> * create_word_start( unsigned flags ) { switch( CSTRINGS & flags ) { case 0: return new word_start_t<eos_t<CI> >(); case CSTRINGS: return new word_start_t<eocs_t<CI> >(); default: __assume(0); // tells the compiler that this is unreachable
} }
template< typename CI > assertion<CI> * create_word_stop( unsigned flags ) { switch( CSTRINGS & flags ) { case 0: return new word_stop_t<eos_t<CI> >(); case CSTRINGS: return new word_stop_t<eocs_t<CI> >(); default: __assume(0); // tells the compiler that this is unreachable
} }
template< typename CI > class group_quantifier;
template< typename CI > class match_group : public sub_expr<CI> { public: friend class group_quantifier<CI>;
match_group( size_t cgroup ) : m_rgalternates(), m_cgroup( cgroup ), m_pptail(NULL), m_end_group( this ), m_nwidth(uninit_width) {}
virtual ~match_group() {}
virtual bool domatch( match_param<CI> & param, CI icur ) const throw() { CI old_istart; if( -1 != m_cgroup ) // could be -1 if this is a lookahead_assertion
{ old_istart = (*param.prgbackrefs)[ m_cgroup ].first; (*param.prgbackrefs)[ m_cgroup ].first = icur; }
typedef vector<sub_expr<CI>*>::const_iterator VCI; for( VCI ialt = m_rgalternates.begin(); ialt != m_rgalternates.end(); ++ialt ) { if( (*ialt)->domatch( param, icur ) ) return true; }
if( -1 != m_cgroup ) (*param.prgbackrefs)[ m_cgroup ].first = old_istart;
return false; } virtual void _delete() { typedef vector<sub_expr<CI>*>::iterator VI; for( VI ialt = m_rgalternates.begin(); ialt != m_rgalternates.end(); ++ialt ) delete_sub_expr( *ialt ); sub_expr<CI>::_delete(); }
size_t group_number() const { return m_cgroup; }
void group_number( size_t cgroup ) { m_cgroup = cgroup; }
void add_item( sub_expr<CI> * pitem ) { *m_pptail = pitem; m_pptail = & pitem->next(); }
void add_alternate() { m_rgalternates.push_back( NULL ); m_pptail = & m_rgalternates.back(); }
void end_alternate() { *m_pptail = & m_end_group; }
size_t calternates() const { return m_rgalternates.size(); }
width_type group_width() { (void) match_group<CI>::_width_this(); return m_nwidth; }
protected:
virtual bool _call_back( match_param<CI> & param, CI icur ) const throw() { CI old_iend;
if( -1 != m_cgroup ) { old_iend = (*param.prgbackrefs)[ m_cgroup ].second; (*param.prgbackrefs)[ m_cgroup ].second = icur; }
if( match_next( param, icur ) ) return true;
if( -1 != m_cgroup ) (*param.prgbackrefs)[ m_cgroup ].second = old_iend;
return false; }
virtual width_type _width_this() throw() { typedef vector<sub_expr<CI>*>::const_iterator VCI; if( uninit_width == m_nwidth ) { m_nwidth = width_type(-1,0); for( VCI ialt = m_rgalternates.begin(); worst_width != m_nwidth && ialt != m_rgalternates.end(); ++ialt ) { width_type temp_width = (*ialt)->get_width(); m_nwidth.m_min = min( m_nwidth.m_min, temp_width.m_min ); m_nwidth.m_max = max( m_nwidth.m_max, temp_width.m_max ); } } return m_nwidth; }
class end_group; friend class end_group; class end_group : public sub_expr<CI> { void * operator new( size_t ); public: end_group( match_group * pgroup ) : m_pgroup( pgroup ) {} virtual ~end_group() {} virtual void _delete() {} // don't delete this, because it was never alloc'ed
virtual bool domatch( match_param<CI> & param, CI icur ) const throw() { return m_pgroup->_call_back( param, icur ); } protected: // since m_pnext is always NULL for end_groups, get_width() stops recursing here
virtual width_type _width_this() throw() { return width_type(0,0); } match_group<CI> * m_pgroup; };
vector<sub_expr<CI>*> m_rgalternates; sub_expr<CI> ** m_pptail; // only used when adding elements
size_t m_cgroup; end_group m_end_group; width_type m_nwidth; };
// Behaves like a lookahead assertion if m_cgroup is -1, or like
// an independent group otherwise.
template< typename CI > class independent_group : public match_group<CI> { public: independent_group() : match_group<CI>( -1 ), m_fexpected(true) {} virtual ~independent_group() {}
virtual bool domatch( match_param<CI> & param, CI icur ) const throw() { // Copy the entire backref vector onto the stack
backref_tag<CI> * prgbr = (backref_tag<CI>*)_alloca( param.prgbackrefs->size() * sizeof backref_tag<CI> ); copy( param.prgbackrefs->begin(), param.prgbackrefs->end(), raw_storage_iterator<backref_tag<CI>*,backref_tag<CI> >(prgbr) );
// Match until the end of this group and then return
const bool fdomatch = match_group<CI>::domatch( param, icur );
if( m_fexpected == fdomatch ) { // If m_cgroup != 1, then this is not a zero-width assertion.
if( -1 != m_cgroup ) icur = (*param.prgbackrefs)[ m_cgroup ].second;
if( match_next( param, icur ) ) return true; }
// if match_group::domatch returned true, the backrefs must be restored
if( fdomatch ) copy( prgbr, prgbr + param.prgbackrefs->size(), param.prgbackrefs->begin() );
return false; }
protected:
independent_group( const bool fexpected ) : match_group<CI>( -1 ), m_fexpected(fexpected) {}
virtual bool _call_back( match_param<CI> & param, CI icur ) const throw() { if( -1 != m_cgroup ) (*param.prgbackrefs)[ m_cgroup ].second = icur; return true; }
const bool m_fexpected; };
template< typename CI > class lookahead_assertion : public independent_group<CI> { public: lookahead_assertion( const bool fexpected ) : independent_group<CI>( fexpected ) {} virtual ~lookahead_assertion() {} virtual bool is_assertion() const throw() { return true; } protected: virtual width_type _width_this() throw() { return width_type(0,0); } };
template< typename CI > class lookbehind_assertion : public independent_group<CI> { public: lookbehind_assertion( const bool fexpected ) : independent_group<CI>( fexpected ) {} virtual ~lookbehind_assertion() {}
virtual bool domatch( match_param<CI> & param, CI icur ) const throw() { // This is the room in the string from the start to the current position
size_t room = distance( param.ibegin, icur );
// If we don't have enough room to match the lookbehind, the match fails.
// If we wanted the match to fail, try to match the rest of the pattern.
if( m_nwidth.m_min > room ) return m_fexpected ? false : match_next( param, icur );
// Copy the entire backref vector onto the stack
backref_tag<CI> * prgbr = (backref_tag<CI>*)_alloca( param.prgbackrefs->size() * sizeof backref_tag<CI> ); copy( param.prgbackrefs->begin(), param.prgbackrefs->end(), raw_storage_iterator<backref_tag<CI>*,backref_tag<CI> >(prgbr) );
CI local_istart = icur; advance( local_istart, -int( min( m_nwidth.m_max, room ) ) );
CI local_istop = icur; advance( local_istop, -int( m_nwidth.m_min ) );
// Create a local param struct that has icur as param.iend
match_param<CI> local_param(param.ibegin,param.istart,icur,param.prgbackrefs);
// Find the rightmost match that ends at icur.
for( CI local_icur = local_istart; local_icur <= local_istop; ++local_icur ) { // Match until the end of this group and then return
const bool fmatched = match_group<CI>::domatch( local_param, local_icur );
// If the match results were what we were expecting, try to match the
// rest of the pattern. If that succeeds, return true.
if( m_fexpected == fmatched && match_next( param, icur ) ) return true;
// if match_group::domatch returned true, the backrefs must be restored
if( fmatched ) { copy( prgbr, prgbr + param.prgbackrefs->size(), param.prgbackrefs->begin() );
// Match succeeded. If this is a negative lookbehind, we didn't want it
// to succeed, so return false.
if( ! m_fexpected ) return false; } }
// No variation of the lookbehind was satisfied in a way that permited
// the rest of the pattern to match successfully, so return false.
return false; } virtual bool is_assertion() const throw() { return true; } protected: virtual bool _call_back( match_param<CI> & param, CI icur ) const throw() { return param.istop == icur; } virtual width_type _width_this() throw() { return width_type(0,0); } };
// Corresponds to the (?:foo) extension, which has grouping semantics, but
// does not store any backref information.
template< typename CI > class group_nobackref : public match_group<CI> { public: group_nobackref( ) : match_group( -1 ) {} // will be assigned a group number in basic_rpattern::basic_rpattern()
virtual ~group_nobackref() {} };
template< typename CI > class match_wrapper : public sub_expr<CI> { public: match_wrapper( sub_expr<CI> * psub ) : m_psub(psub) {} virtual ~match_wrapper() {}
virtual void _delete() { delete_sub_expr( m_psub ); sub_expr<CI>::_delete(); }
protected:
bool _wrapped_match_this( match_param<CI> & param, CI & icur ) const throw() { return m_psub->_match_this( param, icur ); } virtual width_type _width_this() throw() { return m_psub->_width_this(); }
sub_expr<CI> * m_psub; };
template< typename CI > class match_quantifier : public match_wrapper<CI> { public: match_quantifier( sub_expr<CI> * psub, size_t lbound, size_t ubound ) : match_wrapper<CI>( psub ), m_lbound(lbound), m_ubound(ubound) {}
virtual ~match_quantifier() {}
protected:
virtual width_type _width_this() throw() { width_type this_width = match_wrapper<CI>::_width_this();
return this_width * width_type( m_lbound, m_ubound ); }
const size_t m_lbound; const size_t m_ubound; };
template< typename CI > class max_atom_quantifier : public match_quantifier<CI> { public: max_atom_quantifier( sub_expr<CI> * psub, size_t lbound, size_t ubound ) : match_quantifier<CI>( psub, lbound, ubound ) {}
virtual ~max_atom_quantifier() {}
virtual bool domatch( match_param<CI> & param, CI icur ) const throw() { size_t cmatches = 0; int cdiff = 0; // must be a signed quantity for advance() below
if( cmatches < m_ubound ) { CI istart = icur; if( _wrapped_match_this( param, icur ) ) { ++cmatches; cdiff = distance( istart, icur ); if( 0 == cdiff ) return ( match_next( param, icur ) );
while( cmatches < m_ubound && _wrapped_match_this( param, icur ) ) { ++cmatches; } } }
if( cmatches >= m_lbound ) { if( ! next() ) return true; for(;;) { if( next()->domatch( param, icur ) ) return true; if( cmatches-- <= m_lbound ) break; advance( icur, -cdiff ); } }
return false; } };
template< typename CI > class min_atom_quantifier : public match_quantifier<CI> { public: min_atom_quantifier( sub_expr<CI> * psub, size_t lbound, size_t ubound ) : match_quantifier<CI>( psub, lbound, ubound ) {}
virtual ~min_atom_quantifier() {}
virtual bool domatch( match_param<CI> & param, CI icur ) const throw() { size_t cmatches = 0; bool fsuccess = true; CI icur_tmp = icur;
if( _wrapped_match_this( param, icur_tmp ) ) { if( icur_tmp == icur ) return ( match_next( param, icur ) );
if( m_lbound ) { icur = icur_tmp; ++cmatches; } while( ( cmatches < m_lbound ) && ( fsuccess = _wrapped_match_this( param, icur ) ) ) { ++cmatches; } } else { fsuccess = ! m_lbound; }
if( fsuccess && next() ) { do { if( next()->domatch( param, icur ) ) break; } while( fsuccess = ( cmatches++ < m_ubound && _wrapped_match_this( param, icur ) ) ); }
return fsuccess; } };
template< typename CI > class group_quantifier : public match_quantifier<CI> { public: group_quantifier( match_group<CI> * psub, size_t lbound, size_t ubound ) : match_quantifier<CI>( psub, lbound, ubound ), m_group( *psub ), m_end_quantifier( this ) { psub->next() = & m_end_quantifier; }
virtual ~group_quantifier() {}
virtual bool domatch( match_param<CI> & param, CI icur ) const throw() { // group_number is only -1 for assertions, which can't be quantified
assert( -1 != group_number() );
backref_tag<CI> & br = (*param.prgbackrefs)[ group_number() ]; backref_tag<CI> old_backref = br; br = backref_tag<CI>( icur, icur ); // sets cmatches (reserved) to 0
if( _recurse( param, icur ) ) return true;
br = old_backref; return false; }
protected:
class end_quantifier; friend class end_quantifier; class end_quantifier : public sub_expr<CI> { void * operator new( size_t ); public: end_quantifier( group_quantifier<CI> * pquant ) : m_pquant( pquant ) {}
virtual ~end_quantifier() {} virtual void _delete() {} // don't delete this, since it wasn't alloc'ed
virtual bool domatch( match_param<CI> & param, CI icur ) const throw() { // group_number is only -1 for assertions, which can't be quantified
assert( -1 != m_pquant->group_number() );
// handle special the case where a group matches 0 characters
backref_tag<CI> & br = (*param.prgbackrefs)[ m_pquant->group_number() ]; if( icur == br.first ) { size_t old_cmatches = br.reserved; br.reserved = m_pquant->m_ubound; if( m_pquant->_recurse( param, icur ) ) return true; br.reserved = old_cmatches; return false; } return m_pquant->_recurse( param, icur ); }
protected: virtual width_type _width_this() throw() { return width_type(0,0); } group_quantifier<CI> * m_pquant; };
size_t group_number() const { return m_group.group_number(); }
size_t & cmatches( match_param<CI> & param ) const { return (*param.prgbackrefs)[ group_number() ].reserved; }
virtual bool _recurse( match_param<CI> & param, CI icur ) const throw() = 0;
match_group<CI> & m_group; end_quantifier m_end_quantifier; };
template< typename CI > class max_group_quantifier : public group_quantifier<CI> { public: max_group_quantifier( match_group<CI> * psub, size_t lbound, size_t ubound ) : group_quantifier<CI>( psub, lbound, ubound ) {}
virtual ~max_group_quantifier() {}
protected:
virtual bool _recurse( match_param<CI> & param, CI icur ) const throw() { if( m_ubound == cmatches( param ) ) return match_next( param, icur );
++cmatches( param ); if( m_psub->domatch( param, icur ) ) return true;
if( --cmatches( param ) < m_lbound ) return false;
return match_next( param, icur ); } };
template< typename CI > class min_group_quantifier : public group_quantifier<CI> { public: min_group_quantifier( match_group<CI> * psub, size_t lbound, size_t ubound ) : group_quantifier<CI>( psub, lbound, ubound ) {}
virtual ~min_group_quantifier() {}
protected:
virtual bool _recurse( match_param<CI> & param, CI icur ) const throw() { if( m_lbound > cmatches( param ) ) { ++cmatches( param ); return m_psub->domatch( param, icur ); }
if( match_next( param, icur ) ) return true;
if( cmatches( param )++ == m_ubound ) return false; return m_psub->domatch( param, icur ); } };
template< typename CI > class match_backref : public sub_expr<CI> { public: match_backref( size_t cbackref, const width_type & group_width ) : m_cbackref( cbackref ), m_nwidth(group_width) {} virtual ~match_backref() {} protected: // Return the width specifications of the group to which this backref refers
virtual width_type _width_this() throw() { return m_nwidth; } const size_t m_cbackref; const width_type m_nwidth; };
template< typename CMP, typename EOS > class match_backref_t : public match_backref<typename EOS::const_iterator> { public: typedef CMP cmp_type; typedef EOS eos_type; typedef typename EOS::const_iterator CI; match_backref_t( size_t cbackref, const width_type & group_width ) : match_backref<CI>( cbackref, group_width ) {} virtual ~match_backref_t() {} protected: virtual bool _match_this( match_param<CI> & param, CI & icur ) const throw() { CI ithis = (*param.prgbackrefs)[ m_cbackref ].first; CI istop = (*param.prgbackrefs)[ m_cbackref ].second; CI icur_tmp = icur;
// Don't match a backref that hasn't match anything
if( ! (*param.prgbackrefs)[ m_cbackref ] ) return false;
for( ; ithis != istop; ++icur_tmp, ++ithis ) { if( m_eos( param, icur_tmp ) || m_cmp( *icur_tmp, *ithis ) ) return false; } icur = icur_tmp; return true; } cmp_type m_cmp; eos_type m_eos; };
template< typename CI > match_backref<CI> * create_backref( size_t cbackref, const width_type & group_width, unsigned flags ) { typedef typename iterator_traits<CI>::value_type char_type;
switch( ( NOCASE | CSTRINGS ) & flags ) { case 0: return new match_backref_t<ch_neq_t<char_type>,eos_t<CI> >( cbackref, group_width ); case NOCASE: return new match_backref_t<ch_neq_nocase_t<char_type>,eos_t<CI> >( cbackref, group_width ); case CSTRINGS: return new match_backref_t<ch_neq_t<char_type>,eocs_t<CI> >( cbackref, group_width ); case NOCASE | CSTRINGS: return new match_backref_t<ch_neq_nocase_t<char_type>,eocs_t<CI> >( cbackref, group_width ); default: __assume(0); // tells the compiler that this is unreachable
} }
// Replace some escape sequences with the actual characters
// they represent
template< typename CI > void basic_rpattern_base<CI>::_normalize_string( basic_string<basic_rpattern_base<CI>::char_type> & str ) { size_t i = 0;
// Don't do pattern normalization if the user didn't ask for it.
if( NORMALIZE != ( NORMALIZE & m_flags ) ) return;
while( basic_string<char_type>::npos != ( i = str.find( char_type('\\'), i ) ) ) { if( str.size() - 1 == i ) return;
switch( str[i+1] ) { case char_type('f'): str.replace( i, 2, 1, char_type('\f') ); break; case char_type('n'): str.replace( i, 2, 1, char_type('\n') ); break; case char_type('r'): str.replace( i, 2, 1, char_type('\r') ); break; case char_type('t'): str.replace( i, 2, 1, char_type('\t') ); break; case char_type('v'): str.replace( i, 2, 1, char_type('\v') ); break; case char_type('\\'): str.replace( i, 2, 1, char_type('\\') ); break; default: ++i; break; } ++i; if( str.size() <= i ) return; } }
//
// Implementation of basic_rpattern:
//
template< typename CI, typename SY > basic_rpattern<CI,SY>::basic_rpattern() throw() : basic_rpattern_base<CI>( 0 ) { }
template< typename CI, typename SY > basic_rpattern<CI,SY>::basic_rpattern( const basic_string<basic_rpattern<CI,SY>::char_type> & pat, unsigned flags ) throw(bad_regexpr,bad_alloc) : basic_rpattern_base<CI>( flags, pat ) { push_new_handler pnh( &my_new_handler ); _normalize_string( m_pat ); _common_init( flags ); }
template< typename CI, typename SY > basic_rpattern<CI,SY>::basic_rpattern( const basic_string<basic_rpattern<CI,SY>::char_type> & pat, const basic_string<basic_rpattern<CI,SY>::char_type> & subst, unsigned flags ) throw(bad_regexpr,bad_alloc) : basic_rpattern_base<CI>( flags, pat, subst ) { push_new_handler pnh( &my_new_handler ); _normalize_string( m_pat ); _common_init( flags ); _normalize_string( m_subst ); _parse_subst(); // must come after _common_init
}
template< typename CI, typename SY > void basic_rpattern<CI,SY>::init( const basic_string<basic_rpattern<CI,SY>::char_type> & pat, unsigned flags ) throw(bad_regexpr,bad_alloc) { push_new_handler pnh( &my_new_handler ); _reset(); m_flags = flags; m_pat = pat; _normalize_string( m_pat ); _common_init( m_flags ); }
template< typename CI, typename SY > void basic_rpattern<CI,SY>::init( const basic_string<basic_rpattern<CI,SY>::char_type> & pat, const basic_string<basic_rpattern<CI,SY>::char_type> & subst, unsigned flags ) throw(bad_regexpr,bad_alloc) { push_new_handler pnh( &my_new_handler ); _reset(); m_flags = flags; m_pat = pat; m_subst = subst; _normalize_string( m_pat ); _common_init( m_flags ); _normalize_string( m_subst ); _parse_subst(); // must come after _common_init
}
template< typename CI, typename SY > void basic_rpattern<CI,SY>::_common_init( unsigned flags ) { m_cgroups = 0; vector<match_group<CI>*> rggroups; basic_string<char_type>::iterator ipat = m_pat.begin(); match_group<CI> * pgroup = _find_next_group( ipat, flags, rggroups );
m_pfirst = pgroup; m_nwidth = pgroup->group_width();
// Number the invisible groups
m_cgroups_visible = m_cgroups; while( ! m_invisible_groups.empty() ) { m_invisible_groups.front()->group_number( _get_next_group_nbr() ); m_invisible_groups.pop_front(); }
//
// determine if we can get away with only calling m_pfirst->domatch only once
//
m_floop = true;
// Optimization: if first character of pattern string is '^'
// and we are not doing a multiline match, then we only
// need to try domatch once
basic_string<char_type>::iterator icur = m_pat.begin(); if( MULTILINE != ( MULTILINE & m_flags ) && 1 == pgroup->calternates() && icur != m_pat.end() && BEGIN_LINE == SY::reg_token( icur, m_pat.end() ) ) { m_flags &= ~RIGHTMOST; m_floop = false; }
// Optimization: if first 2 characters of pattern string are ".*" or ".+",
// then we only need to try domatch once
icur = m_pat.begin(); if( RIGHTMOST != ( RIGHTMOST & m_flags ) && SINGLELINE == ( SINGLELINE & m_flags ) && 1 == pgroup->calternates() && icur != m_pat.end() && MATCH_ANY == SY::reg_token( icur, m_pat.end() ) && icur != m_pat.end() ) { switch( SY::quant_token( icur, m_pat.end() ) ) { case ONE_OR_MORE: case ZERO_OR_MORE: case ONE_OR_MORE_MIN: case ZERO_OR_MORE_MIN: m_floop = false; } } }
template< typename CI, typename SY > void basic_rpattern<CI,SY>::_reset() throw() { basic_rpattern_base<CI>::_reset();
m_cgroups = m_cgroups_visible = 0; m_floop = true;
m_subst.erase(); m_pat.erase();
m_pfirst.free_ptr(); m_nwidth = uninit_width; m_subst_list.clear(); m_invisible_groups.clear(); }
template< typename CI, typename SY > void basic_rpattern<CI,SY>::set_flags( unsigned flags ) throw(bad_regexpr,bad_alloc) { push_new_handler pnh( &my_new_handler ); m_pfirst.free_ptr(); m_flags = flags; _common_init( m_flags ); }
template< typename CI, typename SY > void basic_rpattern<CI,SY>::set_substitution( const basic_string<basic_rpattern<CI,SY>::char_type> & subst ) { push_new_handler pnh( &my_new_handler ); m_subst_list.clear(); m_subst = subst; _normalize_string( m_subst ); _parse_subst(); }
template< typename CI, typename SY > match_group<CI> * basic_rpattern<CI,SY>::_find_next_group( basic_string<basic_rpattern<CI,SY>::char_type>::iterator & ipat, unsigned & flags, vector<match_group<CI>*> & rggroups ) { auto_sub_ptr<match_group<CI> > pgroup; basic_string<char_type>::iterator itemp = ipat; unsigned old_flags = flags; TOKEN tok;
// Look for group extensions. (This could change the value of the flags variable.)
if( ipat != m_pat.end() && NO_TOKEN != ( tok = SY::ext_token( ipat, m_pat.end(), flags ) ) ) { if( itemp == m_pat.begin() || ipat == m_pat.end() ) throw bad_regexpr("ill-formed regular expression");
// Don't process empty groups
if( END_GROUP != SY::reg_token( itemp = ipat, m_pat.end() ) ) { switch( tok ) { case EXT_NOBACKREF: // invisible groups are numbered only after all
// visible groups have been numbererd
pgroup = new match_group<CI>( -1 ); m_invisible_groups.push_back( pgroup.get() ); break;
case EXT_INDEPENDENT: pgroup = new independent_group<CI>(); m_invisible_groups.push_back( pgroup.get() ); break;
case EXT_POS_LOOKAHEAD: pgroup = new lookahead_assertion<CI>( true ); break; case EXT_NEG_LOOKAHEAD: pgroup = new lookahead_assertion<CI>( false ); break; case EXT_POS_LOOKBEHIND: // For look-behind assertions, turn off the CSTRINGs optimization
flags &= ~CSTRINGS; pgroup = new lookbehind_assertion<CI>( true ); break; case EXT_NEG_LOOKBEHIND: // For look-behind assertions, turn off the CSTRINGs optimization
flags &= ~CSTRINGS; pgroup = new lookbehind_assertion<CI>( false ); break; default: throw bad_regexpr("bad extension sequence"); } } else { // Skip over the END_GROUP token
ipat = itemp; } } else { pgroup = new match_group<CI>( _get_next_group_nbr() ); }
if( NULL != pgroup.get() ) { pgroup->add_alternate(); while( _find_next( ipat, pgroup.get(), flags, rggroups ) ); pgroup->end_alternate();
// Add this group to the rggroups array
if( -1 != pgroup->group_number() ) { if( pgroup->group_number() >= rggroups.size() ) rggroups.resize( pgroup->group_number() + 1, NULL ); rggroups[ pgroup->group_number() ] = pgroup.get(); }
// The group should calculate its own width now and
// save the result for later.
pgroup->group_width();
// If this is not a pattern modifier, restore the
// flags to their previous settings. This causes
// pattern modifiers to have the scope of their
// enclosing group.
flags = old_flags; }
return pgroup.release(); }
//
// Read ahead through the pattern and treat sequential atoms
// as a single atom, making sure to handle quantification
// correctly. Warning: dense code ahead.
//
template< typename CI, typename SY > void basic_rpattern<CI,SY>::_find_atom( basic_string<basic_rpattern<CI,SY>::char_type>::iterator & ipat, match_group<CI> * pgroup, unsigned flags ) { basic_string<char_type>::iterator itemp = ipat, istart = ipat;
do { switch( SY::quant_token( itemp, m_pat.end() ) ) { // if {,} can't be interpreted as quantifiers, treat them as regular chars
case BEGIN_RANGE: if( istart != ipat ) // treat as a quantifier
goto quantify; case NO_TOKEN: case END_RANGE: case END_RANGE_MIN: case RANGE_SEPARATOR: break;
default: if( istart == ipat ) // must be able to quantify something.
throw bad_regexpr("quantifier not expected"); quantify: if( istart != --ipat ) pgroup->add_item( create_atom<CI>( istart, ipat, flags ) ); auto_sub_ptr<sub_expr<CI> > pnew( create_atom<CI>( ipat++, flags ) ); _quantify( pnew, NULL, ipat ); pgroup->add_item( pnew.release() ); return; } } while( m_pat.end() != ++ipat && ! SY::reg_token( itemp = ipat, m_pat.end() ) );
assert( ipat != istart ); pgroup->add_item( create_atom<CI>( istart, ipat, flags ) ); }
template< typename CI, typename SY > bool basic_rpattern<CI,SY>::_find_next( basic_string<basic_rpattern<CI,SY>::char_type>::iterator & ipat, match_group<CI> * pgroup, unsigned & flags, vector<match_group<CI>*> & rggroups ) { match_group<CI> * pnew_group = NULL; auto_sub_ptr<sub_expr<CI> > pnew; basic_string<char_type>::iterator istart, itemp; bool fdone;
if( ipat == m_pat.end() ) { if( 0 != pgroup->group_number() ) throw bad_regexpr( "mismatched parenthesis" ); return false; }
switch( SY::reg_token( ipat, m_pat.end() ) ) { case NO_TOKEN: // not a token. Must be an atom
_find_atom( ipat, pgroup, flags ); return true; case END_GROUP: if( 0 == pgroup->group_number() ) throw bad_regexpr( "mismatched parenthesis" ); return false;
case ALTERNATION: pgroup->end_alternate(); pgroup->add_alternate(); return true; case BEGIN_GROUP: // Find next group could return NULL if the group is really
// a pattern modifier, like: (?s-i)
pnew = pnew_group = _find_next_group( ipat, flags, rggroups ); break;
case BEGIN_LINE: pnew = create_bol<CI>( flags ); break;
case END_LINE: pnew = create_eol<CI>( flags ); break;
case BEGIN_CHARSET: pnew = create_charset_helper<CI,SY>::create_charset_aux( m_pat, ipat, flags ); break;
case MATCH_ANY: pnew = create_any<CI>( flags ); break;
case ESC_WORD_BOUNDARY: pnew = create_word_boundary<CI>( true, flags ); break; case ESC_NOT_WORD_BOUNDARY: pnew = create_word_boundary<CI>( false, flags ); break; case ESC_WORD_START: pnew = create_word_start<CI>( flags ); break; case ESC_WORD_STOP: pnew = create_word_stop<CI>( flags ); break; case ESC_DIGIT: pnew = create_charset<CI>( match_charset<CI>( false, get_digit_vector() ), flags ); break;
case ESC_NOT_DIGIT: pnew = create_charset<CI>( match_charset<CI>( true, get_digit_vector() ), flags ); break;
case ESC_WORD: pnew = create_charset<CI>( match_charset<CI>( false, get_word_vector() ), flags ); break;
case ESC_NOT_WORD: pnew = create_charset<CI>( match_charset<CI>( true, get_word_vector() ), flags ); break;
case ESC_SPACE: pnew = create_charset<CI>( match_charset<CI>( false, get_space_vector() ), flags ); break;
case ESC_NOT_SPACE: pnew = create_charset<CI>( match_charset<CI>( true, get_space_vector() ), flags ); break;
case ESC_BEGIN_STRING: pnew = create_bos<CI>( flags ); break;
case ESC_END_STRING: pnew = create_eos<CI>( flags ); break;
case ESC_END_STRING_z: pnew = create_eoz<CI>( flags ); break;
case ESCAPE:
if( char_type('0') <= *ipat && char_type('9') >= *ipat ) { // use _cgroups_total here since the invisible groups have not been numbered yet.
unsigned nbackref = parse_int( ipat, m_pat.end(), _cgroups_total() - 1 );// always at least 1 group
if( 0 == nbackref || rggroups.size() <= nbackref || NULL == rggroups[ nbackref ] ) throw bad_regexpr( "invalid backreference" ); pnew = create_backref<CI>( nbackref, rggroups[nbackref]->group_width(), flags ); } else { // Is this a user-defined intrinsic character set?
match_charset<CI> * pcharset = s_charset_map.get( *ipat, flags ); if( NULL != pcharset ) pnew = create_charset<CI>( *pcharset, flags ); else pnew = create_atom<CI>( ipat, flags ); ++ipat; } break;
// If quotemeta, loop until we find quotemeta off or end of string
case ESC_QUOTE_META_ON: for( istart = itemp = ipat, fdone = false; !fdone && ipat != m_pat.end(); ) { switch( SY::reg_token( ipat, m_pat.end() ) ) { case ESC_QUOTE_META_OFF: fdone = true; break; case NO_TOKEN: ++ipat; // fallthrough
default: itemp = ipat; break; } } if( itemp != istart ) pgroup->add_item( create_atom<CI>( istart, itemp, flags ) );
// skip the quantification code below
return true;
// Should never get here for valid patterns
case ESC_QUOTE_META_OFF: throw bad_regexpr("quotemeta turned off, but was never turned on");
default: assert( ! "Unhandled token type" ); break; } // If pnew is null, then the current subexpression is a no-op.
if( pnew.get() ) { // Look for quantifiers
_quantify( pnew, pnew_group, ipat );
// Add the item to the group
pgroup->add_item( pnew.release() ); } return true; }
template< typename CI, typename SY > void basic_rpattern<CI,SY>::_quantify( auto_sub_ptr<sub_expr<CI> > & pnew, match_group<CI> * pnew_group, basic_string<basic_rpattern<CI,SY>::char_type>::iterator & ipat ) { if( ipat != m_pat.end() && ! pnew->is_assertion() ) { basic_string<char_type>::iterator itemp = ipat; bool fmin = false; // Since size_t is unsigned, -1 is really the largest size_t
size_t lbound = (size_t)-1; size_t ubound = (size_t)-1; size_t ubound_tmp;
switch( SY::quant_token( itemp, m_pat.end() ) ) { case ZERO_OR_MORE_MIN: fmin = true; case ZERO_OR_MORE: lbound = 0; break;
case ONE_OR_MORE_MIN: fmin = true; case ONE_OR_MORE: lbound = 1; break;
case ZERO_OR_ONE_MIN: fmin = true; case ZERO_OR_ONE: lbound = 0; ubound = 1; break;
case BEGIN_RANGE: lbound = parse_int( itemp, m_pat.end() ); if( itemp == m_pat.end() ) throw bad_regexpr( "expecting end of range" );
switch( SY::quant_token( itemp, m_pat.end() ) ) { case END_RANGE_MIN: fmin = true; case END_RANGE: ubound = lbound; break;
case RANGE_SEPARATOR: ipat = itemp; ubound_tmp = parse_int( itemp, m_pat.end() ); if( itemp != ipat ) ubound = ubound_tmp; if( itemp == m_pat.end() ) throw bad_regexpr( "expecting end of range" ); switch( SY::quant_token( itemp, m_pat.end() ) ) { case END_RANGE_MIN: fmin = true; case END_RANGE: break; default: throw bad_regexpr( "expecting end of range" ); } break;
default: throw bad_regexpr( "ill-formed quantifier" ); }
if( ubound < lbound ) throw bad_regexpr( "ill-formed quantifier" );
break; }
if( (size_t)-1 != lbound ) { auto_sub_ptr<match_quantifier<CI> > pquant;
// a group quantifier is less efficient than an atom quantifier
if( fmin ) { if( pnew_group ) pquant = new min_group_quantifier<CI>( pnew_group, lbound, ubound ); else pquant = new min_atom_quantifier<CI>( pnew.get(), lbound, ubound ); } else { if( pnew_group ) pquant = new max_group_quantifier<CI>( pnew_group, lbound, ubound ); else pquant = new max_atom_quantifier<CI>( pnew.get(), lbound, ubound ); }
pnew.release(); pnew = pquant.release(); ipat = itemp; } } }
template< typename CI, typename SY > void basic_rpattern<CI,SY>::_add_subst_backref( subst_node & snode, size_t nbackref, size_t rstart ) { m_fuses_backrefs = true; assert( subst_node::SUBST_STRING == snode.stype ); if( snode.subst_string.rlength ) m_subst_list.push_back( snode );
snode.stype = subst_node::SUBST_BACKREF; snode.subst_backref = nbackref; m_subst_list.push_back( snode );
// re-initialize the subst_node
snode.stype = subst_node::SUBST_STRING; snode.subst_string.rstart = rstart; snode.subst_string.rlength = 0; }
template< typename CI, typename SY > void basic_rpattern<CI,SY>::_parse_subst() { TOKEN tok; subst_node snode; basic_string<char_type>::iterator icur = m_subst.begin(); size_t nbackref; basic_string<char_type>::iterator itemp; bool fdone;
m_fuses_backrefs = false;
// Initialize the subst_node
snode.stype = subst_node::SUBST_STRING; snode.subst_string.rstart = 0; snode.subst_string.rlength = 0;
while( icur != m_subst.end() ) { switch( tok = SY::subst_token( icur, m_subst.end() ) ) { case SUBST_MATCH: _add_subst_backref( snode, 0, distance( m_subst.begin(), icur ) ); break;
case SUBST_PREMATCH: _add_subst_backref( snode, subst_node::PREMATCH, distance( m_subst.begin(), icur ) ); break;
case SUBST_POSTMATCH: _add_subst_backref( snode, subst_node::POSTMATCH, distance( m_subst.begin(), icur ) ); break;
case SUBST_BACKREF: nbackref = parse_int( icur, m_subst.end(), cgroups() - 1 ); // always at least 1 group
if( 0 == nbackref ) throw bad_regexpr( "invalid backreference in substitution" );
_add_subst_backref( snode, nbackref, distance( m_subst.begin(), icur ) ); break;
case SUBST_QUOTE_META_ON: assert( subst_node::SUBST_STRING == snode.stype ); if( snode.subst_string.rlength ) m_subst_list.push_back( snode );
snode.subst_string.rstart = distance( m_subst.begin(), icur ); for( itemp = icur, fdone = false; !fdone && icur != m_subst.end(); ) { switch( tok = SY::subst_token( icur, m_subst.end() ) ) { case SUBST_ALL_OFF: fdone = true; break; case NO_TOKEN: ++icur; // fall-through
default: itemp = icur; break; } } snode.subst_string.rlength = distance( m_subst.begin(), itemp ) - snode.subst_string.rstart; if( snode.subst_string.rlength ) m_subst_list.push_back( snode );
if( tok == SUBST_ALL_OFF ) { snode.stype = subst_node::SUBST_OP; snode.op = subst_node::ALL_OFF; m_subst_list.push_back( snode ); }
// re-initialize the subst_node
snode.stype = subst_node::SUBST_STRING; snode.subst_string.rstart = distance( m_subst.begin(), icur ); snode.subst_string.rlength = 0; break;
case SUBST_UPPER_ON: case SUBST_UPPER_NEXT: case SUBST_LOWER_ON: case SUBST_LOWER_NEXT: case SUBST_ALL_OFF: assert( subst_node::SUBST_STRING == snode.stype ); if( snode.subst_string.rlength ) m_subst_list.push_back( snode );
snode.stype = subst_node::SUBST_OP; snode.op = (subst_node::op_type) tok; m_subst_list.push_back( snode );
// re-initialize the subst_node
snode.stype = subst_node::SUBST_STRING; snode.subst_string.rstart = distance( m_subst.begin(), icur ); snode.subst_string.rlength = 0; break; case SUBST_ESCAPE: if( icur == m_subst.end() ) throw bad_regexpr("expecting escape sequence in substitution string"); assert( subst_node::SUBST_STRING == snode.stype ); if( snode.subst_string.rlength ) m_subst_list.push_back( snode ); snode.subst_string.rstart = distance( m_subst.begin(), icur++ ); snode.subst_string.rlength = 1; break;
case NO_TOKEN: default: ++snode.subst_string.rlength; ++icur; break; } } assert( subst_node::SUBST_STRING == snode.stype ); if( snode.subst_string.rlength ) m_subst_list.push_back( snode ); }
template< typename CI, typename SY > basic_rpattern<CI,SY>::charset_map basic_rpattern<CI,SY>::s_charset_map;
// Pass in an interator to one after the opening bracket of the character set.
// On return, icur points to one character after the closing bracket
template< typename CI, typename SY > sub_expr<CI> * create_charset_helper<CI,SY>::create_charset_aux( basic_string<iterator_traits<CI>::value_type> & str, basic_string<iterator_traits<CI>::value_type>::iterator & icur, unsigned flags ) { bool fcomplement = false; match_charset<CI> * pnew = NULL; basic_string<iterator_traits<CI>::value_type>::iterator itemp = icur;
if( itemp != str.end() && CHARSET_NEGATE == SY::charset_token( itemp, str.end() ) ) { fcomplement = true; icur = itemp; }
switch( ( NOCASE | CSTRINGS ) & flags ) { case 0: pnew = new match_custom_charset_t<eos_t<CI>,match_range_with_case>( fcomplement, icur, str.end(), flags, SY() ); break; case NOCASE: pnew = new match_custom_charset_t<eos_t<CI>,match_range_no_case>( fcomplement, icur, str.end(), flags, SY() ); break; case CSTRINGS: pnew = new match_custom_charset_t<eocs_t<CI>,match_range_with_case>( fcomplement, icur, str.end(), flags, SY() ); break; case NOCASE | CSTRINGS: pnew = new match_custom_charset_t<eocs_t<CI>,match_range_no_case>( fcomplement, icur, str.end(), flags, SY() ); break; default: __assume(0); // tells the compiler that this is unreachable
}
return pnew; }
#pragma warning( disable : 4660 )
// Explicit instantiation
#ifdef REGEX_FORCE_INSTANTIATION
template class basic_regexpr<char>; template class basic_regexpr<wint_t>; #else
template class basic_regexpr<TCHAR>; #endif
#ifndef NO_PERL_RE
#ifdef REGEX_FORCE_INSTANTIATION
template class basic_rpattern<const char *>; template class basic_rpattern<const wint_t *>; template class basic_rpattern<string::const_iterator>; template class basic_rpattern<wstring::const_iterator>; #else
template class basic_rpattern<const TCHAR *>; template class basic_rpattern<tstring::const_iterator>; #endif
#endif
#ifdef POSIX_RE
#ifdef REGEX_FORCE_INSTANTIATION
template class basic_rpattern<const char *,posix_syntax<char> >; template class basic_rpattern<const wint_t *,posix_syntax<wint_t> >; template class basic_rpattern<string::const_iterator,posix_syntax<char> >; template class basic_rpattern<wstring::const_iterator,posix_syntax<wint_t> >; #else
template class basic_rpattern<const TCHAR *,posix_syntax<TCHAR> >; template class basic_rpattern<tstring::const_iterator,posix_syntax<TCHAR> >; #endif
#endif
} // namespace regex
|