You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
3154 lines
100 KiB
3154 lines
100 KiB
//+---------------------------------------------------------------------------
|
|
//
|
|
// File: basic_regexpr.cxx
|
|
//
|
|
// Contents:
|
|
//
|
|
// Classes:
|
|
//
|
|
// Functions:
|
|
//
|
|
// Coupling:
|
|
//
|
|
// Notes:
|
|
//
|
|
// History: 1-11-1999 ericne Created
|
|
//
|
|
//----------------------------------------------------------------------------
|
|
#include "stdafx.h"
|
|
#pragma hdrstop
|
|
// unlimited inline expansion (compile with /Ob1 or /Ob2)
|
|
#pragma inline_depth(255)
|
|
|
|
// C4355 'this' : used in base member initializer list
|
|
// C4660 template-class specialization 'foo<bar>' is already instantiated
|
|
// C4786 identifier was truncated to '255' characters in the debug information
|
|
// C4800 'int' : forcing value to bool 'true' or 'false' (performance warning)
|
|
#pragma warning( disable : 4355 4660 4786 4800 )
|
|
|
|
#include <assert.h>
|
|
#include <malloc.h> // for _alloca
|
|
#include <algorithm>
|
|
#include <minmax.h>
|
|
#include "regexpr.h"
|
|
|
|
using namespace std;
|
|
|
|
namespace regex
|
|
{
|
|
|
|
#ifdef _MT
|
|
// Global critical section used to synchronize the creation of static const patterns
|
|
CRegExCritSect g_objRegExCritSect;
|
|
#endif
|
|
|
|
// For use while doing uppercase/lowercase conversions:
|
|
// For use while doing uppercase/lowercase conversions:
|
|
inline char to_upper( char ch ) { return ( char )toupper(ch); }
|
|
inline char to_lower( char ch ) { return ( char )tolower(ch); }
|
|
inline wint_t to_upper( wint_t ch ) { return (wint_t)towupper(ch); }
|
|
inline wint_t to_lower( wint_t ch ) { return (wint_t)towlower(ch); }
|
|
template< typename II, typename CI >
|
|
void to_upper( II ibegin, CI iend )
|
|
{
|
|
for( ; (CI)ibegin != iend; ++ibegin )
|
|
*ibegin = to_upper( *ibegin );
|
|
}
|
|
template< typename II, typename CI >
|
|
void to_lower( II ibegin, CI iend )
|
|
{
|
|
for( ; (CI)ibegin != iend; ++ibegin )
|
|
*ibegin = to_lower( *ibegin );
|
|
}
|
|
|
|
template< typename II, typename CI >
|
|
unsigned parse_int( II & istr, CI iend, const unsigned m_max = -1 )
|
|
{
|
|
unsigned retval = 0;
|
|
while( (CI)istr != iend && '0' <= *istr && '9' >= *istr && m_max > retval )
|
|
{
|
|
retval = retval * 10 + ( (unsigned)*istr++ - (unsigned)'0' );
|
|
}
|
|
if( m_max < retval )
|
|
{
|
|
retval /= 10;
|
|
--istr;
|
|
}
|
|
return retval;
|
|
}
|
|
|
|
// This class is used to speed up character set matching by providing
|
|
// a bitset that spans the ASCII range. std::bitset is not used because
|
|
// the range-checking slows it down.
|
|
// Note: The division and modulus operations are optimized by the compiler
|
|
// into bit-shift operations.
|
|
class ascii_bitvector
|
|
{
|
|
typedef unsigned __int32 elem_type; // use 32-bit ints on 32-bit platforms
|
|
//typedef unsigned __int64 elem_type; // use 64-bit ints on 64-bit platforms
|
|
|
|
enum { CBELEM = 8 * sizeof elem_type, // count of bytes per element
|
|
CELEMS = (UCHAR_MAX+1) / CBELEM }; // number of element in array
|
|
elem_type m_rg[ CELEMS ];
|
|
|
|
// Used to inline operations like: bv1 |= ~bv2; without creating temp bit vectors.
|
|
struct not_ascii_bitvector
|
|
{
|
|
const ascii_bitvector & m_ref;
|
|
not_ascii_bitvector( const ascii_bitvector & ref ) throw()
|
|
: m_ref(ref) {}
|
|
};
|
|
public:
|
|
ascii_bitvector() throw()
|
|
{ memset( m_rg, 0, CELEMS * sizeof elem_type ); }
|
|
|
|
inline void set( unsigned char ch ) throw()
|
|
{ m_rg[ ( ch / CBELEM ) ] |= ( (elem_type)1U << ( ch % CBELEM ) ); }
|
|
|
|
inline bool operator[]( unsigned char ch ) const throw()
|
|
{ return 0 != ( m_rg[ ( ch / CBELEM ) ] & ( (elem_type)1U << ( ch % CBELEM ) ) ); }
|
|
|
|
inline not_ascii_bitvector operator~() const throw()
|
|
{ return not_ascii_bitvector(*this); }
|
|
|
|
inline ascii_bitvector & operator|=( const ascii_bitvector & that ) throw()
|
|
{ for( int i=0; i<CELEMS; ++i )
|
|
m_rg[i] |= that.m_rg[i];
|
|
return *this; }
|
|
|
|
inline ascii_bitvector & operator|=( const not_ascii_bitvector & that ) throw()
|
|
{ for( int i=0; i<CELEMS; ++i )
|
|
m_rg[i] |= ~that.m_ref.m_rg[i];
|
|
return *this; }
|
|
};
|
|
|
|
const ascii_bitvector & get_digit_vector(void)
|
|
{
|
|
// 0-9
|
|
class digit_vector : public ascii_bitvector
|
|
{
|
|
public:
|
|
digit_vector()
|
|
{
|
|
unsigned char ich;
|
|
for( ich ='0'; ich <= '9'; ++ich )
|
|
set(ich);
|
|
}
|
|
};
|
|
|
|
static const digit_vector s_digit_vector;
|
|
return s_digit_vector;
|
|
}
|
|
|
|
const ascii_bitvector & get_word_vector(void)
|
|
{
|
|
// a-zA-Z_0-9
|
|
class word_vector : public ascii_bitvector
|
|
{
|
|
public:
|
|
word_vector()
|
|
{
|
|
unsigned char ich;
|
|
for( ich = 'a'; ich <= 'z'; ++ich )
|
|
set(ich);
|
|
for( ich = 'A'; ich <= 'Z'; ++ich )
|
|
set(ich);
|
|
for( ich = '0'; ich <= '9'; ++ich )
|
|
set(ich);
|
|
set('_');
|
|
}
|
|
};
|
|
|
|
static const word_vector s_word_vector;
|
|
return s_word_vector;
|
|
}
|
|
|
|
const ascii_bitvector & get_space_vector(void)
|
|
{
|
|
// " \t\r\n\f"
|
|
class space_vector : public ascii_bitvector
|
|
{
|
|
public:
|
|
space_vector()
|
|
{
|
|
set(' ');
|
|
set('\t');
|
|
set('\v');
|
|
set('\r');
|
|
set('\n');
|
|
set('\f');
|
|
}
|
|
};
|
|
|
|
static const space_vector s_space_vector;
|
|
return s_space_vector;
|
|
}
|
|
|
|
//
|
|
// Operator implementations
|
|
//
|
|
|
|
// Base type used so that all derived operators share typedefs.
|
|
template< typename CI >
|
|
struct op_t : public binary_function<match_param<CI>,CI,bool>
|
|
{
|
|
typedef CI const_iterator;
|
|
typedef typename iterator_traits<CI>::value_type char_type;
|
|
};
|
|
|
|
// Evaluates the beginning-of-string condition
|
|
template< typename CI >
|
|
struct bos_t : public op_t<CI>
|
|
{
|
|
inline bool operator()( const match_param<CI> & param, CI iter ) const
|
|
{
|
|
return param.ibegin == iter;
|
|
}
|
|
};
|
|
|
|
// Find the beginning of a line, either beginning of a string, or the character
|
|
// immediately following a newline
|
|
template< typename CI >
|
|
struct bol_t : public bos_t<CI>
|
|
{
|
|
inline bool operator()( const match_param<CI> & param, CI iter ) const
|
|
{
|
|
return bos_t<CI>::operator()(param,iter) || char_type('\n') == *--iter;
|
|
}
|
|
};
|
|
|
|
// Evaluates end-of-string condition for string's
|
|
template< typename CI >
|
|
struct eos_t : public op_t<CI>
|
|
{
|
|
inline bool operator()( const match_param<CI> & param, CI iter ) const
|
|
{
|
|
return param.istop == iter;
|
|
}
|
|
};
|
|
|
|
// Evaluates end-of-string condidition for C-style string's when the length is unknown by
|
|
// looking for the null-terminator.
|
|
template< typename CI >
|
|
struct eocs_t : public op_t<CI>
|
|
{
|
|
inline bool operator()( const match_param<CI> & param, CI iter ) const
|
|
{
|
|
return char_type('\0') == *iter;
|
|
}
|
|
};
|
|
|
|
// Evaluates end-of-line conditions, either the end of the string, or a
|
|
// return or newline character.
|
|
template< typename EOS >
|
|
struct eol_t_t : public EOS
|
|
{
|
|
typedef typename EOS::const_iterator CI;
|
|
inline bool operator()( const match_param<CI> & param, CI iter ) const
|
|
{
|
|
return EOS::operator()(param,iter) || char_type('\n') == *iter || char_type('\r') == *iter;
|
|
}
|
|
};
|
|
|
|
template< typename CI > struct eol_t : public eol_t_t<eos_t<CI> > {};
|
|
template< typename CI > struct eocl_t : public eol_t_t<eocs_t<CI> > {};
|
|
|
|
// Evaluates perl's end-of-string conditions, either the end of the string, or a
|
|
// newline character followed by end of string. (Only used by $ and /Z assertions)
|
|
template< typename EOS >
|
|
struct peos_t_t : public EOS
|
|
{
|
|
typedef typename EOS::const_iterator CI;
|
|
inline bool operator()( const match_param<CI> & param, CI iter ) const
|
|
{
|
|
return EOS::operator()(param,iter) || ( ( char_type('\n') == *iter ) && EOS::operator()(param,++iter) );
|
|
}
|
|
};
|
|
|
|
template< typename CI > struct peos_t : public peos_t_t<eos_t<CI> > {};
|
|
template< typename CI > struct peocs_t : public peos_t_t<eocs_t<CI> > {};
|
|
|
|
// compare two characters, case-sensitive
|
|
template< typename CH >
|
|
struct ch_neq_t : public binary_function<CH, CH, bool>
|
|
{
|
|
typedef CH char_type;
|
|
inline bool operator()( register CH ch1, register CH ch2 ) const
|
|
{
|
|
return ch1 != ch2;
|
|
}
|
|
};
|
|
|
|
// Compare two characters, disregarding case
|
|
template< typename CH >
|
|
struct ch_neq_nocase_t : public binary_function<CH, CH, bool>
|
|
{
|
|
typedef CH char_type;
|
|
inline bool operator()( register CH ch1, register CH ch2 ) const
|
|
{
|
|
return to_upper(ch1) != to_upper(ch2);
|
|
}
|
|
};
|
|
|
|
//
|
|
// Helper functions for match and substitute
|
|
//
|
|
|
|
template< typename CI >
|
|
size_t string_length( CI iter )
|
|
{
|
|
size_t n = 0;
|
|
while( 0 != *iter++ )
|
|
++n;
|
|
return n;
|
|
}
|
|
|
|
template< typename CI >
|
|
backref_tag<CI> _do_match( const basic_rpattern_base<CI> & pat, match_param<CI> & param ) throw()
|
|
{
|
|
typedef typename iterator_traits<CI>::value_type char_type;
|
|
|
|
bool floop = pat.loops();
|
|
unsigned flags = pat.flags();
|
|
width_type nwidth = pat.get_width();
|
|
|
|
const sub_expr<CI> * pfirst = pat.get_first_subexpression();
|
|
|
|
try
|
|
{
|
|
vector<backref_tag<CI> > rgbackrefs; // dummy backref vector
|
|
|
|
if( NULL == param.prgbackrefs )
|
|
param.prgbackrefs = & rgbackrefs;
|
|
|
|
param.prgbackrefs->resize( pat._cgroups_total() );
|
|
fill( param.prgbackrefs->begin(), param.prgbackrefs->end(), backref_tag<CI>() );
|
|
|
|
// If a pattern is optimized for CSTRINGS, it can save a call
|
|
// to calculate the length of the string.
|
|
if( CI(0) == param.istop && ( ( RIGHTMOST & flags ) || ( 0 == ( CSTRINGS & flags ) ) ) )
|
|
param.istop = param.istart + string_length( param.istart );
|
|
|
|
if( CI(0) != param.istop )
|
|
{
|
|
// If the minimum width of the pattern exceeds the width of the
|
|
// string, a succesful match is impossible
|
|
if( nwidth.m_min <= (size_t)distance( param.istart, param.istop ) )
|
|
{
|
|
CI local_istop = param.istop;
|
|
advance( local_istop, -int( nwidth.m_min ) );
|
|
|
|
if( RIGHTMOST & flags )
|
|
{
|
|
// begin trying to match after the last character.
|
|
// Continue to the beginning
|
|
for( CI icur = local_istop; icur >= param.istart; --icur )
|
|
if( pfirst->domatch( param, icur ) )
|
|
break; // m_floop not used for rightmost matches
|
|
}
|
|
else
|
|
{
|
|
// begin trying to match before the first character.
|
|
// Continue to the end
|
|
for( CI icur = param.istart; icur <= local_istop; ++icur )
|
|
if( pfirst->domatch( param, icur ) || ! floop )
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
else
|
|
{
|
|
// begin trying to match before the first character.
|
|
// Continue to the end
|
|
for( CI icur = param.istart; ; ++icur )
|
|
{
|
|
if( pfirst->domatch( param, icur ) || ! floop )
|
|
break;
|
|
if( char_type('\0') == *icur )
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
catch(std::bad_alloc) // bad alloc?
|
|
{
|
|
fill( param.prgbackrefs->begin(), param.prgbackrefs->end(), backref_tag<CI>() );
|
|
}
|
|
|
|
// Shrink the backref vector to chop off information about the "invisible" groups
|
|
param.prgbackrefs->resize( pat.cgroups() );
|
|
|
|
return (*param.prgbackrefs)[0];
|
|
}
|
|
|
|
template< typename CI, typename CH, typename TR, typename AL >
|
|
size_t _do_subst( basic_regexpr<CH,TR,AL> & str, const basic_rpattern_base<CI> & pat, size_t strpos, size_t strlen ) throw(bad_alloc)
|
|
{
|
|
typedef iterator_traits<CI>::value_type char_type;
|
|
typedef list<subst_node>::const_iterator LCI;
|
|
enum { UPPER = -1, NIL, LOWER } next = NIL, rest = NIL;
|
|
bool first = true;
|
|
size_t old_strpos = strpos;
|
|
const list<subst_node> & subst_list = pat.get_subst_list();
|
|
basic_string<CH,TR,AL>::iterator itstrlen = str.begin();
|
|
advance( itstrlen, strpos + strlen );
|
|
const basic_string<char_type> & subst = pat.get_subst();
|
|
push_new_handler pnh( &my_new_handler );
|
|
|
|
for( LCI isubst = subst_list.begin(); isubst != subst_list.end(); ++isubst )
|
|
{
|
|
size_t sublen;
|
|
basic_string<CH,TR,AL>::const_iterator itsubpos1; // iter into str
|
|
basic_string<CH,TR,AL>::const_iterator itsublen1;
|
|
basic_string<char_type>::const_iterator itsubpos2; // iter into subst string
|
|
basic_string<char_type>::const_iterator itsublen2;
|
|
basic_string<CH,TR,AL>::iterator itstrpos = str.begin();
|
|
advance( itstrpos, strpos );
|
|
|
|
switch( isubst->stype )
|
|
{
|
|
case subst_node::SUBST_STRING:
|
|
itsubpos2 = subst.begin();
|
|
advance( itsubpos2, isubst->subst_string.rstart );
|
|
itsublen2 = itsubpos2;
|
|
advance( itsublen2, isubst->subst_string.rlength );
|
|
|
|
first ? str.replace( itstrpos, itstrlen, itsubpos2, itsublen2 ) :
|
|
str.insert( itstrpos, itsubpos2, itsublen2 );
|
|
sublen = distance( itsubpos2, itsublen2 );
|
|
break;
|
|
|
|
case subst_node::SUBST_BACKREF:
|
|
switch( isubst->subst_backref )
|
|
{
|
|
case subst_node::PREMATCH:
|
|
itsubpos1 = str.backref_str().begin();
|
|
itsublen1 = itsubpos1;
|
|
advance( itsublen1, sublen = str.rstart() );
|
|
break;
|
|
case subst_node::POSTMATCH:
|
|
itsubpos1 = str.backref_str().begin();
|
|
advance( itsubpos1, str.rstart() + str.rlength() );
|
|
itsublen1 = str.backref_str().end();
|
|
break;
|
|
default:
|
|
itsubpos1 = str.backref_str().begin();
|
|
advance( itsubpos1, str.rstart( isubst->subst_backref ) );
|
|
itsublen1 = itsubpos1;
|
|
advance( itsublen1, str.rlength( isubst->subst_backref ) );
|
|
break;
|
|
}
|
|
|
|
first ? str.replace( itstrpos, itstrlen, itsubpos1, itsublen1 ) :
|
|
str.insert( itstrpos, itsubpos1, itsublen1 );
|
|
sublen = distance( itsubpos1, itsublen1 );
|
|
break;
|
|
|
|
case subst_node::SUBST_OP:
|
|
switch( isubst->op )
|
|
{
|
|
case subst_node::UPPER_ON:
|
|
rest = UPPER;
|
|
break;
|
|
case subst_node::UPPER_NEXT:
|
|
next = UPPER;
|
|
break;
|
|
case subst_node::LOWER_ON:
|
|
rest = LOWER;
|
|
break;
|
|
case subst_node::LOWER_NEXT:
|
|
next = LOWER;
|
|
break;
|
|
case subst_node::ALL_OFF:
|
|
rest = NIL;
|
|
break;
|
|
default:
|
|
__assume(0);
|
|
}
|
|
continue; // jump to the next item in the list
|
|
|
|
default:
|
|
__assume(0);
|
|
}
|
|
|
|
first = false;
|
|
|
|
// Are we upper- or lower-casing this string?
|
|
if( rest )
|
|
{
|
|
basic_string<CH,TR,AL>::iterator istart = str.begin();
|
|
advance( istart, strpos );
|
|
basic_string<CH,TR,AL>::const_iterator istop = istart;
|
|
advance( istop, sublen );
|
|
switch( rest )
|
|
{
|
|
case UPPER:
|
|
to_upper( istart, istop );
|
|
break;
|
|
case LOWER:
|
|
to_lower( istart, istop );
|
|
break;
|
|
default:
|
|
__assume(0);
|
|
}
|
|
}
|
|
|
|
// Are we upper- or lower-casing the next character?
|
|
if( next )
|
|
{
|
|
switch( next )
|
|
{
|
|
case UPPER:
|
|
str[strpos] = to_upper(str[strpos]);
|
|
break;
|
|
case LOWER:
|
|
str[strpos] = to_lower(str[strpos]);
|
|
break;
|
|
default:
|
|
__assume(0);
|
|
}
|
|
next = NIL;
|
|
}
|
|
|
|
strpos += sublen;
|
|
}
|
|
|
|
// If *first* is still true, then we never called str.replace, and the substitution
|
|
// string is empty. Erase the part of the string that the pattern matched.
|
|
if( first )
|
|
str.erase( strpos, strlen );
|
|
|
|
// return length of the substitution
|
|
return strpos - old_strpos;
|
|
}
|
|
|
|
//
|
|
// Implementation of basic_regexpr
|
|
//
|
|
|
|
template< typename CH, typename TR, typename AL >
|
|
size_t basic_regexpr<CH,TR,AL>::substitute(
|
|
const basic_rpattern_base<typename basic_regexpr<CH,TR,AL>::const_iterator> & pat,
|
|
size_type pos,
|
|
size_type len ) throw(bad_alloc)
|
|
{
|
|
if( pat.flags() & CSTRINGS )
|
|
{
|
|
assert( ! "You can't use a pattern optimized for CSTRINGS with regexpr::substitute" );
|
|
return 0;
|
|
}
|
|
|
|
backref_vector rgbackrefs; // dummy backref vector
|
|
backref_vector * prgbackrefs = & rgbackrefs;
|
|
const bool fsave_backrefs = ( pat.uses_backrefs() || !( pat.flags() & NOBACKREFS ) );
|
|
|
|
if( fsave_backrefs )
|
|
{
|
|
prgbackrefs = & m_rgbackrefs;
|
|
m_pbackref_str = & ( m_backref_str = *this );
|
|
}
|
|
else
|
|
{
|
|
m_backref_str.erase();
|
|
m_pbackref_str = this;
|
|
m_rgbackrefs.resize( 0 );
|
|
}
|
|
|
|
backref_type br;
|
|
size_t csubst = 0;
|
|
long stop_offset = ( len == npos ?
|
|
m_pbackref_str->size() :
|
|
min( pos + len, m_pbackref_str->size() ) );
|
|
|
|
match_param<const_iterator> param( m_pbackref_str->begin(),
|
|
m_pbackref_str->begin(),
|
|
prgbackrefs );
|
|
advance( param.istart, pos );
|
|
advance( param.istop, stop_offset );
|
|
param.ibegin = param.istart;
|
|
|
|
if( GLOBAL & pat.flags() )
|
|
{
|
|
const bool fAll = ( ALLBACKREFS == ( ALLBACKREFS & pat.flags() ) );
|
|
const bool fFirst = ( FIRSTBACKREFS == ( FIRSTBACKREFS & pat.flags() ) );
|
|
backref_vector rgtempbackrefs; // temporary vector used if fsave_backrefs
|
|
|
|
long pos_offset = 0; // keep track of how much the backref_str and
|
|
// the current string are out of sync
|
|
|
|
while( br = _do_match( pat, param ) )
|
|
{
|
|
++csubst;
|
|
size_type match_length = distance( br.first, br.second );
|
|
pos = distance( m_pbackref_str->begin(), br.first );
|
|
size_type subst_length = _do_subst( *this, pat, pos + pos_offset, match_length );
|
|
|
|
if( fsave_backrefs )
|
|
{
|
|
pos += match_length;
|
|
pos_offset += ( subst_length - match_length );
|
|
|
|
// Handle specially the backref flags
|
|
if( fFirst )
|
|
rgtempbackrefs.push_back( br );
|
|
else if( fAll )
|
|
rgtempbackrefs.insert( rgtempbackrefs.end(),
|
|
param.prgbackrefs->begin(),
|
|
param.prgbackrefs->end() );
|
|
else
|
|
rgtempbackrefs.swap( *param.prgbackrefs );
|
|
}
|
|
else
|
|
{
|
|
pos += subst_length;
|
|
stop_offset += ( subst_length - match_length );
|
|
|
|
// we're not saving backref information, so we don't
|
|
// need to do any special backref maintenance here
|
|
}
|
|
|
|
// prevent a pattern that matches 0 characters from matching
|
|
// again at the same point in the string
|
|
if( 0 == match_length )
|
|
{
|
|
if( br.first == param.istop ) // We're at the end, so we're done
|
|
break;
|
|
++pos;
|
|
}
|
|
|
|
param.istart = m_pbackref_str->begin();
|
|
advance( param.istart, pos ); // ineffecient for bidirectional iterators.
|
|
|
|
param.istop = m_pbackref_str->begin();
|
|
advance( param.istop, stop_offset ); // ineffecient for bidirectional iterators.
|
|
}
|
|
|
|
// If we did special backref handling, swap the backref vectors
|
|
if( fsave_backrefs && ( !br || fFirst || fAll ) )
|
|
param.prgbackrefs->swap( rgtempbackrefs );
|
|
else if( ! (*param.prgbackrefs)[0] )
|
|
param.prgbackrefs->clear();
|
|
}
|
|
else if( br = _do_match( pat, param ) )
|
|
{
|
|
++csubst;
|
|
_do_subst( *this, pat,
|
|
distance( m_pbackref_str->begin(), br.first ),
|
|
distance( br.first, br.second ) );
|
|
}
|
|
|
|
if( NOBACKREFS == ( pat.flags() & NOBACKREFS ) )
|
|
param.prgbackrefs->clear();
|
|
|
|
return csubst;
|
|
}
|
|
|
|
//
|
|
// Helper functions called from both basic_regexpr match methods
|
|
//
|
|
|
|
template< typename EOS >
|
|
backref_tag< typename EOS::const_iterator > _match_helper(
|
|
const basic_rpattern_base<typename EOS::const_iterator> & pat,
|
|
match_param<typename EOS::const_iterator> & param,
|
|
EOS eos )
|
|
{
|
|
typedef typename EOS::const_iterator CI;
|
|
|
|
if( GLOBAL & pat.flags() ) // do a global find
|
|
{
|
|
// The NOBACKREFS flag is ignored in the match method.
|
|
const bool fAll = ( ALLBACKREFS == ( ALLBACKREFS & pat.flags() ) );
|
|
const bool fFirst = ( FIRSTBACKREFS == ( FIRSTBACKREFS & pat.flags() ) );
|
|
|
|
backref_tag<CI> br;
|
|
vector<backref_tag<CI> > rgtempbackrefs;
|
|
|
|
while( br = _do_match( pat, param ) )
|
|
{
|
|
// Handle specially the backref flags
|
|
if( fFirst )
|
|
rgtempbackrefs.push_back( br );
|
|
else if( fAll )
|
|
rgtempbackrefs.insert( rgtempbackrefs.end(),
|
|
param.prgbackrefs->begin(),
|
|
param.prgbackrefs->end() );
|
|
else
|
|
rgtempbackrefs.swap( *param.prgbackrefs );
|
|
|
|
if( br.first == ( param.istart = br.second ) )
|
|
{
|
|
if( eos( param, param.istart ) )
|
|
break;
|
|
++param.istart;
|
|
}
|
|
}
|
|
|
|
// restore the backref vectors
|
|
if( !br || fFirst || fAll )
|
|
param.prgbackrefs->swap( rgtempbackrefs );
|
|
else if( ! (*param.prgbackrefs)[0] )
|
|
param.prgbackrefs->clear();
|
|
|
|
return param.prgbackrefs->empty() ? backref_tag<CI>() : (*param.prgbackrefs)[0];
|
|
}
|
|
else
|
|
return _do_match( pat, param );
|
|
}
|
|
|
|
template< typename CH, typename TR, typename AL >
|
|
typename basic_regexpr<CH,TR,AL>::backref_type basic_regexpr<CH,TR,AL>::match(
|
|
const basic_rpattern_base<const_iterator> & pat,
|
|
size_type pos, size_type len ) const throw()
|
|
{
|
|
if( pat.flags() & CSTRINGS )
|
|
{
|
|
assert( ! "A pattern optimized for CSTRINGS can only be used with the static regexpr::match method" );
|
|
return backref_type();
|
|
}
|
|
|
|
m_pbackref_str = this;
|
|
m_backref_str.erase(); // free up unused memory
|
|
|
|
const_iterator istart = begin();
|
|
advance( istart, pos );
|
|
|
|
const_iterator istop;
|
|
if( len == npos || pos + len >= size() )
|
|
istop = end();
|
|
else
|
|
advance( istop = begin(), pos + len );
|
|
|
|
match_param<const_iterator> param( istart, istop, & m_rgbackrefs );
|
|
return _match_helper<eos_t<const_iterator> >( pat, param, eos_t<const_iterator>() );
|
|
}
|
|
|
|
template< typename CH >
|
|
backref_tag<const CH *> _static_match_helper(
|
|
const CH * szstr,
|
|
const basic_rpattern_base<const CH *> & pat,
|
|
vector< backref_tag< const CH * > > * prgbackrefs ) throw()
|
|
{
|
|
vector< backref_tag< const CH * > > rgdummyvector;
|
|
if( NULL == prgbackrefs )
|
|
prgbackrefs = &rgdummyvector;
|
|
|
|
match_param<const CH *> param( szstr, NULL, prgbackrefs );
|
|
return _match_helper<eocs_t<const CH *> >( pat, param, eocs_t<const CH *>() );
|
|
}
|
|
|
|
//
|
|
// Helper function called from both basic_regexpr::count methods
|
|
//
|
|
template< typename EOS >
|
|
size_t _count_helper(
|
|
const basic_rpattern_base<typename EOS::const_iterator> & pat,
|
|
match_param<typename EOS::const_iterator> & param,
|
|
EOS eos )
|
|
{
|
|
typedef typename EOS::const_iterator CI;
|
|
|
|
size_t cmatches = 0;
|
|
vector<backref_tag<CI> > rgbackrefs; // dummy backref vector
|
|
backref_tag<CI> br;
|
|
param.prgbackrefs = &rgbackrefs;
|
|
|
|
while( br = _do_match( pat, param ) )
|
|
{
|
|
++cmatches;
|
|
|
|
if( br.first == ( param.istart = br.second ) )
|
|
{
|
|
if( eos( param, param.istart ) )
|
|
break;
|
|
++param.istart;
|
|
}
|
|
}
|
|
return cmatches;
|
|
}
|
|
|
|
template< typename CH, typename TR, typename AL >
|
|
size_t basic_regexpr<CH,TR,AL>::count(
|
|
const basic_rpattern_base<typename basic_regexpr<CH,TR,AL>::const_iterator> & pat,
|
|
size_type pos,
|
|
size_type len ) const throw()
|
|
{
|
|
if( pat.flags() & CSTRINGS )
|
|
{
|
|
assert( ! "A pattern optimized for CSTRINGS can only be used with the static regexpr::count method" );
|
|
return backref_type();
|
|
}
|
|
|
|
m_pbackref_str = this;
|
|
|
|
const_iterator istart = begin();
|
|
advance( istart, pos );
|
|
|
|
const_iterator istop;
|
|
if( len == npos || pos + len >= size() )
|
|
istop = end();
|
|
else
|
|
advance( istop = begin(), pos + len );
|
|
|
|
match_param<const_iterator> param( istart, istop, NULL );
|
|
return _count_helper<eos_t<const_iterator> >( pat, param, eos_t<const_iterator>() );
|
|
}
|
|
|
|
template< typename CH >
|
|
size_t _static_count_helper(
|
|
const CH * szstr,
|
|
const basic_rpattern_base<const CH *> & pat ) throw()
|
|
{
|
|
match_param<const CH *> param( szstr, NULL, NULL );
|
|
return _count_helper<eocs_t<const CH *> >( pat, param, eocs_t<const CH *>() );
|
|
}
|
|
|
|
// Base class for sub-expressions which are zero-width
|
|
// (i.e., assertions eat no characters during matching)
|
|
// Assertions cannot be quantified.
|
|
template< typename CI >
|
|
class assertion : public sub_expr<CI>
|
|
{
|
|
public:
|
|
virtual ~assertion() {}
|
|
virtual bool is_assertion() const throw() { return true; }
|
|
protected:
|
|
virtual width_type _width_this() throw() { return width_type(0,0); }
|
|
};
|
|
|
|
template< typename OP >
|
|
class assert_op : public assertion<typename OP::const_iterator>
|
|
{
|
|
public:
|
|
typedef OP op_type;
|
|
typedef typename OP::const_iterator CI;
|
|
virtual ~assert_op() {}
|
|
protected:
|
|
virtual bool _match_this( match_param<CI> & param, CI & icur ) const throw()
|
|
{
|
|
return m_op( param, icur );
|
|
}
|
|
op_type m_op;
|
|
};
|
|
|
|
template< typename CI >
|
|
assertion<CI> * create_bos( unsigned /*flags*/ )
|
|
{
|
|
return new assert_op<bos_t<CI> >();
|
|
}
|
|
|
|
template< typename CI >
|
|
assertion<CI> * create_eos( unsigned flags )
|
|
{
|
|
switch( CSTRINGS & flags )
|
|
{
|
|
case 0:
|
|
return new assert_op<peos_t<CI> >();
|
|
case CSTRINGS:
|
|
return new assert_op<peocs_t<CI> >();
|
|
default:
|
|
__assume(0); // tells the compiler that this is unreachable
|
|
}
|
|
}
|
|
|
|
template< typename CI >
|
|
assertion<CI> * create_eoz( unsigned flags )
|
|
{
|
|
switch( CSTRINGS & flags )
|
|
{
|
|
case 0:
|
|
return new assert_op<eos_t<CI> >();
|
|
case CSTRINGS:
|
|
return new assert_op<eocs_t<CI> >();
|
|
default:
|
|
__assume(0); // tells the compiler that this is unreachable
|
|
}
|
|
}
|
|
|
|
template< typename CI >
|
|
assertion<CI> * create_bol( unsigned flags )
|
|
{
|
|
switch( MULTILINE & flags )
|
|
{
|
|
case 0:
|
|
return new assert_op<bos_t<CI> >();
|
|
case MULTILINE:
|
|
return new assert_op<bol_t<CI> >();
|
|
default:
|
|
__assume(0); // tells the compiler that this is unreachable
|
|
}
|
|
}
|
|
|
|
template< typename CI >
|
|
assertion<CI> * create_eol( unsigned flags )
|
|
{
|
|
switch( ( MULTILINE | CSTRINGS ) & flags )
|
|
{
|
|
case 0:
|
|
return new assert_op<peos_t<CI> >();
|
|
case MULTILINE:
|
|
return new assert_op<eol_t<CI> >();
|
|
case CSTRINGS:
|
|
return new assert_op<peocs_t<CI> >();
|
|
case MULTILINE | CSTRINGS:
|
|
return new assert_op<eocl_t<CI> >();
|
|
default:
|
|
__assume(0); // tells the compiler that this is unreachable
|
|
}
|
|
}
|
|
|
|
template< typename CI >
|
|
class match_atom : public sub_expr<CI>
|
|
{
|
|
public:
|
|
match_atom( const typename basic_string<typename sub_expr<CI>::char_type>::iterator istart,
|
|
typename basic_string<typename sub_expr<CI>::char_type>::const_iterator istop )
|
|
: m_istart( istart ), m_istop( istop ) {}
|
|
virtual ~match_atom() {}
|
|
|
|
const typename basic_string<typename sub_expr<CI>::char_type>::iterator m_istart;
|
|
typename typename basic_string<typename sub_expr<CI>::char_type>::const_iterator m_istop;
|
|
protected:
|
|
virtual width_type _width_this() throw()
|
|
{
|
|
size_t width = distance( (basic_string<sub_expr<CI>::char_type>::const_iterator)m_istart, m_istop );
|
|
return width_type( width, width );
|
|
}
|
|
};
|
|
|
|
template< typename EOS >
|
|
class match_atom_t : public match_atom<typename EOS::const_iterator>
|
|
{
|
|
public:
|
|
typedef EOS eos_type;
|
|
typedef typename EOS::const_iterator CI;
|
|
match_atom_t( const typename basic_string<typename sub_expr<CI>::char_type>::iterator istart,
|
|
typename basic_string<typename sub_expr<CI>::char_type>::const_iterator istop )
|
|
: match_atom<CI>( istart, istop ) {}
|
|
virtual ~match_atom_t() {}
|
|
protected:
|
|
virtual bool _match_this( match_param<CI> & param, CI & icur ) const throw()
|
|
{
|
|
CI icur_tmp = icur;
|
|
basic_string<sub_expr<CI>::char_type>::const_iterator ithis = m_istart;
|
|
for( ; ithis != m_istop; ++icur_tmp, ++ithis )
|
|
{
|
|
if( m_eos( param, icur_tmp ) || *ithis != *icur_tmp )
|
|
return false;
|
|
}
|
|
icur = icur_tmp;
|
|
return true;
|
|
}
|
|
eos_type m_eos;
|
|
};
|
|
|
|
template< typename EOS >
|
|
class match_atom_nocase_t : public match_atom<typename EOS::const_iterator>
|
|
{
|
|
public:
|
|
typedef EOS eos_type;
|
|
typedef typename EOS::const_iterator CI;
|
|
match_atom_nocase_t( typename const basic_string<typename sub_expr<CI>::char_type>::iterator istart,
|
|
typename basic_string<typename sub_expr<CI>::char_type>::const_iterator istop )
|
|
: match_atom<CI>( istart, istop ), m_strlower( (basic_string<sub_expr<CI>::char_type>::const_iterator)istart, istop )
|
|
{
|
|
// Store the uppercase version of the atom in [m_istart,m_istop).
|
|
to_upper( m_istart, m_istop );
|
|
// Store the lowercase version of the atom in m_strlower.
|
|
to_lower( m_strlower.begin(), m_strlower.end() );
|
|
}
|
|
virtual ~match_atom_nocase_t() {}
|
|
|
|
protected:
|
|
virtual bool _match_this( match_param<CI> & param, CI & icur ) const throw()
|
|
{
|
|
CI icur_tmp = icur;
|
|
basic_string<sub_expr<CI>::char_type>::const_iterator ithisu = m_istart; // uppercase
|
|
basic_string<sub_expr<CI>::char_type>::const_iterator ithisl = m_strlower.begin(); // lowercase
|
|
for( ; ithisu != m_istop; ++icur_tmp, ++ithisu, ++ithisl )
|
|
{
|
|
if( m_eos( param, icur_tmp ) || ( *ithisu != *icur_tmp && *ithisl != *icur_tmp ) )
|
|
return false;
|
|
}
|
|
icur = icur_tmp;
|
|
return true;
|
|
}
|
|
eos_type m_eos;
|
|
basic_string<typename sub_expr<CI>::char_type> m_strlower;
|
|
};
|
|
|
|
template< typename CI >
|
|
match_atom<CI> * create_atom(
|
|
const typename basic_string<typename iterator_traits<CI>::value_type>::iterator istart,
|
|
typename basic_string<typename iterator_traits<CI>::value_type>::const_iterator istop,
|
|
unsigned flags )
|
|
{
|
|
switch( ( NOCASE | CSTRINGS ) & flags )
|
|
{
|
|
case 0:
|
|
return new match_atom_t<eos_t<CI> >( istart, istop );
|
|
case NOCASE:
|
|
return new match_atom_nocase_t<eos_t<CI> >( istart, istop );
|
|
case CSTRINGS:
|
|
return new match_atom_t<eocs_t<CI> >( istart, istop );
|
|
case NOCASE | CSTRINGS:
|
|
return new match_atom_nocase_t<eocs_t<CI> >( istart, istop );
|
|
default:
|
|
__assume(0); // tells the compiler that this is unreachable
|
|
}
|
|
}
|
|
|
|
template< typename CI >
|
|
match_atom<CI> * create_atom(
|
|
const typename basic_string<typename iterator_traits<CI>::value_type>::iterator istart,
|
|
unsigned flags )
|
|
{
|
|
basic_string<iterator_traits<CI>::value_type>::const_iterator istop = istart;
|
|
return create_atom<CI>( istart, ++istop, flags );
|
|
}
|
|
|
|
template< typename CI >
|
|
class match_any : public sub_expr<CI>
|
|
{
|
|
public:
|
|
virtual ~match_any() {}
|
|
protected:
|
|
virtual width_type _width_this() throw() { return width_type(1,1); }
|
|
};
|
|
|
|
template< typename EOS >
|
|
class match_any_t : public match_any<typename EOS::const_iterator>
|
|
{
|
|
public:
|
|
typedef EOS eos_type;
|
|
typedef typename EOS::const_iterator CI;
|
|
virtual ~match_any_t() {}
|
|
protected:
|
|
virtual bool _match_this( match_param<CI> & param, CI & icur ) const throw()
|
|
{
|
|
if( m_eos( param, icur ) )
|
|
return false;
|
|
++icur;
|
|
return true;
|
|
}
|
|
eos_type m_eos;
|
|
};
|
|
|
|
template< typename CI >
|
|
match_any<CI> * create_any( unsigned flags )
|
|
{
|
|
switch( ( SINGLELINE | CSTRINGS ) & flags )
|
|
{
|
|
case 0:
|
|
return new match_any_t<eol_t<CI> >();
|
|
case SINGLELINE:
|
|
return new match_any_t<eos_t<CI> >();
|
|
case CSTRINGS:
|
|
return new match_any_t<eocl_t<CI> >();
|
|
case SINGLELINE | CSTRINGS:
|
|
return new match_any_t<eocs_t<CI> >();
|
|
default:
|
|
__assume(0); // tells the compiler that this is unreachable
|
|
}
|
|
}
|
|
|
|
typedef pair<wint_t,wint_t> range_type;
|
|
const vector<range_type> g_rgranges; // empty
|
|
|
|
template< typename CI >
|
|
class match_charset : public sub_expr<CI>
|
|
{
|
|
public:
|
|
match_charset( bool fcomplement,
|
|
const ascii_bitvector & bvect )
|
|
: m_fcomplement( fcomplement ),
|
|
m_rgascii( bvect ),
|
|
m_rgranges( g_rgranges ),
|
|
m_ncharflags(0) {}
|
|
|
|
// Note that only the references are copied here -- they are not ref counted.
|
|
// Beware of variable lifetime issues.
|
|
match_charset( const match_charset<CI> & that )
|
|
: m_fcomplement( that.m_fcomplement ),
|
|
m_rgascii( that.m_rgascii ),
|
|
m_rgranges( that.m_rgranges ),
|
|
m_ncharflags( that.m_ncharflags ) {}
|
|
|
|
virtual ~match_charset() {}
|
|
|
|
const bool m_fcomplement;
|
|
const ascii_bitvector & m_rgascii; // bitmap for chars in range 0-255
|
|
const vector<range_type> & m_rgranges; // vector of included character ranges 256-65535
|
|
int m_ncharflags; // Parameter to isctype()
|
|
|
|
// The case-sensitivity of a character set is "compiled" into the ascii_bitvector
|
|
// but not into the range vector because it is too computationally expensive. Instead,
|
|
// when doing a unicode case-insensitive match on the ranges vector, two lookups
|
|
// must be performed -- one lowercase and one uppercase. By contrast, only one lookup
|
|
// is needed for the ascii_bitvector.
|
|
|
|
protected:
|
|
|
|
match_charset( bool fcomplement,
|
|
const ascii_bitvector & bvect,
|
|
const vector<range_type> & rgranges )
|
|
: m_fcomplement( fcomplement ),
|
|
m_rgascii( bvect ),
|
|
m_rgranges( rgranges ),
|
|
m_ncharflags(0) {}
|
|
|
|
// this method should never be called. match_charset is only a base class
|
|
// for match_charset_t
|
|
virtual bool _match_this( match_param<CI> & param, CI & icur ) const throw()
|
|
{
|
|
assert(false);
|
|
return true;
|
|
}
|
|
|
|
template< typename SY >
|
|
match_charset<CI> * get_altern_charset( char_type ch, unsigned flags, SY /*sy*/ ) const throw()
|
|
{
|
|
return basic_rpattern<CI,SY>::s_charset_map.get( ch, flags );
|
|
}
|
|
virtual width_type _width_this() throw() { return width_type(1,1); }
|
|
};
|
|
|
|
// Used as a template parameter to find a unicode character in an array of ranges.
|
|
class match_range : public unary_function<wint_t,bool>
|
|
{
|
|
protected:
|
|
const vector<range_type> & m_rgranges;
|
|
|
|
// determines if one range is less then another.
|
|
// used in binary search of range vector
|
|
inline static bool _range_less( const range_type & rg1,
|
|
const range_type & rg2 ) throw()
|
|
{
|
|
return rg1.second < rg2.first;
|
|
}
|
|
|
|
match_range( const vector<range_type> & rgranges )
|
|
: m_rgranges( rgranges ) {}
|
|
};
|
|
|
|
class match_range_with_case : public match_range
|
|
{
|
|
public:
|
|
match_range_with_case( const vector<range_type> & rgranges )
|
|
: match_range( rgranges ) {}
|
|
inline bool operator()( wint_t ch ) const throw()
|
|
{
|
|
return binary_search( m_rgranges.begin(), m_rgranges.end(),
|
|
range_type(ch,ch), _range_less );
|
|
}
|
|
};
|
|
|
|
class match_range_no_case : public match_range
|
|
{
|
|
public:
|
|
match_range_no_case( const vector<range_type> & rgranges )
|
|
: match_range( rgranges ) {}
|
|
inline bool operator()( wint_t ch ) const throw()
|
|
{
|
|
const wint_t chup = towupper( ch );
|
|
if( binary_search( m_rgranges.begin(), m_rgranges.end(),
|
|
range_type(chup,chup), _range_less ) )
|
|
return true;
|
|
|
|
const wint_t chlo = towlower( ch );
|
|
if( chup != chlo &&
|
|
binary_search( m_rgranges.begin(), m_rgranges.end(),
|
|
range_type(chlo,chlo), _range_less ) )
|
|
return true;
|
|
|
|
return false;
|
|
}
|
|
};
|
|
|
|
template< typename EOS, typename RGM >
|
|
class match_charset_t : public match_charset<typename EOS::const_iterator>
|
|
{
|
|
public:
|
|
typedef EOS eos_type;
|
|
typedef RGM range_match_type;
|
|
typedef typename EOS::const_iterator CI;
|
|
|
|
match_charset_t( const match_charset<CI> & that )
|
|
: match_charset<CI>( that ), m_rgm( m_rgranges ) {}
|
|
virtual ~match_charset_t() {}
|
|
|
|
inline bool is_in_charset( char_type ch ) const throw()
|
|
{
|
|
return m_fcomplement != _is_in_charset( ch );
|
|
}
|
|
|
|
protected:
|
|
match_charset_t( bool fcomplement,
|
|
const ascii_bitvector & bvect,
|
|
const vector<range_type> & rgranges )
|
|
: match_charset<CI>( fcomplement, bvect, rgranges ), m_rgm( m_rgranges ) {}
|
|
|
|
// Note overloading based on parameter
|
|
inline bool _is_in_charset( char ch ) const throw()
|
|
{
|
|
return ( m_rgascii[ unsigned char(ch) ] ) ||
|
|
( m_ncharflags && ( _pctype[unsigned char(ch)] & m_ncharflags ) );
|
|
}
|
|
|
|
// Note overloading based on parameter
|
|
inline bool _is_in_charset( wint_t ch ) const throw()
|
|
{
|
|
if( UCHAR_MAX >= ch )
|
|
return _is_in_charset( char(ch) );
|
|
|
|
// use range_match_type to see if this character is within one of the
|
|
// ranges stored in m_rgranges.
|
|
return ( ! m_rgranges.empty() && m_rgm( ch ) ) ||
|
|
( m_ncharflags && iswctype( ch, (int)m_ncharflags ) );
|
|
}
|
|
|
|
virtual bool _match_this( match_param<CI> & param, CI & icur ) const throw()
|
|
{
|
|
if( m_eos( param, icur ) || ! is_in_charset( *icur ) )
|
|
return false;
|
|
++icur;
|
|
return true;
|
|
}
|
|
|
|
// range_match_type encapsulates the case-sensitivity
|
|
// issues with doing a unicode lookup on the ranges vector.
|
|
range_match_type m_rgm;
|
|
eos_type m_eos;
|
|
};
|
|
|
|
template< typename EOS, typename RGM >
|
|
class match_custom_charset_t : public match_charset_t<EOS,RGM>
|
|
{
|
|
public:
|
|
template< typename SY >
|
|
match_custom_charset_t( bool fcomplement,
|
|
typename basic_string<char_type>::iterator & icur,
|
|
typename basic_string<char_type>::const_iterator istop,
|
|
unsigned flags, SY /*sy*/ ) throw(bad_regexpr,bad_alloc)
|
|
: match_charset_t<EOS,RGM>( fcomplement, m_rgasciicustom, m_rgrangescustom )
|
|
{
|
|
_parse_charset( icur, istop, flags, SY() );
|
|
_optimize();
|
|
}
|
|
|
|
virtual ~match_custom_charset_t() {}
|
|
|
|
// for including one character set in another
|
|
match_custom_charset_t<EOS,RGM> & operator|=( const match_charset<CI> & that )
|
|
{
|
|
assert( 0 == that.m_ncharflags );
|
|
if( that.m_fcomplement )
|
|
{
|
|
m_rgasciicustom |= ~ that.m_rgascii;
|
|
|
|
// append the inverse of that.m_rgranges to this->m_rgrangescustom
|
|
wint_t chlow = UCHAR_MAX;
|
|
typedef vector<range_type>::const_iterator VCI;
|
|
for( VCI prg = that.m_rgranges.begin(); prg != that.m_rgranges.end(); ++prg )
|
|
{
|
|
if( UCHAR_MAX + 1 != prg->first )
|
|
m_rgrangescustom.push_back( range_type( chlow + 1, prg->first - 1 ) );
|
|
chlow = prg->second;
|
|
}
|
|
if( WCHAR_MAX != chlow )
|
|
m_rgrangescustom.push_back( range_type( chlow + 1, WCHAR_MAX ) );
|
|
}
|
|
else
|
|
{
|
|
m_rgasciicustom |= that.m_rgascii;
|
|
m_rgrangescustom.insert( m_rgrangescustom.end(),
|
|
that.m_rgranges.begin(),
|
|
that.m_rgranges.end() );
|
|
}
|
|
return *this;
|
|
}
|
|
|
|
protected:
|
|
|
|
template< typename SY >
|
|
void _parse_charset( typename basic_string<char_type>::iterator & icur,
|
|
typename basic_string<char_type>::const_iterator istop,
|
|
unsigned flags, SY /*sy*/ ) throw(bad_regexpr,bad_alloc)
|
|
{
|
|
TOKEN tok;
|
|
char_type ch_prev = 0;
|
|
match_charset<CI> * pcharset;
|
|
basic_string<char_type>::iterator iprev = icur;
|
|
const bool fnocase = ( NOCASE == ( NOCASE & flags ) );
|
|
|
|
if( (basic_string<char_type>::const_iterator)icur == istop )
|
|
throw bad_regexpr("expecting end of character set");
|
|
|
|
// remember the current position and grab the next token
|
|
tok = SY::charset_token( icur, istop );
|
|
do
|
|
{
|
|
// If we reached the end of the string before finding the end of the
|
|
// character set, then this is an ill-formed regex
|
|
if( (basic_string<char_type>::const_iterator)icur == istop )
|
|
throw bad_regexpr("expecting end of character set");
|
|
|
|
if( CHARSET_RANGE == tok && ch_prev )
|
|
{
|
|
// remember the current position
|
|
basic_string<char_type>::iterator iprev2 = icur;
|
|
char_type old_ch = ch_prev;
|
|
ch_prev = 0;
|
|
|
|
// old_ch is lower bound of a range
|
|
switch( SY::charset_token( icur, istop ) )
|
|
{
|
|
case CHARSET_RANGE:
|
|
case CHARSET_NEGATE:
|
|
icur = iprev2; // un-get these tokens and fall through
|
|
case NO_TOKEN:
|
|
case CHARSET_ESCAPE: // BUGBUG user-defined charset?
|
|
_set_bit_range( old_ch, *icur++, fnocase );
|
|
continue;
|
|
case CHARSET_BACKSPACE:
|
|
_set_bit_range( old_ch, char_type(8), fnocase ); // backspace
|
|
continue;
|
|
case CHARSET_END: // fall through
|
|
default: // not a range.
|
|
icur = iprev; // backup to range token
|
|
_set_bit( old_ch, fnocase );
|
|
_set_bit( *icur++, fnocase );
|
|
continue;
|
|
}
|
|
}
|
|
|
|
if( ch_prev )
|
|
_set_bit( ch_prev, fnocase );
|
|
ch_prev = 0;
|
|
|
|
switch( tok )
|
|
{
|
|
// None of the intrinsic charsets are case-sensitive,
|
|
// so no special handling must be done when the NOCASE
|
|
// flag is set.
|
|
case CHARSET_RANGE:
|
|
case CHARSET_NEGATE:
|
|
case CHARSET_END:
|
|
icur = iprev; // un-get these tokens
|
|
ch_prev = *icur++;
|
|
continue;
|
|
case CHARSET_BACKSPACE:
|
|
ch_prev = char_type(8); // backspace
|
|
continue;
|
|
case ESC_DIGIT:
|
|
*this |= match_charset<CI>( false, get_digit_vector() );
|
|
continue;
|
|
case ESC_NOT_DIGIT:
|
|
*this |= match_charset<CI>( true, get_digit_vector() );
|
|
continue;
|
|
case ESC_SPACE:
|
|
*this |= match_charset<CI>( false, get_space_vector() );
|
|
continue;
|
|
case ESC_NOT_SPACE:
|
|
*this |= match_charset<CI>( true, get_space_vector() );
|
|
continue;
|
|
case ESC_WORD:
|
|
*this |= match_charset<CI>( false, get_word_vector() );
|
|
continue;
|
|
case ESC_NOT_WORD:
|
|
*this |= match_charset<CI>( true, get_word_vector() );
|
|
continue;
|
|
case CHARSET_ALNUM:
|
|
m_ncharflags |= (_ALPHA|_DIGIT);
|
|
continue;
|
|
case CHARSET_ALPHA:
|
|
m_ncharflags |= (_ALPHA);
|
|
continue;
|
|
case CHARSET_BLANK:
|
|
m_ncharflags |= (_BLANK);
|
|
continue;
|
|
case CHARSET_CNTRL:
|
|
m_ncharflags |= (_CONTROL);
|
|
continue;
|
|
case CHARSET_DIGIT:
|
|
m_ncharflags |= (_DIGIT);
|
|
continue;
|
|
case CHARSET_GRAPH:
|
|
m_ncharflags |= (_PUNCT|_ALPHA|_DIGIT);
|
|
continue;
|
|
case CHARSET_LOWER:
|
|
m_ncharflags |= (_LOWER);
|
|
if( NOCASE == ( NOCASE & flags ) )
|
|
m_ncharflags |= (_UPPER);
|
|
continue;
|
|
case CHARSET_PRINT:
|
|
m_ncharflags |= (_BLANK|_PUNCT|_ALPHA|_DIGIT);
|
|
continue;
|
|
case CHARSET_PUNCT:
|
|
m_ncharflags |= (_PUNCT);
|
|
continue;
|
|
case CHARSET_SPACE:
|
|
m_ncharflags |= (_SPACE);
|
|
continue;
|
|
case CHARSET_UPPER:
|
|
m_ncharflags |= (_UPPER);
|
|
if( NOCASE == ( NOCASE & flags ) )
|
|
m_ncharflags |= (_LOWER);
|
|
continue;
|
|
case CHARSET_XDIGIT:
|
|
m_ncharflags |= (_HEX);
|
|
continue;
|
|
case CHARSET_ESCAPE:
|
|
// Maybe this is a user-defined intrinsic charset
|
|
pcharset = get_altern_charset( *icur, flags, SY() );
|
|
if( NULL != pcharset )
|
|
{
|
|
*this |= *pcharset;
|
|
++icur;
|
|
continue;
|
|
}
|
|
// else fall through
|
|
default:
|
|
ch_prev = *icur++;
|
|
continue;
|
|
}
|
|
}
|
|
while( iprev = icur, CHARSET_END != ( tok = SY::charset_token( icur, istop ) ) );
|
|
|
|
if( ch_prev )
|
|
_set_bit( ch_prev, fnocase );
|
|
}
|
|
|
|
void _optimize()
|
|
{
|
|
// this sorts on range_type.first (uses operator<() for pair templates)
|
|
sort( m_rgrangescustom.begin(), m_rgrangescustom.end() );
|
|
|
|
// This merges ranges that overlap
|
|
for( size_t index = 1; index < m_rgrangescustom.size(); )
|
|
{
|
|
if( m_rgrangescustom[index].first <= m_rgrangescustom[index-1].second + 1 )
|
|
{
|
|
m_rgrangescustom[index-1].second = max(
|
|
m_rgrangescustom[index-1].second, m_rgrangescustom[index].second );
|
|
m_rgrangescustom.erase( m_rgrangescustom.begin() + index );
|
|
}
|
|
else
|
|
++index;
|
|
}
|
|
}
|
|
|
|
// Note overloading based on second parameter
|
|
void _set_bit( char ch, const bool fnocase ) throw()
|
|
{
|
|
if( fnocase )
|
|
{
|
|
m_rgasciicustom.set( unsigned char(tolower(ch)) );
|
|
m_rgasciicustom.set( unsigned char(toupper(ch)) );
|
|
}
|
|
else
|
|
{
|
|
m_rgasciicustom.set( unsigned char(ch) );
|
|
}
|
|
}
|
|
|
|
// Note overloading based on second parameter
|
|
void _set_bit( wint_t ch, const bool fnocase ) throw(bad_alloc)
|
|
{
|
|
if( UCHAR_MAX >= ch )
|
|
_set_bit( char(ch), fnocase );
|
|
else
|
|
m_rgrangescustom.push_back( range_type( ch, ch ) );
|
|
}
|
|
|
|
// Note overloading based on second parameter
|
|
void _set_bit_range( char ch1, char ch2, const bool fnocase ) throw(bad_regexpr)
|
|
{
|
|
if( unsigned char(ch1) > unsigned char(ch2) )
|
|
throw bad_regexpr("invalid range specified in character set");
|
|
|
|
if( fnocase )
|
|
{
|
|
// i is unsigned int to prevent overflow if ch2 is UCHAR_MAX
|
|
for( unsigned int i = unsigned char(ch1); i <= unsigned char(ch2); ++i )
|
|
{
|
|
m_rgasciicustom.set( unsigned char( toupper(i) ) );
|
|
m_rgasciicustom.set( unsigned char( tolower(i) ) );
|
|
}
|
|
}
|
|
else
|
|
{
|
|
// i is unsigned int to prevent overflow if ch2 is UCHAR_MAX
|
|
for( unsigned int i = unsigned char(ch1); i <= unsigned char(ch2); ++i )
|
|
m_rgasciicustom.set( unsigned char(i) );
|
|
}
|
|
}
|
|
|
|
// Note overloading based on second parameter
|
|
void _set_bit_range( wint_t ch1, wint_t ch2, const bool fnocase ) throw(bad_regexpr,bad_alloc)
|
|
{
|
|
if( ch1 > ch2 )
|
|
throw bad_regexpr("invalid range specified in character set");
|
|
|
|
if( UCHAR_MAX >= ch1 )
|
|
_set_bit_range( char(ch1), char( min(wint_t(UCHAR_MAX),ch2) ), fnocase );
|
|
|
|
if( UCHAR_MAX < ch2 )
|
|
m_rgrangescustom.push_back( range_type( max(wint_t(UCHAR_MAX+1),ch1), ch2 ) );
|
|
}
|
|
|
|
ascii_bitvector m_rgasciicustom;
|
|
vector<range_type> m_rgrangescustom;
|
|
};
|
|
|
|
|
|
template< typename CI >
|
|
match_charset<CI> * create_charset(
|
|
const match_charset<CI> & that,
|
|
unsigned flags )
|
|
{
|
|
switch( ( NOCASE | CSTRINGS ) & flags )
|
|
{
|
|
case 0:
|
|
return new match_charset_t<eos_t<CI>,match_range_with_case>( that );
|
|
case NOCASE:
|
|
return new match_charset_t<eos_t<CI>,match_range_no_case>( that );
|
|
case CSTRINGS:
|
|
return new match_charset_t<eocs_t<CI>,match_range_with_case>( that );
|
|
case NOCASE | CSTRINGS:
|
|
return new match_charset_t<eocs_t<CI>,match_range_no_case>( that );
|
|
default:
|
|
__assume(0); // tells the compiler that this is unreachable
|
|
}
|
|
}
|
|
|
|
template< typename EOS >
|
|
class word_assertion_t : public assertion<typename EOS::const_iterator>
|
|
{
|
|
public:
|
|
typedef EOS eos_type;
|
|
typedef typename EOS::const_iterator CI;
|
|
word_assertion_t()
|
|
: m_isword( match_charset<CI>( false, get_word_vector() ) ) {}
|
|
virtual ~word_assertion_t() {}
|
|
protected:
|
|
bos_t<CI> m_bos;
|
|
eos_type m_eos;
|
|
match_charset_t<eos_type,match_range_with_case> m_isword;
|
|
};
|
|
|
|
template< typename EOS >
|
|
class word_boundary_t : public word_assertion_t<EOS>
|
|
{
|
|
public:
|
|
word_boundary_t( const bool fisboundary )
|
|
: m_fisboundary( fisboundary ) {}
|
|
virtual ~word_boundary_t() {}
|
|
protected:
|
|
virtual bool _match_this( match_param<CI> & param, CI & icur ) const throw()
|
|
{
|
|
CI iprev = icur;
|
|
--iprev;
|
|
|
|
const bool fprevword = ! m_bos( param, icur ) && m_isword.is_in_charset( *iprev );
|
|
const bool fthisword = ! m_eos( param, icur ) && m_isword.is_in_charset( *icur );
|
|
|
|
return ( m_fisboundary == ( fprevword != fthisword ) );
|
|
}
|
|
const bool m_fisboundary;
|
|
};
|
|
|
|
template< typename EOS >
|
|
class word_start_t : public word_assertion_t<EOS>
|
|
{
|
|
public:
|
|
word_start_t() {}
|
|
virtual ~word_start_t() {}
|
|
protected:
|
|
virtual bool _match_this( match_param<CI> & param, CI & icur ) const throw()
|
|
{
|
|
CI iprev = icur;
|
|
--iprev;
|
|
|
|
const bool fprevword = ! m_bos( param, icur ) && m_isword.is_in_charset( *iprev );
|
|
const bool fthisword = ! m_eos( param, icur ) && m_isword.is_in_charset( *icur );
|
|
|
|
return ! fprevword && fthisword;
|
|
}
|
|
};
|
|
|
|
template< typename EOS >
|
|
class word_stop_t : public word_assertion_t<EOS>
|
|
{
|
|
public:
|
|
word_stop_t() {}
|
|
virtual ~word_stop_t() {}
|
|
protected:
|
|
virtual bool _match_this( match_param<CI> & param, CI & icur ) const throw()
|
|
{
|
|
CI iprev = icur;
|
|
--iprev;
|
|
|
|
const bool fprevword = ! m_bos( param, icur ) && m_isword.is_in_charset( *iprev );
|
|
const bool fthisword = ! m_eos( param, icur ) && m_isword.is_in_charset( *icur );
|
|
|
|
return fprevword && ! fthisword;
|
|
}
|
|
};
|
|
|
|
template< typename CI >
|
|
assertion<CI> * create_word_boundary( const bool fisboundary, unsigned flags )
|
|
{
|
|
switch( CSTRINGS & flags )
|
|
{
|
|
case 0:
|
|
return new word_boundary_t<eos_t<CI> >( fisboundary );
|
|
case CSTRINGS:
|
|
return new word_boundary_t<eocs_t<CI> >( fisboundary );
|
|
default:
|
|
__assume(0); // tells the compiler that this is unreachable
|
|
}
|
|
}
|
|
|
|
template< typename CI >
|
|
assertion<CI> * create_word_start( unsigned flags )
|
|
{
|
|
switch( CSTRINGS & flags )
|
|
{
|
|
case 0:
|
|
return new word_start_t<eos_t<CI> >();
|
|
case CSTRINGS:
|
|
return new word_start_t<eocs_t<CI> >();
|
|
default:
|
|
__assume(0); // tells the compiler that this is unreachable
|
|
}
|
|
}
|
|
|
|
template< typename CI >
|
|
assertion<CI> * create_word_stop( unsigned flags )
|
|
{
|
|
switch( CSTRINGS & flags )
|
|
{
|
|
case 0:
|
|
return new word_stop_t<eos_t<CI> >();
|
|
case CSTRINGS:
|
|
return new word_stop_t<eocs_t<CI> >();
|
|
default:
|
|
__assume(0); // tells the compiler that this is unreachable
|
|
}
|
|
}
|
|
|
|
template< typename CI > class group_quantifier;
|
|
|
|
template< typename CI >
|
|
class match_group : public sub_expr<CI>
|
|
{
|
|
public:
|
|
friend class group_quantifier<CI>;
|
|
|
|
match_group( size_t cgroup )
|
|
: m_rgalternates(), m_cgroup( cgroup ),
|
|
m_pptail(NULL), m_end_group( this ), m_nwidth(uninit_width) {}
|
|
|
|
virtual ~match_group() {}
|
|
|
|
virtual bool domatch( match_param<CI> & param, CI icur ) const throw()
|
|
{
|
|
CI old_istart;
|
|
|
|
if( -1 != m_cgroup ) // could be -1 if this is a lookahead_assertion
|
|
{
|
|
old_istart = (*param.prgbackrefs)[ m_cgroup ].first;
|
|
(*param.prgbackrefs)[ m_cgroup ].first = icur;
|
|
}
|
|
|
|
typedef vector<sub_expr<CI>*>::const_iterator VCI;
|
|
for( VCI ialt = m_rgalternates.begin(); ialt != m_rgalternates.end(); ++ialt )
|
|
{
|
|
if( (*ialt)->domatch( param, icur ) )
|
|
return true;
|
|
}
|
|
|
|
if( -1 != m_cgroup )
|
|
(*param.prgbackrefs)[ m_cgroup ].first = old_istart;
|
|
|
|
return false;
|
|
}
|
|
|
|
virtual void _delete()
|
|
{
|
|
typedef vector<sub_expr<CI>*>::iterator VI;
|
|
for( VI ialt = m_rgalternates.begin(); ialt != m_rgalternates.end(); ++ialt )
|
|
delete_sub_expr( *ialt );
|
|
sub_expr<CI>::_delete();
|
|
}
|
|
|
|
size_t group_number() const
|
|
{
|
|
return m_cgroup;
|
|
}
|
|
|
|
void group_number( size_t cgroup )
|
|
{
|
|
m_cgroup = cgroup;
|
|
}
|
|
|
|
void add_item( sub_expr<CI> * pitem )
|
|
{
|
|
*m_pptail = pitem;
|
|
m_pptail = & pitem->next();
|
|
}
|
|
|
|
void add_alternate()
|
|
{
|
|
m_rgalternates.push_back( NULL );
|
|
m_pptail = & m_rgalternates.back();
|
|
}
|
|
|
|
void end_alternate()
|
|
{
|
|
*m_pptail = & m_end_group;
|
|
}
|
|
|
|
size_t calternates() const
|
|
{
|
|
return m_rgalternates.size();
|
|
}
|
|
|
|
width_type group_width()
|
|
{
|
|
(void) match_group<CI>::_width_this();
|
|
return m_nwidth;
|
|
}
|
|
|
|
protected:
|
|
|
|
virtual bool _call_back( match_param<CI> & param, CI icur ) const throw()
|
|
{
|
|
CI old_iend;
|
|
|
|
if( -1 != m_cgroup )
|
|
{
|
|
old_iend = (*param.prgbackrefs)[ m_cgroup ].second;
|
|
(*param.prgbackrefs)[ m_cgroup ].second = icur;
|
|
}
|
|
|
|
if( match_next( param, icur ) )
|
|
return true;
|
|
|
|
if( -1 != m_cgroup )
|
|
(*param.prgbackrefs)[ m_cgroup ].second = old_iend;
|
|
|
|
return false;
|
|
}
|
|
|
|
virtual width_type _width_this() throw()
|
|
{
|
|
typedef vector<sub_expr<CI>*>::const_iterator VCI;
|
|
if( uninit_width == m_nwidth )
|
|
{
|
|
m_nwidth = width_type(-1,0);
|
|
for( VCI ialt = m_rgalternates.begin(); worst_width != m_nwidth && ialt != m_rgalternates.end(); ++ialt )
|
|
{
|
|
width_type temp_width = (*ialt)->get_width();
|
|
m_nwidth.m_min = min( m_nwidth.m_min, temp_width.m_min );
|
|
m_nwidth.m_max = max( m_nwidth.m_max, temp_width.m_max );
|
|
}
|
|
}
|
|
return m_nwidth;
|
|
}
|
|
|
|
class end_group;
|
|
friend class end_group;
|
|
class end_group : public sub_expr<CI>
|
|
{
|
|
void * operator new( size_t );
|
|
public:
|
|
end_group( match_group * pgroup )
|
|
: m_pgroup( pgroup ) {}
|
|
virtual ~end_group() {}
|
|
virtual void _delete() {} // don't delete this, because it was never alloc'ed
|
|
|
|
virtual bool domatch( match_param<CI> & param, CI icur ) const throw()
|
|
{
|
|
return m_pgroup->_call_back( param, icur );
|
|
}
|
|
protected:
|
|
// since m_pnext is always NULL for end_groups, get_width() stops recursing here
|
|
virtual width_type _width_this() throw()
|
|
{
|
|
return width_type(0,0);
|
|
}
|
|
match_group<CI> * m_pgroup;
|
|
};
|
|
|
|
vector<sub_expr<CI>*> m_rgalternates;
|
|
sub_expr<CI> ** m_pptail; // only used when adding elements
|
|
size_t m_cgroup;
|
|
end_group m_end_group;
|
|
width_type m_nwidth;
|
|
};
|
|
|
|
// Behaves like a lookahead assertion if m_cgroup is -1, or like
|
|
// an independent group otherwise.
|
|
template< typename CI >
|
|
class independent_group : public match_group<CI>
|
|
{
|
|
public:
|
|
independent_group()
|
|
: match_group<CI>( -1 ), m_fexpected(true) {}
|
|
virtual ~independent_group() {}
|
|
|
|
virtual bool domatch( match_param<CI> & param, CI icur ) const throw()
|
|
{
|
|
// Copy the entire backref vector onto the stack
|
|
backref_tag<CI> * prgbr = (backref_tag<CI>*)_alloca( param.prgbackrefs->size() * sizeof backref_tag<CI> );
|
|
copy( param.prgbackrefs->begin(), param.prgbackrefs->end(),
|
|
raw_storage_iterator<backref_tag<CI>*,backref_tag<CI> >(prgbr) );
|
|
|
|
// Match until the end of this group and then return
|
|
const bool fdomatch = match_group<CI>::domatch( param, icur );
|
|
|
|
if( m_fexpected == fdomatch )
|
|
{
|
|
// If m_cgroup != 1, then this is not a zero-width assertion.
|
|
if( -1 != m_cgroup )
|
|
icur = (*param.prgbackrefs)[ m_cgroup ].second;
|
|
|
|
if( match_next( param, icur ) )
|
|
return true;
|
|
}
|
|
|
|
// if match_group::domatch returned true, the backrefs must be restored
|
|
if( fdomatch )
|
|
copy( prgbr, prgbr + param.prgbackrefs->size(), param.prgbackrefs->begin() );
|
|
|
|
return false;
|
|
}
|
|
|
|
protected:
|
|
|
|
independent_group( const bool fexpected )
|
|
: match_group<CI>( -1 ), m_fexpected(fexpected) {}
|
|
|
|
virtual bool _call_back( match_param<CI> & param, CI icur ) const throw()
|
|
{
|
|
if( -1 != m_cgroup )
|
|
(*param.prgbackrefs)[ m_cgroup ].second = icur;
|
|
return true;
|
|
}
|
|
|
|
const bool m_fexpected;
|
|
};
|
|
|
|
template< typename CI >
|
|
class lookahead_assertion : public independent_group<CI>
|
|
{
|
|
public:
|
|
lookahead_assertion( const bool fexpected )
|
|
: independent_group<CI>( fexpected ) {}
|
|
virtual ~lookahead_assertion() {}
|
|
virtual bool is_assertion() const throw() { return true; }
|
|
protected:
|
|
virtual width_type _width_this() throw() { return width_type(0,0); }
|
|
};
|
|
|
|
template< typename CI >
|
|
class lookbehind_assertion : public independent_group<CI>
|
|
{
|
|
public:
|
|
lookbehind_assertion( const bool fexpected )
|
|
: independent_group<CI>( fexpected ) {}
|
|
virtual ~lookbehind_assertion() {}
|
|
|
|
virtual bool domatch( match_param<CI> & param, CI icur ) const throw()
|
|
{
|
|
// This is the room in the string from the start to the current position
|
|
size_t room = distance( param.ibegin, icur );
|
|
|
|
// If we don't have enough room to match the lookbehind, the match fails.
|
|
// If we wanted the match to fail, try to match the rest of the pattern.
|
|
if( m_nwidth.m_min > room )
|
|
return m_fexpected ? false : match_next( param, icur );
|
|
|
|
// Copy the entire backref vector onto the stack
|
|
backref_tag<CI> * prgbr = (backref_tag<CI>*)_alloca( param.prgbackrefs->size() * sizeof backref_tag<CI> );
|
|
copy( param.prgbackrefs->begin(), param.prgbackrefs->end(),
|
|
raw_storage_iterator<backref_tag<CI>*,backref_tag<CI> >(prgbr) );
|
|
|
|
CI local_istart = icur;
|
|
advance( local_istart, -int( min( m_nwidth.m_max, room ) ) );
|
|
|
|
CI local_istop = icur;
|
|
advance( local_istop, -int( m_nwidth.m_min ) );
|
|
|
|
// Create a local param struct that has icur as param.iend
|
|
match_param<CI> local_param(param.ibegin,param.istart,icur,param.prgbackrefs);
|
|
|
|
// Find the rightmost match that ends at icur.
|
|
for( CI local_icur = local_istart; local_icur <= local_istop; ++local_icur )
|
|
{
|
|
// Match until the end of this group and then return
|
|
const bool fmatched = match_group<CI>::domatch( local_param, local_icur );
|
|
|
|
// If the match results were what we were expecting, try to match the
|
|
// rest of the pattern. If that succeeds, return true.
|
|
if( m_fexpected == fmatched && match_next( param, icur ) )
|
|
return true;
|
|
|
|
// if match_group::domatch returned true, the backrefs must be restored
|
|
if( fmatched )
|
|
{
|
|
copy( prgbr, prgbr + param.prgbackrefs->size(), param.prgbackrefs->begin() );
|
|
|
|
// Match succeeded. If this is a negative lookbehind, we didn't want it
|
|
// to succeed, so return false.
|
|
if( ! m_fexpected )
|
|
return false;
|
|
}
|
|
}
|
|
|
|
// No variation of the lookbehind was satisfied in a way that permited
|
|
// the rest of the pattern to match successfully, so return false.
|
|
return false;
|
|
}
|
|
virtual bool is_assertion() const throw() { return true; }
|
|
protected:
|
|
virtual bool _call_back( match_param<CI> & param, CI icur ) const throw()
|
|
{
|
|
return param.istop == icur;
|
|
}
|
|
virtual width_type _width_this() throw() { return width_type(0,0); }
|
|
};
|
|
|
|
// Corresponds to the (?:foo) extension, which has grouping semantics, but
|
|
// does not store any backref information.
|
|
template< typename CI >
|
|
class group_nobackref : public match_group<CI>
|
|
{
|
|
public:
|
|
group_nobackref( )
|
|
: match_group( -1 ) {} // will be assigned a group number in basic_rpattern::basic_rpattern()
|
|
virtual ~group_nobackref() {}
|
|
};
|
|
|
|
template< typename CI >
|
|
class match_wrapper : public sub_expr<CI>
|
|
{
|
|
public:
|
|
match_wrapper( sub_expr<CI> * psub )
|
|
: m_psub(psub) {}
|
|
virtual ~match_wrapper() {}
|
|
|
|
virtual void _delete()
|
|
{
|
|
delete_sub_expr( m_psub );
|
|
sub_expr<CI>::_delete();
|
|
}
|
|
|
|
protected:
|
|
|
|
bool _wrapped_match_this( match_param<CI> & param, CI & icur ) const throw()
|
|
{
|
|
return m_psub->_match_this( param, icur );
|
|
}
|
|
|
|
virtual width_type _width_this() throw()
|
|
{
|
|
return m_psub->_width_this();
|
|
}
|
|
|
|
sub_expr<CI> * m_psub;
|
|
};
|
|
|
|
template< typename CI >
|
|
class match_quantifier : public match_wrapper<CI>
|
|
{
|
|
public:
|
|
match_quantifier( sub_expr<CI> * psub, size_t lbound, size_t ubound )
|
|
: match_wrapper<CI>( psub ), m_lbound(lbound), m_ubound(ubound) {}
|
|
|
|
virtual ~match_quantifier() {}
|
|
|
|
protected:
|
|
|
|
virtual width_type _width_this() throw()
|
|
{
|
|
width_type this_width = match_wrapper<CI>::_width_this();
|
|
|
|
return this_width * width_type( m_lbound, m_ubound );
|
|
}
|
|
|
|
const size_t m_lbound;
|
|
const size_t m_ubound;
|
|
};
|
|
|
|
template< typename CI >
|
|
class max_atom_quantifier : public match_quantifier<CI>
|
|
{
|
|
public:
|
|
max_atom_quantifier( sub_expr<CI> * psub, size_t lbound, size_t ubound )
|
|
: match_quantifier<CI>( psub, lbound, ubound ) {}
|
|
|
|
virtual ~max_atom_quantifier() {}
|
|
|
|
virtual bool domatch( match_param<CI> & param, CI icur ) const throw()
|
|
{
|
|
size_t cmatches = 0;
|
|
int cdiff = 0; // must be a signed quantity for advance() below
|
|
|
|
if( cmatches < m_ubound )
|
|
{
|
|
CI istart = icur;
|
|
if( _wrapped_match_this( param, icur ) )
|
|
{
|
|
++cmatches;
|
|
cdiff = distance( istart, icur );
|
|
|
|
if( 0 == cdiff )
|
|
return ( match_next( param, icur ) );
|
|
|
|
while( cmatches < m_ubound &&
|
|
_wrapped_match_this( param, icur ) )
|
|
{
|
|
++cmatches;
|
|
}
|
|
}
|
|
}
|
|
|
|
if( cmatches >= m_lbound )
|
|
{
|
|
if( ! next() )
|
|
return true;
|
|
|
|
for(;;)
|
|
{
|
|
if( next()->domatch( param, icur ) )
|
|
return true;
|
|
|
|
if( cmatches-- <= m_lbound )
|
|
break;
|
|
|
|
advance( icur, -cdiff );
|
|
}
|
|
}
|
|
|
|
return false;
|
|
}
|
|
};
|
|
|
|
template< typename CI >
|
|
class min_atom_quantifier : public match_quantifier<CI>
|
|
{
|
|
public:
|
|
min_atom_quantifier( sub_expr<CI> * psub, size_t lbound, size_t ubound )
|
|
: match_quantifier<CI>( psub, lbound, ubound ) {}
|
|
|
|
virtual ~min_atom_quantifier() {}
|
|
|
|
virtual bool domatch( match_param<CI> & param, CI icur ) const throw()
|
|
{
|
|
size_t cmatches = 0;
|
|
bool fsuccess = true;
|
|
CI icur_tmp = icur;
|
|
|
|
if( _wrapped_match_this( param, icur_tmp ) )
|
|
{
|
|
if( icur_tmp == icur )
|
|
return ( match_next( param, icur ) );
|
|
|
|
if( m_lbound )
|
|
{
|
|
icur = icur_tmp;
|
|
++cmatches;
|
|
}
|
|
|
|
while( ( cmatches < m_lbound ) &&
|
|
( fsuccess = _wrapped_match_this( param, icur ) ) )
|
|
{
|
|
++cmatches;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
fsuccess = ! m_lbound;
|
|
}
|
|
|
|
if( fsuccess && next() )
|
|
{
|
|
do
|
|
{
|
|
if( next()->domatch( param, icur ) )
|
|
break;
|
|
} while( fsuccess = ( cmatches++ < m_ubound &&
|
|
_wrapped_match_this( param, icur ) ) );
|
|
}
|
|
|
|
return fsuccess;
|
|
}
|
|
};
|
|
|
|
template< typename CI >
|
|
class group_quantifier : public match_quantifier<CI>
|
|
{
|
|
public:
|
|
|
|
group_quantifier( match_group<CI> * psub, size_t lbound, size_t ubound )
|
|
: match_quantifier<CI>( psub, lbound, ubound ),
|
|
m_group( *psub ), m_end_quantifier( this )
|
|
{
|
|
psub->next() = & m_end_quantifier;
|
|
}
|
|
|
|
virtual ~group_quantifier() {}
|
|
|
|
virtual bool domatch( match_param<CI> & param, CI icur ) const throw()
|
|
{
|
|
// group_number is only -1 for assertions, which can't be quantified
|
|
assert( -1 != group_number() );
|
|
|
|
backref_tag<CI> & br = (*param.prgbackrefs)[ group_number() ];
|
|
backref_tag<CI> old_backref = br;
|
|
br = backref_tag<CI>( icur, icur ); // sets cmatches (reserved) to 0
|
|
|
|
if( _recurse( param, icur ) )
|
|
return true;
|
|
|
|
br = old_backref;
|
|
return false;
|
|
}
|
|
|
|
protected:
|
|
|
|
class end_quantifier;
|
|
friend class end_quantifier;
|
|
class end_quantifier : public sub_expr<CI>
|
|
{
|
|
void * operator new( size_t );
|
|
public:
|
|
end_quantifier( group_quantifier<CI> * pquant )
|
|
: m_pquant( pquant ) {}
|
|
|
|
virtual ~end_quantifier() {}
|
|
virtual void _delete() {} // don't delete this, since it wasn't alloc'ed
|
|
|
|
virtual bool domatch( match_param<CI> & param, CI icur ) const throw()
|
|
{
|
|
// group_number is only -1 for assertions, which can't be quantified
|
|
assert( -1 != m_pquant->group_number() );
|
|
|
|
// handle special the case where a group matches 0 characters
|
|
backref_tag<CI> & br = (*param.prgbackrefs)[ m_pquant->group_number() ];
|
|
if( icur == br.first )
|
|
{
|
|
size_t old_cmatches = br.reserved;
|
|
br.reserved = m_pquant->m_ubound;
|
|
if( m_pquant->_recurse( param, icur ) )
|
|
return true;
|
|
br.reserved = old_cmatches;
|
|
return false;
|
|
}
|
|
return m_pquant->_recurse( param, icur );
|
|
}
|
|
|
|
protected:
|
|
virtual width_type _width_this() throw() { return width_type(0,0); }
|
|
group_quantifier<CI> * m_pquant;
|
|
};
|
|
|
|
size_t group_number() const
|
|
{
|
|
return m_group.group_number();
|
|
}
|
|
|
|
size_t & cmatches( match_param<CI> & param ) const
|
|
{
|
|
return (*param.prgbackrefs)[ group_number() ].reserved;
|
|
}
|
|
|
|
virtual bool _recurse( match_param<CI> & param, CI icur ) const throw() = 0;
|
|
|
|
match_group<CI> & m_group;
|
|
end_quantifier m_end_quantifier;
|
|
};
|
|
|
|
template< typename CI >
|
|
class max_group_quantifier : public group_quantifier<CI>
|
|
{
|
|
public:
|
|
max_group_quantifier( match_group<CI> * psub, size_t lbound, size_t ubound )
|
|
: group_quantifier<CI>( psub, lbound, ubound ) {}
|
|
|
|
virtual ~max_group_quantifier() {}
|
|
|
|
protected:
|
|
|
|
virtual bool _recurse( match_param<CI> & param, CI icur ) const throw()
|
|
{
|
|
if( m_ubound == cmatches( param ) )
|
|
return match_next( param, icur );
|
|
|
|
++cmatches( param );
|
|
if( m_psub->domatch( param, icur ) )
|
|
return true;
|
|
|
|
if( --cmatches( param ) < m_lbound )
|
|
return false;
|
|
|
|
return match_next( param, icur );
|
|
}
|
|
};
|
|
|
|
template< typename CI >
|
|
class min_group_quantifier : public group_quantifier<CI>
|
|
{
|
|
public:
|
|
min_group_quantifier( match_group<CI> * psub, size_t lbound, size_t ubound )
|
|
: group_quantifier<CI>( psub, lbound, ubound ) {}
|
|
|
|
virtual ~min_group_quantifier() {}
|
|
|
|
protected:
|
|
|
|
virtual bool _recurse( match_param<CI> & param, CI icur ) const throw()
|
|
{
|
|
if( m_lbound > cmatches( param ) )
|
|
{
|
|
++cmatches( param );
|
|
return m_psub->domatch( param, icur );
|
|
}
|
|
|
|
if( match_next( param, icur ) )
|
|
return true;
|
|
|
|
if( cmatches( param )++ == m_ubound )
|
|
return false;
|
|
|
|
return m_psub->domatch( param, icur );
|
|
}
|
|
};
|
|
|
|
template< typename CI >
|
|
class match_backref : public sub_expr<CI>
|
|
{
|
|
public:
|
|
match_backref( size_t cbackref, const width_type & group_width )
|
|
: m_cbackref( cbackref ), m_nwidth(group_width) {}
|
|
virtual ~match_backref() {}
|
|
protected:
|
|
// Return the width specifications of the group to which this backref refers
|
|
virtual width_type _width_this() throw() { return m_nwidth; }
|
|
const size_t m_cbackref;
|
|
const width_type m_nwidth;
|
|
};
|
|
|
|
template< typename CMP, typename EOS >
|
|
class match_backref_t : public match_backref<typename EOS::const_iterator>
|
|
{
|
|
public:
|
|
typedef CMP cmp_type;
|
|
typedef EOS eos_type;
|
|
typedef typename EOS::const_iterator CI;
|
|
match_backref_t( size_t cbackref, const width_type & group_width )
|
|
: match_backref<CI>( cbackref, group_width ) {}
|
|
virtual ~match_backref_t() {}
|
|
protected:
|
|
virtual bool _match_this( match_param<CI> & param, CI & icur ) const throw()
|
|
{
|
|
CI ithis = (*param.prgbackrefs)[ m_cbackref ].first;
|
|
CI istop = (*param.prgbackrefs)[ m_cbackref ].second;
|
|
CI icur_tmp = icur;
|
|
|
|
// Don't match a backref that hasn't match anything
|
|
if( ! (*param.prgbackrefs)[ m_cbackref ] )
|
|
return false;
|
|
|
|
for( ; ithis != istop; ++icur_tmp, ++ithis )
|
|
{
|
|
if( m_eos( param, icur_tmp ) || m_cmp( *icur_tmp, *ithis ) )
|
|
return false;
|
|
}
|
|
icur = icur_tmp;
|
|
return true;
|
|
}
|
|
cmp_type m_cmp;
|
|
eos_type m_eos;
|
|
};
|
|
|
|
template< typename CI >
|
|
match_backref<CI> * create_backref(
|
|
size_t cbackref,
|
|
const width_type & group_width,
|
|
unsigned flags )
|
|
{
|
|
typedef typename iterator_traits<CI>::value_type char_type;
|
|
|
|
switch( ( NOCASE | CSTRINGS ) & flags )
|
|
{
|
|
case 0:
|
|
return new match_backref_t<ch_neq_t<char_type>,eos_t<CI> >( cbackref, group_width );
|
|
case NOCASE:
|
|
return new match_backref_t<ch_neq_nocase_t<char_type>,eos_t<CI> >( cbackref, group_width );
|
|
case CSTRINGS:
|
|
return new match_backref_t<ch_neq_t<char_type>,eocs_t<CI> >( cbackref, group_width );
|
|
case NOCASE | CSTRINGS:
|
|
return new match_backref_t<ch_neq_nocase_t<char_type>,eocs_t<CI> >( cbackref, group_width );
|
|
default:
|
|
__assume(0); // tells the compiler that this is unreachable
|
|
}
|
|
}
|
|
|
|
// Replace some escape sequences with the actual characters
|
|
// they represent
|
|
template< typename CI >
|
|
void basic_rpattern_base<CI>::_normalize_string( basic_string<char_type> & str )
|
|
{
|
|
size_t i = 0;
|
|
|
|
// Don't do pattern normalization if the user didn't ask for it.
|
|
if( NORMALIZE != ( NORMALIZE & m_flags ) )
|
|
return;
|
|
|
|
while( basic_string<char_type>::npos != ( i = str.find( char_type('\\'), i ) ) )
|
|
{
|
|
if( str.size() - 1 == i )
|
|
return;
|
|
|
|
switch( str[i+1] )
|
|
{
|
|
case char_type('f'):
|
|
str.replace( i, 2, 1, char_type('\f') );
|
|
break;
|
|
case char_type('n'):
|
|
str.replace( i, 2, 1, char_type('\n') );
|
|
break;
|
|
case char_type('r'):
|
|
str.replace( i, 2, 1, char_type('\r') );
|
|
break;
|
|
case char_type('t'):
|
|
str.replace( i, 2, 1, char_type('\t') );
|
|
break;
|
|
case char_type('v'):
|
|
str.replace( i, 2, 1, char_type('\v') );
|
|
break;
|
|
case char_type('\\'):
|
|
str.replace( i, 2, 1, char_type('\\') );
|
|
break;
|
|
default:
|
|
++i;
|
|
break;
|
|
}
|
|
++i;
|
|
if( str.size() <= i )
|
|
return;
|
|
}
|
|
}
|
|
|
|
//
|
|
// Implementation of basic_rpattern:
|
|
//
|
|
template< typename CI, typename SY >
|
|
basic_rpattern<CI,SY>::basic_rpattern() throw()
|
|
: basic_rpattern_base<CI>( 0 )
|
|
{
|
|
}
|
|
|
|
template< typename CI, typename SY >
|
|
basic_rpattern<CI,SY>::basic_rpattern(
|
|
const basic_string<char_type> & pat,
|
|
unsigned flags ) throw(bad_regexpr,bad_alloc)
|
|
: basic_rpattern_base<CI>( flags, pat )
|
|
{
|
|
push_new_handler pnh( &my_new_handler );
|
|
_normalize_string( m_pat );
|
|
_common_init( flags );
|
|
}
|
|
|
|
template< typename CI, typename SY >
|
|
basic_rpattern<CI,SY>::basic_rpattern(
|
|
const basic_string<char_type> & pat,
|
|
const basic_string<char_type> & subst,
|
|
unsigned flags ) throw(bad_regexpr,bad_alloc)
|
|
: basic_rpattern_base<CI>( flags, pat, subst )
|
|
{
|
|
push_new_handler pnh( &my_new_handler );
|
|
_normalize_string( m_pat );
|
|
_common_init( flags );
|
|
_normalize_string( m_subst );
|
|
_parse_subst(); // must come after _common_init
|
|
}
|
|
|
|
template< typename CI, typename SY >
|
|
void basic_rpattern<CI,SY>::init(
|
|
const basic_string<char_type> & pat,
|
|
unsigned flags ) throw(bad_regexpr,bad_alloc)
|
|
{
|
|
push_new_handler pnh( &my_new_handler );
|
|
_reset();
|
|
m_flags = flags;
|
|
m_pat = pat;
|
|
_normalize_string( m_pat );
|
|
_common_init( m_flags );
|
|
}
|
|
|
|
template< typename CI, typename SY >
|
|
void basic_rpattern<CI,SY>::init(
|
|
const basic_string<char_type> & pat,
|
|
const basic_string<char_type> & subst,
|
|
unsigned flags ) throw(bad_regexpr,bad_alloc)
|
|
{
|
|
push_new_handler pnh( &my_new_handler );
|
|
_reset();
|
|
m_flags = flags;
|
|
m_pat = pat;
|
|
m_subst = subst;
|
|
_normalize_string( m_pat );
|
|
_common_init( m_flags );
|
|
_normalize_string( m_subst );
|
|
_parse_subst(); // must come after _common_init
|
|
}
|
|
|
|
template< typename CI, typename SY >
|
|
void basic_rpattern<CI,SY>::_common_init( unsigned flags )
|
|
{
|
|
m_cgroups = 0;
|
|
vector<match_group<CI>*> rggroups;
|
|
basic_string<char_type>::iterator ipat = m_pat.begin();
|
|
match_group<CI> * pgroup = _find_next_group( ipat, flags, rggroups );
|
|
|
|
m_pfirst = pgroup;
|
|
m_nwidth = pgroup->group_width();
|
|
|
|
// Number the invisible groups
|
|
m_cgroups_visible = m_cgroups;
|
|
while( ! m_invisible_groups.empty() )
|
|
{
|
|
m_invisible_groups.front()->group_number( _get_next_group_nbr() );
|
|
m_invisible_groups.pop_front();
|
|
}
|
|
|
|
//
|
|
// determine if we can get away with only calling m_pfirst->domatch only once
|
|
//
|
|
|
|
m_floop = true;
|
|
|
|
// Optimization: if first character of pattern string is '^'
|
|
// and we are not doing a multiline match, then we only
|
|
// need to try domatch once
|
|
basic_string<char_type>::iterator icur = m_pat.begin();
|
|
if( MULTILINE != ( MULTILINE & m_flags ) &&
|
|
1 == pgroup->calternates() &&
|
|
icur != m_pat.end() &&
|
|
BEGIN_LINE == SY::reg_token( icur, m_pat.end() ) )
|
|
{
|
|
m_flags &= ~RIGHTMOST;
|
|
m_floop = false;
|
|
}
|
|
|
|
// Optimization: if first 2 characters of pattern string are ".*" or ".+",
|
|
// then we only need to try domatch once
|
|
icur = m_pat.begin();
|
|
if( RIGHTMOST != ( RIGHTMOST & m_flags ) &&
|
|
SINGLELINE == ( SINGLELINE & m_flags ) &&
|
|
1 == pgroup->calternates() &&
|
|
icur != m_pat.end() &&
|
|
MATCH_ANY == SY::reg_token( icur, m_pat.end() ) &&
|
|
icur != m_pat.end() )
|
|
{
|
|
switch( SY::quant_token( icur, m_pat.end() ) )
|
|
{
|
|
case ONE_OR_MORE:
|
|
case ZERO_OR_MORE:
|
|
case ONE_OR_MORE_MIN:
|
|
case ZERO_OR_MORE_MIN:
|
|
m_floop = false;
|
|
}
|
|
}
|
|
}
|
|
|
|
template< typename CI, typename SY >
|
|
void basic_rpattern<CI,SY>::_reset() throw()
|
|
{
|
|
basic_rpattern_base<CI>::_reset();
|
|
|
|
m_cgroups = m_cgroups_visible = 0;
|
|
m_floop = true;
|
|
|
|
m_subst.erase();
|
|
m_pat.erase();
|
|
|
|
m_pfirst.free_ptr();
|
|
m_nwidth = uninit_width;
|
|
m_subst_list.clear();
|
|
m_invisible_groups.clear();
|
|
}
|
|
|
|
template< typename CI, typename SY >
|
|
void basic_rpattern<CI,SY>::set_flags( unsigned flags ) throw(bad_regexpr,bad_alloc)
|
|
{
|
|
push_new_handler pnh( &my_new_handler );
|
|
m_pfirst.free_ptr();
|
|
m_flags = flags;
|
|
_common_init( m_flags );
|
|
}
|
|
|
|
template< typename CI, typename SY >
|
|
void basic_rpattern<CI,SY>::set_substitution( const basic_string<char_type> & subst )
|
|
{
|
|
push_new_handler pnh( &my_new_handler );
|
|
m_subst_list.clear();
|
|
m_subst = subst;
|
|
_normalize_string( m_subst );
|
|
_parse_subst();
|
|
}
|
|
|
|
template< typename CI, typename SY >
|
|
match_group<CI> * basic_rpattern<CI,SY>::_find_next_group(
|
|
typename basic_string<char_type>::iterator & ipat,
|
|
unsigned & flags,
|
|
vector<match_group<CI>*> & rggroups )
|
|
{
|
|
auto_sub_ptr<match_group<CI> > pgroup;
|
|
basic_string<char_type>::iterator itemp = ipat;
|
|
unsigned old_flags = flags;
|
|
TOKEN tok;
|
|
|
|
// Look for group extensions. (This could change the value of the flags variable.)
|
|
if( ipat != m_pat.end() && NO_TOKEN != ( tok = SY::ext_token( ipat, m_pat.end(), flags ) ) )
|
|
{
|
|
if( itemp == m_pat.begin() || ipat == m_pat.end() )
|
|
throw bad_regexpr("ill-formed regular expression");
|
|
|
|
// Don't process empty groups
|
|
if( END_GROUP != SY::reg_token( itemp = ipat, m_pat.end() ) )
|
|
{
|
|
switch( tok )
|
|
{
|
|
case EXT_NOBACKREF:
|
|
// invisible groups are numbered only after all
|
|
// visible groups have been numbererd
|
|
pgroup = new match_group<CI>( -1 );
|
|
m_invisible_groups.push_back( pgroup.get() );
|
|
break;
|
|
|
|
case EXT_INDEPENDENT:
|
|
pgroup = new independent_group<CI>();
|
|
m_invisible_groups.push_back( pgroup.get() );
|
|
break;
|
|
|
|
case EXT_POS_LOOKAHEAD:
|
|
pgroup = new lookahead_assertion<CI>( true );
|
|
break;
|
|
|
|
case EXT_NEG_LOOKAHEAD:
|
|
pgroup = new lookahead_assertion<CI>( false );
|
|
break;
|
|
|
|
case EXT_POS_LOOKBEHIND:
|
|
// For look-behind assertions, turn off the CSTRINGs optimization
|
|
flags &= ~CSTRINGS;
|
|
pgroup = new lookbehind_assertion<CI>( true );
|
|
break;
|
|
|
|
case EXT_NEG_LOOKBEHIND:
|
|
// For look-behind assertions, turn off the CSTRINGs optimization
|
|
flags &= ~CSTRINGS;
|
|
pgroup = new lookbehind_assertion<CI>( false );
|
|
break;
|
|
|
|
default:
|
|
throw bad_regexpr("bad extension sequence");
|
|
}
|
|
}
|
|
else
|
|
{
|
|
// Skip over the END_GROUP token
|
|
ipat = itemp;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
pgroup = new match_group<CI>( _get_next_group_nbr() );
|
|
}
|
|
|
|
if( NULL != pgroup.get() )
|
|
{
|
|
pgroup->add_alternate();
|
|
while( _find_next( ipat, pgroup.get(), flags, rggroups ) );
|
|
pgroup->end_alternate();
|
|
|
|
// Add this group to the rggroups array
|
|
if( -1 != pgroup->group_number() )
|
|
{
|
|
if( pgroup->group_number() >= rggroups.size() )
|
|
rggroups.resize( pgroup->group_number() + 1, NULL );
|
|
rggroups[ pgroup->group_number() ] = pgroup.get();
|
|
}
|
|
|
|
// The group should calculate its own width now and
|
|
// save the result for later.
|
|
pgroup->group_width();
|
|
|
|
// If this is not a pattern modifier, restore the
|
|
// flags to their previous settings. This causes
|
|
// pattern modifiers to have the scope of their
|
|
// enclosing group.
|
|
flags = old_flags;
|
|
}
|
|
|
|
return pgroup.release();
|
|
}
|
|
|
|
//
|
|
// Read ahead through the pattern and treat sequential atoms
|
|
// as a single atom, making sure to handle quantification
|
|
// correctly. Warning: dense code ahead.
|
|
//
|
|
template< typename CI, typename SY >
|
|
void basic_rpattern<CI,SY>::_find_atom(
|
|
typename basic_string<char_type>::iterator & ipat,
|
|
match_group<CI> * pgroup,
|
|
unsigned flags )
|
|
{
|
|
basic_string<char_type>::iterator itemp = ipat, istart = ipat;
|
|
|
|
do
|
|
{
|
|
switch( SY::quant_token( itemp, m_pat.end() ) )
|
|
{
|
|
// if {,} can't be interpreted as quantifiers, treat them as regular chars
|
|
case BEGIN_RANGE:
|
|
if( istart != ipat ) // treat as a quantifier
|
|
goto quantify;
|
|
case NO_TOKEN:
|
|
case END_RANGE:
|
|
case END_RANGE_MIN:
|
|
case RANGE_SEPARATOR:
|
|
break;
|
|
|
|
default:
|
|
if( istart == ipat ) // must be able to quantify something.
|
|
throw bad_regexpr("quantifier not expected");
|
|
quantify: if( istart != --ipat )
|
|
pgroup->add_item( create_atom<CI>( istart, ipat, flags ) );
|
|
auto_sub_ptr<sub_expr<CI> > pnew( create_atom<CI>( ipat++, flags ) );
|
|
_quantify( pnew, NULL, ipat );
|
|
pgroup->add_item( pnew.release() );
|
|
return;
|
|
}
|
|
} while( m_pat.end() != ++ipat && ! SY::reg_token( itemp = ipat, m_pat.end() ) );
|
|
|
|
assert( ipat != istart );
|
|
pgroup->add_item( create_atom<CI>( istart, ipat, flags ) );
|
|
}
|
|
|
|
template< typename CI, typename SY >
|
|
bool basic_rpattern<CI,SY>::_find_next(
|
|
typename basic_string<char_type>::iterator & ipat,
|
|
match_group<CI> * pgroup,
|
|
unsigned & flags,
|
|
vector<match_group<CI>*> & rggroups )
|
|
{
|
|
match_group<CI> * pnew_group = NULL;
|
|
auto_sub_ptr<sub_expr<CI> > pnew;
|
|
basic_string<char_type>::iterator istart, itemp;
|
|
bool fdone;
|
|
|
|
if( ipat == m_pat.end() )
|
|
{
|
|
if( 0 != pgroup->group_number() )
|
|
throw bad_regexpr( "mismatched parenthesis" );
|
|
return false;
|
|
}
|
|
|
|
switch( SY::reg_token( ipat, m_pat.end() ) )
|
|
{
|
|
case NO_TOKEN: // not a token. Must be an atom
|
|
_find_atom( ipat, pgroup, flags );
|
|
return true;
|
|
|
|
case END_GROUP:
|
|
if( 0 == pgroup->group_number() )
|
|
throw bad_regexpr( "mismatched parenthesis" );
|
|
return false;
|
|
|
|
case ALTERNATION:
|
|
pgroup->end_alternate();
|
|
pgroup->add_alternate();
|
|
return true;
|
|
|
|
case BEGIN_GROUP:
|
|
// Find next group could return NULL if the group is really
|
|
// a pattern modifier, like: (?s-i)
|
|
pnew = pnew_group = _find_next_group( ipat, flags, rggroups );
|
|
break;
|
|
|
|
case BEGIN_LINE:
|
|
pnew = create_bol<CI>( flags );
|
|
break;
|
|
|
|
case END_LINE:
|
|
pnew = create_eol<CI>( flags );
|
|
break;
|
|
|
|
case BEGIN_CHARSET:
|
|
pnew = create_charset_helper<CI,SY>::create_charset_aux( m_pat, ipat, flags );
|
|
break;
|
|
|
|
case MATCH_ANY:
|
|
pnew = create_any<CI>( flags );
|
|
break;
|
|
|
|
case ESC_WORD_BOUNDARY:
|
|
pnew = create_word_boundary<CI>( true, flags );
|
|
break;
|
|
|
|
case ESC_NOT_WORD_BOUNDARY:
|
|
pnew = create_word_boundary<CI>( false, flags );
|
|
break;
|
|
|
|
case ESC_WORD_START:
|
|
pnew = create_word_start<CI>( flags );
|
|
break;
|
|
|
|
case ESC_WORD_STOP:
|
|
pnew = create_word_stop<CI>( flags );
|
|
break;
|
|
|
|
case ESC_DIGIT:
|
|
pnew = create_charset<CI>( match_charset<CI>( false,
|
|
get_digit_vector() ),
|
|
flags );
|
|
break;
|
|
|
|
case ESC_NOT_DIGIT:
|
|
pnew = create_charset<CI>( match_charset<CI>( true,
|
|
get_digit_vector() ),
|
|
flags );
|
|
break;
|
|
|
|
case ESC_WORD:
|
|
pnew = create_charset<CI>( match_charset<CI>( false,
|
|
get_word_vector() ),
|
|
flags );
|
|
break;
|
|
|
|
case ESC_NOT_WORD:
|
|
pnew = create_charset<CI>( match_charset<CI>( true,
|
|
get_word_vector() ),
|
|
flags );
|
|
break;
|
|
|
|
case ESC_SPACE:
|
|
pnew = create_charset<CI>( match_charset<CI>( false,
|
|
get_space_vector() ),
|
|
flags );
|
|
break;
|
|
|
|
case ESC_NOT_SPACE:
|
|
pnew = create_charset<CI>( match_charset<CI>( true,
|
|
get_space_vector() ),
|
|
flags );
|
|
break;
|
|
|
|
case ESC_BEGIN_STRING:
|
|
pnew = create_bos<CI>( flags );
|
|
break;
|
|
|
|
case ESC_END_STRING:
|
|
pnew = create_eos<CI>( flags );
|
|
break;
|
|
|
|
case ESC_END_STRING_z:
|
|
pnew = create_eoz<CI>( flags );
|
|
break;
|
|
|
|
case ESCAPE:
|
|
|
|
if( char_type('0') <= *ipat && char_type('9') >= *ipat )
|
|
{
|
|
// use _cgroups_total here since the invisible groups have not been numbered yet.
|
|
unsigned nbackref = parse_int( ipat, m_pat.end(), _cgroups_total() - 1 );// always at least 1 group
|
|
if( 0 == nbackref || rggroups.size() <= nbackref || NULL == rggroups[ nbackref ] )
|
|
throw bad_regexpr( "invalid backreference" );
|
|
pnew = create_backref<CI>( nbackref, rggroups[nbackref]->group_width(), flags );
|
|
}
|
|
else
|
|
{
|
|
// Is this a user-defined intrinsic character set?
|
|
match_charset<CI> * pcharset = s_charset_map.get( *ipat, flags );
|
|
if( NULL != pcharset )
|
|
pnew = create_charset<CI>( *pcharset, flags );
|
|
else
|
|
pnew = create_atom<CI>( ipat, flags );
|
|
++ipat;
|
|
}
|
|
break;
|
|
|
|
// If quotemeta, loop until we find quotemeta off or end of string
|
|
case ESC_QUOTE_META_ON:
|
|
for( istart = itemp = ipat, fdone = false; !fdone && ipat != m_pat.end(); )
|
|
{
|
|
switch( SY::reg_token( ipat, m_pat.end() ) )
|
|
{
|
|
case ESC_QUOTE_META_OFF:
|
|
fdone = true;
|
|
break;
|
|
case NO_TOKEN:
|
|
++ipat; // fallthrough
|
|
default:
|
|
itemp = ipat;
|
|
break;
|
|
}
|
|
}
|
|
if( itemp != istart )
|
|
pgroup->add_item( create_atom<CI>( istart, itemp, flags ) );
|
|
|
|
// skip the quantification code below
|
|
return true;
|
|
|
|
// Should never get here for valid patterns
|
|
case ESC_QUOTE_META_OFF:
|
|
throw bad_regexpr("quotemeta turned off, but was never turned on");
|
|
|
|
default:
|
|
assert( ! "Unhandled token type" );
|
|
break;
|
|
}
|
|
|
|
// If pnew is null, then the current subexpression is a no-op.
|
|
if( pnew.get() )
|
|
{
|
|
// Look for quantifiers
|
|
_quantify( pnew, pnew_group, ipat );
|
|
|
|
// Add the item to the group
|
|
pgroup->add_item( pnew.release() );
|
|
}
|
|
return true;
|
|
}
|
|
|
|
template< typename CI, typename SY >
|
|
void basic_rpattern<CI,SY>::_quantify(
|
|
auto_sub_ptr<sub_expr<CI> > & pnew,
|
|
match_group<CI> * pnew_group,
|
|
typename basic_string<char_type>::iterator & ipat )
|
|
{
|
|
if( ipat != m_pat.end() && ! pnew->is_assertion() )
|
|
{
|
|
basic_string<char_type>::iterator itemp = ipat;
|
|
bool fmin = false;
|
|
|
|
// Since size_t is unsigned, -1 is really the largest size_t
|
|
size_t lbound = (size_t)-1;
|
|
size_t ubound = (size_t)-1;
|
|
size_t ubound_tmp;
|
|
|
|
switch( SY::quant_token( itemp, m_pat.end() ) )
|
|
{
|
|
case ZERO_OR_MORE_MIN:
|
|
fmin = true;
|
|
case ZERO_OR_MORE:
|
|
lbound = 0;
|
|
break;
|
|
|
|
case ONE_OR_MORE_MIN:
|
|
fmin = true;
|
|
case ONE_OR_MORE:
|
|
lbound = 1;
|
|
break;
|
|
|
|
case ZERO_OR_ONE_MIN:
|
|
fmin = true;
|
|
case ZERO_OR_ONE:
|
|
lbound = 0;
|
|
ubound = 1;
|
|
break;
|
|
|
|
case BEGIN_RANGE:
|
|
lbound = parse_int( itemp, m_pat.end() );
|
|
if( itemp == m_pat.end() )
|
|
throw bad_regexpr( "expecting end of range" );
|
|
|
|
switch( SY::quant_token( itemp, m_pat.end() ) )
|
|
{
|
|
case END_RANGE_MIN:
|
|
fmin = true;
|
|
case END_RANGE:
|
|
ubound = lbound;
|
|
break;
|
|
|
|
case RANGE_SEPARATOR:
|
|
ipat = itemp;
|
|
ubound_tmp = parse_int( itemp, m_pat.end() );
|
|
if( itemp != ipat )
|
|
ubound = ubound_tmp;
|
|
if( itemp == m_pat.end() )
|
|
throw bad_regexpr( "expecting end of range" );
|
|
switch( SY::quant_token( itemp, m_pat.end() ) )
|
|
{
|
|
case END_RANGE_MIN:
|
|
fmin = true;
|
|
case END_RANGE:
|
|
break;
|
|
default:
|
|
throw bad_regexpr( "expecting end of range" );
|
|
}
|
|
break;
|
|
|
|
default:
|
|
throw bad_regexpr( "ill-formed quantifier" );
|
|
}
|
|
|
|
if( ubound < lbound )
|
|
throw bad_regexpr( "ill-formed quantifier" );
|
|
|
|
break;
|
|
}
|
|
|
|
if( (size_t)-1 != lbound )
|
|
{
|
|
auto_sub_ptr<match_quantifier<CI> > pquant;
|
|
|
|
// a group quantifier is less efficient than an atom quantifier
|
|
if( fmin )
|
|
{
|
|
if( pnew_group )
|
|
pquant = new min_group_quantifier<CI>( pnew_group,
|
|
lbound, ubound );
|
|
else
|
|
pquant = new min_atom_quantifier<CI>( pnew.get(),
|
|
lbound, ubound );
|
|
}
|
|
else
|
|
{
|
|
if( pnew_group )
|
|
pquant = new max_group_quantifier<CI>( pnew_group,
|
|
lbound, ubound );
|
|
else
|
|
pquant = new max_atom_quantifier<CI>( pnew.get(),
|
|
lbound, ubound );
|
|
}
|
|
|
|
pnew.release();
|
|
pnew = pquant.release();
|
|
ipat = itemp;
|
|
}
|
|
}
|
|
}
|
|
|
|
template< typename CI, typename SY >
|
|
void basic_rpattern<CI,SY>::_add_subst_backref( subst_node & snode, size_t nbackref, size_t rstart )
|
|
{
|
|
m_fuses_backrefs = true;
|
|
assert( subst_node::SUBST_STRING == snode.stype );
|
|
if( snode.subst_string.rlength )
|
|
m_subst_list.push_back( snode );
|
|
|
|
snode.stype = subst_node::SUBST_BACKREF;
|
|
snode.subst_backref = nbackref;
|
|
m_subst_list.push_back( snode );
|
|
|
|
// re-initialize the subst_node
|
|
snode.stype = subst_node::SUBST_STRING;
|
|
snode.subst_string.rstart = rstart;
|
|
snode.subst_string.rlength = 0;
|
|
}
|
|
|
|
template< typename CI, typename SY >
|
|
void basic_rpattern<CI,SY>::_parse_subst()
|
|
{
|
|
TOKEN tok;
|
|
subst_node snode;
|
|
basic_string<char_type>::iterator icur = m_subst.begin();
|
|
size_t nbackref;
|
|
basic_string<char_type>::iterator itemp;
|
|
bool fdone;
|
|
|
|
m_fuses_backrefs = false;
|
|
|
|
// Initialize the subst_node
|
|
snode.stype = subst_node::SUBST_STRING;
|
|
snode.subst_string.rstart = 0;
|
|
snode.subst_string.rlength = 0;
|
|
|
|
while( icur != m_subst.end() )
|
|
{
|
|
switch( tok = SY::subst_token( icur, m_subst.end() ) )
|
|
{
|
|
case SUBST_MATCH:
|
|
_add_subst_backref( snode, 0, distance( m_subst.begin(), icur ) );
|
|
break;
|
|
|
|
case SUBST_PREMATCH:
|
|
_add_subst_backref( snode, subst_node::PREMATCH, distance( m_subst.begin(), icur ) );
|
|
break;
|
|
|
|
case SUBST_POSTMATCH:
|
|
_add_subst_backref( snode, subst_node::POSTMATCH, distance( m_subst.begin(), icur ) );
|
|
break;
|
|
|
|
case SUBST_BACKREF:
|
|
nbackref = parse_int( icur, m_subst.end(), cgroups() - 1 ); // always at least 1 group
|
|
if( 0 == nbackref )
|
|
throw bad_regexpr( "invalid backreference in substitution" );
|
|
|
|
_add_subst_backref( snode, nbackref, distance( m_subst.begin(), icur ) );
|
|
break;
|
|
|
|
case SUBST_QUOTE_META_ON:
|
|
assert( subst_node::SUBST_STRING == snode.stype );
|
|
if( snode.subst_string.rlength )
|
|
m_subst_list.push_back( snode );
|
|
|
|
snode.subst_string.rstart = distance( m_subst.begin(), icur );
|
|
for( itemp = icur, fdone = false; !fdone && icur != m_subst.end(); )
|
|
{
|
|
switch( tok = SY::subst_token( icur, m_subst.end() ) )
|
|
{
|
|
case SUBST_ALL_OFF:
|
|
fdone = true;
|
|
break;
|
|
case NO_TOKEN:
|
|
++icur; // fall-through
|
|
default:
|
|
itemp = icur;
|
|
break;
|
|
}
|
|
}
|
|
snode.subst_string.rlength = distance( m_subst.begin(), itemp ) - snode.subst_string.rstart;
|
|
if( snode.subst_string.rlength )
|
|
m_subst_list.push_back( snode );
|
|
|
|
if( tok == SUBST_ALL_OFF )
|
|
{
|
|
snode.stype = subst_node::SUBST_OP;
|
|
snode.op = subst_node::ALL_OFF;
|
|
m_subst_list.push_back( snode );
|
|
}
|
|
|
|
// re-initialize the subst_node
|
|
snode.stype = subst_node::SUBST_STRING;
|
|
snode.subst_string.rstart = distance( m_subst.begin(), icur );
|
|
snode.subst_string.rlength = 0;
|
|
break;
|
|
|
|
case SUBST_UPPER_ON:
|
|
case SUBST_UPPER_NEXT:
|
|
case SUBST_LOWER_ON:
|
|
case SUBST_LOWER_NEXT:
|
|
case SUBST_ALL_OFF:
|
|
assert( subst_node::SUBST_STRING == snode.stype );
|
|
if( snode.subst_string.rlength )
|
|
m_subst_list.push_back( snode );
|
|
|
|
snode.stype = subst_node::SUBST_OP;
|
|
snode.op = (subst_node::op_type) tok;
|
|
m_subst_list.push_back( snode );
|
|
|
|
// re-initialize the subst_node
|
|
snode.stype = subst_node::SUBST_STRING;
|
|
snode.subst_string.rstart = distance( m_subst.begin(), icur );
|
|
snode.subst_string.rlength = 0;
|
|
break;
|
|
|
|
case SUBST_ESCAPE:
|
|
if( icur == m_subst.end() )
|
|
throw bad_regexpr("expecting escape sequence in substitution string");
|
|
assert( subst_node::SUBST_STRING == snode.stype );
|
|
if( snode.subst_string.rlength )
|
|
m_subst_list.push_back( snode );
|
|
snode.subst_string.rstart = distance( m_subst.begin(), icur++ );
|
|
snode.subst_string.rlength = 1;
|
|
break;
|
|
|
|
case NO_TOKEN:
|
|
default:
|
|
++snode.subst_string.rlength;
|
|
++icur;
|
|
break;
|
|
}
|
|
}
|
|
assert( subst_node::SUBST_STRING == snode.stype );
|
|
if( snode.subst_string.rlength )
|
|
m_subst_list.push_back( snode );
|
|
}
|
|
|
|
template< typename CI, typename SY >
|
|
typename basic_rpattern<CI,SY>::charset_map basic_rpattern<CI,SY>::s_charset_map;
|
|
|
|
// Pass in an interator to one after the opening bracket of the character set.
|
|
// On return, icur points to one character after the closing bracket
|
|
template< typename CI, typename SY >
|
|
sub_expr<CI> * create_charset_helper<CI,SY>::create_charset_aux(
|
|
basic_string<char_type> & str,
|
|
typename basic_string<char_type>::iterator & icur,
|
|
unsigned flags )
|
|
{
|
|
bool fcomplement = false;
|
|
match_charset<CI> * pnew = NULL;
|
|
basic_string<iterator_traits<CI>::value_type>::iterator itemp = icur;
|
|
|
|
if( itemp != str.end() && CHARSET_NEGATE == SY::charset_token( itemp, str.end() ) )
|
|
{
|
|
fcomplement = true;
|
|
icur = itemp;
|
|
}
|
|
|
|
switch( ( NOCASE | CSTRINGS ) & flags )
|
|
{
|
|
case 0:
|
|
pnew = new match_custom_charset_t<eos_t<CI>,match_range_with_case>( fcomplement, icur, str.end(), flags, SY() );
|
|
break;
|
|
case NOCASE:
|
|
pnew = new match_custom_charset_t<eos_t<CI>,match_range_no_case>( fcomplement, icur, str.end(), flags, SY() );
|
|
break;
|
|
case CSTRINGS:
|
|
pnew = new match_custom_charset_t<eocs_t<CI>,match_range_with_case>( fcomplement, icur, str.end(), flags, SY() );
|
|
break;
|
|
case NOCASE | CSTRINGS:
|
|
pnew = new match_custom_charset_t<eocs_t<CI>,match_range_no_case>( fcomplement, icur, str.end(), flags, SY() );
|
|
break;
|
|
default:
|
|
__assume(0); // tells the compiler that this is unreachable
|
|
}
|
|
|
|
return pnew;
|
|
}
|
|
|
|
#pragma warning( disable : 4660 )
|
|
|
|
// Explicit instantiation
|
|
#ifdef REGEX_FORCE_INSTANTIATION
|
|
template class basic_regexpr<char>;
|
|
template class basic_regexpr<wint_t>;
|
|
#else
|
|
template class basic_regexpr<TCHAR>;
|
|
#endif
|
|
|
|
#ifndef NO_PERL_RE
|
|
#ifdef REGEX_FORCE_INSTANTIATION
|
|
template class basic_rpattern<const char *>;
|
|
template class basic_rpattern<const wint_t *>;
|
|
template class basic_rpattern<string::const_iterator>;
|
|
template class basic_rpattern<wstring::const_iterator>;
|
|
#else
|
|
template class basic_rpattern<const TCHAR *>;
|
|
template class basic_rpattern<tstring::const_iterator>;
|
|
#endif
|
|
#endif
|
|
|
|
#ifdef POSIX_RE
|
|
#ifdef REGEX_FORCE_INSTANTIATION
|
|
template class basic_rpattern<const char *,posix_syntax<char> >;
|
|
template class basic_rpattern<const wint_t *,posix_syntax<wint_t> >;
|
|
template class basic_rpattern<string::const_iterator,posix_syntax<char> >;
|
|
template class basic_rpattern<wstring::const_iterator,posix_syntax<wint_t> >;
|
|
#else
|
|
template class basic_rpattern<const TCHAR *,posix_syntax<TCHAR> >;
|
|
template class basic_rpattern<tstring::const_iterator,posix_syntax<TCHAR> >;
|
|
#endif
|
|
#endif
|
|
|
|
} // namespace regex
|
|
|