Leaked source code of windows server 2003
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 
 
 

613 lines
16 KiB

//+---------------------------------------------------------------------------
//
// File: syntax.h
//
// Contents: syntax modules for regexpr
//
// Classes: perl_syntax, posix_syntax
//
// History: 3-29-00 ericne Created
//
//----------------------------------------------------------------------------
#pragma once
#pragma warning(push)
// warning C4511: copy constructor could not be generated
// warning C4512: assignment operator could not be generated
#pragma warning( disable : 4511 4512 )
#include <string>
#include <cwchar>
#include <iterator>
#ifndef ARRAYSIZE
#define ARRAYSIZE(x) (sizeof(x)/sizeof(*(x)))
#endif
#ifndef UCHAR_MAX
#define UCHAR_MAX 0xff
#endif
#ifndef WCHAR_MAX
#define WCHAR_MAX ((wchar_t)-1)
#endif
// Explicit iterator_traits specialization so that a raw `const char *` can be
// used as a random-access iterator over narrow characters.
// NOTE(review): presumably a workaround for standard libraries that cannot
// derive pointer traits through partial specialization -- confirm before
// removing it.
//
// `pointer` and `reference` carry the const qualifier so that they match what
// dereferencing a `const char *` actually yields; `value_type` stays the
// unqualified character type.
template<>
struct std::iterator_traits< const char * >
{
    typedef random_access_iterator_tag iterator_category;
    typedef char                       value_type;
    typedef ptrdiff_t                  difference_type;
    typedef difference_type            distance_type;   // retained
    typedef const char *               pointer;
    typedef const char &               reference;
};
// Explicit iterator_traits specialization so that a raw `const wchar_t *` can
// be used as a random-access iterator over wide characters.
// NOTE(review): presumably a workaround for standard libraries that cannot
// derive pointer traits through partial specialization -- confirm before
// removing it.
//
// `pointer` and `reference` carry the const qualifier so that they match what
// dereferencing a `const wchar_t *` actually yields; `value_type` stays the
// unqualified character type.
template<>
struct std::iterator_traits< const wchar_t * >
{
    typedef random_access_iterator_tag iterator_category;
    typedef wchar_t                    value_type;
    typedef ptrdiff_t                  difference_type;
    typedef difference_type            distance_type;   // retained
    typedef const wchar_t *            pointer;
    typedef const wchar_t &            reference;
};
namespace regex
{
//
// Every token a syntax module can hand back to its caller.
//
// The numeric values are significant, so the order must never change. The
// first enumerator of each group is pinned to its value to make an accidental
// insertion or reordering visible.
//
enum TOKEN
{
    NO_TOKEN = 0,               // "nothing recognized"; tested as false by callers

    // REGULAR TOKENS
    BEGIN_GROUP = 1,
    END_GROUP,
    ALTERNATION,
    BEGIN_LINE,
    END_LINE,
    BEGIN_CHARSET,
    MATCH_ANY,
    ESCAPE,

    // QUANTIFICATION TOKENS (the *_MIN forms are produced when a '?' follows
    // the quantifier in perl_syntax::quant_token)
    ONE_OR_MORE = 9,
    ZERO_OR_MORE,
    ZERO_OR_ONE,
    ONE_OR_MORE_MIN,
    ZERO_OR_MORE_MIN,
    ZERO_OR_ONE_MIN,
    BEGIN_RANGE,
    RANGE_SEPARATOR,
    END_RANGE,
    END_RANGE_MIN,

    // ESCAPE SEQUENCES
    ESC_DIGIT = 19,
    ESC_NOT_DIGIT,
    ESC_SPACE,
    ESC_NOT_SPACE,
    ESC_WORD,
    ESC_NOT_WORD,
    ESC_BEGIN_STRING,
    ESC_END_STRING,
    ESC_END_STRING_z,
    ESC_WORD_BOUNDARY,
    ESC_NOT_WORD_BOUNDARY,
    ESC_WORD_START,
    ESC_WORD_STOP,
    ESC_QUOTE_META_ON,
    ESC_QUOTE_META_OFF,

    // SUBSTITUTION TOKENS
    SUBST_BACKREF = 34,
    SUBST_PREMATCH,
    SUBST_POSTMATCH,
    SUBST_MATCH,
    SUBST_ESCAPE,
    SUBST_QUOTE_META_ON,
    SUBST_UPPER_ON,
    SUBST_UPPER_NEXT,
    SUBST_LOWER_ON,
    SUBST_LOWER_NEXT,
    SUBST_ALL_OFF,

    // CHARSET TOKENS
    CHARSET_NEGATE = 45,
    CHARSET_ESCAPE,
    CHARSET_RANGE,
    CHARSET_BACKSPACE,
    CHARSET_END,

    // Named character classes. The charset_token() functions compute
    // TOKEN(CHARSET_ALNUM + i) from an index i into g_rgposix_charsets, so
    // this run has to stay contiguous.
    // NOTE(review): presumably in the same order as that table -- the table
    // definition is not in this header, confirm there.
    CHARSET_ALNUM = 50,
    CHARSET_ALPHA,
    CHARSET_BLANK,
    CHARSET_CNTRL,
    CHARSET_DIGIT,
    CHARSET_GRAPH,
    CHARSET_LOWER,
    CHARSET_PRINT,
    CHARSET_PUNCT,
    CHARSET_SPACE,
    CHARSET_UPPER,
    CHARSET_XDIGIT,

    // EXTENSION TOKENS
    EXT_NOBACKREF = 62,
    EXT_POS_LOOKAHEAD,
    EXT_NEG_LOOKAHEAD,
    EXT_POS_LOOKBEHIND,
    EXT_NEG_LOOKBEHIND,
    EXT_INDEPENDENT,
    EXT_UNKNOWN
};
// One entry of the POSIX character-class table: a piece of text to look for
// in a pattern, paired with a character count.
struct posix_charset_type
{
    // NUL-terminated text that is_posix_charset() compares against the pattern.
    const char * const szcharset;
    // How far the charset_token() functions advance the pattern iterator
    // after szcharset has matched.
    const size_t cchars;

    posix_charset_type( const char * const text, const size_t length )
        : szcharset( text )
        , cchars( length )
    {
    }
};

// The character-class table and its entry count. Only declared here; the
// definitions are not part of this header.
extern const posix_charset_type g_rgposix_charsets[];
extern const size_t g_cposix_charsets;
// Tests whether the character sequence [icur, iend) begins with the
// NUL-terminated string szcharset.
//
//   icur, iend - the pattern text still to be examined
//   szcharset  - the text to look for
//
// Returns true only when every character of szcharset was matched. An input
// that ends before szcharset does is a mismatch; an empty szcharset always
// matches.
template< typename const_iterator >
bool is_posix_charset( const_iterator icur, const_iterator iend, const char * szcharset )
{
    while( icur != iend && '\0' != *szcharset )
    {
        if( *icur != *szcharset )
        {
            return false;
        }
        ++icur;
        ++szcharset;
    }

    // The loop stops either at the end of szcharset (match) or at the end of
    // the input (mismatch, unless szcharset ended at the same time).
    return '\0' == *szcharset;
}
//
// The perl_syntax class encapsulates the Perl 5 regular expression syntax. It is
// used as a template parameter to basic_rpattern. To customize regex syntax, create
// your own syntax class and use it as a template parameter instead.
//
// Character-type-independent part of perl_syntax<CH>: two token lookup
// tables plus the helpers that index them.
class perl_syntax_base
{
protected:
    // Token tables with one slot per unsigned char value. perl_syntax::reg_token
    // consults s_rgreg for a pattern character and s_rgescape for the
    // character that follows an ESCAPE.
    static TOKEN s_rgreg[ UCHAR_MAX + 1 ];
    static TOKEN s_rgescape[ UCHAR_MAX + 1 ];

    // Helper type with a single static instance.
    // NOTE(review): presumably its constructor fills the two tables during
    // static initialization -- the constructor body is not in this header.
    struct init_perl_syntax;
    friend struct init_perl_syntax;
    static struct init_perl_syntax
    {
        init_perl_syntax();
    } s_init_perl_syntax;

    // Returns the entry of rg for a narrow character. The cast keeps a
    // negative char from producing a negative index.
    static inline TOKEN look_up( char ch, TOKEN rg[] )
    {
        return rg[ static_cast<unsigned char>( ch ) ];
    }

    // Returns the entry of rg for a wide character, or NO_TOKEN when the
    // character has no slot in the table. The comparison is done on an
    // unsigned value: where wchar_t is a signed type, a negative character
    // must be rejected as out of range rather than truncated to its low byte
    // and looked up as some unrelated character.
    static inline TOKEN look_up( wchar_t ch, TOKEN rg[] )
    {
        return static_cast<unsigned long>( ch ) > UCHAR_MAX
            ? NO_TOKEN
            : rg[ static_cast<unsigned char>( ch ) ];
    }
};
// Tokenizer for Perl-style pattern text over character type CH.
//
// Every public function receives the current position (icur, by reference)
// and the end of the text (iend). It returns the token recognized at icur and
// advances icur past the characters that make up that token. When nothing is
// recognized it returns NO_TOKEN and leaves icur where it was. All functions
// require icur != iend on entry (asserted).
template< typename CH >
class perl_syntax : protected perl_syntax_base
{
public:
    // `typename` is mandatory here: both are dependent type names.
    typedef typename std::basic_string<CH>::iterator iterator;
    typedef typename std::basic_string<CH>::const_iterator const_iterator;
    typedef CH char_type;

private:
    // Steps over the quantifier character at icur. If a '?' follows, it is
    // consumed as well and true is returned; otherwise returns false.
    static bool min_quant( iterator & icur, const_iterator iend )
    {
        ++icur;
        if( static_cast<const_iterator>( icur ) != iend && CH('?') == *icur )
        {
            ++icur;
            return true;
        }
        return false;
    }

public:
    // Recognizes a regular token by table lookup. After an ESCAPE the next
    // character is looked up in the escape table: if it has an entry, that
    // token is returned and the character consumed; otherwise plain ESCAPE is
    // returned with icur on the escaped character.
    static TOKEN reg_token( iterator & icur, const_iterator iend )
    {
        assert( static_cast<const_iterator>( icur ) != iend );
        TOKEN tok = look_up( *icur, s_rgreg );
        if( tok )
            ++icur;
        if( ESCAPE == tok && static_cast<const_iterator>( icur ) != iend )
        {
            tok = look_up( *icur, s_rgescape );
            if( tok )
                ++icur;
            else
                tok = ESCAPE;
        }
        return tok;
    }

    // Recognizes a quantifier token. '*', '+', '?' and '}' yield their *_MIN
    // variant when directly followed by '?'.
    static TOKEN quant_token( iterator & icur, const_iterator iend )
    {
        assert( static_cast<const_iterator>( icur ) != iend );
        TOKEN tok = NO_TOKEN;
        switch( *icur )
        {
        case CH('*'):
            tok = min_quant( icur, iend ) ? ZERO_OR_MORE_MIN : ZERO_OR_MORE;
            break;
        case CH('+'):
            tok = min_quant( icur, iend ) ? ONE_OR_MORE_MIN : ONE_OR_MORE;
            break;
        case CH('?'):
            tok = min_quant( icur, iend ) ? ZERO_OR_ONE_MIN : ZERO_OR_ONE;
            break;
        case CH('}'):
            tok = min_quant( icur, iend ) ? END_RANGE_MIN : END_RANGE;
            break;
        case CH('{'):
            tok = BEGIN_RANGE;
            ++icur;
            break;
        case CH(','):
            tok = RANGE_SEPARATOR;
            ++icur;
            break;
        }
        return tok;
    }

    // Recognizes a token inside a character set: '-', '^', ']', a backslash
    // sequence, or a named class from g_rgposix_charsets introduced by '['.
    static TOKEN charset_token( iterator & icur, const_iterator iend )
    {
        assert( static_cast<const_iterator>( icur ) != iend );
        TOKEN tok = NO_TOKEN;
        switch( *icur )
        {
        case CH('-'):
            tok = CHARSET_RANGE;
            ++icur;
            break;
        case CH('^'):
            tok = CHARSET_NEGATE;
            ++icur;
            break;
        case CH(']'):
            tok = CHARSET_END;
            ++icur;
            break;
        case CH('\\'):
            // The backslash is always consumed. If the next character is not
            // one of the letters below, the result is CHARSET_ESCAPE with
            // icur left on that character.
            tok = CHARSET_ESCAPE;
            if( static_cast<const_iterator>( ++icur ) == iend )
                break;
            switch( *icur )
            {
            case CH('b'):
                tok = CHARSET_BACKSPACE;
                ++icur;
                break;
            case CH('d'):
                tok = ESC_DIGIT;
                ++icur;
                break;
            case CH('D'):
                tok = ESC_NOT_DIGIT;
                ++icur;
                break;
            case CH('s'):
                tok = ESC_SPACE;
                ++icur;
                break;
            case CH('S'):
                tok = ESC_NOT_SPACE;
                ++icur;
                break;
            case CH('w'):
                tok = ESC_WORD;
                ++icur;
                break;
            case CH('W'):
                tok = ESC_NOT_WORD;
                ++icur;
                break;
            }
            break;
        case CH('['):
            // Try each table entry in turn; the first one that matches the
            // text at icur decides the token and how far to advance.
            for( size_t i=0; !tok && i < g_cposix_charsets; ++i )
            {
                if( is_posix_charset<const_iterator>( icur, iend, g_rgposix_charsets[i].szcharset ) )
                {
                    tok = TOKEN(CHARSET_ALNUM + i);
                    std::advance( icur, g_rgposix_charsets[i].cchars );
                }
            }
            break;
        }
        return tok;
    }

    // Recognizes a token in substitution text. A backslash yields
    // SUBST_ESCAPE unless followed by Q, U, u, L, l or E; a '$' yields
    // SUBST_BACKREF unless followed by '&', '`' or '\''. The introducing
    // character is always consumed; the following one only when it selected
    // a more specific token.
    static TOKEN subst_token( iterator & icur, const_iterator iend )
    {
        assert( static_cast<const_iterator>( icur ) != iend );
        TOKEN tok = NO_TOKEN;
        switch( *icur )
        {
        case CH('\\'):
            tok = SUBST_ESCAPE;
            if( static_cast<const_iterator>( ++icur ) != iend )
            {
                switch( *icur )
                {
                case CH('Q'):
                    tok = SUBST_QUOTE_META_ON;
                    ++icur;
                    break;
                case CH('U'):
                    tok = SUBST_UPPER_ON;
                    ++icur;
                    break;
                case CH('u'):
                    tok = SUBST_UPPER_NEXT;
                    ++icur;
                    break;
                case CH('L'):
                    tok = SUBST_LOWER_ON;
                    ++icur;
                    break;
                case CH('l'):
                    tok = SUBST_LOWER_NEXT;
                    ++icur;
                    break;
                case CH('E'):
                    tok = SUBST_ALL_OFF;
                    ++icur;
                    break;
                }
            }
            break;
        case CH('$'):
            tok = SUBST_BACKREF;
            if( static_cast<const_iterator>( ++icur ) != iend )
            {
                switch( *icur )
                {
                case CH('&'):
                    tok = SUBST_MATCH;
                    ++icur;
                    break;
                case CH('`'):
                    tok = SUBST_PREMATCH;
                    ++icur;
                    break;
                case CH('\''):
                    tok = SUBST_POSTMATCH;
                    ++icur;
                    break;
                }
            }
            break;
        }
        return tok;
    }

    // Recognizes a group extension introduced by '?'. Returns NO_TOKEN when
    // icur is not on a '?'. Otherwise the '?' is consumed and the result is
    // one of the EXT_* tokens, with EXT_UNKNOWN standing for anything that
    // was not understood.
    //
    // A sequence of the letters i, m, s (optionally switched to "clear" mode
    // by one '-') updates the NOCASE, MULTILINE and SINGLELINE bits of flags.
    // It produces EXT_NOBACKREF when terminated by ':' (consumed) or ')'
    // (left in place).
    static TOKEN ext_token( iterator & icur, const_iterator iend, unsigned & flags )
    {
        assert( static_cast<const_iterator>( icur ) != iend );
        TOKEN tok = NO_TOKEN;
        if( CH('?') == *icur )
        {
            tok = EXT_UNKNOWN;
            if( static_cast<const_iterator>( ++icur ) != iend )
            {
                switch( *icur )
                {
                case CH(':'):
                    tok = EXT_NOBACKREF;
                    ++icur;
                    break;
                case CH('='):
                    tok = EXT_POS_LOOKAHEAD;
                    ++icur;
                    break;
                case CH('!'):
                    tok = EXT_NEG_LOOKAHEAD;
                    ++icur;
                    break;
                case CH('<'):
                    // Only "<=" and "<!" are known; anything else stays
                    // EXT_UNKNOWN.
                    if( static_cast<const_iterator>( ++icur ) == iend )
                        break;
                    switch( *icur )
                    {
                    case CH('='):
                        tok = EXT_POS_LOOKBEHIND;
                        ++icur;
                        break;
                    case CH('!'):
                        tok = EXT_NEG_LOOKBEHIND;
                        ++icur;
                        break;
                    }
                    break;
                case CH('>'):
                    tok = EXT_INDEPENDENT;
                    ++icur;
                    break;
                default:
                    {
                        // true: letters set their flag; false (after a '-'):
                        // letters clear it.
                        bool finclude = true;
                        do
                        {
                            if( CH(':') == *icur )
                            {
                                tok = EXT_NOBACKREF;
                                ++icur;
                                break;
                            }
                            if( CH(')') == *icur )
                            {
                                tok = EXT_NOBACKREF;
                                break;
                            }
                            if( CH('-') == *icur && finclude )
                                finclude = false;
                            else if( CH('i') == *icur )
                                flags = finclude ? ( flags | NOCASE ) : ( flags & ~NOCASE );
                            else if( CH('m') == *icur )
                                flags = finclude ? ( flags | MULTILINE ) : ( flags & ~MULTILINE );
                            else if( CH('s') == *icur )
                                flags = finclude ? ( flags | SINGLELINE ) : ( flags & ~SINGLELINE );
                            else
                                break;  // unknown character (including a second '-'): EXT_UNKNOWN
                        } while( static_cast<const_iterator>( ++icur ) != iend );
                    }
                    break;
                }
            }
        }
        return tok;
    }
};
//
// Tokenizer for the basic POSIX regular expression syntax over character
// type CH.
//
// Every function receives the current position (icur, by reference) and the
// end of the text (iend), and returns the token recognized at icur. Callers
// must pass icur != iend: the functions dereference icur without checking.
//
template< typename CH >
class posix_syntax
{
public:
    // `typename` is mandatory here: both are dependent type names.
    typedef typename std::basic_string<CH>::iterator iterator;
    typedef typename std::basic_string<CH>::const_iterator const_iterator;
    typedef CH char_type;

    // Recognizes a regular token: '.', '^', '$', '[' or a backslash
    // sequence. A backslash is always consumed and yields ESCAPE unless it
    // is followed by '(', ')' or '|', which are consumed too and yield the
    // group / alternation tokens. Returns NO_TOKEN, without advancing, for
    // any other character.
    static TOKEN reg_token( iterator & icur, const_iterator iend )
    {
        TOKEN tok = NO_TOKEN;
        switch( *icur )
        {
        case '.':
            tok = MATCH_ANY;
            ++icur;
            break;
        case '^':
            tok = BEGIN_LINE;
            ++icur;
            break;
        case '$':
            tok = END_LINE;
            ++icur;
            break;
        case '[':
            tok = BEGIN_CHARSET;
            ++icur;
            break;
        case '\\':
            tok = ESCAPE;
            ++icur;
            if( (const_iterator)icur != iend )
            {
                switch( *icur )
                {
                case '(':
                    tok = BEGIN_GROUP;
                    ++icur;
                    break;
                case ')':
                    tok = END_GROUP;
                    ++icur;
                    break;
                case '|':
                    tok = ALTERNATION;
                    ++icur;
                    break;
                }
            }
            break;
        }
        return tok;
    }

    // Recognizes a quantifier token: '*', ',' or one of the backslash
    // sequences \? \+ \{ \}. A backslash that does not start one of those
    // sequences is not consumed: icur is restored and NO_TOKEN is returned.
    static TOKEN quant_token( iterator & icur, const_iterator iend )
    {
        TOKEN tok = NO_TOKEN;
        switch( *icur )
        {
        case '*':
            tok = ZERO_OR_MORE;
            ++icur;
            break;
        case ',':
            tok = RANGE_SEPARATOR;
            ++icur;
            break;
        case '\\':
            ++icur;     // tentatively step over the backslash
            if( (const_iterator)icur != iend )
            {
                switch( *icur )
                {
                case '?':
                    tok = ZERO_OR_ONE;
                    ++icur;
                    break;
                case '+':
                    tok = ONE_OR_MORE;
                    ++icur;
                    break;
                case '{':
                    tok = BEGIN_RANGE;
                    ++icur;
                    break;
                case '}':
                    tok = END_RANGE;
                    ++icur;
                    break;
                default:
                    --icur;     // not a quantifier: undo the tentative step
                    break;
                }
            }
            else
            {
                --icur;         // trailing backslash: undo the tentative step
            }
        }
        return tok;
    }

    // Recognizes a token inside a character set: '^', '-', ']' or a named
    // class from g_rgposix_charsets introduced by '['. Returns NO_TOKEN,
    // without advancing, for anything else.
    static TOKEN charset_token( iterator & icur, const_iterator iend )
    {
        TOKEN tok = NO_TOKEN;
        switch( *icur )
        {
        case '^':
            tok = CHARSET_NEGATE;
            ++icur;
            break;
        case '-':
            tok = CHARSET_RANGE;
            ++icur;
            break;
        case ']':
            tok = CHARSET_END;
            ++icur;
            break;
        case '[':
            // Try each table entry in turn; the first one that matches the
            // text at icur decides the token and how far to advance.
            for( size_t i=0; !tok && i < g_cposix_charsets; ++i )
            {
                if( is_posix_charset<const_iterator>( icur, iend, g_rgposix_charsets[i].szcharset ) )
                {
                    tok = TOKEN(CHARSET_ALNUM + i);
                    std::advance( icur, g_rgposix_charsets[i].cchars );
                }
            }
            break;
        }
        return tok;
    }

    // Recognizes a token in substitution text. A backslash is consumed and
    // yields SUBST_BACKREF when a decimal digit follows (the digit itself is
    // left for the caller), SUBST_ESCAPE otherwise. Any other character
    // yields NO_TOKEN.
    static TOKEN subst_token( iterator & icur, const_iterator iend )
    {
        TOKEN tok = NO_TOKEN;
        if( '\\' == *icur )
        {
            tok = SUBST_ESCAPE;
            ++icur;
            if( (const_iterator)icur != iend && '0' <= *icur && '9' >= *icur )
            {
                tok = SUBST_BACKREF;
            }
        }
        return tok;
    }

    // This syntax has no group extensions: always returns NO_TOKEN and
    // touches none of its arguments.
    static TOKEN ext_token( iterator &, const_iterator, unsigned & )
    {
        return NO_TOKEN;
    }
};
} // namespace regex
#pragma warning(pop)