Leaked source code of windows server 2003
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

3127 lines
100 KiB

  1. //+---------------------------------------------------------------------------
  2. //
  3. // File: basic_regexpr.cxx
  4. //
  5. // Contents:
  6. //
  7. // Classes:
  8. //
  9. // Functions:
  10. //
  11. // Coupling:
  12. //
  13. // Notes:
  14. //
  15. // History: 1-11-1999 ericne Created
  16. //
  17. //----------------------------------------------------------------------------
  18. // unlimited inline expansion (compile with /Ob1 or /Ob2)
  19. #pragma inline_depth(255)
  20. // warning C4355: 'this' : used in base member initializer list
  21. // warning C4511: copy constructor could not be generated
  22. // warning C4512: assignment operator could not be generated
  23. // warning C4660: template-class specialization 'foo<bar>' is already instantiated
  24. // warning C4706: assignment within conditional expression
  25. // warning C4786: identifier was truncated to '255' characters in the debug information
  26. // warning C4800: 'int' : forcing value to bool 'true' or 'false' (performance warning)
  27. #pragma warning( disable : 4355 4511 4512 4660 4706 4786 4800 )
  28. #include <assert.h>
  29. #include <malloc.h> // for _alloca
  30. #include <algorithm>
  31. #include <minmax.h>
  32. #include "regexpr.h"
  33. using namespace std;
  34. namespace regex
  35. {
  #if defined( _MT )
  // Only present when _MT is defined: the global critical section used to
  // synchronize the creation of static const patterns.
  CRegExCritSect g_objRegExCritSect;
  #endif // _MT
  // For use while doing uppercase/lowercase conversions:
  // Single-character case conversion for narrow and wide characters.
  // The narrow overloads convert the argument to unsigned char before calling
  // toupper()/tolower(): handing those functions a negative value (any char
  // above 0x7F on a platform where char is signed) is undefined behavior.
  inline char to_upper( char ch ) { return static_cast<char>( toupper( static_cast<unsigned char>( ch ) ) ); }
  inline char to_lower( char ch ) { return static_cast<char>( tolower( static_cast<unsigned char>( ch ) ) ); }
  inline wchar_t to_upper( wchar_t ch ) { return static_cast<wchar_t>( towupper( ch ) ); }
  inline wchar_t to_lower( wchar_t ch ) { return static_cast<wchar_t>( towlower( ch ) ); }
  // Upper-cases, in place, every character in the range [ibegin, iend).
  // The two iterator types may differ; ibegin is converted to CI for the
  // end-of-range comparison.
  template< typename II, typename CI >
  void to_upper( II ibegin, CI iend )
  {
      while( static_cast<CI>( ibegin ) != iend )
      {
          *ibegin = to_upper( *ibegin );
          ++ibegin;
      }
  }
  // Lower-cases, in place, every character in the range [ibegin, iend).
  // The two iterator types may differ; ibegin is converted to CI for the
  // end-of-range comparison.
  template< typename II, typename CI >
  void to_lower( II ibegin, CI iend )
  {
      while( static_cast<CI>( ibegin ) != iend )
      {
          *ibegin = to_lower( *ibegin );
          ++ibegin;
      }
  }
  // Parses an unsigned decimal number from the front of [istr, iend).
  //
  //  istr   in/out: on return it addresses the first character that was not
  //         consumed.
  //  iend   end of the input.
  //  m_max  largest value that may be returned (default: the largest unsigned).
  //
  // Digits are consumed while the value accumulated so far is below m_max. A
  // digit that would push the value above m_max is left unconsumed, so the
  // result never exceeds m_max and never wraps around. (The previous version
  // consumed that digit, then divided by ten and stepped the iterator back,
  // which yields the same result for small m_max but overflowed silently for
  // large ones and required a bidirectional iterator.)
  template< typename II, typename CI >
  unsigned parse_int( II & istr, CI iend, const unsigned m_max = unsigned(-1) )
  {
      unsigned retval = 0;
      while( static_cast<CI>( istr ) != iend && '0' <= *istr && '9' >= *istr && m_max > retval )
      {
          const unsigned digit = static_cast<unsigned>( *istr ) - static_cast<unsigned>( '0' );
          // Is retval * 10 + digit > m_max? Answer without doing the
          // multiplication so that the test itself cannot overflow.
          if( digit > m_max || retval > ( m_max - digit ) / 10 )
              break;
          retval = retval * 10 + digit;
          ++istr;
      }
      return retval;
  }
  // This class is used to speed up character set matching by providing
  // a bitset with one bit for every unsigned char value (0..UCHAR_MAX).
  // std::bitset is not used because the range-checking slows it down.
  // Note: The division and modulus operations are optimized by the compiler
  // into bit-shift operations.
  class ascii_bitvector
  {
      typedef unsigned int elem_type;
      // sizeof applied to a type name needs parentheses in standard C++.
      enum { CBELEM = 8 * sizeof( elem_type ),      // count of bits per element
             CELEMS = ( UCHAR_MAX + 1 ) / CBELEM }; // number of elements in the array
      elem_type m_rg[ CELEMS ];
      // Used to inline operations like: bv1 |= ~bv2; without creating temp bit vectors.
      // operator~ only records a reference to its operand; the complement is
      // taken element by element inside operator|=.
      struct not_ascii_bitvector
      {
          const ascii_bitvector & m_ref;
          not_ascii_bitvector( const ascii_bitvector & ref ) throw()
              : m_ref(ref) {}
      };
  public:
      // Starts with every bit cleared.
      ascii_bitvector() throw()
          { memset( m_rg, 0, sizeof( m_rg ) ); }
      // Turns on the bit for ch.
      inline void set( unsigned char ch ) throw()
          { m_rg[ ( ch / CBELEM ) ] |= ( (elem_type)1U << ( ch % CBELEM ) ); }
      // Returns true when the bit for ch is on.
      inline bool operator[]( unsigned char ch ) const throw()
          { return 0 != ( m_rg[ ( ch / CBELEM ) ] & ( (elem_type)1U << ( ch % CBELEM ) ) ); }
      // Returns a lightweight "complement of *this" proxy for use with |=.
      inline not_ascii_bitvector operator~() const throw()
          { return not_ascii_bitvector(*this); }
      // Adds every bit that is on in that.
      inline ascii_bitvector & operator|=( const ascii_bitvector & that ) throw()
          { for( int i=0; i<CELEMS; ++i )
                m_rg[i] |= that.m_rg[i];
            return *this; }
      // Adds every bit that is off in the vector the proxy refers to.
      inline ascii_bitvector & operator|=( const not_ascii_bitvector & that ) throw()
          { for( int i=0; i<CELEMS; ++i )
                m_rg[i] |= ~that.m_ref.m_rg[i];
            return *this; }
  };
  109. const ascii_bitvector & get_digit_vector(void)
  110. {
  111. // 0-9
  112. class digit_vector : public ascii_bitvector
  113. {
  114. public:
  115. digit_vector()
  116. {
  117. unsigned char ich;
  118. for( ich ='0'; ich <= '9'; ++ich )
  119. set(ich);
  120. }
  121. };
  122. static const digit_vector s_digit_vector;
  123. return s_digit_vector;
  124. }
  125. const ascii_bitvector & get_word_vector(void)
  126. {
  127. // a-zA-Z_0-9
  128. class word_vector : public ascii_bitvector
  129. {
  130. public:
  131. word_vector()
  132. {
  133. unsigned char ich;
  134. for( ich = 'a'; ich <= 'z'; ++ich )
  135. set(ich);
  136. for( ich = 'A'; ich <= 'Z'; ++ich )
  137. set(ich);
  138. for( ich = '0'; ich <= '9'; ++ich )
  139. set(ich);
  140. set('_');
  141. }
  142. };
  143. static const word_vector s_word_vector;
  144. return s_word_vector;
  145. }
  146. const ascii_bitvector & get_space_vector(void)
  147. {
  148. // " \t\r\n\f"
  149. class space_vector : public ascii_bitvector
  150. {
  151. public:
  152. space_vector()
  153. {
  154. set(' ');
  155. set('\t');
  156. set('\r');
  157. set('\n');
  158. set('\f');
  159. }
  160. };
  161. static const space_vector s_space_vector;
  162. return s_space_vector;
  163. }
  164. //
  165. // Operator implementations
  166. //
  167. // Base type used so that all derived operators share typedefs.
  168. template< typename CI >
  169. struct op_t : public binary_function<match_param<CI>,CI,bool>
  170. {
  171. typedef CI const_iterator;
  172. typedef typename iterator_traits<CI>::value_type char_type;
  173. };
  174. // Evaluates the beginning-of-string condition
  175. template< typename CI >
  176. struct bos_t : public op_t<CI>
  177. {
  178. inline bool operator()( const match_param<CI> & param, CI iter ) const
  179. {
  180. return param.ibegin == iter;
  181. }
  182. };
  183. // Find the beginning of a line, either beginning of a string, or the character
  184. // immediately following a newline
  185. template< typename CI >
  186. struct bol_t : public bos_t<CI>
  187. {
  188. inline bool operator()( const match_param<CI> & param, CI iter ) const
  189. {
  190. return bos_t<CI>::operator()(param,iter) || char_type('\n') == *--iter;
  191. }
  192. };
  193. // Evaluates end-of-string condition for string's
  194. template< typename CI >
  195. struct eos_t : public op_t<CI>
  196. {
  197. inline bool operator()( const match_param<CI> & param, CI iter ) const
  198. {
  199. return param.istop == iter;
  200. }
  201. };
  202. // Evaluates end-of-string condidition for C-style string's when the length is unknown by
  203. // looking for the null-terminator.
  204. template< typename CI >
  205. struct eocs_t : public op_t<CI>
  206. {
  207. inline bool operator()( const match_param<CI> &, CI iter ) const
  208. {
  209. return char_type('\0') == *iter;
  210. }
  211. };
  212. // Evaluates end-of-line conditions, either the end of the string, or a
  213. // return or newline character.
  214. template< typename EOS >
  215. struct eol_t_t : public EOS
  216. {
  217. typedef typename EOS::const_iterator CI;
  218. typedef typename EOS::char_type char_type;
  219. inline bool operator()( const match_param<CI> & param, CI iter ) const
  220. {
  221. return EOS::operator()(param,iter) || char_type('\n') == *iter || char_type('\r') == *iter;
  222. }
  223. };
  224. template< typename CI > struct eol_t : public eol_t_t<eos_t<CI> > {};
  225. template< typename CI > struct eocl_t : public eol_t_t<eocs_t<CI> > {};
  226. // Evaluates perl's end-of-string conditions, either the end of the string, or a
  227. // newline character followed by end of string. (Only used by $ and /Z assertions)
  228. template< typename EOS >
  229. struct peos_t_t : public EOS
  230. {
  231. typedef typename EOS::const_iterator CI;
  232. typedef typename EOS::char_type char_type;
  233. inline bool operator()( const match_param<CI> & param, CI iter ) const
  234. {
  235. return EOS::operator()(param,iter) || ( ( char_type('\n') == *iter ) && EOS::operator()(param,++iter) );
  236. }
  237. };
  238. template< typename CI > struct peos_t : public peos_t_t<eos_t<CI> > {};
  239. template< typename CI > struct peocs_t : public peos_t_t<eocs_t<CI> > {};
  // Compares two characters, case-sensitive. Returns true when they differ.
  template< typename CH >
  struct ch_neq_t
  {
      typedef CH char_type;
      // The nested typedefs std::binary_function<CH, CH, bool> used to
      // contribute. They are spelled out here because that base class, like
      // the `register` keyword formerly on the parameters, was removed from
      // the language/library in C++17.
      typedef CH first_argument_type;
      typedef CH second_argument_type;
      typedef bool result_type;
      inline bool operator()( CH ch1, CH ch2 ) const
      {
          return ch1 != ch2;
      }
  };
  // Compares two characters, disregarding case. Returns true when their
  // upper-case forms differ.
  template< typename CH >
  struct ch_neq_nocase_t
  {
      typedef CH char_type;
      // The nested typedefs std::binary_function<CH, CH, bool> used to
      // contribute. They are spelled out here because that base class, like
      // the `register` keyword formerly on the parameters, was removed from
      // the language/library in C++17.
      typedef CH first_argument_type;
      typedef CH second_argument_type;
      typedef bool result_type;
      inline bool operator()( CH ch1, CH ch2 ) const
      {
          return to_upper(ch1) != to_upper(ch2);
      }
  };
  260. //
  261. // Helper functions for match and substitute
  262. //
  // Counts the elements that precede the first zero element reachable from
  // iter (the length of a null-terminated sequence).
  template< typename CI >
  size_t string_length( CI iter )
  {
      size_t cch = 0;
      for( ; 0 != *iter; ++iter )
          ++cch;
      return cch;
  }
  271. template< typename CI >
  272. backref_tag<CI> _do_match( const basic_rpattern_base<CI> & pat, match_param<CI> & param ) throw()
  273. {
  274. typedef typename iterator_traits<CI>::value_type char_type;
  275. bool floop = pat.loops();
  276. unsigned flags = pat.flags();
  277. width_type nwidth = pat.get_width();
  278. const sub_expr<CI> * pfirst = pat.get_first_subexpression();
  279. try
  280. {
  281. vector<backref_tag<CI> > rgbackrefs; // dummy backref vector
  282. if( NULL == param.prgbackrefs )
  283. param.prgbackrefs = & rgbackrefs;
  284. param.prgbackrefs->resize( pat._cgroups_total() );
  285. fill( param.prgbackrefs->begin(), param.prgbackrefs->end(), backref_tag<CI>() );
  286. // If a pattern is optimized for CSTRINGS, it can save a call
  287. // to calculate the length of the string.
  288. if( CI(0) == param.istop && ( ( RIGHTMOST & flags ) || ( 0 == ( CSTRINGS & flags ) ) ) )
  289. param.istop = param.istart + string_length( param.istart );
  290. if( CI(0) != param.istop )
  291. {
  292. // If the minimum width of the pattern exceeds the width of the
  293. // string, a succesful match is impossible
  294. if( nwidth.m_min <= (size_t)distance( param.istart, param.istop ) )
  295. {
  296. CI local_istop = param.istop;
  297. advance( local_istop, -int( nwidth.m_min ) );
  298. if( RIGHTMOST & flags )
  299. {
  300. // begin trying to match after the last character.
  301. // Continue to the beginning
  302. for( CI icur = local_istop; icur >= param.istart; --icur )
  303. if( pfirst->domatch( param, icur ) )
  304. break; // m_floop not used for rightmost matches
  305. }
  306. else
  307. {
  308. // begin trying to match before the first character.
  309. // Continue to the end
  310. for( CI icur = param.istart; icur <= local_istop; ++icur )
  311. if( pfirst->domatch( param, icur ) || ! floop )
  312. break;
  313. }
  314. }
  315. }
  316. else
  317. {
  318. // begin trying to match before the first character.
  319. // Continue to the end
  320. for( CI icur = param.istart; ; ++icur )
  321. {
  322. if( pfirst->domatch( param, icur ) || ! floop )
  323. break;
  324. if( char_type('\0') == *icur )
  325. break;
  326. }
  327. }
  328. }
  329. catch(...) // bad alloc, stack overflow?
  330. {
  331. fill( param.prgbackrefs->begin(), param.prgbackrefs->end(), backref_tag<CI>() );
  332. }
  333. // Shrink the backref vector to chop off information about the "invisible" groups
  334. param.prgbackrefs->resize( pat.cgroups() );
  335. return (*param.prgbackrefs)[0];
  336. }
  337. template< typename CI, typename CH, typename TR, typename AL >
  338. size_t _do_subst( basic_regexpr<CH,TR,AL> & str, const basic_rpattern_base<CI> & pat, size_t strpos, size_t strlen ) throw(bad_alloc)
  339. {
  340. typedef iterator_traits<CI>::value_type char_type;
  341. typedef list<subst_node>::const_iterator LCI;
  342. enum { UPPER = -1, NIL, LOWER } next = NIL, rest = NIL;
  343. bool first = true;
  344. size_t old_strpos = strpos;
  345. const list<subst_node> & subst_list = pat.get_subst_list();
  346. basic_string<CH,TR,AL>::iterator itstrlen = str.begin();
  347. advance( itstrlen, strpos + strlen );
  348. const basic_string<char_type> & subst = pat.get_subst();
  349. push_new_handler pnh( &my_new_handler );
  350. for( LCI isubst = subst_list.begin(); isubst != subst_list.end(); ++isubst )
  351. {
  352. size_t sublen;
  353. basic_string<CH,TR,AL>::const_iterator itsubpos1; // iter into str
  354. basic_string<CH,TR,AL>::const_iterator itsublen1;
  355. basic_string<char_type>::const_iterator itsubpos2; // iter into subst string
  356. basic_string<char_type>::const_iterator itsublen2;
  357. basic_string<CH,TR,AL>::iterator itstrpos = str.begin();
  358. advance( itstrpos, strpos );
  359. switch( isubst->stype )
  360. {
  361. case subst_node::SUBST_STRING:
  362. itsubpos2 = subst.begin();
  363. advance( itsubpos2, isubst->subst_string.rstart );
  364. itsublen2 = itsubpos2;
  365. advance( itsublen2, isubst->subst_string.rlength );
  366. first ? str.replace( itstrpos, itstrlen, itsubpos2, itsublen2 ) :
  367. str.insert( itstrpos, itsubpos2, itsublen2 );
  368. sublen = distance( itsubpos2, itsublen2 );
  369. break;
  370. case subst_node::SUBST_BACKREF:
  371. switch( isubst->subst_backref )
  372. {
  373. case subst_node::PREMATCH:
  374. itsubpos1 = str.backref_str().begin();
  375. itsublen1 = itsubpos1;
  376. advance( itsublen1, sublen = str.rstart() );
  377. break;
  378. case subst_node::POSTMATCH:
  379. itsubpos1 = str.backref_str().begin();
  380. advance( itsubpos1, str.rstart() + str.rlength() );
  381. itsublen1 = str.backref_str().end();
  382. break;
  383. default:
  384. itsubpos1 = str.backref_str().begin();
  385. advance( itsubpos1, str.rstart( isubst->subst_backref ) );
  386. itsublen1 = itsubpos1;
  387. advance( itsublen1, str.rlength( isubst->subst_backref ) );
  388. break;
  389. }
  390. first ? str.replace( itstrpos, itstrlen, itsubpos1, itsublen1 ) :
  391. str.insert( itstrpos, itsubpos1, itsublen1 );
  392. sublen = distance( itsubpos1, itsublen1 );
  393. break;
  394. case subst_node::SUBST_OP:
  395. switch( isubst->op )
  396. {
  397. case subst_node::UPPER_ON:
  398. rest = UPPER;
  399. break;
  400. case subst_node::UPPER_NEXT:
  401. next = UPPER;
  402. break;
  403. case subst_node::LOWER_ON:
  404. rest = LOWER;
  405. break;
  406. case subst_node::LOWER_NEXT:
  407. next = LOWER;
  408. break;
  409. case subst_node::ALL_OFF:
  410. rest = NIL;
  411. break;
  412. default:
  413. __assume(0);
  414. }
  415. continue; // jump to the next item in the list
  416. default:
  417. __assume(0);
  418. }
  419. first = false;
  420. // Are we upper- or lower-casing this string?
  421. if( rest )
  422. {
  423. basic_string<CH,TR,AL>::iterator istart = str.begin();
  424. advance( istart, strpos );
  425. basic_string<CH,TR,AL>::const_iterator istop = istart;
  426. advance( istop, sublen );
  427. switch( rest )
  428. {
  429. case UPPER:
  430. to_upper( istart, istop );
  431. break;
  432. case LOWER:
  433. to_lower( istart, istop );
  434. break;
  435. default:
  436. __assume(0);
  437. }
  438. }
  439. // Are we upper- or lower-casing the next character?
  440. if( next )
  441. {
  442. switch( next )
  443. {
  444. case UPPER:
  445. str[strpos] = to_upper(str[strpos]);
  446. break;
  447. case LOWER:
  448. str[strpos] = to_lower(str[strpos]);
  449. break;
  450. default:
  451. __assume(0);
  452. }
  453. next = NIL;
  454. }
  455. strpos += sublen;
  456. }
  457. // If *first* is still true, then we never called str.replace, and the substitution
  458. // string is empty. Erase the part of the string that the pattern matched.
  459. if( first )
  460. str.erase( strpos, strlen );
  461. // return length of the substitution
  462. return strpos - old_strpos;
  463. }
  464. //
  465. // Implementation of basic_regexpr
  466. //
  467. template< typename CH, typename TR, typename AL >
  468. size_t basic_regexpr<CH,TR,AL>::substitute(
  469. const basic_rpattern_base<basic_regexpr<CH,TR,AL>::const_iterator> & pat,
  470. size_type pos,
  471. size_type len ) throw(bad_alloc)
  472. {
  473. if( pat.flags() & CSTRINGS )
  474. {
  475. assert( ! "You can't use a pattern optimized for CSTRINGS with regexpr::substitute" );
  476. return 0;
  477. }
  478. backref_vector rgbackrefs; // dummy backref vector
  479. backref_vector * prgbackrefs = & rgbackrefs;
  480. const bool fsave_backrefs = ( pat.uses_backrefs() || !( pat.flags() & NOBACKREFS ) );
  481. if( fsave_backrefs )
  482. {
  483. prgbackrefs = & m_rgbackrefs;
  484. m_pbackref_str = & ( m_backref_str = *this );
  485. }
  486. else
  487. {
  488. m_backref_str.erase();
  489. m_pbackref_str = this;
  490. m_rgbackrefs.resize( 0 );
  491. }
  492. backref_type br;
  493. size_t csubst = 0;
  494. long stop_offset = ( len == npos ?
  495. m_pbackref_str->size() :
  496. min( pos + len, m_pbackref_str->size() ) );
  497. match_param<const_iterator> param( m_pbackref_str->begin(),
  498. m_pbackref_str->begin(),
  499. prgbackrefs );
  500. advance( param.istart, pos );
  501. advance( param.istop, stop_offset );
  502. param.ibegin = param.istart;
  503. if( GLOBAL & pat.flags() )
  504. {
  505. const bool fAll = ( ALLBACKREFS == ( ALLBACKREFS & pat.flags() ) );
  506. const bool fFirst = ( FIRSTBACKREFS == ( FIRSTBACKREFS & pat.flags() ) );
  507. backref_vector rgtempbackrefs; // temporary vector used if fsave_backrefs
  508. long pos_offset = 0; // keep track of how much the backref_str and
  509. // the current string are out of sync
  510. while( br = _do_match( pat, param ) )
  511. {
  512. ++csubst;
  513. size_type match_length = distance( br.first, br.second );
  514. pos = distance( m_pbackref_str->begin(), br.first );
  515. size_type subst_length = _do_subst( *this, pat, pos + pos_offset, match_length );
  516. if( fsave_backrefs )
  517. {
  518. pos += match_length;
  519. pos_offset += ( subst_length - match_length );
  520. // Handle specially the backref flags
  521. if( fFirst )
  522. rgtempbackrefs.push_back( br );
  523. else if( fAll )
  524. rgtempbackrefs.insert( rgtempbackrefs.end(),
  525. param.prgbackrefs->begin(),
  526. param.prgbackrefs->end() );
  527. else
  528. rgtempbackrefs.swap( *param.prgbackrefs );
  529. }
  530. else
  531. {
  532. pos += subst_length;
  533. stop_offset += ( subst_length - match_length );
  534. // we're not saving backref information, so we don't
  535. // need to do any special backref maintenance here
  536. }
  537. // prevent a pattern that matches 0 characters from matching
  538. // again at the same point in the string
  539. if( 0 == match_length )
  540. {
  541. if( br.first == param.istop ) // We're at the end, so we're done
  542. break;
  543. ++pos;
  544. }
  545. param.istart = m_pbackref_str->begin();
  546. advance( param.istart, pos ); // ineffecient for bidirectional iterators.
  547. param.istop = m_pbackref_str->begin();
  548. advance( param.istop, stop_offset ); // ineffecient for bidirectional iterators.
  549. }
  550. // If we did special backref handling, swap the backref vectors
  551. if( fsave_backrefs && ( !br || fFirst || fAll ) )
  552. param.prgbackrefs->swap( rgtempbackrefs );
  553. else if( ! (*param.prgbackrefs)[0] )
  554. param.prgbackrefs->clear();
  555. }
  556. else if( br = _do_match( pat, param ) )
  557. {
  558. ++csubst;
  559. _do_subst( *this, pat,
  560. distance( m_pbackref_str->begin(), br.first ),
  561. distance( br.first, br.second ) );
  562. }
  563. if( NOBACKREFS == ( pat.flags() & NOBACKREFS ) )
  564. param.prgbackrefs->clear();
  565. return csubst;
  566. }
  567. //
  568. // Helper functions called from both basic_regexpr match methods
  569. //
  570. template< typename EOS >
  571. backref_tag< typename EOS::const_iterator > _match_helper(
  572. const basic_rpattern_base<typename EOS::const_iterator> & pat,
  573. match_param<typename EOS::const_iterator> & param,
  574. EOS eos )
  575. {
  576. typedef typename EOS::const_iterator CI;
  577. if( GLOBAL & pat.flags() ) // do a global find
  578. {
  579. // The NOBACKREFS flag is ignored in the match method.
  580. const bool fAll = ( ALLBACKREFS == ( ALLBACKREFS & pat.flags() ) );
  581. const bool fFirst = ( FIRSTBACKREFS == ( FIRSTBACKREFS & pat.flags() ) );
  582. backref_tag<CI> br;
  583. vector<backref_tag<CI> > rgtempbackrefs;
  584. while( br = _do_match( pat, param ) )
  585. {
  586. // Handle specially the backref flags
  587. if( fFirst )
  588. rgtempbackrefs.push_back( br );
  589. else if( fAll )
  590. rgtempbackrefs.insert( rgtempbackrefs.end(),
  591. param.prgbackrefs->begin(),
  592. param.prgbackrefs->end() );
  593. else
  594. rgtempbackrefs.swap( *param.prgbackrefs );
  595. if( br.first == ( param.istart = br.second ) )
  596. {
  597. if( eos( param, param.istart ) )
  598. break;
  599. ++param.istart;
  600. }
  601. }
  602. // restore the backref vectors
  603. if( !br || fFirst || fAll )
  604. param.prgbackrefs->swap( rgtempbackrefs );
  605. else if( ! (*param.prgbackrefs)[0] )
  606. param.prgbackrefs->clear();
  607. return param.prgbackrefs->empty() ? backref_tag<CI>() : (*param.prgbackrefs)[0];
  608. }
  609. else
  610. return _do_match( pat, param );
  611. }
  612. template< typename CH, typename TR, typename AL >
  613. basic_regexpr<CH,TR,AL>::backref_type basic_regexpr<CH,TR,AL>::match(
  614. const basic_rpattern_base<const_iterator> & pat,
  615. size_type pos, size_type len ) const throw()
  616. {
  617. if( pat.flags() & CSTRINGS )
  618. {
  619. assert( ! "A pattern optimized for CSTRINGS can only be used with the static regexpr::match method" );
  620. return backref_type();
  621. }
  622. m_pbackref_str = this;
  623. m_backref_str.erase(); // free up unused memory
  624. const_iterator istart = begin();
  625. advance( istart, pos );
  626. const_iterator istop;
  627. if( len == npos || pos + len >= size() )
  628. istop = end();
  629. else
  630. advance( istop = begin(), pos + len );
  631. match_param<const_iterator> param( istart, istop, & m_rgbackrefs );
  632. return _match_helper<eos_t<const_iterator> >( pat, param, eos_t<const_iterator>() );
  633. }
  634. template< typename CH >
  635. backref_tag<const CH *> _static_match_helper(
  636. const CH * szstr,
  637. const basic_rpattern_base<const CH *> & pat,
  638. vector< backref_tag< const CH * > > * prgbackrefs ) throw()
  639. {
  640. vector< backref_tag< const CH * > > rgdummyvector;
  641. if( NULL == prgbackrefs )
  642. prgbackrefs = &rgdummyvector;
  643. match_param<const CH *> param( szstr, NULL, prgbackrefs );
  644. return _match_helper<eocs_t<const CH *> >( pat, param, eocs_t<const CH *>() );
  645. }
  646. //
  647. // Helper function called from both basic_regexpr::count methods
  648. //
  649. template< typename EOS >
  650. size_t _count_helper(
  651. const basic_rpattern_base<typename EOS::const_iterator> & pat,
  652. match_param<typename EOS::const_iterator> & param,
  653. EOS eos )
  654. {
  655. typedef typename EOS::const_iterator CI;
  656. size_t cmatches = 0;
  657. vector<backref_tag<CI> > rgbackrefs; // dummy backref vector
  658. backref_tag<CI> br;
  659. param.prgbackrefs = &rgbackrefs;
  660. while( br = _do_match( pat, param ) )
  661. {
  662. ++cmatches;
  663. if( br.first == ( param.istart = br.second ) )
  664. {
  665. if( eos( param, param.istart ) )
  666. break;
  667. ++param.istart;
  668. }
  669. }
  670. return cmatches;
  671. }
  672. template< typename CH, typename TR, typename AL >
  673. size_t basic_regexpr<CH,TR,AL>::count(
  674. const basic_rpattern_base<basic_regexpr<CH,TR,AL>::const_iterator> & pat,
  675. size_type pos,
  676. size_type len ) const throw()
  677. {
  678. if( pat.flags() & CSTRINGS )
  679. {
  680. assert( ! "A pattern optimized for CSTRINGS can only be used with the static regexpr::count method" );
  681. return backref_type();
  682. }
  683. m_pbackref_str = this;
  684. const_iterator istart = begin();
  685. advance( istart, pos );
  686. const_iterator istop;
  687. if( len == npos || pos + len >= size() )
  688. istop = end();
  689. else
  690. advance( istop = begin(), pos + len );
  691. match_param<const_iterator> param( istart, istop, NULL );
  692. return _count_helper<eos_t<const_iterator> >( pat, param, eos_t<const_iterator>() );
  693. }
  694. template< typename CH >
  695. size_t _static_count_helper(
  696. const CH * szstr,
  697. const basic_rpattern_base<const CH *> & pat ) throw()
  698. {
  699. match_param<const CH *> param( szstr, NULL, NULL );
  700. return _count_helper<eocs_t<const CH *> >( pat, param, eocs_t<const CH *>() );
  701. }
  702. // Base class for sub-expressions which are zero-width
  703. // (i.e., assertions eat no characters during matching)
  704. // Assertions cannot be quantified.
  705. template< typename CI >
  706. class assertion : public sub_expr<CI>
  707. {
  708. public:
  709. virtual ~assertion() {}
  710. virtual bool is_assertion() const throw() { return true; }
  711. protected:
  712. virtual width_type _width_this() throw() { return width_type(0,0); }
  713. };
  714. template< typename OP >
  715. class assert_op : public assertion<typename OP::const_iterator>
  716. {
  717. public:
  718. typedef OP op_type;
  719. typedef typename OP::const_iterator CI;
  720. virtual ~assert_op() {}
  721. protected:
  722. virtual bool _match_this( match_param<CI> & param, CI & icur ) const throw()
  723. {
  724. return m_op( param, icur );
  725. }
  726. op_type m_op;
  727. };
  728. template< typename CI >
  729. assertion<CI> * create_bos( unsigned /*flags*/ )
  730. {
  731. return new assert_op<bos_t<CI> >();
  732. }
  733. template< typename CI >
  734. assertion<CI> * create_eos( unsigned flags )
  735. {
  736. switch( CSTRINGS & flags )
  737. {
  738. case 0:
  739. return new assert_op<peos_t<CI> >();
  740. case CSTRINGS:
  741. return new assert_op<peocs_t<CI> >();
  742. default:
  743. __assume(0); // tells the compiler that this is unreachable
  744. }
  745. }
  746. template< typename CI >
  747. assertion<CI> * create_eoz( unsigned flags )
  748. {
  749. switch( CSTRINGS & flags )
  750. {
  751. case 0:
  752. return new assert_op<eos_t<CI> >();
  753. case CSTRINGS:
  754. return new assert_op<eocs_t<CI> >();
  755. default:
  756. __assume(0); // tells the compiler that this is unreachable
  757. }
  758. }
  759. template< typename CI >
  760. assertion<CI> * create_bol( unsigned flags )
  761. {
  762. switch( MULTILINE & flags )
  763. {
  764. case 0:
  765. return new assert_op<bos_t<CI> >();
  766. case MULTILINE:
  767. return new assert_op<bol_t<CI> >();
  768. default:
  769. __assume(0); // tells the compiler that this is unreachable
  770. }
  771. }
  772. template< typename CI >
  773. assertion<CI> * create_eol( unsigned flags )
  774. {
  775. switch( ( MULTILINE | CSTRINGS ) & flags )
  776. {
  777. case 0:
  778. return new assert_op<peos_t<CI> >();
  779. case MULTILINE:
  780. return new assert_op<eol_t<CI> >();
  781. case CSTRINGS:
  782. return new assert_op<peocs_t<CI> >();
  783. case MULTILINE | CSTRINGS:
  784. return new assert_op<eocl_t<CI> >();
  785. default:
  786. __assume(0); // tells the compiler that this is unreachable
  787. }
  788. }
  789. template< typename CI >
  790. class match_atom : public sub_expr<CI>
  791. {
  792. public:
  793. match_atom( const basic_string<sub_expr<CI>::char_type>::iterator istart,
  794. basic_string<sub_expr<CI>::char_type>::const_iterator istop )
  795. : m_istart( istart ), m_istop( istop ) {}
  796. virtual ~match_atom() {}
  797. const basic_string<sub_expr<CI>::char_type>::iterator m_istart;
  798. basic_string<sub_expr<CI>::char_type>::const_iterator m_istop;
  799. protected:
  800. virtual width_type _width_this() throw()
  801. {
  802. size_t width = distance( (basic_string<sub_expr<CI>::char_type>::const_iterator)m_istart, m_istop );
  803. return width_type( width, width );
  804. }
  805. };
  806. template< typename EOS >
  807. class match_atom_t : public match_atom<typename EOS::const_iterator>
  808. {
  809. public:
  810. typedef EOS eos_type;
  811. typedef typename EOS::const_iterator CI;
  812. match_atom_t( const basic_string<sub_expr<CI>::char_type>::iterator istart,
  813. basic_string<sub_expr<CI>::char_type>::const_iterator istop )
  814. : match_atom<CI>( istart, istop ) {}
  815. virtual ~match_atom_t() {}
  816. protected:
  817. virtual bool _match_this( match_param<CI> & param, CI & icur ) const throw()
  818. {
  819. CI icur_tmp = icur;
  820. basic_string<sub_expr<CI>::char_type>::const_iterator ithis = m_istart;
  821. for( ; ithis != m_istop; ++icur_tmp, ++ithis )
  822. {
  823. if( m_eos( param, icur_tmp ) || *ithis != *icur_tmp )
  824. return false;
  825. }
  826. icur = icur_tmp;
  827. return true;
  828. }
  829. eos_type m_eos;
  830. };
  831. template< typename EOS >
  832. class match_atom_nocase_t : public match_atom<typename EOS::const_iterator>
  833. {
  834. public:
  835. typedef EOS eos_type;
  836. typedef typename EOS::const_iterator CI;
  837. match_atom_nocase_t( const basic_string<sub_expr<CI>::char_type>::iterator istart,
  838. basic_string<sub_expr<CI>::char_type>::const_iterator istop )
  839. : match_atom<CI>( istart, istop ), m_strlower( (basic_string<sub_expr<CI>::char_type>::const_iterator)istart, istop )
  840. {
  841. // Store the uppercase version of the atom in [m_istart,m_istop).
  842. to_upper( m_istart, m_istop );
  843. // Store the lowercase version of the atom in m_strlower.
  844. to_lower( m_strlower.begin(), m_strlower.end() );
  845. }
  846. virtual ~match_atom_nocase_t() {}
  847. protected:
  848. virtual bool _match_this( match_param<CI> & param, CI & icur ) const throw()
  849. {
  850. CI icur_tmp = icur;
  851. basic_string<sub_expr<CI>::char_type>::const_iterator ithisu = m_istart; // uppercase
  852. basic_string<sub_expr<CI>::char_type>::const_iterator ithisl = m_strlower.begin(); // lowercase
  853. for( ; ithisu != m_istop; ++icur_tmp, ++ithisu, ++ithisl )
  854. {
  855. if( m_eos( param, icur_tmp ) || ( *ithisu != *icur_tmp && *ithisl != *icur_tmp ) )
  856. return false;
  857. }
  858. icur = icur_tmp;
  859. return true;
  860. }
  861. eos_type m_eos;
  862. basic_string<sub_expr<CI>::char_type> m_strlower;
  863. };
  864. template< typename CI >
  865. match_atom<CI> * create_atom(
  866. const basic_string<iterator_traits<CI>::value_type>::iterator istart,
  867. basic_string<iterator_traits<CI>::value_type>::const_iterator istop,
  868. unsigned flags )
  869. {
  870. switch( ( NOCASE | CSTRINGS ) & flags )
  871. {
  872. case 0:
  873. return new match_atom_t<eos_t<CI> >( istart, istop );
  874. case NOCASE:
  875. return new match_atom_nocase_t<eos_t<CI> >( istart, istop );
  876. case CSTRINGS:
  877. return new match_atom_t<eocs_t<CI> >( istart, istop );
  878. case NOCASE | CSTRINGS:
  879. return new match_atom_nocase_t<eocs_t<CI> >( istart, istop );
  880. default:
  881. __assume(0); // tells the compiler that this is unreachable
  882. }
  883. }
  884. template< typename CI >
  885. match_atom<CI> * create_atom(
  886. const basic_string<iterator_traits<CI>::value_type>::iterator istart,
  887. unsigned flags )
  888. {
  889. basic_string<iterator_traits<CI>::value_type>::const_iterator istop = istart;
  890. return create_atom<CI>( istart, ++istop, flags );
  891. }
  892. template< typename CI >
  893. class match_any : public sub_expr<CI>
  894. {
  895. public:
  896. virtual ~match_any() {}
  897. protected:
  898. virtual width_type _width_this() throw() { return width_type(1,1); }
  899. };
  900. template< typename EOS >
  901. class match_any_t : public match_any<typename EOS::const_iterator>
  902. {
  903. public:
  904. typedef EOS eos_type;
  905. typedef typename EOS::const_iterator CI;
  906. virtual ~match_any_t() {}
  907. protected:
  908. virtual bool _match_this( match_param<CI> & param, CI & icur ) const throw()
  909. {
  910. if( m_eos( param, icur ) )
  911. return false;
  912. ++icur;
  913. return true;
  914. }
  915. eos_type m_eos;
  916. };
  917. template< typename CI >
  918. match_any<CI> * create_any( unsigned flags )
  919. {
  920. switch( ( SINGLELINE | CSTRINGS ) & flags )
  921. {
  922. case 0:
  923. return new match_any_t<eol_t<CI> >();
  924. case SINGLELINE:
  925. return new match_any_t<eos_t<CI> >();
  926. case CSTRINGS:
  927. return new match_any_t<eocl_t<CI> >();
  928. case SINGLELINE | CSTRINGS:
  929. return new match_any_t<eocs_t<CI> >();
  930. default:
  931. __assume(0); // tells the compiler that this is unreachable
  932. }
  933. }
  // A range of wide characters stored as a (first,second) pair.
  typedef pair<wchar_t,wchar_t> range_type;

  // A range vector that is never populated. match_charset binds its
  // m_rgranges reference to this object when it is constructed without a
  // range vector of its own.
  const vector<range_type> g_rgranges;
  936. template< typename CI >
  937. class match_charset : public sub_expr<CI>
  938. {
  939. public:
  940. match_charset( bool fcomplement,
  941. const ascii_bitvector & bvect )
  942. : m_fcomplement( fcomplement ),
  943. m_rgascii( bvect ),
  944. m_rgranges( g_rgranges ),
  945. m_ncharflags(0) {}
  946. // Note that only the references are copied here -- they are not ref counted.
  947. // Beware of variable lifetime issues.
  948. match_charset( const match_charset<CI> & that )
  949. : m_fcomplement( that.m_fcomplement ),
  950. m_rgascii( that.m_rgascii ),
  951. m_rgranges( that.m_rgranges ),
  952. m_ncharflags( that.m_ncharflags ) {}
  953. virtual ~match_charset() {}
  954. const bool m_fcomplement;
  955. const ascii_bitvector & m_rgascii; // bitmap for chars in range 0-255
  956. const vector<range_type> & m_rgranges; // vector of included character ranges 256-65535
  957. wctype_t m_ncharflags; // Parameter to iswctype()
  958. // The case-sensitivity of a character set is "compiled" into the ascii_bitvector
  959. // but not into the range vector because it is too computationally expensive. Instead,
  960. // when doing a unicode case-insensitive match on the ranges vector, two lookups
  961. // must be performed -- one lowercase and one uppercase. By contrast, only one lookup
  962. // is needed for the ascii_bitvector.
  963. protected:
  964. match_charset( bool fcomplement,
  965. const ascii_bitvector & bvect,
  966. const vector<range_type> & rgranges )
  967. : m_fcomplement( fcomplement ),
  968. m_rgascii( bvect ),
  969. m_rgranges( rgranges ),
  970. m_ncharflags(0) {}
  971. // this method should never be called. match_charset is only a base class
  972. // for match_charset_t
  973. virtual bool _match_this( match_param<CI> &, CI & ) const throw()
  974. {
  975. assert(false);
  976. return true;
  977. }
  978. template< typename SY >
  979. match_charset<CI> * get_altern_charset( char_type ch, unsigned flags, SY /*sy*/ ) const throw()
  980. {
  981. return basic_rpattern<CI,SY>::s_charset_map.get( ch, flags );
  982. }
  983. virtual width_type _width_this() throw() { return width_type(1,1); }
  984. };
  985. // Used as a template parameter to find a unicode character in an array of ranges.
  986. class match_range : public unary_function<wchar_t,bool>
  987. {
  988. protected:
  989. const vector<range_type> & m_rgranges;
  990. // determines if one range is less then another.
  991. // used in binary search of range vector
  992. inline static bool _range_less( const range_type & rg1,
  993. const range_type & rg2 ) throw()
  994. {
  995. return rg1.second < rg2.first;
  996. }
  997. match_range( const vector<range_type> & rgranges )
  998. : m_rgranges( rgranges ) {}
  999. };
  1000. class match_range_with_case : public match_range
  1001. {
  1002. public:
  1003. match_range_with_case( const vector<range_type> & rgranges )
  1004. : match_range( rgranges ) {}
  1005. inline bool operator()( wchar_t ch ) const throw()
  1006. {
  1007. return binary_search( m_rgranges.begin(), m_rgranges.end(),
  1008. range_type(ch,ch), _range_less );
  1009. }
  1010. };
  1011. class match_range_no_case : public match_range
  1012. {
  1013. public:
  1014. match_range_no_case( const vector<range_type> & rgranges )
  1015. : match_range( rgranges ) {}
  1016. inline bool operator()( wchar_t ch ) const throw()
  1017. {
  1018. const wchar_t chup = towupper( ch );
  1019. if( binary_search( m_rgranges.begin(), m_rgranges.end(),
  1020. range_type(chup,chup), _range_less ) )
  1021. return true;
  1022. const wchar_t chlo = towlower( ch );
  1023. if( chup != chlo &&
  1024. binary_search( m_rgranges.begin(), m_rgranges.end(),
  1025. range_type(chlo,chlo), _range_less ) )
  1026. return true;
  1027. return false;
  1028. }
  1029. };
  1030. template< typename EOS, typename RGM >
  1031. class match_charset_t : public match_charset<typename EOS::const_iterator>
  1032. {
  1033. public:
  1034. typedef EOS eos_type;
  1035. typedef RGM range_match_type;
  1036. typedef typename EOS::const_iterator CI;
  1037. match_charset_t( const match_charset<CI> & that )
  1038. : match_charset<CI>( that ), m_rgm( m_rgranges ) {}
  1039. virtual ~match_charset_t() {}
  1040. inline bool is_in_charset( char_type ch ) const throw()
  1041. {
  1042. return m_fcomplement != _is_in_charset( ch );
  1043. }
  1044. protected:
  1045. match_charset_t( bool fcomplement,
  1046. const ascii_bitvector & bvect,
  1047. const vector<range_type> & rgranges )
  1048. : match_charset<CI>( fcomplement, bvect, rgranges ), m_rgm( m_rgranges ) {}
  1049. // Note overloading based on parameter
  1050. inline bool _is_in_charset( char ch ) const throw()
  1051. {
  1052. return ( m_rgascii[ (unsigned char)(ch) ] ) ||
  1053. ( m_ncharflags && ( _pctype[(unsigned char)(ch)] & m_ncharflags ) );
  1054. }
  1055. // Note overloading based on parameter
  1056. inline bool _is_in_charset( wchar_t ch ) const throw()
  1057. {
  1058. if( UCHAR_MAX >= ch )
  1059. return _is_in_charset( char(ch) );
  1060. // use range_match_type to see if this character is within one of the
  1061. // ranges stored in m_rgranges.
  1062. return ( ! m_rgranges.empty() && m_rgm( ch ) ) ||
  1063. ( m_ncharflags && iswctype( ch, m_ncharflags ) );
  1064. }
  1065. virtual bool _match_this( match_param<CI> & param, CI & icur ) const throw()
  1066. {
  1067. if( m_eos( param, icur ) || ! is_in_charset( *icur ) )
  1068. return false;
  1069. ++icur;
  1070. return true;
  1071. }
  1072. // range_match_type encapsulates the case-sensitivity
  1073. // issues with doing a unicode lookup on the ranges vector.
  1074. range_match_type m_rgm;
  1075. eos_type m_eos;
  1076. };
  1077. template< typename EOS, typename RGM >
  1078. class match_custom_charset_t : public match_charset_t<EOS,RGM>
  1079. {
  1080. public:
  1081. template< typename SY >
  1082. match_custom_charset_t( bool fcomplement,
  1083. basic_string<char_type>::iterator & icur,
  1084. basic_string<char_type>::const_iterator istop,
  1085. unsigned flags, SY /*sy*/ ) throw(bad_regexpr,bad_alloc)
  1086. : match_charset_t<EOS,RGM>( fcomplement, m_rgasciicustom, m_rgrangescustom )
  1087. {
  1088. _parse_charset( icur, istop, flags, SY() );
  1089. _optimize();
  1090. }
  1091. virtual ~match_custom_charset_t() {}
  1092. // for including one character set in another
  1093. match_custom_charset_t<EOS,RGM> & operator|=( const match_charset<CI> & that )
  1094. {
  1095. assert( 0 == that.m_ncharflags );
  1096. if( that.m_fcomplement )
  1097. {
  1098. m_rgasciicustom |= ~ that.m_rgascii;
  1099. // append the inverse of that.m_rgranges to this->m_rgrangescustom
  1100. wchar_t chlow = UCHAR_MAX;
  1101. typedef vector<range_type>::const_iterator VCI;
  1102. for( VCI prg = that.m_rgranges.begin(); prg != that.m_rgranges.end(); ++prg )
  1103. {
  1104. if( UCHAR_MAX + 1 != prg->first )
  1105. m_rgrangescustom.push_back( range_type( wchar_t(chlow+1), wchar_t(prg->first-1) ) );
  1106. chlow = prg->second;
  1107. }
  1108. if( WCHAR_MAX != chlow )
  1109. m_rgrangescustom.push_back( range_type( wchar_t(chlow+1), WCHAR_MAX ) );
  1110. }
  1111. else
  1112. {
  1113. m_rgasciicustom |= that.m_rgascii;
  1114. m_rgrangescustom.insert( m_rgrangescustom.end(),
  1115. that.m_rgranges.begin(),
  1116. that.m_rgranges.end() );
  1117. }
  1118. return *this;
  1119. }
  1120. protected:
  1121. // If we reached the end of the string before finding the end of the
  1122. // character set, then this is an ill-formed regex
  1123. void _check_iter( basic_string<char_type>::const_iterator icur,
  1124. basic_string<char_type>::const_iterator istop ) throw(bad_regexpr)
  1125. {
  1126. if( icur == istop )
  1127. throw bad_regexpr("expecting end of character set");
  1128. }
  1129. template< typename SY >
  1130. void _parse_charset( basic_string<char_type>::iterator & icur,
  1131. basic_string<char_type>::const_iterator istop,
  1132. unsigned flags, SY /*sy*/ ) throw(bad_regexpr,bad_alloc)
  1133. {
  1134. TOKEN tok;
  1135. char_type ch_prev = 0;
  1136. match_charset<CI> * pcharset;
  1137. basic_string<char_type>::iterator iprev = icur;
  1138. const bool fnocase = ( NOCASE == ( NOCASE & flags ) );
  1139. _check_iter( icur, istop );
  1140. // remember the current position and grab the next token
  1141. tok = SY::charset_token( icur, istop );
  1142. do
  1143. {
  1144. _check_iter( icur, istop );
  1145. if( CHARSET_RANGE == tok && ch_prev )
  1146. {
  1147. // remember the current position
  1148. basic_string<char_type>::iterator iprev2 = icur;
  1149. char_type old_ch = ch_prev;
  1150. ch_prev = 0;
  1151. // old_ch is lower bound of a range
  1152. switch( SY::charset_token( icur, istop ) )
  1153. {
  1154. case CHARSET_RANGE:
  1155. case CHARSET_NEGATE:
  1156. icur = iprev2; // un-get these tokens and fall through
  1157. case NO_TOKEN:
  1158. case CHARSET_ESCAPE: // BUGBUG user-defined charset?
  1159. _set_bit_range( old_ch, *icur++, fnocase );
  1160. continue;
  1161. case CHARSET_BACKSPACE:
  1162. _set_bit_range( old_ch, char_type(8), fnocase ); // backspace
  1163. continue;
  1164. case CHARSET_END: // fall through
  1165. default: // not a range.
  1166. icur = iprev; // backup to range token
  1167. _set_bit( old_ch, fnocase );
  1168. _set_bit( *icur++, fnocase );
  1169. continue;
  1170. }
  1171. }
  1172. if( ch_prev )
  1173. _set_bit( ch_prev, fnocase );
  1174. ch_prev = 0;
  1175. switch( tok )
  1176. {
  1177. // None of the intrinsic charsets are case-sensitive,
  1178. // so no special handling must be done when the NOCASE
  1179. // flag is set.
  1180. case CHARSET_RANGE:
  1181. case CHARSET_NEGATE:
  1182. case CHARSET_END:
  1183. icur = iprev; // un-get these tokens
  1184. ch_prev = *icur++;
  1185. continue;
  1186. case CHARSET_BACKSPACE:
  1187. ch_prev = char_type(8); // backspace
  1188. continue;
  1189. case ESC_DIGIT:
  1190. *this |= match_charset<CI>( false, get_digit_vector() );
  1191. continue;
  1192. case ESC_NOT_DIGIT:
  1193. *this |= match_charset<CI>( true, get_digit_vector() );
  1194. continue;
  1195. case ESC_SPACE:
  1196. *this |= match_charset<CI>( false, get_space_vector() );
  1197. continue;
  1198. case ESC_NOT_SPACE:
  1199. *this |= match_charset<CI>( true, get_space_vector() );
  1200. continue;
  1201. case ESC_WORD:
  1202. *this |= match_charset<CI>( false, get_word_vector() );
  1203. continue;
  1204. case ESC_NOT_WORD:
  1205. *this |= match_charset<CI>( true, get_word_vector() );
  1206. continue;
  1207. case CHARSET_ALNUM:
  1208. m_ncharflags |= (_ALPHA|_DIGIT);
  1209. continue;
  1210. case CHARSET_ALPHA:
  1211. m_ncharflags |= (_ALPHA);
  1212. continue;
  1213. case CHARSET_BLANK:
  1214. m_ncharflags |= (_BLANK);
  1215. continue;
  1216. case CHARSET_CNTRL:
  1217. m_ncharflags |= (_CONTROL);
  1218. continue;
  1219. case CHARSET_DIGIT:
  1220. m_ncharflags |= (_DIGIT);
  1221. continue;
  1222. case CHARSET_GRAPH:
  1223. m_ncharflags |= (_PUNCT|_ALPHA|_DIGIT);
  1224. continue;
  1225. case CHARSET_LOWER:
  1226. m_ncharflags |= (_LOWER);
  1227. if( NOCASE == ( NOCASE & flags ) )
  1228. m_ncharflags |= (_UPPER);
  1229. continue;
  1230. case CHARSET_PRINT:
  1231. m_ncharflags |= (_BLANK|_PUNCT|_ALPHA|_DIGIT);
  1232. continue;
  1233. case CHARSET_PUNCT:
  1234. m_ncharflags |= (_PUNCT);
  1235. continue;
  1236. case CHARSET_SPACE:
  1237. m_ncharflags |= (_SPACE);
  1238. continue;
  1239. case CHARSET_UPPER:
  1240. m_ncharflags |= (_UPPER);
  1241. if( NOCASE == ( NOCASE & flags ) )
  1242. m_ncharflags |= (_LOWER);
  1243. continue;
  1244. case CHARSET_XDIGIT:
  1245. m_ncharflags |= (_HEX);
  1246. continue;
  1247. case CHARSET_ESCAPE:
  1248. // Maybe this is a user-defined intrinsic charset
  1249. pcharset = get_altern_charset( *icur, flags, SY() );
  1250. if( NULL != pcharset )
  1251. {
  1252. *this |= *pcharset;
  1253. ++icur;
  1254. continue;
  1255. }
  1256. // else fall through
  1257. default:
  1258. ch_prev = *icur++;
  1259. continue;
  1260. }
  1261. }
  1262. while( _check_iter( iprev = icur, istop ), CHARSET_END != ( tok = SY::charset_token( icur, istop ) ) );
  1263. if( ch_prev )
  1264. _set_bit( ch_prev, fnocase );
  1265. }
  1266. void _optimize()
  1267. {
  1268. // this sorts on range_type.first (uses operator<() for pair templates)
  1269. sort( m_rgrangescustom.begin(), m_rgrangescustom.end() );
  1270. // This merges ranges that overlap
  1271. for( size_t index = 1; index < m_rgrangescustom.size(); )
  1272. {
  1273. if( m_rgrangescustom[index].first <= m_rgrangescustom[index-1].second + 1 )
  1274. {
  1275. m_rgrangescustom[index-1].second = max(
  1276. m_rgrangescustom[index-1].second, m_rgrangescustom[index].second );
  1277. m_rgrangescustom.erase( m_rgrangescustom.begin() + index );
  1278. }
  1279. else
  1280. ++index;
  1281. }
  1282. }
  1283. // Note overloading based on second parameter
  1284. void _set_bit( char ch, const bool fnocase ) throw()
  1285. {
  1286. if( fnocase )
  1287. {
  1288. m_rgasciicustom.set( (unsigned char)(tolower(ch)) );
  1289. m_rgasciicustom.set( (unsigned char)(toupper(ch)) );
  1290. }
  1291. else
  1292. {
  1293. m_rgasciicustom.set( (unsigned char)(ch) );
  1294. }
  1295. }
  1296. // Note overloading based on second parameter
  1297. void _set_bit( wchar_t ch, const bool fnocase ) throw(bad_alloc)
  1298. {
  1299. if( UCHAR_MAX >= ch )
  1300. _set_bit( char(ch), fnocase );
  1301. else
  1302. m_rgrangescustom.push_back( range_type( ch, ch ) );
  1303. }
  1304. // Note overloading based on second parameter
  1305. void _set_bit_range( char ch1, char ch2, const bool fnocase ) throw(bad_regexpr)
  1306. {
  1307. if( (unsigned char)(ch1) > (unsigned char)(ch2) )
  1308. throw bad_regexpr("invalid range specified in character set");
  1309. if( fnocase )
  1310. {
  1311. // i is unsigned int to prevent overflow if ch2 is UCHAR_MAX
  1312. for( unsigned int i = (unsigned char)(ch1); i <= (unsigned char)(ch2); ++i )
  1313. {
  1314. m_rgasciicustom.set( (unsigned char)( toupper(i) ) );
  1315. m_rgasciicustom.set( (unsigned char)( tolower(i) ) );
  1316. }
  1317. }
  1318. else
  1319. {
  1320. // i is unsigned int to prevent overflow if ch2 is UCHAR_MAX
  1321. for( unsigned int i = (unsigned char)(ch1); i <= (unsigned char)(ch2); ++i )
  1322. m_rgasciicustom.set( (unsigned char)(i) );
  1323. }
  1324. }
  1325. // Note overloading based on second parameter
  1326. void _set_bit_range( wchar_t ch1, wchar_t ch2, const bool fnocase ) throw(bad_regexpr,bad_alloc)
  1327. {
  1328. if( ch1 > ch2 )
  1329. throw bad_regexpr("invalid range specified in character set");
  1330. if( UCHAR_MAX >= ch1 )
  1331. _set_bit_range( char(ch1), char( min(wchar_t(UCHAR_MAX),ch2) ), fnocase );
  1332. if( UCHAR_MAX < ch2 )
  1333. m_rgrangescustom.push_back( range_type( max(wchar_t(UCHAR_MAX+1),ch1), ch2 ) );
  1334. }
  1335. ascii_bitvector m_rgasciicustom;
  1336. vector<range_type> m_rgrangescustom;
  1337. };
  1338. template< typename CI >
  1339. match_charset<CI> * create_charset(
  1340. const match_charset<CI> & that,
  1341. unsigned flags )
  1342. {
  1343. switch( ( NOCASE | CSTRINGS ) & flags )
  1344. {
  1345. case 0:
  1346. return new match_charset_t<eos_t<CI>,match_range_with_case>( that );
  1347. case NOCASE:
  1348. return new match_charset_t<eos_t<CI>,match_range_no_case>( that );
  1349. case CSTRINGS:
  1350. return new match_charset_t<eocs_t<CI>,match_range_with_case>( that );
  1351. case NOCASE | CSTRINGS:
  1352. return new match_charset_t<eocs_t<CI>,match_range_no_case>( that );
  1353. default:
  1354. __assume(0); // tells the compiler that this is unreachable
  1355. }
  1356. }
  1357. template< typename EOS >
  1358. class word_assertion_t : public assertion<typename EOS::const_iterator>
  1359. {
  1360. public:
  1361. typedef EOS eos_type;
  1362. typedef typename EOS::const_iterator CI;
  1363. word_assertion_t()
  1364. : m_isword( match_charset<CI>( false, get_word_vector() ) ) {}
  1365. virtual ~word_assertion_t() {}
  1366. protected:
  1367. bos_t<CI> m_bos;
  1368. eos_type m_eos;
  1369. match_charset_t<eos_type,match_range_with_case> m_isword;
  1370. };
  1371. template< typename EOS >
  1372. class word_boundary_t : public word_assertion_t<EOS>
  1373. {
  1374. public:
  1375. word_boundary_t( const bool fisboundary )
  1376. : m_fisboundary( fisboundary ) {}
  1377. virtual ~word_boundary_t() {}
  1378. protected:
  1379. virtual bool _match_this( match_param<CI> & param, CI & icur ) const throw()
  1380. {
  1381. CI iprev = icur;
  1382. --iprev;
  1383. const bool fprevword = ! m_bos( param, icur ) && m_isword.is_in_charset( *iprev );
  1384. const bool fthisword = ! m_eos( param, icur ) && m_isword.is_in_charset( *icur );
  1385. return ( m_fisboundary == ( fprevword != fthisword ) );
  1386. }
  1387. const bool m_fisboundary;
  1388. };
  1389. template< typename EOS >
  1390. class word_start_t : public word_assertion_t<EOS>
  1391. {
  1392. public:
  1393. word_start_t() {}
  1394. virtual ~word_start_t() {}
  1395. protected:
  1396. virtual bool _match_this( match_param<CI> & param, CI & icur ) const throw()
  1397. {
  1398. CI iprev = icur;
  1399. --iprev;
  1400. const bool fprevword = ! m_bos( param, icur ) && m_isword.is_in_charset( *iprev );
  1401. const bool fthisword = ! m_eos( param, icur ) && m_isword.is_in_charset( *icur );
  1402. return ! fprevword && fthisword;
  1403. }
  1404. };
  1405. template< typename EOS >
  1406. class word_stop_t : public word_assertion_t<EOS>
  1407. {
  1408. public:
  1409. word_stop_t() {}
  1410. virtual ~word_stop_t() {}
  1411. protected:
  1412. virtual bool _match_this( match_param<CI> & param, CI & icur ) const throw()
  1413. {
  1414. CI iprev = icur;
  1415. --iprev;
  1416. const bool fprevword = ! m_bos( param, icur ) && m_isword.is_in_charset( *iprev );
  1417. const bool fthisword = ! m_eos( param, icur ) && m_isword.is_in_charset( *icur );
  1418. return fprevword && ! fthisword;
  1419. }
  1420. };
  1421. template< typename CI >
  1422. assertion<CI> * create_word_boundary( const bool fisboundary, unsigned flags )
  1423. {
  1424. switch( CSTRINGS & flags )
  1425. {
  1426. case 0:
  1427. return new word_boundary_t<eos_t<CI> >( fisboundary );
  1428. case CSTRINGS:
  1429. return new word_boundary_t<eocs_t<CI> >( fisboundary );
  1430. default:
  1431. __assume(0); // tells the compiler that this is unreachable
  1432. }
  1433. }
  1434. template< typename CI >
  1435. assertion<CI> * create_word_start( unsigned flags )
  1436. {
  1437. switch( CSTRINGS & flags )
  1438. {
  1439. case 0:
  1440. return new word_start_t<eos_t<CI> >();
  1441. case CSTRINGS:
  1442. return new word_start_t<eocs_t<CI> >();
  1443. default:
  1444. __assume(0); // tells the compiler that this is unreachable
  1445. }
  1446. }
  1447. template< typename CI >
  1448. assertion<CI> * create_word_stop( unsigned flags )
  1449. {
  1450. switch( CSTRINGS & flags )
  1451. {
  1452. case 0:
  1453. return new word_stop_t<eos_t<CI> >();
  1454. case CSTRINGS:
  1455. return new word_stop_t<eocs_t<CI> >();
  1456. default:
  1457. __assume(0); // tells the compiler that this is unreachable
  1458. }
  1459. }
  1460. template< typename CI > class group_quantifier;
  1461. template< typename CI >
  1462. class match_group : public sub_expr<CI>
  1463. {
  1464. public:
  1465. friend class group_quantifier<CI>;
  1466. match_group( size_t cgroup )
  1467. : m_rgalternates(), m_pptail(NULL), m_cgroup( cgroup ),
  1468. m_end_group( this ), m_nwidth(uninit_width) {}
  1469. virtual ~match_group() {}
  1470. virtual bool domatch( match_param<CI> & param, CI icur ) const throw()
  1471. {
  1472. CI old_istart = CI();
  1473. if( size_t(-1) != m_cgroup ) // could be -1 if this is a lookahead_assertion
  1474. {
  1475. old_istart = (*param.prgbackrefs)[ m_cgroup ].first;
  1476. (*param.prgbackrefs)[ m_cgroup ].first = icur;
  1477. }
  1478. typedef vector<sub_expr<CI>*>::const_iterator VCI;
  1479. for( VCI ialt = m_rgalternates.begin(); ialt != m_rgalternates.end(); ++ialt )
  1480. {
  1481. if( (*ialt)->domatch( param, icur ) )
  1482. return true;
  1483. }
  1484. if( size_t(-1) != m_cgroup )
  1485. (*param.prgbackrefs)[ m_cgroup ].first = old_istart;
  1486. return false;
  1487. }
  1488. virtual void _delete()
  1489. {
  1490. typedef vector<sub_expr<CI>*>::iterator VI;
  1491. for( VI ialt = m_rgalternates.begin(); ialt != m_rgalternates.end(); ++ialt )
  1492. delete_sub_expr( *ialt );
  1493. sub_expr<CI>::_delete();
  1494. }
  1495. size_t group_number() const
  1496. {
  1497. return m_cgroup;
  1498. }
  1499. void group_number( size_t cgroup )
  1500. {
  1501. m_cgroup = cgroup;
  1502. }
  1503. void add_item( sub_expr<CI> * pitem )
  1504. {
  1505. *m_pptail = pitem;
  1506. m_pptail = & pitem->next();
  1507. }
  1508. void add_alternate()
  1509. {
  1510. m_rgalternates.push_back( NULL );
  1511. m_pptail = & m_rgalternates.back();
  1512. }
  1513. void end_alternate()
  1514. {
  1515. *m_pptail = & m_end_group;
  1516. }
  1517. size_t calternates() const
  1518. {
  1519. return m_rgalternates.size();
  1520. }
  1521. width_type group_width()
  1522. {
  1523. (void) match_group<CI>::_width_this();
  1524. return m_nwidth;
  1525. }
  1526. protected:
  1527. virtual bool _call_back( match_param<CI> & param, CI icur ) const throw()
  1528. {
  1529. CI old_iend = CI();
  1530. if( size_t(-1) != m_cgroup )
  1531. {
  1532. old_iend = (*param.prgbackrefs)[ m_cgroup ].second;
  1533. (*param.prgbackrefs)[ m_cgroup ].second = icur;
  1534. }
  1535. if( match_next( param, icur ) )
  1536. return true;
  1537. if( size_t(-1) != m_cgroup )
  1538. (*param.prgbackrefs)[ m_cgroup ].second = old_iend;
  1539. return false;
  1540. }
  1541. virtual width_type _width_this() throw()
  1542. {
  1543. typedef vector<sub_expr<CI>*>::const_iterator VCI;
  1544. if( uninit_width == m_nwidth )
  1545. {
  1546. m_nwidth = width_type(size_t(-1),0);
  1547. for( VCI ialt = m_rgalternates.begin(); worst_width != m_nwidth && ialt != m_rgalternates.end(); ++ialt )
  1548. {
  1549. width_type temp_width = (*ialt)->get_width();
  1550. m_nwidth.m_min = min( m_nwidth.m_min, temp_width.m_min );
  1551. m_nwidth.m_max = max( m_nwidth.m_max, temp_width.m_max );
  1552. }
  1553. }
  1554. return m_nwidth;
  1555. }
  1556. class end_group;
  1557. friend class end_group;
  1558. class end_group : public sub_expr<CI>
  1559. {
  1560. void * operator new( size_t );
  1561. public:
  1562. end_group( match_group * pgroup )
  1563. : m_pgroup( pgroup ) {}
  1564. virtual ~end_group() {}
  1565. virtual void _delete() {} // don't delete this, because it was never alloc'ed
  1566. virtual bool domatch( match_param<CI> & param, CI icur ) const throw()
  1567. {
  1568. return m_pgroup->_call_back( param, icur );
  1569. }
  1570. protected:
  1571. // since m_pnext is always NULL for end_groups, get_width() stops recursing here
  1572. virtual width_type _width_this() throw()
  1573. {
  1574. return width_type(0,0);
  1575. }
  1576. match_group<CI> * m_pgroup;
  1577. };
  1578. vector<sub_expr<CI>*> m_rgalternates;
  1579. sub_expr<CI> ** m_pptail; // only used when adding elements
  1580. size_t m_cgroup;
  1581. end_group m_end_group;
  1582. width_type m_nwidth;
  1583. };
  1584. // Behaves like a lookahead assertion if m_cgroup is -1, or like
  1585. // an independent group otherwise.
  1586. template< typename CI >
  1587. class independent_group : public match_group<CI>
  1588. {
  1589. public:
  1590. independent_group()
  1591. : match_group<CI>( size_t(-1) ), m_fexpected(true) {}
  1592. virtual ~independent_group() {}
  1593. virtual bool domatch( match_param<CI> & param, CI icur ) const throw()
  1594. {
  1595. // Copy the entire backref vector onto the stack
  1596. backref_tag<CI> * prgbr = (backref_tag<CI>*)_alloca( param.prgbackrefs->size() * sizeof backref_tag<CI> );
  1597. copy( param.prgbackrefs->begin(), param.prgbackrefs->end(),
  1598. raw_storage_iterator<backref_tag<CI>*,backref_tag<CI> >(prgbr) );
  1599. // Match until the end of this group and then return
  1600. const bool fdomatch = match_group<CI>::domatch( param, icur );
  1601. if( m_fexpected == fdomatch )
  1602. {
  1603. // If m_cgroup != 1, then this is not a zero-width assertion.
  1604. if( size_t(-1) != m_cgroup )
  1605. icur = (*param.prgbackrefs)[ m_cgroup ].second;
  1606. if( match_next( param, icur ) )
  1607. return true;
  1608. }
  1609. // if match_group::domatch returned true, the backrefs must be restored
  1610. if( fdomatch )
  1611. copy( prgbr, prgbr + param.prgbackrefs->size(), param.prgbackrefs->begin() );
  1612. return false;
  1613. }
  1614. protected:
  1615. independent_group( const bool fexpected )
  1616. : match_group<CI>( size_t(-1) ), m_fexpected(fexpected) {}
  1617. virtual bool _call_back( match_param<CI> & param, CI icur ) const throw()
  1618. {
  1619. if( size_t(-1) != m_cgroup )
  1620. (*param.prgbackrefs)[ m_cgroup ].second = icur;
  1621. return true;
  1622. }
  1623. const bool m_fexpected;
  1624. };
  1625. template< typename CI >
  1626. class lookahead_assertion : public independent_group<CI>
  1627. {
  1628. public:
  1629. lookahead_assertion( const bool fexpected )
  1630. : independent_group<CI>( fexpected ) {}
  1631. virtual ~lookahead_assertion() {}
  1632. virtual bool is_assertion() const throw() { return true; }
  1633. protected:
  1634. virtual width_type _width_this() throw() { return width_type(0,0); }
  1635. };
  1636. template< typename CI >
  1637. class lookbehind_assertion : public independent_group<CI>
  1638. {
  1639. public:
  1640. lookbehind_assertion( const bool fexpected )
  1641. : independent_group<CI>( fexpected ) {}
  1642. virtual ~lookbehind_assertion() {}
  1643. virtual bool domatch( match_param<CI> & param, CI icur ) const throw()
  1644. {
  1645. // This is the room in the string from the start to the current position
  1646. size_t room = distance( param.ibegin, icur );
  1647. // If we don't have enough room to match the lookbehind, the match fails.
  1648. // If we wanted the match to fail, try to match the rest of the pattern.
  1649. if( m_nwidth.m_min > room )
  1650. return m_fexpected ? false : match_next( param, icur );
  1651. // Copy the entire backref vector onto the stack
  1652. backref_tag<CI> * prgbr = (backref_tag<CI>*)_alloca( param.prgbackrefs->size() * sizeof backref_tag<CI> );
  1653. copy( param.prgbackrefs->begin(), param.prgbackrefs->end(),
  1654. raw_storage_iterator<backref_tag<CI>*,backref_tag<CI> >(prgbr) );
  1655. CI local_istart = icur;
  1656. advance( local_istart, -int( min( m_nwidth.m_max, room ) ) );
  1657. CI local_istop = icur;
  1658. advance( local_istop, -int( m_nwidth.m_min ) );
  1659. // Create a local param struct that has icur as param.iend
  1660. match_param<CI> local_param(param.ibegin,param.istart,icur,param.prgbackrefs);
  1661. // Find the rightmost match that ends at icur.
  1662. for( CI local_icur = local_istart; local_icur <= local_istop; ++local_icur )
  1663. {
  1664. // Match until the end of this group and then return
  1665. const bool fmatched = match_group<CI>::domatch( local_param, local_icur );
  1666. // If the match results were what we were expecting, try to match the
  1667. // rest of the pattern. If that succeeds, return true.
  1668. if( m_fexpected == fmatched && match_next( param, icur ) )
  1669. return true;
  1670. // if match_group::domatch returned true, the backrefs must be restored
  1671. if( fmatched )
  1672. {
  1673. copy( prgbr, prgbr + param.prgbackrefs->size(), param.prgbackrefs->begin() );
  1674. // Match succeeded. If this is a negative lookbehind, we didn't want it
  1675. // to succeed, so return false.
  1676. if( ! m_fexpected )
  1677. return false;
  1678. }
  1679. }
  1680. // No variation of the lookbehind was satisfied in a way that permited
  1681. // the rest of the pattern to match successfully, so return false.
  1682. return false;
  1683. }
  1684. virtual bool is_assertion() const throw() { return true; }
  1685. protected:
  1686. virtual bool _call_back( match_param<CI> & param, CI icur ) const throw()
  1687. {
  1688. return param.istop == icur;
  1689. }
  1690. virtual width_type _width_this() throw() { return width_type(0,0); }
  1691. };
  1692. // Corresponds to the (?:foo) extension, which has grouping semantics, but
  1693. // does not store any backref information.
  1694. template< typename CI >
  1695. class group_nobackref : public match_group<CI>
  1696. {
  1697. public:
  1698. group_nobackref( )
  1699. : match_group( size_t(-1) ) {} // will be assigned a group number in basic_rpattern::basic_rpattern()
  1700. virtual ~group_nobackref() {}
  1701. };
  1702. template< typename CI >
  1703. class match_wrapper : public sub_expr<CI>
  1704. {
  1705. public:
  1706. match_wrapper( sub_expr<CI> * psub )
  1707. : m_psub(psub) {}
  1708. virtual ~match_wrapper() {}
  1709. virtual void _delete()
  1710. {
  1711. delete_sub_expr( m_psub );
  1712. sub_expr<CI>::_delete();
  1713. }
  1714. protected:
  1715. bool _wrapped_match_this( match_param<CI> & param, CI & icur ) const throw()
  1716. {
  1717. return m_psub->_match_this( param, icur );
  1718. }
  1719. virtual width_type _width_this() throw()
  1720. {
  1721. return m_psub->_width_this();
  1722. }
  1723. sub_expr<CI> * m_psub;
  1724. };
  1725. template< typename CI >
  1726. class match_quantifier : public match_wrapper<CI>
  1727. {
  1728. public:
  1729. match_quantifier( sub_expr<CI> * psub, size_t lbound, size_t ubound )
  1730. : match_wrapper<CI>( psub ), m_lbound(lbound), m_ubound(ubound) {}
  1731. virtual ~match_quantifier() {}
  1732. protected:
  1733. virtual width_type _width_this() throw()
  1734. {
  1735. width_type this_width = match_wrapper<CI>::_width_this();
  1736. return this_width * width_type( m_lbound, m_ubound );
  1737. }
  1738. const size_t m_lbound;
  1739. const size_t m_ubound;
  1740. };
  // Greedy quantifier applied to a non-group expression.
  // domatch() first takes as many repetitions as m_ubound allows, then
  // gives them back one at a time until the rest of the pattern matches or
  // the count would drop below m_lbound.
  1741. template< typename CI >
  1742. class max_atom_quantifier : public match_quantifier<CI>
  1743. {
  1744. public:
  1745. max_atom_quantifier( sub_expr<CI> * psub, size_t lbound, size_t ubound )
  1746. : match_quantifier<CI>( psub, lbound, ubound ) {}
  1747. virtual ~max_atom_quantifier() {}
  // Returns true if the quantified expression followed by the rest of the
  // pattern matches starting at icur.
  1748. virtual bool domatch( match_param<CI> & param, CI icur ) const throw()
  1749. {
  1750. size_t cmatches = 0;
  1751. int cdiff = 0; // must be a signed quantity for advance() below
  1752. if( cmatches < m_ubound )
  1753. {
  1754. CI istart = icur;
  1755. if( _wrapped_match_this( param, icur ) )
  1756. {
  1757. ++cmatches;
  // Remember how far the first repetition moved icur; the backtracking
  // loop below steps back by exactly this amount per repetition.
  1758. cdiff = distance( istart, icur );
  // A repetition that consumed nothing: do not loop on it, just try the
  // rest of the pattern.
  1759. if( 0 == cdiff )
  1760. return ( match_next( param, icur ) );
  // Take as many further repetitions as the upper bound allows.
  1761. while( cmatches < m_ubound &&
  1762. _wrapped_match_this( param, icur ) )
  1763. {
  1764. ++cmatches;
  1765. }
  1766. }
  1767. }
  1768. if( cmatches >= m_lbound )
  1769. {
  // Nothing follows this expression, so the match is complete.
  1770. if( ! next() )
  1771. return true;
  1772. for(;;)
  1773. {
  1774. if( next()->domatch( param, icur ) )
  1775. return true;
  // Give up once giving back another repetition would violate m_lbound.
  1776. if( cmatches-- <= m_lbound )
  1777. break;
  // Give back one repetition.  This uses the width of the first
  // repetition for every step.
  1778. advance( icur, -cdiff );
  1779. }
  1780. }
  1781. return false;
  1782. }
  1783. };
  // Non-greedy quantifier applied to a non-group expression.
  // domatch() takes the minimum number of repetitions, then alternates
  // between trying the rest of the pattern and taking one more repetition,
  // up to m_ubound.
  1784. template< typename CI >
  1785. class min_atom_quantifier : public match_quantifier<CI>
  1786. {
  1787. public:
  1788. min_atom_quantifier( sub_expr<CI> * psub, size_t lbound, size_t ubound )
  1789. : match_quantifier<CI>( psub, lbound, ubound ) {}
  1790. virtual ~min_atom_quantifier() {}
  // Returns true if the quantified expression followed by the rest of the
  // pattern matches starting at icur.
  1791. virtual bool domatch( match_param<CI> & param, CI icur ) const throw()
  1792. {
  1793. size_t cmatches = 0;
  1794. bool fsuccess = true;
  // Probe one repetition on a copy so icur is untouched if the probe is
  // not kept.
  1795. CI icur_tmp = icur;
  1796. if( _wrapped_match_this( param, icur_tmp ) )
  1797. {
  // A repetition that consumed nothing: do not loop on it, just try the
  // rest of the pattern.
  1798. if( icur_tmp == icur )
  1799. return ( match_next( param, icur ) );
  // The probe only counts as a repetition when at least one is required.
  1800. if( m_lbound )
  1801. {
  1802. icur = icur_tmp;
  1803. ++cmatches;
  1804. }
  // Take the remaining mandatory repetitions; fsuccess becomes false if
  // one of them fails.
  1805. while( ( cmatches < m_lbound ) &&
  1806. ( fsuccess = _wrapped_match_this( param, icur ) ) )
  1807. {
  1808. ++cmatches;
  1809. }
  1810. }
  1811. else
  1812. {
  // No repetition is possible here: acceptable only if none is required.
  1813. fsuccess = ! m_lbound;
  1814. }
  // When nothing follows this expression, the result is simply fsuccess.
  1815. if( fsuccess && next() )
  1816. {
  1817. do
  1818. {
  // Prefer leaving the loop: try the rest of the pattern first.
  1819. if( next()->domatch( param, icur ) )
  1820. break;
  // Otherwise take one more repetition if the upper bound allows it.
  // The assignment is intentional: fsuccess ends up false when the
  // loop runs out of repetitions.
  1821. } while( fsuccess = ( cmatches++ < m_ubound &&
  1822. _wrapped_match_this( param, icur ) ) );
  1823. }
  1824. return fsuccess;
  1825. }
  1826. };
  // Base class for quantifiers applied to a group.
  // The repetition count is kept in the 'reserved' field of the group's
  // backref entry (see cmatches()).  The group's next() pointer is
  // redirected to an embedded end_quantifier node, which calls back into
  // _recurse() each time the group has matched once.
  1827. template< typename CI >
  1828. class group_quantifier : public match_quantifier<CI>
  1829. {
  1830. public:
  1831. group_quantifier( match_group<CI> * psub, size_t lbound, size_t ubound )
  1832. : match_quantifier<CI>( psub, lbound, ubound ),
  1833. m_group( *psub ), m_end_quantifier( this )
  1834. {
  // Whatever the group matches is followed by m_end_quantifier.
  1835. psub->next() = & m_end_quantifier;
  1836. }
  1837. virtual ~group_quantifier() {}
  // Entry point: reset the group's backref to an empty range at icur,
  // run the derived class's _recurse(), and restore the previous backref
  // if the match fails.
  1838. virtual bool domatch( match_param<CI> & param, CI icur ) const throw()
  1839. {
  1840. // group_number is only -1 for assertions, which can't be quantified
  1841. assert( size_t(-1) != group_number() );
  1842. backref_tag<CI> & br = (*param.prgbackrefs)[ group_number() ];
  1843. backref_tag<CI> old_backref = br;
  1844. br = backref_tag<CI>( icur, icur ); // sets cmatches (reserved) to 0
  1845. if( _recurse( param, icur ) )
  1846. return true;
  1847. br = old_backref;
  1848. return false;
  1849. }
  1850. protected:
  1851. class end_quantifier;
  1852. friend class end_quantifier;
  // Node placed after the quantified group.  It is a data member of
  // group_quantifier; operator new is declared private and _delete() is a
  // no-op.
  1853. class end_quantifier : public sub_expr<CI>
  1854. {
  1855. void * operator new( size_t );
  1856. public:
  1857. end_quantifier( group_quantifier<CI> * pquant )
  1858. : m_pquant( pquant ) {}
  1859. virtual ~end_quantifier() {}
  1860. virtual void _delete() {} // don't delete this, since it wasn't alloc'ed
  // Called after the group has matched once; hands control back to the
  // owning quantifier's _recurse().
  1861. virtual bool domatch( match_param<CI> & param, CI icur ) const throw()
  1862. {
  1863. // group_number is only -1 for assertions, which can't be quantified
  1864. assert( size_t(-1) != m_pquant->group_number() );
  1865. // handle special the case where a group matches 0 characters
  1866. backref_tag<CI> & br = (*param.prgbackrefs)[ m_pquant->group_number() ];
  1867. if( icur == br.first )
  1868. {
  // Pretend the upper bound has been reached for the duration of this
  // call, and put the real count back if the attempt fails.
  1869. size_t old_cmatches = br.reserved;
  1870. br.reserved = m_pquant->m_ubound;
  1871. if( m_pquant->_recurse( param, icur ) )
  1872. return true;
  1873. br.reserved = old_cmatches;
  1874. return false;
  1875. }
  1876. return m_pquant->_recurse( param, icur );
  1877. }
  1878. protected:
  1879. virtual width_type _width_this() throw() { return width_type(0,0); }
  1880. group_quantifier<CI> * m_pquant;
  1881. };
  // Number of the quantified group.
  1882. size_t group_number() const
  1883. {
  1884. return m_group.group_number();
  1885. }
  // Repetition counter for this quantifier, stored in the group's backref.
  1886. size_t & cmatches( match_param<CI> & param ) const
  1887. {
  1888. return (*param.prgbackrefs)[ group_number() ].reserved;
  1889. }
  // Implemented by the greedy and non-greedy variants.
  1890. virtual bool _recurse( match_param<CI> & param, CI icur ) const throw() = 0;
  1891. match_group<CI> & m_group;
  1892. end_quantifier m_end_quantifier;
  1893. };
  1894. template< typename CI >
  1895. class max_group_quantifier : public group_quantifier<CI>
  1896. {
  1897. public:
  1898. max_group_quantifier( match_group<CI> * psub, size_t lbound, size_t ubound )
  1899. : group_quantifier<CI>( psub, lbound, ubound ) {}
  1900. virtual ~max_group_quantifier() {}
  1901. protected:
  1902. virtual bool _recurse( match_param<CI> & param, CI icur ) const throw()
  1903. {
  1904. if( m_ubound == cmatches( param ) )
  1905. return match_next( param, icur );
  1906. ++cmatches( param );
  1907. if( m_psub->domatch( param, icur ) )
  1908. return true;
  1909. if( --cmatches( param ) < m_lbound )
  1910. return false;
  1911. return match_next( param, icur );
  1912. }
  1913. };
  1914. template< typename CI >
  1915. class min_group_quantifier : public group_quantifier<CI>
  1916. {
  1917. public:
  1918. min_group_quantifier( match_group<CI> * psub, size_t lbound, size_t ubound )
  1919. : group_quantifier<CI>( psub, lbound, ubound ) {}
  1920. virtual ~min_group_quantifier() {}
  1921. protected:
  1922. virtual bool _recurse( match_param<CI> & param, CI icur ) const throw()
  1923. {
  1924. if( m_lbound > cmatches( param ) )
  1925. {
  1926. ++cmatches( param );
  1927. return m_psub->domatch( param, icur );
  1928. }
  1929. if( match_next( param, icur ) )
  1930. return true;
  1931. if( cmatches( param )++ == m_ubound )
  1932. return false;
  1933. return m_psub->domatch( param, icur );
  1934. }
  1935. };
  1936. template< typename CI >
  1937. class match_backref : public sub_expr<CI>
  1938. {
  1939. public:
  1940. match_backref( size_t cbackref, const width_type & group_width )
  1941. : m_cbackref( cbackref ), m_nwidth(group_width) {}
  1942. virtual ~match_backref() {}
  1943. protected:
  1944. // Return the width specifications of the group to which this backref refers
  1945. virtual width_type _width_this() throw() { return m_nwidth; }
  1946. const size_t m_cbackref;
  1947. const width_type m_nwidth;
  1948. };
  1949. template< typename CMP, typename EOS >
  1950. class match_backref_t : public match_backref<typename EOS::const_iterator>
  1951. {
  1952. public:
  1953. typedef CMP cmp_type;
  1954. typedef EOS eos_type;
  1955. typedef typename EOS::const_iterator CI;
  1956. match_backref_t( size_t cbackref, const width_type & group_width )
  1957. : match_backref<CI>( cbackref, group_width ) {}
  1958. virtual ~match_backref_t() {}
  1959. protected:
  1960. virtual bool _match_this( match_param<CI> & param, CI & icur ) const throw()
  1961. {
  1962. CI ithis = (*param.prgbackrefs)[ m_cbackref ].first;
  1963. CI istop = (*param.prgbackrefs)[ m_cbackref ].second;
  1964. CI icur_tmp = icur;
  1965. // Don't match a backref that hasn't match anything
  1966. if( ! (*param.prgbackrefs)[ m_cbackref ] )
  1967. return false;
  1968. for( ; ithis != istop; ++icur_tmp, ++ithis )
  1969. {
  1970. if( m_eos( param, icur_tmp ) || m_cmp( *icur_tmp, *ithis ) )
  1971. return false;
  1972. }
  1973. icur = icur_tmp;
  1974. return true;
  1975. }
  1976. cmp_type m_cmp;
  1977. eos_type m_eos;
  1978. };
  1979. template< typename CI >
  1980. match_backref<CI> * create_backref(
  1981. size_t cbackref,
  1982. const width_type & group_width,
  1983. unsigned flags )
  1984. {
  1985. typedef typename iterator_traits<CI>::value_type char_type;
  1986. switch( ( NOCASE | CSTRINGS ) & flags )
  1987. {
  1988. case 0:
  1989. return new match_backref_t<ch_neq_t<char_type>,eos_t<CI> >( cbackref, group_width );
  1990. case NOCASE:
  1991. return new match_backref_t<ch_neq_nocase_t<char_type>,eos_t<CI> >( cbackref, group_width );
  1992. case CSTRINGS:
  1993. return new match_backref_t<ch_neq_t<char_type>,eocs_t<CI> >( cbackref, group_width );
  1994. case NOCASE | CSTRINGS:
  1995. return new match_backref_t<ch_neq_nocase_t<char_type>,eocs_t<CI> >( cbackref, group_width );
  1996. default:
  1997. __assume(0); // tells the compiler that this is unreachable
  1998. }
  1999. }
  2000. // Replace some escape sequences with the actual characters
  2001. // they represent
  2002. template< typename CI >
  2003. void basic_rpattern_base<CI>::_normalize_string( basic_string<basic_rpattern_base<CI>::char_type> & str )
  2004. {
  2005. // Don't do pattern normalization if the user didn't ask for it.
  2006. if( NORMALIZE != ( NORMALIZE & m_flags ) )
  2007. return;
  2008. process_escapes( str );
  2009. }
  2010. //
  2011. // Implementation of basic_rpattern:
  2012. //
  2013. template< typename CI, typename SY >
  2014. basic_rpattern<CI,SY>::basic_rpattern() throw()
  2015. : basic_rpattern_base<CI>( 0 )
  2016. {
  2017. }
  2018. template< typename CI, typename SY >
  2019. basic_rpattern<CI,SY>::basic_rpattern(
  2020. const basic_string<basic_rpattern<CI,SY>::char_type> & pat,
  2021. unsigned flags ) throw(bad_regexpr,bad_alloc)
  2022. : basic_rpattern_base<CI>( flags, pat )
  2023. {
  2024. push_new_handler pnh( &my_new_handler );
  2025. _normalize_string( m_pat );
  2026. _common_init( flags );
  2027. }
  2028. template< typename CI, typename SY >
  2029. basic_rpattern<CI,SY>::basic_rpattern(
  2030. const basic_string<basic_rpattern<CI,SY>::char_type> & pat,
  2031. const basic_string<basic_rpattern<CI,SY>::char_type> & subst,
  2032. unsigned flags ) throw(bad_regexpr,bad_alloc)
  2033. : basic_rpattern_base<CI>( flags, pat, subst )
  2034. {
  2035. push_new_handler pnh( &my_new_handler );
  2036. _normalize_string( m_pat );
  2037. _common_init( flags );
  2038. _normalize_string( m_subst );
  2039. _parse_subst(); // must come after _common_init
  2040. }
  2041. template< typename CI, typename SY >
  2042. void basic_rpattern<CI,SY>::init(
  2043. const basic_string<basic_rpattern<CI,SY>::char_type> & pat,
  2044. unsigned flags ) throw(bad_regexpr,bad_alloc)
  2045. {
  2046. push_new_handler pnh( &my_new_handler );
  2047. _reset();
  2048. m_flags = flags;
  2049. m_pat = pat;
  2050. _normalize_string( m_pat );
  2051. _common_init( m_flags );
  2052. }
  2053. template< typename CI, typename SY >
  2054. void basic_rpattern<CI,SY>::init(
  2055. const basic_string<basic_rpattern<CI,SY>::char_type> & pat,
  2056. const basic_string<basic_rpattern<CI,SY>::char_type> & subst,
  2057. unsigned flags ) throw(bad_regexpr,bad_alloc)
  2058. {
  2059. push_new_handler pnh( &my_new_handler );
  2060. _reset();
  2061. m_flags = flags;
  2062. m_pat = pat;
  2063. m_subst = subst;
  2064. _normalize_string( m_pat );
  2065. _common_init( m_flags );
  2066. _normalize_string( m_subst );
  2067. _parse_subst(); // must come after _common_init
  2068. }
  2069. template< typename CI, typename SY >
  2070. void basic_rpattern<CI,SY>::_common_init( unsigned flags )
  2071. {
  2072. m_cgroups = 0;
  2073. vector<match_group<CI>*> rggroups;
  2074. basic_string<char_type>::iterator ipat = m_pat.begin();
  2075. match_group<CI> * pgroup = _find_next_group( ipat, flags, rggroups );
  2076. m_pfirst = pgroup;
  2077. m_nwidth = pgroup->group_width();
  2078. // Number the invisible groups
  2079. m_cgroups_visible = m_cgroups;
  2080. while( ! m_invisible_groups.empty() )
  2081. {
  2082. m_invisible_groups.front()->group_number( _get_next_group_nbr() );
  2083. m_invisible_groups.pop_front();
  2084. }
  2085. //
  2086. // determine if we can get away with only calling m_pfirst->domatch only once
  2087. //
  2088. m_floop = true;
  2089. // Optimization: if first character of pattern string is '^'
  2090. // and we are not doing a multiline match, then we only
  2091. // need to try domatch once
  2092. basic_string<char_type>::iterator icur = m_pat.begin();
  2093. if( MULTILINE != ( MULTILINE & m_flags ) &&
  2094. 1 == pgroup->calternates() &&
  2095. icur != m_pat.end() &&
  2096. BEGIN_LINE == SY::reg_token( icur, m_pat.end() ) )
  2097. {
  2098. m_flags &= ~RIGHTMOST;
  2099. m_floop = false;
  2100. }
  2101. // Optimization: if first 2 characters of pattern string are ".*" or ".+",
  2102. // then we only need to try domatch once
  2103. icur = m_pat.begin();
  2104. if( RIGHTMOST != ( RIGHTMOST & m_flags ) &&
  2105. SINGLELINE == ( SINGLELINE & m_flags ) &&
  2106. 1 == pgroup->calternates() &&
  2107. icur != m_pat.end() &&
  2108. MATCH_ANY == SY::reg_token( icur, m_pat.end() ) &&
  2109. icur != m_pat.end() )
  2110. {
  2111. switch( SY::quant_token( icur, m_pat.end() ) )
  2112. {
  2113. case ONE_OR_MORE:
  2114. case ZERO_OR_MORE:
  2115. case ONE_OR_MORE_MIN:
  2116. case ZERO_OR_MORE_MIN:
  2117. m_floop = false;
  2118. }
  2119. }
  2120. }
  2121. template< typename CI, typename SY >
  2122. void basic_rpattern<CI,SY>::_reset() throw()
  2123. {
  2124. basic_rpattern_base<CI>::_reset();
  2125. m_cgroups = m_cgroups_visible = 0;
  2126. m_floop = true;
  2127. m_subst.erase();
  2128. m_pat.erase();
  2129. m_pfirst.free_ptr();
  2130. m_nwidth = uninit_width;
  2131. m_subst_list.clear();
  2132. m_invisible_groups.clear();
  2133. }
  2134. template< typename CI, typename SY >
  2135. void basic_rpattern<CI,SY>::set_flags( unsigned flags ) throw(bad_regexpr,bad_alloc)
  2136. {
  2137. push_new_handler pnh( &my_new_handler );
  2138. m_pfirst.free_ptr();
  2139. m_flags = flags;
  2140. _common_init( m_flags );
  2141. }
  2142. template< typename CI, typename SY >
  2143. void basic_rpattern<CI,SY>::set_substitution( const basic_string<basic_rpattern<CI,SY>::char_type> & subst ) throw(bad_regexpr,bad_alloc)
  2144. {
  2145. push_new_handler pnh( &my_new_handler );
  2146. m_subst_list.clear();
  2147. m_subst = subst;
  2148. _normalize_string( m_subst );
  2149. _parse_subst();
  2150. }
  2151. template< typename CI, typename SY >
  2152. match_group<CI> * basic_rpattern<CI,SY>::_find_next_group(
  2153. basic_string<basic_rpattern<CI,SY>::char_type>::iterator & ipat,
  2154. unsigned & flags,
  2155. vector<match_group<CI>*> & rggroups )
  2156. {
  2157. auto_sub_ptr<match_group<CI> > pgroup;
  2158. basic_string<char_type>::iterator itemp = ipat;
  2159. unsigned old_flags = flags;
  2160. TOKEN tok;
  2161. // Look for group extensions. (This could change the value of the flags variable.)
  2162. if( ipat != m_pat.end() && NO_TOKEN != ( tok = SY::ext_token( ipat, m_pat.end(), flags ) ) )
  2163. {
  2164. if( itemp == m_pat.begin() || ipat == m_pat.end() )
  2165. throw bad_regexpr("ill-formed regular expression");
  2166. // Don't process empty groups
  2167. if( END_GROUP != SY::reg_token( itemp = ipat, m_pat.end() ) )
  2168. {
  2169. switch( tok )
  2170. {
  2171. case EXT_NOBACKREF:
  2172. // invisible groups are numbered only after all
  2173. // visible groups have been numbererd
  2174. pgroup = new match_group<CI>( size_t(-1) );
  2175. m_invisible_groups.push_back( pgroup.get() );
  2176. break;
  2177. case EXT_INDEPENDENT:
  2178. pgroup = new independent_group<CI>();
  2179. m_invisible_groups.push_back( pgroup.get() );
  2180. break;
  2181. case EXT_POS_LOOKAHEAD:
  2182. pgroup = new lookahead_assertion<CI>( true );
  2183. break;
  2184. case EXT_NEG_LOOKAHEAD:
  2185. pgroup = new lookahead_assertion<CI>( false );
  2186. break;
  2187. case EXT_POS_LOOKBEHIND:
  2188. // For look-behind assertions, turn off the CSTRINGs optimization
  2189. flags &= ~CSTRINGS;
  2190. pgroup = new lookbehind_assertion<CI>( true );
  2191. break;
  2192. case EXT_NEG_LOOKBEHIND:
  2193. // For look-behind assertions, turn off the CSTRINGs optimization
  2194. flags &= ~CSTRINGS;
  2195. pgroup = new lookbehind_assertion<CI>( false );
  2196. break;
  2197. default:
  2198. throw bad_regexpr("bad extension sequence");
  2199. }
  2200. }
  2201. else
  2202. {
  2203. // Skip over the END_GROUP token
  2204. ipat = itemp;
  2205. }
  2206. }
  2207. else
  2208. {
  2209. pgroup = new match_group<CI>( _get_next_group_nbr() );
  2210. }
  2211. if( NULL != pgroup.get() )
  2212. {
  2213. pgroup->add_alternate();
  2214. while( _find_next( ipat, pgroup.get(), flags, rggroups ) );
  2215. pgroup->end_alternate();
  2216. // Add this group to the rggroups array
  2217. if( size_t(-1) != pgroup->group_number() )
  2218. {
  2219. if( pgroup->group_number() >= rggroups.size() )
  2220. rggroups.resize( pgroup->group_number() + 1, NULL );
  2221. rggroups[ pgroup->group_number() ] = pgroup.get();
  2222. }
  2223. // The group should calculate its own width now and
  2224. // save the result for later.
  2225. pgroup->group_width();
  2226. // If this is not a pattern modifier, restore the
  2227. // flags to their previous settings. This causes
  2228. // pattern modifiers to have the scope of their
  2229. // enclosing group.
  2230. flags = old_flags;
  2231. }
  2232. return pgroup.release();
  2233. }
  2234. //
  2235. // Read ahead through the pattern and treat sequential atoms
  2236. // as a single atom, making sure to handle quantification
  2237. // correctly. Warning: dense code ahead.
  2238. //
  //   ipat   - on entry, the first character of the run; on return, one
  //            past whatever was consumed
  //   pgroup - group that receives the atom(s)
  //   flags  - passed through to create_atom
  //
  // If a quantifier follows the run, the run is split: everything except
  // its last character becomes one atom, and the last character becomes a
  // separate atom which is handed to _quantify.
  // Throws bad_regexpr if a quantifier appears with nothing to quantify.
  2239. template< typename CI, typename SY >
  2240. void basic_rpattern<CI,SY>::_find_atom(
  2241. basic_string<basic_rpattern<CI,SY>::char_type>::iterator & ipat,
  2242. match_group<CI> * pgroup,
  2243. unsigned flags )
  2244. {
  2245. basic_string<char_type>::iterator itemp = ipat, istart = ipat;
  2246. do
  2247. {
  // Is there a quantifier token at the current position?
  2248. switch( SY::quant_token( itemp, m_pat.end() ) )
  2249. {
  2250. // if {,} can't be interpreted as quantifiers, treat them as regular chars
  2251. case BEGIN_RANGE:
  2252. if( istart != ipat ) // treat as a quantifier
  2253. goto quantify;
  // Deliberate fallthrough: a BEGIN_RANGE at the very start of the run
  // is handled like an ordinary character.
  2254. case NO_TOKEN:
  2255. case END_RANGE:
  2256. case END_RANGE_MIN:
  2257. case RANGE_SEPARATOR:
  2258. break;
  2259. default:
  2260. if( istart == ipat ) // must be able to quantify something.
  2261. throw bad_regexpr("quantifier not expected");
  // Step back to the last character of the run.  If anything precedes
  // it, that prefix is added as its own atom first.
  2262. quantify: if( istart != --ipat )
  2263. pgroup->add_item( create_atom<CI>( istart, ipat, flags ) );
  // The last character becomes a one-character atom, to which the
  // quantifier is applied.
  2264. auto_sub_ptr<sub_expr<CI> > pnew( create_atom<CI>( ipat++, flags ) );
  2265. _quantify( pnew, NULL, ipat );
  2266. pgroup->add_item( pnew.release() );
  2267. return;
  2268. }
  // Keep extending the run until the end of the pattern or until
  // reg_token reports a token at the next position.
  2269. } while( m_pat.end() != ++ipat && ! SY::reg_token( itemp = ipat, m_pat.end() ) );
  2270. assert( ipat != istart );
  // No quantifier was found: the whole run is one atom.
  2271. pgroup->add_item( create_atom<CI>( istart, ipat, flags ) );
  2272. }
  2273. template< typename CI, typename SY >
  2274. bool basic_rpattern<CI,SY>::_find_next(
  2275. basic_string<basic_rpattern<CI,SY>::char_type>::iterator & ipat,
  2276. match_group<CI> * pgroup,
  2277. unsigned & flags,
  2278. vector<match_group<CI>*> & rggroups )
  2279. {
  2280. match_group<CI> * pnew_group = NULL;
  2281. auto_sub_ptr<sub_expr<CI> > pnew;
  2282. basic_string<char_type>::iterator istart, itemp;
  2283. bool fdone;
  2284. if( ipat == m_pat.end() )
  2285. {
  2286. if( 0 != pgroup->group_number() )
  2287. throw bad_regexpr( "mismatched parenthesis" );
  2288. return false;
  2289. }
  2290. switch( SY::reg_token( ipat, m_pat.end() ) )
  2291. {
  2292. case NO_TOKEN: // not a token. Must be an atom
  2293. _find_atom( ipat, pgroup, flags );
  2294. return true;
  2295. case END_GROUP:
  2296. if( 0 == pgroup->group_number() )
  2297. throw bad_regexpr( "mismatched parenthesis" );
  2298. return false;
  2299. case ALTERNATION:
  2300. pgroup->end_alternate();
  2301. pgroup->add_alternate();
  2302. return true;
  2303. case BEGIN_GROUP:
  2304. // Find next group could return NULL if the group is really
  2305. // a pattern modifier, like: (?s-i)
  2306. pnew = pnew_group = _find_next_group( ipat, flags, rggroups );
  2307. break;
  2308. case BEGIN_LINE:
  2309. pnew = create_bol<CI>( flags );
  2310. break;
  2311. case END_LINE:
  2312. pnew = create_eol<CI>( flags );
  2313. break;
  2314. case BEGIN_CHARSET:
  2315. pnew = create_charset_helper<CI,SY>::create_charset_aux( m_pat, ipat, flags );
  2316. break;
  2317. case MATCH_ANY:
  2318. pnew = create_any<CI>( flags );
  2319. break;
  2320. case ESC_WORD_BOUNDARY:
  2321. pnew = create_word_boundary<CI>( true, flags );
  2322. break;
  2323. case ESC_NOT_WORD_BOUNDARY:
  2324. pnew = create_word_boundary<CI>( false, flags );
  2325. break;
  2326. case ESC_WORD_START:
  2327. pnew = create_word_start<CI>( flags );
  2328. break;
  2329. case ESC_WORD_STOP:
  2330. pnew = create_word_stop<CI>( flags );
  2331. break;
  2332. case ESC_DIGIT:
  2333. pnew = create_charset<CI>( match_charset<CI>( false,
  2334. get_digit_vector() ),
  2335. flags );
  2336. break;
  2337. case ESC_NOT_DIGIT:
  2338. pnew = create_charset<CI>( match_charset<CI>( true,
  2339. get_digit_vector() ),
  2340. flags );
  2341. break;
  2342. case ESC_WORD:
  2343. pnew = create_charset<CI>( match_charset<CI>( false,
  2344. get_word_vector() ),
  2345. flags );
  2346. break;
  2347. case ESC_NOT_WORD:
  2348. pnew = create_charset<CI>( match_charset<CI>( true,
  2349. get_word_vector() ),
  2350. flags );
  2351. break;
  2352. case ESC_SPACE:
  2353. pnew = create_charset<CI>( match_charset<CI>( false,
  2354. get_space_vector() ),
  2355. flags );
  2356. break;
  2357. case ESC_NOT_SPACE:
  2358. pnew = create_charset<CI>( match_charset<CI>( true,
  2359. get_space_vector() ),
  2360. flags );
  2361. break;
  2362. case ESC_BEGIN_STRING:
  2363. pnew = create_bos<CI>( flags );
  2364. break;
  2365. case ESC_END_STRING:
  2366. pnew = create_eos<CI>( flags );
  2367. break;
  2368. case ESC_END_STRING_z:
  2369. pnew = create_eoz<CI>( flags );
  2370. break;
  2371. case ESCAPE:
  2372. if( char_type('0') <= *ipat && char_type('9') >= *ipat )
  2373. {
  2374. // use _cgroups_total here since the invisible groups have not been numbered yet.
  2375. unsigned nbackref = parse_int( ipat, m_pat.end(), _cgroups_total() - 1 );// always at least 1 group
  2376. if( 0 == nbackref || rggroups.size() <= nbackref || NULL == rggroups[ nbackref ] )
  2377. throw bad_regexpr( "invalid backreference" );
  2378. pnew = create_backref<CI>( nbackref, rggroups[nbackref]->group_width(), flags );
  2379. }
  2380. else
  2381. {
  2382. // Is this a user-defined intrinsic character set?
  2383. match_charset<CI> * pcharset = s_charset_map.get( *ipat, flags );
  2384. if( NULL != pcharset )
  2385. pnew = create_charset<CI>( *pcharset, flags );
  2386. else
  2387. pnew = create_atom<CI>( ipat, flags );
  2388. ++ipat;
  2389. }
  2390. break;
  2391. // If quotemeta, loop until we find quotemeta off or end of string
  2392. case ESC_QUOTE_META_ON:
  2393. for( istart = itemp = ipat, fdone = false; !fdone && ipat != m_pat.end(); )
  2394. {
  2395. switch( SY::reg_token( ipat, m_pat.end() ) )
  2396. {
  2397. case ESC_QUOTE_META_OFF:
  2398. fdone = true;
  2399. break;
  2400. case NO_TOKEN:
  2401. ++ipat; // fallthrough
  2402. default:
  2403. itemp = ipat;
  2404. break;
  2405. }
  2406. }
  2407. if( itemp != istart )
  2408. pgroup->add_item( create_atom<CI>( istart, itemp, flags ) );
  2409. // skip the quantification code below
  2410. return true;
  2411. // Should never get here for valid patterns
  2412. case ESC_QUOTE_META_OFF:
  2413. throw bad_regexpr("quotemeta turned off, but was never turned on");
  2414. default:
  2415. assert( ! "Unhandled token type" );
  2416. break;
  2417. }
  2418. // If pnew is null, then the current subexpression is a no-op.
  2419. if( pnew.get() )
  2420. {
  2421. // Look for quantifiers
  2422. _quantify( pnew, pnew_group, ipat );
  2423. // Add the item to the group
  2424. pgroup->add_item( pnew.release() );
  2425. }
  2426. return true;
  2427. }
  // Look for a quantifier at ipat and, if there is one, wrap pnew in the
  // matching quantifier expression.
  //
  //   pnew       - the expression just parsed; replaced by the quantifier
  //                that wraps it when a quantifier is found
  //   pnew_group - the same expression as a match_group when it is a
  //                group, NULL otherwise; selects group vs. atom quantifier
  //   ipat       - current position in m_pat; advanced past the quantifier
  //                when one is consumed
  //
  // Nothing happens at the end of the pattern or when pnew is an assertion.
  // Throws bad_regexpr for malformed {m,n} ranges.
  2428. template< typename CI, typename SY >
  2429. void basic_rpattern<CI,SY>::_quantify(
  2430. auto_sub_ptr<sub_expr<CI> > & pnew,
  2431. match_group<CI> * pnew_group,
  2432. basic_string<basic_rpattern<CI,SY>::char_type>::iterator & ipat )
  2433. {
  2434. if( ipat != m_pat.end() && ! pnew->is_assertion() )
  2435. {
  // Work on a copy; ipat is only moved once a quantifier is accepted.
  2436. basic_string<char_type>::iterator itemp = ipat;
  2437. bool fmin = false;
  2438. // Since size_t is unsigned, -1 is really the largest size_t
  // lbound == (size_t)-1 doubles as "no quantifier found" (tested below);
  // ubound == (size_t)-1 is the default upper bound.
  2439. size_t lbound = (size_t)-1;
  2440. size_t ubound = (size_t)-1;
  2441. size_t ubound_tmp;
  // Each *_MIN case sets fmin and deliberately falls through to its
  // greedy counterpart to pick up the bounds.
  2442. switch( SY::quant_token( itemp, m_pat.end() ) )
  2443. {
  2444. case ZERO_OR_MORE_MIN:
  2445. fmin = true;
  2446. case ZERO_OR_MORE:
  2447. lbound = 0;
  2448. break;
  2449. case ONE_OR_MORE_MIN:
  2450. fmin = true;
  2451. case ONE_OR_MORE:
  2452. lbound = 1;
  2453. break;
  2454. case ZERO_OR_ONE_MIN:
  2455. fmin = true;
  2456. case ZERO_OR_ONE:
  2457. lbound = 0;
  2458. ubound = 1;
  2459. break;
  2460. case BEGIN_RANGE:
  // Explicit range: lower bound, then either the end of the range or a
  // separator followed by an optional upper bound.
  2461. lbound = parse_int( itemp, m_pat.end() );
  2462. if( itemp == m_pat.end() )
  2463. throw bad_regexpr( "expecting end of range" );
  2464. switch( SY::quant_token( itemp, m_pat.end() ) )
  2465. {
  2466. case END_RANGE_MIN:
  2467. fmin = true;
  2468. case END_RANGE:
  // No separator: exactly lbound repetitions.
  2469. ubound = lbound;
  2470. break;
  2471. case RANGE_SEPARATOR:
  // ipat temporarily marks the position after the separator, so that
  // "itemp != ipat" below tells whether parse_int consumed anything.
  // If it consumed nothing, ubound keeps its default.
  2472. ipat = itemp;
  2473. ubound_tmp = parse_int( itemp, m_pat.end() );
  2474. if( itemp != ipat )
  2475. ubound = ubound_tmp;
  2476. if( itemp == m_pat.end() )
  2477. throw bad_regexpr( "expecting end of range" );
  2478. switch( SY::quant_token( itemp, m_pat.end() ) )
  2479. {
  2480. case END_RANGE_MIN:
  2481. fmin = true;
  2482. case END_RANGE:
  2483. break;
  2484. default:
  2485. throw bad_regexpr( "expecting end of range" );
  2486. }
  2487. break;
  2488. default:
  2489. throw bad_regexpr( "ill-formed quantifier" );
  2490. }
  2491. if( ubound < lbound )
  2492. throw bad_regexpr( "ill-formed quantifier" );
  2493. break;
  2494. }
  // A quantifier was recognized.
  2495. if( (size_t)-1 != lbound )
  2496. {
  2497. auto_sub_ptr<match_quantifier<CI> > pquant;
  2498. // a group quantifier is less efficient than an atom quantifier
  2499. if( fmin )
  2500. {
  2501. if( pnew_group )
  2502. pquant = new min_group_quantifier<CI>( pnew_group,
  2503. lbound, ubound );
  2504. else
  2505. pquant = new min_atom_quantifier<CI>( pnew.get(),
  2506. lbound, ubound );
  2507. }
  2508. else
  2509. {
  2510. if( pnew_group )
  2511. pquant = new max_group_quantifier<CI>( pnew_group,
  2512. lbound, ubound );
  2513. else
  2514. pquant = new max_atom_quantifier<CI>( pnew.get(),
  2515. lbound, ubound );
  2516. }
  // The quantifier now holds the wrapped expression, so pnew gives it up
  // and takes the quantifier instead.
  2517. pnew.release();
  2518. pnew = pquant.release();
  // Commit: move past the quantifier.
  2519. ipat = itemp;
  2520. }
  2521. }
  2522. }
  2523. template< typename CI, typename SY >
  2524. void basic_rpattern<CI,SY>::_add_subst_backref( subst_node & snode, size_t nbackref, size_t rstart )
  2525. {
  2526. m_fuses_backrefs = true;
  2527. assert( subst_node::SUBST_STRING == snode.stype );
  2528. if( snode.subst_string.rlength )
  2529. m_subst_list.push_back( snode );
  2530. snode.stype = subst_node::SUBST_BACKREF;
  2531. snode.subst_backref = nbackref;
  2532. m_subst_list.push_back( snode );
  2533. // re-initialize the subst_node
  2534. snode.stype = subst_node::SUBST_STRING;
  2535. snode.subst_string.rstart = rstart;
  2536. snode.subst_string.rlength = 0;
  2537. }
  // Parse m_subst into m_subst_list, a sequence of literal-string nodes
  // (recorded as offset + length into m_subst), backreference nodes and
  // operation nodes.  Also sets m_fuses_backrefs: it is cleared here and
  // set by _add_subst_backref.
  // Throws bad_regexpr for a backreference numbered 0 or an escape at the
  // very end of the substitution string.
  2538. template< typename CI, typename SY >
  2539. void basic_rpattern<CI,SY>::_parse_subst()
  2540. {
  2541. TOKEN tok;
  // snode accumulates the current run of literal text; it is pushed onto
  // the list whenever something else interrupts the run.
  2542. subst_node snode;
  2543. basic_string<char_type>::iterator icur = m_subst.begin();
  2544. size_t nbackref;
  2545. basic_string<char_type>::iterator itemp;
  2546. bool fdone;
  2547. m_fuses_backrefs = false;
  2548. // Initialize the subst_node
  2549. snode.stype = subst_node::SUBST_STRING;
  2550. snode.subst_string.rstart = 0;
  2551. snode.subst_string.rlength = 0;
  2552. while( icur != m_subst.end() )
  2553. {
  2554. switch( tok = SY::subst_token( icur, m_subst.end() ) )
  2555. {
  // The match, pre-match and post-match tokens are recorded as backref
  // nodes: number 0 for the match, and the subst_node::PREMATCH /
  // POSTMATCH values for the other two.
  2556. case SUBST_MATCH:
  2557. _add_subst_backref( snode, 0, distance( m_subst.begin(), icur ) );
  2558. break;
  2559. case SUBST_PREMATCH:
  2560. _add_subst_backref( snode, (size_t)subst_node::PREMATCH, distance( m_subst.begin(), icur ) );
  2561. break;
  2562. case SUBST_POSTMATCH:
  2563. _add_subst_backref( snode, (size_t)subst_node::POSTMATCH, distance( m_subst.begin(), icur ) );
  2564. break;
  2565. case SUBST_BACKREF:
  2566. nbackref = parse_int( icur, m_subst.end(), cgroups() - 1 ); // always at least 1 group
  2567. if( 0 == nbackref )
  2568. throw bad_regexpr( "invalid backreference in substitution" );
  2569. _add_subst_backref( snode, nbackref, distance( m_subst.begin(), icur ) );
  2570. break;
  2571. case SUBST_QUOTE_META_ON:
  // Flush the pending literal run, then collect everything up to the
  // next SUBST_ALL_OFF (or the end of the string) as one literal run.
  2572. assert( subst_node::SUBST_STRING == snode.stype );
  2573. if( snode.subst_string.rlength )
  2574. m_subst_list.push_back( snode );
  2575. snode.subst_string.rstart = distance( m_subst.begin(), icur );
  // itemp trails icur and ends up just past the last quoted character.
  2576. for( itemp = icur, fdone = false; !fdone && icur != m_subst.end(); )
  2577. {
  2578. switch( tok = SY::subst_token( icur, m_subst.end() ) )
  2579. {
  2580. case SUBST_ALL_OFF:
  2581. fdone = true;
  2582. break;
  2583. case NO_TOKEN:
  2584. ++icur; // fall-through
  2585. default:
  2586. itemp = icur;
  2587. break;
  2588. }
  2589. }
  2590. snode.subst_string.rlength = distance( m_subst.begin(), itemp ) - snode.subst_string.rstart;
  2591. if( snode.subst_string.rlength )
  2592. m_subst_list.push_back( snode );
  // If the quoted run was ended by SUBST_ALL_OFF, record that operation
  // too.
  2593. if( tok == SUBST_ALL_OFF )
  2594. {
  2595. snode.stype = subst_node::SUBST_OP;
  2596. snode.op = subst_node::ALL_OFF;
  2597. m_subst_list.push_back( snode );
  2598. }
  2599. // re-initialize the subst_node
  2600. snode.stype = subst_node::SUBST_STRING;
  2601. snode.subst_string.rstart = distance( m_subst.begin(), icur );
  2602. snode.subst_string.rlength = 0;
  2603. break;
  2604. case SUBST_UPPER_ON:
  2605. case SUBST_UPPER_NEXT:
  2606. case SUBST_LOWER_ON:
  2607. case SUBST_LOWER_NEXT:
  2608. case SUBST_ALL_OFF:
  // Flush the pending literal run, then record the token as an
  // operation node; the token value is cast directly to op_type.
  2609. assert( subst_node::SUBST_STRING == snode.stype );
  2610. if( snode.subst_string.rlength )
  2611. m_subst_list.push_back( snode );
  2612. snode.stype = subst_node::SUBST_OP;
  2613. snode.op = (subst_node::op_type) tok;
  2614. m_subst_list.push_back( snode );
  2615. // re-initialize the subst_node
  2616. snode.stype = subst_node::SUBST_STRING;
  2617. snode.subst_string.rstart = distance( m_subst.begin(), icur );
  2618. snode.subst_string.rlength = 0;
  2619. break;
  2620. case SUBST_ESCAPE:
  2621. if( icur == m_subst.end() )
  2622. throw bad_regexpr("expecting escape sequence in substitution string");
  // Flush the pending literal run and start a new one at the character
  // that follows the escape, so that character is taken literally.
  2623. assert( subst_node::SUBST_STRING == snode.stype );
  2624. if( snode.subst_string.rlength )
  2625. m_subst_list.push_back( snode );
  2626. snode.subst_string.rstart = distance( m_subst.begin(), icur++ );
  2627. snode.subst_string.rlength = 1;
  2628. break;
  2629. case NO_TOKEN:
  2630. default:
  // Ordinary character: extend the current literal run.
  2631. ++snode.subst_string.rlength;
  2632. ++icur;
  2633. break;
  2634. }
  2635. }
  // Flush whatever literal text is left at the end.
  2636. assert( subst_node::SUBST_STRING == snode.stype );
  2637. if( snode.subst_string.rlength )
  2638. m_subst_list.push_back( snode );
  2639. }
  // Definition of the static member s_charset_map.  _find_next looks an
  // escaped character up in it (s_charset_map.get) to see whether it names
  // a user-defined intrinsic character set.
  2640. template< typename CI, typename SY >
  2641. basic_rpattern<CI,SY>::charset_map basic_rpattern<CI,SY>::s_charset_map;
  2642. // Pass in an interator to one after the opening bracket of the character set.
  2643. // On return, icur points to one character after the closing bracket
  2644. template< typename CI, typename SY >
  2645. sub_expr<CI> * create_charset_helper<CI,SY>::create_charset_aux(
  2646. basic_string<iterator_traits<CI>::value_type> & str,
  2647. basic_string<iterator_traits<CI>::value_type>::iterator & icur,
  2648. unsigned flags )
  2649. {
  2650. bool fcomplement = false;
  2651. match_charset<CI> * pnew = NULL;
  2652. basic_string<iterator_traits<CI>::value_type>::iterator itemp = icur;
  2653. if( itemp != str.end() && CHARSET_NEGATE == SY::charset_token( itemp, str.end() ) )
  2654. {
  2655. fcomplement = true;
  2656. icur = itemp;
  2657. }
  2658. switch( ( NOCASE | CSTRINGS ) & flags )
  2659. {
  2660. case 0:
  2661. pnew = new match_custom_charset_t<eos_t<CI>,match_range_with_case>( fcomplement, icur, str.end(), flags, SY() );
  2662. break;
  2663. case NOCASE:
  2664. pnew = new match_custom_charset_t<eos_t<CI>,match_range_no_case>( fcomplement, icur, str.end(), flags, SY() );
  2665. break;
  2666. case CSTRINGS:
  2667. pnew = new match_custom_charset_t<eocs_t<CI>,match_range_with_case>( fcomplement, icur, str.end(), flags, SY() );
  2668. break;
  2669. case NOCASE | CSTRINGS:
  2670. pnew = new match_custom_charset_t<eocs_t<CI>,match_range_no_case>( fcomplement, icur, str.end(), flags, SY() );
  2671. break;
  2672. default:
  2673. __assume(0); // tells the compiler that this is unreachable
  2674. }
  2675. return pnew;
  2676. }
  2677. #pragma warning( disable : 4660 )
  2678. // Explicit instantiation
  2679. #ifdef REGEX_FORCE_INSTANTIATION
  2680. template class basic_regexpr<char>;
  2681. template class basic_regexpr<wchar_t>;
  2682. #else
  2683. template class basic_regexpr<TCHAR>;
  2684. #endif
  2685. #ifndef NO_PERL_RE
  2686. #ifdef REGEX_FORCE_INSTANTIATION
  2687. template class basic_rpattern<const char *>;
  2688. template class basic_rpattern<const wchar_t *>;
  2689. template class basic_rpattern<string::const_iterator>;
  2690. template class basic_rpattern<wstring::const_iterator>;
  2691. #else
  2692. template class basic_rpattern<const TCHAR *>;
  2693. template class basic_rpattern<tstring::const_iterator>;
  2694. #endif
  2695. #endif
  2696. #ifdef POSIX_RE
  2697. #ifdef REGEX_FORCE_INSTANTIATION
  2698. template class basic_rpattern<const char *,posix_syntax<char> >;
  2699. template class basic_rpattern<const wchar_t *,posix_syntax<wchar_t> >;
  2700. template class basic_rpattern<string::const_iterator,posix_syntax<char> >;
  2701. template class basic_rpattern<wstring::const_iterator,posix_syntax<wchar_t> >;
  2702. #else
  2703. template class basic_rpattern<const TCHAR *,posix_syntax<TCHAR> >;
  2704. template class basic_rpattern<tstring::const_iterator,posix_syntax<TCHAR> >;
  2705. #endif
  2706. #endif
  2707. } // namespace regex