Source code of Windows XP (NT5)
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

3154 lines
98 KiB

  1. //+---------------------------------------------------------------------------
  2. //
  3. // File: basic_regexpr.cxx
  4. //
  5. // Contents:
  6. //
  7. // Classes:
  8. //
  9. // Functions:
  10. //
  11. // Coupling:
  12. //
  13. // Notes:
  14. //
  15. // History: 1-11-1999 ericne Created
  16. //
  17. //----------------------------------------------------------------------------
  18. #include "stdafx.h"
  19. #pragma hdrstop
  20. // unlimited inline expansion (compile with /Ob1 or /Ob2)
  21. #pragma inline_depth(255)
  22. // C4355 'this' : used in base member initializer list
  23. // C4660 template-class specialization 'foo<bar>' is already instantiated
  24. // C4786 identifier was truncated to '255' characters in the debug information
  25. // C4800 'int' : forcing value to bool 'true' or 'false' (performance warning)
  26. #pragma warning( disable : 4355 4660 4786 4800 )
  27. #include <assert.h>
  28. #include <malloc.h> // for _alloca
  29. #include <algorithm>
  30. #include <minmax.h>
  31. #include "regexpr.h"
  32. using namespace std;
  33. namespace regex
  34. {
  #ifdef _MT
  // Global critical section used to synchronize the creation of static const patterns.
  // It is only defined when _MT is defined.
  CRegExCritSect g_objRegExCritSect;
  #endif
  // Single-character case conversion, overloaded for narrow and wide
  // characters so that templated code can call to_upper/to_lower uniformly.
  // The narrow overloads route the argument through unsigned char: passing
  // toupper/tolower a negative value other than EOF is undefined behavior,
  // which a plain char holding a value above 0x7F would trigger wherever
  // char is signed.
  inline char to_upper( char ch ) { return ( char )toupper( ( unsigned char )ch ); }
  inline char to_lower( char ch ) { return ( char )tolower( ( unsigned char )ch ); }
  inline wint_t to_upper( wint_t ch ) { return (wint_t)towupper(ch); }
  inline wint_t to_lower( wint_t ch ) { return (wint_t)towlower(ch); }
  // Uppercases every element of [ibegin, iend) in place by applying the
  // single-character to_upper to each one. II is the writable iterator
  // type; it is converted to CI for the end-of-range comparison.
  template< typename II, typename CI >
  void to_upper( II ibegin, CI iend )
  {
      while( (CI)ibegin != iend )
      {
          *ibegin = to_upper( *ibegin );
          ++ibegin;
      }
  }
  // Lowercases every element of [ibegin, iend) in place by applying the
  // single-character to_lower to each one. II is the writable iterator
  // type; it is converted to CI for the end-of-range comparison.
  template< typename II, typename CI >
  void to_lower( II ibegin, CI iend )
  {
      while( (CI)ibegin != iend )
      {
          *ibegin = to_lower( *ibegin );
          ++ibegin;
      }
  }
  // Parses an unsigned decimal number from [istr, iend).
  //   istr  - in/out: first character to examine; on return it refers to
  //           the first character that was not consumed.
  //   iend  - end of the input range.
  //   m_max - largest value that may be returned (the default, -1
  //           converted to unsigned, is the largest unsigned value).
  // Parsing stops at iend, at the first non-digit, once the value has
  // reached m_max, or in front of a digit that would push the value past
  // m_max (that digit is left unconsumed). Returns 0 if nothing was consumed.
  template< typename II, typename CI >
  unsigned parse_int( II & istr, CI iend, const unsigned m_max = -1 )
  {
      unsigned retval = 0;
      while( (CI)istr != iend && '0' <= *istr && '9' >= *istr && m_max > retval )
      {
          const unsigned digit = (unsigned)*istr - (unsigned)'0';
          // Check the limit *before* accumulating. Accumulating first and
          // backing up afterwards lets retval * 10 + digit wrap around for
          // long digit strings, and backing up needs a bidirectional iterator.
          if( digit > m_max || retval > ( m_max - digit ) / 10 )
              break;
          retval = retval * 10 + digit;
          ++istr;
      }
      return retval;
  }
  // Fixed-size bit set holding one bit for every unsigned char value
  // (0..UCHAR_MAX). It is used to speed up character set matching.
  // std::bitset is not used because the range-checking slows it down.
  // Note: The division and modulus operations are optimized by the compiler
  // into bit-shift operations.
  class ascii_bitvector
  {
      // `unsigned int` is used instead of the MSVC-only spelling
      // `unsigned __int32`, which names the same type on MSVC.
      typedef unsigned int elem_type; // use 32-bit ints on 32-bit platforms
      //typedef unsigned __int64 elem_type; // use 64-bit ints on 64-bit platforms
      enum { CBELEM = 8 * sizeof( elem_type ),   // count of bits per element
             CELEMS = (UCHAR_MAX+1) / CBELEM };  // number of elements in array
      elem_type m_rg[ CELEMS ];
      // Proxy returned by operator~. It lets expressions such as
      // bv1 |= ~bv2; run without creating a temporary bit vector.
      struct not_ascii_bitvector
      {
          const ascii_bitvector & m_ref;
          not_ascii_bitvector( const ascii_bitvector & ref ) throw()
              : m_ref(ref) {}
      };
  public:
      // Starts with every bit cleared.
      ascii_bitvector() throw()
          { memset( m_rg, 0, CELEMS * sizeof( elem_type ) ); }
      // Sets the bit for ch.
      inline void set( unsigned char ch ) throw()
          { m_rg[ ( ch / CBELEM ) ] |= ( (elem_type)1U << ( ch % CBELEM ) ); }
      // Returns true if the bit for ch is set.
      inline bool operator[]( unsigned char ch ) const throw()
          { return 0 != ( m_rg[ ( ch / CBELEM ) ] & ( (elem_type)1U << ( ch % CBELEM ) ) ); }
      // Returns a proxy standing for the complement of this vector. The
      // proxy holds a reference to *this and must not outlive it.
      inline not_ascii_bitvector operator~() const throw()
          { return not_ascii_bitvector(*this); }
      // Merges in every bit that is set in `that`.
      inline ascii_bitvector & operator|=( const ascii_bitvector & that ) throw()
      {
          for( int i=0; i<CELEMS; ++i )
              m_rg[i] |= that.m_rg[i];
          return *this;
      }
      // Merges in every bit that is clear in the vector behind the proxy.
      inline ascii_bitvector & operator|=( const not_ascii_bitvector & that ) throw()
      {
          for( int i=0; i<CELEMS; ++i )
              m_rg[i] |= ~that.m_ref.m_rg[i];
          return *this;
      }
  };
  109. const ascii_bitvector & get_digit_vector(void)
  110. {
  111. // 0-9
  112. class digit_vector : public ascii_bitvector
  113. {
  114. public:
  115. digit_vector()
  116. {
  117. unsigned char ich;
  118. for( ich ='0'; ich <= '9'; ++ich )
  119. set(ich);
  120. }
  121. };
  122. static const digit_vector s_digit_vector;
  123. return s_digit_vector;
  124. }
  125. const ascii_bitvector & get_word_vector(void)
  126. {
  127. // a-zA-Z_0-9
  128. class word_vector : public ascii_bitvector
  129. {
  130. public:
  131. word_vector()
  132. {
  133. unsigned char ich;
  134. for( ich = 'a'; ich <= 'z'; ++ich )
  135. set(ich);
  136. for( ich = 'A'; ich <= 'Z'; ++ich )
  137. set(ich);
  138. for( ich = '0'; ich <= '9'; ++ich )
  139. set(ich);
  140. set('_');
  141. }
  142. };
  143. static const word_vector s_word_vector;
  144. return s_word_vector;
  145. }
  146. const ascii_bitvector & get_space_vector(void)
  147. {
  148. // " \t\r\n\f"
  149. class space_vector : public ascii_bitvector
  150. {
  151. public:
  152. space_vector()
  153. {
  154. set(' ');
  155. set('\t');
  156. set('\v');
  157. set('\r');
  158. set('\n');
  159. set('\f');
  160. }
  161. };
  162. static const space_vector s_space_vector;
  163. return s_space_vector;
  164. }
  165. //
  166. // Operator implementations
  167. //
  168. // Base type used so that all derived operators share typedefs.
  169. template< typename CI >
  170. struct op_t : public binary_function<match_param<CI>,CI,bool>
  171. {
  172. typedef CI const_iterator;
  173. typedef typename iterator_traits<CI>::value_type char_type;
  174. };
  175. // Evaluates the beginning-of-string condition
  176. template< typename CI >
  177. struct bos_t : public op_t<CI>
  178. {
  179. inline bool operator()( const match_param<CI> & param, CI iter ) const
  180. {
  181. return param.ibegin == iter;
  182. }
  183. };
  184. // Find the beginning of a line, either beginning of a string, or the character
  185. // immediately following a newline
  186. template< typename CI >
  187. struct bol_t : public bos_t<CI>
  188. {
  189. inline bool operator()( const match_param<CI> & param, CI iter ) const
  190. {
  191. return bos_t<CI>::operator()(param,iter) || char_type('\n') == *--iter;
  192. }
  193. };
  194. // Evaluates end-of-string condition for string's
  195. template< typename CI >
  196. struct eos_t : public op_t<CI>
  197. {
  198. inline bool operator()( const match_param<CI> & param, CI iter ) const
  199. {
  200. return param.istop == iter;
  201. }
  202. };
  203. // Evaluates end-of-string condidition for C-style string's when the length is unknown by
  204. // looking for the null-terminator.
  205. template< typename CI >
  206. struct eocs_t : public op_t<CI>
  207. {
  208. inline bool operator()( const match_param<CI> & param, CI iter ) const
  209. {
  210. return char_type('\0') == *iter;
  211. }
  212. };
  213. // Evaluates end-of-line conditions, either the end of the string, or a
  214. // return or newline character.
  215. template< typename EOS >
  216. struct eol_t_t : public EOS
  217. {
  218. typedef typename EOS::const_iterator CI;
  219. inline bool operator()( const match_param<CI> & param, CI iter ) const
  220. {
  221. return EOS::operator()(param,iter) || char_type('\n') == *iter || char_type('\r') == *iter;
  222. }
  223. };
  224. template< typename CI > struct eol_t : public eol_t_t<eos_t<CI> > {};
  225. template< typename CI > struct eocl_t : public eol_t_t<eocs_t<CI> > {};
  226. // Evaluates perl's end-of-string conditions, either the end of the string, or a
  227. // newline character followed by end of string. (Only used by $ and /Z assertions)
  228. template< typename EOS >
  229. struct peos_t_t : public EOS
  230. {
  231. typedef typename EOS::const_iterator CI;
  232. inline bool operator()( const match_param<CI> & param, CI iter ) const
  233. {
  234. return EOS::operator()(param,iter) || ( ( char_type('\n') == *iter ) && EOS::operator()(param,++iter) );
  235. }
  236. };
  237. template< typename CI > struct peos_t : public peos_t_t<eos_t<CI> > {};
  238. template< typename CI > struct peocs_t : public peos_t_t<eocs_t<CI> > {};
  // Compares two characters, case-sensitive. Returns true when they differ.
  template< typename CH >
  struct ch_neq_t : public binary_function<CH, CH, bool>
  {
      typedef CH char_type;
      // The `register` storage class was dropped from the parameters: it
      // is only a hint, and it is no longer valid as of C++17.
      inline bool operator()( CH ch1, CH ch2 ) const
      {
          return ch1 != ch2;
      }
  };
  // Compares two characters, disregarding case. Returns true when their
  // to_upper conversions differ.
  template< typename CH >
  struct ch_neq_nocase_t : public binary_function<CH, CH, bool>
  {
      typedef CH char_type;
      // The `register` storage class was dropped from the parameters: it
      // is only a hint, and it is no longer valid as of C++17.
      inline bool operator()( CH ch1, CH ch2 ) const
      {
          return to_upper(ch1) != to_upper(ch2);
      }
  };
  259. //
  260. // Helper functions for match and substitute
  261. //
  // Returns the number of elements before the first element that compares
  // equal to 0, starting at iter. The sequence must contain such a
  // terminator.
  template< typename CI >
  size_t string_length( CI iter )
  {
      size_t cch = 0;
      for( ; 0 != *iter; ++iter )
          ++cch;
      return cch;
  }
  270. template< typename CI >
  271. backref_tag<CI> _do_match( const basic_rpattern_base<CI> & pat, match_param<CI> & param ) throw()
  272. {
  273. typedef typename iterator_traits<CI>::value_type char_type;
  274. bool floop = pat.loops();
  275. unsigned flags = pat.flags();
  276. width_type nwidth = pat.get_width();
  277. const sub_expr<CI> * pfirst = pat.get_first_subexpression();
  278. try
  279. {
  280. vector<backref_tag<CI> > rgbackrefs; // dummy backref vector
  281. if( NULL == param.prgbackrefs )
  282. param.prgbackrefs = & rgbackrefs;
  283. param.prgbackrefs->resize( pat._cgroups_total() );
  284. fill( param.prgbackrefs->begin(), param.prgbackrefs->end(), backref_tag<CI>() );
  285. // If a pattern is optimized for CSTRINGS, it can save a call
  286. // to calculate the length of the string.
  287. if( CI(0) == param.istop && ( ( RIGHTMOST & flags ) || ( 0 == ( CSTRINGS & flags ) ) ) )
  288. param.istop = param.istart + string_length( param.istart );
  289. if( CI(0) != param.istop )
  290. {
  291. // If the minimum width of the pattern exceeds the width of the
  292. // string, a succesful match is impossible
  293. if( nwidth.m_min <= (size_t)distance( param.istart, param.istop ) )
  294. {
  295. CI local_istop = param.istop;
  296. advance( local_istop, -int( nwidth.m_min ) );
  297. if( RIGHTMOST & flags )
  298. {
  299. // begin trying to match after the last character.
  300. // Continue to the beginning
  301. for( CI icur = local_istop; icur >= param.istart; --icur )
  302. if( pfirst->domatch( param, icur ) )
  303. break; // m_floop not used for rightmost matches
  304. }
  305. else
  306. {
  307. // begin trying to match before the first character.
  308. // Continue to the end
  309. for( CI icur = param.istart; icur <= local_istop; ++icur )
  310. if( pfirst->domatch( param, icur ) || ! floop )
  311. break;
  312. }
  313. }
  314. }
  315. else
  316. {
  317. // begin trying to match before the first character.
  318. // Continue to the end
  319. for( CI icur = param.istart; ; ++icur )
  320. {
  321. if( pfirst->domatch( param, icur ) || ! floop )
  322. break;
  323. if( char_type('\0') == *icur )
  324. break;
  325. }
  326. }
  327. }
  328. catch(...) // bad alloc, stack overflow?
  329. {
  330. fill( param.prgbackrefs->begin(), param.prgbackrefs->end(), backref_tag<CI>() );
  331. }
  332. // Shrink the backref vector to chop off information about the "invisible" groups
  333. param.prgbackrefs->resize( pat.cgroups() );
  334. return (*param.prgbackrefs)[0];
  335. }
  336. template< typename CI, typename CH, typename TR, typename AL >
  337. size_t _do_subst( basic_regexpr<CH,TR,AL> & str, const basic_rpattern_base<CI> & pat, size_t strpos, size_t strlen ) throw(bad_alloc)
  338. {
  339. typedef iterator_traits<CI>::value_type char_type;
  340. typedef list<subst_node>::const_iterator LCI;
  341. enum { UPPER = -1, NIL, LOWER } next = NIL, rest = NIL;
  342. bool first = true;
  343. size_t old_strpos = strpos;
  344. const list<subst_node> & subst_list = pat.get_subst_list();
  345. basic_string<CH,TR,AL>::iterator itstrlen = str.begin();
  346. advance( itstrlen, strpos + strlen );
  347. const basic_string<char_type> & subst = pat.get_subst();
  348. push_new_handler pnh( &my_new_handler );
  349. for( LCI isubst = subst_list.begin(); isubst != subst_list.end(); ++isubst )
  350. {
  351. size_t sublen;
  352. basic_string<CH,TR,AL>::const_iterator itsubpos1; // iter into str
  353. basic_string<CH,TR,AL>::const_iterator itsublen1;
  354. basic_string<char_type>::const_iterator itsubpos2; // iter into subst string
  355. basic_string<char_type>::const_iterator itsublen2;
  356. basic_string<CH,TR,AL>::iterator itstrpos = str.begin();
  357. advance( itstrpos, strpos );
  358. switch( isubst->stype )
  359. {
  360. case subst_node::SUBST_STRING:
  361. itsubpos2 = subst.begin();
  362. advance( itsubpos2, isubst->subst_string.rstart );
  363. itsublen2 = itsubpos2;
  364. advance( itsublen2, isubst->subst_string.rlength );
  365. first ? str.replace( itstrpos, itstrlen, itsubpos2, itsublen2 ) :
  366. str.insert( itstrpos, itsubpos2, itsublen2 );
  367. sublen = distance( itsubpos2, itsublen2 );
  368. break;
  369. case subst_node::SUBST_BACKREF:
  370. switch( isubst->subst_backref )
  371. {
  372. case subst_node::PREMATCH:
  373. itsubpos1 = str.backref_str().begin();
  374. itsublen1 = itsubpos1;
  375. advance( itsublen1, sublen = str.rstart() );
  376. break;
  377. case subst_node::POSTMATCH:
  378. itsubpos1 = str.backref_str().begin();
  379. advance( itsubpos1, str.rstart() + str.rlength() );
  380. itsublen1 = str.backref_str().end();
  381. break;
  382. default:
  383. itsubpos1 = str.backref_str().begin();
  384. advance( itsubpos1, str.rstart( isubst->subst_backref ) );
  385. itsublen1 = itsubpos1;
  386. advance( itsublen1, str.rlength( isubst->subst_backref ) );
  387. break;
  388. }
  389. first ? str.replace( itstrpos, itstrlen, itsubpos1, itsublen1 ) :
  390. str.insert( itstrpos, itsubpos1, itsublen1 );
  391. sublen = distance( itsubpos1, itsublen1 );
  392. break;
  393. case subst_node::SUBST_OP:
  394. switch( isubst->op )
  395. {
  396. case subst_node::UPPER_ON:
  397. rest = UPPER;
  398. break;
  399. case subst_node::UPPER_NEXT:
  400. next = UPPER;
  401. break;
  402. case subst_node::LOWER_ON:
  403. rest = LOWER;
  404. break;
  405. case subst_node::LOWER_NEXT:
  406. next = LOWER;
  407. break;
  408. case subst_node::ALL_OFF:
  409. rest = NIL;
  410. break;
  411. default:
  412. __assume(0);
  413. }
  414. continue; // jump to the next item in the list
  415. default:
  416. __assume(0);
  417. }
  418. first = false;
  419. // Are we upper- or lower-casing this string?
  420. if( rest )
  421. {
  422. basic_string<CH,TR,AL>::iterator istart = str.begin();
  423. advance( istart, strpos );
  424. basic_string<CH,TR,AL>::const_iterator istop = istart;
  425. advance( istop, sublen );
  426. switch( rest )
  427. {
  428. case UPPER:
  429. to_upper( istart, istop );
  430. break;
  431. case LOWER:
  432. to_lower( istart, istop );
  433. break;
  434. default:
  435. __assume(0);
  436. }
  437. }
  438. // Are we upper- or lower-casing the next character?
  439. if( next )
  440. {
  441. switch( next )
  442. {
  443. case UPPER:
  444. str[strpos] = to_upper(str[strpos]);
  445. break;
  446. case LOWER:
  447. str[strpos] = to_lower(str[strpos]);
  448. break;
  449. default:
  450. __assume(0);
  451. }
  452. next = NIL;
  453. }
  454. strpos += sublen;
  455. }
  456. // If *first* is still true, then we never called str.replace, and the substitution
  457. // string is empty. Erase the part of the string that the pattern matched.
  458. if( first )
  459. str.erase( strpos, strlen );
  460. // return length of the substitution
  461. return strpos - old_strpos;
  462. }
  463. //
  464. // Implementation of basic_regexpr
  465. //
  466. template< typename CH, typename TR, typename AL >
  467. size_t basic_regexpr<CH,TR,AL>::substitute(
  468. const basic_rpattern_base<basic_regexpr<CH,TR,AL>::const_iterator> & pat,
  469. size_type pos,
  470. size_type len ) throw(bad_alloc)
  471. {
  472. if( pat.flags() & CSTRINGS )
  473. {
  474. assert( ! "You can't use a pattern optimized for CSTRINGS with regexpr::substitute" );
  475. return 0;
  476. }
  477. backref_vector rgbackrefs; // dummy backref vector
  478. backref_vector * prgbackrefs = & rgbackrefs;
  479. const bool fsave_backrefs = ( pat.uses_backrefs() || !( pat.flags() & NOBACKREFS ) );
  480. if( fsave_backrefs )
  481. {
  482. prgbackrefs = & m_rgbackrefs;
  483. m_pbackref_str = & ( m_backref_str = *this );
  484. }
  485. else
  486. {
  487. m_backref_str.erase();
  488. m_pbackref_str = this;
  489. m_rgbackrefs.resize( 0 );
  490. }
  491. backref_type br;
  492. size_t csubst = 0;
  493. long stop_offset = ( len == npos ?
  494. m_pbackref_str->size() :
  495. min( pos + len, m_pbackref_str->size() ) );
  496. match_param<const_iterator> param( m_pbackref_str->begin(),
  497. m_pbackref_str->begin(),
  498. prgbackrefs );
  499. advance( param.istart, pos );
  500. advance( param.istop, stop_offset );
  501. param.ibegin = param.istart;
  502. if( GLOBAL & pat.flags() )
  503. {
  504. const bool fAll = ( ALLBACKREFS == ( ALLBACKREFS & pat.flags() ) );
  505. const bool fFirst = ( FIRSTBACKREFS == ( FIRSTBACKREFS & pat.flags() ) );
  506. backref_vector rgtempbackrefs; // temporary vector used if fsave_backrefs
  507. long pos_offset = 0; // keep track of how much the backref_str and
  508. // the current string are out of sync
  509. while( br = _do_match( pat, param ) )
  510. {
  511. ++csubst;
  512. size_type match_length = distance( br.first, br.second );
  513. pos = distance( m_pbackref_str->begin(), br.first );
  514. size_type subst_length = _do_subst( *this, pat, pos + pos_offset, match_length );
  515. if( fsave_backrefs )
  516. {
  517. pos += match_length;
  518. pos_offset += ( subst_length - match_length );
  519. // Handle specially the backref flags
  520. if( fFirst )
  521. rgtempbackrefs.push_back( br );
  522. else if( fAll )
  523. rgtempbackrefs.insert( rgtempbackrefs.end(),
  524. param.prgbackrefs->begin(),
  525. param.prgbackrefs->end() );
  526. else
  527. rgtempbackrefs.swap( *param.prgbackrefs );
  528. }
  529. else
  530. {
  531. pos += subst_length;
  532. stop_offset += ( subst_length - match_length );
  533. // we're not saving backref information, so we don't
  534. // need to do any special backref maintenance here
  535. }
  536. // prevent a pattern that matches 0 characters from matching
  537. // again at the same point in the string
  538. if( 0 == match_length )
  539. {
  540. if( br.first == param.istop ) // We're at the end, so we're done
  541. break;
  542. ++pos;
  543. }
  544. param.istart = m_pbackref_str->begin();
  545. advance( param.istart, pos ); // ineffecient for bidirectional iterators.
  546. param.istop = m_pbackref_str->begin();
  547. advance( param.istop, stop_offset ); // ineffecient for bidirectional iterators.
  548. }
  549. // If we did special backref handling, swap the backref vectors
  550. if( fsave_backrefs && ( !br || fFirst || fAll ) )
  551. param.prgbackrefs->swap( rgtempbackrefs );
  552. else if( ! (*param.prgbackrefs)[0] )
  553. param.prgbackrefs->clear();
  554. }
  555. else if( br = _do_match( pat, param ) )
  556. {
  557. ++csubst;
  558. _do_subst( *this, pat,
  559. distance( m_pbackref_str->begin(), br.first ),
  560. distance( br.first, br.second ) );
  561. }
  562. if( NOBACKREFS == ( pat.flags() & NOBACKREFS ) )
  563. param.prgbackrefs->clear();
  564. return csubst;
  565. }
  566. //
  567. // Helper functions called from both basic_regexpr match methods
  568. //
  569. template< typename EOS >
  570. backref_tag< typename EOS::const_iterator > _match_helper(
  571. const basic_rpattern_base<typename EOS::const_iterator> & pat,
  572. match_param<typename EOS::const_iterator> & param,
  573. EOS eos )
  574. {
  575. typedef typename EOS::const_iterator CI;
  576. if( GLOBAL & pat.flags() ) // do a global find
  577. {
  578. // The NOBACKREFS flag is ignored in the match method.
  579. const bool fAll = ( ALLBACKREFS == ( ALLBACKREFS & pat.flags() ) );
  580. const bool fFirst = ( FIRSTBACKREFS == ( FIRSTBACKREFS & pat.flags() ) );
  581. backref_tag<CI> br;
  582. vector<backref_tag<CI> > rgtempbackrefs;
  583. while( br = _do_match( pat, param ) )
  584. {
  585. // Handle specially the backref flags
  586. if( fFirst )
  587. rgtempbackrefs.push_back( br );
  588. else if( fAll )
  589. rgtempbackrefs.insert( rgtempbackrefs.end(),
  590. param.prgbackrefs->begin(),
  591. param.prgbackrefs->end() );
  592. else
  593. rgtempbackrefs.swap( *param.prgbackrefs );
  594. if( br.first == ( param.istart = br.second ) )
  595. {
  596. if( eos( param, param.istart ) )
  597. break;
  598. ++param.istart;
  599. }
  600. }
  601. // restore the backref vectors
  602. if( !br || fFirst || fAll )
  603. param.prgbackrefs->swap( rgtempbackrefs );
  604. else if( ! (*param.prgbackrefs)[0] )
  605. param.prgbackrefs->clear();
  606. return param.prgbackrefs->empty() ? backref_tag<CI>() : (*param.prgbackrefs)[0];
  607. }
  608. else
  609. return _do_match( pat, param );
  610. }
  611. template< typename CH, typename TR, typename AL >
  612. basic_regexpr<CH,TR,AL>::backref_type basic_regexpr<CH,TR,AL>::match(
  613. const basic_rpattern_base<const_iterator> & pat,
  614. size_type pos, size_type len ) const throw()
  615. {
  616. if( pat.flags() & CSTRINGS )
  617. {
  618. assert( ! "A pattern optimized for CSTRINGS can only be used with the static regexpr::match method" );
  619. return backref_type();
  620. }
  621. m_pbackref_str = this;
  622. m_backref_str.erase(); // free up unused memory
  623. const_iterator istart = begin();
  624. advance( istart, pos );
  625. const_iterator istop;
  626. if( len == npos || pos + len >= size() )
  627. istop = end();
  628. else
  629. advance( istop = begin(), pos + len );
  630. match_param<const_iterator> param( istart, istop, & m_rgbackrefs );
  631. return _match_helper<eos_t<const_iterator> >( pat, param, eos_t<const_iterator>() );
  632. }
  633. template< typename CH >
  634. backref_tag<const CH *> _static_match_helper(
  635. const CH * szstr,
  636. const basic_rpattern_base<const CH *> & pat,
  637. vector< backref_tag< const CH * > > * prgbackrefs ) throw()
  638. {
  639. vector< backref_tag< const CH * > > rgdummyvector;
  640. if( NULL == prgbackrefs )
  641. prgbackrefs = &rgdummyvector;
  642. match_param<const CH *> param( szstr, NULL, prgbackrefs );
  643. return _match_helper<eocs_t<const CH *> >( pat, param, eocs_t<const CH *>() );
  644. }
  645. //
  646. // Helper function called from both basic_regexpr::count methods
  647. //
  648. template< typename EOS >
  649. size_t _count_helper(
  650. const basic_rpattern_base<typename EOS::const_iterator> & pat,
  651. match_param<typename EOS::const_iterator> & param,
  652. EOS eos )
  653. {
  654. typedef typename EOS::const_iterator CI;
  655. size_t cmatches = 0;
  656. vector<backref_tag<CI> > rgbackrefs; // dummy backref vector
  657. backref_tag<CI> br;
  658. param.prgbackrefs = &rgbackrefs;
  659. while( br = _do_match( pat, param ) )
  660. {
  661. ++cmatches;
  662. if( br.first == ( param.istart = br.second ) )
  663. {
  664. if( eos( param, param.istart ) )
  665. break;
  666. ++param.istart;
  667. }
  668. }
  669. return cmatches;
  670. }
  671. template< typename CH, typename TR, typename AL >
  672. size_t basic_regexpr<CH,TR,AL>::count(
  673. const basic_rpattern_base<basic_regexpr<CH,TR,AL>::const_iterator> & pat,
  674. size_type pos,
  675. size_type len ) const throw()
  676. {
  677. if( pat.flags() & CSTRINGS )
  678. {
  679. assert( ! "A pattern optimized for CSTRINGS can only be used with the static regexpr::count method" );
  680. return backref_type();
  681. }
  682. m_pbackref_str = this;
  683. const_iterator istart = begin();
  684. advance( istart, pos );
  685. const_iterator istop;
  686. if( len == npos || pos + len >= size() )
  687. istop = end();
  688. else
  689. advance( istop = begin(), pos + len );
  690. match_param<const_iterator> param( istart, istop, NULL );
  691. return _count_helper<eos_t<const_iterator> >( pat, param, eos_t<const_iterator>() );
  692. }
  693. template< typename CH >
  694. size_t _static_count_helper(
  695. const CH * szstr,
  696. const basic_rpattern_base<const CH *> & pat ) throw()
  697. {
  698. match_param<const CH *> param( szstr, NULL, NULL );
  699. return _count_helper<eocs_t<const CH *> >( pat, param, eocs_t<const CH *>() );
  700. }
  701. // Base class for sub-expressions which are zero-width
  702. // (i.e., assertions eat no characters during matching)
  703. // Assertions cannot be quantified.
  704. template< typename CI >
  705. class assertion : public sub_expr<CI>
  706. {
  707. public:
  708. virtual ~assertion() {}
  709. virtual bool is_assertion() const throw() { return true; }
  710. protected:
  711. virtual width_type _width_this() throw() { return width_type(0,0); }
  712. };
  713. template< typename OP >
  714. class assert_op : public assertion<typename OP::const_iterator>
  715. {
  716. public:
  717. typedef OP op_type;
  718. typedef typename OP::const_iterator CI;
  719. virtual ~assert_op() {}
  720. protected:
  721. virtual bool _match_this( match_param<CI> & param, CI & icur ) const throw()
  722. {
  723. return m_op( param, icur );
  724. }
  725. op_type m_op;
  726. };
  727. template< typename CI >
  728. assertion<CI> * create_bos( unsigned /*flags*/ )
  729. {
  730. return new assert_op<bos_t<CI> >();
  731. }
  732. template< typename CI >
  733. assertion<CI> * create_eos( unsigned flags )
  734. {
  735. switch( CSTRINGS & flags )
  736. {
  737. case 0:
  738. return new assert_op<peos_t<CI> >();
  739. case CSTRINGS:
  740. return new assert_op<peocs_t<CI> >();
  741. default:
  742. __assume(0); // tells the compiler that this is unreachable
  743. }
  744. }
  745. template< typename CI >
  746. assertion<CI> * create_eoz( unsigned flags )
  747. {
  748. switch( CSTRINGS & flags )
  749. {
  750. case 0:
  751. return new assert_op<eos_t<CI> >();
  752. case CSTRINGS:
  753. return new assert_op<eocs_t<CI> >();
  754. default:
  755. __assume(0); // tells the compiler that this is unreachable
  756. }
  757. }
  758. template< typename CI >
  759. assertion<CI> * create_bol( unsigned flags )
  760. {
  761. switch( MULTILINE & flags )
  762. {
  763. case 0:
  764. return new assert_op<bos_t<CI> >();
  765. case MULTILINE:
  766. return new assert_op<bol_t<CI> >();
  767. default:
  768. __assume(0); // tells the compiler that this is unreachable
  769. }
  770. }
  771. template< typename CI >
  772. assertion<CI> * create_eol( unsigned flags )
  773. {
  774. switch( ( MULTILINE | CSTRINGS ) & flags )
  775. {
  776. case 0:
  777. return new assert_op<peos_t<CI> >();
  778. case MULTILINE:
  779. return new assert_op<eol_t<CI> >();
  780. case CSTRINGS:
  781. return new assert_op<peocs_t<CI> >();
  782. case MULTILINE | CSTRINGS:
  783. return new assert_op<eocl_t<CI> >();
  784. default:
  785. __assume(0); // tells the compiler that this is unreachable
  786. }
  787. }
  788. template< typename CI >
  789. class match_atom : public sub_expr<CI>
  790. {
  791. public:
  792. match_atom( const basic_string<sub_expr<CI>::char_type>::iterator istart,
  793. basic_string<sub_expr<CI>::char_type>::const_iterator istop )
  794. : m_istart( istart ), m_istop( istop ) {}
  795. virtual ~match_atom() {}
  796. const basic_string<sub_expr<CI>::char_type>::iterator m_istart;
  797. basic_string<sub_expr<CI>::char_type>::const_iterator m_istop;
  798. protected:
  799. virtual width_type _width_this() throw()
  800. {
  801. size_t width = distance( (basic_string<sub_expr<CI>::char_type>::const_iterator)m_istart, m_istop );
  802. return width_type( width, width );
  803. }
  804. };
  805. template< typename EOS >
  806. class match_atom_t : public match_atom<typename EOS::const_iterator>
  807. {
  808. public:
  809. typedef EOS eos_type;
  810. typedef typename EOS::const_iterator CI;
  811. match_atom_t( const basic_string<sub_expr<CI>::char_type>::iterator istart,
  812. basic_string<sub_expr<CI>::char_type>::const_iterator istop )
  813. : match_atom<CI>( istart, istop ) {}
  814. virtual ~match_atom_t() {}
  815. protected:
  816. virtual bool _match_this( match_param<CI> & param, CI & icur ) const throw()
  817. {
  818. CI icur_tmp = icur;
  819. basic_string<sub_expr<CI>::char_type>::const_iterator ithis = m_istart;
  820. for( ; ithis != m_istop; ++icur_tmp, ++ithis )
  821. {
  822. if( m_eos( param, icur_tmp ) || *ithis != *icur_tmp )
  823. return false;
  824. }
  825. icur = icur_tmp;
  826. return true;
  827. }
  828. eos_type m_eos;
  829. };
  830. template< typename EOS >
  831. class match_atom_nocase_t : public match_atom<typename EOS::const_iterator>
  832. {
  833. public:
  834. typedef EOS eos_type;
  835. typedef typename EOS::const_iterator CI;
  836. match_atom_nocase_t( const basic_string<sub_expr<CI>::char_type>::iterator istart,
  837. basic_string<sub_expr<CI>::char_type>::const_iterator istop )
  838. : match_atom<CI>( istart, istop ), m_strlower( (basic_string<sub_expr<CI>::char_type>::const_iterator)istart, istop )
  839. {
  840. // Store the uppercase version of the atom in [m_istart,m_istop).
  841. to_upper( m_istart, m_istop );
  842. // Store the lowercase version of the atom in m_strlower.
  843. to_lower( m_strlower.begin(), m_strlower.end() );
  844. }
  845. virtual ~match_atom_nocase_t() {}
  846. protected:
  847. virtual bool _match_this( match_param<CI> & param, CI & icur ) const throw()
  848. {
  849. CI icur_tmp = icur;
  850. basic_string<sub_expr<CI>::char_type>::const_iterator ithisu = m_istart; // uppercase
  851. basic_string<sub_expr<CI>::char_type>::const_iterator ithisl = m_strlower.begin(); // lowercase
  852. for( ; ithisu != m_istop; ++icur_tmp, ++ithisu, ++ithisl )
  853. {
  854. if( m_eos( param, icur_tmp ) || ( *ithisu != *icur_tmp && *ithisl != *icur_tmp ) )
  855. return false;
  856. }
  857. icur = icur_tmp;
  858. return true;
  859. }
  860. eos_type m_eos;
  861. basic_string<sub_expr<CI>::char_type> m_strlower;
  862. };
  863. template< typename CI >
  864. match_atom<CI> * create_atom(
  865. const basic_string<iterator_traits<CI>::value_type>::iterator istart,
  866. basic_string<iterator_traits<CI>::value_type>::const_iterator istop,
  867. unsigned flags )
  868. {
  869. switch( ( NOCASE | CSTRINGS ) & flags )
  870. {
  871. case 0:
  872. return new match_atom_t<eos_t<CI> >( istart, istop );
  873. case NOCASE:
  874. return new match_atom_nocase_t<eos_t<CI> >( istart, istop );
  875. case CSTRINGS:
  876. return new match_atom_t<eocs_t<CI> >( istart, istop );
  877. case NOCASE | CSTRINGS:
  878. return new match_atom_nocase_t<eocs_t<CI> >( istart, istop );
  879. default:
  880. __assume(0); // tells the compiler that this is unreachable
  881. }
  882. }
  883. template< typename CI >
  884. match_atom<CI> * create_atom(
  885. const basic_string<iterator_traits<CI>::value_type>::iterator istart,
  886. unsigned flags )
  887. {
  888. basic_string<iterator_traits<CI>::value_type>::const_iterator istop = istart;
  889. return create_atom<CI>( istart, ++istop, flags );
  890. }
  891. template< typename CI >
  892. class match_any : public sub_expr<CI>
  893. {
  894. public:
  895. virtual ~match_any() {}
  896. protected:
  897. virtual width_type _width_this() throw() { return width_type(1,1); }
  898. };
  899. template< typename EOS >
  900. class match_any_t : public match_any<typename EOS::const_iterator>
  901. {
  902. public:
  903. typedef EOS eos_type;
  904. typedef typename EOS::const_iterator CI;
  905. virtual ~match_any_t() {}
  906. protected:
  907. virtual bool _match_this( match_param<CI> & param, CI & icur ) const throw()
  908. {
  909. if( m_eos( param, icur ) )
  910. return false;
  911. ++icur;
  912. return true;
  913. }
  914. eos_type m_eos;
  915. };
  916. template< typename CI >
  917. match_any<CI> * create_any( unsigned flags )
  918. {
  919. switch( ( SINGLELINE | CSTRINGS ) & flags )
  920. {
  921. case 0:
  922. return new match_any_t<eol_t<CI> >();
  923. case SINGLELINE:
  924. return new match_any_t<eos_t<CI> >();
  925. case CSTRINGS:
  926. return new match_any_t<eocl_t<CI> >();
  927. case SINGLELINE | CSTRINGS:
  928. return new match_any_t<eocs_t<CI> >();
  929. default:
  930. __assume(0); // tells the compiler that this is unreachable
  931. }
  932. }
  // An inclusive [first, second] span of wide-character codes.
  typedef std::pair<wint_t, wint_t> range_type;

  // Shared, permanently empty range list.  Character sets that have no
  // wide-character ranges bind their range reference to this object.
  const std::vector<range_type> g_rgranges;
  935. template< typename CI >
  936. class match_charset : public sub_expr<CI>
  937. {
  938. public:
  939. match_charset( bool fcomplement,
  940. const ascii_bitvector & bvect )
  941. : m_fcomplement( fcomplement ),
  942. m_rgascii( bvect ),
  943. m_rgranges( g_rgranges ),
  944. m_ncharflags(0) {}
  945. // Note that only the references are copied here -- they are not ref counted.
  946. // Beware of variable lifetime issues.
  947. match_charset( const match_charset<CI> & that )
  948. : m_fcomplement( that.m_fcomplement ),
  949. m_rgascii( that.m_rgascii ),
  950. m_rgranges( that.m_rgranges ),
  951. m_ncharflags( that.m_ncharflags ) {}
  952. virtual ~match_charset() {}
  953. const bool m_fcomplement;
  954. const ascii_bitvector & m_rgascii; // bitmap for chars in range 0-255
  955. const vector<range_type> & m_rgranges; // vector of included character ranges 256-65535
  956. int m_ncharflags; // Parameter to isctype()
  957. // The case-sensitivity of a character set is "compiled" into the ascii_bitvector
  958. // but not into the range vector because it is too computationally expensive. Instead,
  959. // when doing a unicode case-insensitive match on the ranges vector, two lookups
  960. // must be performed -- one lowercase and one uppercase. By contrast, only one lookup
  961. // is needed for the ascii_bitvector.
  962. protected:
  963. match_charset( bool fcomplement,
  964. const ascii_bitvector & bvect,
  965. const vector<range_type> & rgranges )
  966. : m_fcomplement( fcomplement ),
  967. m_rgascii( bvect ),
  968. m_rgranges( rgranges ),
  969. m_ncharflags(0) {}
  970. // this method should never be called. match_charset is only a base class
  971. // for match_charset_t
  972. virtual bool _match_this( match_param<CI> & param, CI & icur ) const throw()
  973. {
  974. assert(false);
  975. return true;
  976. }
  977. template< typename SY >
  978. match_charset<CI> * get_altern_charset( char_type ch, unsigned flags, SY /*sy*/ ) const throw()
  979. {
  980. return basic_rpattern<CI,SY>::s_charset_map.get( ch, flags );
  981. }
  982. virtual width_type _width_this() throw() { return width_type(1,1); }
  983. };
  984. // Used as a template parameter to find a unicode character in an array of ranges.
  985. class match_range : public unary_function<wint_t,bool>
  986. {
  987. protected:
  988. const vector<range_type> & m_rgranges;
  989. // determines if one range is less then another.
  990. // used in binary search of range vector
  991. inline static bool _range_less( const range_type & rg1,
  992. const range_type & rg2 ) throw()
  993. {
  994. return rg1.second < rg2.first;
  995. }
  996. match_range( const vector<range_type> & rgranges )
  997. : m_rgranges( rgranges ) {}
  998. };
  999. class match_range_with_case : public match_range
  1000. {
  1001. public:
  1002. match_range_with_case( const vector<range_type> & rgranges )
  1003. : match_range( rgranges ) {}
  1004. inline bool operator()( wint_t ch ) const throw()
  1005. {
  1006. return binary_search( m_rgranges.begin(), m_rgranges.end(),
  1007. range_type(ch,ch), _range_less );
  1008. }
  1009. };
  1010. class match_range_no_case : public match_range
  1011. {
  1012. public:
  1013. match_range_no_case( const vector<range_type> & rgranges )
  1014. : match_range( rgranges ) {}
  1015. inline bool operator()( wint_t ch ) const throw()
  1016. {
  1017. const wint_t chup = towupper( ch );
  1018. if( binary_search( m_rgranges.begin(), m_rgranges.end(),
  1019. range_type(chup,chup), _range_less ) )
  1020. return true;
  1021. const wint_t chlo = towlower( ch );
  1022. if( chup != chlo &&
  1023. binary_search( m_rgranges.begin(), m_rgranges.end(),
  1024. range_type(chlo,chlo), _range_less ) )
  1025. return true;
  1026. return false;
  1027. }
  1028. };
  1029. template< typename EOS, typename RGM >
  1030. class match_charset_t : public match_charset<typename EOS::const_iterator>
  1031. {
  1032. public:
  1033. typedef EOS eos_type;
  1034. typedef RGM range_match_type;
  1035. typedef typename EOS::const_iterator CI;
  1036. match_charset_t( const match_charset<CI> & that )
  1037. : match_charset<CI>( that ), m_rgm( m_rgranges ) {}
  1038. virtual ~match_charset_t() {}
  1039. inline bool is_in_charset( char_type ch ) const throw()
  1040. {
  1041. return m_fcomplement != _is_in_charset( ch );
  1042. }
  1043. protected:
  1044. match_charset_t( bool fcomplement,
  1045. const ascii_bitvector & bvect,
  1046. const vector<range_type> & rgranges )
  1047. : match_charset<CI>( fcomplement, bvect, rgranges ), m_rgm( m_rgranges ) {}
  1048. // Note overloading based on parameter
  1049. inline bool _is_in_charset( char ch ) const throw()
  1050. {
  1051. return ( m_rgascii[ unsigned char(ch) ] ) ||
  1052. ( m_ncharflags && ( _pctype[unsigned char(ch)] & m_ncharflags ) );
  1053. }
  1054. // Note overloading based on parameter
  1055. inline bool _is_in_charset( wint_t ch ) const throw()
  1056. {
  1057. if( UCHAR_MAX >= ch )
  1058. return _is_in_charset( char(ch) );
  1059. // use range_match_type to see if this character is within one of the
  1060. // ranges stored in m_rgranges.
  1061. return ( ! m_rgranges.empty() && m_rgm( ch ) ) ||
  1062. ( m_ncharflags && iswctype( ch, (int)m_ncharflags ) );
  1063. }
  1064. virtual bool _match_this( match_param<CI> & param, CI & icur ) const throw()
  1065. {
  1066. if( m_eos( param, icur ) || ! is_in_charset( *icur ) )
  1067. return false;
  1068. ++icur;
  1069. return true;
  1070. }
  1071. // range_match_type encapsulates the case-sensitivity
  1072. // issues with doing a unicode lookup on the ranges vector.
  1073. range_match_type m_rgm;
  1074. eos_type m_eos;
  1075. };
  1076. template< typename EOS, typename RGM >
  1077. class match_custom_charset_t : public match_charset_t<EOS,RGM>
  1078. {
  1079. public:
  1080. template< typename SY >
  1081. match_custom_charset_t( bool fcomplement,
  1082. basic_string<char_type>::iterator & icur,
  1083. basic_string<char_type>::const_iterator istop,
  1084. unsigned flags, SY /*sy*/ ) throw(bad_regexpr,bad_alloc)
  1085. : match_charset_t<EOS,RGM>( fcomplement, m_rgasciicustom, m_rgrangescustom )
  1086. {
  1087. _parse_charset( icur, istop, flags, SY() );
  1088. _optimize();
  1089. }
  1090. virtual ~match_custom_charset_t() {}
  1091. // for including one character set in another
  1092. match_custom_charset_t<EOS,RGM> & operator|=( const match_charset<CI> & that )
  1093. {
  1094. assert( 0 == that.m_ncharflags );
  1095. if( that.m_fcomplement )
  1096. {
  1097. m_rgasciicustom |= ~ that.m_rgascii;
  1098. // append the inverse of that.m_rgranges to this->m_rgrangescustom
  1099. wint_t chlow = UCHAR_MAX;
  1100. typedef vector<range_type>::const_iterator VCI;
  1101. for( VCI prg = that.m_rgranges.begin(); prg != that.m_rgranges.end(); ++prg )
  1102. {
  1103. if( UCHAR_MAX + 1 != prg->first )
  1104. m_rgrangescustom.push_back( range_type( chlow + 1, prg->first - 1 ) );
  1105. chlow = prg->second;
  1106. }
  1107. if( WCHAR_MAX != chlow )
  1108. m_rgrangescustom.push_back( range_type( chlow + 1, WCHAR_MAX ) );
  1109. }
  1110. else
  1111. {
  1112. m_rgasciicustom |= that.m_rgascii;
  1113. m_rgrangescustom.insert( m_rgrangescustom.end(),
  1114. that.m_rgranges.begin(),
  1115. that.m_rgranges.end() );
  1116. }
  1117. return *this;
  1118. }
  1119. protected:
  1120. template< typename SY >
  1121. void _parse_charset( basic_string<char_type>::iterator & icur,
  1122. basic_string<char_type>::const_iterator istop,
  1123. unsigned flags, SY /*sy*/ ) throw(bad_regexpr,bad_alloc)
  1124. {
  1125. TOKEN tok;
  1126. char_type ch_prev = 0;
  1127. match_charset<CI> * pcharset;
  1128. basic_string<char_type>::iterator iprev = icur;
  1129. const bool fnocase = ( NOCASE == ( NOCASE & flags ) );
  1130. if( (basic_string<char_type>::const_iterator)icur == istop )
  1131. throw bad_regexpr("expecting end of character set");
  1132. // remember the current position and grab the next token
  1133. tok = SY::charset_token( icur, istop );
  1134. do
  1135. {
  1136. // If we reached the end of the string before finding the end of the
  1137. // character set, then this is an ill-formed regex
  1138. if( (basic_string<char_type>::const_iterator)icur == istop )
  1139. throw bad_regexpr("expecting end of character set");
  1140. if( CHARSET_RANGE == tok && ch_prev )
  1141. {
  1142. // remember the current position
  1143. basic_string<char_type>::iterator iprev2 = icur;
  1144. char_type old_ch = ch_prev;
  1145. ch_prev = 0;
  1146. // old_ch is lower bound of a range
  1147. switch( SY::charset_token( icur, istop ) )
  1148. {
  1149. case CHARSET_RANGE:
  1150. case CHARSET_NEGATE:
  1151. icur = iprev2; // un-get these tokens and fall through
  1152. case NO_TOKEN:
  1153. case CHARSET_ESCAPE: // BUGBUG user-defined charset?
  1154. _set_bit_range( old_ch, *icur++, fnocase );
  1155. continue;
  1156. case CHARSET_BACKSPACE:
  1157. _set_bit_range( old_ch, char_type(8), fnocase ); // backspace
  1158. continue;
  1159. case CHARSET_END: // fall through
  1160. default: // not a range.
  1161. icur = iprev; // backup to range token
  1162. _set_bit( old_ch, fnocase );
  1163. _set_bit( *icur++, fnocase );
  1164. continue;
  1165. }
  1166. }
  1167. if( ch_prev )
  1168. _set_bit( ch_prev, fnocase );
  1169. ch_prev = 0;
  1170. switch( tok )
  1171. {
  1172. // None of the intrinsic charsets are case-sensitive,
  1173. // so no special handling must be done when the NOCASE
  1174. // flag is set.
  1175. case CHARSET_RANGE:
  1176. case CHARSET_NEGATE:
  1177. case CHARSET_END:
  1178. icur = iprev; // un-get these tokens
  1179. ch_prev = *icur++;
  1180. continue;
  1181. case CHARSET_BACKSPACE:
  1182. ch_prev = char_type(8); // backspace
  1183. continue;
  1184. case ESC_DIGIT:
  1185. *this |= match_charset<CI>( false, get_digit_vector() );
  1186. continue;
  1187. case ESC_NOT_DIGIT:
  1188. *this |= match_charset<CI>( true, get_digit_vector() );
  1189. continue;
  1190. case ESC_SPACE:
  1191. *this |= match_charset<CI>( false, get_space_vector() );
  1192. continue;
  1193. case ESC_NOT_SPACE:
  1194. *this |= match_charset<CI>( true, get_space_vector() );
  1195. continue;
  1196. case ESC_WORD:
  1197. *this |= match_charset<CI>( false, get_word_vector() );
  1198. continue;
  1199. case ESC_NOT_WORD:
  1200. *this |= match_charset<CI>( true, get_word_vector() );
  1201. continue;
  1202. case CHARSET_ALNUM:
  1203. m_ncharflags |= (_ALPHA|_DIGIT);
  1204. continue;
  1205. case CHARSET_ALPHA:
  1206. m_ncharflags |= (_ALPHA);
  1207. continue;
  1208. case CHARSET_BLANK:
  1209. m_ncharflags |= (_BLANK);
  1210. continue;
  1211. case CHARSET_CNTRL:
  1212. m_ncharflags |= (_CONTROL);
  1213. continue;
  1214. case CHARSET_DIGIT:
  1215. m_ncharflags |= (_DIGIT);
  1216. continue;
  1217. case CHARSET_GRAPH:
  1218. m_ncharflags |= (_PUNCT|_ALPHA|_DIGIT);
  1219. continue;
  1220. case CHARSET_LOWER:
  1221. m_ncharflags |= (_LOWER);
  1222. if( NOCASE == ( NOCASE & flags ) )
  1223. m_ncharflags |= (_UPPER);
  1224. continue;
  1225. case CHARSET_PRINT:
  1226. m_ncharflags |= (_BLANK|_PUNCT|_ALPHA|_DIGIT);
  1227. continue;
  1228. case CHARSET_PUNCT:
  1229. m_ncharflags |= (_PUNCT);
  1230. continue;
  1231. case CHARSET_SPACE:
  1232. m_ncharflags |= (_SPACE);
  1233. continue;
  1234. case CHARSET_UPPER:
  1235. m_ncharflags |= (_UPPER);
  1236. if( NOCASE == ( NOCASE & flags ) )
  1237. m_ncharflags |= (_LOWER);
  1238. continue;
  1239. case CHARSET_XDIGIT:
  1240. m_ncharflags |= (_HEX);
  1241. continue;
  1242. case CHARSET_ESCAPE:
  1243. // Maybe this is a user-defined intrinsic charset
  1244. pcharset = get_altern_charset( *icur, flags, SY() );
  1245. if( NULL != pcharset )
  1246. {
  1247. *this |= *pcharset;
  1248. ++icur;
  1249. continue;
  1250. }
  1251. // else fall through
  1252. default:
  1253. ch_prev = *icur++;
  1254. continue;
  1255. }
  1256. }
  1257. while( iprev = icur, CHARSET_END != ( tok = SY::charset_token( icur, istop ) ) );
  1258. if( ch_prev )
  1259. _set_bit( ch_prev, fnocase );
  1260. }
  1261. void _optimize()
  1262. {
  1263. // this sorts on range_type.first (uses operator<() for pair templates)
  1264. sort( m_rgrangescustom.begin(), m_rgrangescustom.end() );
  1265. // This merges ranges that overlap
  1266. for( size_t index = 1; index < m_rgrangescustom.size(); )
  1267. {
  1268. if( m_rgrangescustom[index].first <= m_rgrangescustom[index-1].second + 1 )
  1269. {
  1270. m_rgrangescustom[index-1].second = max(
  1271. m_rgrangescustom[index-1].second, m_rgrangescustom[index].second );
  1272. m_rgrangescustom.erase( m_rgrangescustom.begin() + index );
  1273. }
  1274. else
  1275. ++index;
  1276. }
  1277. }
  1278. // Note overloading based on second parameter
  1279. void _set_bit( char ch, const bool fnocase ) throw()
  1280. {
  1281. if( fnocase )
  1282. {
  1283. m_rgasciicustom.set( unsigned char(tolower(ch)) );
  1284. m_rgasciicustom.set( unsigned char(toupper(ch)) );
  1285. }
  1286. else
  1287. {
  1288. m_rgasciicustom.set( unsigned char(ch) );
  1289. }
  1290. }
  1291. // Note overloading based on second parameter
  1292. void _set_bit( wint_t ch, const bool fnocase ) throw(bad_alloc)
  1293. {
  1294. if( UCHAR_MAX >= ch )
  1295. _set_bit( char(ch), fnocase );
  1296. else
  1297. m_rgrangescustom.push_back( range_type( ch, ch ) );
  1298. }
  1299. // Note overloading based on second parameter
  1300. void _set_bit_range( char ch1, char ch2, const bool fnocase ) throw(bad_regexpr)
  1301. {
  1302. if( unsigned char(ch1) > unsigned char(ch2) )
  1303. throw bad_regexpr("invalid range specified in character set");
  1304. if( fnocase )
  1305. {
  1306. // i is unsigned int to prevent overflow if ch2 is UCHAR_MAX
  1307. for( unsigned int i = unsigned char(ch1); i <= unsigned char(ch2); ++i )
  1308. {
  1309. m_rgasciicustom.set( unsigned char( toupper(i) ) );
  1310. m_rgasciicustom.set( unsigned char( tolower(i) ) );
  1311. }
  1312. }
  1313. else
  1314. {
  1315. // i is unsigned int to prevent overflow if ch2 is UCHAR_MAX
  1316. for( unsigned int i = unsigned char(ch1); i <= unsigned char(ch2); ++i )
  1317. m_rgasciicustom.set( unsigned char(i) );
  1318. }
  1319. }
  1320. // Note overloading based on second parameter
  1321. void _set_bit_range( wint_t ch1, wint_t ch2, const bool fnocase ) throw(bad_regexpr,bad_alloc)
  1322. {
  1323. if( ch1 > ch2 )
  1324. throw bad_regexpr("invalid range specified in character set");
  1325. if( UCHAR_MAX >= ch1 )
  1326. _set_bit_range( char(ch1), char( min(wint_t(UCHAR_MAX),ch2) ), fnocase );
  1327. if( UCHAR_MAX < ch2 )
  1328. m_rgrangescustom.push_back( range_type( max(wint_t(UCHAR_MAX+1),ch1), ch2 ) );
  1329. }
  1330. ascii_bitvector m_rgasciicustom;
  1331. vector<range_type> m_rgrangescustom;
  1332. };
  1333. template< typename CI >
  1334. match_charset<CI> * create_charset(
  1335. const match_charset<CI> & that,
  1336. unsigned flags )
  1337. {
  1338. switch( ( NOCASE | CSTRINGS ) & flags )
  1339. {
  1340. case 0:
  1341. return new match_charset_t<eos_t<CI>,match_range_with_case>( that );
  1342. case NOCASE:
  1343. return new match_charset_t<eos_t<CI>,match_range_no_case>( that );
  1344. case CSTRINGS:
  1345. return new match_charset_t<eocs_t<CI>,match_range_with_case>( that );
  1346. case NOCASE | CSTRINGS:
  1347. return new match_charset_t<eocs_t<CI>,match_range_no_case>( that );
  1348. default:
  1349. __assume(0); // tells the compiler that this is unreachable
  1350. }
  1351. }
  1352. template< typename EOS >
  1353. class word_assertion_t : public assertion<typename EOS::const_iterator>
  1354. {
  1355. public:
  1356. typedef EOS eos_type;
  1357. typedef typename EOS::const_iterator CI;
  1358. word_assertion_t()
  1359. : m_isword( match_charset<CI>( false, get_word_vector() ) ) {}
  1360. virtual ~word_assertion_t() {}
  1361. protected:
  1362. bos_t<CI> m_bos;
  1363. eos_type m_eos;
  1364. match_charset_t<eos_type,match_range_with_case> m_isword;
  1365. };
  1366. template< typename EOS >
  1367. class word_boundary_t : public word_assertion_t<EOS>
  1368. {
  1369. public:
  1370. word_boundary_t( const bool fisboundary )
  1371. : m_fisboundary( fisboundary ) {}
  1372. virtual ~word_boundary_t() {}
  1373. protected:
  1374. virtual bool _match_this( match_param<CI> & param, CI & icur ) const throw()
  1375. {
  1376. CI iprev = icur;
  1377. --iprev;
  1378. const bool fprevword = ! m_bos( param, icur ) && m_isword.is_in_charset( *iprev );
  1379. const bool fthisword = ! m_eos( param, icur ) && m_isword.is_in_charset( *icur );
  1380. return ( m_fisboundary == ( fprevword != fthisword ) );
  1381. }
  1382. const bool m_fisboundary;
  1383. };
  1384. template< typename EOS >
  1385. class word_start_t : public word_assertion_t<EOS>
  1386. {
  1387. public:
  1388. word_start_t() {}
  1389. virtual ~word_start_t() {}
  1390. protected:
  1391. virtual bool _match_this( match_param<CI> & param, CI & icur ) const throw()
  1392. {
  1393. CI iprev = icur;
  1394. --iprev;
  1395. const bool fprevword = ! m_bos( param, icur ) && m_isword.is_in_charset( *iprev );
  1396. const bool fthisword = ! m_eos( param, icur ) && m_isword.is_in_charset( *icur );
  1397. return ! fprevword && fthisword;
  1398. }
  1399. };
  1400. template< typename EOS >
  1401. class word_stop_t : public word_assertion_t<EOS>
  1402. {
  1403. public:
  1404. word_stop_t() {}
  1405. virtual ~word_stop_t() {}
  1406. protected:
  1407. virtual bool _match_this( match_param<CI> & param, CI & icur ) const throw()
  1408. {
  1409. CI iprev = icur;
  1410. --iprev;
  1411. const bool fprevword = ! m_bos( param, icur ) && m_isword.is_in_charset( *iprev );
  1412. const bool fthisword = ! m_eos( param, icur ) && m_isword.is_in_charset( *icur );
  1413. return fprevword && ! fthisword;
  1414. }
  1415. };
  1416. template< typename CI >
  1417. assertion<CI> * create_word_boundary( const bool fisboundary, unsigned flags )
  1418. {
  1419. switch( CSTRINGS & flags )
  1420. {
  1421. case 0:
  1422. return new word_boundary_t<eos_t<CI> >( fisboundary );
  1423. case CSTRINGS:
  1424. return new word_boundary_t<eocs_t<CI> >( fisboundary );
  1425. default:
  1426. __assume(0); // tells the compiler that this is unreachable
  1427. }
  1428. }
  1429. template< typename CI >
  1430. assertion<CI> * create_word_start( unsigned flags )
  1431. {
  1432. switch( CSTRINGS & flags )
  1433. {
  1434. case 0:
  1435. return new word_start_t<eos_t<CI> >();
  1436. case CSTRINGS:
  1437. return new word_start_t<eocs_t<CI> >();
  1438. default:
  1439. __assume(0); // tells the compiler that this is unreachable
  1440. }
  1441. }
  1442. template< typename CI >
  1443. assertion<CI> * create_word_stop( unsigned flags )
  1444. {
  1445. switch( CSTRINGS & flags )
  1446. {
  1447. case 0:
  1448. return new word_stop_t<eos_t<CI> >();
  1449. case CSTRINGS:
  1450. return new word_stop_t<eocs_t<CI> >();
  1451. default:
  1452. __assume(0); // tells the compiler that this is unreachable
  1453. }
  1454. }
  1455. template< typename CI > class group_quantifier;
  1456. template< typename CI >
  1457. class match_group : public sub_expr<CI>
  1458. {
  1459. public:
  1460. friend class group_quantifier<CI>;
  1461. match_group( size_t cgroup )
  1462. : m_rgalternates(), m_cgroup( cgroup ),
  1463. m_pptail(NULL), m_end_group( this ), m_nwidth(uninit_width) {}
  1464. virtual ~match_group() {}
  1465. virtual bool domatch( match_param<CI> & param, CI icur ) const throw()
  1466. {
  1467. CI old_istart;
  1468. if( -1 != m_cgroup ) // could be -1 if this is a lookahead_assertion
  1469. {
  1470. old_istart = (*param.prgbackrefs)[ m_cgroup ].first;
  1471. (*param.prgbackrefs)[ m_cgroup ].first = icur;
  1472. }
  1473. typedef vector<sub_expr<CI>*>::const_iterator VCI;
  1474. for( VCI ialt = m_rgalternates.begin(); ialt != m_rgalternates.end(); ++ialt )
  1475. {
  1476. if( (*ialt)->domatch( param, icur ) )
  1477. return true;
  1478. }
  1479. if( -1 != m_cgroup )
  1480. (*param.prgbackrefs)[ m_cgroup ].first = old_istart;
  1481. return false;
  1482. }
  1483. virtual void _delete()
  1484. {
  1485. typedef vector<sub_expr<CI>*>::iterator VI;
  1486. for( VI ialt = m_rgalternates.begin(); ialt != m_rgalternates.end(); ++ialt )
  1487. delete_sub_expr( *ialt );
  1488. sub_expr<CI>::_delete();
  1489. }
  1490. size_t group_number() const
  1491. {
  1492. return m_cgroup;
  1493. }
  1494. void group_number( size_t cgroup )
  1495. {
  1496. m_cgroup = cgroup;
  1497. }
  1498. void add_item( sub_expr<CI> * pitem )
  1499. {
  1500. *m_pptail = pitem;
  1501. m_pptail = & pitem->next();
  1502. }
  1503. void add_alternate()
  1504. {
  1505. m_rgalternates.push_back( NULL );
  1506. m_pptail = & m_rgalternates.back();
  1507. }
  1508. void end_alternate()
  1509. {
  1510. *m_pptail = & m_end_group;
  1511. }
  1512. size_t calternates() const
  1513. {
  1514. return m_rgalternates.size();
  1515. }
  1516. width_type group_width()
  1517. {
  1518. (void) match_group<CI>::_width_this();
  1519. return m_nwidth;
  1520. }
  1521. protected:
  1522. virtual bool _call_back( match_param<CI> & param, CI icur ) const throw()
  1523. {
  1524. CI old_iend;
  1525. if( -1 != m_cgroup )
  1526. {
  1527. old_iend = (*param.prgbackrefs)[ m_cgroup ].second;
  1528. (*param.prgbackrefs)[ m_cgroup ].second = icur;
  1529. }
  1530. if( match_next( param, icur ) )
  1531. return true;
  1532. if( -1 != m_cgroup )
  1533. (*param.prgbackrefs)[ m_cgroup ].second = old_iend;
  1534. return false;
  1535. }
  1536. virtual width_type _width_this() throw()
  1537. {
  1538. typedef vector<sub_expr<CI>*>::const_iterator VCI;
  1539. if( uninit_width == m_nwidth )
  1540. {
  1541. m_nwidth = width_type(-1,0);
  1542. for( VCI ialt = m_rgalternates.begin(); worst_width != m_nwidth && ialt != m_rgalternates.end(); ++ialt )
  1543. {
  1544. width_type temp_width = (*ialt)->get_width();
  1545. m_nwidth.m_min = min( m_nwidth.m_min, temp_width.m_min );
  1546. m_nwidth.m_max = max( m_nwidth.m_max, temp_width.m_max );
  1547. }
  1548. }
  1549. return m_nwidth;
  1550. }
  1551. class end_group;
  1552. friend class end_group;
  1553. class end_group : public sub_expr<CI>
  1554. {
  1555. void * operator new( size_t );
  1556. public:
  1557. end_group( match_group * pgroup )
  1558. : m_pgroup( pgroup ) {}
  1559. virtual ~end_group() {}
  1560. virtual void _delete() {} // don't delete this, because it was never alloc'ed
  1561. virtual bool domatch( match_param<CI> & param, CI icur ) const throw()
  1562. {
  1563. return m_pgroup->_call_back( param, icur );
  1564. }
  1565. protected:
  1566. // since m_pnext is always NULL for end_groups, get_width() stops recursing here
  1567. virtual width_type _width_this() throw()
  1568. {
  1569. return width_type(0,0);
  1570. }
  1571. match_group<CI> * m_pgroup;
  1572. };
  1573. vector<sub_expr<CI>*> m_rgalternates;
  1574. sub_expr<CI> ** m_pptail; // only used when adding elements
  1575. size_t m_cgroup;
  1576. end_group m_end_group;
  1577. width_type m_nwidth;
  1578. };
  1579. // Behaves like a lookahead assertion if m_cgroup is -1, or like
  1580. // an independent group otherwise.
  1581. template< typename CI >
  1582. class independent_group : public match_group<CI>
  1583. {
  1584. public:
  1585. independent_group()
  1586. : match_group<CI>( -1 ), m_fexpected(true) {}
  1587. virtual ~independent_group() {}
  1588. virtual bool domatch( match_param<CI> & param, CI icur ) const throw()
  1589. {
  1590. // Copy the entire backref vector onto the stack
  1591. backref_tag<CI> * prgbr = (backref_tag<CI>*)_alloca( param.prgbackrefs->size() * sizeof backref_tag<CI> );
  1592. copy( param.prgbackrefs->begin(), param.prgbackrefs->end(),
  1593. raw_storage_iterator<backref_tag<CI>*,backref_tag<CI> >(prgbr) );
  1594. // Match until the end of this group and then return
  1595. const bool fdomatch = match_group<CI>::domatch( param, icur );
  1596. if( m_fexpected == fdomatch )
  1597. {
  1598. // If m_cgroup != 1, then this is not a zero-width assertion.
  1599. if( -1 != m_cgroup )
  1600. icur = (*param.prgbackrefs)[ m_cgroup ].second;
  1601. if( match_next( param, icur ) )
  1602. return true;
  1603. }
  1604. // if match_group::domatch returned true, the backrefs must be restored
  1605. if( fdomatch )
  1606. copy( prgbr, prgbr + param.prgbackrefs->size(), param.prgbackrefs->begin() );
  1607. return false;
  1608. }
  1609. protected:
  1610. independent_group( const bool fexpected )
  1611. : match_group<CI>( -1 ), m_fexpected(fexpected) {}
  1612. virtual bool _call_back( match_param<CI> & param, CI icur ) const throw()
  1613. {
  1614. if( -1 != m_cgroup )
  1615. (*param.prgbackrefs)[ m_cgroup ].second = icur;
  1616. return true;
  1617. }
  1618. const bool m_fexpected;
  1619. };
  1620. template< typename CI >
  1621. class lookahead_assertion : public independent_group<CI>
  1622. {
  1623. public:
  1624. lookahead_assertion( const bool fexpected )
  1625. : independent_group<CI>( fexpected ) {}
  1626. virtual ~lookahead_assertion() {}
  1627. virtual bool is_assertion() const throw() { return true; }
  1628. protected:
  1629. virtual width_type _width_this() throw() { return width_type(0,0); }
  1630. };
  1631. template< typename CI >
  1632. class lookbehind_assertion : public independent_group<CI>
  1633. {
  1634. public:
  1635. lookbehind_assertion( const bool fexpected )
  1636. : independent_group<CI>( fexpected ) {}
  1637. virtual ~lookbehind_assertion() {}
  1638. virtual bool domatch( match_param<CI> & param, CI icur ) const throw()
  1639. {
  1640. // This is the room in the string from the start to the current position
  1641. size_t room = distance( param.ibegin, icur );
  1642. // If we don't have enough room to match the lookbehind, the match fails.
  1643. // If we wanted the match to fail, try to match the rest of the pattern.
  1644. if( m_nwidth.m_min > room )
  1645. return m_fexpected ? false : match_next( param, icur );
  1646. // Copy the entire backref vector onto the stack
  1647. backref_tag<CI> * prgbr = (backref_tag<CI>*)_alloca( param.prgbackrefs->size() * sizeof backref_tag<CI> );
  1648. copy( param.prgbackrefs->begin(), param.prgbackrefs->end(),
  1649. raw_storage_iterator<backref_tag<CI>*,backref_tag<CI> >(prgbr) );
  1650. CI local_istart = icur;
  1651. advance( local_istart, -int( min( m_nwidth.m_max, room ) ) );
  1652. CI local_istop = icur;
  1653. advance( local_istop, -int( m_nwidth.m_min ) );
  1654. // Create a local param struct that has icur as param.iend
  1655. match_param<CI> local_param(param.ibegin,param.istart,icur,param.prgbackrefs);
  1656. // Find the rightmost match that ends at icur.
  1657. for( CI local_icur = local_istart; local_icur <= local_istop; ++local_icur )
  1658. {
  1659. // Match until the end of this group and then return
  1660. const bool fmatched = match_group<CI>::domatch( local_param, local_icur );
  1661. // If the match results were what we were expecting, try to match the
  1662. // rest of the pattern. If that succeeds, return true.
  1663. if( m_fexpected == fmatched && match_next( param, icur ) )
  1664. return true;
  1665. // if match_group::domatch returned true, the backrefs must be restored
  1666. if( fmatched )
  1667. {
  1668. copy( prgbr, prgbr + param.prgbackrefs->size(), param.prgbackrefs->begin() );
  1669. // Match succeeded. If this is a negative lookbehind, we didn't want it
  1670. // to succeed, so return false.
  1671. if( ! m_fexpected )
  1672. return false;
  1673. }
  1674. }
  1675. // No variation of the lookbehind was satisfied in a way that permited
  1676. // the rest of the pattern to match successfully, so return false.
  1677. return false;
  1678. }
  1679. virtual bool is_assertion() const throw() { return true; }
  1680. protected:
  1681. virtual bool _call_back( match_param<CI> & param, CI icur ) const throw()
  1682. {
  1683. return param.istop == icur;
  1684. }
  1685. virtual width_type _width_this() throw() { return width_type(0,0); }
  1686. };
  1687. // Corresponds to the (?:foo) extension, which has grouping semantics, but
  1688. // does not store any backref information.
  1689. template< typename CI >
  1690. class group_nobackref : public match_group<CI>
  1691. {
  1692. public:
  1693. group_nobackref( )
  1694. : match_group( -1 ) {} // will be assigned a group number in basic_rpattern::basic_rpattern()
  1695. virtual ~group_nobackref() {}
  1696. };
  1697. template< typename CI >
  1698. class match_wrapper : public sub_expr<CI>
  1699. {
  1700. public:
  1701. match_wrapper( sub_expr<CI> * psub )
  1702. : m_psub(psub) {}
  1703. virtual ~match_wrapper() {}
  1704. virtual void _delete()
  1705. {
  1706. delete_sub_expr( m_psub );
  1707. sub_expr<CI>::_delete();
  1708. }
  1709. protected:
  1710. bool _wrapped_match_this( match_param<CI> & param, CI & icur ) const throw()
  1711. {
  1712. return m_psub->_match_this( param, icur );
  1713. }
  1714. virtual width_type _width_this() throw()
  1715. {
  1716. return m_psub->_width_this();
  1717. }
  1718. sub_expr<CI> * m_psub;
  1719. };
  1720. template< typename CI >
  1721. class match_quantifier : public match_wrapper<CI>
  1722. {
  1723. public:
  1724. match_quantifier( sub_expr<CI> * psub, size_t lbound, size_t ubound )
  1725. : match_wrapper<CI>( psub ), m_lbound(lbound), m_ubound(ubound) {}
  1726. virtual ~match_quantifier() {}
  1727. protected:
  1728. virtual width_type _width_this() throw()
  1729. {
  1730. width_type this_width = match_wrapper<CI>::_width_this();
  1731. return this_width * width_type( m_lbound, m_ubound );
  1732. }
  1733. const size_t m_lbound;
  1734. const size_t m_ubound;
  1735. };
  1736. template< typename CI >
  1737. class max_atom_quantifier : public match_quantifier<CI>
  1738. {
  1739. public:
  1740. max_atom_quantifier( sub_expr<CI> * psub, size_t lbound, size_t ubound )
  1741. : match_quantifier<CI>( psub, lbound, ubound ) {}
  1742. virtual ~max_atom_quantifier() {}
  1743. virtual bool domatch( match_param<CI> & param, CI icur ) const throw()
  1744. {
  1745. size_t cmatches = 0;
  1746. int cdiff = 0; // must be a signed quantity for advance() below
  1747. if( cmatches < m_ubound )
  1748. {
  1749. CI istart = icur;
  1750. if( _wrapped_match_this( param, icur ) )
  1751. {
  1752. ++cmatches;
  1753. cdiff = distance( istart, icur );
  1754. if( 0 == cdiff )
  1755. return ( match_next( param, icur ) );
  1756. while( cmatches < m_ubound &&
  1757. _wrapped_match_this( param, icur ) )
  1758. {
  1759. ++cmatches;
  1760. }
  1761. }
  1762. }
  1763. if( cmatches >= m_lbound )
  1764. {
  1765. if( ! next() )
  1766. return true;
  1767. for(;;)
  1768. {
  1769. if( next()->domatch( param, icur ) )
  1770. return true;
  1771. if( cmatches-- <= m_lbound )
  1772. break;
  1773. advance( icur, -cdiff );
  1774. }
  1775. }
  1776. return false;
  1777. }
  1778. };
  1779. template< typename CI >
  1780. class min_atom_quantifier : public match_quantifier<CI>
  1781. {
  1782. public:
  1783. min_atom_quantifier( sub_expr<CI> * psub, size_t lbound, size_t ubound )
  1784. : match_quantifier<CI>( psub, lbound, ubound ) {}
  1785. virtual ~min_atom_quantifier() {}
  1786. virtual bool domatch( match_param<CI> & param, CI icur ) const throw()
  1787. {
  1788. size_t cmatches = 0;
  1789. bool fsuccess = true;
  1790. CI icur_tmp = icur;
  1791. if( _wrapped_match_this( param, icur_tmp ) )
  1792. {
  1793. if( icur_tmp == icur )
  1794. return ( match_next( param, icur ) );
  1795. if( m_lbound )
  1796. {
  1797. icur = icur_tmp;
  1798. ++cmatches;
  1799. }
  1800. while( ( cmatches < m_lbound ) &&
  1801. ( fsuccess = _wrapped_match_this( param, icur ) ) )
  1802. {
  1803. ++cmatches;
  1804. }
  1805. }
  1806. else
  1807. {
  1808. fsuccess = ! m_lbound;
  1809. }
  1810. if( fsuccess && next() )
  1811. {
  1812. do
  1813. {
  1814. if( next()->domatch( param, icur ) )
  1815. break;
  1816. } while( fsuccess = ( cmatches++ < m_ubound &&
  1817. _wrapped_match_this( param, icur ) ) );
  1818. }
  1819. return fsuccess;
  1820. }
  1821. };
  1822. template< typename CI >
  1823. class group_quantifier : public match_quantifier<CI>
  1824. {
  1825. public:
  1826. group_quantifier( match_group<CI> * psub, size_t lbound, size_t ubound )
  1827. : match_quantifier<CI>( psub, lbound, ubound ),
  1828. m_group( *psub ), m_end_quantifier( this )
  1829. {
  1830. psub->next() = & m_end_quantifier;
  1831. }
  1832. virtual ~group_quantifier() {}
  1833. virtual bool domatch( match_param<CI> & param, CI icur ) const throw()
  1834. {
  1835. // group_number is only -1 for assertions, which can't be quantified
  1836. assert( -1 != group_number() );
  1837. backref_tag<CI> & br = (*param.prgbackrefs)[ group_number() ];
  1838. backref_tag<CI> old_backref = br;
  1839. br = backref_tag<CI>( icur, icur ); // sets cmatches (reserved) to 0
  1840. if( _recurse( param, icur ) )
  1841. return true;
  1842. br = old_backref;
  1843. return false;
  1844. }
  1845. protected:
  1846. class end_quantifier;
  1847. friend class end_quantifier;
  1848. class end_quantifier : public sub_expr<CI>
  1849. {
  1850. void * operator new( size_t );
  1851. public:
  1852. end_quantifier( group_quantifier<CI> * pquant )
  1853. : m_pquant( pquant ) {}
  1854. virtual ~end_quantifier() {}
  1855. virtual void _delete() {} // don't delete this, since it wasn't alloc'ed
  1856. virtual bool domatch( match_param<CI> & param, CI icur ) const throw()
  1857. {
  1858. // group_number is only -1 for assertions, which can't be quantified
  1859. assert( -1 != m_pquant->group_number() );
  1860. // handle special the case where a group matches 0 characters
  1861. backref_tag<CI> & br = (*param.prgbackrefs)[ m_pquant->group_number() ];
  1862. if( icur == br.first )
  1863. {
  1864. size_t old_cmatches = br.reserved;
  1865. br.reserved = m_pquant->m_ubound;
  1866. if( m_pquant->_recurse( param, icur ) )
  1867. return true;
  1868. br.reserved = old_cmatches;
  1869. return false;
  1870. }
  1871. return m_pquant->_recurse( param, icur );
  1872. }
  1873. protected:
  1874. virtual width_type _width_this() throw() { return width_type(0,0); }
  1875. group_quantifier<CI> * m_pquant;
  1876. };
  1877. size_t group_number() const
  1878. {
  1879. return m_group.group_number();
  1880. }
  1881. size_t & cmatches( match_param<CI> & param ) const
  1882. {
  1883. return (*param.prgbackrefs)[ group_number() ].reserved;
  1884. }
  1885. virtual bool _recurse( match_param<CI> & param, CI icur ) const throw() = 0;
  1886. match_group<CI> & m_group;
  1887. end_quantifier m_end_quantifier;
  1888. };
  1889. template< typename CI >
  1890. class max_group_quantifier : public group_quantifier<CI>
  1891. {
  1892. public:
  1893. max_group_quantifier( match_group<CI> * psub, size_t lbound, size_t ubound )
  1894. : group_quantifier<CI>( psub, lbound, ubound ) {}
  1895. virtual ~max_group_quantifier() {}
  1896. protected:
  1897. virtual bool _recurse( match_param<CI> & param, CI icur ) const throw()
  1898. {
  1899. if( m_ubound == cmatches( param ) )
  1900. return match_next( param, icur );
  1901. ++cmatches( param );
  1902. if( m_psub->domatch( param, icur ) )
  1903. return true;
  1904. if( --cmatches( param ) < m_lbound )
  1905. return false;
  1906. return match_next( param, icur );
  1907. }
  1908. };
  1909. template< typename CI >
  1910. class min_group_quantifier : public group_quantifier<CI>
  1911. {
  1912. public:
  1913. min_group_quantifier( match_group<CI> * psub, size_t lbound, size_t ubound )
  1914. : group_quantifier<CI>( psub, lbound, ubound ) {}
  1915. virtual ~min_group_quantifier() {}
  1916. protected:
  1917. virtual bool _recurse( match_param<CI> & param, CI icur ) const throw()
  1918. {
  1919. if( m_lbound > cmatches( param ) )
  1920. {
  1921. ++cmatches( param );
  1922. return m_psub->domatch( param, icur );
  1923. }
  1924. if( match_next( param, icur ) )
  1925. return true;
  1926. if( cmatches( param )++ == m_ubound )
  1927. return false;
  1928. return m_psub->domatch( param, icur );
  1929. }
  1930. };
  1931. template< typename CI >
  1932. class match_backref : public sub_expr<CI>
  1933. {
  1934. public:
  1935. match_backref( size_t cbackref, const width_type & group_width )
  1936. : m_cbackref( cbackref ), m_nwidth(group_width) {}
  1937. virtual ~match_backref() {}
  1938. protected:
  1939. // Return the width specifications of the group to which this backref refers
  1940. virtual width_type _width_this() throw() { return m_nwidth; }
  1941. const size_t m_cbackref;
  1942. const width_type m_nwidth;
  1943. };
  1944. template< typename CMP, typename EOS >
  1945. class match_backref_t : public match_backref<typename EOS::const_iterator>
  1946. {
  1947. public:
  1948. typedef CMP cmp_type;
  1949. typedef EOS eos_type;
  1950. typedef typename EOS::const_iterator CI;
  1951. match_backref_t( size_t cbackref, const width_type & group_width )
  1952. : match_backref<CI>( cbackref, group_width ) {}
  1953. virtual ~match_backref_t() {}
  1954. protected:
  1955. virtual bool _match_this( match_param<CI> & param, CI & icur ) const throw()
  1956. {
  1957. CI ithis = (*param.prgbackrefs)[ m_cbackref ].first;
  1958. CI istop = (*param.prgbackrefs)[ m_cbackref ].second;
  1959. CI icur_tmp = icur;
  1960. // Don't match a backref that hasn't match anything
  1961. if( ! (*param.prgbackrefs)[ m_cbackref ] )
  1962. return false;
  1963. for( ; ithis != istop; ++icur_tmp, ++ithis )
  1964. {
  1965. if( m_eos( param, icur_tmp ) || m_cmp( *icur_tmp, *ithis ) )
  1966. return false;
  1967. }
  1968. icur = icur_tmp;
  1969. return true;
  1970. }
  1971. cmp_type m_cmp;
  1972. eos_type m_eos;
  1973. };
  1974. template< typename CI >
  1975. match_backref<CI> * create_backref(
  1976. size_t cbackref,
  1977. const width_type & group_width,
  1978. unsigned flags )
  1979. {
  1980. typedef typename iterator_traits<CI>::value_type char_type;
  1981. switch( ( NOCASE | CSTRINGS ) & flags )
  1982. {
  1983. case 0:
  1984. return new match_backref_t<ch_neq_t<char_type>,eos_t<CI> >( cbackref, group_width );
  1985. case NOCASE:
  1986. return new match_backref_t<ch_neq_nocase_t<char_type>,eos_t<CI> >( cbackref, group_width );
  1987. case CSTRINGS:
  1988. return new match_backref_t<ch_neq_t<char_type>,eocs_t<CI> >( cbackref, group_width );
  1989. case NOCASE | CSTRINGS:
  1990. return new match_backref_t<ch_neq_nocase_t<char_type>,eocs_t<CI> >( cbackref, group_width );
  1991. default:
  1992. __assume(0); // tells the compiler that this is unreachable
  1993. }
  1994. }
  1995. // Replace some escape sequences with the actual characters
  1996. // they represent
  1997. template< typename CI >
  1998. void basic_rpattern_base<CI>::_normalize_string( basic_string<basic_rpattern_base<CI>::char_type> & str )
  1999. {
  2000. size_t i = 0;
  2001. // Don't do pattern normalization if the user didn't ask for it.
  2002. if( NORMALIZE != ( NORMALIZE & m_flags ) )
  2003. return;
  2004. while( basic_string<char_type>::npos != ( i = str.find( char_type('\\'), i ) ) )
  2005. {
  2006. if( str.size() - 1 == i )
  2007. return;
  2008. switch( str[i+1] )
  2009. {
  2010. case char_type('f'):
  2011. str.replace( i, 2, 1, char_type('\f') );
  2012. break;
  2013. case char_type('n'):
  2014. str.replace( i, 2, 1, char_type('\n') );
  2015. break;
  2016. case char_type('r'):
  2017. str.replace( i, 2, 1, char_type('\r') );
  2018. break;
  2019. case char_type('t'):
  2020. str.replace( i, 2, 1, char_type('\t') );
  2021. break;
  2022. case char_type('v'):
  2023. str.replace( i, 2, 1, char_type('\v') );
  2024. break;
  2025. case char_type('\\'):
  2026. str.replace( i, 2, 1, char_type('\\') );
  2027. break;
  2028. default:
  2029. ++i;
  2030. break;
  2031. }
  2032. ++i;
  2033. if( str.size() <= i )
  2034. return;
  2035. }
  2036. }
  2037. //
  2038. // Implementation of basic_rpattern:
  2039. //
  2040. template< typename CI, typename SY >
  2041. basic_rpattern<CI,SY>::basic_rpattern() throw()
  2042. : basic_rpattern_base<CI>( 0 )
  2043. {
  2044. }
  2045. template< typename CI, typename SY >
  2046. basic_rpattern<CI,SY>::basic_rpattern(
  2047. const basic_string<basic_rpattern<CI,SY>::char_type> & pat,
  2048. unsigned flags ) throw(bad_regexpr,bad_alloc)
  2049. : basic_rpattern_base<CI>( flags, pat )
  2050. {
  2051. push_new_handler pnh( &my_new_handler );
  2052. _normalize_string( m_pat );
  2053. _common_init( flags );
  2054. }
  2055. template< typename CI, typename SY >
  2056. basic_rpattern<CI,SY>::basic_rpattern(
  2057. const basic_string<basic_rpattern<CI,SY>::char_type> & pat,
  2058. const basic_string<basic_rpattern<CI,SY>::char_type> & subst,
  2059. unsigned flags ) throw(bad_regexpr,bad_alloc)
  2060. : basic_rpattern_base<CI>( flags, pat, subst )
  2061. {
  2062. push_new_handler pnh( &my_new_handler );
  2063. _normalize_string( m_pat );
  2064. _common_init( flags );
  2065. _normalize_string( m_subst );
  2066. _parse_subst(); // must come after _common_init
  2067. }
  2068. template< typename CI, typename SY >
  2069. void basic_rpattern<CI,SY>::init(
  2070. const basic_string<basic_rpattern<CI,SY>::char_type> & pat,
  2071. unsigned flags ) throw(bad_regexpr,bad_alloc)
  2072. {
  2073. push_new_handler pnh( &my_new_handler );
  2074. _reset();
  2075. m_flags = flags;
  2076. m_pat = pat;
  2077. _normalize_string( m_pat );
  2078. _common_init( m_flags );
  2079. }
  2080. template< typename CI, typename SY >
  2081. void basic_rpattern<CI,SY>::init(
  2082. const basic_string<basic_rpattern<CI,SY>::char_type> & pat,
  2083. const basic_string<basic_rpattern<CI,SY>::char_type> & subst,
  2084. unsigned flags ) throw(bad_regexpr,bad_alloc)
  2085. {
  2086. push_new_handler pnh( &my_new_handler );
  2087. _reset();
  2088. m_flags = flags;
  2089. m_pat = pat;
  2090. m_subst = subst;
  2091. _normalize_string( m_pat );
  2092. _common_init( m_flags );
  2093. _normalize_string( m_subst );
  2094. _parse_subst(); // must come after _common_init
  2095. }
  2096. template< typename CI, typename SY >
  2097. void basic_rpattern<CI,SY>::_common_init( unsigned flags )
  2098. {
  2099. m_cgroups = 0;
  2100. vector<match_group<CI>*> rggroups;
  2101. basic_string<char_type>::iterator ipat = m_pat.begin();
  2102. match_group<CI> * pgroup = _find_next_group( ipat, flags, rggroups );
  2103. m_pfirst = pgroup;
  2104. m_nwidth = pgroup->group_width();
  2105. // Number the invisible groups
  2106. m_cgroups_visible = m_cgroups;
  2107. while( ! m_invisible_groups.empty() )
  2108. {
  2109. m_invisible_groups.front()->group_number( _get_next_group_nbr() );
  2110. m_invisible_groups.pop_front();
  2111. }
  2112. //
  2113. // determine if we can get away with only calling m_pfirst->domatch only once
  2114. //
  2115. m_floop = true;
  2116. // Optimization: if first character of pattern string is '^'
  2117. // and we are not doing a multiline match, then we only
  2118. // need to try domatch once
  2119. basic_string<char_type>::iterator icur = m_pat.begin();
  2120. if( MULTILINE != ( MULTILINE & m_flags ) &&
  2121. 1 == pgroup->calternates() &&
  2122. icur != m_pat.end() &&
  2123. BEGIN_LINE == SY::reg_token( icur, m_pat.end() ) )
  2124. {
  2125. m_flags &= ~RIGHTMOST;
  2126. m_floop = false;
  2127. }
  2128. // Optimization: if first 2 characters of pattern string are ".*" or ".+",
  2129. // then we only need to try domatch once
  2130. icur = m_pat.begin();
  2131. if( RIGHTMOST != ( RIGHTMOST & m_flags ) &&
  2132. SINGLELINE == ( SINGLELINE & m_flags ) &&
  2133. 1 == pgroup->calternates() &&
  2134. icur != m_pat.end() &&
  2135. MATCH_ANY == SY::reg_token( icur, m_pat.end() ) &&
  2136. icur != m_pat.end() )
  2137. {
  2138. switch( SY::quant_token( icur, m_pat.end() ) )
  2139. {
  2140. case ONE_OR_MORE:
  2141. case ZERO_OR_MORE:
  2142. case ONE_OR_MORE_MIN:
  2143. case ZERO_OR_MORE_MIN:
  2144. m_floop = false;
  2145. }
  2146. }
  2147. }
  2148. template< typename CI, typename SY >
  2149. void basic_rpattern<CI,SY>::_reset() throw()
  2150. {
  2151. basic_rpattern_base<CI>::_reset();
  2152. m_cgroups = m_cgroups_visible = 0;
  2153. m_floop = true;
  2154. m_subst.erase();
  2155. m_pat.erase();
  2156. m_pfirst.free_ptr();
  2157. m_nwidth = uninit_width;
  2158. m_subst_list.clear();
  2159. m_invisible_groups.clear();
  2160. }
  2161. template< typename CI, typename SY >
  2162. void basic_rpattern<CI,SY>::set_flags( unsigned flags ) throw(bad_regexpr,bad_alloc)
  2163. {
  2164. push_new_handler pnh( &my_new_handler );
  2165. m_pfirst.free_ptr();
  2166. m_flags = flags;
  2167. _common_init( m_flags );
  2168. }
  2169. template< typename CI, typename SY >
  2170. void basic_rpattern<CI,SY>::set_substitution( const basic_string<basic_rpattern<CI,SY>::char_type> & subst )
  2171. {
  2172. push_new_handler pnh( &my_new_handler );
  2173. m_subst_list.clear();
  2174. m_subst = subst;
  2175. _normalize_string( m_subst );
  2176. _parse_subst();
  2177. }
  2178. template< typename CI, typename SY >
  2179. match_group<CI> * basic_rpattern<CI,SY>::_find_next_group(
  2180. basic_string<basic_rpattern<CI,SY>::char_type>::iterator & ipat,
  2181. unsigned & flags,
  2182. vector<match_group<CI>*> & rggroups )
  2183. {
  2184. auto_sub_ptr<match_group<CI> > pgroup;
  2185. basic_string<char_type>::iterator itemp = ipat;
  2186. unsigned old_flags = flags;
  2187. TOKEN tok;
  2188. // Look for group extensions. (This could change the value of the flags variable.)
  2189. if( ipat != m_pat.end() && NO_TOKEN != ( tok = SY::ext_token( ipat, m_pat.end(), flags ) ) )
  2190. {
  2191. if( itemp == m_pat.begin() || ipat == m_pat.end() )
  2192. throw bad_regexpr("ill-formed regular expression");
  2193. // Don't process empty groups
  2194. if( END_GROUP != SY::reg_token( itemp = ipat, m_pat.end() ) )
  2195. {
  2196. switch( tok )
  2197. {
  2198. case EXT_NOBACKREF:
  2199. // invisible groups are numbered only after all
  2200. // visible groups have been numbererd
  2201. pgroup = new match_group<CI>( -1 );
  2202. m_invisible_groups.push_back( pgroup.get() );
  2203. break;
  2204. case EXT_INDEPENDENT:
  2205. pgroup = new independent_group<CI>();
  2206. m_invisible_groups.push_back( pgroup.get() );
  2207. break;
  2208. case EXT_POS_LOOKAHEAD:
  2209. pgroup = new lookahead_assertion<CI>( true );
  2210. break;
  2211. case EXT_NEG_LOOKAHEAD:
  2212. pgroup = new lookahead_assertion<CI>( false );
  2213. break;
  2214. case EXT_POS_LOOKBEHIND:
  2215. // For look-behind assertions, turn off the CSTRINGs optimization
  2216. flags &= ~CSTRINGS;
  2217. pgroup = new lookbehind_assertion<CI>( true );
  2218. break;
  2219. case EXT_NEG_LOOKBEHIND:
  2220. // For look-behind assertions, turn off the CSTRINGs optimization
  2221. flags &= ~CSTRINGS;
  2222. pgroup = new lookbehind_assertion<CI>( false );
  2223. break;
  2224. default:
  2225. throw bad_regexpr("bad extension sequence");
  2226. }
  2227. }
  2228. else
  2229. {
  2230. // Skip over the END_GROUP token
  2231. ipat = itemp;
  2232. }
  2233. }
  2234. else
  2235. {
  2236. pgroup = new match_group<CI>( _get_next_group_nbr() );
  2237. }
  2238. if( NULL != pgroup.get() )
  2239. {
  2240. pgroup->add_alternate();
  2241. while( _find_next( ipat, pgroup.get(), flags, rggroups ) );
  2242. pgroup->end_alternate();
  2243. // Add this group to the rggroups array
  2244. if( -1 != pgroup->group_number() )
  2245. {
  2246. if( pgroup->group_number() >= rggroups.size() )
  2247. rggroups.resize( pgroup->group_number() + 1, NULL );
  2248. rggroups[ pgroup->group_number() ] = pgroup.get();
  2249. }
  2250. // The group should calculate its own width now and
  2251. // save the result for later.
  2252. pgroup->group_width();
  2253. // If this is not a pattern modifier, restore the
  2254. // flags to their previous settings. This causes
  2255. // pattern modifiers to have the scope of their
  2256. // enclosing group.
  2257. flags = old_flags;
  2258. }
  2259. return pgroup.release();
  2260. }
  2261. //
  2262. // Read ahead through the pattern and treat sequential atoms
  2263. // as a single atom, making sure to handle quantification
  2264. // correctly. Warning: dense code ahead.
  2265. //
  2266. template< typename CI, typename SY >
  2267. void basic_rpattern<CI,SY>::_find_atom(
  2268. basic_string<basic_rpattern<CI,SY>::char_type>::iterator & ipat,
  2269. match_group<CI> * pgroup,
  2270. unsigned flags )
  2271. {
  2272. basic_string<char_type>::iterator itemp = ipat, istart = ipat;
  2273. do
  2274. {
  2275. switch( SY::quant_token( itemp, m_pat.end() ) )
  2276. {
  2277. // if {,} can't be interpreted as quantifiers, treat them as regular chars
  2278. case BEGIN_RANGE:
  2279. if( istart != ipat ) // treat as a quantifier
  2280. goto quantify;
  2281. case NO_TOKEN:
  2282. case END_RANGE:
  2283. case END_RANGE_MIN:
  2284. case RANGE_SEPARATOR:
  2285. break;
  2286. default:
  2287. if( istart == ipat ) // must be able to quantify something.
  2288. throw bad_regexpr("quantifier not expected");
  2289. quantify: if( istart != --ipat )
  2290. pgroup->add_item( create_atom<CI>( istart, ipat, flags ) );
  2291. auto_sub_ptr<sub_expr<CI> > pnew( create_atom<CI>( ipat++, flags ) );
  2292. _quantify( pnew, NULL, ipat );
  2293. pgroup->add_item( pnew.release() );
  2294. return;
  2295. }
  2296. } while( m_pat.end() != ++ipat && ! SY::reg_token( itemp = ipat, m_pat.end() ) );
  2297. assert( ipat != istart );
  2298. pgroup->add_item( create_atom<CI>( istart, ipat, flags ) );
  2299. }
  2300. template< typename CI, typename SY >
  2301. bool basic_rpattern<CI,SY>::_find_next(
  2302. basic_string<basic_rpattern<CI,SY>::char_type>::iterator & ipat,
  2303. match_group<CI> * pgroup,
  2304. unsigned & flags,
  2305. vector<match_group<CI>*> & rggroups )
  2306. {
  2307. match_group<CI> * pnew_group = NULL;
  2308. auto_sub_ptr<sub_expr<CI> > pnew;
  2309. basic_string<char_type>::iterator istart, itemp;
  2310. bool fdone;
  2311. if( ipat == m_pat.end() )
  2312. {
  2313. if( 0 != pgroup->group_number() )
  2314. throw bad_regexpr( "mismatched parenthesis" );
  2315. return false;
  2316. }
  2317. switch( SY::reg_token( ipat, m_pat.end() ) )
  2318. {
  2319. case NO_TOKEN: // not a token. Must be an atom
  2320. _find_atom( ipat, pgroup, flags );
  2321. return true;
  2322. case END_GROUP:
  2323. if( 0 == pgroup->group_number() )
  2324. throw bad_regexpr( "mismatched parenthesis" );
  2325. return false;
  2326. case ALTERNATION:
  2327. pgroup->end_alternate();
  2328. pgroup->add_alternate();
  2329. return true;
  2330. case BEGIN_GROUP:
  2331. // Find next group could return NULL if the group is really
  2332. // a pattern modifier, like: (?s-i)
  2333. pnew = pnew_group = _find_next_group( ipat, flags, rggroups );
  2334. break;
  2335. case BEGIN_LINE:
  2336. pnew = create_bol<CI>( flags );
  2337. break;
  2338. case END_LINE:
  2339. pnew = create_eol<CI>( flags );
  2340. break;
  2341. case BEGIN_CHARSET:
  2342. pnew = create_charset_helper<CI,SY>::create_charset_aux( m_pat, ipat, flags );
  2343. break;
  2344. case MATCH_ANY:
  2345. pnew = create_any<CI>( flags );
  2346. break;
  2347. case ESC_WORD_BOUNDARY:
  2348. pnew = create_word_boundary<CI>( true, flags );
  2349. break;
  2350. case ESC_NOT_WORD_BOUNDARY:
  2351. pnew = create_word_boundary<CI>( false, flags );
  2352. break;
  2353. case ESC_WORD_START:
  2354. pnew = create_word_start<CI>( flags );
  2355. break;
  2356. case ESC_WORD_STOP:
  2357. pnew = create_word_stop<CI>( flags );
  2358. break;
  2359. case ESC_DIGIT:
  2360. pnew = create_charset<CI>( match_charset<CI>( false,
  2361. get_digit_vector() ),
  2362. flags );
  2363. break;
  2364. case ESC_NOT_DIGIT:
  2365. pnew = create_charset<CI>( match_charset<CI>( true,
  2366. get_digit_vector() ),
  2367. flags );
  2368. break;
  2369. case ESC_WORD:
  2370. pnew = create_charset<CI>( match_charset<CI>( false,
  2371. get_word_vector() ),
  2372. flags );
  2373. break;
  2374. case ESC_NOT_WORD:
  2375. pnew = create_charset<CI>( match_charset<CI>( true,
  2376. get_word_vector() ),
  2377. flags );
  2378. break;
  2379. case ESC_SPACE:
  2380. pnew = create_charset<CI>( match_charset<CI>( false,
  2381. get_space_vector() ),
  2382. flags );
  2383. break;
  2384. case ESC_NOT_SPACE:
  2385. pnew = create_charset<CI>( match_charset<CI>( true,
  2386. get_space_vector() ),
  2387. flags );
  2388. break;
  2389. case ESC_BEGIN_STRING:
  2390. pnew = create_bos<CI>( flags );
  2391. break;
  2392. case ESC_END_STRING:
  2393. pnew = create_eos<CI>( flags );
  2394. break;
  2395. case ESC_END_STRING_z:
  2396. pnew = create_eoz<CI>( flags );
  2397. break;
  2398. case ESCAPE:
  2399. if( char_type('0') <= *ipat && char_type('9') >= *ipat )
  2400. {
  2401. // use _cgroups_total here since the invisible groups have not been numbered yet.
  2402. unsigned nbackref = parse_int( ipat, m_pat.end(), _cgroups_total() - 1 );// always at least 1 group
  2403. if( 0 == nbackref || rggroups.size() <= nbackref || NULL == rggroups[ nbackref ] )
  2404. throw bad_regexpr( "invalid backreference" );
  2405. pnew = create_backref<CI>( nbackref, rggroups[nbackref]->group_width(), flags );
  2406. }
  2407. else
  2408. {
  2409. // Is this a user-defined intrinsic character set?
  2410. match_charset<CI> * pcharset = s_charset_map.get( *ipat, flags );
  2411. if( NULL != pcharset )
  2412. pnew = create_charset<CI>( *pcharset, flags );
  2413. else
  2414. pnew = create_atom<CI>( ipat, flags );
  2415. ++ipat;
  2416. }
  2417. break;
  2418. // If quotemeta, loop until we find quotemeta off or end of string
  2419. case ESC_QUOTE_META_ON:
  2420. for( istart = itemp = ipat, fdone = false; !fdone && ipat != m_pat.end(); )
  2421. {
  2422. switch( SY::reg_token( ipat, m_pat.end() ) )
  2423. {
  2424. case ESC_QUOTE_META_OFF:
  2425. fdone = true;
  2426. break;
  2427. case NO_TOKEN:
  2428. ++ipat; // fallthrough
  2429. default:
  2430. itemp = ipat;
  2431. break;
  2432. }
  2433. }
  2434. if( itemp != istart )
  2435. pgroup->add_item( create_atom<CI>( istart, itemp, flags ) );
  2436. // skip the quantification code below
  2437. return true;
  2438. // Should never get here for valid patterns
  2439. case ESC_QUOTE_META_OFF:
  2440. throw bad_regexpr("quotemeta turned off, but was never turned on");
  2441. default:
  2442. assert( ! "Unhandled token type" );
  2443. break;
  2444. }
  2445. // If pnew is null, then the current subexpression is a no-op.
  2446. if( pnew.get() )
  2447. {
  2448. // Look for quantifiers
  2449. _quantify( pnew, pnew_group, ipat );
  2450. // Add the item to the group
  2451. pgroup->add_item( pnew.release() );
  2452. }
  2453. return true;
  2454. }
  2455. template< typename CI, typename SY >
  2456. void basic_rpattern<CI,SY>::_quantify(
  2457. auto_sub_ptr<sub_expr<CI> > & pnew,
  2458. match_group<CI> * pnew_group,
  2459. basic_string<basic_rpattern<CI,SY>::char_type>::iterator & ipat )
  2460. {
  2461. if( ipat != m_pat.end() && ! pnew->is_assertion() )
  2462. {
  2463. basic_string<char_type>::iterator itemp = ipat;
  2464. bool fmin = false;
  2465. // Since size_t is unsigned, -1 is really the largest size_t
  2466. size_t lbound = (size_t)-1;
  2467. size_t ubound = (size_t)-1;
  2468. size_t ubound_tmp;
  2469. switch( SY::quant_token( itemp, m_pat.end() ) )
  2470. {
  2471. case ZERO_OR_MORE_MIN:
  2472. fmin = true;
  2473. case ZERO_OR_MORE:
  2474. lbound = 0;
  2475. break;
  2476. case ONE_OR_MORE_MIN:
  2477. fmin = true;
  2478. case ONE_OR_MORE:
  2479. lbound = 1;
  2480. break;
  2481. case ZERO_OR_ONE_MIN:
  2482. fmin = true;
  2483. case ZERO_OR_ONE:
  2484. lbound = 0;
  2485. ubound = 1;
  2486. break;
  2487. case BEGIN_RANGE:
  2488. lbound = parse_int( itemp, m_pat.end() );
  2489. if( itemp == m_pat.end() )
  2490. throw bad_regexpr( "expecting end of range" );
  2491. switch( SY::quant_token( itemp, m_pat.end() ) )
  2492. {
  2493. case END_RANGE_MIN:
  2494. fmin = true;
  2495. case END_RANGE:
  2496. ubound = lbound;
  2497. break;
  2498. case RANGE_SEPARATOR:
  2499. ipat = itemp;
  2500. ubound_tmp = parse_int( itemp, m_pat.end() );
  2501. if( itemp != ipat )
  2502. ubound = ubound_tmp;
  2503. if( itemp == m_pat.end() )
  2504. throw bad_regexpr( "expecting end of range" );
  2505. switch( SY::quant_token( itemp, m_pat.end() ) )
  2506. {
  2507. case END_RANGE_MIN:
  2508. fmin = true;
  2509. case END_RANGE:
  2510. break;
  2511. default:
  2512. throw bad_regexpr( "expecting end of range" );
  2513. }
  2514. break;
  2515. default:
  2516. throw bad_regexpr( "ill-formed quantifier" );
  2517. }
  2518. if( ubound < lbound )
  2519. throw bad_regexpr( "ill-formed quantifier" );
  2520. break;
  2521. }
  2522. if( (size_t)-1 != lbound )
  2523. {
  2524. auto_sub_ptr<match_quantifier<CI> > pquant;
  2525. // a group quantifier is less efficient than an atom quantifier
  2526. if( fmin )
  2527. {
  2528. if( pnew_group )
  2529. pquant = new min_group_quantifier<CI>( pnew_group,
  2530. lbound, ubound );
  2531. else
  2532. pquant = new min_atom_quantifier<CI>( pnew.get(),
  2533. lbound, ubound );
  2534. }
  2535. else
  2536. {
  2537. if( pnew_group )
  2538. pquant = new max_group_quantifier<CI>( pnew_group,
  2539. lbound, ubound );
  2540. else
  2541. pquant = new max_atom_quantifier<CI>( pnew.get(),
  2542. lbound, ubound );
  2543. }
  2544. pnew.release();
  2545. pnew = pquant.release();
  2546. ipat = itemp;
  2547. }
  2548. }
  2549. }
  2550. template< typename CI, typename SY >
  2551. void basic_rpattern<CI,SY>::_add_subst_backref( subst_node & snode, size_t nbackref, size_t rstart )
  2552. {
  2553. m_fuses_backrefs = true;
  2554. assert( subst_node::SUBST_STRING == snode.stype );
  2555. if( snode.subst_string.rlength )
  2556. m_subst_list.push_back( snode );
  2557. snode.stype = subst_node::SUBST_BACKREF;
  2558. snode.subst_backref = nbackref;
  2559. m_subst_list.push_back( snode );
  2560. // re-initialize the subst_node
  2561. snode.stype = subst_node::SUBST_STRING;
  2562. snode.subst_string.rstart = rstart;
  2563. snode.subst_string.rlength = 0;
  2564. }
  2565. template< typename CI, typename SY >
  2566. void basic_rpattern<CI,SY>::_parse_subst()
  2567. {
  2568. TOKEN tok;
  2569. subst_node snode;
  2570. basic_string<char_type>::iterator icur = m_subst.begin();
  2571. size_t nbackref;
  2572. basic_string<char_type>::iterator itemp;
  2573. bool fdone;
  2574. m_fuses_backrefs = false;
  2575. // Initialize the subst_node
  2576. snode.stype = subst_node::SUBST_STRING;
  2577. snode.subst_string.rstart = 0;
  2578. snode.subst_string.rlength = 0;
  2579. while( icur != m_subst.end() )
  2580. {
  2581. switch( tok = SY::subst_token( icur, m_subst.end() ) )
  2582. {
  2583. case SUBST_MATCH:
  2584. _add_subst_backref( snode, 0, distance( m_subst.begin(), icur ) );
  2585. break;
  2586. case SUBST_PREMATCH:
  2587. _add_subst_backref( snode, subst_node::PREMATCH, distance( m_subst.begin(), icur ) );
  2588. break;
  2589. case SUBST_POSTMATCH:
  2590. _add_subst_backref( snode, subst_node::POSTMATCH, distance( m_subst.begin(), icur ) );
  2591. break;
  2592. case SUBST_BACKREF:
  2593. nbackref = parse_int( icur, m_subst.end(), cgroups() - 1 ); // always at least 1 group
  2594. if( 0 == nbackref )
  2595. throw bad_regexpr( "invalid backreference in substitution" );
  2596. _add_subst_backref( snode, nbackref, distance( m_subst.begin(), icur ) );
  2597. break;
  2598. case SUBST_QUOTE_META_ON:
  2599. assert( subst_node::SUBST_STRING == snode.stype );
  2600. if( snode.subst_string.rlength )
  2601. m_subst_list.push_back( snode );
  2602. snode.subst_string.rstart = distance( m_subst.begin(), icur );
  2603. for( itemp = icur, fdone = false; !fdone && icur != m_subst.end(); )
  2604. {
  2605. switch( tok = SY::subst_token( icur, m_subst.end() ) )
  2606. {
  2607. case SUBST_ALL_OFF:
  2608. fdone = true;
  2609. break;
  2610. case NO_TOKEN:
  2611. ++icur; // fall-through
  2612. default:
  2613. itemp = icur;
  2614. break;
  2615. }
  2616. }
  2617. snode.subst_string.rlength = distance( m_subst.begin(), itemp ) - snode.subst_string.rstart;
  2618. if( snode.subst_string.rlength )
  2619. m_subst_list.push_back( snode );
  2620. if( tok == SUBST_ALL_OFF )
  2621. {
  2622. snode.stype = subst_node::SUBST_OP;
  2623. snode.op = subst_node::ALL_OFF;
  2624. m_subst_list.push_back( snode );
  2625. }
  2626. // re-initialize the subst_node
  2627. snode.stype = subst_node::SUBST_STRING;
  2628. snode.subst_string.rstart = distance( m_subst.begin(), icur );
  2629. snode.subst_string.rlength = 0;
  2630. break;
  2631. case SUBST_UPPER_ON:
  2632. case SUBST_UPPER_NEXT:
  2633. case SUBST_LOWER_ON:
  2634. case SUBST_LOWER_NEXT:
  2635. case SUBST_ALL_OFF:
  2636. assert( subst_node::SUBST_STRING == snode.stype );
  2637. if( snode.subst_string.rlength )
  2638. m_subst_list.push_back( snode );
  2639. snode.stype = subst_node::SUBST_OP;
  2640. snode.op = (subst_node::op_type) tok;
  2641. m_subst_list.push_back( snode );
  2642. // re-initialize the subst_node
  2643. snode.stype = subst_node::SUBST_STRING;
  2644. snode.subst_string.rstart = distance( m_subst.begin(), icur );
  2645. snode.subst_string.rlength = 0;
  2646. break;
  2647. case SUBST_ESCAPE:
  2648. if( icur == m_subst.end() )
  2649. throw bad_regexpr("expecting escape sequence in substitution string");
  2650. assert( subst_node::SUBST_STRING == snode.stype );
  2651. if( snode.subst_string.rlength )
  2652. m_subst_list.push_back( snode );
  2653. snode.subst_string.rstart = distance( m_subst.begin(), icur++ );
  2654. snode.subst_string.rlength = 1;
  2655. break;
  2656. case NO_TOKEN:
  2657. default:
  2658. ++snode.subst_string.rlength;
  2659. ++icur;
  2660. break;
  2661. }
  2662. }
  2663. assert( subst_node::SUBST_STRING == snode.stype );
  2664. if( snode.subst_string.rlength )
  2665. m_subst_list.push_back( snode );
  2666. }
  2667. template< typename CI, typename SY >
  2668. basic_rpattern<CI,SY>::charset_map basic_rpattern<CI,SY>::s_charset_map;
  2669. // Pass in an interator to one after the opening bracket of the character set.
  2670. // On return, icur points to one character after the closing bracket
  2671. template< typename CI, typename SY >
  2672. sub_expr<CI> * create_charset_helper<CI,SY>::create_charset_aux(
  2673. basic_string<iterator_traits<CI>::value_type> & str,
  2674. basic_string<iterator_traits<CI>::value_type>::iterator & icur,
  2675. unsigned flags )
  2676. {
  2677. bool fcomplement = false;
  2678. match_charset<CI> * pnew = NULL;
  2679. basic_string<iterator_traits<CI>::value_type>::iterator itemp = icur;
  2680. if( itemp != str.end() && CHARSET_NEGATE == SY::charset_token( itemp, str.end() ) )
  2681. {
  2682. fcomplement = true;
  2683. icur = itemp;
  2684. }
  2685. switch( ( NOCASE | CSTRINGS ) & flags )
  2686. {
  2687. case 0:
  2688. pnew = new match_custom_charset_t<eos_t<CI>,match_range_with_case>( fcomplement, icur, str.end(), flags, SY() );
  2689. break;
  2690. case NOCASE:
  2691. pnew = new match_custom_charset_t<eos_t<CI>,match_range_no_case>( fcomplement, icur, str.end(), flags, SY() );
  2692. break;
  2693. case CSTRINGS:
  2694. pnew = new match_custom_charset_t<eocs_t<CI>,match_range_with_case>( fcomplement, icur, str.end(), flags, SY() );
  2695. break;
  2696. case NOCASE | CSTRINGS:
  2697. pnew = new match_custom_charset_t<eocs_t<CI>,match_range_no_case>( fcomplement, icur, str.end(), flags, SY() );
  2698. break;
  2699. default:
  2700. __assume(0); // tells the compiler that this is unreachable
  2701. }
  2702. return pnew;
  2703. }
  2704. #pragma warning( disable : 4660 )
  2705. // Explicit instantiation
  2706. #ifdef REGEX_FORCE_INSTANTIATION
  2707. template class basic_regexpr<char>;
  2708. template class basic_regexpr<wint_t>;
  2709. #else
  2710. template class basic_regexpr<TCHAR>;
  2711. #endif
  2712. #ifndef NO_PERL_RE
  2713. #ifdef REGEX_FORCE_INSTANTIATION
  2714. template class basic_rpattern<const char *>;
  2715. template class basic_rpattern<const wint_t *>;
  2716. template class basic_rpattern<string::const_iterator>;
  2717. template class basic_rpattern<wstring::const_iterator>;
  2718. #else
  2719. template class basic_rpattern<const TCHAR *>;
  2720. template class basic_rpattern<tstring::const_iterator>;
  2721. #endif
  2722. #endif
  2723. #ifdef POSIX_RE
  2724. #ifdef REGEX_FORCE_INSTANTIATION
  2725. template class basic_rpattern<const char *,posix_syntax<char> >;
  2726. template class basic_rpattern<const wint_t *,posix_syntax<wint_t> >;
  2727. template class basic_rpattern<string::const_iterator,posix_syntax<char> >;
  2728. template class basic_rpattern<wstring::const_iterator,posix_syntax<wint_t> >;
  2729. #else
  2730. template class basic_rpattern<const TCHAR *,posix_syntax<TCHAR> >;
  2731. template class basic_rpattern<tstring::const_iterator,posix_syntax<TCHAR> >;
  2732. #endif
  2733. #endif
  2734. } // namespace regex