Leaked source code of windows server 2003
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1018 lines
34 KiB

  1. //+---------------------------------------------------------------------------
  2. //
  3. // File: basic_regexpr.hxx
  4. //
  5. // Contents: classes for regular expression pattern matching a-la perl
  6. //
  7. // Classes: basic_rpattern, basic_regexpr
  8. //
  9. // Functions: basic_regexpr::match
  10. // basic_regexpr::substitute
  11. // basic_regexpr::cbackrefs
  12. // basic_regexpr::backref
  13. // basic_regexpr::all_backrefs
  14. // basic_regexpr::backref_str
  15. //
  16. // Coupling:
  17. //
  18. // History: 12-11-1998 ericne Created
  19. // 01-05-2001 ericne Removed dependency on VC's choice
  20. // of STL iterator types.
  21. //
  22. //----------------------------------------------------------------------------
  23. #pragma once
  24. // C4786 identifier was truncated to '255' characters in the debug information
  25. #pragma warning( disable : 4290 4786 )
  26. #ifdef _MT
  27. #include <windows.h> // for CRITICAL_SECTION
  28. #endif
  29. #include <string>
  30. #include <stdexcept>
  31. #include <vector>
  32. #include <list>
  33. #include <map>
  34. #include <iostream>
  35. #include <tchar.h>
  36. #include <new.h> // for _set_new_handler
  37. #include <crtdbg.h>
  38. #include "syntax.h"
  39. namespace regex
  40. {
  41. // Called when an allocation fails
  42. inline int __cdecl my_new_handler( size_t )
  43. {
  44. throw std::bad_alloc();
  45. }
  46. // For pushing and popping the new handler
  47. class push_new_handler
  48. {
  49. _PNH m_pnh;
  50. public:
  51. push_new_handler( _PNH pnh )
  52. {
  53. m_pnh = _set_new_handler( pnh );
  54. }
  55. ~push_new_handler()
  56. {
  57. (void)_set_new_handler( m_pnh );
  58. }
  59. };
  60. class bad_regexpr : public std::runtime_error
  61. {
  62. public:
  63. explicit bad_regexpr(const std::string& _S)
  64. : std::runtime_error(_S) {}
  65. virtual ~bad_regexpr() {}
  66. };
  67. //
  68. // Flags to control how matching occurs
  69. //
  70. enum REGEX_FLAGS
  71. {
  72. NOCASE = 0x0001, // ignore case
  73. GLOBAL = 0x0002, // match everywhere in the string
  74. MULTILINE = 0x0004, // ^ and $ can match internal line breaks
  75. SINGLELINE = 0x0008, // . can match newline character
  76. RIGHTMOST = 0x0010, // start matching at the right of the string
  77. NOBACKREFS = 0x0020, // only meaningful when used with GLOBAL and substitute
  78. FIRSTBACKREFS = 0x0040, // only meaningful when used with GLOBAL
  79. ALLBACKREFS = 0x0080, // only meaningful when used with GLOBAL
  80. CSTRINGS = 0x0100, // optimize pattern for use with null-terminated strings
  81. NORMALIZE = 0x0200 // Preprocess patterns: "\\n" => "\n", etc.
  82. };
  83. // Forward declarations
  84. template< typename CI > struct match_param;
  85. template< typename CI > class match_group;
  86. template< typename CI > class match_wrapper;
  87. template< typename CI > class match_charset;
  88. template< typename CI > class basic_rpattern_base;
  89. // --------------------------------------------------------------------------
  90. //
  91. // Class: width_type
  92. //
  93. // Description: represents the width of a sub-expression
  94. //
  95. // Methods: width_add - add two widths
  96. // width_mult - multiply two widths
  97. // width_type - ctor
  98. // width_type - ctor
  99. // operator= - assign a width
  100. // operator== - are widths equal
  101. // operator!= - are widths not equal
  102. // operator+ - add two widths
  103. // operator* - multiply two widths
  104. //
  105. // Members: m_min - smallest number of characters a sub-expr can span
  106. // m_max - largest number of characters a sub-expr can span
  107. //
  108. // History: 8/14/2000 - ericne - Created
  109. //
  110. // --------------------------------------------------------------------------
  111. struct width_type
  112. {
  113. size_t m_min;
  114. size_t m_max;
  115. static size_t width_add( size_t a, size_t b )
  116. {
  117. return ( -1 == a || -1 == b ? -1 : a + b );
  118. }
  119. static size_t width_mult( size_t a, size_t b )
  120. {
  121. return ( -1 == a || -1 == b ? -1 : a * b );
  122. }
  123. width_type( size_t _min = 0, size_t _max = -1 )
  124. : m_min(_min), m_max(_max)
  125. {
  126. }
  127. width_type( const width_type & that )
  128. : m_min(that.m_min), m_max(that.m_max)
  129. {
  130. }
  131. width_type & operator=( const width_type & that )
  132. {
  133. m_min = that.m_min;
  134. m_max = that.m_max;
  135. return *this;
  136. }
  137. bool operator==( const width_type & that ) const
  138. {
  139. return ( m_min == that.m_min && m_max == that.m_max );
  140. }
  141. bool operator!=( const width_type & that ) const
  142. {
  143. return ( m_min != that.m_min || m_max != that.m_max );
  144. }
  145. width_type operator+( const width_type & that ) const
  146. {
  147. return width_type( width_add( m_min, that.m_min ), width_add( m_max, that.m_max ) );
  148. }
  149. width_type operator*( const width_type & that ) const
  150. {
  151. return width_type( width_mult( m_min, that.m_min ), width_mult( m_max, that.m_max ) );
  152. }
  153. };
  154. const width_type worst_width(0,-1);
  155. const width_type uninit_width(-1,-1);
  156. // --------------------------------------------------------------------------
  157. //
  158. // Class: sub_expr
  159. //
  160. // Description: patterns are "compiled" into a directed graph of sub_expr
  161. // structs. Matching is accomplished by traversing this graph.
  162. //
  163. // Methods: sub_expr - construct a sub_expr
  164. // _match_this - does this sub_expr match at the given location
  165. // _width_this - what is the width of this sub_expr
  166. // ~sub_expr - virt dtor so cleanup happens correctly
  167. // _delete - delete this node in the graph and all nodes linked
  168. // next - pointer to the next node in the graph
  169. // next - pointer to the next node in the graph
  170. // match_next - match the rest of the graph
  171. // domatch - match_this and match_next
  172. // is_assertion - true if this sub_expr is a zero-width assertion
  173. // get_width - find the width of the graph at this sub_expr
  174. //
  175. // Members: m_pnext - pointer to the next node in the graph
  176. //
  177. // History: 8/14/2000 - ericne - Created
  178. //
  179. // --------------------------------------------------------------------------
  180. template< typename CI >
  181. class sub_expr
  182. {
  183. sub_expr * m_pnext;
  184. protected:
  185. // Only derived classes and basic_rpattern can instantiate sub_expr's
  186. sub_expr( )
  187. : m_pnext(NULL)
  188. {
  189. }
  190. // match this object only
  191. virtual bool _match_this( match_param<CI> & param, CI & icur ) const throw()
  192. {
  193. return true;
  194. }
  195. virtual width_type _width_this() throw() = 0;
  196. public:
  197. typedef typename std::iterator_traits<CI>::value_type char_type;
  198. friend class match_wrapper<CI>; // wrappers can access _match_this method
  199. virtual ~sub_expr() {}
  200. virtual void _delete()
  201. {
  202. if( m_pnext )
  203. m_pnext->_delete();
  204. delete this;
  205. }
  206. inline const sub_expr *const next() const { return m_pnext; }
  207. inline sub_expr * & next() { return m_pnext; }
  208. // Match all subsequent objects
  209. inline bool match_next( match_param<CI> & param, CI icur ) const throw()
  210. {
  211. return NULL == m_pnext || m_pnext->domatch( param, icur );
  212. }
  213. // Match this object and all subsequent objects
  214. // If domatch returns false, it must not change any internal state
  215. virtual bool domatch( match_param<CI> & param, CI icur ) const throw()
  216. {
  217. return ( _match_this(param,icur) && match_next(param,icur) );
  218. }
  219. virtual bool is_assertion() const throw()
  220. {
  221. return false;
  222. }
  223. width_type get_width() throw()
  224. {
  225. width_type this_width = _width_this();
  226. if( NULL == m_pnext )
  227. return this_width;
  228. width_type that_width = m_pnext->get_width();
  229. return ( this_width + that_width );
  230. }
  231. };
  232. template< typename CI >
  233. void delete_sub_expr( sub_expr<CI> * psub )
  234. {
  235. if( psub )
  236. psub->_delete();
  237. }
  238. template< typename CI, typename SY = perl_syntax<std::iterator_traits<CI>::value_type> >
  239. class create_charset_helper
  240. {
  241. public:
  242. typedef typename std::iterator_traits<CI>::value_type char_type;
  243. static sub_expr<CI> * create_charset_aux(
  244. std::basic_string<char_type> & str,
  245. typename std::basic_string<char_type>::iterator & icur,
  246. unsigned flags );
  247. };
  248. // --------------------------------------------------------------------------
  249. //
  250. // Class: auto_sub_ptr
  251. //
  252. // Description: Class for automatically cleaning up the structure associated
  253. // with a parsed pattern
  254. //
  255. // Methods: auto_sub_ptr - private copy ctor - not used
  256. // operator= - private assign operator - not used
  257. // operator T* - private implicit cast operator - not used
  258. // auto_sub_ptr - ctor
  259. // ~auto_sub_ptr - dtor, frees ptr
  260. // free_ptr - explicitly free pointer
  261. // release - relinquish ownership of ptr
  262. // operator= - take ownership of ptr
  263. // get - return ptr
  264. // get - return ptr
  265. // operator-> - method call through ptr
  266. // operator-> - method call through ptr
  267. //
  268. // Members: m_psub - sub_expr pointer
  269. //
  270. // History: 8/14/2000 - ericne - Created
  271. //
  272. // --------------------------------------------------------------------------
  273. template< typename T >
  274. class auto_sub_ptr
  275. {
  276. T * m_psub;
  277. // hide these methods
  278. auto_sub_ptr( const auto_sub_ptr<T> & ) {}
  279. auto_sub_ptr & operator=( const auto_sub_ptr<T> & ) { return *this; }
  280. operator T*() const { return m_psub; }
  281. public:
  282. auto_sub_ptr( T * psub = NULL ) : m_psub( psub ) {}
  283. ~auto_sub_ptr()
  284. {
  285. free_ptr();
  286. }
  287. void free_ptr() // deallocate
  288. {
  289. delete_sub_expr( m_psub );
  290. }
  291. T * release() // relinquish ownership, but don't deallocate
  292. {
  293. T * psub = m_psub;
  294. m_psub = NULL;
  295. return psub;
  296. }
  297. auto_sub_ptr<T> & operator=( T * psub )
  298. {
  299. delete_sub_expr( m_psub );
  300. m_psub = psub;
  301. return *this;
  302. }
  303. inline const T*const get() const { return m_psub; }
  304. inline T* & get() { return m_psub; }
  305. inline const T*const operator->() const { return m_psub; }
  306. inline T* operator->() { return m_psub; }
  307. };
  308. template< typename CI >
  309. struct backref_tag : public std::pair<CI,CI>
  310. {
  311. backref_tag( CI i1 = CI(0), CI i2 = CI(0) )
  312. : std::pair<CI,CI>(i1,i2), reserved(0) {}
  313. operator bool() const throw() { return first != CI(0) && second != CI(0); }
  314. bool operator!() const throw() { return ! operator bool(); }
  315. size_t reserved; // used for internal book-keeping
  316. };
  317. template< typename CH >
  318. backref_tag< const CH * > _static_match_helper(
  319. const CH * szstr,
  320. const basic_rpattern_base< const CH * > & pat,
  321. std::vector< backref_tag< const CH * > > * prgbackrefs ) throw();
  322. template< typename CH >
  323. size_t _static_count_helper(
  324. const CH * szstr,
  325. const basic_rpattern_base< const CH * > & pat ) throw();
  326. // --------------------------------------------------------------------------
  327. //
  328. // Class: basic_regexpr
  329. //
  330. // Description: string class that allows regular expression pattern matching
  331. //
  332. // Methods: basic_regexpr - ctor
  333. // match - static method for matching C-style strings
  334. // match - non-static method for matching C++-style strings
  335. // count - static method for counting matches in C-style strings
  336. // count - non-static method for counting matches in C++-style strings
  337. // substitute - perform substitutions in C++-style strings
  338. // cbackrefs - return the count of internally stored back-references
  339. // rstart - offset to start of n-th backref
  340. // rlength - length of n-th backref
  341. // backref - return the n-th backref
  342. // all_backrefs - return a vector of all saved backrefs
  343. // backref_str - return the string to which the backreferences refer
  344. //
  345. // Members: m_rgbackrefs - vector of backrefs
  346. // m_backref_str - temp string buffer
  347. // m_pbackref_str - pointer to the string containing the string to which
  348. // the backreferences refer (either *this or m_backref_str)
  349. //
  350. // Typedefs: backref_type -
  351. // backref_vector -
  352. //
  353. // History: 8/14/2000 - ericne - Created
  354. //
  355. // --------------------------------------------------------------------------
  356. template< typename CH, typename TR = std::char_traits<CH>, typename AL = std::allocator<CH> >
  357. class basic_regexpr : public std::basic_string<CH,TR,AL>
  358. {
  359. public:
  360. basic_regexpr( const allocator_type & a = allocator_type() )
  361. : std::basic_string<CH,TR,AL>( a ), m_pbackref_str( & m_backref_str ) {}
  362. basic_regexpr( const CH * p,
  363. const allocator_type & a = allocator_type() )
  364. : std::basic_string<CH,TR,AL>( p, a ), m_pbackref_str( & m_backref_str ) {}
  365. basic_regexpr( const CH * p, size_type n,
  366. const allocator_type & a = allocator_type() )
  367. : std::basic_string<CH,TR,AL>( p, n, a ), m_pbackref_str( & m_backref_str ) {}
  368. basic_regexpr( const std::basic_string<CH,TR,AL> & s, size_type pos = 0, size_type n = npos,
  369. const allocator_type & a = allocator_type() )
  370. : std::basic_string<CH,TR,AL>( s, pos, n, a ), m_pbackref_str( & m_backref_str ) {}
  371. basic_regexpr( size_type n, CH ch,
  372. const allocator_type & a = allocator_type() )
  373. : std::basic_string<CH,TR,AL>( n, ch, a ), m_pbackref_str( & m_backref_str ) {}
  374. basic_regexpr( const_iterator begin, const_iterator end,
  375. const allocator_type & a = allocator_type() )
  376. : std::basic_string<CH,TR,AL>( begin, end, a ), m_pbackref_str( & m_backref_str ) {}
  377. // actually stores iterators into *m_pbackref_str:
  378. typedef backref_tag<const_iterator> backref_type;
  379. typedef std::vector< backref_type > backref_vector;
  380. // stores pointers into the null-terminated C-stype string
  381. typedef backref_tag< const CH * > backref_type_c;
  382. typedef std::vector< backref_type_c > backref_vector_c;
  383. // returns $0, the first backref
  384. static backref_type_c match( const CH * szstr,
  385. const basic_rpattern_base< const CH * > & pat,
  386. backref_vector_c * prgbackrefs = NULL ) throw()
  387. {
  388. return _static_match_helper<CH>( szstr, pat, prgbackrefs );
  389. }
  390. // returns $0, the first backref
  391. backref_type match( const basic_rpattern_base< const_iterator > & pat,
  392. size_type pos = 0,
  393. size_type len = npos ) const throw();
  394. static size_t count( const CH * szstr,
  395. const basic_rpattern_base< const CH * > & pat ) throw()
  396. {
  397. return _static_count_helper<CH>( szstr, pat );
  398. }
  399. size_t count( const basic_rpattern_base< const_iterator > & pat,
  400. size_type pos = 0,
  401. size_type len = npos ) const throw();
  402. size_t substitute( const basic_rpattern_base< const_iterator > & pat,
  403. size_type pos = 0,
  404. size_type len = npos ) throw(std::bad_alloc);
  405. size_t cbackrefs() const throw()
  406. {
  407. return m_rgbackrefs.size();
  408. }
  409. size_type rstart( size_t cbackref = 0 ) const throw(std::out_of_range)
  410. {
  411. return std::distance( m_pbackref_str->begin(), m_rgbackrefs.at( cbackref ).first );
  412. }
  413. size_type rlength( size_t cbackref = 0 ) const throw(std::out_of_range)
  414. {
  415. return std::distance( m_rgbackrefs.at( cbackref ).first, m_rgbackrefs.at( cbackref ).second );
  416. }
  417. backref_type backref( size_t cbackref ) const throw(std::out_of_range)
  418. {
  419. return m_rgbackrefs.at( cbackref );
  420. }
  421. const backref_vector & all_backrefs() const throw()
  422. {
  423. return m_rgbackrefs;
  424. }
  425. const std::basic_string<CH,TR,AL> & backref_str() const throw()
  426. {
  427. return *m_pbackref_str;
  428. }
  429. protected:
  430. // save information about the backrefs
  431. // mutable because these can change in the "const" match() method.
  432. mutable backref_vector m_rgbackrefs;
  433. mutable std::basic_string<CH,TR,AL> m_backref_str;
  434. mutable const std::basic_string<CH,TR,AL> * m_pbackref_str;
  435. };
  436. // --------------------------------------------------------------------------
  437. //
  438. // Class: match_param
  439. //
  440. // Description: Struct that contains the state of the matching operation.
  441. // Passed by reference to all domatch and _match_this routines.
  442. //
  443. // Methods: match_param - ctor
  444. // match_param - ctor
  445. //
  446. // Members: ibegin - start of the string
  447. // istart - start of this iteration
  448. // istop - end of the string
  449. // prgbackrefs - pointer to backref array
  450. //
  451. // History: 8/14/2000 - ericne - Created
  452. //
  453. // --------------------------------------------------------------------------
  454. template< typename CI >
  455. struct match_param
  456. {
  457. CI ibegin;
  458. CI istart;
  459. CI istop;
  460. std::vector< backref_tag< CI > > * prgbackrefs;
  461. match_param( CI _istart,
  462. CI _istop,
  463. std::vector< backref_tag< CI > > * _prgbackrefs )
  464. : ibegin(_istart),
  465. istart(_istart),
  466. istop(_istop),
  467. prgbackrefs(_prgbackrefs)
  468. {
  469. }
  470. match_param( CI _ibegin,
  471. CI _istart,
  472. CI _istop,
  473. std::vector< backref_tag< CI > > * _prgbackrefs )
  474. : ibegin(_ibegin),
  475. istart(_istart),
  476. istop(_istop),
  477. prgbackrefs(_prgbackrefs)
  478. {
  479. }
  480. };
  481. // --------------------------------------------------------------------------
  482. //
  483. // Class: subst_node
  484. //
  485. // Description: Substitution strings are parsed into an array of these
  486. // structures in order to speed up subst operations.
  487. //
  488. // Members: stype - type of this struct
  489. // subst_string - do a string substitution
  490. // subst_backref - do a backref substitution
  491. // op - execute an operation
  492. //
  493. // History: 8/14/2000 - ericne - Created
  494. //
  495. // --------------------------------------------------------------------------
  496. struct subst_node
  497. {
  498. enum subst_type { SUBST_STRING, SUBST_BACKREF, SUBST_OP };
  499. enum { PREMATCH = -1, POSTMATCH = -2 };
  500. enum op_type { UPPER_ON = SUBST_UPPER_ON,
  501. UPPER_NEXT = SUBST_UPPER_NEXT,
  502. LOWER_ON = SUBST_LOWER_ON,
  503. LOWER_NEXT = SUBST_LOWER_NEXT,
  504. ALL_OFF = SUBST_ALL_OFF };
  505. subst_type stype;
  506. union
  507. {
  508. struct
  509. {
  510. size_t rstart;
  511. size_t rlength;
  512. } subst_string;
  513. size_t subst_backref;
  514. op_type op;
  515. };
  516. };
  517. // --------------------------------------------------------------------------
  518. //
  519. // Class: basic_rpattern_base
  520. //
  521. // Description:
  522. //
  523. // Methods: basic_rpattern_base - ctor
  524. // flags - get the state of the flags
  525. // uses_backrefs - true if the backrefs are referenced
  526. // get_first_subexpression - return ptr to first sub_expr struct
  527. // get_width - get min/max nbr chars this pattern can match
  528. // loops - if false, we only need to try to match at 1st position
  529. // cgroups - number of visible groups
  530. // _cgroups_total - total number of groups, including hidden (?:) groups
  531. // get_pat - get string representing the pattern
  532. // get_subst - get string representing the substitution string
  533. // get_subst_list - get the list of subst nodes
  534. // _normalize_string - perform character escaping
  535. // _reset - reinitialize the pattern
  536. //
  537. // Members: m_fuses_backrefs -
  538. // m_floop -
  539. // m_cgroups -
  540. // m_cgroups_visible -
  541. // m_flags -
  542. // m_nwidth -
  543. // m_pat -
  544. // m_subst -
  545. // m_subst_list -
  546. // m_pfirst -
  547. //
  548. // Typedefs: char_type -
  549. //
  550. // History: 8/14/2000 - ericne - Created
  551. //
  552. // --------------------------------------------------------------------------
  553. template< typename CI >
  554. class basic_rpattern_base
  555. {
  556. public:
  557. typedef typename std::iterator_traits<CI>::value_type char_type;
  558. basic_rpattern_base( unsigned flags = 0,
  559. const std::basic_string<char_type> & pat = std::basic_string<char_type>(),
  560. const std::basic_string<char_type> & subst = std::basic_string<char_type>() ) throw()
  561. : m_fuses_backrefs( false ),
  562. m_floop( true ),
  563. m_cgroups( 0 ),
  564. m_cgroups_visible( 0 ),
  565. m_flags( flags ),
  566. m_nwidth( uninit_width ),
  567. m_pat( pat ),
  568. m_subst( subst ),
  569. m_pfirst( NULL )
  570. {
  571. }
  572. unsigned flags() const throw()
  573. {
  574. return m_flags;
  575. }
  576. bool uses_backrefs() const throw()
  577. {
  578. return m_fuses_backrefs;
  579. }
  580. const sub_expr<CI> * get_first_subexpression() const throw()
  581. {
  582. return m_pfirst.get();
  583. }
  584. width_type get_width() const throw()
  585. {
  586. return m_nwidth;
  587. }
  588. bool loops() const throw()
  589. {
  590. return m_floop;
  591. }
  592. size_t cgroups() const throw()
  593. {
  594. return m_cgroups_visible;
  595. }
  596. size_t _cgroups_total() const throw()
  597. {
  598. return m_cgroups;
  599. }
  600. const std::basic_string<char_type> & get_pat() const throw()
  601. {
  602. return m_pat;
  603. }
  604. const std::basic_string<char_type> & get_subst() const throw()
  605. {
  606. return m_subst;
  607. }
  608. const std::list<subst_node> & get_subst_list() const throw()
  609. {
  610. return m_subst_list;
  611. }
  612. protected:
  613. void _normalize_string( std::basic_string<char_type> & str );
  614. void _reset()
  615. {
  616. m_fuses_backrefs = false;
  617. m_flags = 0;
  618. }
  619. bool m_fuses_backrefs; // true if the substitution uses backrefs
  620. bool m_floop; // false if m_pfirst->domatch only needs to be called once
  621. size_t m_cgroups; // number of groups (always at least one)
  622. size_t m_cgroups_visible; // number of visible groups
  623. unsigned m_flags; // flags used to customize search/replace
  624. width_type m_nwidth; // width of the pattern
  625. std::basic_string<char_type> m_pat; // contains the unparsed pattern
  626. std::basic_string<char_type> m_subst; // contains the unparsed substitution
  627. std::list<subst_node> m_subst_list; // used to speed up substitution
  628. auto_sub_ptr<sub_expr<CI> > m_pfirst; // first subexpression in pattern
  629. };
  630. // --------------------------------------------------------------------------
  631. //
  632. // Class: basic_rpattern
  633. //
  634. // Description:
  635. //
  636. // Methods: basic_rpattern - ctor
  637. // basic_rpattern -
  638. // basic_rpattern -
  639. // init - for (re)initializing a pattern
  640. // init -
  641. // set_substitution - set the substitution string
  642. // set_flags - set the flags
  643. // register_intrinsic_charset - bind an escape sequence to a user-def'd charset
  644. // purge_intrinsic_charsets - delete all user-def'd charsets
  645. // _get_next_group_nbr - return a monotonically increasing id
  646. // _find_next_group - parse the next group of the pattern
  647. // _find_next - parse the next sub_expr of the pattern
  648. // _find_atom - parse the next atom of the pattern
  649. // _quantify - quantify the sub_expr
  650. // _common_init - perform some common initialization tasks
  651. // _parse_subst - parse the substitution string
  652. // _add_subst_backref - add a backref node to the subst list
  653. // _reset - reinitialize the pattern
  654. //
  655. // Members: s_charset_map - for maintaining user-defined charsets
  656. // m_invisible_groups - list of hidden groups to be numbered last
  657. //
  658. // Typedefs: syntax_type -
  659. //
  660. // History: 8/14/2000 - ericne - Created
  661. //
  662. // --------------------------------------------------------------------------
  663. template< typename CI, typename SY = perl_syntax<std::iterator_traits<CI>::value_type> >
  664. class basic_rpattern : public basic_rpattern_base<CI>
  665. {
  666. public:
  667. friend class match_charset<CI>;
  668. typedef SY syntax_type;
  669. basic_rpattern() throw();
  670. basic_rpattern( const std::basic_string<char_type> & pat, unsigned flags=0 ) throw(bad_regexpr,std::bad_alloc);
  671. basic_rpattern( const std::basic_string<char_type> & pat, const std::basic_string<char_type> & subst, unsigned flags=0 ) throw(bad_regexpr,std::bad_alloc);
  672. void init( const std::basic_string<char_type> & pat, unsigned flags=0 ) throw(bad_regexpr,std::bad_alloc);
  673. void init( const std::basic_string<char_type> & pat, const std::basic_string<char_type> & subst, unsigned flags=0 ) throw(bad_regexpr,std::bad_alloc);
  674. void set_substitution( const std::basic_string<char_type> & subst ) throw(bad_regexpr,std::bad_alloc);
  675. void set_flags( unsigned flags ) throw(bad_regexpr,std::bad_alloc);
  676. class charset_map
  677. {
  678. struct charsets
  679. {
  680. sub_expr<CI> * rgpcharsets[2];
  681. std::basic_string<char_type> str_charset;
  682. charsets() throw()
  683. {
  684. memset( rgpcharsets, 0, sizeof( rgpcharsets ) );
  685. }
  686. ~charsets() throw()
  687. {
  688. clean();
  689. }
  690. void clean() throw()
  691. {
  692. for( int i=0; i < (sizeof(rgpcharsets)/sizeof(*rgpcharsets)); ++i )
  693. delete_sub_expr( rgpcharsets[i] );
  694. }
  695. match_charset<CI> * get_charset( unsigned flags ) throw(bad_regexpr,std::bad_alloc)
  696. {
  697. push_new_handler pnh( &my_new_handler );
  698. // Since these charsets are only used while creating other charsets,
  699. // all flags besides NOCASE can safely be ignored here.
  700. bool index = ( NOCASE == ( NOCASE & flags ) );
  701. if( NULL == rgpcharsets[ index ] )
  702. {
  703. std::basic_string<char_type>::iterator istart = str_charset.begin();
  704. rgpcharsets[ index ] = create_charset_helper<CI,SY>::create_charset_aux( str_charset, ++istart, flags );
  705. }
  706. return (match_charset<CI>*) rgpcharsets[ index ];
  707. }
  708. };
  709. typedef std::map<char_type,charsets> map_type;
  710. std::auto_ptr<map_type> m_pmap;
  711. public:
  712. void put( char_type ch, const std::basic_string<char_type> & str ) throw(bad_regexpr,std::bad_alloc)
  713. {
  714. // These characters cannot be bound to a user-defined intrinsic character set
  715. static const char_type rgIllegal[] =
  716. {
  717. '0','1','2','3','4','5','6','7','8','9','A','Z','z','Q',
  718. 'b','B','d','D','f','n','r','s','S','t','v','w','W','E'
  719. };
  720. // So operator new throws bad_alloc on failure.
  721. push_new_handler pnh( &my_new_handler );
  722. if( std::char_traits<char_type>::find( rgIllegal, ARRAYSIZE( rgIllegal ), ch ) )
  723. throw bad_regexpr( "illegal character specified for intrinsic character set." );
  724. if( NULL == m_pmap.get() )
  725. m_pmap = auto_ptr<map_type>( new map_type );
  726. // creates an empty entry if one does not already exist
  727. charsets & chrsts = (*m_pmap)[ch];
  728. chrsts.clean();
  729. chrsts.str_charset = str;
  730. // Try compiling the character set once to make sure it is properly formed:
  731. (void) chrsts.get_charset( 0 );
  732. }
  733. match_charset<CI> * get( char_type ch, unsigned flags ) throw()
  734. {
  735. match_charset<CI> * pRet = NULL;
  736. if( NULL != m_pmap.get() )
  737. {
  738. try
  739. {
  740. push_new_handler pnh( &my_new_handler );
  741. map_type::iterator iter = m_pmap->find( ch );
  742. if( iter != m_pmap->end() )
  743. pRet = iter->second.get_charset( flags );
  744. }
  745. catch(std::bad_alloc) {}
  746. }
  747. return pRet;
  748. }
  749. void purge() throw()
  750. {
  751. if( NULL != m_pmap.get() )
  752. delete m_pmap.release();
  753. }
  754. };
  755. static void register_intrinsic_charset(
  756. char_type ch, const std::basic_string<char_type> & str ) throw(bad_regexpr,std::bad_alloc)
  757. {
  758. s_charset_map.put( ch, str );
  759. }
  760. static void purge_intrinsic_charsets() throw()
  761. {
  762. s_charset_map.purge();
  763. }
  764. protected:
  765. static charset_map s_charset_map;
  766. size_t _get_next_group_nbr()
  767. {
  768. return m_cgroups++;
  769. }
  770. match_group<CI> * _find_next_group( typename std::basic_string<char_type>::iterator & ipat,
  771. unsigned & flags,
  772. std::vector<match_group<CI>*> & rggroups );
  773. bool _find_next( typename std::basic_string<char_type>::iterator & ipat,
  774. match_group<CI> * pgroup, unsigned & flags,
  775. std::vector<match_group<CI>*> & rggroups );
  776. void _find_atom( typename std::basic_string<char_type>::iterator & ipat,
  777. match_group<CI> * pgroup, unsigned flags );
  778. void _quantify( auto_sub_ptr<sub_expr<CI> > & pnew,
  779. match_group<CI> * pnew_group,
  780. typename std::basic_string<char_type>::iterator & ipat );
  781. void _common_init( unsigned flags );
  782. void _parse_subst();
  783. void _add_subst_backref( subst_node & snode, size_t nbackref, size_t rstart );
  784. void _reset();
  785. std::list<match_group<CI>*> m_invisible_groups; // groups w/o backrefs
  786. };
  787. inline std::ostream & operator<<( std::ostream & sout,
  788. const basic_regexpr<char>::backref_type & br )
  789. {
  790. for( std::string::const_iterator ithis = br.first; ithis != br.second; ++ithis )
  791. sout.put( *ithis );
  792. return sout;
  793. }
  794. inline std::wostream & operator<<( std::wostream & sout,
  795. const basic_regexpr<wchar_t>::backref_type & br )
  796. {
  797. for( std::wstring::const_iterator ithis = br.first; ithis != br.second; ++ithis )
  798. sout.put( *ithis > UCHAR_MAX ? L'?' : *ithis );
  799. return sout;
  800. }
  801. typedef basic_regexpr<TCHAR> regexpr;
  802. typedef std::basic_string<TCHAR> tstring;
  803. typedef basic_rpattern<const TCHAR *,perl_syntax<TCHAR> > perl_rpattern_c;
  804. typedef basic_rpattern<const TCHAR *,posix_syntax<TCHAR> > posix_rpattern_c;
  805. typedef basic_rpattern<tstring::const_iterator,perl_syntax<TCHAR> > perl_rpattern;
  806. typedef basic_rpattern<tstring::const_iterator,posix_syntax<TCHAR> > posix_rpattern;
  807. typedef perl_rpattern rpattern; // matches against std::string
  808. typedef perl_rpattern_c rpattern_c; // matches against null-terminated, c-style strings
  809. #ifdef _MT
  810. //
  811. // Define some classes and macros for creating function-local
  812. // static const rpatterns in a thread-safe way
  813. //
  814. template< typename PAT >
  815. class rpattern_destroyer
  816. {
  817. const bool & m_fConstructed;
  818. const PAT & m_refPat;
  819. public:
  820. rpattern_destroyer( const bool & fConstructed, const PAT & refPat )
  821. : m_fConstructed( fConstructed ), m_refPat( refPat )
  822. {
  823. }
  824. ~rpattern_destroyer()
  825. {
  826. if( m_fConstructed )
  827. _Destroy( & m_refPat );
  828. }
  829. };
  830. class CRegExCritSect : private CRITICAL_SECTION
  831. {
  832. public:
  833. CRegExCritSect() { InitializeCriticalSection(this); }
  834. ~CRegExCritSect() { DeleteCriticalSection(this); }
  835. void Enter() { EnterCriticalSection(this); }
  836. void Leave() { LeaveCriticalSection(this); }
  837. };
  838. extern CRegExCritSect g_objRegExCritSect;
  839. class CRegExLock
  840. {
  841. public:
  842. CRegExLock() { g_objRegExCritSect.Enter(); }
  843. ~CRegExLock() { g_objRegExCritSect.Leave(); }
  844. };
  845. #define STATIC_RPATTERN_EX( type, var, params ) \
  846. static unsigned char s_rgb_##var[ sizeof type ]; \
  847. static bool s_f_##var = false; \
  848. static const type & var = *reinterpret_cast<type*>( s_rgb_##var ); \
  849. static const regex::rpattern_destroyer<type> s_des_##var( s_f_##var, var ); \
  850. if( ! s_f_##var ) \
  851. { \
  852. regex::CRegExLock objLock; \
  853. if( ! s_f_##var ) \
  854. { \
  855. new( s_rgb_##var ) type params; \
  856. s_f_##var = true; \
  857. } \
  858. }
  859. #else
  860. #define STATIC_RPATTERN_EX( type, var, params ) \
  861. static const type var params;
  862. #endif
  863. #define STATIC_RPATTERN( var, params ) \
  864. STATIC_RPATTERN_EX( regex::rpattern, var, params )
  865. } // namespace regex