Leaked source code of windows server 2003
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1067 lines
35 KiB

  1. //+---------------------------------------------------------------------------
  2. //
  3. // File: basic_regexpr.hxx
  4. //
  5. // Contents: classes for regular expression pattern matching a-la perl
  6. //
  7. // Classes: basic_rpattern, basic_regexpr
  8. //
  9. // Functions: basic_regexpr::match
  10. // basic_regexpr::substitute
  11. // basic_regexpr::cbackrefs
  12. // basic_regexpr::backref
  13. // basic_regexpr::all_backrefs
  14. // basic_regexpr::backref_str
  15. //
  16. // Coupling:
  17. //
  18. // History: 12-11-1998 ericne Created
  19. // 01-05-2001 ericne Removed dependency on VC's choice
  20. // of STL iterator types.
  21. //
  22. //----------------------------------------------------------------------------
  23. #pragma once
  24. // warning C4290: C++ Exception Specification ignored
  25. // warning C4786: identifier was truncated to '255' characters in the debug information
  26. #pragma warning( disable : 4290 4786 )
  27. #pragma warning( push )
  28. // warning C4511: copy constructor could not be generated
  29. // warning C4512: assignment operator could not be generated
  30. #pragma warning( disable : 4511 4512 )
  31. #ifdef _MT
  32. #include <windows.h> // for CRITICAL_SECTION
  33. #endif
  34. #include <string>
  35. #include <stdexcept>
  36. #include <vector>
  37. #include <list>
  38. #include <map>
  39. #include <iosfwd>
  40. #include <tchar.h>
  41. #include <new.h> // for _set_new_handler
  42. #include <crtdbg.h>
  43. #include "syntax.h"
  44. namespace regex
  45. {
  46. // Called when an allocation fails
  47. inline int __cdecl my_new_handler( size_t )
  48. {
  49. throw std::bad_alloc();
  50. }
  51. // For pushing and popping the new handler
  52. class push_new_handler
  53. {
  54. _PNH m_pnh;
  55. public:
  56. push_new_handler( _PNH pnh )
  57. {
  58. m_pnh = _set_new_handler( pnh );
  59. }
  60. ~push_new_handler()
  61. {
  62. (void)_set_new_handler( m_pnh );
  63. }
  64. };
  // Exception class of this library. The message is forwarded unchanged to
  // std::runtime_error and can be read back through what().
  class bad_regexpr : public std::runtime_error
  {
  public:
      // The parameter used to be called _S; identifiers that start with an
      // underscore followed by an upper-case letter are reserved for the
      // implementation, so it has been renamed.
      explicit bad_regexpr( const std::string & what_arg )
          : std::runtime_error( what_arg ) {}
      // throw() keeps the destructor as strict as the base-class destructor.
      virtual ~bad_regexpr() throw() {}
  };
  72. //
  73. // Flags to control how matching occurs
  74. //
  enum REGEX_FLAGS
  {
      // ignore case
      NOCASE        = 1 << 0,
      // match everywhere in the string
      GLOBAL        = 1 << 1,
      // ^ and $ can match internal line breaks
      MULTILINE     = 1 << 2,
      // . can match newline character
      SINGLELINE    = 1 << 3,
      // start matching at the right of the string
      RIGHTMOST     = 1 << 4,
      // only meaningful when used with GLOBAL and substitute
      NOBACKREFS    = 1 << 5,
      // only meaningful when used with GLOBAL
      FIRSTBACKREFS = 1 << 6,
      // only meaningful when used with GLOBAL
      ALLBACKREFS   = 1 << 7,
      // optimize pattern for use with null-terminated strings
      CSTRINGS      = 1 << 8,
      // Preprocess patterns: "\\n" => "\n", etc.
      NORMALIZE     = 1 << 9
  };
  // Templates that have to be nameable before (or without) their definitions.
  // Each one takes the iterator type CI as its only parameter.
  template< class CI > struct match_param;
  template< class CI > class  match_group;
  template< class CI > class  match_wrapper;
  template< class CI > class  match_charset;
  template< class CI > class  basic_rpattern_base;
  94. // --------------------------------------------------------------------------
  95. //
  96. // Class: width_type
  97. //
  98. // Description: represents the width of a sub-expression
  99. //
  100. // Methods: width_add - add two widths
  101. // width_mult - multiply two widths
  102. // width_type - ctor
  103. // width_type - ctor
  104. // operator= - assign a width
  105. // operator== - are widths equal
  106. // operator!= - are widths not equal
  107. // operator+ - add two widths
  108. // operator* - multiply two widths
  109. //
  110. // Members: m_min - smallest number of characters a sub-expr can span
  111. // m_max - largest number of characters a sub-expr can span
  112. //
  113. // History: 8/14/2000 - ericne - Created
  114. //
  115. // --------------------------------------------------------------------------
  // Width of a sub-expression as a [m_min, m_max] pair of character counts.
  // width_add and width_mult treat size_t(-1) as a sticky sentinel: if either
  // operand is size_t(-1) the result is size_t(-1). A finite sum or product
  // that does not fit in size_t now saturates to size_t(-1) as well, instead
  // of silently wrapping around to a small number.
  struct width_type
  {
      size_t m_min; // smallest number of characters a sub-expr can span
      size_t m_max; // largest number of characters a sub-expr can span

      // Sentinel-aware, saturating a + b.
      static size_t width_add( size_t a, size_t b )
      {
          const size_t sentinel = size_t(-1);
          if( sentinel == a || sentinel == b )
              return sentinel;
          // a + b would overflow exactly when a exceeds the room left above b
          return ( a > sentinel - b ) ? sentinel : a + b;
      }
      // Sentinel-aware, saturating a * b. As before, a sentinel operand wins
      // even when the other operand is 0.
      static size_t width_mult( size_t a, size_t b )
      {
          const size_t sentinel = size_t(-1);
          if( sentinel == a || sentinel == b )
              return sentinel;
          // a * b would overflow exactly when b exceeds sentinel / a
          if( 0 != a && b > sentinel / a )
              return sentinel;
          return a * b;
      }
      width_type( size_t _min = 0, size_t _max = size_t(-1) )
          : m_min(_min), m_max(_max)
      {
      }
      width_type( const width_type & that )
          : m_min(that.m_min), m_max(that.m_max)
      {
      }
      width_type & operator=( const width_type & that )
      {
          m_min = that.m_min;
          m_max = that.m_max;
          return *this;
      }
      bool operator==( const width_type & that ) const
      {
          return ( m_min == that.m_min && m_max == that.m_max );
      }
      bool operator!=( const width_type & that ) const
      {
          return ( m_min != that.m_min || m_max != that.m_max );
      }
      // Member-wise width_add of the two minimums and the two maximums.
      width_type operator+( const width_type & that ) const
      {
          return width_type( width_add( m_min, that.m_min ), width_add( m_max, that.m_max ) );
      }
      // Member-wise width_mult of the two minimums and the two maximums.
      width_type operator*( const width_type & that ) const
      {
          return width_type( width_mult( m_min, that.m_min ), width_mult( m_max, that.m_max ) );
      }
  };
  // min 0, max size_t(-1)
  const width_type worst_width(0,size_t(-1));
  // both members size_t(-1); basic_rpattern_base starts out with this width
  const width_type uninit_width(size_t(-1),size_t(-1));
  161. // --------------------------------------------------------------------------
  162. //
  163. // Class: sub_expr
  164. //
  165. // Description: patterns are "compiled" into a directed graph of sub_expr
  166. // structs. Matching is accomplished by traversing this graph.
  167. //
  168. // Methods: sub_expr - construct a sub_expr
  169. // _match_this - does this sub_expr match at the given location
  170. // _width_this - what is the width of this sub_expr
  171. // ~sub_expr - virt dtor so cleanup happens correctly
  172. // _delete - delete this node in the graph and all nodes linked
  173. // next - pointer to the next node in the graph
  174. // next - pointer to the next node in the graph
  175. // match_next - match the rest of the graph
  176. // domatch - match_this and match_next
  177. // is_assertion - true if this sub_expr is a zero-width assertion
  178. // get_width - find the width of the graph at this sub_expr
  179. //
  180. // Members: m_pnext - pointer to the next node in the graph
  181. //
  182. // History: 8/14/2000 - ericne - Created
  183. //
  184. // --------------------------------------------------------------------------
  185. template< typename CI >
  186. class sub_expr
  187. {
  188. sub_expr * m_pnext;
  189. protected:
  190. // Only derived classes and basic_rpattern can instantiate sub_expr's
  191. sub_expr( )
  192. : m_pnext(NULL)
  193. {
  194. }
  195. // match this object only
  196. virtual bool _match_this( match_param<CI> &, CI & ) const throw()
  197. {
  198. return true;
  199. }
  200. virtual width_type _width_this() throw() = 0;
  201. public:
  202. typedef typename std::iterator_traits<CI>::value_type char_type;
  203. friend class match_wrapper<CI>; // wrappers can access _match_this method
  204. virtual ~sub_expr() {}
  205. virtual void _delete()
  206. {
  207. if( m_pnext )
  208. m_pnext->_delete();
  209. delete this;
  210. }
  211. inline const sub_expr *const next() const { return m_pnext; }
  212. inline sub_expr * & next() { return m_pnext; }
  213. // Match all subsequent objects
  214. inline bool match_next( match_param<CI> & param, CI icur ) const throw()
  215. {
  216. return NULL == m_pnext || m_pnext->domatch( param, icur );
  217. }
  218. // Match this object and all subsequent objects
  219. // If domatch returns false, it must not change any internal state
  220. virtual bool domatch( match_param<CI> & param, CI icur ) const throw()
  221. {
  222. return ( _match_this(param,icur) && match_next(param,icur) );
  223. }
  224. virtual bool is_assertion() const throw()
  225. {
  226. return false;
  227. }
  228. width_type get_width() throw()
  229. {
  230. width_type this_width = _width_this();
  231. if( NULL == m_pnext )
  232. return this_width;
  233. width_type that_width = m_pnext->get_width();
  234. return ( this_width + that_width );
  235. }
  236. };
  237. template< typename CI >
  238. void delete_sub_expr( sub_expr<CI> * psub )
  239. {
  240. if( psub )
  241. psub->_delete();
  242. }
  243. template< typename CI, typename SY = perl_syntax<std::iterator_traits<CI>::value_type> >
  244. class create_charset_helper
  245. {
  246. public:
  247. typedef std::iterator_traits<CI>::value_type char_type;
  248. static sub_expr<CI> * create_charset_aux(
  249. std::basic_string<char_type> & str,
  250. std::basic_string<char_type>::iterator & icur,
  251. unsigned flags );
  252. };
  253. // --------------------------------------------------------------------------
  254. //
  255. // Class: auto_sub_ptr
  256. //
  257. // Description: Class for automatically cleaning up the structure associated
  258. // with a parsed pattern
  259. //
  260. // Methods: auto_sub_ptr - private copy ctor - not used
  261. // operator= - private assign operator - not used
  262. // operator T* - private implicit cast operator - not used
  263. // auto_sub_ptr - ctor
  264. // ~auto_sub_ptr - dtor, frees ptr
  265. // free_ptr - explicitly free pointer
  266. // release - relinquish ownership of ptr
  267. // operator= - take ownership of ptr
  268. // get - return ptr
  269. // get - return ptr
  270. // operator-> - method call through ptr
  271. // operator-> - method call through ptr
  272. //
  273. // Members: m_psub - sub_expr pointer
  274. //
  275. // History: 8/14/2000 - ericne - Created
  276. //
  277. // --------------------------------------------------------------------------
  // Owning pointer for a sub_expr structure: whatever it holds is handed to
  // delete_sub_expr when the owner is destroyed, explicitly freed, or given a
  // different pointer.
  template< typename T >
  class auto_sub_ptr
  {
      T * m_psub;
      // hide these methods
      // Copying is declared but never defined. The earlier empty bodies
      // would have left m_psub uninitialised had they ever been called.
      auto_sub_ptr( const auto_sub_ptr<T> & );
      auto_sub_ptr & operator=( const auto_sub_ptr<T> & );
      operator T*() const { return m_psub; }
  public:
      auto_sub_ptr( T * psub = NULL ) : m_psub( psub ) {}
      ~auto_sub_ptr()
      {
          free_ptr();
      }
      void free_ptr() // deallocate
      {
          delete_sub_expr( m_psub );
          m_psub = NULL;
      }
      T * release() // relinquish ownership, but don't deallocate
      {
          T * psub = m_psub;
          m_psub = NULL;
          return psub;
      }
      // Take ownership of psub, freeing what was held before. Assigning the
      // pointer that is already held is a no-op; previously that case freed
      // the object and then kept the dangling pointer.
      auto_sub_ptr<T> & operator=( T * psub )
      {
          if( psub != m_psub )
          {
              delete_sub_expr( m_psub );
              m_psub = psub;
          }
          return *this;
      }
      inline const T*const get() const { return m_psub; }
      inline T* & get() { return m_psub; }
      inline const T*const operator->() const { return m_psub; }
      inline T* operator->() { return m_psub; }
  };
  // A back-reference: the [first, second) iterator pair inherited from
  // std::pair plus a book-keeping field. Both iterators default to CI(0),
  // and the tag tests false while either of them still equals CI(0).
  template< typename CI >
  struct backref_tag : public std::pair<CI,CI>
  {
      backref_tag( CI i1 = CI(0), CI i2 = CI(0) )
          : std::pair<CI,CI>(i1,i2), reserved(0) {}
      // first/second live in a template-dependent base class, so they must be
      // reached through this-> to be found by name lookup.
      operator bool() const throw()
      {
          return this->first != CI(0) && this->second != CI(0);
      }
      bool operator!() const throw() { return ! this->operator bool(); }
      size_t reserved; // used for internal book-keeping
  };
  323. template< typename CH >
  324. backref_tag< const CH * > _static_match_helper(
  325. const CH * szstr,
  326. const basic_rpattern_base< const CH * > & pat,
  327. std::vector< backref_tag< const CH * > > * prgbackrefs ) throw();
  328. template< typename CH >
  329. size_t _static_count_helper(
  330. const CH * szstr,
  331. const basic_rpattern_base< const CH * > & pat ) throw();
  332. // --------------------------------------------------------------------------
  333. //
  334. // Class: basic_regexpr
  335. //
  336. // Description: string class that allows regular expression pattern matching
  337. //
  338. // Methods: basic_regexpr - ctor
  339. // match - static method for matching C-style strings
  340. // match - non-static method for matching C++-style strings
  341. // count - static method for counting matches in C-style strings
  342. // count - non-static method for counting matches in C++-style strings
  343. // substitute - perform substitutions in C++-style strings
  344. // cbackrefs - return the count of internally stored back-references
  345. // rstart - offset to start of n-th backref
  346. // rlength - length of n-th backref
  347. // backref - return the n-th backref
  348. // all_backrefs - return a vector of all saved backrefs
  349. // backref_str - return the string to which the backreferences refer
  350. //
  351. // Members: m_rgbackrefs - vector of backrefs
  352. // m_backref_str - temp string buffer
  353. // m_pbackref_str - pointer to the string containing the string to which
  354. // the backreferences refer (either *this or m_backref_str)
  355. //
  356. // Typedefs: backref_type -
  357. // backref_vector -
  358. //
  359. // History: 8/14/2000 - ericne - Created
  360. //
  361. // --------------------------------------------------------------------------
  362. template< typename CH, typename TR = std::char_traits<CH>, typename AL = std::allocator<CH> >
  363. class basic_regexpr : public std::basic_string<CH,TR,AL>
  364. {
  365. public:
  366. basic_regexpr( const allocator_type & a = allocator_type() )
  367. : std::basic_string<CH,TR,AL>( a ), m_pbackref_str( & m_backref_str ) {}
  368. basic_regexpr( const CH * p,
  369. const allocator_type & a = allocator_type() )
  370. : std::basic_string<CH,TR,AL>( p, a ), m_pbackref_str( & m_backref_str ) {}
  371. basic_regexpr( const CH * p, size_type n,
  372. const allocator_type & a = allocator_type() )
  373. : std::basic_string<CH,TR,AL>( p, n, a ), m_pbackref_str( & m_backref_str ) {}
  374. basic_regexpr( const std::basic_string<CH,TR,AL> & s, size_type pos = 0, size_type n = npos,
  375. const allocator_type & a = allocator_type() )
  376. : std::basic_string<CH,TR,AL>( s, pos, n, a ), m_pbackref_str( & m_backref_str ) {}
  377. basic_regexpr( size_type n, CH ch,
  378. const allocator_type & a = allocator_type() )
  379. : std::basic_string<CH,TR,AL>( n, ch, a ), m_pbackref_str( & m_backref_str ) {}
  380. basic_regexpr( const_iterator begin, const_iterator end,
  381. const allocator_type & a = allocator_type() )
  382. : std::basic_string<CH,TR,AL>( begin, end, a ), m_pbackref_str( & m_backref_str ) {}
  383. // actually stores iterators into *m_pbackref_str:
  384. typedef backref_tag<const_iterator> backref_type;
  385. typedef std::vector< backref_type > backref_vector;
  386. // stores pointers into the null-terminated C-stype string
  387. typedef backref_tag< const CH * > backref_type_c;
  388. typedef std::vector< backref_type_c > backref_vector_c;
  389. // returns $0, the first backref
  390. static backref_type_c match( const CH * szstr,
  391. const basic_rpattern_base< const CH * > & pat,
  392. backref_vector_c * prgbackrefs = NULL ) throw()
  393. {
  394. return _static_match_helper<CH>( szstr, pat, prgbackrefs );
  395. }
  396. // returns $0, the first backref
  397. backref_type match( const basic_rpattern_base< const_iterator > & pat,
  398. size_type pos = 0,
  399. size_type len = npos ) const throw();
  400. static size_t count( const CH * szstr,
  401. const basic_rpattern_base< const CH * > & pat ) throw()
  402. {
  403. return _static_count_helper<CH>( szstr, pat );
  404. }
  405. size_t count( const basic_rpattern_base< const_iterator > & pat,
  406. size_type pos = 0,
  407. size_type len = npos ) const throw();
  408. size_t substitute( const basic_rpattern_base< const_iterator > & pat,
  409. size_type pos = 0,
  410. size_type len = npos ) throw(std::bad_alloc);
  411. size_t cbackrefs() const throw()
  412. {
  413. return m_rgbackrefs.size();
  414. }
  415. size_type rstart( size_t cbackref = 0 ) const throw(std::out_of_range)
  416. {
  417. return std::distance( m_pbackref_str->begin(), m_rgbackrefs.at( cbackref ).first );
  418. }
  419. size_type rlength( size_t cbackref = 0 ) const throw(std::out_of_range)
  420. {
  421. return std::distance( m_rgbackrefs.at( cbackref ).first, m_rgbackrefs.at( cbackref ).second );
  422. }
  423. backref_type backref( size_t cbackref ) const throw(std::out_of_range)
  424. {
  425. return m_rgbackrefs.at( cbackref );
  426. }
  427. const backref_vector & all_backrefs() const throw()
  428. {
  429. return m_rgbackrefs;
  430. }
  431. const std::basic_string<CH,TR,AL> & backref_str() const throw()
  432. {
  433. return *m_pbackref_str;
  434. }
  435. protected:
  436. // save information about the backrefs
  437. // mutable because these can change in the "const" match() method.
  438. mutable backref_vector m_rgbackrefs;
  439. mutable std::basic_string<CH,TR,AL> m_backref_str;
  440. mutable const std::basic_string<CH,TR,AL> * m_pbackref_str;
  441. };
  442. // --------------------------------------------------------------------------
  443. //
  444. // Class: match_param
  445. //
  446. // Description: Struct that contains the state of the matching operation.
  447. // Passed by reference to all domatch and _match_this routines.
  448. //
  449. // Methods: match_param - ctor
  450. // match_param - ctor
  451. //
  452. // Members: ibegin - start of the string
  453. // istart - start of this iteration
  454. // istop - end of the string
  455. // prgbackrefs - pointer to backref array
  456. //
  457. // History: 8/14/2000 - ericne - Created
  458. //
  459. // --------------------------------------------------------------------------
  460. template< typename CI >
  461. struct match_param
  462. {
  463. CI ibegin;
  464. CI istart;
  465. CI istop;
  466. std::vector< backref_tag< CI > > * prgbackrefs;
  467. match_param( CI _istart,
  468. CI _istop,
  469. std::vector< backref_tag< CI > > * _prgbackrefs )
  470. : ibegin(_istart),
  471. istart(_istart),
  472. istop(_istop),
  473. prgbackrefs(_prgbackrefs)
  474. {
  475. }
  476. match_param( CI _ibegin,
  477. CI _istart,
  478. CI _istop,
  479. std::vector< backref_tag< CI > > * _prgbackrefs )
  480. : ibegin(_ibegin),
  481. istart(_istart),
  482. istop(_istop),
  483. prgbackrefs(_prgbackrefs)
  484. {
  485. }
  486. };
  487. // --------------------------------------------------------------------------
  488. //
  489. // Class: subst_node
  490. //
  491. // Description: Substitution strings are parsed into an array of these
  492. // structures in order to speed up subst operations.
  493. //
  494. // Members: stype - type of this struct
  495. // subst_string - do a string substitution
  496. // subst_backref - do a backref substitution
  497. // op - execute an operation
  498. //
  499. // History: 8/14/2000 - ericne - Created
  500. //
  501. // --------------------------------------------------------------------------
  502. struct subst_node
  503. {
  504. enum subst_type { SUBST_STRING, SUBST_BACKREF, SUBST_OP };
  505. enum { PREMATCH = -1, POSTMATCH = -2 };
  506. enum op_type { UPPER_ON = SUBST_UPPER_ON,
  507. UPPER_NEXT = SUBST_UPPER_NEXT,
  508. LOWER_ON = SUBST_LOWER_ON,
  509. LOWER_NEXT = SUBST_LOWER_NEXT,
  510. ALL_OFF = SUBST_ALL_OFF };
  511. subst_type stype;
  512. union
  513. {
  514. struct
  515. {
  516. size_t rstart;
  517. size_t rlength;
  518. } subst_string;
  519. size_t subst_backref;
  520. op_type op;
  521. };
  522. };
  523. // --------------------------------------------------------------------------
  524. //
  525. // Class: basic_rpattern_base
  526. //
  527. // Description:
  528. //
  529. // Methods: basic_rpattern_base - ctor
  530. // flags - get the state of the flags
  531. // uses_backrefs - true if the backrefs are referenced
  532. // get_first_subexpression - return ptr to first sub_expr struct
  533. // get_width - get min/max nbr chars this pattern can match
  534. // loops - if false, we only need to try to match at 1st position
  535. // cgroups - number of visible groups
  536. // _cgroups_total - total number of groups, including hidden (?:) groups
  537. // get_pat - get string representing the pattern
  538. // get_subst - get string representing the substitution string
  539. // get_subst_list - get the list of subst nodes
  540. // _normalize_string - perform character escaping
  541. // _reset - reinitialize the pattern
  542. //
  543. // Members: m_fuses_backrefs -
  544. // m_floop -
  545. // m_cgroups -
  546. // m_cgroups_visible -
  547. // m_flags -
  548. // m_nwidth -
  549. // m_pat -
  550. // m_subst -
  551. // m_subst_list -
  552. // m_pfirst -
  553. //
  554. // Typedefs: char_type -
  555. //
  556. // History: 8/14/2000 - ericne - Created
  557. //
  558. // --------------------------------------------------------------------------
  559. template< typename CI >
  560. class basic_rpattern_base
  561. {
  562. public:
  563. typedef std::iterator_traits<CI>::value_type char_type;
  564. basic_rpattern_base( unsigned flags = 0,
  565. const std::basic_string<char_type> & pat = std::basic_string<char_type>(),
  566. const std::basic_string<char_type> & subst = std::basic_string<char_type>() ) throw()
  567. : m_fuses_backrefs( false ),
  568. m_floop( true ),
  569. m_cgroups( 0 ),
  570. m_cgroups_visible( 0 ),
  571. m_flags( flags ),
  572. m_nwidth( uninit_width ),
  573. m_pat( pat ),
  574. m_subst( subst ),
  575. m_pfirst( NULL )
  576. {
  577. }
  578. unsigned flags() const throw()
  579. {
  580. return m_flags;
  581. }
  582. bool uses_backrefs() const throw()
  583. {
  584. return m_fuses_backrefs;
  585. }
  586. const sub_expr<CI> * get_first_subexpression() const throw()
  587. {
  588. return m_pfirst.get();
  589. }
  590. width_type get_width() const throw()
  591. {
  592. return m_nwidth;
  593. }
  594. bool loops() const throw()
  595. {
  596. return m_floop;
  597. }
  598. size_t cgroups() const throw()
  599. {
  600. return m_cgroups_visible;
  601. }
  602. size_t _cgroups_total() const throw()
  603. {
  604. return m_cgroups;
  605. }
  606. const std::basic_string<char_type> & get_pat() const throw()
  607. {
  608. return m_pat;
  609. }
  610. const std::basic_string<char_type> & get_subst() const throw()
  611. {
  612. return m_subst;
  613. }
  614. const std::list<subst_node> & get_subst_list() const throw()
  615. {
  616. return m_subst_list;
  617. }
  618. protected:
  619. void _normalize_string( std::basic_string<char_type> & str );
  620. void _reset() throw()
  621. {
  622. m_fuses_backrefs = false;
  623. m_flags = 0;
  624. }
  625. bool m_fuses_backrefs; // true if the substitution uses backrefs
  626. bool m_floop; // false if m_pfirst->domatch only needs to be called once
  627. size_t m_cgroups; // number of groups (always at least one)
  628. size_t m_cgroups_visible; // number of visible groups
  629. unsigned m_flags; // flags used to customize search/replace
  630. width_type m_nwidth; // width of the pattern
  631. std::basic_string<char_type> m_pat; // contains the unparsed pattern
  632. std::basic_string<char_type> m_subst; // contains the unparsed substitution
  633. std::list<subst_node> m_subst_list; // used to speed up substitution
  634. auto_sub_ptr<sub_expr<CI> > m_pfirst; // first subexpression in pattern
  635. };
  636. // --------------------------------------------------------------------------
  637. //
  638. // Class: basic_rpattern
  639. //
  640. // Description:
  641. //
  642. // Methods: basic_rpattern - ctor
  643. // basic_rpattern -
  644. // basic_rpattern -
  645. // init - for (re)initializing a pattern
  646. // init -
  647. // set_substitution - set the substitution string
  648. // set_flags - set the flags
  649. // register_intrinsic_charset - bind an escape sequence to a user-def'd charset
  650. // purge_intrinsic_charsets - delete all user-def'd charsets
  651. // _get_next_group_nbr - return a monotonically increasing id
  652. // _find_next_group - parse the next group of the pattern
  653. // _find_next - parse the next sub_expr of the pattern
  654. // _find_atom - parse the next atom of the pattern
  655. // _quantify - quantify the sub_expr
  656. // _common_init - perform some common initialization tasks
  657. // _parse_subst - parse the substitution string
  658. // _add_subst_backref - add a backref node to the subst list
  659. // _reset - reinitialize the pattern
  660. //
  661. // Members: s_charset_map - for maintaining user-defined charsets
  662. // m_invisible_groups - list of hidden groups to be numbered last
  663. //
  664. // Typedefs: syntax_type -
  665. //
  666. // History: 8/14/2000 - ericne - Created
  667. //
  668. // --------------------------------------------------------------------------
  669. template< typename CI, typename SY = perl_syntax<std::iterator_traits<CI>::value_type> >
  670. class basic_rpattern : public basic_rpattern_base<CI>
  671. {
  672. public:
  673. friend class match_charset<CI>;
  674. typedef SY syntax_type;
  675. basic_rpattern() throw();
  676. basic_rpattern( const std::basic_string<char_type> & pat, unsigned flags=0 ) throw(bad_regexpr,std::bad_alloc);
  677. basic_rpattern( const std::basic_string<char_type> & pat, const std::basic_string<char_type> & subst, unsigned flags=0 ) throw(bad_regexpr,std::bad_alloc);
  678. void init( const std::basic_string<char_type> & pat, unsigned flags=0 ) throw(bad_regexpr,std::bad_alloc);
  679. void init( const std::basic_string<char_type> & pat, const std::basic_string<char_type> & subst, unsigned flags=0 ) throw(bad_regexpr,std::bad_alloc);
  680. void set_substitution( const std::basic_string<char_type> & subst ) throw(bad_regexpr,std::bad_alloc);
  681. void set_flags( unsigned flags ) throw(bad_regexpr,std::bad_alloc);
  682. class charset_map
  683. {
  684. struct charsets
  685. {
  686. sub_expr<CI> * rgpcharsets[2];
  687. std::basic_string<char_type> str_charset;
  688. charsets() throw()
  689. {
  690. memset( rgpcharsets, 0, sizeof( rgpcharsets ) );
  691. }
  692. ~charsets() throw()
  693. {
  694. clean();
  695. }
  696. void clean() throw()
  697. {
  698. for( int i=0; i < (sizeof(rgpcharsets)/sizeof(*rgpcharsets)); ++i )
  699. {
  700. delete_sub_expr( rgpcharsets[i] );
  701. rgpcharsets[i] = NULL;
  702. }
  703. }
  704. match_charset<CI> * get_charset( unsigned flags ) throw(bad_regexpr,std::bad_alloc)
  705. {
  706. push_new_handler pnh( &my_new_handler );
  707. // Since these charsets are only used while creating other charsets,
  708. // all flags besides NOCASE can safely be ignored here.
  709. bool index = ( NOCASE == ( NOCASE & flags ) );
  710. if( NULL == rgpcharsets[ index ] )
  711. {
  712. std::basic_string<char_type>::iterator istart = str_charset.begin();
  713. rgpcharsets[ index ] = create_charset_helper<CI,SY>::create_charset_aux( str_charset, ++istart, flags );
  714. }
  715. return (match_charset<CI>*) rgpcharsets[ index ];
  716. }
  717. };
  718. typedef std::map<char_type,charsets> map_type;
  719. std::auto_ptr<map_type> m_pmap;
  720. public:
  721. void put( char_type ch, const std::basic_string<char_type> & str ) throw(bad_regexpr,std::bad_alloc)
  722. {
  723. // These characters cannot be bound to a user-defined intrinsic character set
  724. static const char_type rgIllegal[] =
  725. {
  726. '0','1','2','3','4','5','6','7','8','9','A','Z','z','Q',
  727. 'b','B','d','D','f','n','r','s','S','t','v','w','W','E'
  728. };
  729. // So operator new throws bad_alloc on failure.
  730. push_new_handler pnh( &my_new_handler );
  731. if( std::char_traits<char_type>::find( rgIllegal, ARRAYSIZE( rgIllegal ), ch ) )
  732. throw bad_regexpr( "illegal character specified for intrinsic character set." );
  733. if( NULL == m_pmap.get() )
  734. m_pmap = std::auto_ptr<map_type>( new map_type );
  735. // creates an empty entry if one does not already exist
  736. charsets & chrsts = (*m_pmap)[ch];
  737. chrsts.clean();
  738. chrsts.str_charset = str;
  739. // Try compiling the character set once to make sure it is properly formed:
  740. (void) chrsts.get_charset( 0 );
  741. }
  742. match_charset<CI> * get( char_type ch, unsigned flags ) throw()
  743. {
  744. match_charset<CI> * pRet = NULL;
  745. if( NULL != m_pmap.get() )
  746. {
  747. try
  748. {
  749. push_new_handler pnh( &my_new_handler );
  750. map_type::iterator iter = m_pmap->find( ch );
  751. if( iter != m_pmap->end() )
  752. pRet = iter->second.get_charset( flags );
  753. }
  754. catch(...) {}
  755. }
  756. return pRet;
  757. }
  758. void purge() throw()
  759. {
  760. if( NULL != m_pmap.get() )
  761. delete m_pmap.release();
  762. }
  763. };
  764. static void register_intrinsic_charset(
  765. char_type ch, const std::basic_string<char_type> & str ) throw(bad_regexpr,std::bad_alloc)
  766. {
  767. s_charset_map.put( ch, str );
  768. }
  769. static void purge_intrinsic_charsets() throw()
  770. {
  771. s_charset_map.purge();
  772. }
  773. protected:
  774. static charset_map s_charset_map;
  775. size_t _get_next_group_nbr()
  776. {
  777. return m_cgroups++;
  778. }
  779. match_group<CI> * _find_next_group( std::basic_string<char_type>::iterator & ipat,
  780. unsigned & flags,
  781. std::vector<match_group<CI>*> & rggroups );
  782. bool _find_next( std::basic_string<char_type>::iterator & ipat,
  783. match_group<CI> * pgroup, unsigned & flags,
  784. std::vector<match_group<CI>*> & rggroups );
  785. void _find_atom( std::basic_string<char_type>::iterator & ipat,
  786. match_group<CI> * pgroup, unsigned flags );
  787. void _quantify( auto_sub_ptr<sub_expr<CI> > & pnew,
  788. match_group<CI> * pnew_group,
  789. std::basic_string<char_type>::iterator & ipat );
  790. void _common_init( unsigned flags );
  791. void _parse_subst();
  792. void _add_subst_backref( subst_node & snode, size_t nbackref, size_t rstart );
  793. void _reset() throw();
  794. std::list<match_group<CI>*> m_invisible_groups; // groups w/o backrefs
  795. };
  // Replaces, in place, the two-character escape sequences \f \n \r \t \v
  // and \\ with the single character they denote. Any other backslash
  // sequence is left untouched (both characters are skipped), and a
  // backslash that is the last character of the string is left alone.
  //
  // str - string to rewrite in place
  template< typename CH, typename TR, typename AL >
  void process_escapes( std::basic_string<CH,TR,AL> & str )
  {
      // npos is now reached through a std-qualified name; the unqualified
      // 'basic_string' used before is not visible from this namespace.
      typedef std::basic_string<CH,TR,AL> string_type;

      typename string_type::size_type i = 0;
      while( string_type::npos != ( i = str.find( CH('\\'), i ) ) )
      {
          // A trailing backslash has nothing to escape.
          if( str.size() - 1 == i )
              return;

          CH replacement = CH();
          switch( str[i+1] )
          {
          case CH('f'):  replacement = CH('\f'); break;
          case CH('n'):  replacement = CH('\n'); break;
          case CH('r'):  replacement = CH('\r'); break;
          case CH('t'):  replacement = CH('\t'); break;
          case CH('v'):  replacement = CH('\v'); break;
          case CH('\\'): replacement = CH('\\'); break;
          default:
              // Unknown escape: keep both characters and search after them.
              i += 2;
              continue;
          }

          // Collapse the two-character sequence into one character and
          // resume the search just behind it.
          str.replace( i, 2, 1, replacement );
          ++i;
      }
  }
  833. inline std::ostream & operator<<( std::ostream & sout,
  834. const basic_regexpr<char>::backref_type & br )
  835. {
  836. for( std::string::const_iterator ithis = br.first; ithis != br.second; ++ithis )
  837. sout.put( *ithis );
  838. return sout;
  839. }
  840. inline std::wostream & operator<<( std::wostream & sout,
  841. const basic_regexpr<wchar_t>::backref_type & br )
  842. {
  843. for( std::wstring::const_iterator ithis = br.first; ithis != br.second; ++ithis )
  844. sout.put( *ithis > UCHAR_MAX ? L'?' : *ithis );
  845. return sout;
  846. }
  847. typedef basic_regexpr<TCHAR> regexpr;
  848. typedef std::basic_string<TCHAR> tstring;
  849. typedef basic_rpattern<const TCHAR *,perl_syntax<TCHAR> > perl_rpattern_c;
  850. typedef basic_rpattern<const TCHAR *,posix_syntax<TCHAR> > posix_rpattern_c;
  851. typedef basic_rpattern<tstring::const_iterator,perl_syntax<TCHAR> > perl_rpattern;
  852. typedef basic_rpattern<tstring::const_iterator,posix_syntax<TCHAR> > posix_rpattern;
  853. typedef perl_rpattern rpattern; // matches against std::string
  854. typedef perl_rpattern_c rpattern_c; // matches against null-terminated, c-style strings
  855. #ifdef _MT
  856. //
  857. // Define some classes and macros for creating function-local
  858. // static const rpatterns in a thread-safe way
  859. //
  // Runs the destructor of a PAT object when this helper is destroyed, but
  // only if the referenced flag is true at that moment. The helper stores
  // references only; it owns neither the flag nor the storage of the object.
  template< typename PAT >
  class rpattern_destroyer
  {
      const bool & m_fConstructed; // read in the destructor, not at construction
      const PAT & m_refPat;        // object whose destructor may be run
  public:
      rpattern_destroyer( const bool & fConstructed, const PAT & refPat )
          : m_fConstructed( fConstructed ), m_refPat( refPat )
      {
      }
      ~rpattern_destroyer()
      {
          // Explicit destructor call instead of the library-internal
          // _Destroy helper, which is not a documented function and was
          // called unqualified.
          if( m_fConstructed )
              m_refPat.~PAT();
      }
  };
  876. class CRegExCritSect : private CRITICAL_SECTION
  877. {
  878. public:
  879. CRegExCritSect() { InitializeCriticalSection(this); }
  880. ~CRegExCritSect() { DeleteCriticalSection(this); }
  881. void Enter() { EnterCriticalSection(this); }
  882. void Leave() { LeaveCriticalSection(this); }
  883. };
  884. extern CRegExCritSect g_objRegExCritSect;
  885. class CRegExLock
  886. {
  887. public:
  888. CRegExLock() { g_objRegExCritSect.Enter(); }
  889. ~CRegExLock() { g_objRegExCritSect.Leave(); }
  890. };
  891. #define STATIC_RPATTERN_EX( type, var, params ) \
  892. static unsigned char s_rgb_##var[ sizeof type ]; \
  893. static bool s_f_##var = false; \
  894. static const type & var = *reinterpret_cast<type*>( s_rgb_##var ); \
  895. static const regex::rpattern_destroyer<type> s_des_##var( s_f_##var, var ); \
  896. if( ! s_f_##var ) \
  897. { \
  898. regex::CRegExLock objLock; \
  899. if( ! s_f_##var ) \
  900. { \
  901. new( s_rgb_##var ) type params; \
  902. s_f_##var = true; \
  903. } \
  904. }
  905. #else
  906. #define STATIC_RPATTERN_EX( type, var, params ) \
  907. static const type var params;
  908. #endif
  909. #define STATIC_RPATTERN( var, params ) \
  910. STATIC_RPATTERN_EX( regex::rpattern, var, params )
  911. } // namespace regex
  912. #pragma warning( pop )