Leaked source code of windows server 2003
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

613 lines
16 KiB

  1. //+---------------------------------------------------------------------------
  2. //
  3. // File: syntax.h
  4. //
  5. // Contents: syntax modules for regexpr
  6. //
  7. // Classes: perl_syntax, posix_syntax
  8. //
  9. // History: 3-29-00 ericne Created
  10. //
  11. //----------------------------------------------------------------------------
  12. #pragma once
  13. #pragma warning(push)
  14. // warning C4511: copy constructor could not be generated
  15. // warning C4512: assignment operator could not be generated
  16. #pragma warning( disable : 4511 4512 )
  17. #include <string>
  18. #include <cwchar>
  19. #include <iterator>
  20. #ifndef ARRAYSIZE
  21. #define ARRAYSIZE(x) (sizeof(x)/sizeof(*(x)))
  22. #endif
  23. #ifndef UCHAR_MAX
  24. #define UCHAR_MAX 0xff
  25. #endif
  26. #ifndef WCHAR_MAX
  27. #define WCHAR_MAX ((wchar_t)-1)
  28. #endif
  // Explicit iterator_traits specialization for `const char *`.
  //
  // A conforming standard library already supplies a partial specialization
  // for `const T *`, and adding specializations of std templates for built-in
  // types is undefined behaviour. The specialization is therefore only emitted
  // for Visual C++ releases before 7.1.
  // NOTE(review): assumes those are the only toolchains this header targets
  // that lack partial template specialization -- confirm against the build.
  //
  // `pointer` and `reference` are const-qualified so they agree with what
  // dereferencing a `const char *` actually yields.
  #if defined(_MSC_VER) && _MSC_VER < 1310
  template<>
  struct std::iterator_traits< const char * >
  {
      typedef random_access_iterator_tag iterator_category;
      typedef char                       value_type;
      typedef ptrdiff_t                  difference_type;
      typedef difference_type            distance_type; // retained
      typedef const char *               pointer;
      typedef const char &               reference;
  };
  #endif
  // Explicit iterator_traits specialization for `const wchar_t *`.
  //
  // A conforming standard library already supplies a partial specialization
  // for `const T *`, and adding specializations of std templates for built-in
  // types is undefined behaviour. The specialization is therefore only emitted
  // for Visual C++ releases before 7.1.
  // NOTE(review): assumes those are the only toolchains this header targets
  // that lack partial template specialization -- confirm against the build.
  //
  // `pointer` and `reference` are const-qualified so they agree with what
  // dereferencing a `const wchar_t *` actually yields.
  #if defined(_MSC_VER) && _MSC_VER < 1310
  template<>
  struct std::iterator_traits< const wchar_t * >
  {
      typedef random_access_iterator_tag iterator_category;
      typedef wchar_t                    value_type;
      typedef ptrdiff_t                  difference_type;
      typedef difference_type            distance_type; // retained
      typedef const wchar_t *            pointer;
      typedef const wchar_t &            reference;
  };
  #endif
  49. namespace regex
  50. {
  51. //
  52. // The following are the tokens that can be emitted by the syntax module.
  53. // Don't reorder this list!!!
  54. //
  // Identifiers returned by the *_token() scanners of the syntax classes.
  // Values are implicit and consecutive, starting from NO_TOKEN == 0. The
  // charset scanners compute TOKEN(CHARSET_ALNUM + i), so the relative order
  // of the enumerators must be preserved.
  enum TOKEN
  {
      NO_TOKEN = 0,               // scanner recognized nothing at the current position

      // ---- regular tokens -------------------------------------------------
      BEGIN_GROUP,
      END_GROUP,
      ALTERNATION,
      BEGIN_LINE,
      END_LINE,
      BEGIN_CHARSET,
      MATCH_ANY,
      ESCAPE,

      // ---- quantification tokens (the *_MIN forms are the quantifier + '?') -
      ONE_OR_MORE,
      ZERO_OR_MORE,
      ZERO_OR_ONE,
      ONE_OR_MORE_MIN,
      ZERO_OR_MORE_MIN,
      ZERO_OR_ONE_MIN,
      BEGIN_RANGE,
      RANGE_SEPARATOR,
      END_RANGE,
      END_RANGE_MIN,

      // ---- escape sequences -----------------------------------------------
      ESC_DIGIT,
      ESC_NOT_DIGIT,
      ESC_SPACE,
      ESC_NOT_SPACE,
      ESC_WORD,
      ESC_NOT_WORD,
      ESC_BEGIN_STRING,
      ESC_END_STRING,
      ESC_END_STRING_z,
      ESC_WORD_BOUNDARY,
      ESC_NOT_WORD_BOUNDARY,
      ESC_WORD_START,
      ESC_WORD_STOP,
      ESC_QUOTE_META_ON,
      ESC_QUOTE_META_OFF,

      // ---- substitution tokens --------------------------------------------
      SUBST_BACKREF,
      SUBST_PREMATCH,
      SUBST_POSTMATCH,
      SUBST_MATCH,
      SUBST_ESCAPE,
      SUBST_QUOTE_META_ON,
      SUBST_UPPER_ON,
      SUBST_UPPER_NEXT,
      SUBST_LOWER_ON,
      SUBST_LOWER_NEXT,
      SUBST_ALL_OFF,

      // ---- charset tokens -------------------------------------------------
      CHARSET_NEGATE,
      CHARSET_ESCAPE,
      CHARSET_RANGE,
      CHARSET_BACKSPACE,
      CHARSET_END,
      // The named classes below are addressed as CHARSET_ALNUM + index into
      // g_rgposix_charsets by the charset scanners.
      CHARSET_ALNUM,
      CHARSET_ALPHA,
      CHARSET_BLANK,
      CHARSET_CNTRL,
      CHARSET_DIGIT,
      CHARSET_GRAPH,
      CHARSET_LOWER,
      CHARSET_PRINT,
      CHARSET_PUNCT,
      CHARSET_SPACE,
      CHARSET_UPPER,
      CHARSET_XDIGIT,

      // ---- extension tokens (group bodies that start with '?') -------------
      EXT_NOBACKREF,
      EXT_POS_LOOKAHEAD,
      EXT_NEG_LOOKAHEAD,
      EXT_POS_LOOKBEHIND,
      EXT_NEG_LOOKBEHIND,
      EXT_INDEPENDENT,
      EXT_UNKNOWN                 // '?' seen but no known extension followed
  };
  // One entry of the named character-class table: the text that spells the
  // class and a character count. The charset scanners advance the pattern
  // iterator by `cchars` after `szcharset` has matched.
  struct posix_charset_type
  {
      const char * const szcharset;   // NUL-terminated text matched by is_posix_charset()
      const size_t       cchars;      // distance to skip once szcharset has matched

      posix_charset_type( const char * const name, const size_t length )
          : szcharset( name )
          , cchars( length )
      {
      }
  };

  // The table itself and its number of entries; both are defined outside this header.
  extern const posix_charset_type g_rgposix_charsets[];
  extern const size_t             g_cposix_charsets;
  // Reports whether the range [icur, iend) begins with the NUL-terminated
  // text `szcharset`. An empty `szcharset` matches any range, including an
  // empty one. The caller's iterators are taken by value and not modified.
  template< typename const_iterator >
  bool is_posix_charset( const_iterator icur, const_iterator iend, const char * szcharset )
  {
      while( '\0' != *szcharset )
      {
          // Running out of input before the name is finished is a mismatch.
          const bool fexhausted = !( icur != iend );
          if( fexhausted || *icur != *szcharset )
              return false;

          ++icur;
          ++szcharset;
      }

      // Every character of the name was found.
      return true;
  }
  152. //
  153. // The perl_syntax class encapsulates the Perl 5 regular expression syntax. It is
  154. // used as a template parameter to basic_rpattern. To customize regex syntax, create
  155. // your own syntax class and use it as a template parameter instead.
  156. //
  157. class perl_syntax_base
  158. {
  159. protected:
  160. static TOKEN s_rgreg[ UCHAR_MAX + 1 ];
  161. static TOKEN s_rgescape[ UCHAR_MAX + 1 ];
  162. struct init_perl_syntax;
  163. friend struct init_perl_syntax;
  164. static struct init_perl_syntax
  165. {
  166. init_perl_syntax();
  167. } s_init_perl_syntax;
  168. static inline TOKEN look_up( char ch, TOKEN rg[] ) { return rg[ (unsigned char)ch ]; }
  169. static inline TOKEN look_up( wchar_t ch, TOKEN rg[] ) { return UCHAR_MAX < ch ? NO_TOKEN : rg[ (unsigned char)ch ]; }
  170. };
  171. template< typename CH >
  172. class perl_syntax : protected perl_syntax_base
  173. {
  174. public:
  175. typedef std::basic_string<CH>::iterator iterator;
  176. typedef std::basic_string<CH>::const_iterator const_iterator;
  177. typedef CH char_type;
  178. private:
  179. static bool min_quant( iterator & icur, const_iterator iend )
  180. {
  181. return ( (const_iterator)++icur != iend && CH('?') == *icur ? (++icur,true) : false );
  182. }
  183. public:
  184. static TOKEN reg_token( iterator & icur, const_iterator iend )
  185. {
  186. assert( (const_iterator)icur != iend );
  187. TOKEN tok = look_up( *icur, s_rgreg );
  188. if( tok )
  189. ++icur;
  190. if( ESCAPE == tok && (const_iterator)icur != iend )
  191. {
  192. tok = look_up( *icur, s_rgescape );
  193. if( tok )
  194. ++icur;
  195. else
  196. tok = ESCAPE;
  197. }
  198. return tok;
  199. }
  200. static TOKEN quant_token( iterator & icur, const_iterator iend )
  201. {
  202. assert( (const_iterator)icur != iend );
  203. TOKEN tok = NO_TOKEN;
  204. switch( *icur )
  205. {
  206. case CH('*'):
  207. tok = min_quant( icur, iend ) ? ZERO_OR_MORE_MIN : ZERO_OR_MORE;
  208. break;
  209. case CH('+'):
  210. tok = min_quant( icur, iend ) ? ONE_OR_MORE_MIN : ONE_OR_MORE;
  211. break;
  212. case CH('?'):
  213. tok = min_quant( icur, iend ) ? ZERO_OR_ONE_MIN : ZERO_OR_ONE;
  214. break;
  215. case CH('}'):
  216. tok = min_quant( icur, iend ) ? END_RANGE_MIN : END_RANGE;
  217. break;
  218. case CH('{'):
  219. tok = BEGIN_RANGE;
  220. ++icur;
  221. break;
  222. case CH(','):
  223. tok = RANGE_SEPARATOR;
  224. ++icur;
  225. break;
  226. }
  227. return tok;
  228. }
  229. static TOKEN charset_token( iterator & icur, const_iterator iend )
  230. {
  231. assert( (const_iterator)icur != iend );
  232. TOKEN tok = NO_TOKEN;
  233. switch( *icur )
  234. {
  235. case CH('-'):
  236. tok = CHARSET_RANGE;
  237. ++icur;
  238. break;
  239. case CH('^'):
  240. tok = CHARSET_NEGATE;
  241. ++icur;
  242. break;
  243. case CH(']'):
  244. tok = CHARSET_END;
  245. ++icur;
  246. break;
  247. case CH('\\'):
  248. tok = CHARSET_ESCAPE;
  249. if( (const_iterator)++icur == iend )
  250. break;
  251. switch( *icur )
  252. {
  253. case CH('b'):
  254. tok = CHARSET_BACKSPACE;
  255. ++icur;
  256. break;
  257. case CH('d'):
  258. tok = ESC_DIGIT;
  259. ++icur;
  260. break;
  261. case CH('D'):
  262. tok = ESC_NOT_DIGIT;
  263. ++icur;
  264. break;
  265. case CH('s'):
  266. tok = ESC_SPACE;
  267. ++icur;
  268. break;
  269. case CH('S'):
  270. tok = ESC_NOT_SPACE;
  271. ++icur;
  272. break;
  273. case CH('w'):
  274. tok = ESC_WORD;
  275. ++icur;
  276. break;
  277. case CH('W'):
  278. tok = ESC_NOT_WORD;
  279. ++icur;
  280. break;
  281. }
  282. break;
  283. case CH('['):
  284. for( size_t i=0; !tok && i < g_cposix_charsets; ++i )
  285. {
  286. if( is_posix_charset<const_iterator>( icur, iend, g_rgposix_charsets[i].szcharset ) )
  287. {
  288. tok = TOKEN(CHARSET_ALNUM + i);
  289. std::advance( icur, g_rgposix_charsets[i].cchars );
  290. }
  291. }
  292. break;
  293. }
  294. return tok;
  295. }
  296. static TOKEN subst_token( iterator & icur, const_iterator iend )
  297. {
  298. assert( (const_iterator)icur != iend );
  299. TOKEN tok = NO_TOKEN;
  300. switch( *icur )
  301. {
  302. case CH('\\'):
  303. tok = SUBST_ESCAPE;
  304. if( (const_iterator)++icur != iend )
  305. switch( *icur )
  306. {
  307. case CH('Q'):
  308. tok = SUBST_QUOTE_META_ON;
  309. ++icur;
  310. break;
  311. case CH('U'):
  312. tok = SUBST_UPPER_ON;
  313. ++icur;
  314. break;
  315. case CH('u'):
  316. tok = SUBST_UPPER_NEXT;
  317. ++icur;
  318. break;
  319. case CH('L'):
  320. tok = SUBST_LOWER_ON;
  321. ++icur;
  322. break;
  323. case CH('l'):
  324. tok = SUBST_LOWER_NEXT;
  325. ++icur;
  326. break;
  327. case CH('E'):
  328. tok = SUBST_ALL_OFF;
  329. ++icur;
  330. break;
  331. }
  332. break;
  333. case CH('$'):
  334. tok = SUBST_BACKREF;
  335. if( (const_iterator)++icur != iend )
  336. switch( *icur )
  337. {
  338. case CH('&'):
  339. tok = SUBST_MATCH;
  340. ++icur;
  341. break;
  342. case CH('`'):
  343. tok = SUBST_PREMATCH;
  344. ++icur;
  345. break;
  346. case CH('\''):
  347. tok = SUBST_POSTMATCH;
  348. ++icur;
  349. break;
  350. }
  351. break;
  352. }
  353. return tok;
  354. }
  355. static TOKEN ext_token( iterator & icur, const_iterator iend, unsigned & flags )
  356. {
  357. assert( (const_iterator)icur != iend );
  358. bool finclude;
  359. TOKEN tok = NO_TOKEN;
  360. if( CH('?') == *icur )
  361. {
  362. tok = EXT_UNKNOWN;
  363. if( (const_iterator)++icur != iend )
  364. {
  365. switch( *icur )
  366. {
  367. case CH(':'):
  368. tok = EXT_NOBACKREF;
  369. ++icur;
  370. break;
  371. case CH('='):
  372. tok = EXT_POS_LOOKAHEAD;
  373. ++icur;
  374. break;
  375. case CH('!'):
  376. tok = EXT_NEG_LOOKAHEAD;
  377. ++icur;
  378. break;
  379. case CH('<'):
  380. if( (const_iterator)++icur == iend )
  381. break;
  382. switch( *icur )
  383. {
  384. case CH('='):
  385. tok = EXT_POS_LOOKBEHIND;
  386. ++icur;
  387. break;
  388. case CH('!'):
  389. tok = EXT_NEG_LOOKBEHIND;
  390. ++icur;
  391. break;
  392. }
  393. break;
  394. case CH('>'):
  395. tok = EXT_INDEPENDENT;
  396. ++icur;
  397. break;
  398. default:
  399. finclude = true;
  400. do
  401. {
  402. if( CH(':') == *icur )
  403. {
  404. tok = EXT_NOBACKREF;
  405. ++icur;
  406. break;
  407. }
  408. if( CH(')') == *icur )
  409. {
  410. tok = EXT_NOBACKREF;
  411. break;
  412. }
  413. if( CH('-') == *icur && finclude )
  414. finclude = false;
  415. else if( CH('i') == *icur )
  416. flags = finclude ? ( flags | NOCASE ) : ( flags & ~NOCASE );
  417. else if( CH('m') == *icur )
  418. flags = finclude ? ( flags | MULTILINE ) : ( flags & ~MULTILINE );
  419. else if( CH('s') == *icur )
  420. flags = finclude ? ( flags | SINGLELINE ) : ( flags & ~SINGLELINE );
  421. else
  422. break;
  423. } while( (const_iterator)++icur != iend );
  424. break;
  425. }
  426. }
  427. }
  428. return tok;
  429. }
  430. };
  431. //
  432. // Implements the basic POSIX regular expression syntax
  433. //
  434. template< typename CH >
  435. class posix_syntax
  436. {
  437. public:
  438. typedef std::basic_string<CH>::iterator iterator;
  439. typedef std::basic_string<CH>::const_iterator const_iterator;
  440. typedef CH char_type;
  441. static TOKEN reg_token( iterator & icur, const_iterator iend )
  442. {
  443. TOKEN tok = NO_TOKEN;
  444. switch( *icur )
  445. {
  446. case '.':
  447. tok = MATCH_ANY;
  448. ++icur;
  449. break;
  450. case '^':
  451. tok = BEGIN_LINE;
  452. ++icur;
  453. break;
  454. case '$':
  455. tok = END_LINE;
  456. ++icur;
  457. break;
  458. case '[':
  459. tok = BEGIN_CHARSET;
  460. ++icur;
  461. break;
  462. case '\\':
  463. tok = ESCAPE;
  464. ++icur;
  465. if( (const_iterator)icur != iend )
  466. {
  467. switch( *icur )
  468. {
  469. case '(':
  470. tok = BEGIN_GROUP;
  471. ++icur;
  472. break;
  473. case ')':
  474. tok = END_GROUP;
  475. ++icur;
  476. break;
  477. case '|':
  478. tok = ALTERNATION;
  479. ++icur;
  480. break;
  481. }
  482. }
  483. break;
  484. }
  485. return tok;
  486. }
  487. static TOKEN quant_token( iterator & icur, const_iterator iend )
  488. {
  489. TOKEN tok = NO_TOKEN;
  490. switch( *icur )
  491. {
  492. case '*':
  493. tok = ZERO_OR_MORE;
  494. ++icur;
  495. break;
  496. case ',':
  497. tok = RANGE_SEPARATOR;
  498. ++icur;
  499. break;
  500. case '\\':
  501. ++icur;
  502. if( (const_iterator)icur != iend )
  503. {
  504. switch( *icur )
  505. {
  506. case '?':
  507. tok = ZERO_OR_ONE;
  508. ++icur;
  509. break;
  510. case '+':
  511. tok = ONE_OR_MORE;
  512. ++icur;
  513. break;
  514. case '{':
  515. tok = BEGIN_RANGE;
  516. ++icur;
  517. break;
  518. case '}':
  519. tok = END_RANGE;
  520. ++icur;
  521. break;
  522. default:
  523. --icur;
  524. break;
  525. }
  526. }
  527. else
  528. {
  529. --icur;
  530. }
  531. }
  532. return tok;
  533. }
  534. static TOKEN charset_token( iterator & icur, const_iterator iend )
  535. {
  536. TOKEN tok = NO_TOKEN;
  537. switch( *icur )
  538. {
  539. case '^':
  540. tok = CHARSET_NEGATE;
  541. ++icur;
  542. break;
  543. case '-':
  544. tok = CHARSET_RANGE;
  545. ++icur;
  546. break;
  547. case ']':
  548. tok = CHARSET_END;
  549. ++icur;
  550. break;
  551. case '[':
  552. for( size_t i=0; !tok && i < g_cposix_charsets; ++i )
  553. {
  554. if( is_posix_charset<const_iterator>( icur, iend, g_rgposix_charsets[i].szcharset ) )
  555. {
  556. tok = TOKEN(CHARSET_ALNUM + i);
  557. std::advance( icur, g_rgposix_charsets[i].cchars );
  558. }
  559. }
  560. break;
  561. }
  562. return tok;
  563. }
  564. static TOKEN subst_token( iterator & icur, const_iterator iend )
  565. {
  566. TOKEN tok = NO_TOKEN;
  567. if( '\\' == *icur )
  568. {
  569. tok = SUBST_ESCAPE;
  570. ++icur;
  571. if( (const_iterator)icur != iend && '0' <= *icur && '9' >= *icur )
  572. {
  573. tok = SUBST_BACKREF;
  574. }
  575. }
  576. return tok;
  577. }
  578. static TOKEN ext_token( iterator &, const_iterator, unsigned & )
  579. {
  580. return NO_TOKEN;
  581. }
  582. };
  583. } // namespace regex
  584. #pragma warning(pop)