Source code of Windows XP (NT5)
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

608 lines
16 KiB

  1. //+---------------------------------------------------------------------------
  2. //
  3. // File: syntax.h
  4. //
  5. // Contents: syntax modules for regexpr
  6. //
  7. // Classes: perl_syntax, posix_syntax
  8. //
  9. // History: 3-29-00 ericne Created
  10. //
  11. //----------------------------------------------------------------------------
  12. #pragma once
  #include <cassert>
  #include <cwchar>
  #include <iterator>
  #include <string>
  16. #ifndef ARRAYSIZE
  17. #define ARRAYSIZE(x) (sizeof(x)/sizeof(*(x)))
  18. #endif
  19. #ifndef UCHAR_MAX
  20. #define UCHAR_MAX 0xff
  21. #endif
  22. #ifndef WCHAR_MAX
  23. #define WCHAR_MAX ((wchar_t)-1)
  24. #endif
  //
  // iterator_traits for `const char *`.
  //
  // A conforming standard library already provides iterator_traits for every
  // `const T *`, and programs are not allowed to explicitly specialize std
  // templates for built-in types, so this definition is only compiled for old
  // Microsoft toolchains.
  // NOTE(review): presumably this was a workaround for compilers without
  // partial template specialization; confirm the _MSC_VER cutoff against the
  // toolchains that still build this header.
  //
  // pointer/reference are const-qualified so they agree with the iterator
  // type being described (the previous typedefs silently dropped the const).
  //
  #if defined(_MSC_VER) && _MSC_VER < 1310
  template<>
  struct std::iterator_traits< const char * >
  { // get traits from iterator _Iter
      typedef random_access_iterator_tag iterator_category;
      typedef char value_type;
      typedef ptrdiff_t difference_type;
      typedef difference_type distance_type; // retained
      typedef const char * pointer;
      typedef const char & reference;
  };
  #endif
  //
  // iterator_traits for `const wchar_t *`.
  //
  // A conforming standard library already provides iterator_traits for every
  // `const T *`, and programs are not allowed to explicitly specialize std
  // templates for built-in types, so this definition is only compiled for old
  // Microsoft toolchains.
  // NOTE(review): presumably this was a workaround for compilers without
  // partial template specialization; confirm the _MSC_VER cutoff against the
  // toolchains that still build this header.
  //
  // pointer/reference are const-qualified so they agree with the iterator
  // type being described (the previous typedefs silently dropped the const).
  //
  #if defined(_MSC_VER) && _MSC_VER < 1310
  template<>
  struct std::iterator_traits< const wchar_t * >
  { // get traits from iterator _Iter
      typedef random_access_iterator_tag iterator_category;
      typedef wchar_t value_type;
      typedef ptrdiff_t difference_type;
      typedef difference_type distance_type; // retained
      typedef const wchar_t * pointer;
      typedef const wchar_t & reference;
  };
  #endif
  45. namespace regex
  46. {
  47. //
  48. // The following are the tokens that can be emitted by the syntax module.
  49. // Don't reorder this list!!!
  50. //
  enum TOKEN
  {
      // Numbering is implicit and starts at zero, so the position of every
      // enumerator in this list is significant. In particular the syntax
      // classes compute POSIX character-class tokens as CHARSET_ALNUM + index,
      // which requires CHARSET_ALNUM..CHARSET_XDIGIT to stay contiguous.
      NO_TOKEN = 0,

      // Regular tokens
      BEGIN_GROUP, END_GROUP, ALTERNATION,
      BEGIN_LINE, END_LINE,
      BEGIN_CHARSET, MATCH_ANY, ESCAPE,

      // Quantification tokens (the *_MIN forms are the variants followed by '?')
      ONE_OR_MORE, ZERO_OR_MORE, ZERO_OR_ONE,
      ONE_OR_MORE_MIN, ZERO_OR_MORE_MIN, ZERO_OR_ONE_MIN,
      BEGIN_RANGE, RANGE_SEPARATOR, END_RANGE, END_RANGE_MIN,

      // Escape sequences
      ESC_DIGIT, ESC_NOT_DIGIT,
      ESC_SPACE, ESC_NOT_SPACE,
      ESC_WORD, ESC_NOT_WORD,
      ESC_BEGIN_STRING, ESC_END_STRING, ESC_END_STRING_z,
      ESC_WORD_BOUNDARY, ESC_NOT_WORD_BOUNDARY,
      ESC_WORD_START, ESC_WORD_STOP,
      ESC_QUOTE_META_ON, ESC_QUOTE_META_OFF,

      // Substitution tokens
      SUBST_BACKREF, SUBST_PREMATCH, SUBST_POSTMATCH, SUBST_MATCH,
      SUBST_ESCAPE, SUBST_QUOTE_META_ON,
      SUBST_UPPER_ON, SUBST_UPPER_NEXT,
      SUBST_LOWER_ON, SUBST_LOWER_NEXT,
      SUBST_ALL_OFF,

      // Charset tokens
      CHARSET_NEGATE, CHARSET_ESCAPE, CHARSET_RANGE,
      CHARSET_BACKSPACE, CHARSET_END,

      // Charset tokens for the POSIX classes (contiguous, see note above)
      CHARSET_ALNUM, CHARSET_ALPHA, CHARSET_BLANK, CHARSET_CNTRL,
      CHARSET_DIGIT, CHARSET_GRAPH, CHARSET_LOWER, CHARSET_PRINT,
      CHARSET_PUNCT, CHARSET_SPACE, CHARSET_UPPER, CHARSET_XDIGIT,

      // Extension tokens
      EXT_NOBACKREF,
      EXT_POS_LOOKAHEAD, EXT_NEG_LOOKAHEAD,
      EXT_POS_LOOKBEHIND, EXT_NEG_LOOKBEHIND,
      EXT_INDEPENDENT,
      EXT_UNKNOWN
  };
  //
  // One entry of the POSIX character-class table: the text that is matched
  // against the pattern (szcharset) and the number of characters the pattern
  // iterator is advanced by once that text has matched (cchars).
  // Both members are const, so an entry is immutable after construction.
  //
  struct posix_charset_type
  {
      const char * const szcharset;
      const size_t cchars;

      posix_charset_type( const char * const sz, const size_t c )
          : szcharset( sz )
          , cchars( c )
      {
      }
  };

  // The table itself and its element count are defined in another translation unit.
  extern const posix_charset_type g_rgposix_charsets[];
  extern const size_t g_cposix_charsets;
  //
  // Returns true when the range [icur, iend) begins with the NUL-terminated
  // string szcharset. An empty szcharset always matches; a range that ends
  // before szcharset is exhausted never does.
  //
  template< typename const_iterator >
  bool is_posix_charset( const_iterator icur, const_iterator iend, const char * szcharset )
  {
      while( '\0' != *szcharset )
      {
          // Running out of input or hitting a different character both mean
          // the range does not start with szcharset.
          if( icur == iend || *icur != *szcharset )
              return false;

          ++icur;
          ++szcharset;
      }
      return true;
  }
  148. //
  149. // The perl_syntax class encapsulates the Perl 5 regular expression syntax. It is
  150. // used as a template parameter to basic_rpattern. To customize regex syntax, create
  151. // your own syntax class and use it as a template parameter instead.
  152. //
  153. class perl_syntax_base
  154. {
  155. protected:
  156. static TOKEN s_rgreg[ UCHAR_MAX + 1 ];
  157. static TOKEN s_rgescape[ UCHAR_MAX + 1 ];
  158. struct init_perl_syntax;
  159. friend struct init_perl_syntax;
  160. static struct init_perl_syntax
  161. {
  162. init_perl_syntax();
  163. } s_init_perl_syntax;
  164. static inline TOKEN look_up( char ch, TOKEN rg[] ) { return rg[ (unsigned char)ch ]; }
  165. static inline TOKEN look_up( wchar_t ch, TOKEN rg[] ) { return UCHAR_MAX < ch ? NO_TOKEN : rg[ (unsigned char)ch ]; }
  166. };
  167. template< typename CH >
  168. class perl_syntax : protected perl_syntax_base
  169. {
  170. public:
  171. typedef std::basic_string<CH>::iterator iterator;
  172. typedef std::basic_string<CH>::const_iterator const_iterator;
  173. typedef CH char_type;
  174. private:
  175. static bool min_quant( iterator & icur, const_iterator iend )
  176. {
  177. return ( (const_iterator)++icur != iend && CH('?') == *icur ? (++icur,true) : false );
  178. }
  179. public:
  180. static TOKEN reg_token( iterator & icur, const_iterator iend )
  181. {
  182. assert( (const_iterator)icur != iend );
  183. TOKEN tok = look_up( *icur, s_rgreg );
  184. if( tok )
  185. ++icur;
  186. if( ESCAPE == tok && (const_iterator)icur != iend )
  187. {
  188. tok = look_up( *icur, s_rgescape );
  189. if( tok )
  190. ++icur;
  191. else
  192. tok = ESCAPE;
  193. }
  194. return tok;
  195. }
  196. static TOKEN quant_token( iterator & icur, const_iterator iend )
  197. {
  198. assert( (const_iterator)icur != iend );
  199. TOKEN tok = NO_TOKEN;
  200. switch( *icur )
  201. {
  202. case CH('*'):
  203. tok = min_quant( icur, iend ) ? ZERO_OR_MORE_MIN : ZERO_OR_MORE;
  204. break;
  205. case CH('+'):
  206. tok = min_quant( icur, iend ) ? ONE_OR_MORE_MIN : ONE_OR_MORE;
  207. break;
  208. case CH('?'):
  209. tok = min_quant( icur, iend ) ? ZERO_OR_ONE_MIN : ZERO_OR_ONE;
  210. break;
  211. case CH('}'):
  212. tok = min_quant( icur, iend ) ? END_RANGE_MIN : END_RANGE;
  213. break;
  214. case CH('{'):
  215. tok = BEGIN_RANGE;
  216. ++icur;
  217. break;
  218. case CH(','):
  219. tok = RANGE_SEPARATOR;
  220. ++icur;
  221. break;
  222. }
  223. return tok;
  224. }
  225. static TOKEN charset_token( iterator & icur, const_iterator iend )
  226. {
  227. assert( (const_iterator)icur != iend );
  228. TOKEN tok = NO_TOKEN;
  229. switch( *icur )
  230. {
  231. case CH('-'):
  232. tok = CHARSET_RANGE;
  233. ++icur;
  234. break;
  235. case CH('^'):
  236. tok = CHARSET_NEGATE;
  237. ++icur;
  238. break;
  239. case CH(']'):
  240. tok = CHARSET_END;
  241. ++icur;
  242. break;
  243. case CH('\\'):
  244. tok = CHARSET_ESCAPE;
  245. if( (const_iterator)++icur == iend )
  246. break;
  247. switch( *icur )
  248. {
  249. case CH('b'):
  250. tok = CHARSET_BACKSPACE;
  251. ++icur;
  252. break;
  253. case CH('d'):
  254. tok = ESC_DIGIT;
  255. ++icur;
  256. break;
  257. case CH('D'):
  258. tok = ESC_NOT_DIGIT;
  259. ++icur;
  260. break;
  261. case CH('s'):
  262. tok = ESC_SPACE;
  263. ++icur;
  264. break;
  265. case CH('S'):
  266. tok = ESC_NOT_SPACE;
  267. ++icur;
  268. break;
  269. case CH('w'):
  270. tok = ESC_WORD;
  271. ++icur;
  272. break;
  273. case CH('W'):
  274. tok = ESC_NOT_WORD;
  275. ++icur;
  276. break;
  277. }
  278. break;
  279. case CH('['):
  280. for( size_t i=0; !tok && i < g_cposix_charsets; ++i )
  281. {
  282. if( is_posix_charset<const_iterator>( icur, iend, g_rgposix_charsets[i].szcharset ) )
  283. {
  284. tok = TOKEN(CHARSET_ALNUM + i);
  285. std::advance( icur, g_rgposix_charsets[i].cchars );
  286. }
  287. }
  288. break;
  289. }
  290. return tok;
  291. }
  292. static TOKEN subst_token( iterator & icur, const_iterator iend )
  293. {
  294. assert( (const_iterator)icur != iend );
  295. TOKEN tok = NO_TOKEN;
  296. switch( *icur )
  297. {
  298. case CH('\\'):
  299. tok = SUBST_ESCAPE;
  300. if( (const_iterator)++icur != iend )
  301. switch( *icur )
  302. {
  303. case CH('Q'):
  304. tok = SUBST_QUOTE_META_ON;
  305. ++icur;
  306. break;
  307. case CH('U'):
  308. tok = SUBST_UPPER_ON;
  309. ++icur;
  310. break;
  311. case CH('u'):
  312. tok = SUBST_UPPER_NEXT;
  313. ++icur;
  314. break;
  315. case CH('L'):
  316. tok = SUBST_LOWER_ON;
  317. ++icur;
  318. break;
  319. case CH('l'):
  320. tok = SUBST_LOWER_NEXT;
  321. ++icur;
  322. break;
  323. case CH('E'):
  324. tok = SUBST_ALL_OFF;
  325. ++icur;
  326. break;
  327. }
  328. break;
  329. case CH('$'):
  330. tok = SUBST_BACKREF;
  331. if( (const_iterator)++icur != iend )
  332. switch( *icur )
  333. {
  334. case CH('&'):
  335. tok = SUBST_MATCH;
  336. ++icur;
  337. break;
  338. case CH('`'):
  339. tok = SUBST_PREMATCH;
  340. ++icur;
  341. break;
  342. case CH('\''):
  343. tok = SUBST_POSTMATCH;
  344. ++icur;
  345. break;
  346. }
  347. break;
  348. }
  349. return tok;
  350. }
  351. static TOKEN ext_token( iterator & icur, const_iterator iend, unsigned & flags )
  352. {
  353. assert( (const_iterator)icur != iend );
  354. bool finclude;
  355. TOKEN tok = NO_TOKEN;
  356. if( CH('?') == *icur )
  357. {
  358. tok = EXT_UNKNOWN;
  359. if( (const_iterator)++icur != iend )
  360. {
  361. switch( *icur )
  362. {
  363. case CH(':'):
  364. tok = EXT_NOBACKREF;
  365. ++icur;
  366. break;
  367. case CH('='):
  368. tok = EXT_POS_LOOKAHEAD;
  369. ++icur;
  370. break;
  371. case CH('!'):
  372. tok = EXT_NEG_LOOKAHEAD;
  373. ++icur;
  374. break;
  375. case CH('<'):
  376. if( (const_iterator)++icur == iend )
  377. break;
  378. switch( *icur )
  379. {
  380. case CH('='):
  381. tok = EXT_POS_LOOKBEHIND;
  382. ++icur;
  383. break;
  384. case CH('!'):
  385. tok = EXT_NEG_LOOKBEHIND;
  386. ++icur;
  387. break;
  388. }
  389. break;
  390. case CH('>'):
  391. tok = EXT_INDEPENDENT;
  392. ++icur;
  393. break;
  394. default:
  395. finclude = true;
  396. do
  397. {
  398. if( CH(':') == *icur )
  399. {
  400. tok = EXT_NOBACKREF;
  401. ++icur;
  402. break;
  403. }
  404. if( CH(')') == *icur )
  405. {
  406. tok = EXT_NOBACKREF;
  407. break;
  408. }
  409. if( CH('-') == *icur && finclude )
  410. finclude = false;
  411. else if( CH('i') == *icur )
  412. flags = finclude ? ( flags | NOCASE ) : ( flags & ~NOCASE );
  413. else if( CH('m') == *icur )
  414. flags = finclude ? ( flags | MULTILINE ) : ( flags & ~MULTILINE );
  415. else if( CH('s') == *icur )
  416. flags = finclude ? ( flags | SINGLELINE ) : ( flags & ~SINGLELINE );
  417. else
  418. break;
  419. } while( (const_iterator)++icur != iend );
  420. break;
  421. }
  422. }
  423. }
  424. return tok;
  425. }
  426. };
  427. //
  428. // Implements the basic POSIX regular expression syntax
  429. //
  430. template< typename CH >
  431. class posix_syntax
  432. {
  433. public:
  434. typedef std::basic_string<CH>::iterator iterator;
  435. typedef std::basic_string<CH>::const_iterator const_iterator;
  436. typedef CH char_type;
  437. static TOKEN reg_token( iterator & icur, const_iterator iend )
  438. {
  439. TOKEN tok = NO_TOKEN;
  440. switch( *icur )
  441. {
  442. case '.':
  443. tok = MATCH_ANY;
  444. ++icur;
  445. break;
  446. case '^':
  447. tok = BEGIN_LINE;
  448. ++icur;
  449. break;
  450. case '$':
  451. tok = END_LINE;
  452. ++icur;
  453. break;
  454. case '[':
  455. tok = BEGIN_CHARSET;
  456. ++icur;
  457. break;
  458. case '\\':
  459. tok = ESCAPE;
  460. ++icur;
  461. if( (const_iterator)icur != iend )
  462. {
  463. switch( *icur )
  464. {
  465. case '(':
  466. tok = BEGIN_GROUP;
  467. ++icur;
  468. break;
  469. case ')':
  470. tok = END_GROUP;
  471. ++icur;
  472. break;
  473. case '|':
  474. tok = ALTERNATION;
  475. ++icur;
  476. break;
  477. }
  478. }
  479. break;
  480. }
  481. return tok;
  482. }
  483. static TOKEN quant_token( iterator & icur, const_iterator iend )
  484. {
  485. TOKEN tok = NO_TOKEN;
  486. switch( *icur )
  487. {
  488. case '*':
  489. tok = ZERO_OR_MORE;
  490. ++icur;
  491. break;
  492. case ',':
  493. tok = RANGE_SEPARATOR;
  494. ++icur;
  495. break;
  496. case '\\':
  497. ++icur;
  498. if( (const_iterator)icur != iend )
  499. {
  500. switch( *icur )
  501. {
  502. case '?':
  503. tok = ZERO_OR_ONE;
  504. ++icur;
  505. break;
  506. case '+':
  507. tok = ONE_OR_MORE;
  508. ++icur;
  509. break;
  510. case '{':
  511. tok = BEGIN_RANGE;
  512. ++icur;
  513. break;
  514. case '}':
  515. tok = END_RANGE;
  516. ++icur;
  517. break;
  518. default:
  519. --icur;
  520. break;
  521. }
  522. }
  523. else
  524. {
  525. --icur;
  526. }
  527. }
  528. return tok;
  529. }
  530. static TOKEN charset_token( iterator & icur, const_iterator iend )
  531. {
  532. TOKEN tok = NO_TOKEN;
  533. switch( *icur )
  534. {
  535. case '^':
  536. tok = CHARSET_NEGATE;
  537. ++icur;
  538. break;
  539. case '-':
  540. tok = CHARSET_RANGE;
  541. ++icur;
  542. break;
  543. case ']':
  544. tok = CHARSET_END;
  545. ++icur;
  546. break;
  547. case '[':
  548. for( size_t i=0; !tok && i < g_cposix_charsets; ++i )
  549. {
  550. if( is_posix_charset<const_iterator>( icur, iend, g_rgposix_charsets[i].szcharset ) )
  551. {
  552. tok = TOKEN(CHARSET_ALNUM + i);
  553. std::advance( icur, g_rgposix_charsets[i].cchars );
  554. }
  555. }
  556. break;
  557. }
  558. return tok;
  559. }
  560. static TOKEN subst_token( iterator & icur, const_iterator iend )
  561. {
  562. TOKEN tok = NO_TOKEN;
  563. if( '\\' == *icur )
  564. {
  565. tok = SUBST_ESCAPE;
  566. ++icur;
  567. if( (const_iterator)icur != iend && '0' <= *icur && '9' >= *icur )
  568. {
  569. tok = SUBST_BACKREF;
  570. }
  571. }
  572. return tok;
  573. }
  574. static TOKEN ext_token( iterator & icur, const_iterator iend, unsigned & flags )
  575. {
  576. return NO_TOKEN;
  577. }
  578. };
  579. } // namespace regex