Team Fortress 2 Source Code as on 22/4/2020
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1888 lines
59 KiB

  1. //========= Copyright Valve Corporation, All rights reserved. ============//
  2. //
  3. // Purpose:
  4. //
  5. // $NoKeywords: $
  6. //
  7. //=============================================================================//
  8. /*
  9. *
  10. * Copyright (c) 1998-9
  11. * Dr John Maddock
  12. *
  13. * Permission to use, copy, modify, distribute and sell this software
  14. * and its documentation for any purpose is hereby granted without fee,
  15. * provided that the above copyright notice appear in all copies and
  16. * that both that copyright notice and this permission notice appear
  17. * in supporting documentation. Dr John Maddock makes no representations
  18. * about the suitability of this software for any purpose.
  19. * It is provided "as is" without express or implied warranty.
  20. *
  21. */
  22. /*
  23. * FILE regcomp.h
  24. * VERSION 2.12
  25. * This is an internal header file, do not include directly
  26. */
  27. JM_NAMESPACE(__JM)
  28. template <class traits>
  29. struct kmp_translator
  30. {
  31. typedef typename traits::char_type char_type;
  32. bool icase;
  33. kmp_translator(bool c) : icase(c) {}
  34. char_type operator()(char_type c
  35. #ifdef RE_LOCALE_CPP
  36. , const __JM_STD::locale& l
  37. #endif
  38. )
  39. {
  40. return traits::translate(c, icase MAYBE_PASS_LOCALE(l));
  41. }
  42. };
  43. #if defined(JM_NO_TEMPLATE_SWITCH_MERGE) && !defined(JM_NO_NAMESPACES)
//
// Ugly, ugly hack:
// templates don't merge if they contain switch statements, so declare these
// templates in an unnamed namespace (i.e. with internal linkage); each translation
// unit then gets its own local copy. It works seamlessly but bloats the app.
  49. namespace{
  50. #endif
  51. template <class charT, class traits, class Allocator>
  52. inline bool RE_CALL reg_expression<charT, traits, Allocator>::can_start(charT c, const unsigned char* __map, unsigned char mask, const __wide_type&)
  53. {
  54. if((traits_size_type)(traits_uchar_type)c >= 256)
  55. return true;
  56. return JM_MAKE_BOOL(__map[(traits_uchar_type)c] & mask);
  57. }
  58. template <class charT, class traits, class Allocator>
  59. inline bool RE_CALL reg_expression<charT, traits, Allocator>::can_start(charT c, const unsigned char* __map, unsigned char mask, const __narrow_type&)
  60. {
  61. return JM_MAKE_BOOL(__map[(traits_uchar_type)c] & mask);
  62. }
  63. template <class charT, class traits, class Allocator>
  64. CONSTRUCTOR_INLINE reg_expression<charT, traits, Allocator>::reg_expression(const Allocator& a)
  65. : regbase(), data(a), pkmp(0)
  66. {
  67. }
  68. template <class charT, class traits, class Allocator>
  69. CONSTRUCTOR_INLINE reg_expression<charT, traits, Allocator>::reg_expression(const charT* p, jm_uintfast32_t f, const Allocator& a)
  70. : data(a), pkmp(0)
  71. {
  72. set_expression(p, f);
  73. }
  74. template <class charT, class traits, class Allocator>
  75. CONSTRUCTOR_INLINE reg_expression<charT, traits, Allocator>::reg_expression(const charT* p1, const charT* p2, jm_uintfast32_t f, const Allocator& a)
  76. : data(a), pkmp(0)
  77. {
  78. set_expression(p1, p2, f);
  79. }
  80. template <class charT, class traits, class Allocator>
  81. CONSTRUCTOR_INLINE reg_expression<charT, traits, Allocator>::reg_expression(const charT* p, size_type len, jm_uintfast32_t f, const Allocator& a)
  82. : data(a), pkmp(0)
  83. {
  84. set_expression(p, p + len, f);
  85. }
  86. template <class charT, class traits, class Allocator>
  87. reg_expression<charT, traits, Allocator>::reg_expression(const reg_expression<charT, traits, Allocator>& e)
  88. : regbase(e), data(e.allocator()), pkmp(0)
  89. {
  90. //
  91. // we do a deep copy only if e is a valid expression, otherwise fail.
  92. //
  93. //_flags = 0;
  94. //fail(e.error_code());
  95. if(error_code() == 0)
  96. set_expression(e.expression(), e.flags());
  97. }
  98. template <class charT, class traits, class Allocator>
  99. reg_expression<charT, traits, Allocator>::~reg_expression()
  100. {
  101. if(pkmp)
  102. kmp_free(pkmp, data.allocator());
  103. }
  104. template <class charT, class traits, class Allocator>
  105. reg_expression<charT, traits, Allocator>& RE_CALL reg_expression<charT, traits, Allocator>::operator=(const reg_expression<charT, traits, Allocator>& e)
  106. {
  107. //
  108. // we do a deep copy only if e is a valid expression, otherwise fail.
  109. //
  110. if(this == &e) return *this;
  111. _flags = 0;
  112. fail(e.error_code());
  113. if(error_code() == 0)
  114. set_expression(e.expression(), e.flags());
  115. return *this;
  116. }
  117. template <class charT, class traits, class Allocator>
  118. inline bool RE_CALL reg_expression<charT, traits, Allocator>::operator==(const reg_expression<charT, traits, Allocator>& e)
  119. {
  120. return (_flags == e.flags()) && (re_strcmp(expression(), e.expression()) == 0);
  121. }
  122. template <class charT, class traits, class Allocator>
  123. bool RE_CALL reg_expression<charT, traits, Allocator>::operator<(const reg_expression<charT, traits, Allocator>& e)
  124. {
  125. int i = re_strcmp(expression(), e.expression());
  126. if(i == 0)
  127. return _flags < e.flags();
  128. return i < 0;
  129. }
  130. template <class charT, class traits, class Allocator>
  131. Allocator RE_CALL reg_expression<charT, traits, Allocator>::allocator()const
  132. {
  133. return data.allocator();
  134. }
  135. template <class charT, class traits, class Allocator>
  136. unsigned int RE_CALL reg_expression<charT, traits, Allocator>::parse_inner_set(const charT*& first, const charT* last)
  137. {
  138. //
  139. // we have an inner [...] construct
  140. //
  141. jm_assert(traits_type::syntax_type((traits_size_type)(traits_uchar_type)*first MAYBE_PASS_LOCALE(locale_inst)) == syntax_open_set);
  142. const charT* base = first;
  143. while( (first != last)
  144. && (traits_type::syntax_type((traits_size_type)(traits_uchar_type)*first MAYBE_PASS_LOCALE(locale_inst)) != syntax_close_set) )
  145. ++first;
  146. if(first == last)
  147. return 0;
  148. ++first;
  149. if((first-base) < 5)
  150. return 0;
  151. if(*(base+1) != *(first-2))
  152. return 0;
  153. unsigned int result = traits_type::syntax_type((traits_size_type)(traits_uchar_type)*(base+1) MAYBE_PASS_LOCALE(locale_inst));
  154. if((result == syntax_colon) && ((first-base) == 5))
  155. {
  156. return traits_type::syntax_type((traits_size_type)(traits_uchar_type)*(base+2) MAYBE_PASS_LOCALE(locale_inst));
  157. }
  158. return ((result == syntax_colon) || (result == syntax_dot) || (result == syntax_equal)) ? result : 0;
  159. }
  160. template <class charT, class traits, class Allocator>
  161. bool RE_CALL reg_expression<charT, traits, Allocator>::skip_space(const charT*& first, const charT* last)
  162. {
  163. //
  164. // returns true if we get to last:
  165. //
  166. while((first != last) && (traits_type::is_class(*first, char_class_space MAYBE_PASS_LOCALE(locale_inst)) == true))
  167. {
  168. ++first;
  169. }
  170. return first == last;
  171. }
  172. template <class charT, class traits, class Allocator>
  173. void RE_CALL reg_expression<charT, traits, Allocator>::parse_range(const charT*& ptr, const charT* end, unsigned& min, unsigned& max)
  174. {
  175. //
  176. // we have {x} or {x,} or {x,y} NB no spaces inside braces
  177. // anything else is illegal
  178. // On input ptr points to "{"
  179. //
  180. ++ptr;
  181. if(skip_space(ptr, end))
  182. {
  183. fail(REG_EBRACE);
  184. return;
  185. }
  186. if(traits_type::syntax_type((traits_size_type)(traits_uchar_type)*ptr MAYBE_PASS_LOCALE(locale_inst)) != syntax_digit)
  187. {
  188. fail(REG_BADBR);
  189. return;
  190. }
  191. min = traits_type::toi(ptr, end, 10 MAYBE_PASS_LOCALE(locale_inst));
  192. if(skip_space(ptr, end))
  193. {
  194. fail(REG_EBRACE);
  195. return;
  196. }
  197. if(traits_type::syntax_type((traits_size_type)(traits_uchar_type)*ptr MAYBE_PASS_LOCALE(locale_inst)) == syntax_comma)
  198. {
  199. //we have a second interval:
  200. ++ptr;
  201. if(skip_space(ptr, end))
  202. {
  203. fail(REG_EBRACE);
  204. return;
  205. }
  206. if(traits_type::syntax_type((traits_size_type)(traits_uchar_type)*ptr MAYBE_PASS_LOCALE(locale_inst)) == syntax_digit)
  207. max = traits_type::toi(ptr, end, 10 MAYBE_PASS_LOCALE(locale_inst));
  208. else
  209. max = (unsigned)-1;
  210. }
  211. else
  212. max = min;
  213. // validate input:
  214. if(skip_space(ptr, end))
  215. {
  216. fail(REG_EBRACE);
  217. return;
  218. }
  219. if(max < min)
  220. {
  221. fail(REG_ERANGE);
  222. return;
  223. }
  224. if(_flags & bk_braces)
  225. {
  226. if(traits_type::syntax_type((traits_size_type)(traits_uchar_type)*ptr MAYBE_PASS_LOCALE(locale_inst)) != syntax_slash)
  227. {
  228. fail(REG_BADBR);
  229. return;
  230. }
  231. else
  232. {
  233. // back\ is OK now check the }
  234. ++ptr;
  235. if((ptr == end) || (traits_type::syntax_type((traits_size_type)(traits_uchar_type)*ptr MAYBE_PASS_LOCALE(locale_inst)) != syntax_close_brace))
  236. {
  237. fail(REG_BADBR);
  238. return;
  239. }
  240. }
  241. }
  242. else if(traits_type::syntax_type((traits_size_type)(traits_uchar_type)*ptr MAYBE_PASS_LOCALE(locale_inst)) != syntax_close_brace)
  243. {
  244. fail(REG_BADBR);
  245. return;
  246. }
  247. }
  248. template <class charT, class traits, class Allocator>
  249. charT RE_CALL reg_expression<charT, traits, Allocator>::parse_escape(const charT*& first, const charT* last)
  250. {
  251. charT c;
  252. switch(traits_type::syntax_type(*first MAYBE_PASS_LOCALE(locale_inst)))
  253. {
  254. case syntax_a:
  255. c = '\a';
  256. ++first;
  257. break;
  258. case syntax_f:
  259. c = '\f';
  260. ++first;
  261. break;
  262. case syntax_n:
  263. c = '\n';
  264. ++first;
  265. break;
  266. case syntax_r:
  267. c = '\r';
  268. ++first;
  269. break;
  270. case syntax_t:
  271. c = '\t';
  272. ++first;
  273. break;
  274. case syntax_v:
  275. c = '\v';
  276. ++first;
  277. break;
  278. case syntax_x:
  279. ++first;
  280. if(first == last)
  281. {
  282. fail(REG_EESCAPE);
  283. break;
  284. }
  285. // maybe have \x{ddd}
  286. if(traits_type::syntax_type(*first MAYBE_PASS_LOCALE(locale_inst)) == syntax_open_brace)
  287. {
  288. ++first;
  289. if(first == last)
  290. {
  291. fail(REG_EESCAPE);
  292. break;
  293. }
  294. if(traits_type::is_class(*first, char_class_xdigit MAYBE_PASS_LOCALE(locale_inst)) == false)
  295. {
  296. fail(REG_BADBR);
  297. break;
  298. }
  299. c = (charT)traits_type::toi(first, last, -16 MAYBE_PASS_LOCALE(locale_inst));
  300. if((first == last) || (traits_type::syntax_type(*first MAYBE_PASS_LOCALE(locale_inst)) != syntax_close_brace))
  301. {
  302. fail(REG_BADBR);
  303. }
  304. ++first;
  305. break;
  306. }
  307. else
  308. {
  309. if(traits_type::is_class(*first, char_class_xdigit MAYBE_PASS_LOCALE(locale_inst)) == false)
  310. {
  311. fail(REG_BADBR);
  312. break;
  313. }
  314. c = (charT)traits_type::toi(first, last, -16 MAYBE_PASS_LOCALE(locale_inst));
  315. }
  316. break;
  317. case syntax_c:
  318. ++first;
  319. if(first == last)
  320. {
  321. fail(REG_EESCAPE);
  322. break;
  323. }
  324. if(((traits_uchar_type)(*first) < (traits_uchar_type)'@')
  325. || ((traits_uchar_type)(*first) > (traits_uchar_type)127) )
  326. {
  327. fail(REG_EESCAPE);
  328. return (charT)0;
  329. }
  330. c = (charT)((traits_uchar_type)(*first) - (traits_uchar_type)'@');
  331. ++first;
  332. break;
  333. case syntax_e:
  334. c = (charT)27;
  335. ++first;
  336. break;
  337. case syntax_digit:
  338. c = (charT)traits_type::toi(first, last, -8 MAYBE_PASS_LOCALE(locale_inst));
  339. break;
  340. default:
  341. c = *first;
  342. ++first;
  343. }
  344. return c;
  345. }
  346. template <class charT, class traits, class Allocator>
  347. void RE_CALL reg_expression<charT, traits, Allocator>::compile_maps()
  348. {
  349. re_syntax_base* record = (re_syntax_base*)data.data();
  350. // always compile the first __map:
  351. memset(startmap, 0, 256);
  352. record->can_be_null = 0;
  353. compile_map(record, startmap, NULL, mask_all);
  354. while(record->type != syntax_element_match)
  355. {
  356. if((record->type == syntax_element_alt) || (record->type == syntax_element_rep))
  357. {
  358. memset(&(((re_jump*)record)->__map), 0, 256);
  359. record->can_be_null = 0;
  360. compile_map(record->next.p, ((re_jump*)record)->__map, &(record->can_be_null), mask_take, ((re_jump*)record)->alt.p);
  361. compile_map(((re_jump*)record)->alt.p, ((re_jump*)record)->__map, &(record->can_be_null), mask_skip);
  362. }
  363. else
  364. {
  365. record->can_be_null = 0;
  366. compile_map(record, NULL, &(record->can_be_null), mask_all);
  367. }
  368. record = record->next.p;
  369. }
  370. record->can_be_null = mask_all;
  371. }
  372. template <class charT, class traits_type, class Allocator>
  373. bool RE_CALL re_maybe_set_member(charT c,
  374. re_set_long* set,
  375. const reg_expression<charT, traits_type, Allocator>& e)
  376. {
  377. const charT* p = (const charT*)(set+1);
  378. bool icase = e.flags() & regbase::icase;
  379. charT col = traits_type::translate(c, icase MAYBE_PASS_LOCALE(e.locale()));
  380. for(unsigned int i = 0; i < set->csingles; ++i)
  381. {
  382. if(col == *p)
  383. return set->isnot ? false : true;
  384. while(*p)++p;
  385. ++p; // skip null
  386. }
  387. return set->isnot ? true : false;
  388. }
  389. template <class charT, class traits, class Allocator>
  390. bool RE_CALL reg_expression<charT, traits, Allocator>::probe_start(
  391. re_syntax_base* node, charT cc, re_syntax_base* terminal) const
  392. {
  393. unsigned int c;
  394. switch(node->type)
  395. {
  396. case syntax_element_startmark:
  397. case syntax_element_endmark:
  398. case syntax_element_start_line:
  399. case syntax_element_word_boundary:
  400. case syntax_element_buffer_start:
  401. case syntax_element_restart_continue:
  402. // doesn't tell us anything about the next character, so:
  403. return probe_start(node->next.p, cc, terminal);
  404. case syntax_element_literal:
  405. // only the first character of the literal can match:
  406. // note these have already been translated:
  407. if(*(charT*)(((re_literal*)node)+1) == traits_type::translate(cc, (_flags & regbase::icase) MAYBE_PASS_LOCALE(locale_inst)))
  408. return true;
  409. return false;
  410. case syntax_element_end_line:
  411. // next character (if there is one!) must be a newline:
  412. if(traits_type::is_separator(traits_type::translate(cc, (_flags & regbase::icase) MAYBE_PASS_LOCALE(locale_inst))))
  413. return true;
  414. return false;
  415. case syntax_element_wild:
  416. return true;
  417. case syntax_element_match:
  418. return true;
  419. case syntax_element_within_word:
  420. case syntax_element_word_start:
  421. return traits_type::is_class(traits_type::translate(cc, (_flags & regbase::icase) MAYBE_PASS_LOCALE(locale_inst)), char_class_word MAYBE_PASS_LOCALE(locale_inst));
  422. case syntax_element_word_end:
  423. // what follows must not be a word character,
  424. return traits_type::is_class(traits_type::translate(cc, (_flags & regbase::icase) MAYBE_PASS_LOCALE(locale_inst)), char_class_word MAYBE_PASS_LOCALE(locale_inst)) ? false : true;
  425. case syntax_element_buffer_end:
  426. // we can be null, nothing must follow,
  427. // NB we assume that this is followed by
  428. // syntax_element_match, if its not then we can
  429. // never match anything anyway!!
  430. return false;
  431. case syntax_element_soft_buffer_end:
  432. // we can be null, only newlines must follow,
  433. // NB we assume that this is followed by
  434. // syntax_element_match, if its not then we can
  435. // never match anything anyway!!
  436. return traits_type::is_separator(traits_type::translate(cc, (_flags & regbase::icase) MAYBE_PASS_LOCALE(locale_inst)));
  437. case syntax_element_backref:
  438. // there's no easy way to determine this
  439. // which is not to say it can't be done!
  440. // for now:
  441. return true;
  442. case syntax_element_long_set:
  443. // we can not be null,
  444. // we need to add already translated values in the set
  445. // to values in the __map
  446. return re_maybe_set_member(cc, (re_set_long*)node, *this) || re_is_set_member((const charT*)&cc, (const charT*)(&cc+1), (re_set_long*)node, *this) != &cc;
  447. case syntax_element_set:
  448. // set all the elements that are set in corresponding set:
  449. c = (traits_size_type)(traits_uchar_type)traits_type::translate(cc, (_flags & regbase::icase) MAYBE_PASS_LOCALE(locale_inst));
  450. return ((re_set*)node)->__map[c] != 0;
  451. case syntax_element_jump:
  452. if(((re_jump*)node)->alt.p < node)
  453. {
  454. // backwards jump,
  455. // caused only by end of repeat section, we'll treat this
  456. // the same as a match, because the sub-expression has matched.
  457. // this is only caused by NULL repeats as in "(a*)*" or "(\<)*"
  458. // these are really nonsensence and make the matching code much
  459. // harder, it would be nice to get rid of them altogether.
  460. if(node->next.p == terminal)
  461. return true;
  462. else
  463. return probe_start(((re_jump*)node)->alt.p, cc, terminal);
  464. }
  465. else
  466. // take the jump and compile:
  467. return probe_start(((re_jump*)node)->alt.p, cc, terminal);
  468. case syntax_element_alt:
  469. // we need to take the OR of the two alternatives:
  470. return probe_start(((re_jump*)node)->alt.p, cc, terminal) || probe_start(node->next.p, cc, terminal);
  471. case syntax_element_rep:
  472. // we need to take the OR of the two alternatives
  473. if(((re_repeat*)node)->min == 0)
  474. return probe_start(node->next.p, cc, ((re_jump*)node)->alt.p) || probe_start(((re_jump*)node)->alt.p, cc, terminal);
  475. else
  476. return probe_start(node->next.p, cc, ((re_jump*)node)->alt.p);
  477. case syntax_element_combining:
  478. return !traits_type::is_combining(traits_type::translate(cc, (_flags & regbase::icase) MAYBE_PASS_LOCALE(locale_inst)));
  479. }
  480. return false;
  481. }
  482. template <class charT, class traits, class Allocator>
  483. bool RE_CALL reg_expression<charT, traits, Allocator>::probe_start_null(re_syntax_base* node, re_syntax_base* terminal)const
  484. {
  485. switch(node->type)
  486. {
  487. case syntax_element_startmark:
  488. case syntax_element_endmark:
  489. case syntax_element_start_line:
  490. case syntax_element_word_boundary:
  491. case syntax_element_buffer_start:
  492. case syntax_element_restart_continue:
  493. case syntax_element_end_line:
  494. case syntax_element_word_end:
  495. // doesn't tell us anything about the next character, so:
  496. return probe_start_null(node->next.p, terminal);
  497. case syntax_element_match:
  498. case syntax_element_buffer_end:
  499. case syntax_element_soft_buffer_end:
  500. case syntax_element_backref:
  501. return true;
  502. case syntax_element_jump:
  503. if(((re_jump*)node)->alt.p < node)
  504. {
  505. // backwards jump,
  506. // caused only by end of repeat section, we'll treat this
  507. // the same as a match, because the sub-expression has matched.
  508. // this is only caused by NULL repeats as in "(a*)*" or "(\<)*"
  509. // these are really nonsensence and make the matching code much
  510. // harder, it would be nice to get rid of them altogether.
  511. if(node->next.p == terminal)
  512. return true;
  513. else
  514. return probe_start_null(((re_jump*)node)->alt.p, terminal);
  515. }
  516. else
  517. // take the jump and compile:
  518. return probe_start_null(((re_jump*)node)->alt.p, terminal);
  519. case syntax_element_alt:
  520. // we need to take the OR of the two alternatives:
  521. return probe_start_null(((re_jump*)node)->alt.p, terminal) || probe_start_null(node->next.p, terminal);
  522. case syntax_element_rep:
  523. // only need to consider skipping the repeat:
  524. return probe_start_null(((re_jump*)node)->alt.p, terminal);
  525. }
  526. return false;
  527. }
  528. template <class charT, class traits, class Allocator>
  529. void RE_CALL reg_expression<charT, traits, Allocator>::compile_map(
  530. re_syntax_base* node, unsigned char* __map,
  531. unsigned int* pnull, unsigned char mask, re_syntax_base* terminal)const
  532. {
  533. if(__map)
  534. {
  535. for(unsigned int i = 0; i < 256; ++i)
  536. {
  537. if(probe_start(node, (charT)i, terminal))
  538. __map[i] |= mask;
  539. }
  540. }
  541. if(pnull && probe_start_null(node, terminal))
  542. *pnull |= mask;
  543. }
  544. template <class charT, class traits, class Allocator>
  545. void RE_CALL reg_expression<charT, traits, Allocator>::move_offsets(re_syntax_base* j, unsigned size)
  546. {
  547. // move all offsets starting with j->link forward by size
  548. // called after an insert:
  549. j = (re_syntax_base*)((const char*)data.data() + j->next.i);
  550. while(true)
  551. {
  552. switch(j->type)
  553. {
  554. case syntax_element_rep:
  555. ((re_jump*)j)->alt.i += size;
  556. j->next.i += size;
  557. break;
  558. case syntax_element_jump:
  559. case syntax_element_alt:
  560. ((re_jump*)j)->alt.i += size;
  561. j->next.i += size;
  562. break;
  563. default:
  564. j->next.i += size;
  565. break;
  566. }
  567. if(j->next.i == size)
  568. break;
  569. j = (re_syntax_base*)((const char*)data.data() + j->next.i);
  570. }
  571. }
  572. template <class charT, class traits, class Allocator>
  573. re_syntax_base* RE_CALL reg_expression<charT, traits, Allocator>::compile_set_simple(re_syntax_base* dat, unsigned long cls, bool isnot)
  574. {
  575. jstack<re_str<charT>, Allocator> singles(64, data.allocator());
  576. jstack<re_str<charT>, Allocator> ranges(64, data.allocator());
  577. jstack<jm_uintfast32_t, Allocator> classes(64, data.allocator());
  578. jstack<re_str<charT>, Allocator> equivalents(64, data.allocator());
  579. classes.push(cls);
  580. if(dat)
  581. {
  582. data.align();
  583. dat->next.i = data.size();
  584. }
  585. return compile_set_aux(singles, ranges, classes, equivalents, isnot, is_byte<charT>::width_type());
  586. }
  587. template <class charT, class traits, class Allocator>
  588. re_syntax_base* RE_CALL reg_expression<charT, traits, Allocator>::compile_set(const charT*& first, const charT* last)
  589. {
  590. jstack<re_str<charT>, Allocator> singles(64, data.allocator());
  591. jstack<re_str<charT>, Allocator> ranges(64, data.allocator());
  592. jstack<jm_uintfast32_t, Allocator> classes(64, data.allocator());
  593. jstack<re_str<charT>, Allocator> equivalents(64, data.allocator());
  594. bool has_digraphs = false;
  595. jm_assert(traits_type::syntax_type((traits_size_type)(traits_uchar_type)*first MAYBE_PASS_LOCALE(locale_inst)) == syntax_open_set);
  596. ++first;
  597. bool started = false;
  598. bool done = false;
  599. bool isnot = false;
  600. enum last_type
  601. {
  602. last_single,
  603. last_none,
  604. last_dash
  605. };
  606. unsigned l = last_none;
  607. re_str<charT> s;
  608. while((first != last) && !done)
  609. {
  610. traits_size_type c = (traits_size_type)(traits_uchar_type)*first;
  611. switch(traits_type::syntax_type(c MAYBE_PASS_LOCALE(locale_inst)))
  612. {
  613. case syntax_caret:
  614. if(!started && !isnot)
  615. {
  616. isnot = true;
  617. }
  618. else
  619. {
  620. s = (charT)c;
  621. goto char_set_literal;
  622. }
  623. break;
  624. case syntax_open_set:
  625. {
  626. if((_flags & char_classes) == 0)
  627. {
  628. s = (charT)c;
  629. goto char_set_literal;
  630. }
  631. // check to see if we really have a class:
  632. const charT* base = first;
  633. switch(parse_inner_set(first, last))
  634. {
  635. case syntax_colon:
  636. {
  637. if(l == last_dash)
  638. {
  639. fail(REG_ERANGE);
  640. return NULL;
  641. }
  642. jm_uintfast32_t id = traits_type::lookup_classname(base+2, first-2 MAYBE_PASS_LOCALE(locale_inst));
  643. if(_flags & regbase::icase)
  644. {
  645. if((id == char_class_upper) || (id == char_class_lower))
  646. {
  647. id = char_class_alpha;
  648. }
  649. }
  650. if(id == 0)
  651. {
  652. fail(REG_ECTYPE);
  653. return NULL;
  654. }
  655. classes.push(id);
  656. started = true;
  657. l = last_none;
  658. }
  659. break;
  660. case syntax_dot:
  661. //
  662. // we have a collating element [.collating-name.]
  663. //
  664. if(traits_type::lookup_collatename(s, base+2, first-2 MAYBE_PASS_LOCALE(locale_inst)))
  665. {
  666. --first;
  667. if(s.size() > 1)
  668. has_digraphs = true;
  669. goto char_set_literal;
  670. }
  671. fail(REG_ECOLLATE);
  672. return NULL;
  673. case syntax_equal:
  674. //
  675. // we have an equivalence class [=collating-name=]
  676. //
  677. if(traits_type::lookup_collatename(s, base+2, first-2 MAYBE_PASS_LOCALE(locale_inst)))
  678. {
  679. unsigned i = 0;
  680. while(s[i])
  681. {
  682. s[i] = traits_type::translate(s[i], (_flags & regbase::icase) MAYBE_PASS_LOCALE(locale_inst));
  683. ++i;
  684. }
  685. re_str<charT> s2;
  686. traits_type::transform_primary(s2, s MAYBE_PASS_LOCALE(locale_inst));
  687. equivalents.push(s2);
  688. started = true;
  689. l = last_none;
  690. break;
  691. }
  692. fail(REG_ECOLLATE);
  693. return NULL;
  694. case syntax_left_word:
  695. if((started == false) && (traits_type::syntax_type((traits_size_type)(traits_uchar_type)*first MAYBE_PASS_LOCALE(locale_inst)) == syntax_close_set))
  696. {
  697. ++first;
  698. return add_simple(0, syntax_element_word_start);
  699. }
  700. fail(REG_EBRACK);
  701. return NULL;
  702. case syntax_right_word:
  703. if((started == false) && (traits_type::syntax_type((traits_size_type)(traits_uchar_type)*first MAYBE_PASS_LOCALE(locale_inst)) == syntax_close_set))
  704. {
  705. ++first;
  706. return add_simple(0, syntax_element_word_end);
  707. }
  708. fail(REG_EBRACK);
  709. return NULL;
  710. default:
  711. if(started == false)
  712. {
  713. unsigned int t = traits_type::syntax_type((traits_size_type)(traits_uchar_type)*(base+1) MAYBE_PASS_LOCALE(locale_inst));
  714. if((t != syntax_colon) && (t != syntax_dot) && (t != syntax_equal))
  715. {
  716. first = base;
  717. s = (charT)c;
  718. goto char_set_literal;
  719. }
  720. }
  721. fail(REG_EBRACK);
  722. return NULL;
  723. }
  724. if(first == last)
  725. {
  726. fail(REG_EBRACK);
  727. return NULL;
  728. }
  729. continue;
  730. }
  731. case syntax_close_set:
  732. if(started == false)
  733. {
  734. s = (charT)c;
  735. goto char_set_literal;
  736. }
  737. done = true;
  738. break;
  739. case syntax_dash:
  740. if(!started)
  741. {
  742. s = (charT)c;
  743. goto char_set_literal;
  744. }
  745. ++first;
  746. if(traits_type::syntax_type((traits_size_type)(traits_uchar_type)*first MAYBE_PASS_LOCALE(locale_inst)) == syntax_close_set)
  747. {
  748. --first;
  749. s = (charT)c;
  750. goto char_set_literal;
  751. }
  752. if((singles.empty() == true) || (l != last_single))
  753. {
  754. fail(REG_ERANGE);
  755. return NULL;
  756. }
  757. ranges.push(singles.peek());
  758. if(singles.peek().size() <= 1) // leave digraphs and ligatures in place
  759. singles.pop();
  760. l = last_dash;
  761. continue;
  762. case syntax_slash:
  763. if(_flags & regbase::escape_in_lists)
  764. {
  765. ++first;
  766. if(first == last)
  767. continue;
  768. switch(traits_type::syntax_type(*first MAYBE_PASS_LOCALE(locale_inst)))
  769. {
  770. case syntax_w:
  771. if(l == last_dash)
  772. {
  773. fail(REG_ERANGE);
  774. return NULL;
  775. }
  776. classes.push(char_class_word);
  777. started = true;
  778. l = last_none;
  779. ++first;
  780. continue;
  781. case syntax_d:
  782. if(l == last_dash)
  783. {
  784. fail(REG_ERANGE);
  785. return NULL;
  786. }
  787. classes.push(char_class_digit);
  788. started = true;
  789. l = last_none;
  790. ++first;
  791. continue;
  792. case syntax_s:
  793. if(l == last_dash)
  794. {
  795. fail(REG_ERANGE);
  796. return NULL;
  797. }
  798. classes.push(char_class_space);
  799. started = true;
  800. l = last_none;
  801. ++first;
  802. continue;
  803. case syntax_l:
  804. if(l == last_dash)
  805. {
  806. fail(REG_ERANGE);
  807. return NULL;
  808. }
  809. classes.push(char_class_lower);
  810. started = true;
  811. l = last_none;
  812. ++first;
  813. continue;
  814. case syntax_u:
  815. if(l == last_dash)
  816. {
  817. fail(REG_ERANGE);
  818. return NULL;
  819. }
  820. classes.push(char_class_upper);
  821. started = true;
  822. l = last_none;
  823. ++first;
  824. continue;
  825. case syntax_W:
  826. case syntax_D:
  827. case syntax_S:
  828. case syntax_U:
  829. case syntax_L:
  830. fail(REG_EESCAPE);
  831. return NULL;
  832. default:
  833. c = parse_escape(first, last);
  834. --first;
  835. s = (charT)c;
  836. goto char_set_literal;
  837. }
  838. }
  839. else
  840. {
  841. s = (charT)c;
  842. goto char_set_literal;
  843. }
  844. default:
  845. s = (charT)c;
  846. char_set_literal:
  847. unsigned i = 0;
  848. while(s[i])
  849. {
  850. s[i] = traits_type::translate(s[i], (_flags & regbase::icase) MAYBE_PASS_LOCALE(locale_inst));
  851. ++i;
  852. }
  853. started = true;
  854. if(l == last_dash)
  855. {
  856. ranges.push(s);
  857. l = last_none;
  858. if(s.size() > 1) // add ligatures to singles list as well
  859. singles.push(s);
  860. }
  861. else
  862. {
  863. singles.push(s);
  864. l = last_single;
  865. }
  866. }
  867. ++first;
  868. }
  869. if(!done)
  870. return NULL;
  871. re_syntax_base* result;
  872. if(has_digraphs)
  873. result = compile_set_aux(singles, ranges, classes, equivalents, isnot, __wide_type());
  874. else
  875. result = compile_set_aux(singles, ranges, classes, equivalents, isnot, is_byte<charT>::width_type());
  876. #ifdef __BORLANDC__
  877. // delayed throw:
  878. if((result == 0) && (_flags & regbase::use_except))
  879. fail(code);
  880. #endif
  881. return result;
  882. }
  883. template <class charT, class traits, class Allocator>
  884. re_syntax_base* RE_CALL reg_expression<charT, traits, Allocator>::compile_set_aux(jstack<re_str<charT>, Allocator>& singles, jstack<re_str<charT>, Allocator>& ranges, jstack<jm_uintfast32_t, Allocator>& classes, jstack<re_str<charT>, Allocator>& equivalents, bool isnot, const __wide_type&)
  885. {
  886. size_type base = data.size();
  887. data.extend(sizeof(re_set_long));
  888. unsigned int csingles = 0;
  889. unsigned int cranges = 0;
  890. jm_uintfast32_t cclasses = 0;
  891. unsigned int cequivalents = 0;
  892. bool nocollate_state = flags() & regbase::nocollate;
  893. while(singles.empty() == false)
  894. {
  895. ++csingles;
  896. const re_str<charT>& s = singles.peek();
  897. unsigned len = (re_strlen(s.c_str()) + 1) * sizeof(charT);
  898. memcpy((charT*)data.extend(len), s.c_str(), len);
  899. //*(charT*)data.extend(sizeof(charT)) = charT(singles.peek());
  900. singles.pop();
  901. }
  902. while(ranges.empty() == false)
  903. {
  904. re_str<charT> c1, c2;
  905. if(nocollate_state)
  906. c1 = ranges.peek();
  907. else
  908. traits_type::transform(c1, ranges.peek() MAYBE_PASS_LOCALE(locale_inst));
  909. ranges.pop();
  910. if(nocollate_state)
  911. c2 = ranges.peek();
  912. else
  913. traits_type::transform(c2, ranges.peek() MAYBE_PASS_LOCALE(locale_inst));
  914. ranges.pop();
  915. if(c1 < c2)
  916. {
  917. // for some reason bc5 crashes when throwing exceptions
  918. // from here - probably an EH-compiler bug, but hard to
  919. // be sure...
  920. // delay throw to later:
  921. #ifdef __BORLANDC__
  922. jm_uintfast32_t f = _flags;
  923. _flags &= ~regbase::use_except;
  924. #endif
  925. fail(REG_ERANGE);
  926. #ifdef __BORLANDC__
  927. _flags = f;
  928. #endif
  929. return NULL;
  930. }
  931. ++cranges;
  932. unsigned len = (re_strlen(c1.c_str()) + 1) * sizeof(charT);
  933. memcpy(data.extend(len), c1.c_str(), len);
  934. len = (re_strlen(c2.c_str()) + 1) * sizeof(charT);
  935. memcpy(data.extend(len), c2.c_str(), len);
  936. }
  937. while(classes.empty() == false)
  938. {
  939. cclasses |= classes.peek();
  940. classes.pop();
  941. }
  942. while(equivalents.empty() == false)
  943. {
  944. ++cequivalents;
  945. const re_str<charT>& s = equivalents.peek();
  946. unsigned len = (re_strlen(s.c_str()) + 1) * sizeof(charT);
  947. memcpy((charT*)data.extend(len), s.c_str(), len);
  948. equivalents.pop();
  949. }
  950. re_set_long* dat = (re_set_long*)((unsigned char*)data.data() + base);
  951. dat->type = syntax_element_long_set;
  952. dat->csingles = csingles;
  953. dat->cranges = cranges;
  954. dat->cclasses = cclasses;
  955. dat->cequivalents = cequivalents;
  956. dat->isnot = isnot;
  957. dat->next.i = -1;
  958. return dat;
  959. }
  960. template <class charT, class traits, class Allocator>
  961. re_syntax_base* RE_CALL reg_expression<charT, traits, Allocator>::compile_set_aux(jstack<re_str<charT>, Allocator>& singles, jstack<re_str<charT>, Allocator>& ranges, jstack<jm_uintfast32_t, Allocator>& classes, jstack<re_str<charT>, Allocator>& equivalents, bool isnot, const __narrow_type&)
  962. {
  963. re_set* dat = (re_set*)data.extend(sizeof(re_set));
  964. memset(dat, 0, sizeof(re_set));
  965. while(singles.empty() == false)
  966. {
  967. dat->__map[(traits_size_type)(traits_uchar_type)*(singles.peek().c_str())] = mask_all;
  968. singles.pop();
  969. }
  970. while(ranges.empty() == false)
  971. {
  972. re_str<charT> c1, c2, c3, c4;
  973. if(flags() & regbase::nocollate)
  974. c1 = ranges.peek();
  975. else
  976. traits_type::transform(c1, ranges.peek() MAYBE_PASS_LOCALE(locale_inst));
  977. ranges.pop();
  978. if(flags() & regbase::nocollate)
  979. c2 = ranges.peek();
  980. else
  981. traits_type::transform(c2, ranges.peek() MAYBE_PASS_LOCALE(locale_inst));
  982. ranges.pop();
  983. if(c1 < c2)
  984. {
  985. // for some reason bc5 crashes when throwing exceptions
  986. // from here - probably an EH-compiler bug, but hard to
  987. // be sure...
  988. // delay throw to later:
  989. #ifdef __BORLANDC__
  990. jm_uintfast32_t f = _flags;
  991. _flags &= ~regbase::use_except;
  992. #endif
  993. fail(REG_ERANGE);
  994. #ifdef __BORLANDC__
  995. _flags = f;
  996. #endif
  997. return NULL;
  998. }
  999. for(unsigned int i = 0; i < 256; ++i)
  1000. {
  1001. c4 = (charT)i;
  1002. if(flags() & regbase::nocollate)
  1003. c3 = c4;
  1004. else
  1005. traits_type::transform(c3, c4 MAYBE_PASS_LOCALE(locale_inst));
  1006. if((c3 <= c1) && (c3 >= c2))
  1007. dat->__map[i] = mask_all;
  1008. }
  1009. }
  1010. while(equivalents.empty() == false)
  1011. {
  1012. re_str<charT> c1, c2;
  1013. for(unsigned int i = 0; i < 256; ++i)
  1014. {
  1015. c2 = (charT)i;
  1016. traits_type::transform_primary(c1, c2 MAYBE_PASS_LOCALE(locale_inst));
  1017. if(c1 == equivalents.peek())
  1018. dat->__map[i] = mask_all;
  1019. }
  1020. equivalents.pop();
  1021. }
  1022. jm_uintfast32_t flags = 0;
  1023. while(classes.empty() == false)
  1024. {
  1025. flags |= classes.peek();
  1026. classes.pop();
  1027. }
  1028. if(flags)
  1029. {
  1030. for(unsigned int i = 0; i < 256; ++i)
  1031. {
  1032. if(traits_type::is_class(charT(i), flags MAYBE_PASS_LOCALE(locale_inst)))
  1033. dat->__map[(traits_uchar_type)traits_type::translate((charT)i, (_flags & regbase::icase) MAYBE_PASS_LOCALE(locale_inst))] = mask_all;
  1034. }
  1035. }
  1036. if(isnot)
  1037. {
  1038. for(unsigned int i = 0; i < 256; ++i)
  1039. {
  1040. dat->__map[i] = !dat->__map[i];
  1041. }
  1042. }
  1043. dat->type = syntax_element_set;
  1044. dat->next.i = -1;
  1045. return dat;
  1046. }
  1047. template <class charT, class traits, class Allocator>
  1048. void RE_CALL reg_expression<charT, traits, Allocator>::fixup_apply(re_syntax_base* b, unsigned cbraces)
  1049. {
  1050. typedef JM_MAYBE_TYPENAME REBIND_TYPE(bool, Allocator) b_alloc;
  1051. register unsigned char* base = (unsigned char*)b;
  1052. register re_syntax_base* ptr = b;
  1053. bool* pb = 0;
  1054. b_alloc a(data.allocator());
  1055. #ifndef JM_NO_EXCEPTIONS
  1056. try
  1057. {
  1058. #endif
  1059. pb = a.allocate(cbraces);
  1060. for(unsigned i = 0; i < cbraces; ++i)
  1061. pb[i] = false;
  1062. repeats = 0;
  1063. while(ptr->next.i)
  1064. {
  1065. switch(ptr->type)
  1066. {
  1067. case syntax_element_rep:
  1068. ((re_jump*)ptr)->alt.p = (re_syntax_base*)(base + ((re_jump*)ptr)->alt.i);
  1069. ((re_repeat*)ptr)->id = repeats;
  1070. ++repeats;
  1071. goto rebase;
  1072. case syntax_element_jump:
  1073. case syntax_element_alt:
  1074. ((re_jump*)ptr)->alt.p = (re_syntax_base*)(base + ((re_jump*)ptr)->alt.i);
  1075. goto rebase;
  1076. case syntax_element_backref:
  1077. if((((re_brace*)ptr)->index >= cbraces) || (pb[((re_brace*)ptr)->index] == false) )
  1078. {
  1079. fail(REG_ESUBREG);
  1080. a.deallocate(pb, cbraces);
  1081. return;
  1082. }
  1083. goto rebase;
  1084. case syntax_element_endmark:
  1085. pb[((re_brace*)ptr)->index] = true;
  1086. goto rebase;
  1087. default:
  1088. rebase:
  1089. ptr->next.p = (re_syntax_base*)(base + ptr->next.i);
  1090. ptr = ptr->next.p;
  1091. }
  1092. }
  1093. a.deallocate(pb, cbraces);
  1094. pb = 0;
  1095. #ifndef JM_NO_EXCEPTIONS
  1096. }
  1097. catch(...)
  1098. {
  1099. if(pb)
  1100. a.deallocate(pb, cbraces);
  1101. throw;
  1102. }
  1103. #endif
  1104. }
  1105. template <class charT, class traits, class Allocator>
  1106. unsigned int RE_CALL reg_expression<charT, traits, Allocator>::set_expression(const charT* p, const charT* end, jm_uintfast32_t f)
  1107. {
  1108. if(p == expression())
  1109. {
  1110. re_str<charT> s(p, end);
  1111. return set_expression(s.c_str(), f);
  1112. }
  1113. #if defined(RE_LOCALE_C) || defined(RE_LOCALE_W32)
  1114. locale_initialiser.update();
  1115. #else
  1116. if(JM_HAS_FACET(locale_inst, regfacet<charT>) == false)
  1117. {
  1118. #ifdef _MSC_VER
  1119. locale_inst = __JM_STD::_ADDFAC(locale_inst, new regfacet<charT>());
  1120. #else
  1121. locale_inst = __JM_STD::locale(locale_inst, new regfacet<charT>());
  1122. #endif
  1123. }
  1124. JM_USE_FACET(locale_inst, regfacet<charT>).update(locale_inst);
  1125. #endif
  1126. const charT* base = p;
  1127. data.clear();
  1128. _flags = f;
  1129. fail(REG_NOERROR); // clear any error
  1130. if(p >= end)
  1131. {
  1132. fail(REG_EMPTY);
  1133. return code;
  1134. }
  1135. const charT* ptr = p;
  1136. marks = 0;
  1137. jstack<unsigned int, Allocator> mark(64, data.allocator());
  1138. jstack<unsigned int, Allocator> markid(64, data.allocator());
  1139. unsigned int last_mark_popped = 0;
  1140. register traits_size_type c;
  1141. register re_syntax_base* dat;
  1142. unsigned rep_min, rep_max;
  1143. //
  1144. // set up header:
  1145. //
  1146. ++marks;
  1147. dat = 0;
  1148. if(_flags & regbase::literal)
  1149. {
  1150. while(ptr != end)
  1151. {
  1152. dat = add_literal(dat, traits::translate(*ptr, (_flags & regbase::icase) MAYBE_PASS_LOCALE(locale_inst)));
  1153. ++ptr;
  1154. }
  1155. }
  1156. while (ptr < end)
  1157. {
  1158. c = (traits_size_type)(traits_uchar_type)*ptr;
  1159. switch(traits_type::syntax_type(c MAYBE_PASS_LOCALE(locale_inst)))
  1160. {
  1161. case syntax_open_bracket:
  1162. if(_flags & bk_parens)
  1163. {
  1164. dat = add_literal(dat, (charT)c);
  1165. ++ptr;
  1166. continue;
  1167. }
  1168. open_bracked_jump:
  1169. // extend:
  1170. dat = add_simple(dat, syntax_element_startmark, sizeof(re_brace));
  1171. markid.push(marks);
  1172. ((re_brace*)dat)->index = marks++;
  1173. mark.push(data.index(dat));
  1174. ++ptr;
  1175. break;
  1176. case syntax_close_bracket:
  1177. if(_flags & bk_parens)
  1178. {
  1179. dat = add_literal(dat, (charT)c);
  1180. ++ptr;
  1181. continue;
  1182. }
  1183. close_bracked_jump:
  1184. if(dat)
  1185. {
  1186. data.align();
  1187. dat->next.i = data.size();
  1188. }
  1189. if(mark.empty())
  1190. {
  1191. fail(REG_EPAREN);
  1192. return code;
  1193. }
  1194. // see if we have an empty alternative:
  1195. if(mark.peek() == data.index(dat) )
  1196. {
  1197. re_syntax_base* para = (re_syntax_base*)((char*)data.data() + mark.peek());
  1198. if(para->type == syntax_element_jump)
  1199. {
  1200. fail(REG_EMPTY);
  1201. return code;
  1202. }
  1203. }
  1204. // pop any pushed alternatives and set the target end destination:
  1205. dat = (re_syntax_base*)((unsigned char*)data.data() + mark.peek());
  1206. while(dat->type == syntax_element_jump)
  1207. {
  1208. ((re_jump*)dat)->alt.i = data.size();
  1209. mark.pop();
  1210. dat = (re_jump*)((unsigned char*)data.data() + mark.peek());
  1211. if(mark.empty())
  1212. {
  1213. fail(REG_EPAREN);
  1214. return code;
  1215. }
  1216. }
  1217. dat = add_simple(0, syntax_element_endmark, sizeof(re_brace));
  1218. ((re_brace*)dat)->index = markid.peek();
  1219. markid.pop();
  1220. last_mark_popped = mark.peek();
  1221. mark.pop();
  1222. ++ptr;
  1223. break;
  1224. case syntax_char:
  1225. dat = add_literal(dat, (charT)c);
  1226. ++ptr;
  1227. break;
  1228. case syntax_slash:
  1229. if(++ptr == end)
  1230. {
  1231. fail(REG_EESCAPE);
  1232. return code;
  1233. }
  1234. c = (traits_size_type)(traits_uchar_type)*ptr;
  1235. switch(traits_type::syntax_type(c MAYBE_PASS_LOCALE(locale_inst)))
  1236. {
  1237. case syntax_open_bracket:
  1238. if(_flags & bk_parens)
  1239. goto open_bracked_jump;
  1240. break;
  1241. case syntax_close_bracket:
  1242. if(_flags & bk_parens)
  1243. goto close_bracked_jump;
  1244. break;
  1245. case syntax_plus:
  1246. if((_flags & bk_plus_qm) && ((_flags & limited_ops) == 0))
  1247. {
  1248. rep_min = 1;
  1249. rep_max = (unsigned)-1;
  1250. goto repeat_jump;
  1251. }
  1252. break;
  1253. case syntax_question:
  1254. if((_flags & bk_plus_qm) && ((_flags & limited_ops) == 0))
  1255. {
  1256. rep_min = 0;
  1257. rep_max = 1;
  1258. goto repeat_jump;
  1259. }
  1260. break;
  1261. case syntax_or:
  1262. if(((_flags & bk_vbar) == 0) || (_flags & limited_ops))
  1263. break;
  1264. goto alt_string_jump;
  1265. case syntax_open_brace:
  1266. if( ((_flags & bk_braces) == 0) || ((_flags & intervals) == 0))
  1267. break;
  1268. // we have {x} or {x,} or {x,y}:
  1269. parse_range(ptr, end, rep_min, rep_max);
  1270. goto repeat_jump;
  1271. case syntax_digit:
  1272. if(_flags & bk_refs)
  1273. {
  1274. // update previous:
  1275. int i = traits_type::toi((charT)c MAYBE_PASS_LOCALE(locale_inst));
  1276. if(i == 0)
  1277. {
  1278. // we can have \025 which means take char whose
  1279. // code is 25 (octal), so parse string:
  1280. c = traits_type::toi(ptr, end, -8 MAYBE_PASS_LOCALE(locale_inst));
  1281. --ptr;
  1282. break;
  1283. }
  1284. dat = add_simple(dat, syntax_element_backref, sizeof(re_brace));
  1285. ((re_brace*)dat)->index = i;
  1286. ++ptr;
  1287. continue;
  1288. }
  1289. break;
  1290. case syntax_b: // syntax_element_word_boundary
  1291. dat = add_simple(dat, syntax_element_word_boundary);
  1292. ++ptr;
  1293. continue;
  1294. case syntax_B:
  1295. dat = add_simple(dat, syntax_element_within_word);
  1296. ++ptr;
  1297. continue;
  1298. case syntax_left_word:
  1299. dat = add_simple(dat, syntax_element_word_start);
  1300. ++ptr;
  1301. continue;
  1302. case syntax_right_word:
  1303. dat = add_simple(dat, syntax_element_word_end);
  1304. ++ptr;
  1305. continue;
  1306. case syntax_w: //syntax_element_word_char
  1307. dat = compile_set_simple(dat, char_class_word);
  1308. ++ptr;
  1309. continue;
  1310. case syntax_W:
  1311. dat = compile_set_simple(dat, char_class_word, true);
  1312. ++ptr;
  1313. continue;
  1314. case syntax_d: //syntax_element_word_char
  1315. dat = compile_set_simple(dat, char_class_digit);
  1316. ++ptr;
  1317. continue;
  1318. case syntax_D:
  1319. dat = compile_set_simple(dat, char_class_digit, true);
  1320. ++ptr;
  1321. continue;
  1322. case syntax_s: //syntax_element_word_char
  1323. dat = compile_set_simple(dat, char_class_space);
  1324. ++ptr;
  1325. continue;
  1326. case syntax_S:
  1327. dat = compile_set_simple(dat, char_class_space, true);
  1328. ++ptr;
  1329. continue;
  1330. case syntax_l: //syntax_element_word_char
  1331. dat = compile_set_simple(dat, char_class_lower);
  1332. ++ptr;
  1333. continue;
  1334. case syntax_L:
  1335. dat = compile_set_simple(dat, char_class_lower, true);
  1336. ++ptr;
  1337. continue;
  1338. case syntax_u: //syntax_element_word_char
  1339. dat = compile_set_simple(dat, char_class_upper);
  1340. ++ptr;
  1341. continue;
  1342. case syntax_U:
  1343. dat = compile_set_simple(dat, char_class_upper, true);
  1344. ++ptr;
  1345. continue;
  1346. case syntax_Q:
  1347. ++ptr;
  1348. while(true)
  1349. {
  1350. if(ptr == end)
  1351. {
  1352. fail(REG_EESCAPE);
  1353. return code;
  1354. }
  1355. if(traits_type::syntax_type((traits_size_type)(traits_uchar_type)*ptr MAYBE_PASS_LOCALE(locale_inst)) == syntax_slash)
  1356. {
  1357. ++ptr;
  1358. if((ptr != end) && (traits_type::syntax_type((traits_size_type)(traits_uchar_type)*ptr MAYBE_PASS_LOCALE(locale_inst)) == syntax_E))
  1359. break;
  1360. else
  1361. {
  1362. dat = add_literal(dat, *(ptr-1));
  1363. continue;
  1364. }
  1365. }
  1366. dat = add_literal(dat, *ptr);
  1367. ++ptr;
  1368. }
  1369. ++ptr;
  1370. continue;
  1371. case syntax_C:
  1372. dat = add_simple(dat, syntax_element_wild);
  1373. ++ptr;
  1374. continue;
  1375. case syntax_X:
  1376. dat = add_simple(dat, syntax_element_combining);
  1377. ++ptr;
  1378. continue;
  1379. case syntax_Z:
  1380. dat = add_simple(dat, syntax_element_soft_buffer_end);
  1381. ++ptr;
  1382. continue;
  1383. case syntax_G:
  1384. dat = add_simple(dat, syntax_element_restart_continue);
  1385. ++ptr;
  1386. continue;
  1387. case syntax_start_buffer:
  1388. dat = add_simple(dat, syntax_element_buffer_start);
  1389. ++ptr;
  1390. continue;
  1391. case syntax_end_buffer:
  1392. dat = add_simple(dat, syntax_element_buffer_end);
  1393. ++ptr;
  1394. continue;
  1395. default:
  1396. c = (traits_size_type)(traits_uchar_type)parse_escape(ptr, end);
  1397. dat = add_literal(dat, (charT)c);
  1398. continue;
  1399. }
  1400. dat = add_literal(dat, (charT)c);
  1401. ++ptr;
  1402. break;
  1403. case syntax_dollar:
  1404. dat = add_simple(dat, syntax_element_end_line, sizeof(re_syntax_base));
  1405. ++ptr;
  1406. continue;
  1407. case syntax_caret:
  1408. dat = add_simple(dat, syntax_element_start_line, sizeof(re_syntax_base));
  1409. ++ptr;
  1410. continue;
  1411. case syntax_dot:
  1412. dat = add_simple(dat, syntax_element_wild, sizeof(re_syntax_base));
  1413. ++ptr;
  1414. continue;
  1415. case syntax_star:
  1416. rep_min = 0;
  1417. rep_max = (unsigned)-1;
  1418. repeat_jump:
  1419. {
  1420. unsigned offset;
  1421. if(dat == 0)
  1422. {
  1423. fail(REG_BADRPT);
  1424. return code;
  1425. }
  1426. switch(dat->type)
  1427. {
  1428. case syntax_element_endmark:
  1429. offset = last_mark_popped;
  1430. break;
  1431. case syntax_element_literal:
  1432. if(((re_literal*)dat)->length > 1)
  1433. {
  1434. // update previous:
  1435. charT lit = *(charT*)((char*)dat + sizeof(re_literal) + ((((re_literal*)dat)->length-1)*sizeof(charT)));
  1436. --((re_literal*)dat)->length;
  1437. dat = add_simple(dat, syntax_element_literal, sizeof(re_literal) + sizeof(charT));
  1438. ((re_literal*)dat)->length = 1;
  1439. *((charT*)(((re_literal*)dat)+1)) = lit;
  1440. }
  1441. offset = (char*)dat - (char*)data.data();
  1442. break;
  1443. case syntax_element_backref:
  1444. case syntax_element_long_set:
  1445. case syntax_element_set:
  1446. case syntax_element_wild:
  1447. case syntax_element_combining:
  1448. // we're repeating a single item:
  1449. offset = (char*)dat - (char*)data.data();
  1450. break;
  1451. default:
  1452. fail(REG_BADRPT);
  1453. return code;
  1454. }
  1455. data.align();
  1456. dat->next.i = data.size();
  1457. //unsigned pos = (char*)dat - (char*)data.data();
  1458. // add the trailing jump:
  1459. add_simple(dat, syntax_element_jump, re_jump_size);
  1460. // now insert the leading repeater:
  1461. dat = (re_syntax_base*)data.insert(offset, re_repeater_size);
  1462. dat->next.i = ((char*)dat - (char*)data.data()) + re_repeater_size;
  1463. dat->type = syntax_element_rep;
  1464. ((re_repeat*)dat)->alt.i = data.size();
  1465. ((re_repeat*)dat)->min = rep_min;
  1466. ((re_repeat*)dat)->max = rep_max;
  1467. ((re_repeat*)dat)->leading = false;
  1468. move_offsets(dat, re_repeater_size);
  1469. dat = (re_syntax_base*)((char*)data.data() + data.size() - re_jump_size);
  1470. ((re_repeat*)dat)->alt.i = offset;
  1471. ++ptr;
  1472. continue;
  1473. }
  1474. case syntax_plus:
  1475. if(_flags & (bk_plus_qm | limited_ops))
  1476. {
  1477. dat = add_literal(dat, (charT)c);
  1478. ++ptr;
  1479. continue;
  1480. }
  1481. rep_min = 1;
  1482. rep_max = (unsigned)-1;
  1483. goto repeat_jump;
  1484. case syntax_question:
  1485. if(_flags & (bk_plus_qm | limited_ops))
  1486. {
  1487. dat = add_literal(dat, (charT)c);
  1488. ++ptr;
  1489. continue;
  1490. }
  1491. rep_min = 0;
  1492. rep_max = 1;
  1493. goto repeat_jump;
  1494. case syntax_open_set:
  1495. // update previous:
  1496. if(dat)
  1497. {
  1498. data.align();
  1499. dat->next.i = data.size();
  1500. }
  1501. // extend:
  1502. dat = compile_set(ptr, end);
  1503. if(dat == 0)
  1504. {
  1505. if((_flags & regbase::failbit) == 0)
  1506. fail(REG_EBRACK);
  1507. return code;
  1508. }
  1509. break;
  1510. case syntax_or:
  1511. {
  1512. if(_flags & (bk_vbar | limited_ops))
  1513. {
  1514. dat = add_literal(dat, (charT)c);
  1515. ++ptr;
  1516. continue;
  1517. }
  1518. alt_string_jump:
  1519. // update previous:
  1520. if(dat == 0)
  1521. {
  1522. // start of pattern can't have empty "|"
  1523. fail(REG_EMPTY);
  1524. return code;
  1525. }
  1526. // see if we have an empty alternative:
  1527. if(mark.empty() == false)
  1528. if(mark.peek() == data.index(dat))
  1529. {
  1530. fail(REG_EMPTY);
  1531. return code;
  1532. }
  1533. // extend:
  1534. /*dat = */add_simple(dat, syntax_element_jump, re_jump_size);
  1535. data.align();
  1536. // now work out where to insert:
  1537. unsigned int offset = 0;
  1538. if(mark.empty() == false)
  1539. {
  1540. // we have a '(' or '|' to go back to:
  1541. offset = mark.peek();
  1542. re_syntax_base* base = (re_syntax_base*)((unsigned char*)data.data() + offset);
  1543. offset = base->next.i;
  1544. }
  1545. re_jump* j = (re_jump*)data.insert(offset, re_jump_size);
  1546. j->type = syntax_element_alt;
  1547. j->next.i = offset + re_jump_size;
  1548. j->alt.i = data.size();
  1549. move_offsets(j, re_jump_size);
  1550. dat = (re_syntax_base*)((unsigned char*)data.data() + data.size() - re_jump_size);
  1551. mark.push(data.size() - re_jump_size);
  1552. ++ptr;
  1553. break;
  1554. }
  1555. case syntax_open_brace:
  1556. if((_flags & bk_braces) || ((_flags & intervals) == 0))
  1557. {
  1558. dat = add_literal(dat, (charT)c);
  1559. ++ptr;
  1560. continue;
  1561. }
  1562. // we have {x} or {x,} or {x,y}:
  1563. parse_range(ptr, end, rep_min, rep_max);
  1564. goto repeat_jump;
  1565. case syntax_newline:
  1566. if(_flags & newline_alt)
  1567. goto alt_string_jump;
  1568. dat = add_literal(dat, (charT)c);
  1569. ++ptr;
  1570. continue;
  1571. case syntax_close_brace:
  1572. if(_flags & bk_braces)
  1573. {
  1574. dat = add_literal(dat, (charT)c);
  1575. ++ptr;
  1576. continue;
  1577. }
  1578. fail(REG_BADPAT);
  1579. return code;
  1580. default:
  1581. dat = add_literal(dat, (charT)c);
  1582. ++ptr;
  1583. break;
  1584. } // switch
  1585. } // while
  1586. //
  1587. // update previous:
  1588. if(dat)
  1589. {
  1590. data.align();
  1591. dat->next.i = data.size();
  1592. }
  1593. // see if we have an empty alternative:
  1594. if(mark.empty() == false)
  1595. if(mark.peek() == data.index(dat) )
  1596. {
  1597. re_syntax_base* para = (re_syntax_base*)((char*)data.data() + mark.peek());
  1598. if(para->type == syntax_element_jump)
  1599. {
  1600. fail(REG_EMPTY);
  1601. return code;
  1602. }
  1603. }
  1604. //
  1605. // set up tail:
  1606. //
  1607. if(mark.empty() == false)
  1608. {
  1609. // pop any pushed alternatives and set the target end destination:
  1610. dat = (re_syntax_base*)((unsigned char*)data.data() + mark.peek());
  1611. while(dat->type == syntax_element_jump)
  1612. {
  1613. ((re_jump*)dat)->alt.i = data.size();
  1614. mark.pop();
  1615. if(mark.empty() == true)
  1616. break;
  1617. dat = (re_jump*)((unsigned char*)data.data() + mark.peek());
  1618. }
  1619. }
  1620. dat = (re_brace*)data.extend(sizeof(re_syntax_base));
  1621. dat->type = syntax_element_match;
  1622. dat->next.i = 0;
  1623. if(mark.empty() == false)
  1624. {
  1625. fail(REG_EPAREN);
  1626. return code;
  1627. }
  1628. //
  1629. // allocate space for start __map:
  1630. startmap = (unsigned char*)data.extend(256 + ((end - base + 1) * sizeof(charT)));
  1631. //
  1632. // and copy the expression we just compiled:
  1633. _expression = (charT*)((const char*)startmap + 256);
  1634. memcpy(_expression, base, (end - base) * sizeof(charT));
  1635. *(_expression + (end - base)) = charT(0);
  1636. //
  1637. // now we need to apply fixups to the array
  1638. // so that we can use pointers and not indexes
  1639. fixup_apply((re_syntax_base*)data.data(), marks);
  1640. // check for error during fixup:
  1641. if(_flags & regbase::failbit)
  1642. return code;
  1643. //
  1644. // finally compile the maps so that we can make intelligent choices
  1645. // whenever we encounter an alternative:
  1646. compile_maps();
  1647. if(pkmp)
  1648. {
  1649. kmp_free(pkmp, data.allocator());
  1650. pkmp = 0;
  1651. }
  1652. re_syntax_base* sbase = (re_syntax_base*)data.data();
  1653. _restart_type = probe_restart(sbase);
  1654. _leading_len = fixup_leading_rep(sbase, 0);
  1655. if((sbase->type == syntax_element_literal) && (sbase->next.p->type == syntax_element_match))
  1656. {
  1657. _restart_type = restart_fixed_lit;
  1658. if(0 == pkmp)
  1659. {
  1660. charT* p1 = (charT*)((char*)sbase + sizeof(re_literal));
  1661. charT* p2 = p1 + ((re_literal*)sbase)->length;
  1662. pkmp = kmp_compile(p1, p2, charT(), kmp_translator<traits>(_flags&regbase::icase), data.allocator() MAYBE_PASS_LOCALE(locale_inst));
  1663. }
  1664. }
  1665. return code;
  1666. }
  1667. template <class charT, class traits, class Allocator>
  1668. re_syntax_base* RE_CALL reg_expression<charT, traits, Allocator>::add_simple(re_syntax_base* dat, syntax_element_type type, unsigned int size)
  1669. {
  1670. if(dat)
  1671. {
  1672. data.align();
  1673. dat->next.i = data.size();
  1674. }
  1675. if(size < sizeof(re_syntax_base))
  1676. size = sizeof(re_syntax_base);
  1677. dat = (re_syntax_base*)data.extend(size);
  1678. dat->type = type;
  1679. dat->next.i = 0;
  1680. return dat;
  1681. }
  1682. template <class charT, class traits, class Allocator>
  1683. re_syntax_base* RE_CALL reg_expression<charT, traits, Allocator>::add_literal(re_syntax_base* dat, charT c)
  1684. {
  1685. if(dat && (dat->type == syntax_element_literal))
  1686. {
  1687. // add another charT to the list:
  1688. __JM_STDC::ptrdiff_t pos = (unsigned char*)dat - (unsigned char*)data.data();
  1689. *(charT*)data.extend(sizeof(charT)) = traits::translate(c, (_flags & regbase::icase) MAYBE_PASS_LOCALE(locale_inst));
  1690. dat = (re_syntax_base*)((unsigned char*)data.data() + pos);
  1691. ++(((re_literal*)dat)->length);
  1692. }
  1693. else
  1694. {
  1695. // extend:
  1696. dat = add_simple(dat, syntax_element_literal, sizeof(re_literal) + sizeof(charT));
  1697. ((re_literal*)dat)->length = 1;
  1698. *((charT*)(((re_literal*)dat)+1)) = traits::translate(c, (_flags & regbase::icase) MAYBE_PASS_LOCALE(locale_inst));
  1699. }
  1700. return dat;
  1701. }
  1702. template <class charT, class traits, class Allocator>
  1703. unsigned int RE_CALL reg_expression<charT, traits, Allocator>::probe_restart(re_syntax_base* dat)
  1704. {
  1705. switch(dat->type)
  1706. {
  1707. case syntax_element_startmark:
  1708. case syntax_element_endmark:
  1709. return probe_restart(dat->next.p);
  1710. case syntax_element_start_line:
  1711. return regbase::restart_line;
  1712. case syntax_element_word_start:
  1713. return regbase::restart_word;
  1714. case syntax_element_buffer_start:
  1715. return regbase::restart_buf;
  1716. case syntax_element_restart_continue:
  1717. return regbase::restart_continue;
  1718. default:
  1719. return regbase::restart_any;
  1720. }
  1721. }
  1722. template <class charT, class traits, class Allocator>
  1723. unsigned int RE_CALL reg_expression<charT, traits, Allocator>::fixup_leading_rep(re_syntax_base* dat, re_syntax_base* end)
  1724. {
  1725. unsigned int len = 0;
  1726. bool leading_lit = end ? false : true;
  1727. while(dat != end)
  1728. {
  1729. switch(dat->type)
  1730. {
  1731. case syntax_element_literal:
  1732. len += ((re_literal*)dat)->length;
  1733. if((leading_lit) && (((re_literal*)dat)->length > 2))
  1734. {
  1735. // we can do a literal search for the leading literal string
  1736. // using Knuth-Morris-Pratt (or whatever), and only then check for
  1737. // matches. We need a decent length string though to make it
  1738. // worth while.
  1739. _leading_string = (charT*)((char*)dat + sizeof(re_literal));
  1740. _leading_string_len = ((re_literal*)dat)->length;
  1741. _restart_type = restart_lit;
  1742. leading_lit = false;
  1743. const charT* p1 = _leading_string;
  1744. const charT* p2 = _leading_string + _leading_string_len;
  1745. pkmp = kmp_compile(p1, p2, charT(), kmp_translator<traits>(_flags&regbase::icase), data.allocator() MAYBE_PASS_LOCALE(locale_inst));
  1746. }
  1747. break;
  1748. case syntax_element_wild:
  1749. ++len;
  1750. leading_lit = false;
  1751. break;
  1752. case syntax_element_match:
  1753. return len;
  1754. case syntax_element_backref:
  1755. //case syntax_element_jump:
  1756. case syntax_element_alt:
  1757. case syntax_element_combining:
  1758. return 0;
  1759. case syntax_element_long_set:
  1760. {
  1761. // we need to verify that there are no multi-character
  1762. // collating elements inside the repeat:
  1763. const charT* p = (const charT*)((const char*)dat + sizeof(re_set_long));
  1764. unsigned int csingles = ((re_set_long*)dat)->csingles;
  1765. for(unsigned int i = 0; i < csingles; ++i)
  1766. {
  1767. if(re_strlen(p) > 1)
  1768. return 0;
  1769. while(*p)++p;
  1770. ++p;
  1771. }
  1772. ++len;
  1773. leading_lit = false;
  1774. break;
  1775. }
  1776. case syntax_element_set:
  1777. ++len;
  1778. leading_lit = false;
  1779. break;
  1780. case syntax_element_rep:
  1781. if(1 == fixup_leading_rep(dat->next.p, ((re_repeat*)dat)->alt.p) )
  1782. {
  1783. ((re_repeat*)dat)->leading = true;
  1784. return len;
  1785. }
  1786. return 0;
  1787. }
  1788. dat = dat->next.p;
  1789. }
  1790. return len;
  1791. }
  1792. #if defined(JM_NO_TEMPLATE_SWITCH_MERGE) && !defined(JM_NO_NAMESPACES)
  1793. } // namespace
  1794. #endif
  1795. JM_END_NAMESPACE