Leaked source code of windows server 2003
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1237 lines
32 KiB

  1. /* reparse.c - parse a regular expression
  2. *
  3. * cl /c /Zep /AM /NT RE /Gs /G2 /Oa /D LINT_ARGS /Fc reparse.c
  4. *
  5. * Modifications:
  6. *
  7. * 22-Jul-1986 mz Hookable allocator (allow Z to create enough free space)
  8. * 19-Nov-1986 mz Add RETranslateLength for Z to determine overflows
  9. * 18-Aug-1987 mz Add field width and justification in translations
  10. * 01-Mar-1988 mz Add in UNIX-like syntax
  11. * 14-Jun-1988 mz Fix file parts allowing backslashes
  12. * 04-Dec-1989 bp Let :p accept uppercase drive names
  13. * 20-Dec-1989 ln capture trailing periods in :p
  14. * 23-Jan-1990 ln Handle escaped characters & invalid trailing \ in
  15. * RETranslate.
  16. *
  17. * 28-Jul-1990 davegi Changed Fill to memset (OS/2 2.0)
  18. * Changed Move to memmove (OS/2 2.0)
  19. * 19-Oct-1990 w-barry changed cArg to unsigned int from int.
  20. */
  21. #include <ctype.h>
  22. #include <stdio.h>
  23. #include <malloc.h>
  24. #include <stdlib.h>
  25. #include <string.h>
  26. #include <windows.h>
  27. #include <tools.h>
  28. #include <remi.h>
  29. #include "re.h"
  /* DEBOUT - debug trace output, compiled in only when DEBUG is non-zero.
   *
   * x  a fully parenthesized printf argument list, as in
   *    DEBOUT (("n=%d\n", n));
   *
   * The debug expansion prints and then flushes stdout. It is wrapped in
   * do { } while (0) so that the two statements act as a single statement
   * and the macro can safely be the body of an unbraced if/else.
   */
  #if DEBUG
  #define DEBOUT(x) do { printf x; fflush (stdout); } while (0)
  #else
  #define DEBOUT(x)
  #endif
  35. /* regular expression compiler. A regular expression is compiled into pseudo-
  36. * machine code. The principle is portable to other machines and is outlined
  37. * below. We parse by recursive descent.
  38. *
  39. * The pseudo-code is fairly close to normal assembler and can be easily
  40. * converted to be real machine code and has been done for the 80*86
  41. * processor family.
  42. *
  43. * The basic regular expressions handled are:
  44. *
  45. * letter matches a single letter
  46. * [class] matches a single character in the class
  47. * [~class] matches a single character not in the class
  48. * ^ matches the beginning of the line
  49. * $ matches the end of the line
  50. * ? matches any character (except previous two)
  51. * \x literal x
  52. * \n matches the previously tagged/matched expression (n digit)
  53. *
  * Regular expressions are now built from the above via:
  55. *
  56. * x* matches 0 or more x, matching minimal number
  57. * x+ matches 1 or more x, matching minimal number
  58. * x@ matches 0 or more x, matching maximal number
  59. * x# matches 1 or more x, matching maximal number
  60. * (x1!x2!...) matches x1 or x2 or ...
  * ~x matches 0 characters but prevents x from occurring
  62. * {x} identifies an argument
  63. *
  64. * The final expression that is matched by the compiler is:
  65. *
  66. * xy matches x then y
  67. *
  68. *
  69. * The actual grammar used is: Parsing action:
  70. *
  71. * TOP -> re PROLOG .re. EPILOG
  72. *
  73. *
  74. * re -> { re } re | LEFTARG .re. RIGHTARG
  75. * e re |
  76. * empty
  77. *
  78. * e -> se * | SMSTAR .se. SMSTAR1
  79. * se + |
  80. * se @ | STAR .se. STAR1
  81. * se # |
  82. * se
  83. *
  84. * se -> ( alt ) |
  85. * [ ccl ] |
  86. * ? | ANY
  87. * ^ | BOL
  88. * $ | EOL
  89. * ~ se | NOTSIGN .se. NOTSIGN1
  90. * :x |
  91. * \n | PREV
  92. * letter LETTER x
  93. *
  94. * alt -> re ! alt | LEFTOR .re. ORSIGN
  95. * re LEFTOR .re. ORSIGN RIGHTOR
  96. *
  97. * ccl -> ~ cset | CCLBEG NOTSIGN .cset. CCLEND
  98. * cset CCLBEG NULL .cset. CCLEND
  99. *
  100. * cset -> item cset |
  101. * item
  102. *
  103. * item -> letter - letter | RANGE x y
  104. * letter RANGE x x
  105. *
  106. * Abbreviations are introduced by :.
  107. *
  108. * :a [a-zA-Z0-9] alphanumeric
  109. * :b ([<space><tab>]#) whitespace
  110. * :c [a-zA-Z] alphabetic
  111. * :d [0-9] digit
  * :f ([~/\\ \"\[\]\:<|>+=;,.]#!..!.) file part
  113. * :h ([0-9a-fA-F]#) hex number
  114. * :i ([a-zA-Z_$][a-zA-Z0-9_$]@) identifier
  115. * :n ([0-9]#.[0-9]@![0-9]@.[0-9]#![0-9]#) number
  * :p (([A-Za-z]\:!)(\\!/!)(:f(.:f!)(\\!/))@:f(.:f!.!)) path
  117. * :q ("[~"]@"!'[~']@') quoted string
  118. * :w ([a-zA-Z]#) word
  119. * :z ([0-9]#) integer
  120. *
  121. */
  /* 256-entry character table that is defined elsewhere and only declared
   * here; none of the routines shown in this listing reference it.
   */
  extern char XLTab[256]; /* lower-casing table */
  /* There are several classes of characters:
   *
   * Closure characters are suffixes that indicate repetition of the previous
   * RE.
   *
   * Simple RE chars are characters that indicate a particular type of match
   *
   */
  /* Closure character equates. These are the values REClosureChar returns
   * for the suffix that follows a simple RE. Z syntax recognizes the
   * suffixes + * ^ @ #, while UNIX syntax recognizes only + (CCH_PLUS) and
   * * (CCH_CLOSURE).
   */
  #define CCH_SMPLUS 0 /* plus closure: Z '+', 1 or more, minimal match */
  #define CCH_SMCLOSURE 1 /* star closure: Z '*', 0 or more, minimal match */
  #define CCH_POWER 2 /* n repetitions of previous pattern: Z '^' */
  #define CCH_CLOSURE 3 /* greedy closure: Z '@' or UNIX '*', 0 or more */
  #define CCH_PLUS 4 /* greedy plus: Z '#' or UNIX '+', 1 or more */
  #define CCH_NONE 5 /* no closure suffix follows the simple RE */
  #define CCH_ERROR -1 /* the simple RE itself failed to parse */
  /* Simple RE character equates, as returned by RECharType. The character
   * noted first is the Z-syntax spelling; the UNIX spelling follows where it
   * differs.
   */
  #define SR_BOL 0 /* ^ */
  #define SR_EOL 1 /* $ */
  #define SR_ANY 2 /* ? (UNIX: .) */
  #define SR_CCLBEG 3 /* [ */
  #define SR_LEFTOR 4 /* ( (UNIX: \{) */
  #define SR_CCLEND 5 /* ] */
  #define SR_ABBREV 6 /* : (UNIX: \:) */
  #define SR_RIGHTOR 7 /* ) (UNIX: \}) */
  #define SR_ORSIGN 8 /* ! (UNIX: \!) */
  #define SR_NOTSIGN 9 /* ~ (UNIX: \~) */
  #define SR_LEFTARG 10 /* { (UNIX: \() */
  #define SR_RIGHTARG 11 /* } (UNIX: \)) */
  #define SR_LETTER 12 /* anything not classified above */
  #define SR_PREV 13 /* $N (UNIX: \N), N a digit */
  /* Terminator sets handed to REParseRE as its pEnd argument. Each is a list
   * of SR_xx types ended by -1; REParseRE stops (without consuming) at the
   * first character whose type is in the list.
   */
  int EndAltRE[] = { SR_ORSIGN, SR_RIGHTOR, -1}; /* ends one alternative */
  int EndArg[] = { SR_RIGHTARG, -1}; /* ends a tagged argument */
  /* Abbreviation table, terminated by NULL. The first character of each
   * entry is the abbreviation letter that REParseAbbrev compares against;
   * the rest of the entry is the expansion, which REParseAbbrev parses as a
   * simple RE with Z syntax forced on.
   */
  char *pAbbrev[] = {
  "a[a-zA-Z0-9]",
  "b([ \t]#)",
  "c[a-zA-Z]",
  "d[0-9]",
  "f([~/\\\\ \\\"\\[\\]\\:<|>+=;,.]#!..!.)",
  "h([0-9a-fA-F]#)",
  "i([a-zA-Z_$][a-zA-Z0-9_$]@)",
  "n([0-9]#.[0-9]@![0-9]@.[0-9]#![0-9]#)",
  "p(([A-Za-z]\\:!)(\\\\!/!)(:f(.:f!)(\\\\!/))@:f(.:f!.!))",
  "q(\"[~\"]@\"!'[~']@')",
  "w([a-zA-Z]#)",
  "z([0-9]#)",
  NULL
  };
  /* Set of decimal digit characters, passed to strbskip when stepping over a
   * number in the input.
   */
  static char *digits = "0123456789";
  /* Current pattern syntax. RECompile sets it from its fZS argument and
   * REParseAbbrev forces it to TRUE while an abbreviation is expanded.
   */
  static flagType fZSyntax = TRUE; /* TRUE => use Z syntax for things */
  /* Number of tagged arguments closed so far in the pattern being parsed:
   * zeroed by RECompile, incremented by REParseRE after each RIGHTARG and
   * used by REParsePrev to validate a previous-match reference.
   */
  static unsigned int cArg;
  175. /* RECharType - classify a character type
  176. *
  177. * p character pointer
  178. *
  179. * returns type of character (SR_xx)
  180. */
  181. int
  182. RECharType (
  183. char *p
  184. )
  185. {
  186. if (fZSyntax)
  187. /* Zibo syntax
  188. */
  189. switch (*p) {
  190. case '^':
  191. return SR_BOL;
  192. case '$':
  193. if (isdigit (p[1]))
  194. return SR_PREV;
  195. else
  196. return SR_EOL;
  197. case '?':
  198. return SR_ANY;
  199. case '[':
  200. return SR_CCLBEG;
  201. case '(':
  202. return SR_LEFTOR;
  203. case ']':
  204. return SR_CCLEND;
  205. case ':':
  206. return SR_ABBREV;
  207. case ')':
  208. return SR_RIGHTOR;
  209. case '!':
  210. return SR_ORSIGN;
  211. case '~':
  212. return SR_NOTSIGN;
  213. case '{':
  214. return SR_LEFTARG;
  215. case '}':
  216. return SR_RIGHTARG;
  217. default:
  218. return SR_LETTER;
  219. } else
  220. /* Crappy UNIX syntax
  221. */
  222. switch (*p) {
  223. case '^':
  224. return SR_BOL;
  225. case '$':
  226. return SR_EOL;
  227. case '.':
  228. return SR_ANY;
  229. case '[':
  230. return SR_CCLBEG;
  231. case ']':
  232. return SR_CCLEND;
  233. case '\\':
  234. switch (p[1]) {
  235. case ':': /* \:C */
  236. return SR_ABBREV;
  237. case '(': /* \( */
  238. return SR_LEFTARG;
  239. case ')': /* \) */
  240. return SR_RIGHTARG;
  241. case '~': /* \~ */
  242. return SR_NOTSIGN;
  243. case '{': /* \{ */
  244. return SR_LEFTOR;
  245. case '}': /* \} */
  246. return SR_RIGHTOR;
  247. case '!': /* \! */
  248. return SR_ORSIGN;
  249. }
  250. if (isdigit (p[1])) /* \N */
  251. return SR_PREV;
  252. default:
  253. return SR_LETTER;
  254. }
  255. }
  256. /* RECharLen - length of character type
  257. *
  258. * p character pointer to type
  259. *
  260. * returns length in chars of type
  261. */
  262. int
  263. RECharLen (
  264. char *p
  265. )
  266. {
  267. if (fZSyntax)
  268. if (RECharType (p) == SR_PREV) /* $N */
  269. return 2;
  270. else
  271. if (RECharType (p) == SR_ABBREV) /* :N */
  272. return 2;
  273. else
  274. return 1;
  275. else {
  276. if (*p == '\\')
  277. switch (p[1]) {
  278. case '{':
  279. case '}':
  280. case '~':
  281. case '(':
  282. case ')':
  283. case '!':
  284. return 2; /* \C */
  285. case ':': /* \:C */
  286. return 3;
  287. default:
  288. if (isdigit (p[1]))
  289. return 2; /* \N */
  290. else
  291. return 1;
  292. }
  293. return 1;
  294. }
  295. }
  /* REClosureLen - length of a closure operator
   *
   * p  character pointer to the closure operator (not examined)
   *
   * returns length in chars of the operator, which is always 1
   */
  int
  REClosureLen (
      char *p
      )
  {
      (void) p;           /* the pointer is accepted but never inspected */
      return 1;
  }
  310. /* REParseRE - parse a general RE up to but not including the pEnd set
  311. * of chars. Apply a particular action to each node in the parse tree.
  312. *
  313. * pAction Parse action routine to call at particluar points in the
  314. * parse tree. This routine returns an unsigned quantity that
  315. * is expected to be passed on to other action calls within the
  316. * same node.
  317. * p character pointer to string being parsed
  318. * pEnd pointer to set of char types that end the current RE.
  319. * External callers will typically use NULL for this value.
  320. * Internally, however, we need to break on the ALT-terminating
  321. * types or on arg-terminating types.
  322. *
  323. * Returns: pointer to delimited character if successful parse
  324. * NULL if unsuccessful parse (syntax error).
  325. *
  326. */
  327. char *
  328. REParseRE (
  329. PACT pAction,
  330. register char *p,
  331. int *pEnd
  332. )
  333. {
  334. int *pe;
  335. UINT_PTR u;
  336. DEBOUT (("REParseRE (%04x, %s)\n", pAction, p));
  337. while (TRUE) {
  338. /* If we're at end of input
  339. */
  340. if (*p == '\0')
  341. /* If we're not in the midst of an open expression
  342. */
  343. if (pEnd == NULL)
  344. /* return the current parse position
  345. */
  346. return p;
  347. else {
  348. /* End of input, but expecting more, ERROR
  349. */
  350. DEBOUT (("REParse expecting more, ERROR\n"));
  351. return NULL;
  352. }
  353. /* If there is an open expression
  354. */
  355. if (pEnd != NULL)
  356. /* Find a matching character
  357. */
  358. for (pe = pEnd; *pe != -1; pe++)
  359. if (RECharType (p) == *pe)
  360. return p;
  361. /* If we are looking at a left argument
  362. */
  363. if (RECharType (p) == SR_LEFTARG) {
  364. /* Parse LEFTARG .re. RIGHTARG
  365. */
  366. u = (*pAction) (LEFTARG, 0, '\0', '\0');
  367. if ((p = REParseRE (pAction, p + RECharLen (p), EndArg)) == NULL)
  368. return NULL;
  369. (*pAction) (RIGHTARG, u, '\0', '\0');
  370. cArg++;
  371. p += RECharLen (p);
  372. } else
  373. /* Parse .e.
  374. */
  375. if ((p = REParseE (pAction, p)) == NULL)
  376. return NULL;
  377. }
  378. }
  379. /* REParseE - parse a simple regular expression with potential closures.
  380. *
  381. * pAction Action to apply at special parse nodes
  382. * p character pointer to spot where parsing occurs
  383. *
  384. * Returns pointer past parsed text if successful
  385. * NULL otherwise (syntax error)
  386. */
  387. char *
  388. REParseE (
  389. PACT pAction,
  390. register char *p
  391. )
  392. {
  393. DEBOUT (("REParseE (%04x, %s)\n", pAction, p));
  394. switch (REClosureChar (p)) {
  395. case CCH_SMPLUS:
  396. if (REParseSE (pAction, p) == NULL)
  397. return NULL;
  398. case CCH_SMCLOSURE:
  399. return REParseClosure (pAction, p);
  400. case CCH_PLUS:
  401. if (REParseSE (pAction, p) == NULL)
  402. return NULL;
  403. case CCH_CLOSURE:
  404. return REParseGreedy (pAction, p);
  405. case CCH_POWER:
  406. return REParsePower (pAction, p);
  407. case CCH_NONE:
  408. return REParseSE (pAction, p);
  409. default:
  410. return NULL;
  411. }
  412. }
  413. /* REParseSE - parse a simple regular expression
  414. *
  415. * pAction Action to apply at special parse nodes
  416. * p character pointer to spot where parsing occurs
  417. *
  418. * Returns pointer past parsed text if successful
  419. * NULL otherwise (syntax error)
  420. */
  421. char *
  422. REParseSE (
  423. register PACT pAction,
  424. register char *p
  425. )
  426. {
  427. DEBOUT (("REParseSE (%04x, %s)\n", pAction, p));
  428. switch (RECharType (p)) {
  429. case SR_CCLBEG:
  430. return REParseClass (pAction, p);
  431. case SR_ANY:
  432. return REParseAny (pAction, p);
  433. case SR_BOL:
  434. return REParseBOL (pAction, p);
  435. case SR_EOL:
  436. return REParseEOL (pAction, p);
  437. case SR_PREV:
  438. return REParsePrev (pAction, p);
  439. case SR_LEFTOR:
  440. return REParseAlt (pAction, p);
  441. case SR_NOTSIGN:
  442. return REParseNot (pAction, p);
  443. case SR_ABBREV:
  444. return REParseAbbrev (pAction, p);
  445. default:
  446. return REParseChar (pAction, p);
  447. }
  448. }
  449. /* REParseClass - parse a class membership match
  450. *
  451. * pAction Action to apply at beginning of parse and at each range
  452. * p character pointer to spot where parsing occurs
  453. *
  454. * Returns pointer past parsed text if successful
  455. * NULL otherwise (syntax error)
  456. */
  457. char *
  458. REParseClass (
  459. PACT pAction,
  460. register char *p
  461. )
  462. {
  463. char c;
  464. UINT_PTR u;
  465. DEBOUT (("REParseClass (%04x, %s)\n", pAction, p));
  466. p += RECharLen (p);
  467. if ((fZSyntax && *p == '~') || (!fZSyntax && *p == '^')) {
  468. u = (*pAction) (CCLNOT, 0, '\0', '\0');
  469. p += RECharLen (p);
  470. } else
  471. u = (*pAction) (CCLBEG, 0, '\0', '\0');
  472. while (RECharType (p) != SR_CCLEND) {
  473. if (*p == '\\')
  474. p++;
  475. if (*p == '\0') {
  476. DEBOUT (("REParseClass expecting more, ERROR\n"));
  477. return NULL;
  478. }
  479. c = *p++;
  480. if (*p == '-') {
  481. p++;
  482. if (*p == '\\')
  483. p++;
  484. if (*p == '\0') {
  485. DEBOUT (("REParseClass expecting more, ERROR\n"));
  486. return NULL;
  487. }
  488. (*pAction) (RANGE, u, c, *p);
  489. p++;
  490. } else
  491. (*pAction) (RANGE, u, c, c);
  492. }
  493. return p + RECharLen (p);
  494. }
  495. /* REParseAny - parse a match-any-character expression
  496. *
  497. * pAction Action to apply
  498. * p character pointer to spot where parsing occurs
  499. *
  500. * Returns pointer past parsed text if successful
  501. * NULL otherwise (syntax error)
  502. */
  503. char *
  504. REParseAny (
  505. PACT pAction,
  506. char *p
  507. )
  508. {
  509. DEBOUT (("REParseAny (%04x, %s)\n", pAction, p));
  510. (*pAction) (ANY, 0, '\0', '\0');
  511. return p + RECharLen (p);
  512. }
  513. /* REParseBOL - parse a beginning-of-line match
  514. *
  515. * pAction Action to apply
  516. * p character pointer to spot where parsing occurs
  517. *
  518. * Returns pointer past parsed text if successful
  519. * NULL otherwise (syntax error)
  520. */
  521. char *
  522. REParseBOL (
  523. PACT pAction,
  524. char *p
  525. )
  526. {
  527. DEBOUT (("REParseBOL (%04x, %s)\n", pAction, p));
  528. (*pAction) (BOL, 0, '\0', '\0');
  529. return p + RECharLen (p);
  530. }
  531. /* REParsePrev - parse a previous-match item
  532. *
  533. * pAction Action to apply
  534. * p character pointer to spot where parsing occurs
  535. *
  536. * Returns pointer past parsed text if successful
  537. * NULL otherwise (syntax error)
  538. */
  539. char *
  540. REParsePrev (
  541. PACT pAction,
  542. char *p
  543. )
  544. {
  545. unsigned int i = *(p + 1) - '0';
  546. DEBOUT (("REParsePrev (%04x, %s)\n", pAction, p));
  547. if (i < 1 || i > cArg) {
  548. DEBOUT (("REParsePrev invalid previous number, ERROR\n"));
  549. return NULL;
  550. }
  551. (*pAction) (PREV, i, '\0', '\0');
  552. return p + RECharLen (p);
  553. }
  554. /* REParseEOL - parse an end-of-line match
  555. *
  556. * pAction Action to apply
  557. * p character pointer to spot where parsing occurs
  558. *
  559. * Returns pointer past parsed text if successful
  560. * NULL otherwise (syntax error)
  561. */
  562. char *
  563. REParseEOL (
  564. PACT pAction,
  565. char *p
  566. )
  567. {
  568. DEBOUT (("REParseEOL (%04x, %s)\n", pAction, p));
  569. (*pAction) (EOL, 0, '\0', '\0');
  570. return p + RECharLen (p);
  571. }
  572. /* REParseAlt - parse a series of alternatives
  573. *
  574. * pAction Action to apply before and after each alternative
  575. * p character pointer to spot where parsing occurs
  576. *
  577. * Returns pointer past parsed text if successful
  578. * NULL otherwise (syntax error)
  579. */
  580. char *
  581. REParseAlt (
  582. PACT pAction,
  583. register char *p
  584. )
  585. {
  586. UINT_PTR u = 0;
  587. DEBOUT (("REParseAlt (%04x, %s)\n", pAction, p));
  588. while (RECharType (p) != SR_RIGHTOR) {
  589. p += RECharLen (p);
  590. u = (*pAction) (LEFTOR, u, '\0', '\0');
  591. if ((p = REParseRE (pAction, p, EndAltRE)) == NULL)
  592. return NULL;
  593. u = (*pAction) (ORSIGN, u, '\0', '\0');
  594. }
  595. (*pAction) (RIGHTOR, u, '\0', '\0');
  596. return p + RECharLen (p);
  597. }
  598. /* REParseNot - parse a guard-against match
  599. *
  600. * pAction Action to apply
  601. * p character pointer to spot where parsing occurs
  602. *
  603. * Returns pointer past parsed text if successful
  604. * NULL otherwise (syntax error)
  605. */
  606. char *
  607. REParseNot (
  608. PACT pAction,
  609. register char *p
  610. )
  611. {
  612. UINT_PTR u;
  613. DEBOUT (("REParseNot (%04x, %s)\n", pAction, p));
  614. p += RECharLen (p);
  615. if (*p == '\0') {
  616. DEBOUT (("REParseNot expecting more, ERROR\n"));
  617. return NULL;
  618. }
  619. u = (*pAction) (NOTSIGN, 0, '\0', '\0');
  620. p = REParseSE (pAction, p);
  621. (*pAction) (NOTSIGN1, u, '\0', '\0');
  622. return p;
  623. }
  624. /* REParseAbbrev - parse and expand an abbreviation
  625. *
  626. * Note that since the abbreviations are in Z syntax, we must change syntax
  627. * temporarily to Z. We are careful to do this so that we do not mess up
  628. * advancign the pointers.
  629. *
  630. * pAction Action to apply
  631. * p character pointer to spot where parsing occurs
  632. *
  633. * Returns pointer past parsed text if successful
  634. * NULL otherwise (syntax error)
  635. */
  636. char *
  637. REParseAbbrev (
  638. PACT pAction,
  639. register char *p
  640. )
  641. {
  642. int i;
  643. flagType fZSTmp;
  644. DEBOUT (("REParseAbbrev (%04x, %s)\n", pAction, p));
  645. p += RECharLen (p);
  646. fZSTmp = fZSyntax;
  647. fZSyntax = TRUE;
  648. if (p[-1] == '\0') {
  649. DEBOUT (("REParseAbbrev expecting abbrev char, ERROR\n"));
  650. fZSyntax = fZSTmp;
  651. return NULL;
  652. }
  653. for (i = 0; pAbbrev[i]; i++)
  654. if (p[-1] == *pAbbrev[i])
  655. if (REParseSE (pAction, pAbbrev[i] + 1) == NULL) {
  656. fZSyntax = fZSTmp;
  657. return NULL;
  658. } else {
  659. fZSyntax = fZSTmp;
  660. return p;
  661. }
  662. DEBOUT (("REParseAbbrev found invalid abbrev char %s, ERROR\n", p - 1));
  663. fZSyntax = fZSTmp;
  664. return NULL;
  665. }
  666. /* REParseChar - parse a single character match
  667. *
  668. * pAction Action to apply
  669. * p character pointer to spot where parsing occurs
  670. *
  671. * Returns pointer past parsed text if successful
  672. * NULL otherwise (syntax error)
  673. */
  674. char *
  675. REParseChar (
  676. PACT pAction,
  677. register char *p
  678. )
  679. {
  680. DEBOUT (("REParseChar (%04x, %s)\n", pAction, p));
  681. if (*p == '\\')
  682. p++;
  683. if (*p == '\0') {
  684. DEBOUT (("REParseChar expected more, ERROR\n"));
  685. return NULL;
  686. }
  687. (*pAction) (LETTER, 0, *p, '\0');
  688. return p+1;
  689. }
  690. /* REParseClosure - parse a minimal match closure. The match occurs by
  691. * matching none, then one, ...
  692. *
  693. * pAction Action to apply
  694. * p character pointer to spot where parsing occurs
  695. *
  696. * Returns pointer past parsed text if successful
  697. * NULL otherwise (syntax error)
  698. */
  699. char *
  700. REParseClosure (
  701. PACT pAction,
  702. register char *p
  703. )
  704. {
  705. UINT_PTR u;
  706. DEBOUT (("REParseaClosure (%04x, %s)\n", pAction, p));
  707. u = (*pAction) (SMSTAR, 0, '\0', '\0');
  708. if ((p = REParseSE (pAction, p)) == NULL)
  709. return NULL;
  710. (*pAction) (SMSTAR1, u, '\0', '\0');
  711. return p + REClosureLen (p);
  712. }
  713. /* REParseGreedy - parse a maximal-match closure. The match occurs by
  714. * matching the maximal number and then backing off as failures occur.
  715. *
  716. * pAction Action to apply
  717. * p character pointer to spot where parsing occurs
  718. *
  719. * Returns pointer past parsed text if successful
  720. * NULL otherwise (syntax error)
  721. */
  722. char *
  723. REParseGreedy (
  724. PACT pAction,
  725. register char *p
  726. )
  727. {
  728. UINT_PTR u;
  729. DEBOUT (("REParseGreedy (%04x, %s)\n", pAction, p));
  730. u = (*pAction) (STAR, 0, '\0', '\0');
  731. if ((p = REParseSE (pAction, p)) == NULL)
  732. return NULL;
  733. (*pAction) (STAR1, u, '\0', '\0');
  734. return p + REClosureLen (p);
  735. }
  736. /* REParsePower - parse a power-closure. This is merely the simple pattern
  737. * repeated the number of times specified by the exponent.
  738. *
  739. * pAction Action to apply
  740. * p character pointer to spot where parsing occurs
  741. *
  742. * Returns pointer past parsed text if successful
  743. * NULL otherwise (syntax error)
  744. */
  745. char *
  746. REParsePower (
  747. PACT pAction,
  748. char *p
  749. )
  750. {
  751. register char *p1;
  752. int exp;
  753. DEBOUT (("REParsePower (%04x, %s)\n", pAction, p));
  754. /* We have .se. POWER something. Skip over the .se. and POWER
  755. * to make sure that what follows is a valid number
  756. */
  757. p1 = REParseSE (NullAction, p);
  758. if (p1 == NULL)
  759. /* Parse of .se. failed
  760. */
  761. return NULL;
  762. /* skip POWER
  763. */
  764. p1 += REClosureLen (p1);
  765. if (*p1 == '\0') {
  766. DEBOUT (("REParsePower expecting more, ERROR\n"));
  767. return NULL;
  768. }
  769. /* try to parse off number */
  770. if (sscanf (p1, "%d", &exp) != 1) {
  771. DEBOUT (("REParsePower expecting number, ERROR\n"));
  772. return NULL;
  773. }
  774. p1 = strbskip (p1, digits);
  775. /* iterate the pattern the exponent number of times */
  776. while (exp--)
  777. if (REParseSE (pAction, p) == NULL)
  778. return NULL;
  779. return p1;
  780. }
  781. /* NullAction - a do-nothing action. Used for stubbing out the action
  782. * during a parse.
  783. */
  784. UINT_PTR
  785. NullAction(
  786. unsigned int type,
  787. UINT_PTR u,
  788. unsigned char x,
  789. unsigned char y
  790. )
  791. {
  792. type; u; x; y;
  793. return 0;
  794. }
  795. /* REClosureChar - return the character that corresponds to the next
  796. * closure to be parsed. We call REParseSE with a null action to merely
  797. * advance the character pointer to point just beyond the current simple
  798. * regular expression.
  799. *
  800. * p character pointer to spot where parsing occurs
  801. *
  802. * Returns closure character if appropriate
  803. * CCH_NONE if no closure character found.
  804. */
  805. char
  806. REClosureChar (
  807. char *p
  808. )
  809. {
  810. p = REParseSE (NullAction, p);
  811. if (p == NULL)
  812. return CCH_ERROR;
  813. if (fZSyntax)
  814. /* Zibo syntax
  815. */
  816. switch (*p) {
  817. case '^':
  818. return CCH_POWER;
  819. case '+':
  820. return CCH_SMPLUS;
  821. case '#':
  822. return CCH_PLUS;
  823. case '*':
  824. return CCH_SMCLOSURE;
  825. case '@':
  826. return CCH_CLOSURE;
  827. default:
  828. return CCH_NONE;
  829. } else
  830. /* Crappy UNIX syntax
  831. */
  832. switch (*p) {
  833. case '+':
  834. return CCH_PLUS;
  835. case '*':
  836. return CCH_CLOSURE;
  837. default:
  838. return CCH_NONE;
  839. }
  840. }
  841. /* RECompile - compile a pattern into the machine. Return a
  842. * pointer to the match machine.
  843. *
  844. * p character pointer to pattern being compiled
  845. *
  846. * Returns: pointer to the machine if compilation was successful
  847. * NULL if syntax error or not enough memory for malloc
  848. */
  849. struct patType *
  850. RECompile(
  851. char *p,
  852. flagType fCase,
  853. flagType fZS
  854. )
  855. {
  856. fZSyntax = fZS;
  857. REEstimate (p);
  858. DEBOUT (("Length is %04x\n", RESize));
  859. if (RESize == -1)
  860. return NULL;
  861. if ((REPat = (struct patType *) (*tools_alloc) (RESize)) == NULL)
  862. return NULL;
  863. memset ((char far *) REPat, -1, RESize);
  864. memset ((char far *) REPat->pArgBeg, 0, sizeof (REPat->pArgBeg));
  865. memset ((char far *) REPat->pArgEnd, 0, sizeof (REPat->pArgEnd));
  866. REip = REPat->code;
  867. REArg = 1;
  868. REPat->fCase = fCase;
  869. REPat->fUnix = (flagType) !fZS;
  870. cArg = 0;
  871. CompileAction (PROLOG, 0, '\0', '\0');
  872. if (REParseRE (CompileAction, p, NULL) == NULL)
  873. return NULL;
  874. CompileAction (EPILOG, 0, '\0', '\0');
  875. #if DEBUG
  876. REDump (REPat);
  877. #endif
  878. return REPat;
  879. }
  /* Escaped - translate an escaped character ala UNIX C conventions.
   *
   *  \t => tab    \e => ESC char    \h => backspace    \g => bell
   *  \n => lf     \r => cr          \\ => \
   *
   * c  character that followed the backslash
   *
   * Returns: character as per above; any character not listed is returned
   *          unchanged
   */
  char
  Escaped(
      char c
      )
  {
      /* parallel tables: escape letter and the character it stands for */
      static const char chFrom[] = { 't',  'e',  'h',  'g',  'n',  'r'  };
      static const char chTo[]   = { '\t', 0x1B, 0x08, 0x07, '\n', '\r' };
      unsigned int k;

      for (k = 0; k < sizeof (chFrom); k++) {
          if (chFrom[k] == c)
              return chTo[k];
      }

      /* backslash and everything else stand for themselves */
      return c;
  }
  913. /* REGetArg - copy argument string out from match.
  914. *
  915. * pat matched pattern
  916. * i index of argument to fetch, 0 is entire pattern
  917. * p destination of argument
  918. *
  919. * Returns: TRUE if successful, FALSE if i is out of range.
  920. */
  921. flagType
  922. REGetArg (
  923. struct patType *pat,
  924. int i,
  925. char *p
  926. )
  927. {
  928. int l = 0;
  929. if (i > MAXPATARG)
  930. return FALSE;
  931. else
  932. if (pat->pArgBeg[i] != (char *)-1)
  933. memmove ((char far *)p, (char far *)pat->pArgBeg[i], l = RELength (pat, i));
  934. p[l] = '\0';
  935. return TRUE;
  936. }
  937. /* RETranslate - translate a pattern string and match structure into an
  938. * output string. During pattern search-and-replace, RETranslate is used
  939. * to generate an output string based on an input match pattern and a template
  940. * that directs the output.
  941. *
  942. * The input match is any patType returned from RECompile that has been passed
  943. * to fREMatch and that causes fREMatch to return TRUE. The template string
  944. * is any set of ascii chars. The $ character leads in arguments:
  945. *
  946. * $$ is replaced with $
  947. * $0 is replaced with the entire match string
  948. * $1-$9 is replaced with the corresponding tagged (by {}) item from
  949. * the match.
  950. *
  951. * An alternative method is to specify the argument as:
  952. *
  953. * $([w,]a) where a is the argument number (0-9) and w is an optional field
  954. * width that will be used in a printf %ws format.
  955. *
  956. * buf pattern matched
  957. * src template for the match
  958. * dst destination of the translation
  959. *
  960. * Returns: TRUE if translation was successful, FALSE otherwise
  961. */
  962. flagType
  963. RETranslate (
  964. struct patType *buf,
  965. register char *src,
  966. register char *dst
  967. )
  968. {
  969. int i, w;
  970. char *work;
  971. char chArg = (char) (buf->fUnix ? '\\' : '$');
  972. work = (*tools_alloc) (MAXLINELEN);
  973. if (work == NULL)
  974. return FALSE;
  975. *dst = '\0';
  976. while (*src != '\0') {
  977. /* Process tagged substitutions first
  978. */
  979. if (*src == chArg && (isdigit (src[1]) || src[1] == '(')) {
  980. /* presume 0-width field */
  981. w = 0;
  982. /* skip $ and char */
  983. src += 2;
  984. /* if we saw $n */
  985. if (isdigit (src[-1]))
  986. i = src[-1] - '0';
  987. /* else we saw $( */
  988. else {
  989. /* get tagged expr number */
  990. i = atoi (src);
  991. /* skip over number */
  992. if (*src == '-')
  993. src++;
  994. src = strbskip (src, digits);
  995. /* was there a comma? */
  996. if (*src == ',') {
  997. /* We saw field width, parse off expr number */
  998. w = i;
  999. i = atoi (++src);
  1000. src = strbskip (src, digits);
  1001. }
  1002. /* We MUST end with a close paren */
  1003. if (*src++ != ')') {
  1004. free (work);
  1005. return FALSE;
  1006. }
  1007. }
  1008. /* w is field width
  1009. * i is selected argument
  1010. */
  1011. if (!REGetArg (buf, i, work)) {
  1012. free (work);
  1013. return FALSE;
  1014. }
  1015. sprintf (dst, "%*s", w, work);
  1016. dst += strlen (dst);
  1017. } else
  1018. /* process escaped characters */
  1019. if (*src == '\\') {
  1020. src++;
  1021. if (!*src) {
  1022. free (work);
  1023. return FALSE;
  1024. }
  1025. *dst++ = Escaped (*src++);
  1026. } else
  1027. /* chArg quotes itself */
  1028. if (*src == chArg && src[1] == chArg) {
  1029. *dst++ = chArg;
  1030. src += 2;
  1031. } else
  1032. *dst++ = *src++;
  1033. }
  1034. *dst = '\0';
  1035. free (work);
  1036. return TRUE;
  1037. }
  1038. /* RETranslateLength - given a matched pattern and a replacement string
  1039. * return the length of the final replacement
  1040. *
  1041. * The inputs have the same syntax/semantics as in RETranslate.
  1042. *
  1043. * buf pattern matched
  1044. * src template for the match
  1045. *
  1046. * Returns: number of bytes in total replacement, -1 if error
  1047. */
  1048. int
  1049. RETranslateLength (
  1050. struct patType *buf,
  1051. register char *src
  1052. )
  1053. {
  1054. int i, w;
  1055. int length = 0;
  1056. char chArg = (char) (buf->fUnix ? '\\' : '$');
  1057. while (*src != '\0') {
  1058. /* Process tagged substitutions first
  1059. */
  1060. if (*src == chArg && (isdigit (src[1]) || src[1] == '(')) {
  1061. w = 0;
  1062. src += 2;
  1063. if (isdigit (src[-1]))
  1064. i = src[-1] - '0';
  1065. else {
  1066. i = atoi (src);
  1067. if (*src == '-')
  1068. src++;
  1069. src = strbskip (src, digits);
  1070. if (*src == ',') {
  1071. w = i;
  1072. i = atoi (++src);
  1073. src = strbskip (src, digits);
  1074. }
  1075. if (*src++ != ')')
  1076. return -1;
  1077. }
  1078. /* w is field width
  1079. * i is selected argument
  1080. */
  1081. i = RELength (buf, i);
  1082. length += max (i, abs(w));
  1083. } else
  1084. /* process escaped characters */
  1085. if (*src == '\\') {
  1086. src += 2;
  1087. length++;
  1088. } else
  1089. /* chArg quotes itself */
  1090. if (*src == chArg && src[1] == chArg) {
  1091. src += 2;
  1092. length++;
  1093. } else {
  1094. length++;
  1095. src++;
  1096. }
  1097. }
  1098. return length;
  1099. }
  1100. /* RELength - return length of argument in match.
  1101. *
  1102. * pat matched pattern
  1103. * i index of argument to examine, 0 is entire pattern
  1104. *
  1105. * Returns: length of ith argument, -1 if i is out-of-range.
  1106. */
  1107. int
  1108. RELength (
  1109. struct patType *pat,
  1110. int i
  1111. )
  1112. {
  1113. if (i > MAXPATARG)
  1114. return -1;
  1115. else
  1116. if (pat->pArgBeg[i] == (char *)-1)
  1117. return 0;
  1118. else
  1119. return (int)(pat->pArgEnd[i] - pat->pArgBeg[i]);
  1120. }
  1121. /* REStart - return pointer to beginning of match.
  1122. *
  1123. * ppat matched pattern
  1124. *
  1125. * Returns: character pointer to beginning of match
  1126. */
  1127. char *
  1128. REStart (
  1129. struct patType *pat
  1130. )
  1131. {
  1132. return pat->pArgBeg[0] == (char *)-1 ? NULL : pat->pArgBeg[0];
  1133. }