Source code of Windows XP (NT5)
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1290 lines
40 KiB

  1. /* $Header: /nw/tony/src/stevie/src/RCS/regexp.c,v 1.5 89/07/07 16:27:11 tony Exp $
  2. *
  3. * NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE
  4. *
  5. * This is NOT the original regular expression code as written by
  6. * Henry Spencer. This code has been modified specifically for use
  7. * with the STEVIE editor, and should not be used apart from compiling
  8. * STEVIE. If you want a good regular expression library, get the
  9. * original code. The copyright notice that follows is from the
  10. * original.
  11. *
  12. * NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE
  13. *
  14. *
  15. * regcomp and regexec -- regsub and regerror are elsewhere
  16. *
  17. * Copyright (c) 1986 by University of Toronto.
  18. * Written by Henry Spencer. Not derived from licensed software.
  19. *
  20. * Permission is granted to anyone to use this software for any
  21. * purpose on any computer system, and to redistribute it freely,
  22. * subject to the following restrictions:
  23. *
  24. * 1. The author is not responsible for the consequences of use of
  25. * this software, no matter how awful, even if they arise
  26. * from defects in it.
  27. *
  28. * 2. The origin of this software must not be misrepresented, either
  29. * by explicit claim or by omission.
  30. *
  31. * 3. Altered versions must be plainly marked as such, and must not
  32. * be misrepresented as being the original software.
  33. *
  34. * Beware that some of this code is subtly aware of the way operator
  35. * precedence is structured in regular expressions. Serious changes in
  36. * regular-expression syntax might require a total rethink.
  37. *
  38. */
  39. #include "env.h"
  40. #include <stdio.h>
  41. #include <string.h>
  42. #include <malloc.h>
  43. #include "regexp.h"
  44. #include "regmagic.h"
  45. int cstrncmp(char *,char *,int);
  46. char *cstrchr(char *,char);
  47. /*
  48. * The "internal use only" fields in regexp.h are present to pass info from
  49. * compile to execute that permits the execute phase to run lots faster on
  50. * simple cases. They are:
  51. *
  52. * regstart char that must begin a match; '\0' if none obvious
  53. * reganch is the match anchored (at beginning-of-line only)?
  54. * regmust string (pointer into program) that match must include, or NULL
  55. * regmlen length of regmust string
  56. *
  57. * Regstart and reganch permit very fast decisions on suitable starting points
  58. * for a match, cutting down the work a lot. Regmust permits fast rejection
  59. * of lines that cannot possibly match. The regmust tests are costly enough
  60. * that regcomp() supplies a regmust only if the r.e. contains something
  61. * potentially expensive (at present, the only such thing detected is * or +
  62. * at the start of the r.e., which can involve a lot of backup). Regmlen is
  63. * supplied because the test in regexec() needs it and regcomp() is computing
  64. * it anyway.
  65. */
  66. /*
  67. * Structure for regexp "program". This is essentially a linear encoding
  68. * of a nondeterministic finite-state machine (aka syntax charts or
  69. * "railroad normal form" in parsing technology). Each node is an opcode
  70. * plus a "next" pointer, possibly plus an operand. "Next" pointers of
  71. * all nodes except BRANCH implement concatenation; a "next" pointer with
  72. * a BRANCH on both ends of it is connecting two alternatives. (Here we
  73. * have one of the subtle syntax dependencies: an individual BRANCH (as
  74. * opposed to a collection of them) is never concatenated with anything
  75. * because of operator precedence.) The operand of some types of node is
  76. * a literal string; for others, it is a node leading into a sub-FSM. In
  77. * particular, the operand of a BRANCH node is the first node of the branch.
  78. * (NB this is *not* a tree structure: the tail of the branch connects
  79. * to the thing following the set of BRANCHes.) The opcodes are:
  80. */
  /*
   * Opcode table.  The "opnd?" column says what, if anything, follows the
   * three-byte node header: "no" = nothing, "str" = a NUL-terminated
   * literal string, "node" = another node (a sub-program).
   *
   * definition      number     opnd?   meaning
   */
  #define END        0       /* no      End of program. */
  #define BOL        1       /* no      Match "" at beginning of line. */
  #define EOL        2       /* no      Match "" at end of line. */
  #define ANY        3       /* no      Match any one character. */
  #define ANYOF      4       /* str     Match any character in this string. */
  #define ANYBUT     5       /* str     Match any character not in this string. */
  #define BRANCH     6       /* node    Match this alternative, or the next... */
  #define BACK       7       /* no      Match "", "next" ptr points backward. */
  #define EXACTLY    8       /* str     Match this string. */
  #define NOTHING    9       /* no      Match empty string. */
  #define STAR       10      /* node    Match this (simple) thing 0 or more times. */
  #define PLUS       11      /* node    Match this (simple) thing 1 or more times. */
  #define OPEN       20      /* no      Mark this point in input as start of #n. */
                             /*         OPEN+1 is number 1, etc. */
  #define CLOSE      30      /* no      Analogous to OPEN. */
  97. /*
  98. * Opcode notes:
  99. *
  100. * BRANCH The set of branches constituting a single choice are hooked
  101. * together with their "next" pointers, since precedence prevents
  102. * anything being concatenated to any individual branch. The
  103. * "next" pointer of the last BRANCH in a choice points to the
  104. * thing following the whole choice. This is also where the
  105. * final "next" pointer of each individual branch points; each
  106. * branch starts with the operand node of a BRANCH node.
  107. *
  108. * BACK Normal "next" pointers all implicitly point forward; BACK
  109. * exists to make loop structures possible.
  110. *
  111. * STAR,PLUS '?', and complex '*' and '+', are implemented as circular
  112. * BRANCH structures using BACK. Simple cases (one character
  113. * per match) are implemented with STAR and PLUS for speed
  114. * and to minimize recursive plunges.
  115. *
  116. * OPEN,CLOSE ...are numbered at compile time.
  117. */
  118. /*
  119. * A node is one char of opcode followed by two chars of "next" pointer.
  120. * "Next" pointers are stored as two 8-bit pieces, high order first. The
  121. * value is a positive offset from the opcode of the node containing it.
  122. * An operand, if any, simply follows the node. (Note that much of the
  123. * code generation knows about this implicit relationship.)
  124. *
  125. * Using two bytes for the "next" pointer is vast overkill for most things,
  126. * but allows patterns to get big without disasters.
  127. */
  /* Node accessors: byte 0 is the opcode, bytes 1-2 the big-endian "next" offset. */
  #define OP(p)       ((p)[0])
  #define NEXT(p)     ((((p)[1] & 0377) << 8) + ((p)[2] & 0377))
  #define OPERAND(p)  (&(p)[3])
  131. /*
  132. * See regmagic.h for one further detail of program structure.
  133. */
  /*
   * Utility definitions.
   */
  /* UCHARAT - fetch the character at p as a non-negative int. */
  #ifndef CHARBITS
  #define UCHARAT(p) ((int)*(unsigned char *)(p))
  #else
  #define UCHARAT(p) ((int)*(p)&CHARBITS)
  #endif
  /*
   * FAIL - report m through regerror() and return NULL from the enclosing
   * function.  Wrapped in do/while(0) so that "FAIL(x);" is exactly one
   * statement and is safe as the body of an unbraced if/else.
   */
  #define FAIL(m) do { regerror(m); return(NULL); } while (0)
  /* ISMULT - is c one of the repetition operators? */
  #define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?')
  /* META - every character that ends a run of ordinary pattern characters. */
  #define META "^$.[()|?+*\\"
  /*
   * Flags to be passed up and down.
   */
  #define HASWIDTH 01 /* Known never to match null string. */
  #define SIMPLE 02 /* Simple enough to be STAR/PLUS operand. */
  #define SPSTART 04 /* Starts with * or +. */
  #define WORST 0 /* Worst case. */
  #ifndef ORIGINAL
  /*
   * The following supports the ability to ignore case in searches.
   */
  #include <ctype.h>
  int reg_ic = 0; /* set by callers to ignore case */
  /*
   * mkup - convert to upper case IF we're doing caseless compares
   *
   * The <ctype.h> functions are only defined for EOF and values that fit in
   * an unsigned char, so the argument is cast before being classified or
   * converted; a plain (possibly negative) char would be undefined behaviour.
   * The converted value is cast back to char so that both arms of the
   * conditional, and therefore both sides of any mkup(a) == mkup(b)
   * comparison, stay in the same value domain.
   *
   * Note: c is evaluated more than once; do not pass expressions with
   * side effects.
   */
  #define mkup(c) ((reg_ic && islower((unsigned char)(c))) \
                      ? (char)toupper((unsigned char)(c)) : (c))
  #endif
  /*
   * Compile-time state shared by regcomp() and its helpers.
   *
   * regcode doubles as a mode switch: while it points at regdummy the
   * helpers emit nothing and only add to regsize (the sizing pass).
   */
  static char regdummy;
  static char *regcode;   /* Code-emit pointer; &regdummy = don't. */
  static long regsize;    /* Code size. */
  static char *regparse;  /* Input-scan pointer. */
  static int regnpar;     /* () count. */
  /*
   * Forward declarations for regcomp()'s friends.  They are deliberately
   * left unprototyped to stay compatible with the old-style definitions
   * further down.
   */
  #ifndef STATIC
  #define STATIC static
  #endif
  STATIC char *reg();
  STATIC char *regbranch();
  STATIC char *regpiece();
  STATIC char *regatom();
  STATIC char *regnode();
  STATIC char *regnext();
  STATIC void regc();
  STATIC void reginsert();
  STATIC void regtail();
  STATIC void regoptail();
  #ifdef STRCSPN
  STATIC int strcspn();
  #endif
  190. /*
  191. - regcomp - compile a regular expression into internal code
  192. *
  193. * We can't allocate space until we know how big the compiled form will be,
  194. * but we can't compile it (and thus know how big it is) until we've got a
  195. * place to put the code. So we cheat: we compile it twice, once with code
  196. * generation turned off and size counting turned on, and once "for real".
  197. * This also means that we don't allocate space until we are sure that the
  198. * thing really will compile successfully, and we never have to move the
  199. * code and thus invalidate pointers into it. (Note that it has to be in
  200. * one piece because free() must be able to free it all.)
  201. *
  202. * Beware that the optimization-preparation code in here knows about some
  203. * of the structure of the compiled regexp.
  204. */
  205. regexp *
  206. regcomp(exp)
  207. char *exp;
  208. {
  209. register regexp *r;
  210. register char *scan;
  211. register char *longest;
  212. register int len;
  213. int flags;
  214. if (exp == NULL)
  215. FAIL("NULL argument");
  216. /* First pass: determine size, legality. */
  217. regparse = exp;
  218. regnpar = 1;
  219. regsize = 0L;
  220. regcode = &regdummy;
  221. regc(MAGIC);
  222. if (reg(0, &flags) == NULL)
  223. return(NULL);
  224. /* Small enough for pointer-storage convention? */
  225. if (regsize >= 32767L) /* Probably could be 65535L. */
  226. FAIL("regexp too big");
  227. /* Allocate space. */
  228. r = (regexp *)malloc(sizeof(regexp) + (unsigned)regsize);
  229. if (r == NULL)
  230. FAIL("out of space");
  231. /* Second pass: emit code. */
  232. regparse = exp;
  233. regnpar = 1;
  234. regcode = r->program;
  235. regc(MAGIC);
  236. if (reg(0, &flags) == NULL)
  237. return(NULL);
  238. /* Dig out information for optimizations. */
  239. r->regstart = '\0'; /* Worst-case defaults. */
  240. r->reganch = 0;
  241. r->regmust = NULL;
  242. r->regmlen = 0;
  243. scan = r->program+1; /* First BRANCH. */
  244. if (OP(regnext(scan)) == END) { /* Only one top-level choice. */
  245. scan = OPERAND(scan);
  246. /* Starting-point info. */
  247. if (OP(scan) == EXACTLY)
  248. r->regstart = *OPERAND(scan);
  249. else if (OP(scan) == BOL)
  250. r->reganch++;
  251. /*
  252. * If there's something expensive in the r.e., find the
  253. * longest literal string that must appear and make it the
  254. * regmust. Resolve ties in favor of later strings, since
  255. * the regstart check works with the beginning of the r.e.
  256. * and avoiding duplication strengthens checking. Not a
  257. * strong reason, but sufficient in the absence of others.
  258. */
  259. if (flags&SPSTART) {
  260. longest = NULL;
  261. len = 0;
  262. for (; scan != NULL; scan = regnext(scan))
  263. if (OP(scan) == EXACTLY && strlen(OPERAND(scan)) >= (size_t)len) {
  264. longest = OPERAND(scan);
  265. len = strlen(OPERAND(scan));
  266. }
  267. r->regmust = longest;
  268. r->regmlen = len;
  269. }
  270. }
  271. return(r);
  272. }
  273. /*
  274. - reg - regular expression, i.e. main body or parenthesized thing
  275. *
  276. * Caller must absorb opening parenthesis.
  277. *
  278. * Combining parenthesis handling with the base level of regular expression
  279. * is a trifle forced, but the need to tie the tails of the branches to what
  280. * follows makes it hard to avoid.
  281. */
  282. static char *
  283. reg(paren, flagp)
  284. int paren; /* Parenthesized? */
  285. int *flagp;
  286. {
  287. register char *ret;
  288. register char *br;
  289. register char *ender;
  290. register int parno;
  291. int flags;
  292. *flagp = HASWIDTH; /* Tentatively. */
  293. /* Make an OPEN node, if parenthesized. */
  294. if (paren) {
  295. if (regnpar >= NSUBEXP)
  296. FAIL("too many ()");
  297. parno = regnpar;
  298. regnpar++;
  299. ret = regnode(OPEN+parno);
  300. } else
  301. ret = NULL;
  302. /* Pick up the branches, linking them together. */
  303. br = regbranch(&flags);
  304. if (br == NULL)
  305. return(NULL);
  306. if (ret != NULL)
  307. regtail(ret, br); /* OPEN -> first. */
  308. else
  309. ret = br;
  310. if (!(flags&HASWIDTH))
  311. *flagp &= ~HASWIDTH;
  312. *flagp |= flags&SPSTART;
  313. while (*regparse == '|') {
  314. regparse++;
  315. br = regbranch(&flags);
  316. if (br == NULL)
  317. return(NULL);
  318. regtail(ret, br); /* BRANCH -> BRANCH. */
  319. if (!(flags&HASWIDTH))
  320. *flagp &= ~HASWIDTH;
  321. *flagp |= flags&SPSTART;
  322. }
  323. /* Make a closing node, and hook it on the end. */
  324. ender = regnode((paren) ? CLOSE+parno : END);
  325. regtail(ret, ender);
  326. /* Hook the tails of the branches to the closing node. */
  327. for (br = ret; br != NULL; br = regnext(br))
  328. regoptail(br, ender);
  329. /* Check for proper termination. */
  330. if (paren && *regparse++ != ')') {
  331. FAIL("unmatched ()");
  332. } else if (!paren && *regparse != '\0') {
  333. if (*regparse == ')') {
  334. FAIL("unmatched ()");
  335. } else
  336. FAIL("junk on end"); /* "Can't happen". */
  337. /* NOTREACHED */
  338. }
  339. return(ret);
  340. }
  341. /*
  342. - regbranch - one alternative of an | operator
  343. *
  344. * Implements the concatenation operator.
  345. */
  346. static char *
  347. regbranch(flagp)
  348. int *flagp;
  349. {
  350. register char *ret;
  351. register char *chain;
  352. register char *latest;
  353. int flags;
  354. *flagp = WORST; /* Tentatively. */
  355. ret = regnode(BRANCH);
  356. chain = NULL;
  357. while (*regparse != '\0' && *regparse != '|' && *regparse != ')') {
  358. latest = regpiece(&flags);
  359. if (latest == NULL)
  360. return(NULL);
  361. *flagp |= flags&HASWIDTH;
  362. if (chain == NULL) /* First piece. */
  363. *flagp |= flags&SPSTART;
  364. else
  365. regtail(chain, latest);
  366. chain = latest;
  367. }
  368. if (chain == NULL) /* Loop ran zero times. */
  369. (void) regnode(NOTHING);
  370. return(ret);
  371. }
  372. /*
  373. - regpiece - something followed by possible [*+?]
  374. *
  375. * Note that the branching code sequences used for ? and the general cases
  376. * of * and + are somewhat optimized: they use the same NOTHING node as
  377. * both the endmarker for their branch list and the body of the last branch.
  378. * It might seem that this node could be dispensed with entirely, but the
  379. * endmarker role is not redundant.
  380. */
  381. static char *
  382. regpiece(flagp)
  383. int *flagp;
  384. {
  385. register char *ret;
  386. register char op;
  387. register char *next;
  388. int flags;
  389. ret = regatom(&flags);
  390. if (ret == NULL)
  391. return(NULL);
  392. op = *regparse;
  393. if (!ISMULT(op)) {
  394. *flagp = flags;
  395. return(ret);
  396. }
  397. if (!(flags&HASWIDTH) && op != '?')
  398. FAIL("*+ operand could be empty");
  399. *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH);
  400. if (op == '*' && (flags&SIMPLE))
  401. reginsert(STAR, ret);
  402. else if (op == '*') {
  403. /* Emit x* as (x&|), where & means "self". */
  404. reginsert(BRANCH, ret); /* Either x */
  405. regoptail(ret, regnode(BACK)); /* and loop */
  406. regoptail(ret, ret); /* back */
  407. regtail(ret, regnode(BRANCH)); /* or */
  408. regtail(ret, regnode(NOTHING)); /* null. */
  409. } else if (op == '+' && (flags&SIMPLE))
  410. reginsert(PLUS, ret);
  411. else if (op == '+') {
  412. /* Emit x+ as x(&|), where & means "self". */
  413. next = regnode(BRANCH); /* Either */
  414. regtail(ret, next);
  415. regtail(regnode(BACK), ret); /* loop back */
  416. regtail(next, regnode(BRANCH)); /* or */
  417. regtail(ret, regnode(NOTHING)); /* null. */
  418. } else if (op == '?') {
  419. /* Emit x? as (x|) */
  420. reginsert(BRANCH, ret); /* Either x */
  421. regtail(ret, regnode(BRANCH)); /* or */
  422. next = regnode(NOTHING); /* null. */
  423. regtail(ret, next);
  424. regoptail(ret, next);
  425. }
  426. regparse++;
  427. if (ISMULT(*regparse))
  428. FAIL("nested *?+");
  429. return(ret);
  430. }
  431. /*
  432. - regatom - the lowest level
  433. *
  434. * Optimization: gobbles an entire sequence of ordinary characters so that
  435. * it can turn them into a single node, which is smaller to store and
  436. * faster to run. Backslashed characters are exceptions, each becoming a
  437. * separate node; the code is simpler that way and it's not worth fixing.
  438. */
  439. static char *
  440. regatom(flagp)
  441. int *flagp;
  442. {
  443. register char *ret;
  444. int flags;
  445. *flagp = WORST; /* Tentatively. */
  446. switch (*regparse++) {
  447. case '^':
  448. ret = regnode(BOL);
  449. break;
  450. case '$':
  451. ret = regnode(EOL);
  452. break;
  453. case '.':
  454. ret = regnode(ANY);
  455. *flagp |= HASWIDTH|SIMPLE;
  456. break;
  457. case '[': {
  458. register int class;
  459. register int classend;
  460. if (*regparse == '^') { /* Complement of range. */
  461. ret = regnode(ANYBUT);
  462. regparse++;
  463. } else
  464. ret = regnode(ANYOF);
  465. if (*regparse == ']' || *regparse == '-')
  466. regc(*regparse++);
  467. while (*regparse != '\0' && *regparse != ']') {
  468. if (*regparse == '-') {
  469. regparse++;
  470. if (*regparse == ']' || *regparse == '\0')
  471. regc('-');
  472. else {
  473. class = UCHARAT(regparse-2)+1;
  474. classend = UCHARAT(regparse);
  475. if (class > classend+1)
  476. FAIL("invalid [] range");
  477. for (; class <= classend; class++)
  478. regc(class);
  479. regparse++;
  480. }
  481. } else
  482. regc(*regparse++);
  483. }
  484. regc('\0');
  485. if (*regparse != ']')
  486. FAIL("unmatched []");
  487. regparse++;
  488. *flagp |= HASWIDTH|SIMPLE;
  489. }
  490. break;
  491. case '(':
  492. ret = reg(1, &flags);
  493. if (ret == NULL)
  494. return(NULL);
  495. *flagp |= flags&(HASWIDTH|SPSTART);
  496. break;
  497. case '\0':
  498. case '|':
  499. case ')':
  500. FAIL("internal urp"); /* Supposed to be caught earlier. */
  501. break;
  502. case '?':
  503. case '+':
  504. case '*':
  505. FAIL("?+* follows nothing");
  506. break;
  507. case '\\':
  508. if (*regparse == '\0')
  509. FAIL("trailing \\");
  510. ret = regnode(EXACTLY);
  511. regc(*regparse++);
  512. regc('\0');
  513. *flagp |= HASWIDTH|SIMPLE;
  514. break;
  515. default: {
  516. register int len;
  517. register char ender;
  518. regparse--;
  519. len = strcspn(regparse, META);
  520. if (len <= 0)
  521. FAIL("internal disaster");
  522. ender = *(regparse+len);
  523. if (len > 1 && ISMULT(ender))
  524. len--; /* Back off clear of ?+* operand. */
  525. *flagp |= HASWIDTH;
  526. if (len == 1)
  527. *flagp |= SIMPLE;
  528. ret = regnode(EXACTLY);
  529. while (len > 0) {
  530. regc(*regparse++);
  531. len--;
  532. }
  533. regc('\0');
  534. }
  535. break;
  536. }
  537. return(ret);
  538. }
  539. /*
  540. - regnode - emit a node
  541. */
  542. static char * /* Location. */
  543. regnode(op)
  544. char op;
  545. {
  546. register char *ret;
  547. register char *ptr;
  548. ret = regcode;
  549. if (ret == &regdummy) {
  550. regsize += 3;
  551. return(ret);
  552. }
  553. ptr = ret;
  554. *ptr++ = op;
  555. *ptr++ = '\0'; /* Null "next" pointer. */
  556. *ptr++ = '\0';
  557. regcode = ptr;
  558. return(ret);
  559. }
  560. /*
  561. - regc - emit (if appropriate) a byte of code
  562. */
  563. static void
  564. regc(b)
  565. char b;
  566. {
  567. if (regcode != &regdummy)
  568. *regcode++ = b;
  569. else
  570. regsize++;
  571. }
  572. /*
  573. - reginsert - insert an operator in front of already-emitted operand
  574. *
  575. * Means relocating the operand.
  576. */
  577. static void
  578. reginsert(op, opnd)
  579. char op;
  580. char *opnd;
  581. {
  582. register char *src;
  583. register char *dst;
  584. register char *place;
  585. if (regcode == &regdummy) {
  586. regsize += 3;
  587. return;
  588. }
  589. src = regcode;
  590. regcode += 3;
  591. dst = regcode;
  592. while (src > opnd)
  593. *--dst = *--src;
  594. place = opnd; /* Op node, where operand used to be. */
  595. *place++ = op;
  596. *place++ = '\0';
  597. *place++ = '\0';
  598. }
  599. /*
  600. - regtail - set the next-pointer at the end of a node chain
  601. */
  602. static void
  603. regtail(p, val)
  604. char *p;
  605. char *val;
  606. {
  607. register char *scan;
  608. register char *temp;
  609. register int offset;
  610. if (p == &regdummy)
  611. return;
  612. /* Find last node. */
  613. scan = p;
  614. for (;;) {
  615. temp = regnext(scan);
  616. if (temp == NULL)
  617. break;
  618. scan = temp;
  619. }
  620. if (OP(scan) == BACK)
  621. offset = (int)(scan - val);
  622. else
  623. offset = (int)(val - scan);
  624. *(scan+1) = (char)((offset>>8)&0377);
  625. *(scan+2) = (char)(offset&0377);
  626. }
  627. /*
  628. - regoptail - regtail on operand of first argument; nop if operandless
  629. */
  630. static void
  631. regoptail(p, val)
  632. char *p;
  633. char *val;
  634. {
  635. /* "Operandless" and "op != BRANCH" are synonymous in practice. */
  636. if (p == NULL || p == &regdummy || OP(p) != BRANCH)
  637. return;
  638. regtail(OPERAND(p), val);
  639. }
  640. /*
  641. * regexec and friends
  642. */
  643. /*
  644. * Global work variables for regexec().
  645. */
  646. static char *reginput; /* String-input pointer. */
  647. static char *regbol; /* Beginning of input, for ^ check. */
  648. static char **regstartp; /* Pointer to startp array. */
  649. static char **regendp; /* Ditto for endp. */
  650. /*
  651. * Forwards.
  652. */
  653. STATIC int regtry();
  654. STATIC int regmatch();
  655. STATIC int regrepeat();
  656. #ifdef DEBUG
  657. int regnarrate = 0;
  658. void regdump();
  659. STATIC char *regprop();
  660. #endif
  661. /*
  662. - regexec - match a regexp against a string
  663. */
  664. int
  665. regexec(regexp *prog, char *string, int at_bol)
  666. {
  667. register char *s;
  668. /* Be paranoid... */
  669. if (prog == NULL || string == NULL) {
  670. regerror("NULL parameter");
  671. return(0);
  672. }
  673. /* Check validity of program. */
  674. if (UCHARAT(prog->program) != MAGIC) {
  675. regerror("corrupted program");
  676. return(0);
  677. }
  678. /* If there is a "must appear" string, look for it. */
  679. if (prog->regmust != NULL) {
  680. s = string;
  681. while ((s = cstrchr(s, prog->regmust[0])) != NULL) {
  682. if (cstrncmp(s, prog->regmust, prog->regmlen) == 0)
  683. break; /* Found it. */
  684. s++;
  685. }
  686. if (s == NULL) /* Not present. */
  687. return(0);
  688. }
  689. /* Mark beginning of line for ^ . */
  690. if (at_bol)
  691. regbol = string; /* is possible to match bol */
  692. else
  693. regbol = NULL; /* we aren't there, so don't match it */
  694. /* Simplest case: anchored match need be tried only once. */
  695. if (prog->reganch)
  696. return(regtry(prog, string));
  697. /* Messy cases: unanchored match. */
  698. s = string;
  699. if (prog->regstart != '\0')
  700. /* We know what char it must start with. */
  701. while ((s = cstrchr(s, prog->regstart)) != NULL) {
  702. if (regtry(prog, s))
  703. return(1);
  704. s++;
  705. }
  706. else
  707. /* We don't -- general case. */
  708. do {
  709. if (regtry(prog, s))
  710. return(1);
  711. } while (*s++ != '\0');
  712. /* Failure. */
  713. return(0);
  714. }
  715. /*
  716. - regtry - try match at specific point
  717. */
  718. static int /* 0 failure, 1 success */
  719. regtry(prog, string)
  720. regexp *prog;
  721. char *string;
  722. {
  723. register int i;
  724. register char **sp;
  725. register char **ep;
  726. reginput = string;
  727. regstartp = prog->startp;
  728. regendp = prog->endp;
  729. sp = prog->startp;
  730. ep = prog->endp;
  731. for (i = NSUBEXP; i > 0; i--) {
  732. *sp++ = NULL;
  733. *ep++ = NULL;
  734. }
  735. if (regmatch(prog->program + 1)) {
  736. prog->startp[0] = string;
  737. prog->endp[0] = reginput;
  738. return(1);
  739. } else
  740. return(0);
  741. }
  742. /*
  743. - regmatch - main matching routine
  744. *
  745. * Conceptually the strategy is simple: check to see whether the current
  746. * node matches, call self recursively to see whether the rest matches,
  747. * and then act accordingly. In practice we make some effort to avoid
  748. * recursion, in particular by going through "ordinary" nodes (that don't
  749. * need to know whether the rest of the match failed) by a loop instead of
  750. * by recursion.
  751. */
  752. static int /* 0 failure, 1 success */
  753. regmatch(prog)
  754. char *prog;
  755. {
  756. register char *scan; /* Current node. */
  757. char *next; /* Next node. */
  758. scan = prog;
  759. #ifdef DEBUG
  760. if (scan != NULL && regnarrate)
  761. fprintf(stderr, "%s(\n", regprop(scan));
  762. #endif
  763. while (scan != NULL) {
  764. #ifdef DEBUG
  765. if (regnarrate)
  766. fprintf(stderr, "%s...\n", regprop(scan));
  767. #endif
  768. next = regnext(scan);
  769. switch (OP(scan)) {
  770. case BOL:
  771. if (reginput != regbol)
  772. return(0);
  773. break;
  774. case EOL:
  775. if (*reginput != '\0')
  776. return(0);
  777. break;
  778. case ANY:
  779. if (*reginput == '\0')
  780. return(0);
  781. reginput++;
  782. break;
  783. case EXACTLY: {
  784. register int len;
  785. register char *opnd;
  786. opnd = OPERAND(scan);
  787. /* Inline the first character, for speed. */
  788. if (mkup(*opnd) != mkup(*reginput))
  789. return(0);
  790. len = strlen(opnd);
  791. if (len > 1 && cstrncmp(opnd,reginput,len) != 0)
  792. return(0);
  793. reginput += len;
  794. }
  795. break;
  796. case ANYOF:
  797. if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) == NULL)
  798. return(0);
  799. reginput++;
  800. break;
  801. case ANYBUT:
  802. if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) != NULL)
  803. return(0);
  804. reginput++;
  805. break;
  806. case NOTHING:
  807. break;
  808. case BACK:
  809. break;
  810. case OPEN+1:
  811. case OPEN+2:
  812. case OPEN+3:
  813. case OPEN+4:
  814. case OPEN+5:
  815. case OPEN+6:
  816. case OPEN+7:
  817. case OPEN+8:
  818. case OPEN+9: {
  819. register int no;
  820. register char *save;
  821. no = OP(scan) - OPEN;
  822. save = reginput;
  823. if (regmatch(next)) {
  824. /*
  825. * Don't set startp if some later
  826. * invocation of the same parentheses
  827. * already has.
  828. */
  829. if (regstartp[no] == NULL)
  830. regstartp[no] = save;
  831. return(1);
  832. } else
  833. return(0);
  834. }
  835. break;
  836. case CLOSE+1:
  837. case CLOSE+2:
  838. case CLOSE+3:
  839. case CLOSE+4:
  840. case CLOSE+5:
  841. case CLOSE+6:
  842. case CLOSE+7:
  843. case CLOSE+8:
  844. case CLOSE+9: {
  845. register int no;
  846. register char *save;
  847. no = OP(scan) - CLOSE;
  848. save = reginput;
  849. if (regmatch(next)) {
  850. /*
  851. * Don't set endp if some later
  852. * invocation of the same parentheses
  853. * already has.
  854. */
  855. if (regendp[no] == NULL)
  856. regendp[no] = save;
  857. return(1);
  858. } else
  859. return(0);
  860. }
  861. break;
  862. case BRANCH: {
  863. register char *save;
  864. if (OP(next) != BRANCH) /* No choice. */
  865. next = OPERAND(scan); /* Avoid recursion. */
  866. else {
  867. do {
  868. save = reginput;
  869. if (regmatch(OPERAND(scan)))
  870. return(1);
  871. reginput = save;
  872. scan = regnext(scan);
  873. } while (scan != NULL && OP(scan) == BRANCH);
  874. return(0);
  875. /* NOTREACHED */
  876. }
  877. }
  878. break;
  879. case STAR:
  880. case PLUS: {
  881. register char nextch;
  882. register int no;
  883. register char *save;
  884. register int min;
  885. /*
  886. * Lookahead to avoid useless match attempts
  887. * when we know what character comes next.
  888. */
  889. nextch = '\0';
  890. if (OP(next) == EXACTLY)
  891. nextch = *OPERAND(next);
  892. min = (OP(scan) == STAR) ? 0 : 1;
  893. save = reginput;
  894. no = regrepeat(OPERAND(scan));
  895. while (no >= min) {
  896. /* If it could work, try it. */
  897. if (nextch == '\0' || *reginput == nextch)
  898. if (regmatch(next))
  899. return(1);
  900. /* Couldn't or didn't -- back up. */
  901. no--;
  902. reginput = save + no;
  903. }
  904. return(0);
  905. }
  906. break;
  907. case END:
  908. return(1); /* Success! */
  909. break;
  910. default:
  911. regerror("memory corruption");
  912. return(0);
  913. break;
  914. }
  915. scan = next;
  916. }
  917. /*
  918. * We get here only if there's trouble -- normally "case END" is
  919. * the terminating point.
  920. */
  921. regerror("corrupted pointers");
  922. return(0);
  923. }
  924. /*
  925. - regrepeat - repeatedly match something simple, report how many
  926. */
  927. static int
  928. regrepeat(p)
  929. char *p;
  930. {
  931. register int count = 0;
  932. register char *scan;
  933. register char *opnd;
  934. scan = reginput;
  935. opnd = OPERAND(p);
  936. switch (OP(p)) {
  937. case ANY:
  938. count = strlen(scan);
  939. scan += count;
  940. break;
  941. case EXACTLY:
  942. while (mkup(*opnd) == mkup(*scan)) {
  943. count++;
  944. scan++;
  945. }
  946. break;
  947. case ANYOF:
  948. while (*scan != '\0' && strchr(opnd, *scan) != NULL) {
  949. count++;
  950. scan++;
  951. }
  952. break;
  953. case ANYBUT:
  954. while (*scan != '\0' && strchr(opnd, *scan) == NULL) {
  955. count++;
  956. scan++;
  957. }
  958. break;
  959. default: /* Oh dear. Called inappropriately. */
  960. regerror("internal foulup");
  961. count = 0; /* Best compromise. */
  962. break;
  963. }
  964. reginput = scan;
  965. return(count);
  966. }
  967. /*
  968. - regnext - dig the "next" pointer out of a node
  969. */
  970. static char *
  971. regnext(p)
  972. register char *p;
  973. {
  974. register int offset;
  975. if (p == &regdummy)
  976. return(NULL);
  977. offset = NEXT(p);
  978. if (offset == 0)
  979. return(NULL);
  980. if (OP(p) == BACK)
  981. return(p-offset);
  982. else
  983. return(p+offset);
  984. }
  985. #ifdef DEBUG
  986. STATIC char *regprop();
  987. /*
  988. - regdump - dump a regexp onto stdout in vaguely comprehensible form
  989. */
  990. void
  991. regdump(r)
  992. regexp *r;
  993. {
  994. register char *s;
  995. register char op = EXACTLY; /* Arbitrary non-END op. */
  996. register char *next;
  997. s = r->program + 1;
  998. while (op != END) { /* While that wasn't END last time... */
  999. op = OP(s);
  1000. printf("%2d%s", s-r->program, regprop(s)); /* Where, what. */
  1001. next = regnext(s);
  1002. if (next == NULL) /* Next ptr. */
  1003. printf("(0)");
  1004. else
  1005. printf("(%d)", (s-r->program)+(next-s));
  1006. s += 3;
  1007. if (op == ANYOF || op == ANYBUT || op == EXACTLY) {
  1008. /* Literal string, where present. */
  1009. while (*s != '\0') {
  1010. putchar(*s);
  1011. s++;
  1012. }
  1013. s++;
  1014. }
  1015. putchar('\n');
  1016. }
  1017. /* Header fields of interest. */
  1018. if (r->regstart != '\0')
  1019. printf("start `%c' ", r->regstart);
  1020. if (r->reganch)
  1021. printf("anchored ");
  1022. if (r->regmust != NULL)
  1023. printf("must have \"%s\"", r->regmust);
  1024. printf("\n");
  1025. }
  1026. /*
  1027. - regprop - printable representation of opcode
  1028. */
  1029. static char *
  1030. regprop(op)
  1031. char *op;
  1032. {
  1033. register char *p;
  1034. static char buf[50];
  1035. (void) strcpy(buf, ":");
  1036. switch (OP(op)) {
  1037. case BOL:
  1038. p = "BOL";
  1039. break;
  1040. case EOL:
  1041. p = "EOL";
  1042. break;
  1043. case ANY:
  1044. p = "ANY";
  1045. break;
  1046. case ANYOF:
  1047. p = "ANYOF";
  1048. break;
  1049. case ANYBUT:
  1050. p = "ANYBUT";
  1051. break;
  1052. case BRANCH:
  1053. p = "BRANCH";
  1054. break;
  1055. case EXACTLY:
  1056. p = "EXACTLY";
  1057. break;
  1058. case NOTHING:
  1059. p = "NOTHING";
  1060. break;
  1061. case BACK:
  1062. p = "BACK";
  1063. break;
  1064. case END:
  1065. p = "END";
  1066. break;
  1067. case OPEN+1:
  1068. case OPEN+2:
  1069. case OPEN+3:
  1070. case OPEN+4:
  1071. case OPEN+5:
  1072. case OPEN+6:
  1073. case OPEN+7:
  1074. case OPEN+8:
  1075. case OPEN+9:
  1076. sprintf(buf+strlen(buf), "OPEN%d", OP(op)-OPEN);
  1077. p = NULL;
  1078. break;
  1079. case CLOSE+1:
  1080. case CLOSE+2:
  1081. case CLOSE+3:
  1082. case CLOSE+4:
  1083. case CLOSE+5:
  1084. case CLOSE+6:
  1085. case CLOSE+7:
  1086. case CLOSE+8:
  1087. case CLOSE+9:
  1088. sprintf(buf+strlen(buf), "CLOSE%d", OP(op)-CLOSE);
  1089. p = NULL;
  1090. break;
  1091. case STAR:
  1092. p = "STAR";
  1093. break;
  1094. case PLUS:
  1095. p = "PLUS";
  1096. break;
  1097. default:
  1098. regerror("corrupted opcode");
  1099. break;
  1100. }
  1101. if (p != NULL)
  1102. (void) strcat(buf, p);
  1103. return(buf);
  1104. }
  1105. #endif
  1106. /*
  1107. * The following is provided for those people who do not have strcspn() in
  1108. * their C libraries. They should get off their butts and do something
  1109. * about it; at least one public-domain implementation of those (highly
  1110. * useful) string routines has been published on Usenet.
  1111. */
  1112. #ifdef STRCSPN
  /*
   * strcspn - find length of initial segment of s1 consisting entirely
   * of characters not from s2
   *
   * Returns the index of the first character of s1 that also occurs in
   * s2, or the length of s1 if there is none.
   */
  static int
  strcspn(s1, s2)
  char *s1;
  char *s2;
  {
      register int count = 0;
      register char *reject;

      while (s1[count] != '\0') {
          for (reject = s2; *reject != '\0'; reject++) {
              if (s1[count] == *reject)
                  return(count);
          }
          count++;
      }
      return(count);
  }
  1134. #endif
  1135. int
  1136. cstrncmp(s1, s2, n)
  1137. char *s1, *s2;
  1138. int n;
  1139. {
  1140. char *p, *S1, *S2, *strsave();
  1141. int rval;
  1142. if (!reg_ic)
  1143. return (strncmp(s1, s2, n));
  1144. S1 = strsave(s1);
  1145. S2 = strsave(s2);
  1146. for (p = S1; *p ;p++)
  1147. if (islower(*p))
  1148. *p = (char)toupper(*p);
  1149. for (p = S2; *p ;p++)
  1150. if (islower(*p))
  1151. *p = (char)toupper(*p);
  1152. rval = strncmp(S1, S2, n);
  1153. free(S1);
  1154. free(S2);
  1155. return rval;
  1156. }
  /*
   - cstrchr - strchr()-like search that honours reg_ic through mkup()
   *
   * Returns a pointer to the first character of s that equals c after
   * mkup() has been applied to both, or NULL if there is none.  Unlike
   * strchr(), the terminating NUL is never matched.
   */
  char *
  cstrchr(
      char *s,
      char c)
  {
      char *p = s;

      while (*p != '\0') {
          if (mkup(*p) == mkup(c))
              return p;
          p++;
      }
      return NULL;
  }