You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
1237 lines
32 KiB
1237 lines
32 KiB
/* reparse.c - parse a regular expression
|
|
*
|
|
* cl /c /Zep /AM /NT RE /Gs /G2 /Oa /D LINT_ARGS /Fc reparse.c
|
|
*
|
|
* Modifications:
|
|
*
|
|
* 22-Jul-1986 mz Hookable allocator (allow Z to create enough free space)
|
|
* 19-Nov-1986 mz Add RETranslateLength for Z to determine overflows
|
|
* 18-Aug-1987 mz Add field width and justification in translations
|
|
* 01-Mar-1988 mz Add in UNIX-like syntax
|
|
* 14-Jun-1988 mz Fix file parts allowing backslashes
|
|
* 04-Dec-1989 bp Let :p accept uppercase drive names
|
|
* 20-Dec-1989 ln capture trailing periods in :p
|
|
* 23-Jan-1990 ln Handle escaped characters & invalid trailing \ in
|
|
* RETranslate.
|
|
*
|
|
* 28-Jul-1990 davegi Changed Fill to memset (OS/2 2.0)
|
|
* Changed Move to memmove (OS/2 2.0)
|
|
* 19-Oct-1990 w-barry changed cArg to unsigned int from int.
|
|
*/
|
|
#include <ctype.h>
|
|
|
|
#include <stdio.h>
|
|
#include <malloc.h>
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
#include <windows.h>
|
|
#include <tools.h>
|
|
#include <remi.h>
|
|
|
|
#include "re.h"
|
|
|
|
/* DEBOUT - debug trace output.
 *
 * Takes a doubly-parenthesised printf argument list, e.g.
 *      DEBOUT (("value %d\n", v));
 * When DEBUG is non-zero the text is printed and stdout is flushed;
 * otherwise the macro expands to a no-op expression.  Either way the
 * expansion is a single statement, so the macro stays correct when it
 * is the unbraced body of an if/else.
 */
#if DEBUG
#define DEBOUT(x)   do { printf x; fflush (stdout); } while (0)
#else
#define DEBOUT(x)   ((void) 0)
#endif
|
|
|
|
|
|
/* regular expression compiler. A regular expression is compiled into pseudo-
|
|
* machine code. The principle is portable to other machines and is outlined
|
|
* below. We parse by recursive descent.
|
|
*
|
|
* The pseudo-code is fairly close to normal assembler and can be easily
|
|
* converted to be real machine code and has been done for the 80*86
|
|
* processor family.
|
|
*
|
|
* The basic regular expressions handled are:
|
|
*
|
|
* letter matches a single letter
|
|
* [class] matches a single character in the class
|
|
* [~class] matches a single character not in the class
|
|
* ^ matches the beginning of the line
|
|
* $ matches the end of the line
|
|
* ? matches any character (except previous two)
|
|
* \x literal x
|
|
* \n matches the previously tagged/matched expression (n digit)
|
|
*
|
|
* Regular expressions are now built from the above via:
|
|
*
|
|
* x* matches 0 or more x, matching minimal number
|
|
* x+ matches 1 or more x, matching minimal number
|
|
* x@ matches 0 or more x, matching maximal number
|
|
* x# matches 1 or more x, matching maximal number
|
|
* (x1!x2!...) matches x1 or x2 or ...
|
|
* ~x matches 0 characters but prevents x from occurring
|
|
* {x} identifies an argument
|
|
*
|
|
* The final expression that is matched by the compiler is:
|
|
*
|
|
* xy matches x then y
|
|
*
|
|
*
|
|
* The actual grammar used is: Parsing action:
|
|
*
|
|
* TOP -> re PROLOG .re. EPILOG
|
|
*
|
|
*
|
|
* re -> { re } re | LEFTARG .re. RIGHTARG
|
|
* e re |
|
|
* empty
|
|
*
|
|
* e -> se * | SMSTAR .se. SMSTAR1
|
|
* se + |
|
|
* se @ | STAR .se. STAR1
|
|
* se # |
|
|
* se
|
|
*
|
|
* se -> ( alt ) |
|
|
* [ ccl ] |
|
|
* ? | ANY
|
|
* ^ | BOL
|
|
* $ | EOL
|
|
* ~ se | NOTSIGN .se. NOTSIGN1
|
|
* :x |
|
|
* \n | PREV
|
|
* letter LETTER x
|
|
*
|
|
* alt -> re ! alt | LEFTOR .re. ORSIGN
|
|
* re LEFTOR .re. ORSIGN RIGHTOR
|
|
*
|
|
* ccl -> ~ cset | CCLBEG NOTSIGN .cset. CCLEND
|
|
* cset CCLBEG NULL .cset. CCLEND
|
|
*
|
|
* cset -> item cset |
|
|
* item
|
|
*
|
|
* item -> letter - letter | RANGE x y
|
|
* letter RANGE x x
|
|
*
|
|
* Abbreviations are introduced by :.
|
|
*
|
|
* :a [a-zA-Z0-9] alphanumeric
|
|
* :b ([<space><tab>]#) whitespace
|
|
* :c [a-zA-Z] alphabetic
|
|
* :d [0-9] digit
|
|
* :f ([~/\\ \"\[\]\:<|>+=;,.]#!..!.) file part
|
|
* :h ([0-9a-fA-F]#) hex number
|
|
* :i ([a-zA-Z_$][a-zA-Z0-9_$]@) identifier
|
|
* :n ([0-9]#.[0-9]@![0-9]@.[0-9]#![0-9]#) number
|
|
* :p (([A-Za-z]\:!)(\\!/!)(:f(.:f!)(\\!/))@:f(.:f!.!)) path
|
|
* :q ("[~"]@"!'[~']@') quoted string
|
|
* :w ([a-zA-Z]#) word
|
|
* :z ([0-9]#) integer
|
|
*
|
|
*/
|
|
|
|
extern char XLTab[256]; /* lower-casing table */
|
|
|
|
/* There are several classes of characters:
|
|
*
|
|
* Closure characters are suffixes that indicate repetition of the previous
|
|
* RE.
|
|
*
|
|
* Simple RE chars are characters that indicate a particular type of match
|
|
*
|
|
*/
|
|
|
|
/* Closure character equates.  REClosureChar returns one of these values
 * and REParseE dispatches on the result.
 */
#define CCH_SMPLUS      0       /* plus closure */
#define CCH_SMCLOSURE   1       /* star closure */
#define CCH_POWER       2       /* n repetitions of previous pattern */
#define CCH_CLOSURE     3       /* greedy closure */
#define CCH_PLUS        4       /* greedy plus */
#define CCH_NONE        5       /* no closure character follows */
#define CCH_ERROR       (-1)    /* the simple RE itself failed to parse */
|
|
|
|
/* Simple RE character equates.  RECharType classifies the character(s)
 * at the parse position as one of these types.
 */
#define SR_BOL          0       /* beginning of line */
#define SR_EOL          1       /* end of line */
#define SR_ANY          2       /* any single character */
#define SR_CCLBEG       3       /* start of a character class */
#define SR_LEFTOR       4       /* start of an alternation group */
#define SR_CCLEND       5       /* end of a character class */
#define SR_ABBREV       6       /* abbreviation lead-in */
#define SR_RIGHTOR      7       /* end of an alternation group */
#define SR_ORSIGN       8       /* separator between alternatives */
#define SR_NOTSIGN      9       /* guard-against (not) prefix */
#define SR_LEFTARG      10      /* start of a tagged argument */
#define SR_RIGHTARG     11      /* end of a tagged argument */
#define SR_LETTER       12      /* ordinary character */
#define SR_PREV         13      /* reference to a previously tagged match */
|
|
|
|
int EndAltRE[] = { SR_ORSIGN, SR_RIGHTOR, -1};
|
|
int EndArg[] = { SR_RIGHTARG, -1};
|
|
|
|
/* Abbreviation table, closed by NULL.  The first character of each entry
 * is the abbreviation letter; the remainder is its expansion, written in
 * Z syntax (REParseAbbrev parses it with Z syntax forced on).
 */
char *pAbbrev[] = {
    "a[a-zA-Z0-9]",                                             /* alphanumeric  */
    "b([ \t]#)",                                                /* whitespace    */
    "c[a-zA-Z]",                                                /* alphabetic    */
    "d[0-9]",                                                   /* digit         */
    "f([~/\\\\ \\\"\\[\\]\\:<|>+=;,.]#!..!.)",                  /* file part     */
    "h([0-9a-fA-F]#)",                                          /* hex number    */
    "i([a-zA-Z_$][a-zA-Z0-9_$]@)",                              /* identifier    */
    "n([0-9]#.[0-9]@![0-9]@.[0-9]#![0-9]#)",                    /* number        */
    "p(([A-Za-z]\\:!)(\\\\!/!)(:f(.:f!)(\\\\!/))@:f(.:f!.!))",  /* path          */
    "q(\"[~\"]@\"!'[~']@')",                                    /* quoted string */
    "w([a-zA-Z]#)",                                             /* word          */
    "z([0-9]#)",                                                /* integer       */
    NULL
};
|
|
|
|
static char *digits = "0123456789";
|
|
|
|
static flagType fZSyntax = TRUE; /* TRUE => use Z syntax for things */
|
|
|
|
static unsigned int cArg;
|
|
|
|
/* RECharType - classify a character type
|
|
*
|
|
* p character pointer
|
|
*
|
|
* returns type of character (SR_xx)
|
|
*/
|
|
int
|
|
RECharType (
|
|
char *p
|
|
)
|
|
{
|
|
if (fZSyntax)
|
|
/* Zibo syntax
|
|
*/
|
|
switch (*p) {
|
|
case '^':
|
|
return SR_BOL;
|
|
case '$':
|
|
if (isdigit (p[1]))
|
|
return SR_PREV;
|
|
else
|
|
return SR_EOL;
|
|
case '?':
|
|
return SR_ANY;
|
|
case '[':
|
|
return SR_CCLBEG;
|
|
case '(':
|
|
return SR_LEFTOR;
|
|
case ']':
|
|
return SR_CCLEND;
|
|
case ':':
|
|
return SR_ABBREV;
|
|
case ')':
|
|
return SR_RIGHTOR;
|
|
case '!':
|
|
return SR_ORSIGN;
|
|
case '~':
|
|
return SR_NOTSIGN;
|
|
case '{':
|
|
return SR_LEFTARG;
|
|
case '}':
|
|
return SR_RIGHTARG;
|
|
default:
|
|
return SR_LETTER;
|
|
} else
|
|
/* Crappy UNIX syntax
|
|
*/
|
|
switch (*p) {
|
|
case '^':
|
|
return SR_BOL;
|
|
case '$':
|
|
return SR_EOL;
|
|
case '.':
|
|
return SR_ANY;
|
|
case '[':
|
|
return SR_CCLBEG;
|
|
case ']':
|
|
return SR_CCLEND;
|
|
case '\\':
|
|
switch (p[1]) {
|
|
case ':': /* \:C */
|
|
return SR_ABBREV;
|
|
case '(': /* \( */
|
|
return SR_LEFTARG;
|
|
case ')': /* \) */
|
|
return SR_RIGHTARG;
|
|
case '~': /* \~ */
|
|
return SR_NOTSIGN;
|
|
case '{': /* \{ */
|
|
return SR_LEFTOR;
|
|
case '}': /* \} */
|
|
return SR_RIGHTOR;
|
|
case '!': /* \! */
|
|
return SR_ORSIGN;
|
|
}
|
|
if (isdigit (p[1])) /* \N */
|
|
return SR_PREV;
|
|
default:
|
|
return SR_LETTER;
|
|
}
|
|
}
|
|
|
|
/* RECharLen - length of character type
|
|
*
|
|
* p character pointer to type
|
|
*
|
|
* returns length in chars of type
|
|
*/
|
|
int
|
|
RECharLen (
|
|
char *p
|
|
)
|
|
{
|
|
if (fZSyntax)
|
|
if (RECharType (p) == SR_PREV) /* $N */
|
|
return 2;
|
|
else
|
|
if (RECharType (p) == SR_ABBREV) /* :N */
|
|
return 2;
|
|
else
|
|
return 1;
|
|
else {
|
|
if (*p == '\\')
|
|
switch (p[1]) {
|
|
case '{':
|
|
case '}':
|
|
case '~':
|
|
case '(':
|
|
case ')':
|
|
case '!':
|
|
return 2; /* \C */
|
|
case ':': /* \:C */
|
|
return 3;
|
|
default:
|
|
if (isdigit (p[1]))
|
|
return 2; /* \N */
|
|
else
|
|
return 1;
|
|
}
|
|
return 1;
|
|
}
|
|
}
|
|
|
|
/* REClosureLen - length of a closure operator
 *
 * Every closure operator is one character long, so the argument is not
 * inspected.
 *
 * p            character pointer to the closure operator (unused)
 *
 * returns      length in chars of the operator, always 1
 */
int
REClosureLen (
    char *p
    )
{
    (void) p;

    return 1;
}
|
|
|
|
/* REParseRE - parse a general RE up to but not including the pEnd set
|
|
* of chars. Apply a particular action to each node in the parse tree.
|
|
*
|
|
* pAction Parse action routine to call at particluar points in the
|
|
* parse tree. This routine returns an unsigned quantity that
|
|
* is expected to be passed on to other action calls within the
|
|
* same node.
|
|
* p character pointer to string being parsed
|
|
* pEnd pointer to set of char types that end the current RE.
|
|
* External callers will typically use NULL for this value.
|
|
* Internally, however, we need to break on the ALT-terminating
|
|
* types or on arg-terminating types.
|
|
*
|
|
* Returns: pointer to delimited character if successful parse
|
|
* NULL if unsuccessful parse (syntax error).
|
|
*
|
|
*/
|
|
char *
|
|
REParseRE (
|
|
PACT pAction,
|
|
register char *p,
|
|
int *pEnd
|
|
)
|
|
{
|
|
int *pe;
|
|
UINT_PTR u;
|
|
|
|
DEBOUT (("REParseRE (%04x, %s)\n", pAction, p));
|
|
|
|
while (TRUE) {
|
|
/* If we're at end of input
|
|
*/
|
|
if (*p == '\0')
|
|
/* If we're not in the midst of an open expression
|
|
*/
|
|
if (pEnd == NULL)
|
|
/* return the current parse position
|
|
*/
|
|
return p;
|
|
else {
|
|
/* End of input, but expecting more, ERROR
|
|
*/
|
|
DEBOUT (("REParse expecting more, ERROR\n"));
|
|
return NULL;
|
|
}
|
|
|
|
/* If there is an open expression
|
|
*/
|
|
if (pEnd != NULL)
|
|
/* Find a matching character
|
|
*/
|
|
for (pe = pEnd; *pe != -1; pe++)
|
|
if (RECharType (p) == *pe)
|
|
return p;
|
|
|
|
/* If we are looking at a left argument
|
|
*/
|
|
if (RECharType (p) == SR_LEFTARG) {
|
|
/* Parse LEFTARG .re. RIGHTARG
|
|
*/
|
|
u = (*pAction) (LEFTARG, 0, '\0', '\0');
|
|
if ((p = REParseRE (pAction, p + RECharLen (p), EndArg)) == NULL)
|
|
return NULL;
|
|
(*pAction) (RIGHTARG, u, '\0', '\0');
|
|
cArg++;
|
|
p += RECharLen (p);
|
|
} else
|
|
/* Parse .e.
|
|
*/
|
|
if ((p = REParseE (pAction, p)) == NULL)
|
|
return NULL;
|
|
}
|
|
}
|
|
|
|
/* REParseE - parse a simple regular expression with potential closures.
|
|
*
|
|
* pAction Action to apply at special parse nodes
|
|
* p character pointer to spot where parsing occurs
|
|
*
|
|
* Returns pointer past parsed text if successful
|
|
* NULL otherwise (syntax error)
|
|
*/
|
|
char *
|
|
REParseE (
|
|
PACT pAction,
|
|
register char *p
|
|
)
|
|
{
|
|
DEBOUT (("REParseE (%04x, %s)\n", pAction, p));
|
|
|
|
switch (REClosureChar (p)) {
|
|
case CCH_SMPLUS:
|
|
if (REParseSE (pAction, p) == NULL)
|
|
return NULL;
|
|
case CCH_SMCLOSURE:
|
|
return REParseClosure (pAction, p);
|
|
|
|
case CCH_PLUS:
|
|
if (REParseSE (pAction, p) == NULL)
|
|
return NULL;
|
|
case CCH_CLOSURE:
|
|
return REParseGreedy (pAction, p);
|
|
|
|
case CCH_POWER:
|
|
return REParsePower (pAction, p);
|
|
|
|
case CCH_NONE:
|
|
return REParseSE (pAction, p);
|
|
|
|
default:
|
|
return NULL;
|
|
}
|
|
}
|
|
|
|
/* REParseSE - parse a simple regular expression
|
|
*
|
|
* pAction Action to apply at special parse nodes
|
|
* p character pointer to spot where parsing occurs
|
|
*
|
|
* Returns pointer past parsed text if successful
|
|
* NULL otherwise (syntax error)
|
|
*/
|
|
char *
|
|
REParseSE (
|
|
register PACT pAction,
|
|
register char *p
|
|
)
|
|
{
|
|
DEBOUT (("REParseSE (%04x, %s)\n", pAction, p));
|
|
|
|
switch (RECharType (p)) {
|
|
case SR_CCLBEG:
|
|
return REParseClass (pAction, p);
|
|
case SR_ANY:
|
|
return REParseAny (pAction, p);
|
|
case SR_BOL:
|
|
return REParseBOL (pAction, p);
|
|
case SR_EOL:
|
|
return REParseEOL (pAction, p);
|
|
case SR_PREV:
|
|
return REParsePrev (pAction, p);
|
|
case SR_LEFTOR:
|
|
return REParseAlt (pAction, p);
|
|
case SR_NOTSIGN:
|
|
return REParseNot (pAction, p);
|
|
case SR_ABBREV:
|
|
return REParseAbbrev (pAction, p);
|
|
default:
|
|
return REParseChar (pAction, p);
|
|
}
|
|
}
|
|
|
|
/* REParseClass - parse a class membership match
|
|
*
|
|
* pAction Action to apply at beginning of parse and at each range
|
|
* p character pointer to spot where parsing occurs
|
|
*
|
|
* Returns pointer past parsed text if successful
|
|
* NULL otherwise (syntax error)
|
|
*/
|
|
char *
|
|
REParseClass (
|
|
PACT pAction,
|
|
register char *p
|
|
)
|
|
{
|
|
char c;
|
|
UINT_PTR u;
|
|
|
|
DEBOUT (("REParseClass (%04x, %s)\n", pAction, p));
|
|
|
|
p += RECharLen (p);
|
|
if ((fZSyntax && *p == '~') || (!fZSyntax && *p == '^')) {
|
|
u = (*pAction) (CCLNOT, 0, '\0', '\0');
|
|
p += RECharLen (p);
|
|
} else
|
|
u = (*pAction) (CCLBEG, 0, '\0', '\0');
|
|
|
|
while (RECharType (p) != SR_CCLEND) {
|
|
if (*p == '\\')
|
|
p++;
|
|
if (*p == '\0') {
|
|
DEBOUT (("REParseClass expecting more, ERROR\n"));
|
|
return NULL;
|
|
}
|
|
c = *p++;
|
|
if (*p == '-') {
|
|
p++;
|
|
if (*p == '\\')
|
|
p++;
|
|
if (*p == '\0') {
|
|
DEBOUT (("REParseClass expecting more, ERROR\n"));
|
|
return NULL;
|
|
}
|
|
(*pAction) (RANGE, u, c, *p);
|
|
p++;
|
|
} else
|
|
(*pAction) (RANGE, u, c, c);
|
|
}
|
|
return p + RECharLen (p);
|
|
}
|
|
|
|
/* REParseAny - parse a match-any-character expression
|
|
*
|
|
* pAction Action to apply
|
|
* p character pointer to spot where parsing occurs
|
|
*
|
|
* Returns pointer past parsed text if successful
|
|
* NULL otherwise (syntax error)
|
|
*/
|
|
char *
|
|
REParseAny (
|
|
PACT pAction,
|
|
char *p
|
|
)
|
|
{
|
|
DEBOUT (("REParseAny (%04x, %s)\n", pAction, p));
|
|
|
|
(*pAction) (ANY, 0, '\0', '\0');
|
|
return p + RECharLen (p);
|
|
}
|
|
|
|
/* REParseBOL - parse a beginning-of-line match
|
|
*
|
|
* pAction Action to apply
|
|
* p character pointer to spot where parsing occurs
|
|
*
|
|
* Returns pointer past parsed text if successful
|
|
* NULL otherwise (syntax error)
|
|
*/
|
|
char *
|
|
REParseBOL (
|
|
PACT pAction,
|
|
char *p
|
|
)
|
|
{
|
|
DEBOUT (("REParseBOL (%04x, %s)\n", pAction, p));
|
|
|
|
(*pAction) (BOL, 0, '\0', '\0');
|
|
return p + RECharLen (p);
|
|
}
|
|
|
|
/* REParsePrev - parse a previous-match item
|
|
*
|
|
* pAction Action to apply
|
|
* p character pointer to spot where parsing occurs
|
|
*
|
|
* Returns pointer past parsed text if successful
|
|
* NULL otherwise (syntax error)
|
|
*/
|
|
char *
|
|
REParsePrev (
|
|
PACT pAction,
|
|
char *p
|
|
)
|
|
{
|
|
unsigned int i = *(p + 1) - '0';
|
|
|
|
DEBOUT (("REParsePrev (%04x, %s)\n", pAction, p));
|
|
|
|
if (i < 1 || i > cArg) {
|
|
DEBOUT (("REParsePrev invalid previous number, ERROR\n"));
|
|
return NULL;
|
|
}
|
|
|
|
(*pAction) (PREV, i, '\0', '\0');
|
|
return p + RECharLen (p);
|
|
}
|
|
|
|
/* REParseEOL - parse an end-of-line match
|
|
*
|
|
* pAction Action to apply
|
|
* p character pointer to spot where parsing occurs
|
|
*
|
|
* Returns pointer past parsed text if successful
|
|
* NULL otherwise (syntax error)
|
|
*/
|
|
char *
|
|
REParseEOL (
|
|
PACT pAction,
|
|
char *p
|
|
)
|
|
{
|
|
DEBOUT (("REParseEOL (%04x, %s)\n", pAction, p));
|
|
|
|
(*pAction) (EOL, 0, '\0', '\0');
|
|
return p + RECharLen (p);
|
|
}
|
|
|
|
/* REParseAlt - parse a series of alternatives
|
|
*
|
|
* pAction Action to apply before and after each alternative
|
|
* p character pointer to spot where parsing occurs
|
|
*
|
|
* Returns pointer past parsed text if successful
|
|
* NULL otherwise (syntax error)
|
|
*/
|
|
char *
|
|
REParseAlt (
|
|
PACT pAction,
|
|
register char *p
|
|
)
|
|
{
|
|
UINT_PTR u = 0;
|
|
|
|
DEBOUT (("REParseAlt (%04x, %s)\n", pAction, p));
|
|
|
|
while (RECharType (p) != SR_RIGHTOR) {
|
|
p += RECharLen (p);
|
|
u = (*pAction) (LEFTOR, u, '\0', '\0');
|
|
if ((p = REParseRE (pAction, p, EndAltRE)) == NULL)
|
|
return NULL;
|
|
u = (*pAction) (ORSIGN, u, '\0', '\0');
|
|
}
|
|
(*pAction) (RIGHTOR, u, '\0', '\0');
|
|
return p + RECharLen (p);
|
|
}
|
|
|
|
/* REParseNot - parse a guard-against match
|
|
*
|
|
* pAction Action to apply
|
|
* p character pointer to spot where parsing occurs
|
|
*
|
|
* Returns pointer past parsed text if successful
|
|
* NULL otherwise (syntax error)
|
|
*/
|
|
char *
|
|
REParseNot (
|
|
PACT pAction,
|
|
register char *p
|
|
)
|
|
{
|
|
UINT_PTR u;
|
|
|
|
DEBOUT (("REParseNot (%04x, %s)\n", pAction, p));
|
|
|
|
p += RECharLen (p);
|
|
if (*p == '\0') {
|
|
DEBOUT (("REParseNot expecting more, ERROR\n"));
|
|
return NULL;
|
|
}
|
|
u = (*pAction) (NOTSIGN, 0, '\0', '\0');
|
|
p = REParseSE (pAction, p);
|
|
(*pAction) (NOTSIGN1, u, '\0', '\0');
|
|
return p;
|
|
}
|
|
|
|
/* REParseAbbrev - parse and expand an abbreviation
|
|
*
|
|
* Note that since the abbreviations are in Z syntax, we must change syntax
|
|
* temporarily to Z. We are careful to do this so that we do not mess up
|
|
* advancign the pointers.
|
|
*
|
|
* pAction Action to apply
|
|
* p character pointer to spot where parsing occurs
|
|
*
|
|
* Returns pointer past parsed text if successful
|
|
* NULL otherwise (syntax error)
|
|
*/
|
|
char *
|
|
REParseAbbrev (
|
|
PACT pAction,
|
|
register char *p
|
|
)
|
|
{
|
|
int i;
|
|
flagType fZSTmp;
|
|
|
|
DEBOUT (("REParseAbbrev (%04x, %s)\n", pAction, p));
|
|
|
|
p += RECharLen (p);
|
|
|
|
fZSTmp = fZSyntax;
|
|
fZSyntax = TRUE;
|
|
if (p[-1] == '\0') {
|
|
DEBOUT (("REParseAbbrev expecting abbrev char, ERROR\n"));
|
|
fZSyntax = fZSTmp;
|
|
return NULL;
|
|
}
|
|
|
|
for (i = 0; pAbbrev[i]; i++)
|
|
if (p[-1] == *pAbbrev[i])
|
|
if (REParseSE (pAction, pAbbrev[i] + 1) == NULL) {
|
|
fZSyntax = fZSTmp;
|
|
return NULL;
|
|
} else {
|
|
fZSyntax = fZSTmp;
|
|
return p;
|
|
}
|
|
DEBOUT (("REParseAbbrev found invalid abbrev char %s, ERROR\n", p - 1));
|
|
fZSyntax = fZSTmp;
|
|
return NULL;
|
|
}
|
|
|
|
/* REParseChar - parse a single character match
|
|
*
|
|
* pAction Action to apply
|
|
* p character pointer to spot where parsing occurs
|
|
*
|
|
* Returns pointer past parsed text if successful
|
|
* NULL otherwise (syntax error)
|
|
*/
|
|
char *
|
|
REParseChar (
|
|
PACT pAction,
|
|
register char *p
|
|
)
|
|
{
|
|
DEBOUT (("REParseChar (%04x, %s)\n", pAction, p));
|
|
|
|
if (*p == '\\')
|
|
p++;
|
|
if (*p == '\0') {
|
|
DEBOUT (("REParseChar expected more, ERROR\n"));
|
|
return NULL;
|
|
}
|
|
(*pAction) (LETTER, 0, *p, '\0');
|
|
return p+1;
|
|
}
|
|
|
|
/* REParseClosure - parse a minimal match closure. The match occurs by
|
|
* matching none, then one, ...
|
|
*
|
|
* pAction Action to apply
|
|
* p character pointer to spot where parsing occurs
|
|
*
|
|
* Returns pointer past parsed text if successful
|
|
* NULL otherwise (syntax error)
|
|
*/
|
|
char *
|
|
REParseClosure (
|
|
PACT pAction,
|
|
register char *p
|
|
)
|
|
{
|
|
UINT_PTR u;
|
|
|
|
DEBOUT (("REParseaClosure (%04x, %s)\n", pAction, p));
|
|
|
|
u = (*pAction) (SMSTAR, 0, '\0', '\0');
|
|
if ((p = REParseSE (pAction, p)) == NULL)
|
|
return NULL;
|
|
(*pAction) (SMSTAR1, u, '\0', '\0');
|
|
return p + REClosureLen (p);
|
|
}
|
|
|
|
/* REParseGreedy - parse a maximal-match closure. The match occurs by
|
|
* matching the maximal number and then backing off as failures occur.
|
|
*
|
|
* pAction Action to apply
|
|
* p character pointer to spot where parsing occurs
|
|
*
|
|
* Returns pointer past parsed text if successful
|
|
* NULL otherwise (syntax error)
|
|
*/
|
|
char *
|
|
REParseGreedy (
|
|
PACT pAction,
|
|
register char *p
|
|
)
|
|
{
|
|
UINT_PTR u;
|
|
|
|
DEBOUT (("REParseGreedy (%04x, %s)\n", pAction, p));
|
|
|
|
u = (*pAction) (STAR, 0, '\0', '\0');
|
|
if ((p = REParseSE (pAction, p)) == NULL)
|
|
return NULL;
|
|
(*pAction) (STAR1, u, '\0', '\0');
|
|
return p + REClosureLen (p);
|
|
}
|
|
|
|
/* REParsePower - parse a power-closure. This is merely the simple pattern
|
|
* repeated the number of times specified by the exponent.
|
|
*
|
|
* pAction Action to apply
|
|
* p character pointer to spot where parsing occurs
|
|
*
|
|
* Returns pointer past parsed text if successful
|
|
* NULL otherwise (syntax error)
|
|
*/
|
|
char *
|
|
REParsePower (
|
|
PACT pAction,
|
|
char *p
|
|
)
|
|
{
|
|
register char *p1;
|
|
int exp;
|
|
|
|
DEBOUT (("REParsePower (%04x, %s)\n", pAction, p));
|
|
|
|
/* We have .se. POWER something. Skip over the .se. and POWER
|
|
* to make sure that what follows is a valid number
|
|
*/
|
|
p1 = REParseSE (NullAction, p);
|
|
|
|
if (p1 == NULL)
|
|
/* Parse of .se. failed
|
|
*/
|
|
return NULL;
|
|
|
|
/* skip POWER
|
|
*/
|
|
p1 += REClosureLen (p1);
|
|
|
|
if (*p1 == '\0') {
|
|
DEBOUT (("REParsePower expecting more, ERROR\n"));
|
|
return NULL;
|
|
}
|
|
|
|
/* try to parse off number */
|
|
if (sscanf (p1, "%d", &exp) != 1) {
|
|
DEBOUT (("REParsePower expecting number, ERROR\n"));
|
|
return NULL;
|
|
}
|
|
|
|
p1 = strbskip (p1, digits);
|
|
|
|
/* iterate the pattern the exponent number of times */
|
|
while (exp--)
|
|
if (REParseSE (pAction, p) == NULL)
|
|
return NULL;
|
|
return p1;
|
|
}
|
|
|
|
/* NullAction - a do-nothing action. Used for stubbing out the action
|
|
* during a parse.
|
|
*/
|
|
UINT_PTR
|
|
NullAction(
|
|
unsigned int type,
|
|
UINT_PTR u,
|
|
unsigned char x,
|
|
unsigned char y
|
|
)
|
|
{
|
|
type; u; x; y;
|
|
return 0;
|
|
}
|
|
|
|
/* REClosureChar - return the character that corresponds to the next
|
|
* closure to be parsed. We call REParseSE with a null action to merely
|
|
* advance the character pointer to point just beyond the current simple
|
|
* regular expression.
|
|
*
|
|
* p character pointer to spot where parsing occurs
|
|
*
|
|
* Returns closure character if appropriate
|
|
* CCH_NONE if no closure character found.
|
|
*/
|
|
char
|
|
REClosureChar (
|
|
char *p
|
|
)
|
|
{
|
|
p = REParseSE (NullAction, p);
|
|
if (p == NULL)
|
|
return CCH_ERROR;
|
|
|
|
if (fZSyntax)
|
|
/* Zibo syntax
|
|
*/
|
|
switch (*p) {
|
|
case '^':
|
|
return CCH_POWER;
|
|
case '+':
|
|
return CCH_SMPLUS;
|
|
case '#':
|
|
return CCH_PLUS;
|
|
case '*':
|
|
return CCH_SMCLOSURE;
|
|
case '@':
|
|
return CCH_CLOSURE;
|
|
default:
|
|
return CCH_NONE;
|
|
} else
|
|
/* Crappy UNIX syntax
|
|
*/
|
|
switch (*p) {
|
|
case '+':
|
|
return CCH_PLUS;
|
|
case '*':
|
|
return CCH_CLOSURE;
|
|
default:
|
|
return CCH_NONE;
|
|
}
|
|
}
|
|
|
|
/* RECompile - compile a pattern into the machine. Return a
|
|
* pointer to the match machine.
|
|
*
|
|
* p character pointer to pattern being compiled
|
|
*
|
|
* Returns: pointer to the machine if compilation was successful
|
|
* NULL if syntax error or not enough memory for malloc
|
|
*/
|
|
struct patType *
|
|
RECompile(
|
|
char *p,
|
|
flagType fCase,
|
|
flagType fZS
|
|
)
|
|
{
|
|
fZSyntax = fZS;
|
|
|
|
REEstimate (p);
|
|
|
|
DEBOUT (("Length is %04x\n", RESize));
|
|
|
|
if (RESize == -1)
|
|
return NULL;
|
|
|
|
if ((REPat = (struct patType *) (*tools_alloc) (RESize)) == NULL)
|
|
return NULL;
|
|
|
|
memset ((char far *) REPat, -1, RESize);
|
|
memset ((char far *) REPat->pArgBeg, 0, sizeof (REPat->pArgBeg));
|
|
memset ((char far *) REPat->pArgEnd, 0, sizeof (REPat->pArgEnd));
|
|
|
|
REip = REPat->code;
|
|
REArg = 1;
|
|
REPat->fCase = fCase;
|
|
REPat->fUnix = (flagType) !fZS;
|
|
|
|
cArg = 0;
|
|
|
|
CompileAction (PROLOG, 0, '\0', '\0');
|
|
|
|
if (REParseRE (CompileAction, p, NULL) == NULL)
|
|
return NULL;
|
|
|
|
CompileAction (EPILOG, 0, '\0', '\0');
|
|
|
|
#if DEBUG
|
|
REDump (REPat);
|
|
#endif
|
|
return REPat;
|
|
}
|
|
|
|
/* Escaped - translate an escaped character ala UNIX C conventions.
 *
 *      \t => tab       \e => ESC char  \h => backspace \g => bell
 *      \n => lf        \r => cr        \\ => \
 *
 * Any character that is not listed above translates to itself.
 *
 * c            character to be translated (the one after the backslash)
 *
 * Returns:     character as per above
 */
char
Escaped(
    char c
    )
{
    /* achFrom[k] translates to achTo[k] */
    static const char achFrom[] = "tehgnr";
    static const char achTo[]   = { '\t', 0x1B, 0x08, 0x07, '\n', '\r' };
    int k;

    for (k = 0; achFrom[k] != '\0'; k++) {
        if (c == achFrom[k])
            return achTo[k];
    }

    /* backslash and every unlisted character stand for themselves */
    return c;
}
|
|
|
|
/* REGetArg - copy argument string out from match.
|
|
*
|
|
* pat matched pattern
|
|
* i index of argument to fetch, 0 is entire pattern
|
|
* p destination of argument
|
|
*
|
|
* Returns: TRUE if successful, FALSE if i is out of range.
|
|
*/
|
|
flagType
|
|
REGetArg (
|
|
struct patType *pat,
|
|
int i,
|
|
char *p
|
|
)
|
|
{
|
|
int l = 0;
|
|
|
|
if (i > MAXPATARG)
|
|
return FALSE;
|
|
else
|
|
if (pat->pArgBeg[i] != (char *)-1)
|
|
memmove ((char far *)p, (char far *)pat->pArgBeg[i], l = RELength (pat, i));
|
|
p[l] = '\0';
|
|
return TRUE;
|
|
}
|
|
|
|
/* RETranslate - translate a pattern string and match structure into an
|
|
* output string. During pattern search-and-replace, RETranslate is used
|
|
* to generate an output string based on an input match pattern and a template
|
|
* that directs the output.
|
|
*
|
|
* The input match is any patType returned from RECompile that has been passed
|
|
* to fREMatch and that causes fREMatch to return TRUE. The template string
|
|
* is any set of ascii chars. The $ character leads in arguments:
|
|
*
|
|
* $$ is replaced with $
|
|
* $0 is replaced with the entire match string
|
|
* $1-$9 is replaced with the corresponding tagged (by {}) item from
|
|
* the match.
|
|
*
|
|
* An alternative method is to specify the argument as:
|
|
*
|
|
* $([w,]a) where a is the argument number (0-9) and w is an optional field
|
|
* width that will be used in a printf %ws format.
|
|
*
|
|
* buf pattern matched
|
|
* src template for the match
|
|
* dst destination of the translation
|
|
*
|
|
* Returns: TRUE if translation was successful, FALSE otherwise
|
|
*/
|
|
flagType
|
|
RETranslate (
|
|
struct patType *buf,
|
|
register char *src,
|
|
register char *dst
|
|
)
|
|
{
|
|
int i, w;
|
|
char *work;
|
|
char chArg = (char) (buf->fUnix ? '\\' : '$');
|
|
|
|
work = (*tools_alloc) (MAXLINELEN);
|
|
if (work == NULL)
|
|
return FALSE;
|
|
|
|
*dst = '\0';
|
|
|
|
while (*src != '\0') {
|
|
/* Process tagged substitutions first
|
|
*/
|
|
if (*src == chArg && (isdigit (src[1]) || src[1] == '(')) {
|
|
/* presume 0-width field */
|
|
w = 0;
|
|
|
|
/* skip $ and char */
|
|
src += 2;
|
|
|
|
/* if we saw $n */
|
|
if (isdigit (src[-1]))
|
|
i = src[-1] - '0';
|
|
/* else we saw $( */
|
|
else {
|
|
/* get tagged expr number */
|
|
i = atoi (src);
|
|
|
|
/* skip over number */
|
|
if (*src == '-')
|
|
src++;
|
|
src = strbskip (src, digits);
|
|
|
|
/* was there a comma? */
|
|
if (*src == ',') {
|
|
/* We saw field width, parse off expr number */
|
|
w = i;
|
|
i = atoi (++src);
|
|
src = strbskip (src, digits);
|
|
}
|
|
|
|
/* We MUST end with a close paren */
|
|
if (*src++ != ')') {
|
|
free (work);
|
|
return FALSE;
|
|
}
|
|
}
|
|
/* w is field width
|
|
* i is selected argument
|
|
*/
|
|
if (!REGetArg (buf, i, work)) {
|
|
free (work);
|
|
return FALSE;
|
|
}
|
|
sprintf (dst, "%*s", w, work);
|
|
dst += strlen (dst);
|
|
} else
|
|
/* process escaped characters */
|
|
if (*src == '\\') {
|
|
src++;
|
|
if (!*src) {
|
|
free (work);
|
|
return FALSE;
|
|
}
|
|
*dst++ = Escaped (*src++);
|
|
} else
|
|
/* chArg quotes itself */
|
|
if (*src == chArg && src[1] == chArg) {
|
|
*dst++ = chArg;
|
|
src += 2;
|
|
} else
|
|
*dst++ = *src++;
|
|
}
|
|
*dst = '\0';
|
|
free (work);
|
|
return TRUE;
|
|
}
|
|
|
|
/* RETranslateLength - given a matched pattern and a replacement string
|
|
* return the length of the final replacement
|
|
*
|
|
* The inputs have the same syntax/semantics as in RETranslate.
|
|
*
|
|
* buf pattern matched
|
|
* src template for the match
|
|
*
|
|
* Returns: number of bytes in total replacement, -1 if error
|
|
*/
|
|
int
|
|
RETranslateLength (
|
|
struct patType *buf,
|
|
register char *src
|
|
)
|
|
{
|
|
int i, w;
|
|
int length = 0;
|
|
char chArg = (char) (buf->fUnix ? '\\' : '$');
|
|
|
|
while (*src != '\0') {
|
|
/* Process tagged substitutions first
|
|
*/
|
|
if (*src == chArg && (isdigit (src[1]) || src[1] == '(')) {
|
|
w = 0;
|
|
src += 2;
|
|
if (isdigit (src[-1]))
|
|
i = src[-1] - '0';
|
|
else {
|
|
i = atoi (src);
|
|
if (*src == '-')
|
|
src++;
|
|
src = strbskip (src, digits);
|
|
if (*src == ',') {
|
|
w = i;
|
|
i = atoi (++src);
|
|
src = strbskip (src, digits);
|
|
}
|
|
if (*src++ != ')')
|
|
return -1;
|
|
}
|
|
/* w is field width
|
|
* i is selected argument
|
|
*/
|
|
i = RELength (buf, i);
|
|
length += max (i, abs(w));
|
|
} else
|
|
/* process escaped characters */
|
|
if (*src == '\\') {
|
|
src += 2;
|
|
length++;
|
|
} else
|
|
/* chArg quotes itself */
|
|
if (*src == chArg && src[1] == chArg) {
|
|
src += 2;
|
|
length++;
|
|
} else {
|
|
length++;
|
|
src++;
|
|
}
|
|
}
|
|
return length;
|
|
}
|
|
|
|
/* RELength - return length of argument in match.
|
|
*
|
|
* pat matched pattern
|
|
* i index of argument to examine, 0 is entire pattern
|
|
*
|
|
* Returns: length of ith argument, -1 if i is out-of-range.
|
|
*/
|
|
int
|
|
RELength (
|
|
struct patType *pat,
|
|
int i
|
|
)
|
|
{
|
|
if (i > MAXPATARG)
|
|
return -1;
|
|
else
|
|
if (pat->pArgBeg[i] == (char *)-1)
|
|
return 0;
|
|
else
|
|
return (int)(pat->pArgEnd[i] - pat->pArgBeg[i]);
|
|
}
|
|
|
|
/* REStart - return pointer to beginning of match.
|
|
*
|
|
* ppat matched pattern
|
|
*
|
|
* Returns: character pointer to beginning of match
|
|
*/
|
|
char *
|
|
REStart (
|
|
struct patType *pat
|
|
)
|
|
{
|
|
return pat->pArgBeg[0] == (char *)-1 ? NULL : pat->pArgBeg[0];
|
|
}
|