diff options
Diffstat (limited to 'regexec.c')
-rw-r--r-- | regexec.c | 820 |
1 files changed, 820 insertions, 0 deletions
diff --git a/regexec.c b/regexec.c new file mode 100644 index 0000000000..2246454b0a --- /dev/null +++ b/regexec.c @@ -0,0 +1,820 @@ +/* NOTE: this is derived from Henry Spencer's regexp code, and should not + * confused with the original package (see point 3 below). Thanks, Henry! + */ + +/* Additional note: this code is very heavily munged from Henry's version + * in places. In some spots I've traded clarity for efficiency, so don't + * blame Henry for some of the lack of readability. + */ + +/* $Header: regexec.c,v 3.0 89/10/18 15:22:53 lwall Locked $ + * + * $Log: regexec.c,v $ + * Revision 3.0 89/10/18 15:22:53 lwall + * 3.0 baseline + * + */ + +/* + * regcomp and regexec -- regsub and regerror are not used in perl + * + * Copyright (c) 1986 by University of Toronto. + * Written by Henry Spencer. Not derived from licensed software. + * + * Permission is granted to anyone to use this software for any + * purpose on any computer system, and to redistribute it freely, + * subject to the following restrictions: + * + * 1. The author is not responsible for the consequences of use of + * this software, no matter how awful, even if they arise + * from defects in it. + * + * 2. The origin of this software must not be misrepresented, either + * by explicit claim or by omission. + * + * 3. Altered versions must be plainly marked as such, and must not + * be misrepresented as being the original software. + * + **** Alterations to Henry's code are... + **** + **** Copyright (c) 1989, Larry Wall + **** + **** You may distribute under the terms of the GNU General Public License + **** as specified in the README file that comes with the perl 3.0 kit. + * + * Beware that some of this code is subtly aware of the way operator + * precedence is structured in regular expressions. Serious changes in + * regular-expression syntax might require a total rethink. + */ +#include "EXTERN.h" +#include "perl.h" +#include "regcomp.h" + +#ifndef STATIC +#define STATIC static +#endif + +#ifdef DEBUGGING +int regnarrate = 0; +#endif + +/* + * regexec and friends + */ + +/* + * Global work variables for regexec(). + */ +static char *regprecomp; +static char *reginput; /* String-input pointer. */ +static char *regbol; /* Beginning of input, for ^ check. */ +static char *regeol; /* End of input, for $ check. */ +static char **regstartp; /* Pointer to startp array. */ +static char **regendp; /* Ditto for endp. */ +static char *reglastparen; /* Similarly for lastparen. */ +static char *regtill; + +static char *regmystartp[10]; /* For remembering backreferences. */ +static char *regmyendp[10]; + +/* + * Forwards. + */ +STATIC int regtry(); +STATIC int regmatch(); +STATIC int regrepeat(); + +extern int multiline; + +/* + - regexec - match a regexp against a string + */ +int +regexec(prog, stringarg, strend, strbeg, minend, screamer, safebase) +register regexp *prog; +char *stringarg; +register char *strend; /* pointer to null at end of string */ +char *strbeg; /* real beginning of string */ +int minend; /* end of match must be at least minend after stringarg */ +STR *screamer; +int safebase; /* no need to remember string in subbase */ +{ + register char *s; + register int i; + register char *c; + register char *string = stringarg; + register int tmp; + int minlen = 0; /* must match at least this many chars */ + int dontbother = 0; /* how many characters not to try at end */ + int beginning = (string == strbeg); /* is ^ valid at stringarg? */ + + /* Be paranoid... */ + if (prog == NULL || string == NULL) { + fatal("NULL regexp parameter"); + return(0); + } + + regprecomp = prog->precomp; + /* Check validity of program. */ + if (UCHARAT(prog->program) != MAGIC) { + FAIL("corrupted regexp program"); + } + + if (prog->do_folding) { + safebase = FALSE; + i = strend - string; + New(1101,c,i+1,char); + (void)bcopy(string, c, i+1); + string = c; + strend = string + i; + for (s = string; s < strend; s++) + if (isupper(*s)) + *s = tolower(*s); + } + + /* If there is a "must appear" string, look for it. */ + s = string; + if (prog->regmust != Nullstr) { + if (beginning && screamer) { + if (screamfirst[prog->regmust->str_rare] >= 0) + s = screaminstr(screamer,prog->regmust); + else + s = Nullch; + } +#ifndef lint + else + s = fbminstr((unsigned char*)s, (unsigned char*)strend, + prog->regmust); +#endif + if (!s) { + ++prog->regmust->str_u.str_useful; /* hooray */ + goto phooey; /* not present */ + } + else if (prog->regback >= 0) { + s -= prog->regback; + if (s < string) + s = string; + minlen = prog->regback + prog->regmust->str_cur; + } + else if (--prog->regmust->str_u.str_useful < 0) { /* boo */ + str_free(prog->regmust); + prog->regmust = Nullstr; /* disable regmust */ + s = string; + } + else { + s = string; + minlen = prog->regmust->str_cur; + } + } + + /* Mark beginning of line for ^ . */ + if (beginning) + regbol = string; + else + regbol = NULL; + + /* Mark end of line for $ (and such) */ + regeol = strend; + + /* see how far we have to get to not match where we matched before */ + regtill = string+minend; + + /* Simplest case: anchored match need be tried only once. */ + /* [unless multiline is set] */ + if (prog->reganch) { + if (regtry(prog, string)) + goto got_it; + else if (multiline) { + if (minlen) + dontbother = minlen - 1; + strend -= dontbother; + /* for multiline we only have to try after newlines */ + if (s > string) + s--; + for (; s < strend; s++) { + if (*s == '\n') { + if (++s < strend && regtry(prog, s)) + goto got_it; + } + } + } + goto phooey; + } + + /* Messy cases: unanchored match. */ + if (prog->regstart) { + /* We know what string it must start with. */ + if (prog->regstart->str_pok == 3) { +#ifndef lint + while ((s = fbminstr((unsigned char*)s, + (unsigned char*)strend, prog->regstart)) != NULL) +#else + while (s = Nullch) +#endif + { + if (regtry(prog, s)) + goto got_it; + s++; + } + } + else { + c = prog->regstart->str_ptr; + while ((s = ninstr(s, strend, + c, c + prog->regstart->str_cur )) != NULL) { + if (regtry(prog, s)) + goto got_it; + s++; + } + } + goto phooey; + } + if (c = prog->regstclass) { + if (minlen) + dontbother = minlen - 1; + strend -= dontbother; /* don't bother with what can't match */ + /* We know what class it must start with. */ + switch (OP(c)) { + case ANYOF: case ANYBUT: + c = OPERAND(c); + while (s < strend) { + i = *s; + if (!(c[i >> 3] & (1 << (i&7)))) + if (regtry(prog, s)) + goto got_it; + s++; + } + break; + case BOUND: + if (minlen) + dontbother++,strend--; + if (s != string) { + i = s[-1]; + tmp = (isalpha(i) || isdigit(i) || i == '_'); + } + else + tmp = 0; /* assume not alphanumeric */ + while (s < strend) { + i = *s; + if (tmp != (isalpha(i) || isdigit(i) || i == '_')) { + tmp = !tmp; + if (regtry(prog, s)) + goto got_it; + } + s++; + } + if (tmp && regtry(prog,s)) + goto got_it; + break; + case NBOUND: + if (minlen) + dontbother++,strend--; + if (s != string) { + i = s[-1]; + tmp = (isalpha(i) || isdigit(i) || i == '_'); + } + else + tmp = 0; /* assume not alphanumeric */ + while (s < strend) { + i = *s; + if (tmp != (isalpha(i) || isdigit(i) || i == '_')) + tmp = !tmp; + else if (regtry(prog, s)) + goto got_it; + s++; + } + if (!tmp && regtry(prog,s)) + goto got_it; + break; + case ALNUM: + while (s < strend) { + i = *s; + if (isalpha(i) || isdigit(i) || i == '_') + if (regtry(prog, s)) + goto got_it; + s++; + } + break; + case NALNUM: + while (s < strend) { + i = *s; + if (!isalpha(i) && !isdigit(i) && i != '_') + if (regtry(prog, s)) + goto got_it; + s++; + } + break; + case SPACE: + while (s < strend) { + if (isspace(*s)) + if (regtry(prog, s)) + goto got_it; + s++; + } + break; + case NSPACE: + while (s < strend) { + if (!isspace(*s)) + if (regtry(prog, s)) + goto got_it; + s++; + } + break; + case DIGIT: + while (s < strend) { + if (isdigit(*s)) + if (regtry(prog, s)) + goto got_it; + s++; + } + break; + case NDIGIT: + while (s < strend) { + if (!isdigit(*s)) + if (regtry(prog, s)) + goto got_it; + s++; + } + break; + } + } + else { + dontbother = minend; + strend -= dontbother; + /* We don't know much -- general case. */ + do { + if (regtry(prog, s)) + goto got_it; + } while (s++ < strend); + } + + /* Failure. */ + goto phooey; + + got_it: + if ((!safebase && (prog->nparens || sawampersand)) || prog->do_folding){ + strend += dontbother; /* uncheat */ + if (safebase) /* no need for $digit later */ + s = strbeg; + else if (strbeg != prog->subbase) { + i = strend - string + (stringarg - strbeg); + s = nsavestr(strbeg,i); /* so $digit will work later */ + if (prog->subbase) + Safefree(prog->subbase); + prog->subbase = s; + } + else + s = prog->subbase; + s += (stringarg - strbeg); + for (i = 0; i <= prog->nparens; i++) { + if (prog->endp[i]) { + prog->startp[i] = s + (prog->startp[i] - string); + prog->endp[i] = s + (prog->endp[i] - string); + } + } + if (prog->do_folding) + Safefree(string); + } + return(1); + + phooey: + if (prog->do_folding) + Safefree(string); + return(0); +} + +/* + - regtry - try match at specific point + */ +static int /* 0 failure, 1 success */ +regtry(prog, string) +regexp *prog; +char *string; +{ + register int i; + register char **sp; + register char **ep; + + reginput = string; + regstartp = prog->startp; + regendp = prog->endp; + reglastparen = &prog->lastparen; + prog->lastparen = 0; + + sp = prog->startp; + ep = prog->endp; + if (prog->nparens) { + for (i = NSUBEXP; i > 0; i--) { + *sp++ = NULL; + *ep++ = NULL; + } + } + if (regmatch(prog->program + 1) && reginput >= regtill) { + prog->startp[0] = string; + prog->endp[0] = reginput; + return(1); + } else + return(0); +} + +/* + - regmatch - main matching routine + * + * Conceptually the strategy is simple: check to see whether the current + * node matches, call self recursively to see whether the rest matches, + * and then act accordingly. In practice we make some effort to avoid + * recursion, in particular by going through "ordinary" nodes (that don't + * need to know whether the rest of the match failed) by a loop instead of + * by recursion. + */ +/* [lwall] I've hoisted the register declarations to the outer block in order to + * maybe save a little bit of pushing and popping on the stack. It also takes + * advantage of machines that use a register save mask on subroutine entry. + */ +static int /* 0 failure, 1 success */ +regmatch(prog) +char *prog; +{ + register char *scan; /* Current node. */ + char *next; /* Next node. */ + register int nextchar; + register int n; /* no or next */ + register int ln; /* len or last */ + register char *s; /* operand or save */ + register char *locinput = reginput; + + nextchar = *locinput; + scan = prog; +#ifdef DEBUGGING + if (scan != NULL && regnarrate) + fprintf(stderr, "%s(\n", regprop(scan)); +#endif + while (scan != NULL) { +#ifdef DEBUGGING + if (regnarrate) + fprintf(stderr, "%s...\n", regprop(scan)); +#endif + +#ifdef REGALIGN + next = scan + NEXT(scan); + if (next == scan) + next = NULL; +#else + next = regnext(scan); +#endif + + switch (OP(scan)) { + case BOL: + if (locinput == regbol || + ((nextchar || locinput < regeol) && + locinput[-1] == '\n') ) + { + regtill--; + break; + } + return(0); + case EOL: + if ((nextchar || locinput < regeol) && nextchar != '\n') + return(0); + regtill--; + break; + case ANY: + if ((nextchar == '\0' && locinput >= regeol) || + nextchar == '\n') + return(0); + nextchar = *++locinput; + break; + case EXACTLY: + s = OPERAND(scan); + ln = *s++; + /* Inline the first character, for speed. */ + if (*s != nextchar) + return(0); + if (locinput + ln > regeol) + return 0; + if (ln > 1 && bcmp(s, locinput, ln) != 0) + return(0); + locinput += ln; + nextchar = *locinput; + break; + case ANYOF: + case ANYBUT: + s = OPERAND(scan); + if (nextchar < 0) + nextchar = UCHARAT(locinput); + if (s[nextchar >> 3] & (1 << (nextchar&7))) + return(0); + nextchar = *++locinput; + if (!nextchar && locinput > regeol) + return 0; + break; + case ALNUM: + if (!nextchar) + return(0); + if (!isalpha(nextchar) && !isdigit(nextchar) && + nextchar != '_') + return(0); + nextchar = *++locinput; + break; + case NALNUM: + if (!nextchar && locinput >= regeol) + return(0); + if (isalpha(nextchar) || isdigit(nextchar) || + nextchar == '_') + return(0); + nextchar = *++locinput; + break; + case NBOUND: + case BOUND: + if (locinput == regbol) /* was last char in word? */ + ln = 0; + else + ln = (isalpha(locinput[-1]) || + isdigit(locinput[-1]) || + locinput[-1] == '_' ); + n = (isalpha(nextchar) || isdigit(nextchar) || + nextchar == '_' ); /* is next char in word? */ + if ((ln == n) == (OP(scan) == BOUND)) + return(0); + break; + case SPACE: + if (!nextchar && locinput >= regeol) + return(0); + if (!isspace(nextchar)) + return(0); + nextchar = *++locinput; + break; + case NSPACE: + if (!nextchar) + return(0); + if (isspace(nextchar)) + return(0); + nextchar = *++locinput; + break; + case DIGIT: + if (!isdigit(nextchar)) + return(0); + nextchar = *++locinput; + break; + case NDIGIT: + if (!nextchar && locinput >= regeol) + return(0); + if (isdigit(nextchar)) + return(0); + nextchar = *++locinput; + break; + case REF: + case REF+1: + case REF+2: + case REF+3: + case REF+4: + case REF+5: + case REF+6: + case REF+7: + case REF+8: + case REF+9: + n = OP(scan) - REF; + s = regmystartp[n]; + if (!s) + return(0); + if (!regmyendp[n]) + return(0); + if (s == regmyendp[n]) + break; + /* Inline the first character, for speed. */ + if (*s != nextchar) + return(0); + ln = regmyendp[n] - s; + if (locinput + ln > regeol) + return 0; + if (ln > 1 && bcmp(s, locinput, ln) != 0) + return(0); + locinput += ln; + nextchar = *locinput; + break; + + case NOTHING: + break; + case BACK: + break; + case OPEN+1: + case OPEN+2: + case OPEN+3: + case OPEN+4: + case OPEN+5: + case OPEN+6: + case OPEN+7: + case OPEN+8: + case OPEN+9: + n = OP(scan) - OPEN; + reginput = locinput; + + regmystartp[n] = locinput; /* for REF */ + if (regmatch(next)) { + /* + * Don't set startp if some later + * invocation of the same parentheses + * already has. + */ + if (regstartp[n] == NULL) + regstartp[n] = locinput; + return(1); + } else + return(0); + /* NOTREACHED */ + case CLOSE+1: + case CLOSE+2: + case CLOSE+3: + case CLOSE+4: + case CLOSE+5: + case CLOSE+6: + case CLOSE+7: + case CLOSE+8: + case CLOSE+9: { + n = OP(scan) - CLOSE; + reginput = locinput; + + regmyendp[n] = locinput; /* for REF */ + if (regmatch(next)) { + /* + * Don't set endp if some later + * invocation of the same parentheses + * already has. + */ + if (regendp[n] == NULL) { + regendp[n] = locinput; + if (n > *reglastparen) + *reglastparen = n; + } + return(1); + } else + return(0); + } + /*NOTREACHED*/ + case BRANCH: { + if (OP(next) != BRANCH) /* No choice. */ + next = NEXTOPER(scan); /* Avoid recursion. */ + else { + do { + reginput = locinput; + if (regmatch(NEXTOPER(scan))) + return(1); +#ifdef REGALIGN + if (n = NEXT(scan)) + scan += n; + else + scan = NULL; +#else + scan = regnext(scan); +#endif + } while (scan != NULL && OP(scan) == BRANCH); + return(0); + /* NOTREACHED */ + } + } + break; + case STAR: + case PLUS: + /* + * Lookahead to avoid useless match attempts + * when we know what character comes next. + */ + if (OP(next) == EXACTLY) + nextchar = *(OPERAND(next)+1); + else + nextchar = -1000; + ln = (OP(scan) == STAR) ? 0 : 1; + reginput = locinput; + n = regrepeat(NEXTOPER(scan)); + while (n >= ln) { + /* If it could work, try it. */ + if (nextchar == -1000 || *reginput == nextchar) + if (regmatch(next)) + return(1); + /* Couldn't or didn't -- back up. */ + n--; + reginput = locinput + n; + } + return(0); + case END: + reginput = locinput; /* put where regtry can find it */ + return(1); /* Success! */ + default: + printf("%x %d\n",scan,scan[1]); + FAIL("regexp memory corruption"); + } + + scan = next; + } + + /* + * We get here only if there's trouble -- normally "case END" is + * the terminating point. + */ + FAIL("corrupted regexp pointers"); + /*NOTREACHED*/ +#ifdef lint + return 0; +#endif +} + +/* + - regrepeat - repeatedly match something simple, report how many + */ +/* + * [This routine now assumes that it will only match on things of length 1. + * That was true before, but now we assume scan - reginput is the count, + * rather than incrementing count on every character.] + */ +static int +regrepeat(p) +char *p; +{ + register char *scan; + register char *opnd; + register int c; + register char *loceol = regeol; + + scan = reginput; + opnd = OPERAND(p); + switch (OP(p)) { + case ANY: + while (scan < loceol && *scan != '\n') + scan++; + break; + case EXACTLY: /* length of string is 1 */ + opnd++; + while (scan < loceol && *opnd == *scan) + scan++; + break; + case ANYOF: + case ANYBUT: + c = UCHARAT(scan); + while (scan < loceol && !(opnd[c >> 3] & (1 << (c & 7)))) { + scan++; + c = UCHARAT(scan); + } + break; + case ALNUM: + while (isalpha(*scan) || isdigit(*scan) || *scan == '_') + scan++; + break; + case NALNUM: + while (scan < loceol && (!isalpha(*scan) && !isdigit(*scan) && + *scan != '_')) + scan++; + break; + case SPACE: + while (scan < loceol && isspace(*scan)) + scan++; + break; + case NSPACE: + while (scan < loceol && !isspace(*scan)) + scan++; + break; + case DIGIT: + while (isdigit(*scan)) + scan++; + break; + case NDIGIT: + while (scan < loceol && !isdigit(*scan)) + scan++; + break; + default: /* Oh dear. Called inappropriately. */ + FAIL("internal regexp foulup"); + /* NOTREACHED */ + } + + c = scan - reginput; + reginput = scan; + + return(c); +} + +/* + - regnext - dig the "next" pointer out of a node + * + * [Note, when REGALIGN is defined there are two places in regmatch() + * that bypass this code for speed.] + */ +char * +regnext(p) +register char *p; +{ + register int offset; + + if (p == ®dummy) + return(NULL); + + offset = NEXT(p); + if (offset == 0) + return(NULL); + +#ifdef REGALIGN + return(p+offset); +#else + if (OP(p) == BACK) + return(p-offset); + else + return(p+offset); +#endif +} |