summaryrefslogtreecommitdiff
path: root/search.c
diff options
context:
space:
mode:
Diffstat (limited to 'search.c')
-rw-r--r--search.c751
1 files changed, 751 insertions, 0 deletions
diff --git a/search.c b/search.c
new file mode 100644
index 0000000000..79712a1359
--- /dev/null
+++ b/search.c
@@ -0,0 +1,751 @@
+/* $Header: search.c,v 1.0 87/12/18 13:05:59 root Exp $
+ *
+ * $Log: search.c,v $
+ * Revision 1.0 87/12/18 13:05:59 root
+ * Initial revision
+ *
+ */
+
+/* string search routines */
+
+#include <stdio.h>
+#include <ctype.h>
+
+#include "EXTERN.h"
+#include "handy.h"
+#include "util.h"
+#include "INTERN.h"
+#include "search.h"
+
+#define VERBOSE
+#define FLUSH
+#define MEM_SIZE int
+
+#ifndef BITSPERBYTE
+#define BITSPERBYTE 8
+#endif
+
+#define BMAPSIZ (127 / BITSPERBYTE + 1)
+
+#define CHAR 0 /* a normal character */
+#define ANY 1 /* . matches anything except newline */
+#define CCL 2 /* [..] character class */
+#define NCCL 3 /* [^..]negated character class */
+#define BEG 4 /* ^ beginning of a line */
+#define END 5 /* $ end of a line */
+#define LPAR 6 /* ( begin sub-match */
+#define RPAR 7 /* ) end sub-match */
+#define REF 8 /* \N backreference to the Nth submatch */
+#define WORD 9 /* \w matches alphanumeric character */
+#define NWORD 10 /* \W matches non-alphanumeric character */
+#define WBOUND 11 /* \b matches word boundary */
+#define NWBOUND 12 /* \B matches non-boundary */
+#define FINIS 13 /* the end of the pattern */
+
+#define CODEMASK 15
+
+/* Quantifiers: */
+
+#define MINZERO 16 /* minimum is 0, not 1 */
+#define MAXINF 32 /* maximum is infinity, not 1 */
+
+#define ASCSIZ 0200
+typedef char TRANSTABLE[ASCSIZ];
+
+static TRANSTABLE trans = {
+0000,0001,0002,0003,0004,0005,0006,0007,
+0010,0011,0012,0013,0014,0015,0016,0017,
+0020,0021,0022,0023,0024,0025,0026,0027,
+0030,0031,0032,0033,0034,0035,0036,0037,
+0040,0041,0042,0043,0044,0045,0046,0047,
+0050,0051,0052,0053,0054,0055,0056,0057,
+0060,0061,0062,0063,0064,0065,0066,0067,
+0070,0071,0072,0073,0074,0075,0076,0077,
+0100,0101,0102,0103,0104,0105,0106,0107,
+0110,0111,0112,0113,0114,0115,0116,0117,
+0120,0121,0122,0123,0124,0125,0126,0127,
+0130,0131,0132,0133,0134,0135,0136,0137,
+0140,0141,0142,0143,0144,0145,0146,0147,
+0150,0151,0152,0153,0154,0155,0156,0157,
+0160,0161,0162,0163,0164,0165,0166,0167,
+0170,0171,0172,0173,0174,0175,0176,0177,
+};
+static bool folding = FALSE;
+
+static int err;
+#define NOERR 0
+#define BEGFAIL 1
+#define FATAL 2
+
+static char *FirstCharacter;
+static char *matchend;
+static char *matchtill;
+
+void
+search_init()
+{
+#ifdef UNDEF
+ register int i;
+
+ for (i = 0; i < ASCSIZ; i++)
+ trans[i] = i;
+#else
+ ;
+#endif
+}
+
+void
+init_compex(compex)
+register COMPEX *compex;
+{
+ /* the following must start off zeroed */
+
+ compex->precomp = Nullch;
+ compex->complen = 0;
+ compex->subbase = Nullch;
+}
+
+#ifdef NOTUSED
+void
+free_compex(compex)
+register COMPEX *compex;
+{
+ if (compex->complen) {
+ safefree(compex->compbuf);
+ compex->complen = 0;
+ }
+ if (compex->subbase) {
+ safefree(compex->subbase);
+ compex->subbase = Nullch;
+ }
+}
+#endif
+
+static char *gbr_str = Nullch;
+static int gbr_siz = 0;
+
+char *
+getparen(compex,n)
+register COMPEX *compex;
+int n;
+{
+ int length = compex->subend[n] - compex->subbeg[n];
+
+ if (!n &&
+ (!compex->numsubs || n > compex->numsubs || !compex->subend[n] || length<0))
+ return "";
+ growstr(&gbr_str, &gbr_siz, length+1);
+ safecpy(gbr_str, compex->subbeg[n], length+1);
+ return gbr_str;
+}
+
+void
+case_fold(which)
+int which;
+{
+ register int i;
+
+ if (which != folding) {
+ if (which) {
+ for (i = 'A'; i <= 'Z'; i++)
+ trans[i] = tolower(i);
+ }
+ else {
+ for (i = 'A'; i <= 'Z'; i++)
+ trans[i] = i;
+ }
+ folding = which;
+ }
+}
+
+/* Compile the regular expression into internal form */
+
+char *
+compile(compex, sp, regex, fold)
+register COMPEX *compex;
+register char *sp;
+int regex;
+int fold;
+{
+ register int c;
+ register char *cp;
+ char *lastcp;
+ char paren[MAXSUB],
+ *parenp;
+ char **alt = compex->alternatives;
+ char *retmes = "Badly formed search string";
+
+ case_fold(compex->do_folding = fold);
+ if (compex->precomp)
+ safefree(compex->precomp);
+ compex->precomp = savestr(sp);
+ if (!compex->complen) {
+ compex->compbuf = safemalloc(84);
+ compex->complen = 80;
+ }
+ cp = compex->compbuf; /* point at compiled buffer */
+ *alt++ = cp; /* first alternative starts here */
+ parenp = paren; /* first paren goes here */
+ if (*sp == 0) { /* nothing to compile? */
+#ifdef NOTDEF
+ if (*cp == 0) /* nothing there yet? */
+ return "Null search string";
+#endif
+ if (*cp)
+ return Nullch; /* just keep old expression */
+ }
+ compex->numsubs = 0; /* no parens yet */
+ lastcp = 0;
+ for (;;) {
+ if (cp - compex->compbuf >= compex->complen) {
+ char *ocompbuf = compex->compbuf;
+
+ grow_comp(compex);
+ if (ocompbuf != compex->compbuf) { /* adjust pointers? */
+ char **tmpalt;
+
+ cp = compex->compbuf + (cp - ocompbuf);
+ if (lastcp)
+ lastcp = compex->compbuf + (lastcp - ocompbuf);
+ for (tmpalt = compex->alternatives; tmpalt < alt; tmpalt++)
+ if (*tmpalt)
+ *tmpalt = compex->compbuf + (*tmpalt - ocompbuf);
+ }
+ }
+ c = *sp++; /* get next char of pattern */
+ if (c == 0) { /* end of pattern? */
+ if (parenp != paren) { /* balanced parentheses? */
+#ifdef VERBOSE
+ retmes = "Missing right parenthesis";
+#endif
+ goto badcomp;
+ }
+ *cp++ = FINIS; /* append a stopper */
+ *alt++ = 0; /* terminate alternative list */
+ /*
+ compex->complen = cp - compex->compbuf + 1;
+ compex->compbuf = saferealloc(compex->compbuf,compex->complen+4); */
+ return Nullch; /* return success */
+ }
+ if (c != '*' && c != '?' && c != '+')
+ lastcp = cp;
+ if (!regex) { /* just a normal search string? */
+ *cp++ = CHAR; /* everything is a normal char */
+ *cp++ = trans[c];
+ }
+ else /* it is a regular expression */
+ switch (c) {
+
+ default:
+ normal_char:
+ *cp++ = CHAR;
+ *cp++ = trans[c];
+ continue;
+
+ case '.':
+ *cp++ = ANY;
+ continue;
+
+ case '[': { /* character class */
+ register int i;
+
+ if (cp - compex->compbuf >= compex->complen - BMAPSIZ) {
+ char *ocompbuf = compex->compbuf;
+
+ grow_comp(compex); /* reserve bitmap */
+ if (ocompbuf != compex->compbuf) {/* adjust pointers? */
+ char **tmpalt;
+
+ cp = compex->compbuf + (cp - ocompbuf);
+ if (lastcp)
+ lastcp = compex->compbuf + (lastcp - ocompbuf);
+ for (tmpalt = compex->alternatives; tmpalt < alt;
+ tmpalt++)
+ if (*tmpalt)
+ *tmpalt =
+ compex->compbuf + (*tmpalt - ocompbuf);
+ }
+ }
+ for (i = BMAPSIZ; i; --i)
+ cp[i] = 0;
+
+ if ((c = *sp++) == '^') {
+ c = *sp++;
+ *cp++ = NCCL; /* negated */
+ }
+ else
+ *cp++ = CCL; /* normal */
+
+ i = 0; /* remember oldchar */
+ do {
+ if (c == '\0') {
+#ifdef VERBOSE
+ retmes = "Missing ]";
+#endif
+ goto badcomp;
+ }
+ if (c == '\\' && *sp) {
+ switch (*sp) {
+ default:
+ c = *sp++;
+ break;
+ case '0': case '1': case '2': case '3':
+ case '4': case '5': case '6': case '7':
+ c = *sp++ - '0';
+ if (index("01234567",*sp)) {
+ c <<= 3;
+ c += *sp++ - '0';
+ }
+ if (index("01234567",*sp)) {
+ c <<= 3;
+ c += *sp++ - '0';
+ }
+ break;
+ case 'b':
+ c = '\b';
+ sp++;
+ break;
+ case 'n':
+ c = '\n';
+ sp++;
+ break;
+ case 'r':
+ c = '\r';
+ sp++;
+ break;
+ case 'f':
+ c = '\f';
+ sp++;
+ break;
+ case 't':
+ c = '\t';
+ sp++;
+ break;
+ }
+ }
+ if (*sp == '-' && *(++sp))
+ i = *sp++;
+ else
+ i = c;
+ while (c <= i) {
+ cp[c / BITSPERBYTE] |= 1 << (c % BITSPERBYTE);
+ if (fold && isalpha(c))
+ cp[(c ^ 32) / BITSPERBYTE] |=
+ 1 << ((c ^ 32) % BITSPERBYTE);
+ /* set the other bit too */
+ c++;
+ }
+ } while ((c = *sp++) != ']');
+ if (cp[-1] == NCCL)
+ cp[0] |= 1;
+ cp += BMAPSIZ;
+ continue;
+ }
+
+ case '^':
+ if (cp != compex->compbuf && cp[-1] != FINIS)
+ goto normal_char;
+ *cp++ = BEG;
+ continue;
+
+ case '$':
+ if (isdigit(*sp)) {
+ *cp++ = REF;
+ *cp++ = *sp - '0';
+ break;
+ }
+ if (*sp && *sp != '|')
+ goto normal_char;
+ *cp++ = END;
+ continue;
+
+ case '*': case '?': case '+':
+ if (lastcp == 0 ||
+ (*lastcp & (MINZERO|MAXINF)) ||
+ *lastcp == LPAR ||
+ *lastcp == RPAR ||
+ *lastcp == BEG ||
+ *lastcp == END ||
+ *lastcp == WBOUND ||
+ *lastcp == NWBOUND )
+ goto normal_char;
+ if (c != '+')
+ *lastcp |= MINZERO;
+ if (c != '?')
+ *lastcp |= MAXINF;
+ continue;
+
+ case '(':
+ if (compex->numsubs >= MAXSUB) {
+#ifdef VERBOSE
+ retmes = "Too many parens";
+#endif
+ goto badcomp;
+ }
+ *parenp++ = ++compex->numsubs;
+ *cp++ = LPAR;
+ *cp++ = compex->numsubs;
+ break;
+ case ')':
+ if (parenp <= paren) {
+#ifdef VERBOSE
+ retmes = "Unmatched right paren";
+#endif
+ goto badcomp;
+ }
+ *cp++ = RPAR;
+ *cp++ = *--parenp;
+ break;
+ case '|':
+ if (parenp>paren) {
+#ifdef VERBOSE
+ retmes = "No | in subpattern"; /* Sigh! */
+#endif
+ goto badcomp;
+ }
+ *cp++ = FINIS;
+ if (alt - compex->alternatives >= MAXALT) {
+#ifdef VERBOSE
+ retmes = "Too many alternatives";
+#endif
+ goto badcomp;
+ }
+ *alt++ = cp;
+ break;
+ case '\\': /* backslashed thingie */
+ switch (c = *sp++) {
+ case '0': case '1': case '2': case '3': case '4':
+ case '5': case '6': case '7': case '8': case '9':
+ *cp++ = REF;
+ *cp++ = c - '0';
+ break;
+ case 'w':
+ *cp++ = WORD;
+ break;
+ case 'W':
+ *cp++ = NWORD;
+ break;
+ case 'b':
+ *cp++ = WBOUND;
+ break;
+ case 'B':
+ *cp++ = NWBOUND;
+ break;
+ default:
+ *cp++ = CHAR;
+ if (c == '\0')
+ goto badcomp;
+ switch (c) {
+ case 'n':
+ c = '\n';
+ break;
+ case 'r':
+ c = '\r';
+ break;
+ case 'f':
+ c = '\f';
+ break;
+ case 't':
+ c = '\t';
+ break;
+ }
+ *cp++ = c;
+ break;
+ }
+ break;
+ }
+ }
+badcomp:
+ compex->compbuf[0] = 0;
+ compex->numsubs = 0;
+ return retmes;
+}
+
+void
+grow_comp(compex)
+register COMPEX *compex;
+{
+ compex->complen += 80;
+ compex->compbuf = saferealloc(compex->compbuf, (MEM_SIZE)compex->complen + 4);
+}
+
+char *
+execute(compex, addr, beginning, minend)
+register COMPEX *compex;
+char *addr;
+bool beginning;
+int minend;
+{
+ register char *p1 = addr;
+ register char *trt = trans;
+ register int c;
+ register int scr;
+ register int c2;
+
+ if (addr == Nullch)
+ return Nullch;
+ if (compex->numsubs) { /* any submatches? */
+ for (c = 0; c <= compex->numsubs; c++)
+ compex->subbeg[c] = compex->subend[c] = Nullch;
+ }
+ case_fold(compex->do_folding); /* make sure table is correct */
+ if (beginning)
+ FirstCharacter = p1; /* for ^ tests */
+ else {
+ if (multiline || compex->alternatives[1] || compex->compbuf[0] != BEG)
+ FirstCharacter = Nullch;
+ else
+ return Nullch; /* can't match */
+ }
+ matchend = Nullch;
+ matchtill = addr + minend;
+ err = 0;
+ if (compex->compbuf[0] == CHAR && !compex->alternatives[1]) {
+ if (compex->do_folding) {
+ c = compex->compbuf[1]; /* fast check for first character */
+ do {
+ if (trt[*p1] == c && try(compex, p1, compex->compbuf))
+ goto got_it;
+ } while (*p1++ && !err);
+ }
+ else {
+ c = compex->compbuf[1]; /* faster check for first character */
+ if (compex->compbuf[2] == CHAR)
+ c2 = compex->compbuf[3];
+ else
+ c2 = 0;
+ do {
+ false_alarm:
+ while (scr = *p1++, scr && scr != c) ;
+ if (!scr)
+ break;
+ if (c2 && *p1 != c2) /* and maybe even second character */
+ goto false_alarm;
+ if (try(compex, p1, compex->compbuf+2)) {
+ p1--;
+ goto got_it;
+ }
+ } while (!err);
+ }
+ return Nullch;
+ }
+ else { /* normal algorithm */
+ do {
+ register char **alt = compex->alternatives;
+ while (*alt) {
+ if (try(compex, p1, *alt++))
+ goto got_it;
+ }
+ } while (*p1++ && err < FATAL);
+ return Nullch;
+ }
+
+got_it:
+ if (compex->numsubs) { /* any parens? */
+ trt = savestr(addr); /* in case addr is not static */
+ if (compex->subbase)
+ safefree(compex->subbase); /* (may be freeing addr!) */
+ compex->subbase = trt;
+ scr = compex->subbase - addr;
+ p1 += scr;
+ matchend += scr;
+ for (c = 0; c <= compex->numsubs; c++) {
+ if (compex->subend[c]) {
+ compex->subbeg[c] += scr;
+ compex->subend[c] += scr;
+ }
+ }
+ }
+ compex->subend[0] = matchend;
+ compex->subbeg[0] = p1;
+ return p1;
+}
+
+bool
+try(compex, sp, cp)
+COMPEX *compex;
+register char *cp;
+register char *sp;
+{
+ register char *basesp;
+ register char *trt = trans;
+ register int i;
+ register int backlen;
+ register int code;
+
+ while (*sp || (*cp & MAXINF) || *cp == BEG || *cp == RPAR ||
+ *cp == WBOUND || *cp == NWBOUND) {
+ switch ((code = *cp++) & CODEMASK) {
+
+ case CHAR:
+ basesp = sp;
+ i = *cp++;
+ if (code & MAXINF)
+ while (*sp && trt[*sp] == i) sp++;
+ else
+ if (*sp && trt[*sp] == i) sp++;
+ backlen = 1;
+ goto backoff;
+
+ backoff:
+ while (sp > basesp) {
+ if (try(compex, sp, cp))
+ goto right;
+ sp -= backlen;
+ }
+ if (code & MINZERO)
+ continue;
+ goto wrong;
+
+ case ANY:
+ basesp = sp;
+ if (code & MAXINF)
+ while (*sp && *sp != '\n') sp++;
+ else
+ if (*sp && *sp != '\n') sp++;
+ backlen = 1;
+ goto backoff;
+
+ case CCL:
+ basesp = sp;
+ if (code & MAXINF)
+ while (*sp && cclass(cp, *sp, 1)) sp++;
+ else
+ if (*sp && cclass(cp, *sp, 1)) sp++;
+ cp += BMAPSIZ;
+ backlen = 1;
+ goto backoff;
+
+ case NCCL:
+ basesp = sp;
+ if (code & MAXINF)
+ while (*sp && cclass(cp, *sp, 0)) sp++;
+ else
+ if (*sp && cclass(cp, *sp, 0)) sp++;
+ cp += BMAPSIZ;
+ backlen = 1;
+ goto backoff;
+
+ case END:
+ if (!*sp || *sp == '\n') {
+ matchtill--;
+ continue;
+ }
+ goto wrong;
+
+ case BEG:
+ if (sp == FirstCharacter || (
+ *sp && sp[-1] == '\n') ) {
+ matchtill--;
+ continue;
+ }
+ if (!multiline) /* no point in advancing more */
+ err = BEGFAIL;
+ goto wrong;
+
+ case WORD:
+ basesp = sp;
+ if (code & MAXINF)
+ while (*sp && isalnum(*sp)) sp++;
+ else
+ if (*sp && isalnum(*sp)) sp++;
+ backlen = 1;
+ goto backoff;
+
+ case NWORD:
+ basesp = sp;
+ if (code & MAXINF)
+ while (*sp && !isalnum(*sp)) sp++;
+ else
+ if (*sp && !isalnum(*sp)) sp++;
+ backlen = 1;
+ goto backoff;
+
+ case WBOUND:
+ if ((sp == FirstCharacter || !isalnum(sp[-1])) !=
+ (!*sp || !isalnum(*sp)) )
+ continue;
+ goto wrong;
+
+ case NWBOUND:
+ if ((sp == FirstCharacter || !isalnum(sp[-1])) ==
+ (!*sp || !isalnum(*sp)))
+ continue;
+ goto wrong;
+
+ case FINIS:
+ goto right;
+
+ case LPAR:
+ compex->subbeg[*cp++] = sp;
+ continue;
+
+ case RPAR:
+ i = *cp++;
+ compex->subend[i] = sp;
+ compex->lastparen = i;
+ continue;
+
+ case REF:
+ if (compex->subend[i = *cp++] == 0) {
+ fputs("Bad subpattern reference\n",stdout) FLUSH;
+ err = FATAL;
+ goto wrong;
+ }
+ basesp = sp;
+ backlen = compex->subend[i] - compex->subbeg[i];
+ if (code & MAXINF)
+ while (*sp && subpat(compex, i, sp)) sp += backlen;
+ else
+ if (*sp && subpat(compex, i, sp)) sp += backlen;
+ goto backoff;
+
+ default:
+ fputs("Botched pattern compilation\n",stdout) FLUSH;
+ err = FATAL;
+ return -1;
+ }
+ }
+ if (*cp == FINIS || *cp == END) {
+right:
+ if (matchend == Nullch || sp > matchend)
+ matchend = sp;
+ return matchend >= matchtill;
+ }
+wrong:
+ matchend = Nullch;
+ return FALSE;
+}
+
+bool
+subpat(compex, i, sp)
+register COMPEX *compex;
+register int i;
+register char *sp;
+{
+ register char *bp;
+
+ bp = compex->subbeg[i];
+ while (*sp && *bp == *sp) {
+ bp++;
+ sp++;
+ if (bp >= compex->subend[i])
+ return TRUE;
+ }
+ return FALSE;
+}
+
+bool
+cclass(set, c, af)
+register char *set;
+register int c;
+{
+ c &= 0177;
+#if BITSPERBYTE == 8
+ if (set[c >> 3] & 1 << (c & 7))
+#else
+ if (set[c / BITSPERBYTE] & 1 << (c % BITSPERBYTE))
+#endif
+ return af;
+ return !af;
+}