summaryrefslogtreecommitdiff
path: root/regexec.c
diff options
context:
space:
mode:
authorPerl 5 Porters <perl5-porters@africa.nicoh.com>1996-11-26 20:48:00 +1200
committerChip Salzenberg <chip@atlantic.net>1996-11-26 20:48:00 +1200
commitbbce6d69784bf43b0e69e8d312042d65f258af23 (patch)
treeeb5810e67656c19b6fb34dd0160c9131f24f65d1 /regexec.c
parent6d82b38436d2a39ffb7413e68ad91495cd645fff (diff)
downloadperl-bbce6d69784bf43b0e69e8d312042d65f258af23.tar.gz
[inseparable changes from patch from perl5.003_08 to perl5.003_09]
CORE LANGUAGE CHANGES Subject: Lexical locales From: Chip Salzenberg <chip@atlantic.net> Files: too many to list make effectiveness of locales depend on C<use locale> Subject: Lexical scoping cleanup From: Chip Salzenberg <chip@atlantic.net> Files: many... but mostly perly.y and toke.c tighten scoping of lexical variables, somewhat on the new constructs and somewhat on the old Subject: memory corruption / security bug in sysread,syswrite + patch Date: Mon, 25 Nov 1996 21:46:31 +0200 (EET) From: Jarkko Hietaniemi <jhi@cc.hut.fi> Files: MANIFEST pod/perldiag.pod pod/perlfunc.pod pp_sys.c t/op/sysio.t Msg-ID: <199611251946.VAA30459@alpha.hut.fi> (applied based on p5p patch as commit d7090df90a9cb89c83787d916e40d92a616b146d) DOCUMENTATION Subject: perldiag documentation patch. Date: Wed, 20 Nov 96 16:07:28 GMT From: Paul Marquess <pmarquess@bfsec.bt.co.uk> Files: pod/perldiag.pod private-msgid: <9611201607.AA12729@claudius.bfsec.bt.co.uk> Subject: a missing perldiag entry Date: Thu, 21 Nov 1996 15:24:02 -0500 From: Gurusamy Sarathy <gsar@engin.umich.edu> Files: pod/perldiag.pod private-msgid: <199611212024.PAA15758@aatma.engin.umich.edu> Subject: perlfunc patch Date: Wed, 20 Nov 96 14:04:08 GMT From: Paul Marquess <pmarquess@bfsec.bt.co.uk> Files: pod/perlfunc.pod Following on from the patch to make uc, lc etc default to $_ (as per Camel II), here is a followup patch to perlfunc that documents the change. I think I have documented all the other cases where $_ defaulting works as well. p5p-msgid: <9611201404.AA12477@claudius.bfsec.bt.co.uk> OTHER CORE CHANGES Subject: Properly prototype safe{malloc,calloc,realloc,free}. From: Chip Salzenberg <chip@atlantic.net> Files: proto.h Subject: UnixWare 2.1 fix for perl5.003_08 - cope with fp->_cnt < -1, allow debugging Date: Wed, 20 Nov 1996 14:27:06 +0100 From: John Hughes <john@AtlanTech.COM> Files: sv.c UnixWare 2.1 has no fp->_base so most of the debugging stuff in sv_gets just core dumps. Also, for some unknown reason fp->_cnt is sometimes < -1, screwing up the initial SvGROW in svgets. Appart from that its io is std. p5p-msgid: <01BBD6EE.E915C860@malvinas.AtlanTech.COM> Subject: die -> croak Date: Thu, 21 Nov 1996 16:11:21 -0500 From: Gurusamy Sarathy <gsar@engin.umich.edu> Files: pp_ctl.c private-msgid: <199611212111.QAA17070@aatma.engin.umich.edu> Subject: Cleanup of {,un}pack('w'). From: Chip Salzenberg <chip@atlantic.net> Files: pp.c Subject: Cleanups from Ilya. From: Chip Salzenberg <chip@atlantic.net> Files: gv.c malloc.c pod/perlguts.pod pp_ctl.c Subject: Fix for unpack('w') on 64-bit systems. From: Chip Salzenberg <chip@atlantic.net> Files: pp.c Subject: Re: LC_NUMERIC support is ready + performance Date: Mon, 25 Nov 1996 22:08:27 -0500 (EST) From: Ilya Zakharevich <ilya@math.ohio-state.edu> Files: sv.c Chip Salzenberg writes: > > Having thought about the use of our own gcvt() and atof(), I've run > away in horror. It's just too hairy. > > So I've implemented the only viable alternative I know of: Toggling > LC_NUMERIC to/from "C" as needed. > > Patch follows. > > I think _09 is *very* close. Since _09 is going to be alpha anyway, I reiterate my question: Is there any reason to not include my hash/array performance patches in _09? Btw, here is the next performance patch. It makes PADTMP values stealable too. I do not do by setting TEMP flags on them, since it would be a very distributed patch, and it would break some places which check for TEMP for some other reasons (yes, I checked ;-). This patch decreases *twice* the memory usage of perl -e '$a = "a" x 1e6; 1' Enjoy, p5p-msgid: <199611260308.WAA02677@monk.mps.ohio-state.edu> Subject: Hash key sharing improvements from Ilya. From: Chip Salzenberg <chip@atlantic.net> Files: hv.c hv.h proto.h Subject: Mortal stack pre-allocation from Ilya. From: Chip Salzenberg <chip@atlantic.net> Files: pp.c pp.h pp_ctl.c pp_hot.c pp_sys.c PORTABILITY Subject: VMS patches post-5.003_08 Date: Fri, 22 Nov 1996 18:16:31 -0500 (EST) From: Charles Bailey <bailey@hmivax.humgen.upenn.edu> Files: lib/ExtUtils/MM_Unix.pm lib/ExtUtils/MM_VMS.pm lib/ExtUtils/MakeMaker.pm lib/File/Path.pm mg.c pp_ctl.c utils/h2xs.PL vms/config.vms vms/descrip.mms vms/gen_shrfls.pl vms/genconfig.pl vms/perlvms.pod vms/vms.c vms/vmsish.h Here're diffs to bring a base 5.003_08 up to the current VMS working sources. Nearly all of the changes are VMS-specific, and comprise miscellaneous bugfixes accumulated since 5.003_07, rather than any particular problem with 5.003_08. I'm posting them here since some of the patches change core files, and I'd like to insure that I haven't accidentally created problems for anyone else. With these and a couple of of the small patches already send to p5p, 5.003_08 builds clean and passes all tests under VMS. Thanks, Chip, for all the work. p5p-msgid: <1996Nov22.181631.1603238@hmivax.humgen.upenn.edu>
Diffstat (limited to 'regexec.c')
-rw-r--r--regexec.c332
1 files changed, 252 insertions, 80 deletions
diff --git a/regexec.c b/regexec.c
index 6e79d6b0f0..d9a893e66a 100644
--- a/regexec.c
+++ b/regexec.c
@@ -147,6 +147,9 @@ regcppop()
static I32 regmatch _((char *prog));
static I32 regrepeat _((char *p, I32 max));
static I32 regtry _((regexp *prog, char *startpos));
+static bool reginclass _((char *p, I32 c));
+
+static bool regtainted; /* tainted information used? */
/*
- pregexec - match a regexp against a string
@@ -162,7 +165,6 @@ SV *screamer;
I32 safebase; /* no need to remember string in subbase */
{
register char *s;
- register I32 i;
register char *c;
register char *startpos = stringarg;
register I32 tmp;
@@ -192,23 +194,15 @@ I32 safebase; /* no need to remember string in subbase */
if (!multiline && regprev == '\n')
regprev = '\0'; /* force ^ to NOT match */
}
+
regprecomp = prog->precomp;
- regnpar = prog->nparens;
/* Check validity of program. */
if (UCHARAT(prog->program) != MAGIC) {
FAIL("corrupted regexp program");
}
- if (prog->do_folding) {
- i = strend - startpos;
- New(1101,c,i+1,char);
- Copy(startpos, c, i+1, char);
- startpos = c;
- strend = startpos + i;
- for (s = startpos; s < strend; s++)
- if (isUPPER(*s))
- *s = toLOWER(*s);
- }
+ regnpar = prog->nparens;
+ regtainted = FALSE;
/* If there is a "must appear" string, look for it. */
s = startpos;
@@ -281,13 +275,13 @@ I32 safebase; /* no need to remember string in subbase */
if (prog->regstart) {
if (prog->reganch & ROPT_SKIP) { /* we have /x+whatever/ */
/* it must be a one character string */
- i = SvPVX(prog->regstart)[0];
+ char ch = SvPVX(prog->regstart)[0];
while (s < strend) {
- if (*s == i) {
+ if (*s == ch) {
if (regtry(prog, s))
goto got_it;
s++;
- while (s < strend && *s == i)
+ while (s < strend && *s == ch)
s++;
}
s++;
@@ -327,8 +321,7 @@ I32 safebase; /* no need to remember string in subbase */
case ANYOF:
c = OPERAND(c);
while (s < strend) {
- i = UCHARAT(s);
- if (!(c[i >> 3] & (1 << (i&7)))) {
+ if (reginclass(c, *s)) {
if (tmp && regtry(prog, s))
goto got_it;
else
@@ -339,18 +332,16 @@ I32 safebase; /* no need to remember string in subbase */
s++;
}
break;
+ case BOUNDL:
+ regtainted = TRUE;
+ /* FALL THROUGH */
case BOUND:
if (minlen)
dontbother++,strend--;
- if (s != startpos) {
- i = s[-1];
- tmp = isALNUM(i);
- }
- else
- tmp = isALNUM(regprev); /* assume not alphanumeric */
+ tmp = (s != startpos) ? UCHARAT(s - 1) : regprev;
+ tmp = (OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp));
while (s < strend) {
- i = *s;
- if (tmp != isALNUM(i)) {
+ if (tmp != (OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
tmp = !tmp;
if (regtry(prog, s))
goto got_it;
@@ -360,18 +351,16 @@ I32 safebase; /* no need to remember string in subbase */
if ((minlen || tmp) && regtry(prog,s))
goto got_it;
break;
+ case NBOUNDL:
+ regtainted = TRUE;
+ /* FALL THROUGH */
case NBOUND:
if (minlen)
dontbother++,strend--;
- if (s != startpos) {
- i = s[-1];
- tmp = isALNUM(i);
- }
- else
- tmp = isALNUM(regprev); /* assume not alphanumeric */
+ tmp = (s != startpos) ? UCHARAT(s - 1) : regprev;
+ tmp = (OP(c) == NBOUND ? isALNUM(tmp) : isALNUM_LC(tmp));
while (s < strend) {
- i = *s;
- if (tmp != isALNUM(i))
+ if (tmp != (OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
tmp = !tmp;
else if (regtry(prog, s))
goto got_it;
@@ -382,8 +371,21 @@ I32 safebase; /* no need to remember string in subbase */
break;
case ALNUM:
while (s < strend) {
- i = *s;
- if (isALNUM(i)) {
+ if (isALNUM(*s)) {
+ if (tmp && regtry(prog, s))
+ goto got_it;
+ else
+ tmp = doevery;
+ }
+ else
+ tmp = 1;
+ s++;
+ }
+ break;
+ case ALNUML:
+ regtainted = TRUE;
+ while (s < strend) {
+ if (isALNUM_LC(*s)) {
if (tmp && regtry(prog, s))
goto got_it;
else
@@ -396,8 +398,21 @@ I32 safebase; /* no need to remember string in subbase */
break;
case NALNUM:
while (s < strend) {
- i = *s;
- if (!isALNUM(i)) {
+ if (!isALNUM(*s)) {
+ if (tmp && regtry(prog, s))
+ goto got_it;
+ else
+ tmp = doevery;
+ }
+ else
+ tmp = 1;
+ s++;
+ }
+ break;
+ case NALNUML:
+ regtainted = TRUE;
+ while (s < strend) {
+ if (!isALNUM_LC(*s)) {
if (tmp && regtry(prog, s))
goto got_it;
else
@@ -421,6 +436,20 @@ I32 safebase; /* no need to remember string in subbase */
s++;
}
break;
+ case SPACEL:
+ regtainted = TRUE;
+ while (s < strend) {
+ if (isSPACE_LC(*s)) {
+ if (tmp && regtry(prog, s))
+ goto got_it;
+ else
+ tmp = doevery;
+ }
+ else
+ tmp = 1;
+ s++;
+ }
+ break;
case NSPACE:
while (s < strend) {
if (!isSPACE(*s)) {
@@ -434,6 +463,20 @@ I32 safebase; /* no need to remember string in subbase */
s++;
}
break;
+ case NSPACEL:
+ regtainted = TRUE;
+ while (s < strend) {
+ if (!isSPACE_LC(*s)) {
+ if (tmp && regtry(prog, s))
+ goto got_it;
+ else
+ tmp = doevery;
+ }
+ else
+ tmp = 1;
+ s++;
+ }
+ break;
case DIGIT:
while (s < strend) {
if (isDIGIT(*s)) {
@@ -480,8 +523,9 @@ got_it:
strend += dontbother; /* uncheat */
prog->subbeg = strbeg;
prog->subend = strend;
- if ((!safebase && (prog->nparens || sawampersand)) || prog->do_folding) {
- i = strend - startpos + (stringarg - strbeg);
+ prog->exec_tainted = regtainted;
+ if (!safebase && (prog->nparens || sawampersand)) {
+ I32 i = strend - startpos + (stringarg - strbeg);
if (safebase) { /* no need for $digit later */
s = strbeg;
prog->subend = s+i;
@@ -504,14 +548,10 @@ got_it:
prog->endp[i] = s + (prog->endp[i] - startpos);
}
}
- if (prog->do_folding)
- Safefree(startpos);
}
return 1;
phooey:
- if (prog->do_folding)
- Safefree(startpos);
return 0;
}
@@ -576,13 +616,14 @@ char *prog;
register I32 ln; /* len or last */
register char *s; /* operand or save */
register char *locinput = reginput;
+ register I32 c1, c2; /* case fold search */
int minmod = 0;
#ifdef DEBUGGING
static int regindent = 0;
regindent++;
#endif
- nextchar = *locinput;
+ nextchar = UCHARAT(locinput);
scan = prog;
while (scan != NULL) {
#ifdef DEBUGGING
@@ -653,14 +694,14 @@ char *prog;
case SANY:
if (!nextchar && locinput >= regeol)
sayNO;
- nextchar = *++locinput;
+ nextchar = UCHARAT(++locinput);
break;
case ANY:
if (!nextchar && locinput >= regeol || nextchar == '\n')
sayNO;
- nextchar = *++locinput;
+ nextchar = UCHARAT(++locinput);
break;
- case EXACTLY:
+ case EXACT:
s = OPERAND(scan);
ln = *s++;
/* Inline the first character, for speed. */
@@ -671,67 +712,111 @@ char *prog;
if (ln > 1 && memcmp(s, locinput, ln) != 0)
sayNO;
locinput += ln;
- nextchar = *locinput;
+ nextchar = UCHARAT(locinput);
+ break;
+ case EXACTFL:
+ regtainted = TRUE;
+ /* FALL THROUGH */
+ case EXACTF:
+ s = OPERAND(scan);
+ ln = *s++;
+ /* Inline the first character, for speed. */
+ if (UCHARAT(s) != nextchar &&
+ UCHARAT(s) != ((OP(scan) == EXACTF)
+ ? fold : fold_locale)[nextchar])
+ sayNO;
+ if (regeol - locinput < ln)
+ sayNO;
+ if (ln > 1 && ((OP(scan) == EXACTF)
+ ? ibcmp : ibcmp_locale)(s, locinput, ln) != 0)
+ sayNO;
+ locinput += ln;
+ nextchar = UCHARAT(locinput);
break;
case ANYOF:
s = OPERAND(scan);
if (nextchar < 0)
nextchar = UCHARAT(locinput);
- if (s[nextchar >> 3] & (1 << (nextchar&7)))
+ if (!reginclass(s, nextchar))
sayNO;
if (!nextchar && locinput >= regeol)
sayNO;
- nextchar = *++locinput;
+ nextchar = UCHARAT(++locinput);
break;
+ case ALNUML:
+ regtainted = TRUE;
+ /* FALL THROUGH */
case ALNUM:
if (!nextchar)
sayNO;
- if (!isALNUM(nextchar))
+ if (!(OP(scan) == ALNUM
+ ? isALNUM(nextchar) : isALNUM_LC(nextchar)))
sayNO;
- nextchar = *++locinput;
+ nextchar = UCHARAT(++locinput);
break;
+ case NALNUML:
+ regtainted = TRUE;
+ /* FALL THROUGH */
case NALNUM:
if (!nextchar && locinput >= regeol)
sayNO;
- if (isALNUM(nextchar))
+ if (OP(scan) == NALNUM
+ ? isALNUM(nextchar) : isALNUM_LC(nextchar))
sayNO;
- nextchar = *++locinput;
+ nextchar = UCHARAT(++locinput);
break;
- case NBOUND:
+ case BOUNDL:
+ case NBOUNDL:
+ regtainted = TRUE;
+ /* FALL THROUGH */
case BOUND:
- if (locinput == regbol) /* was last char in word? */
- ln = isALNUM(regprev);
- else
- ln = isALNUM(locinput[-1]);
- n = isALNUM(nextchar); /* is next char in word? */
- if ((ln == n) == (OP(scan) == BOUND))
+ case NBOUND:
+ /* was last char in word? */
+ ln = (locinput != regbol) ? UCHARAT(locinput - 1) : regprev;
+ if (OP(scan) == BOUND || OP(scan) == NBOUND) {
+ ln = isALNUM(ln);
+ n = isALNUM(nextchar);
+ }
+ else {
+ ln = isALNUM_LC(ln);
+ n = isALNUM_LC(nextchar);
+ }
+ if ((ln == n) == (OP(scan) == BOUND || OP(scan) == BOUNDL))
sayNO;
break;
+ case SPACEL:
+ regtainted = TRUE;
+ /* FALL THROUGH */
case SPACE:
if (!nextchar && locinput >= regeol)
sayNO;
- if (!isSPACE(nextchar))
+ if (!(OP(scan) == SPACE
+ ? isSPACE(nextchar) : isSPACE_LC(nextchar)))
sayNO;
- nextchar = *++locinput;
+ nextchar = UCHARAT(++locinput);
break;
+ case NSPACEL:
+ regtainted = TRUE;
+ /* FALL THROUGH */
case NSPACE:
if (!nextchar)
sayNO;
- if (isSPACE(nextchar))
+ if (OP(scan) == SPACE
+ ? isSPACE(nextchar) : isSPACE_LC(nextchar))
sayNO;
- nextchar = *++locinput;
+ nextchar = UCHARAT(++locinput);
break;
case DIGIT:
if (!isDIGIT(nextchar))
sayNO;
- nextchar = *++locinput;
+ nextchar = UCHARAT(++locinput);
break;
case NDIGIT:
if (!nextchar && locinput >= regeol)
sayNO;
if (isDIGIT(nextchar))
sayNO;
- nextchar = *++locinput;
+ nextchar = UCHARAT(++locinput);
break;
case REF:
n = ARG1(scan); /* which paren pair */
@@ -751,7 +836,7 @@ char *prog;
if (ln > 1 && memcmp(s, locinput, ln) != 0)
sayNO;
locinput += ln;
- nextchar = *locinput;
+ nextchar = UCHARAT(locinput);
break;
case NOTHING:
@@ -929,10 +1014,17 @@ char *prog;
n = 32767;
scan = NEXTOPER(scan);
repeat:
- if (OP(next) == EXACTLY)
- nextchar = *(OPERAND(next)+1);
+ if (regkind[(U8)OP(next)] == EXACT) {
+ c1 = UCHARAT(OPERAND(next) + 1);
+ if (OP(next) == EXACTF)
+ c2 = fold[c1];
+ else if (OP(next) == EXACTFL)
+ c2 = fold_locale[c1];
+ else
+ c2 = c1;
+ }
else
- nextchar = -1000;
+ c1 = c2 = -1000;
reginput = locinput;
if (minmod) {
minmod = 0;
@@ -940,9 +1032,13 @@ char *prog;
sayNO;
while (n >= ln || (n == 32767 && ln > 0)) { /* ln overflow ? */
/* If it could work, try it. */
- if (nextchar == -1000 || *reginput == nextchar)
+ if (c1 == -1000 ||
+ UCHARAT(reginput) == c1 ||
+ UCHARAT(reginput) == c2)
+ {
if (regmatch(next))
sayYES;
+ }
/* Couldn't or didn't -- back up. */
reginput = locinput + ln;
if (regrepeat(scan, 1)) {
@@ -960,9 +1056,13 @@ char *prog;
ln = n; /* why back off? */
while (n >= ln) {
/* If it could work, try it. */
- if (nextchar == -1000 || *reginput == nextchar)
+ if (c1 == -1000 ||
+ UCHARAT(reginput) == c1 ||
+ UCHARAT(reginput) == c2)
+ {
if (regmatch(next))
sayYES;
+ }
/* Couldn't or didn't -- back up. */
n--;
reginput = locinput + n;
@@ -1043,34 +1143,64 @@ I32 max;
case SANY:
scan = loceol;
break;
- case EXACTLY: /* length of string is 1 */
- opnd++;
- while (scan < loceol && *opnd == *scan)
+ case EXACT: /* length of string is 1 */
+ c = UCHARAT(++opnd);
+ while (scan < loceol && UCHARAT(scan) == c)
+ scan++;
+ break;
+ case EXACTF: /* length of string is 1 */
+ c = UCHARAT(++opnd);
+ while (scan < loceol &&
+ (UCHARAT(scan) == c || UCHARAT(scan) == fold[c]))
+ scan++;
+ break;
+ case EXACTFL: /* length of string is 1 */
+ regtainted = TRUE;
+ c = UCHARAT(++opnd);
+ while (scan < loceol &&
+ (UCHARAT(scan) == c || UCHARAT(scan) == fold_locale[c]))
scan++;
break;
case ANYOF:
- c = UCHARAT(scan);
- while (scan < loceol && !(opnd[c >> 3] & (1 << (c & 7)))) {
+ while (scan < loceol && reginclass(opnd, *scan))
scan++;
- c = UCHARAT(scan);
- }
break;
case ALNUM:
while (scan < loceol && isALNUM(*scan))
scan++;
break;
+ case ALNUML:
+ regtainted = TRUE;
+ while (scan < loceol && isALNUM_LC(*scan))
+ scan++;
+ break;
case NALNUM:
while (scan < loceol && !isALNUM(*scan))
scan++;
break;
+ case NALNUML:
+ regtainted = TRUE;
+ while (scan < loceol && !isALNUM_LC(*scan))
+ scan++;
+ break;
case SPACE:
while (scan < loceol && isSPACE(*scan))
scan++;
break;
+ case SPACEL:
+ regtainted = TRUE;
+ while (scan < loceol && isSPACE_LC(*scan))
+ scan++;
+ break;
case NSPACE:
while (scan < loceol && !isSPACE(*scan))
scan++;
break;
+ case NSPACEL:
+ regtainted = TRUE;
+ while (scan < loceol && !isSPACE_LC(*scan))
+ scan++;
+ break;
case DIGIT:
while (scan < loceol && isDIGIT(*scan))
scan++;
@@ -1090,6 +1220,48 @@ I32 max;
}
/*
+ - regclass - determine if a character falls into a character class
+ */
+
+static bool
+reginclass(p, c)
+register char *p;
+register I32 c;
+{
+ char flags = *p;
+ bool match = FALSE;
+
+ c &= 0xFF;
+ if (p[1 + (c >> 3)] & (1 << (c & 7)))
+ match = TRUE;
+ else if (flags & ANYOF_FOLD) {
+ I32 cf;
+ if (flags & ANYOF_LOCALE) {
+ regtainted = TRUE;
+ cf = fold_locale[c];
+ }
+ else
+ cf = fold[c];
+ if (p[1 + (cf >> 3)] & (1 << (cf & 7)))
+ match = TRUE;
+ }
+
+ if (!match && (flags & ANYOF_ISA)) {
+ regtainted = TRUE;
+
+ if (((flags & ANYOF_ALNUML) && isALNUM_LC(c)) ||
+ ((flags & ANYOF_NALNUML) && !isALNUM_LC(c)) ||
+ ((flags & ANYOF_SPACEL) && isSPACE_LC(c)) ||
+ ((flags & ANYOF_NSPACEL) && !isSPACE_LC(c)))
+ {
+ match = TRUE;
+ }
+ }
+
+ return match ^ ((flags & ANYOF_INVERT) != 0);
+}
+
+/*
- regnext - dig the "next" pointer out of a node
*
* [Note, when REGALIGN is defined there are two places in regmatch()