diff options
author | Perl 5 Porters <perl5-porters@africa.nicoh.com> | 1996-11-26 20:48:00 +1200 |
---|---|---|
committer | Chip Salzenberg <chip@atlantic.net> | 1996-11-26 20:48:00 +1200 |
commit | bbce6d69784bf43b0e69e8d312042d65f258af23 (patch) | |
tree | eb5810e67656c19b6fb34dd0160c9131f24f65d1 /regexec.c | |
parent | 6d82b38436d2a39ffb7413e68ad91495cd645fff (diff) | |
download | perl-bbce6d69784bf43b0e69e8d312042d65f258af23.tar.gz |
[inseparable changes from patch from perl5.003_08 to perl5.003_09]
CORE LANGUAGE CHANGES
Subject: Lexical locales
From: Chip Salzenberg <chip@atlantic.net>
Files: too many to list
make effectiveness of locales depend on C<use locale>
Subject: Lexical scoping cleanup
From: Chip Salzenberg <chip@atlantic.net>
Files: many... but mostly perly.y and toke.c
tighten scoping of lexical variables, somewhat on the
new constructs and somewhat on the old
Subject: memory corruption / security bug in sysread,syswrite + patch
Date: Mon, 25 Nov 1996 21:46:31 +0200 (EET)
From: Jarkko Hietaniemi <jhi@cc.hut.fi>
Files: MANIFEST pod/perldiag.pod pod/perlfunc.pod pp_sys.c t/op/sysio.t
Msg-ID: <199611251946.VAA30459@alpha.hut.fi>
(applied based on p5p patch as commit d7090df90a9cb89c83787d916e40d92a616b146d)
DOCUMENTATION
Subject: perldiag documentation patch.
Date: Wed, 20 Nov 96 16:07:28 GMT
From: Paul Marquess <pmarquess@bfsec.bt.co.uk>
Files: pod/perldiag.pod
private-msgid: <9611201607.AA12729@claudius.bfsec.bt.co.uk>
Subject: a missing perldiag entry
Date: Thu, 21 Nov 1996 15:24:02 -0500
From: Gurusamy Sarathy <gsar@engin.umich.edu>
Files: pod/perldiag.pod
private-msgid: <199611212024.PAA15758@aatma.engin.umich.edu>
Subject: perlfunc patch
Date: Wed, 20 Nov 96 14:04:08 GMT
From: Paul Marquess <pmarquess@bfsec.bt.co.uk>
Files: pod/perlfunc.pod
Following on from the patch to make uc, lc etc default to $_ (as per
Camel II), here is a followup patch to perlfunc that documents the
change. I think I have documented all the other cases where $_
defaulting works as well.
p5p-msgid: <9611201404.AA12477@claudius.bfsec.bt.co.uk>
OTHER CORE CHANGES
Subject: Properly prototype safe{malloc,calloc,realloc,free}.
From: Chip Salzenberg <chip@atlantic.net>
Files: proto.h
Subject: UnixWare 2.1 fix for perl5.003_08 - cope with fp->_cnt < -1, allow debugging
Date: Wed, 20 Nov 1996 14:27:06 +0100
From: John Hughes <john@AtlanTech.COM>
Files: sv.c
UnixWare 2.1 has no fp->_base so most of the debugging stuff in sv_gets just
core dumps.
Also, for some unknown reason fp->_cnt is sometimes < -1, screwing up the
initial SvGROW in svgets.
Appart from that its io is std.
p5p-msgid: <01BBD6EE.E915C860@malvinas.AtlanTech.COM>
Subject: die -> croak
Date: Thu, 21 Nov 1996 16:11:21 -0500
From: Gurusamy Sarathy <gsar@engin.umich.edu>
Files: pp_ctl.c
private-msgid: <199611212111.QAA17070@aatma.engin.umich.edu>
Subject: Cleanup of {,un}pack('w').
From: Chip Salzenberg <chip@atlantic.net>
Files: pp.c
Subject: Cleanups from Ilya.
From: Chip Salzenberg <chip@atlantic.net>
Files: gv.c malloc.c pod/perlguts.pod pp_ctl.c
Subject: Fix for unpack('w') on 64-bit systems.
From: Chip Salzenberg <chip@atlantic.net>
Files: pp.c
Subject: Re: LC_NUMERIC support is ready + performance
Date: Mon, 25 Nov 1996 22:08:27 -0500 (EST)
From: Ilya Zakharevich <ilya@math.ohio-state.edu>
Files: sv.c
Chip Salzenberg writes:
>
> Having thought about the use of our own gcvt() and atof(), I've run
> away in horror. It's just too hairy.
>
> So I've implemented the only viable alternative I know of: Toggling
> LC_NUMERIC to/from "C" as needed.
>
> Patch follows.
>
> I think _09 is *very* close.
Since _09 is going to be alpha anyway, I reiterate my question:
Is there any reason to not include my hash/array performance
patches in _09?
Btw, here is the next performance patch. It makes PADTMP values
stealable too. I do not do by setting TEMP flags on them, since it
would be a very distributed patch, and it would break some places
which check for TEMP for some other reasons (yes, I checked ;-).
This patch decreases *twice* the memory usage of
perl -e '$a = "a" x 1e6; 1'
Enjoy,
p5p-msgid: <199611260308.WAA02677@monk.mps.ohio-state.edu>
Subject: Hash key sharing improvements from Ilya.
From: Chip Salzenberg <chip@atlantic.net>
Files: hv.c hv.h proto.h
Subject: Mortal stack pre-allocation from Ilya.
From: Chip Salzenberg <chip@atlantic.net>
Files: pp.c pp.h pp_ctl.c pp_hot.c pp_sys.c
PORTABILITY
Subject: VMS patches post-5.003_08
Date: Fri, 22 Nov 1996 18:16:31 -0500 (EST)
From: Charles Bailey <bailey@hmivax.humgen.upenn.edu>
Files: lib/ExtUtils/MM_Unix.pm lib/ExtUtils/MM_VMS.pm lib/ExtUtils/MakeMaker.pm lib/File/Path.pm mg.c pp_ctl.c utils/h2xs.PL vms/config.vms vms/descrip.mms vms/gen_shrfls.pl vms/genconfig.pl vms/perlvms.pod vms/vms.c vms/vmsish.h
Here're diffs to bring a base 5.003_08 up to the current VMS working
sources. Nearly all of the changes are VMS-specific, and comprise
miscellaneous bugfixes accumulated since 5.003_07, rather than any
particular problem with 5.003_08. I'm posting them here since some
of the patches change core files, and I'd like to insure that I
haven't accidentally created problems for anyone else.
With these and a couple of of the small patches already send to p5p,
5.003_08 builds clean and passes all tests under VMS.
Thanks, Chip, for all the work.
p5p-msgid: <1996Nov22.181631.1603238@hmivax.humgen.upenn.edu>
Diffstat (limited to 'regexec.c')
-rw-r--r-- | regexec.c | 332 |
1 files changed, 252 insertions, 80 deletions
@@ -147,6 +147,9 @@ regcppop() static I32 regmatch _((char *prog)); static I32 regrepeat _((char *p, I32 max)); static I32 regtry _((regexp *prog, char *startpos)); +static bool reginclass _((char *p, I32 c)); + +static bool regtainted; /* tainted information used? */ /* - pregexec - match a regexp against a string @@ -162,7 +165,6 @@ SV *screamer; I32 safebase; /* no need to remember string in subbase */ { register char *s; - register I32 i; register char *c; register char *startpos = stringarg; register I32 tmp; @@ -192,23 +194,15 @@ I32 safebase; /* no need to remember string in subbase */ if (!multiline && regprev == '\n') regprev = '\0'; /* force ^ to NOT match */ } + regprecomp = prog->precomp; - regnpar = prog->nparens; /* Check validity of program. */ if (UCHARAT(prog->program) != MAGIC) { FAIL("corrupted regexp program"); } - if (prog->do_folding) { - i = strend - startpos; - New(1101,c,i+1,char); - Copy(startpos, c, i+1, char); - startpos = c; - strend = startpos + i; - for (s = startpos; s < strend; s++) - if (isUPPER(*s)) - *s = toLOWER(*s); - } + regnpar = prog->nparens; + regtainted = FALSE; /* If there is a "must appear" string, look for it. */ s = startpos; @@ -281,13 +275,13 @@ I32 safebase; /* no need to remember string in subbase */ if (prog->regstart) { if (prog->reganch & ROPT_SKIP) { /* we have /x+whatever/ */ /* it must be a one character string */ - i = SvPVX(prog->regstart)[0]; + char ch = SvPVX(prog->regstart)[0]; while (s < strend) { - if (*s == i) { + if (*s == ch) { if (regtry(prog, s)) goto got_it; s++; - while (s < strend && *s == i) + while (s < strend && *s == ch) s++; } s++; @@ -327,8 +321,7 @@ I32 safebase; /* no need to remember string in subbase */ case ANYOF: c = OPERAND(c); while (s < strend) { - i = UCHARAT(s); - if (!(c[i >> 3] & (1 << (i&7)))) { + if (reginclass(c, *s)) { if (tmp && regtry(prog, s)) goto got_it; else @@ -339,18 +332,16 @@ I32 safebase; /* no need to remember string in subbase */ s++; } break; + case BOUNDL: + regtainted = TRUE; + /* FALL THROUGH */ case BOUND: if (minlen) dontbother++,strend--; - if (s != startpos) { - i = s[-1]; - tmp = isALNUM(i); - } - else - tmp = isALNUM(regprev); /* assume not alphanumeric */ + tmp = (s != startpos) ? UCHARAT(s - 1) : regprev; + tmp = (OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)); while (s < strend) { - i = *s; - if (tmp != isALNUM(i)) { + if (tmp != (OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) { tmp = !tmp; if (regtry(prog, s)) goto got_it; @@ -360,18 +351,16 @@ I32 safebase; /* no need to remember string in subbase */ if ((minlen || tmp) && regtry(prog,s)) goto got_it; break; + case NBOUNDL: + regtainted = TRUE; + /* FALL THROUGH */ case NBOUND: if (minlen) dontbother++,strend--; - if (s != startpos) { - i = s[-1]; - tmp = isALNUM(i); - } - else - tmp = isALNUM(regprev); /* assume not alphanumeric */ + tmp = (s != startpos) ? UCHARAT(s - 1) : regprev; + tmp = (OP(c) == NBOUND ? isALNUM(tmp) : isALNUM_LC(tmp)); while (s < strend) { - i = *s; - if (tmp != isALNUM(i)) + if (tmp != (OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s))) tmp = !tmp; else if (regtry(prog, s)) goto got_it; @@ -382,8 +371,21 @@ I32 safebase; /* no need to remember string in subbase */ break; case ALNUM: while (s < strend) { - i = *s; - if (isALNUM(i)) { + if (isALNUM(*s)) { + if (tmp && regtry(prog, s)) + goto got_it; + else + tmp = doevery; + } + else + tmp = 1; + s++; + } + break; + case ALNUML: + regtainted = TRUE; + while (s < strend) { + if (isALNUM_LC(*s)) { if (tmp && regtry(prog, s)) goto got_it; else @@ -396,8 +398,21 @@ I32 safebase; /* no need to remember string in subbase */ break; case NALNUM: while (s < strend) { - i = *s; - if (!isALNUM(i)) { + if (!isALNUM(*s)) { + if (tmp && regtry(prog, s)) + goto got_it; + else + tmp = doevery; + } + else + tmp = 1; + s++; + } + break; + case NALNUML: + regtainted = TRUE; + while (s < strend) { + if (!isALNUM_LC(*s)) { if (tmp && regtry(prog, s)) goto got_it; else @@ -421,6 +436,20 @@ I32 safebase; /* no need to remember string in subbase */ s++; } break; + case SPACEL: + regtainted = TRUE; + while (s < strend) { + if (isSPACE_LC(*s)) { + if (tmp && regtry(prog, s)) + goto got_it; + else + tmp = doevery; + } + else + tmp = 1; + s++; + } + break; case NSPACE: while (s < strend) { if (!isSPACE(*s)) { @@ -434,6 +463,20 @@ I32 safebase; /* no need to remember string in subbase */ s++; } break; + case NSPACEL: + regtainted = TRUE; + while (s < strend) { + if (!isSPACE_LC(*s)) { + if (tmp && regtry(prog, s)) + goto got_it; + else + tmp = doevery; + } + else + tmp = 1; + s++; + } + break; case DIGIT: while (s < strend) { if (isDIGIT(*s)) { @@ -480,8 +523,9 @@ got_it: strend += dontbother; /* uncheat */ prog->subbeg = strbeg; prog->subend = strend; - if ((!safebase && (prog->nparens || sawampersand)) || prog->do_folding) { - i = strend - startpos + (stringarg - strbeg); + prog->exec_tainted = regtainted; + if (!safebase && (prog->nparens || sawampersand)) { + I32 i = strend - startpos + (stringarg - strbeg); if (safebase) { /* no need for $digit later */ s = strbeg; prog->subend = s+i; @@ -504,14 +548,10 @@ got_it: prog->endp[i] = s + (prog->endp[i] - startpos); } } - if (prog->do_folding) - Safefree(startpos); } return 1; phooey: - if (prog->do_folding) - Safefree(startpos); return 0; } @@ -576,13 +616,14 @@ char *prog; register I32 ln; /* len or last */ register char *s; /* operand or save */ register char *locinput = reginput; + register I32 c1, c2; /* case fold search */ int minmod = 0; #ifdef DEBUGGING static int regindent = 0; regindent++; #endif - nextchar = *locinput; + nextchar = UCHARAT(locinput); scan = prog; while (scan != NULL) { #ifdef DEBUGGING @@ -653,14 +694,14 @@ char *prog; case SANY: if (!nextchar && locinput >= regeol) sayNO; - nextchar = *++locinput; + nextchar = UCHARAT(++locinput); break; case ANY: if (!nextchar && locinput >= regeol || nextchar == '\n') sayNO; - nextchar = *++locinput; + nextchar = UCHARAT(++locinput); break; - case EXACTLY: + case EXACT: s = OPERAND(scan); ln = *s++; /* Inline the first character, for speed. */ @@ -671,67 +712,111 @@ char *prog; if (ln > 1 && memcmp(s, locinput, ln) != 0) sayNO; locinput += ln; - nextchar = *locinput; + nextchar = UCHARAT(locinput); + break; + case EXACTFL: + regtainted = TRUE; + /* FALL THROUGH */ + case EXACTF: + s = OPERAND(scan); + ln = *s++; + /* Inline the first character, for speed. */ + if (UCHARAT(s) != nextchar && + UCHARAT(s) != ((OP(scan) == EXACTF) + ? fold : fold_locale)[nextchar]) + sayNO; + if (regeol - locinput < ln) + sayNO; + if (ln > 1 && ((OP(scan) == EXACTF) + ? ibcmp : ibcmp_locale)(s, locinput, ln) != 0) + sayNO; + locinput += ln; + nextchar = UCHARAT(locinput); break; case ANYOF: s = OPERAND(scan); if (nextchar < 0) nextchar = UCHARAT(locinput); - if (s[nextchar >> 3] & (1 << (nextchar&7))) + if (!reginclass(s, nextchar)) sayNO; if (!nextchar && locinput >= regeol) sayNO; - nextchar = *++locinput; + nextchar = UCHARAT(++locinput); break; + case ALNUML: + regtainted = TRUE; + /* FALL THROUGH */ case ALNUM: if (!nextchar) sayNO; - if (!isALNUM(nextchar)) + if (!(OP(scan) == ALNUM + ? isALNUM(nextchar) : isALNUM_LC(nextchar))) sayNO; - nextchar = *++locinput; + nextchar = UCHARAT(++locinput); break; + case NALNUML: + regtainted = TRUE; + /* FALL THROUGH */ case NALNUM: if (!nextchar && locinput >= regeol) sayNO; - if (isALNUM(nextchar)) + if (OP(scan) == NALNUM + ? isALNUM(nextchar) : isALNUM_LC(nextchar)) sayNO; - nextchar = *++locinput; + nextchar = UCHARAT(++locinput); break; - case NBOUND: + case BOUNDL: + case NBOUNDL: + regtainted = TRUE; + /* FALL THROUGH */ case BOUND: - if (locinput == regbol) /* was last char in word? */ - ln = isALNUM(regprev); - else - ln = isALNUM(locinput[-1]); - n = isALNUM(nextchar); /* is next char in word? */ - if ((ln == n) == (OP(scan) == BOUND)) + case NBOUND: + /* was last char in word? */ + ln = (locinput != regbol) ? UCHARAT(locinput - 1) : regprev; + if (OP(scan) == BOUND || OP(scan) == NBOUND) { + ln = isALNUM(ln); + n = isALNUM(nextchar); + } + else { + ln = isALNUM_LC(ln); + n = isALNUM_LC(nextchar); + } + if ((ln == n) == (OP(scan) == BOUND || OP(scan) == BOUNDL)) sayNO; break; + case SPACEL: + regtainted = TRUE; + /* FALL THROUGH */ case SPACE: if (!nextchar && locinput >= regeol) sayNO; - if (!isSPACE(nextchar)) + if (!(OP(scan) == SPACE + ? isSPACE(nextchar) : isSPACE_LC(nextchar))) sayNO; - nextchar = *++locinput; + nextchar = UCHARAT(++locinput); break; + case NSPACEL: + regtainted = TRUE; + /* FALL THROUGH */ case NSPACE: if (!nextchar) sayNO; - if (isSPACE(nextchar)) + if (OP(scan) == SPACE + ? isSPACE(nextchar) : isSPACE_LC(nextchar)) sayNO; - nextchar = *++locinput; + nextchar = UCHARAT(++locinput); break; case DIGIT: if (!isDIGIT(nextchar)) sayNO; - nextchar = *++locinput; + nextchar = UCHARAT(++locinput); break; case NDIGIT: if (!nextchar && locinput >= regeol) sayNO; if (isDIGIT(nextchar)) sayNO; - nextchar = *++locinput; + nextchar = UCHARAT(++locinput); break; case REF: n = ARG1(scan); /* which paren pair */ @@ -751,7 +836,7 @@ char *prog; if (ln > 1 && memcmp(s, locinput, ln) != 0) sayNO; locinput += ln; - nextchar = *locinput; + nextchar = UCHARAT(locinput); break; case NOTHING: @@ -929,10 +1014,17 @@ char *prog; n = 32767; scan = NEXTOPER(scan); repeat: - if (OP(next) == EXACTLY) - nextchar = *(OPERAND(next)+1); + if (regkind[(U8)OP(next)] == EXACT) { + c1 = UCHARAT(OPERAND(next) + 1); + if (OP(next) == EXACTF) + c2 = fold[c1]; + else if (OP(next) == EXACTFL) + c2 = fold_locale[c1]; + else + c2 = c1; + } else - nextchar = -1000; + c1 = c2 = -1000; reginput = locinput; if (minmod) { minmod = 0; @@ -940,9 +1032,13 @@ char *prog; sayNO; while (n >= ln || (n == 32767 && ln > 0)) { /* ln overflow ? */ /* If it could work, try it. */ - if (nextchar == -1000 || *reginput == nextchar) + if (c1 == -1000 || + UCHARAT(reginput) == c1 || + UCHARAT(reginput) == c2) + { if (regmatch(next)) sayYES; + } /* Couldn't or didn't -- back up. */ reginput = locinput + ln; if (regrepeat(scan, 1)) { @@ -960,9 +1056,13 @@ char *prog; ln = n; /* why back off? */ while (n >= ln) { /* If it could work, try it. */ - if (nextchar == -1000 || *reginput == nextchar) + if (c1 == -1000 || + UCHARAT(reginput) == c1 || + UCHARAT(reginput) == c2) + { if (regmatch(next)) sayYES; + } /* Couldn't or didn't -- back up. */ n--; reginput = locinput + n; @@ -1043,34 +1143,64 @@ I32 max; case SANY: scan = loceol; break; - case EXACTLY: /* length of string is 1 */ - opnd++; - while (scan < loceol && *opnd == *scan) + case EXACT: /* length of string is 1 */ + c = UCHARAT(++opnd); + while (scan < loceol && UCHARAT(scan) == c) + scan++; + break; + case EXACTF: /* length of string is 1 */ + c = UCHARAT(++opnd); + while (scan < loceol && + (UCHARAT(scan) == c || UCHARAT(scan) == fold[c])) + scan++; + break; + case EXACTFL: /* length of string is 1 */ + regtainted = TRUE; + c = UCHARAT(++opnd); + while (scan < loceol && + (UCHARAT(scan) == c || UCHARAT(scan) == fold_locale[c])) scan++; break; case ANYOF: - c = UCHARAT(scan); - while (scan < loceol && !(opnd[c >> 3] & (1 << (c & 7)))) { + while (scan < loceol && reginclass(opnd, *scan)) scan++; - c = UCHARAT(scan); - } break; case ALNUM: while (scan < loceol && isALNUM(*scan)) scan++; break; + case ALNUML: + regtainted = TRUE; + while (scan < loceol && isALNUM_LC(*scan)) + scan++; + break; case NALNUM: while (scan < loceol && !isALNUM(*scan)) scan++; break; + case NALNUML: + regtainted = TRUE; + while (scan < loceol && !isALNUM_LC(*scan)) + scan++; + break; case SPACE: while (scan < loceol && isSPACE(*scan)) scan++; break; + case SPACEL: + regtainted = TRUE; + while (scan < loceol && isSPACE_LC(*scan)) + scan++; + break; case NSPACE: while (scan < loceol && !isSPACE(*scan)) scan++; break; + case NSPACEL: + regtainted = TRUE; + while (scan < loceol && !isSPACE_LC(*scan)) + scan++; + break; case DIGIT: while (scan < loceol && isDIGIT(*scan)) scan++; @@ -1090,6 +1220,48 @@ I32 max; } /* + - regclass - determine if a character falls into a character class + */ + +static bool +reginclass(p, c) +register char *p; +register I32 c; +{ + char flags = *p; + bool match = FALSE; + + c &= 0xFF; + if (p[1 + (c >> 3)] & (1 << (c & 7))) + match = TRUE; + else if (flags & ANYOF_FOLD) { + I32 cf; + if (flags & ANYOF_LOCALE) { + regtainted = TRUE; + cf = fold_locale[c]; + } + else + cf = fold[c]; + if (p[1 + (cf >> 3)] & (1 << (cf & 7))) + match = TRUE; + } + + if (!match && (flags & ANYOF_ISA)) { + regtainted = TRUE; + + if (((flags & ANYOF_ALNUML) && isALNUM_LC(c)) || + ((flags & ANYOF_NALNUML) && !isALNUM_LC(c)) || + ((flags & ANYOF_SPACEL) && isSPACE_LC(c)) || + ((flags & ANYOF_NSPACEL) && !isSPACE_LC(c))) + { + match = TRUE; + } + } + + return match ^ ((flags & ANYOF_INVERT) != 0); +} + +/* - regnext - dig the "next" pointer out of a node * * [Note, when REGALIGN is defined there are two places in regmatch() |