summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorYves Orton <demerphq@gmail.com>2006-11-15 13:29:39 +0100
committerRafael Garcia-Suarez <rgarciasuarez@gmail.com>2006-11-15 12:41:24 +0000
commitc74340f9cdee6010339b6bfd0e8b0dc8bc875344 (patch)
tree461d7dee65931c649dec8616b2a6547652ba3777
parentf81333e0586497e8dadbe01b840e0be9ee8313ee (diff)
downloadperl-c74340f9cdee6010339b6bfd0e8b0dc8bc875344.tar.gz
Re: [PATCH] Fix RT#19049 and add relative backreferences
Message-ID: <9b18b3110611150329l206e4552w887ae5f0a3f7ca80@mail.gmail.com> p4raw-id: //depot/perl@29279
-rw-r--r--ext/re/re.xs2
-rw-r--r--pod/perl595delta.pod8
-rw-r--r--pod/perldiag.pod10
-rw-r--r--pod/perlre.pod17
-rw-r--r--pod/perlreguts.pod11
-rw-r--r--regcomp.c50
-rw-r--r--regexec.c39
-rw-r--r--regexp.h7
-rwxr-xr-xt/op/pat.t24
-rw-r--r--t/op/re_tests5
10 files changed, 153 insertions, 20 deletions
diff --git a/ext/re/re.xs b/ext/re/re.xs
index b82062a8b0..13dcdc268d 100644
--- a/ext/re/re.xs
+++ b/ext/re/re.xs
@@ -19,7 +19,7 @@ extern char* my_re_intuit_start (pTHX_ regexp *prog, SV *sv, char *strpos,
char *strend, U32 flags,
struct re_scream_pos_data_s *data);
extern SV* my_re_intuit_string (pTHX_ regexp *prog);
-extern char* my_reg_stringify (pTHX_ MAGIC *mg, U32 *flags, STRLEN *lp, I32 *haseval);
+extern char* my_reg_stringify (pTHX_ MAGIC *mg, STRLEN *lp, U32 *flags, I32 *haseval);
#if defined(USE_ITHREADS)
extern regexp* my_regdupe (pTHX_ const regexp *r, CLONE_PARAMS *param);
diff --git a/pod/perl595delta.pod b/pod/perl595delta.pod
index af76cf68ee..717540cb22 100644
--- a/pod/perl595delta.pod
+++ b/pod/perl595delta.pod
@@ -113,7 +113,13 @@ quantifiers. (Yves Orton)
The regex engine now supports a number of special purpose backtrack
control verbs: (*THEN), (*PRUNE), (*MARK), (*SKIP), (*COMMIT), (*FAIL)
-and (*ACCEPT). See L<perlre> for their descriptions.
+and (*ACCEPT). See L<perlre> for their descriptions. (Yves Orton)
+
+=item Relative backreferences
+
+A new syntax C<\R1> ("1" being any positive decimal integer) allows
+relative backreferencing. This should make it easier to embed patterns
+that contain backreferences. (Yves Orton)
=back
diff --git a/pod/perldiag.pod b/pod/perldiag.pod
index e9d23267bd..e6a8b0f6dd 100644
--- a/pod/perldiag.pod
+++ b/pod/perldiag.pod
@@ -3495,6 +3495,16 @@ prepend a zero to make the number at least two digits: C<\07>
The <-- HERE shows in the regular expression about where the problem was
discovered.
+=item Reference to nonexistent or unclosed group in regex; marked by <-- HERE in m/%s/
+
+(F) You used something like C<\R7> in your regular expression, but there are
+not at least seven sets of closed capturing parentheses in the expression before
+where the C<\R7> was located. It's also possible you forgot to escape the
+backslash.
+
+The <-- HERE shows in the regular expression about where the problem was
+discovered.
+
=item Reference to nonexistent named group in regex; marked by <-- HERE in m/%s/
(F) You used something like C<\k'NAME'> or C<< \k<NAME> >> in your regular
diff --git a/pod/perlre.pod b/pod/perlre.pod
index c2b968062b..7df564738e 100644
--- a/pod/perlre.pod
+++ b/pod/perlre.pod
@@ -246,7 +246,9 @@ X<word> X<whitespace>
so you may end up with malformed pieces of UTF-8.
Unsupported in lookbehind.
\1 Backreference to a specific group.
- '1' may actually be any positive integer.
+ '1' may actually be any positive integer.
+ \R1 Relative backreference to a preceding closed group.
+ '1' may actually be any positive integer.
\k<name> Named backreference
\N{name} Named unicode character, or unicode escape
\x12 Hexadecimal escape sequence
@@ -469,7 +471,15 @@ ambiguity by interpreting \10 as a backreference only if at least 10
left parentheses have opened before it. Likewise \11 is a
backreference only if at least 11 left parentheses have opened
before it. And so on. \1 through \9 are always interpreted as
-backreferences.
+backreferences.
+
+X<relative backreference>
+In Perl 5.10 it is possible to relatively address a capture buffer by
+using the C<\RNNN> notation, where C<NNN> is negative offset to a
+preceding completed capture buffer. Thus C<\R1> refers to the last
+buffer closed, C<\R2> refers to the buffer before that, and so on. Note
+especially that C</(foo)(\R1)/> refers to the capture buffer containing
+C<foo>, not to the buffer containing C<\R1>.
Additionally, as of Perl 5.10 you may use named capture buffers and named
backreferences. The notation is C<< (?<name>...) >> and C<< \k<name> >>
@@ -884,6 +894,9 @@ C<(?R)>. If PARNO is preceded by a plus or minus sign then it is assumed
to be relative, with negative numbers indicating preceding capture buffers
and positive ones following. Thus C<(?-1)> refers to the most recently
declared buffer, and C<(?+1)> indicates the next buffer to be declared.
+Note that the counting for relative recursion differs from that of
+relative backreferences, in that with recursion unclosed buffers B<are>
+included.
The following pattern matches a function foo() which may contain
balanced parentheses as the argument.
diff --git a/pod/perlreguts.pod b/pod/perlreguts.pod
index 937565745c..aa54bfcb8f 100644
--- a/pod/perlreguts.pod
+++ b/pod/perlreguts.pod
@@ -747,6 +747,7 @@ F<regexp.h> contains the base structure definition:
typedef struct regexp {
I32 *startp;
I32 *endp;
+ regexp_paren_ofs *swap;
regnode *regstclass;
struct reg_substr_data *substrs;
char *precomp; /* pre-compilation regular expression */
@@ -802,11 +803,19 @@ These fields are used to keep track of how many paren groups could be matched
in the pattern, which was the last open paren to be entered, and which was
the last close paren to be entered.
-=item C<startp>, C<endp>
+=item C<startp>, C<endp>, C<swap>
These fields store arrays that are used to hold the offsets of the begining
and end of each capture group that has matched. -1 is used to indicate no match.
+C<swap> is an extra set of startp/endp stored in a C<regexp_paren_ofs>
+struct. This is used when the last successful match was from same pattern
+as the current pattern, so that a partial match doesn't overwrite the
+previous match's results. When this field is data filled the matching
+engine will swap buffers before every match attempt. If the match fails,
+then it swaps them back. If it's successful it leaves them. This field
+is populated on demand and is by default null.
+
These are the source for @- and @+.
=item C<subbeg> C<sublen> C<saved_copy>
diff --git a/regcomp.c b/regcomp.c
index 9099194b07..6d916f1ff1 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -113,7 +113,8 @@ typedef struct RExC_state_t {
I32 sawback; /* Did we see \1, ...? */
U32 seen;
I32 size; /* Code size. */
- I32 npar; /* () count. */
+ I32 npar; /* Capture buffer count, (OPEN). */
+ I32 cpar; /* Capture buffer count, (CLOSE). */
I32 nestroot; /* root parens we are in - used by accept */
I32 extralen;
I32 seen_zerolen;
@@ -153,6 +154,7 @@ typedef struct RExC_state_t {
#define RExC_seen (pRExC_state->seen)
#define RExC_size (pRExC_state->size)
#define RExC_npar (pRExC_state->npar)
+#define RExC_cpar (pRExC_state->cpar)
#define RExC_nestroot (pRExC_state->nestroot)
#define RExC_extralen (pRExC_state->extralen)
#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
@@ -3943,6 +3945,7 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
RExC_end = xend;
RExC_naughty = 0;
RExC_npar = 1;
+ RExC_cpar = 1;
RExC_nestroot = 0;
RExC_size = 0L;
RExC_emit = &PL_regdummy;
@@ -4013,6 +4016,7 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
r->substrs = 0; /* Useful during FAIL. */
r->startp = 0; /* Useful during FAIL. */
r->endp = 0;
+ r->swap = NULL;
r->paren_names = 0;
if (RExC_seen & REG_SEEN_RECURSE) {
@@ -4040,6 +4044,7 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
RExC_end = xend;
RExC_naughty = 0;
RExC_npar = 1;
+ RExC_cpar = 1;
RExC_emit_start = r->program;
RExC_emit = r->program;
#ifdef DEBUGGING
@@ -4482,7 +4487,8 @@ reStudy:
}
Newxz(r->startp, RExC_npar, I32);
Newxz(r->endp, RExC_npar, I32);
-
+ /* assume we don't need to swap parens around before we match */
+
DEBUG_DUMP_r({
PerlIO_printf(Perl_debug_log,"Final program:\n");
regdump(r);
@@ -5326,6 +5332,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
ender = reg_node(pRExC_state, TAIL);
break;
case 1:
+ RExC_cpar++;
ender = reganode(pRExC_state, CLOSE, parno);
if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
@@ -6270,11 +6277,20 @@ tryagain:
case 'c':
case '0':
goto defchar;
+ case 'R':
case '1': case '2': case '3': case '4':
case '5': case '6': case '7': case '8': case '9':
{
- const I32 num = atoi(RExC_parse);
-
+ I32 num;
+ bool isrel=(*RExC_parse=='R');
+ if (isrel)
+ RExC_parse++;
+ num = atoi(RExC_parse);
+ if (isrel) {
+ num = RExC_cpar - num;
+ if (num < 1)
+ vFAIL("Reference to nonexistent or unclosed group");
+ }
if (num > 9 && num >= RExC_npar)
goto defchar;
else {
@@ -6282,8 +6298,16 @@ tryagain:
while (isDIGIT(*RExC_parse))
RExC_parse++;
- if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
- vFAIL("Reference to nonexistent group");
+ if (!SIZE_ONLY) {
+ if (num > (I32)RExC_rx->nparens)
+ vFAIL("Reference to nonexistent group");
+ /* People make this error all the time apparently.
+ So we cant fail on it, even though we should
+
+ else if (num >= RExC_cpar)
+ vFAIL("Reference to unclosed group will always match");
+ */
+ }
RExC_sawback = 1;
ret = reganode(pRExC_state,
(U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
@@ -6372,6 +6396,7 @@ tryagain:
case 'p':
case 'P':
case 'N':
+ case 'R':
--p;
goto loopdone;
case 'n':
@@ -8502,6 +8527,11 @@ Perl_pregfree(pTHX_ struct regexp *r)
}
Safefree(r->startp);
Safefree(r->endp);
+ if (r->swap) {
+ Safefree(r->swap->startp);
+ Safefree(r->swap->endp);
+ Safefree(r->swap);
+ }
Safefree(r);
}
@@ -8544,6 +8574,14 @@ Perl_regdupe(pTHX_ const regexp *r, CLONE_PARAMS *param)
Copy(r->startp, ret->startp, npar, I32);
Newx(ret->endp, npar, I32);
Copy(r->startp, ret->startp, npar, I32);
+ if(r->swap) {
+ Newx(ret->swap, 1, regexp_paren_ofs);
+ /* no need to copy these */
+ Newx(ret->swap->startp, npar, I32);
+ Newx(ret->swap->endp, npar, I32);
+ } else {
+ ret->swap = NULL;
+ }
Newx(ret->substrs, 1, struct reg_substr_data);
for (s = ret->substrs->data, i = 0; i < 3; i++, s++) {
diff --git a/regexec.c b/regexec.c
index a0637a822f..d547ff71a2 100644
--- a/regexec.c
+++ b/regexec.c
@@ -1726,7 +1726,28 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *
else /* pos() not defined */
reginfo.ganch = strbeg;
}
-
+ if (PL_curpm && (PM_GETRE(PL_curpm) == prog)) {
+ I32 *t;
+ if (!prog->swap) {
+ /* We have to be careful. If the previous successful match
+ was from this regex we don't want a subsequent paritally
+ successful match to clobber the old results.
+ So when we detect this possibility we add a swap buffer
+ to the re, and switch the buffer each match. If we fail
+ we switch it back, otherwise we leave it swapped.
+ */
+ Newxz(prog->swap, 1, regexp_paren_ofs);
+ /* no need to copy these */
+ Newxz(prog->swap->startp, prog->nparens + 1, I32);
+ Newxz(prog->swap->endp, prog->nparens + 1, I32);
+ }
+ t = prog->swap->startp;
+ prog->swap->startp = prog->startp;
+ prog->startp = t;
+ t = prog->swap->endp;
+ prog->swap->endp = prog->endp;
+ prog->endp = t;
+ }
if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
re_scream_pos_data d;
@@ -2074,6 +2095,16 @@ phooey:
PL_colors[4], PL_colors[5]));
if (PL_reg_eval_set)
restore_pos(aTHX_ prog);
+ if (prog->swap) {
+ /* we failed :-( roll it back */
+ I32 *t;
+ t = prog->swap->startp;
+ prog->swap->startp = prog->startp;
+ prog->startp = t;
+ t = prog->swap->endp;
+ prog->swap->endp = prog->endp;
+ prog->endp = t;
+ }
return 0;
}
@@ -2195,8 +2226,8 @@ S_regtry(pTHX_ regmatch_info *reginfo, char **startpos)
* on those tests seems to be returning null fields from matches.
* --jhi */
#if 1
- sp = prog->startp;
- ep = prog->endp;
+ sp = PL_regstartp;
+ ep = PL_regendp;
if (prog->nparens) {
register I32 i;
for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
@@ -2207,7 +2238,7 @@ S_regtry(pTHX_ regmatch_info *reginfo, char **startpos)
#endif
REGCP_SET(lastcp);
if (regmatch(reginfo, prog->program + 1)) {
- prog->endp[0] = PL_reginput - PL_bostr;
+ PL_regendp[0] = PL_reginput - PL_bostr;
return 1;
}
if (reginfo->cutpoint)
diff --git a/regexp.h b/regexp.h
index 8d08682290..d59fa832b2 100644
--- a/regexp.h
+++ b/regexp.h
@@ -31,10 +31,15 @@ struct reg_substr_data;
struct reg_data;
struct regexp_engine;
+typedef struct regexp_paren_ofs {
+ I32 *startp;
+ I32 *endp;
+} regexp_paren_ofs;
typedef struct regexp {
- I32 *startp;
+ I32 *startp;
I32 *endp;
+ regexp_paren_ofs *swap;
regnode *regstclass;
struct reg_substr_data *substrs;
char *precomp; /* pre-compilation regular expression */
diff --git a/t/op/pat.t b/t/op/pat.t
index 333165d185..358fbb08bc 100755
--- a/t/op/pat.t
+++ b/t/op/pat.t
@@ -3216,10 +3216,10 @@ $_ = "x"; s/x/func "in multiline subst"/em;
#$_ = "x"; /x(?{func "in regexp"})/;
#$_ = "x"; /x(?{func "in multiline regexp"})/m;
-# bug #19049
+# bug RT#19049
$_="abcdef\n";
@x = m/./g;
-ok("abcde" eq "$`", '# TODO #19049 - global match not setting $`');
+ok("abcde" eq "$`", 'RT#19049 - global match not setting $`');
ok("123\x{100}" =~ /^.*1.*23\x{100}$/, 'uft8 + multiple floating substr');
@@ -4011,6 +4011,24 @@ for my $c ("z", "\0", "!", chr(254), chr(256)) {
#-------------------------------------------------------------------
# Keep the following tests last -- they may crash perl
+{
+ # RT#19049 / RT#38869
+ my @list = (
+ 'ab cdef', # matches regex
+ ( 'e' x 40000 ) .'ab c' # matches not, but 'ab c' matches part of it
+ );
+ my $y;
+ my $x;
+ foreach (@list) {
+ m/ab(.+)cd/i; # the ignore-case seems to be important
+ $y = $1; # use $1, which might not be from the last match!
+ $x = substr($list[0],$-[0],$+[0]-$-[0]);
+ }
+ iseq($y,' ',
+ 'pattern in a loop, failure should not affect previous success');
+ iseq($x,'ab cd',
+ 'pattern in a loop, failure should not affect previous success');
+}
ok(("a" x (2**15 - 10)) =~ /^()(a|bb)*$/, "Recursive stack cracker: #24274")
or print "# Unexpected outcome: should pass or crash perl\n";
@@ -4034,4 +4052,4 @@ ok((q(a)x 100) =~ /^(??{'(.)'x 100})/,
# Put new tests above the line, not here.
# Don't forget to update this!
-BEGIN { print "1..1345\n" };
+BEGIN { print "1..1347\n" };
diff --git a/t/op/re_tests b/t/op/re_tests
index 078caa94be..4279dd6843 100644
--- a/t/op/re_tests
+++ b/t/op/re_tests
@@ -1187,5 +1187,8 @@ a*(*F) aaaab n - -
(A(A|B(*ACCEPT)|C)D)(E) ACDE y $1$2$3 ACDCE
(a)(?:(?-1)|(?+1))(b) aab y $&-$1-$2 aab-a-b
-(a)(?:(?-1)|(?+1))(b) abb y $&-$1-$2 abb-a-b
+(a)(?:(?-1)|(?+1))(b) abb y $1-$2 a-b
(a)(?:(?-1)|(?+1))(b) acb n - -
+
+(foo)(\R1) foofoo y $1-$2 foo-foo
+(foo)(\R1)(foo)(\R1) foofoofoofoo y $1-$2-$3-$4 foo-foo-foo-foo