Re: [PATCH] Fix RT#19049 and add relative backreferences

Message-ID: <9b18b3110611150329l206e4552w887ae5f0a3f7ca80@mail.gmail.com> p4raw-id: //depot/perl@29279
author: Yves Orton <demerphq@gmail.com> 2006-11-15 13:29:39 +0100
committer: Rafael Garcia-Suarez <rgarciasuarez@gmail.com> 2006-11-15 12:41:24 +0000
commit: c74340f9cdee6010339b6bfd0e8b0dc8bc875344 (patch)
tree: 461d7dee65931c649dec8616b2a6547652ba3777
parent: f81333e0586497e8dadbe01b840e0be9ee8313ee (diff)
download: perl-c74340f9cdee6010339b6bfd0e8b0dc8bc875344.tar.gz
10 files changed, 153 insertions, 20 deletions
diff --git a/ext/re/re.xs b/ext/re/re.xs
index b82062a8b0..13dcdc268d 100644
--- a/ext/re/re.xs
+++ b/ext/re/re.xs
@@ -19,7 +19,7 @@ extern char*	my_re_intuit_start (pTHX_ regexp *prog, SV *sv, char *strpos,
 				    char *strend, U32 flags,
 				    struct re_scream_pos_data_s *data);
 extern SV*	my_re_intuit_string (pTHX_ regexp *prog);
-extern char*	my_reg_stringify (pTHX_ MAGIC *mg, U32 *flags, STRLEN *lp, I32 *haseval);
+extern char*	my_reg_stringify (pTHX_ MAGIC *mg, STRLEN *lp, U32 *flags,  I32 *haseval);
 
 #if defined(USE_ITHREADS)
 extern regexp*	my_regdupe (pTHX_ const regexp *r, CLONE_PARAMS *param);
diff --git a/pod/perl595delta.pod b/pod/perl595delta.pod
index af76cf68ee..717540cb22 100644
--- a/pod/perl595delta.pod
+++ b/pod/perl595delta.pod
@@ -113,7 +113,13 @@ quantifiers. (Yves Orton)
 
 The regex engine now supports a number of special purpose backtrack
 control verbs: (*THEN), (*PRUNE), (*MARK), (*SKIP), (*COMMIT), (*FAIL)
-and (*ACCEPT). See L<perlre> for their descriptions.
+and (*ACCEPT). See L<perlre> for their descriptions. (Yves Orton)
+
+=item Relative backreferences
+
+A new syntax C<\R1> ("1" being any positive decimal integer) allows
+relative backreferencing. This should make it easier to embed patterns
+that contain backreferences. (Yves Orton)
 
 =back
 
diff --git a/pod/perldiag.pod b/pod/perldiag.pod
index e9d23267bd..e6a8b0f6dd 100644
--- a/pod/perldiag.pod
+++ b/pod/perldiag.pod
@@ -3495,6 +3495,16 @@ prepend a zero to make the number at least two digits: C<\07>
 The <-- HERE shows in the regular expression about where the problem was
 discovered.
 
+=item Reference to nonexistent or unclosed group in regex; marked by <-- HERE in m/%s/
+
+(F) You used something like C<\R7> in your regular expression, but there are
+not at least seven sets of closed capturing parentheses in the expression before
+where the C<\R7> was located. It's also possible you forgot to escape the
+backslash.
+
+The <-- HERE shows in the regular expression about where the problem was
+discovered.
+
 =item Reference to nonexistent named group in regex; marked by <-- HERE in m/%s/
 
 (F) You used something like C<\k'NAME'> or C<< \k<NAME> >> in your regular
diff --git a/pod/perlre.pod b/pod/perlre.pod
index c2b968062b..7df564738e 100644
--- a/pod/perlre.pod
+++ b/pod/perlre.pod
@@ -246,7 +246,9 @@ X<word> X<whitespace>
 	     so you may end up with malformed pieces of UTF-8.
 	     Unsupported in lookbehind.
     \1       Backreference to a specific group.
-	    '1' may actually be any positive integer.
+	     '1' may actually be any positive integer.
+    \R1      Relative backreference to a preceding closed group.
+	     '1' may actually be any positive integer.
     \k<name> Named backreference
     \N{name} Named unicode character, or unicode escape
     \x12     Hexadecimal escape sequence
@@ -469,7 +471,15 @@ ambiguity by interpreting \10 as a backreference only if at least 10
 left parentheses have opened before it.  Likewise \11 is a
 backreference only if at least 11 left parentheses have opened
 before it.  And so on.  \1 through \9 are always interpreted as
-backreferences.
+backreferences. 
+
+X<relative backreference>
+In Perl 5.10 it is possible to relatively address a capture buffer by
+using the C<\RNNN> notation, where C<NNN> is negative offset to a
+preceding completed capture buffer. Thus C<\R1> refers to the last
+buffer closed, C<\R2> refers to the buffer before that, and so on. Note
+especially that C</(foo)(\R1)/> refers to the capture buffer containing
+C<foo>, not to the buffer containing C<\R1>.
 
 Additionally, as of Perl 5.10 you may use named capture buffers and named
 backreferences. The notation is C<< (?<name>...) >> and C<< \k<name> >>
@@ -884,6 +894,9 @@ C<(?R)>. If PARNO is preceded by a plus or minus sign then it is assumed
 to be relative, with negative numbers indicating preceding capture buffers
 and positive ones following. Thus C<(?-1)> refers to the most recently
 declared buffer, and C<(?+1)> indicates the next buffer to be declared.
+Note that the counting for relative recursion differs from that of
+relative backreferences, in that with recursion unclosed buffers B<are>
+included.
 
 The following pattern matches a function foo() which may contain
 balanced parentheses as the argument.
diff --git a/pod/perlreguts.pod b/pod/perlreguts.pod
index 937565745c..aa54bfcb8f 100644
--- a/pod/perlreguts.pod
+++ b/pod/perlreguts.pod
@@ -747,6 +747,7 @@ F<regexp.h> contains the base structure definition:
     typedef struct regexp {
         I32 *startp;
         I32 *endp;
+        regexp_paren_ofs *swap;
         regnode *regstclass;
         struct reg_substr_data *substrs;
         char *precomp;          /* pre-compilation regular expression */
@@ -802,11 +803,19 @@ These fields are used to keep track of how many paren groups could be matched
 in the pattern, which was the last open paren to be entered, and which was
 the last close paren to be entered.
 
-=item C<startp>, C<endp>
+=item C<startp>, C<endp>, C<swap>
 
 These fields store arrays that are used to hold the offsets of the begining
 and end of each capture group that has matched. -1 is used to indicate no match.
 
+C<swap> is an extra set of startp/endp stored in a C<regexp_paren_ofs>
+struct. This is used when the last successful match was from same pattern
+as the current pattern, so that a partial match doesn't overwrite the
+previous match's results. When this field is data filled the matching
+engine will swap buffers before every match attempt. If the match fails,
+then it swaps them back. If it's successful it leaves them. This field
+is populated on demand and is by default null.
+
 These are the source for @- and @+.
 
 =item C<subbeg> C<sublen> C<saved_copy>
diff --git a/regcomp.c b/regcomp.c
index 9099194b07..6d916f1ff1 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -113,7 +113,8 @@ typedef struct RExC_state_t {
     I32		sawback;		/* Did we see \1, ...? */
     U32		seen;
     I32		size;			/* Code size. */
-    I32		npar;			/* () count. */
+    I32		npar;			/* Capture buffer count, (OPEN). */
+    I32		cpar;			/* Capture buffer count, (CLOSE). */
     I32		nestroot;		/* root parens we are in - used by accept */
     I32		extralen;
     I32		seen_zerolen;
@@ -153,6 +154,7 @@ typedef struct RExC_state_t {
 #define RExC_seen	(pRExC_state->seen)
 #define RExC_size	(pRExC_state->size)
 #define RExC_npar	(pRExC_state->npar)
+#define RExC_cpar	(pRExC_state->cpar)
 #define RExC_nestroot   (pRExC_state->nestroot)
 #define RExC_extralen	(pRExC_state->extralen)
 #define RExC_seen_zerolen	(pRExC_state->seen_zerolen)
@@ -3943,6 +3945,7 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
     RExC_end = xend;
     RExC_naughty = 0;
     RExC_npar = 1;
+    RExC_cpar = 1;
     RExC_nestroot = 0;
     RExC_size = 0L;
     RExC_emit = &PL_regdummy;
@@ -4013,6 +4016,7 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
     r->substrs = 0;			/* Useful during FAIL. */
     r->startp = 0;			/* Useful during FAIL. */
     r->endp = 0;			
+    r->swap = NULL; 
     r->paren_names = 0;
     
     if (RExC_seen & REG_SEEN_RECURSE) {
@@ -4040,6 +4044,7 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
     RExC_end = xend;
     RExC_naughty = 0;
     RExC_npar = 1;
+    RExC_cpar = 1;
     RExC_emit_start = r->program;
     RExC_emit = r->program;
 #ifdef DEBUGGING
@@ -4482,7 +4487,8 @@ reStudy:
     }
     Newxz(r->startp, RExC_npar, I32);
     Newxz(r->endp, RExC_npar, I32);
-    
+    /* assume we don't need to swap parens around before we match */
+
     DEBUG_DUMP_r({
         PerlIO_printf(Perl_debug_log,"Final program:\n");
         regdump(r);
@@ -5326,6 +5332,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
 	    ender = reg_node(pRExC_state, TAIL);
 	    break;
 	case 1:
+	    RExC_cpar++;
 	    ender = reganode(pRExC_state, CLOSE, parno);
 	    if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
 		DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
@@ -6270,11 +6277,20 @@ tryagain:
 	case 'c':
 	case '0':
 	    goto defchar;
+	case 'R': 
 	case '1': case '2': case '3': case '4':
 	case '5': case '6': case '7': case '8': case '9':
 	    {
-		const I32 num = atoi(RExC_parse);
-
+		I32 num;
+		bool isrel=(*RExC_parse=='R');
+		if (isrel)
+		    RExC_parse++;
+		num = atoi(RExC_parse);
+                if (isrel) {
+                    num = RExC_cpar - num;
+                    if (num < 1)
+                        vFAIL("Reference to nonexistent or unclosed group");
+                }
 		if (num > 9 && num >= RExC_npar)
 		    goto defchar;
 		else {
@@ -6282,8 +6298,16 @@ tryagain:
 		    while (isDIGIT(*RExC_parse))
 			RExC_parse++;
 
-		    if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
-			vFAIL("Reference to nonexistent group");
+		    if (!SIZE_ONLY) {
+		        if (num > (I32)RExC_rx->nparens)
+			    vFAIL("Reference to nonexistent group");
+			/* People make this error all the time apparently.
+			   So we cant fail on it, even though we should 
+			
+			else if (num >= RExC_cpar)
+			    vFAIL("Reference to unclosed group will always match");
+			*/
+		    }
 		    RExC_sawback = 1;
 		    ret = reganode(pRExC_state,
 				   (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
@@ -6372,6 +6396,7 @@ tryagain:
 		    case 'p':
 		    case 'P':
                     case 'N':
+                    case 'R':
 			--p;
 			goto loopdone;
 		    case 'n':
@@ -8502,6 +8527,11 @@ Perl_pregfree(pTHX_ struct regexp *r)
     }
     Safefree(r->startp);
     Safefree(r->endp);
+    if (r->swap) {
+        Safefree(r->swap->startp);
+        Safefree(r->swap->endp);
+        Safefree(r->swap);
+    }
     Safefree(r);
 }
 
@@ -8544,6 +8574,14 @@ Perl_regdupe(pTHX_ const regexp *r, CLONE_PARAMS *param)
     Copy(r->startp, ret->startp, npar, I32);
     Newx(ret->endp, npar, I32);
     Copy(r->startp, ret->startp, npar, I32);
+    if(r->swap) {
+        Newx(ret->swap, 1, regexp_paren_ofs);
+        /* no need to copy these */
+        Newx(ret->swap->startp, npar, I32);
+        Newx(ret->swap->endp, npar, I32);
+    } else {
+        ret->swap = NULL;
+    }
 
     Newx(ret->substrs, 1, struct reg_substr_data);
     for (s = ret->substrs->data, i = 0; i < 3; i++, s++) {
diff --git a/regexec.c b/regexec.c
index a0637a822f..d547ff71a2 100644
--- a/regexec.c
+++ b/regexec.c
@@ -1726,7 +1726,28 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *
 	else				/* pos() not defined */
 	    reginfo.ganch = strbeg;
     }
-
+    if (PL_curpm && (PM_GETRE(PL_curpm) == prog)) {
+        I32 *t;
+        if (!prog->swap) {
+        /* We have to be careful. If the previous successful match
+           was from this regex we don't want a subsequent paritally
+           successful match to clobber the old results. 
+           So when we detect this possibility we add a swap buffer
+           to the re, and switch the buffer each match. If we fail
+           we switch it back, otherwise we leave it swapped.
+        */
+            Newxz(prog->swap, 1, regexp_paren_ofs);
+            /* no need to copy these */
+            Newxz(prog->swap->startp, prog->nparens + 1, I32);
+            Newxz(prog->swap->endp, prog->nparens + 1, I32);
+        }
+        t = prog->swap->startp;
+        prog->swap->startp = prog->startp;
+        prog->startp = t;
+        t = prog->swap->endp;
+        prog->swap->endp = prog->endp;
+        prog->endp = t;
+    }
     if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
 	re_scream_pos_data d;
 
@@ -2074,6 +2095,16 @@ phooey:
 			  PL_colors[4], PL_colors[5]));
     if (PL_reg_eval_set)
 	restore_pos(aTHX_ prog);
+    if (prog->swap) {
+        /* we failed :-( roll it back */
+        I32 *t;
+        t = prog->swap->startp;
+        prog->swap->startp = prog->startp;
+        prog->startp = t;
+        t = prog->swap->endp;
+        prog->swap->endp = prog->endp;
+        prog->endp = t;
+    }
     return 0;
 }
 
@@ -2195,8 +2226,8 @@ S_regtry(pTHX_ regmatch_info *reginfo, char **startpos)
      * on those tests seems to be returning null fields from matches.
      * --jhi */
 #if 1
-    sp = prog->startp;
-    ep = prog->endp;
+    sp = PL_regstartp;
+    ep = PL_regendp;
     if (prog->nparens) {
 	register I32 i;
 	for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
@@ -2207,7 +2238,7 @@ S_regtry(pTHX_ regmatch_info *reginfo, char **startpos)
 #endif
     REGCP_SET(lastcp);
     if (regmatch(reginfo, prog->program + 1)) {
-	prog->endp[0] = PL_reginput - PL_bostr;
+	PL_regendp[0] = PL_reginput - PL_bostr;
 	return 1;
     }
     if (reginfo->cutpoint)
diff --git a/regexp.h b/regexp.h
index 8d08682290..d59fa832b2 100644
--- a/regexp.h
+++ b/regexp.h
@@ -31,10 +31,15 @@ struct reg_substr_data;
 struct reg_data;
 
 struct regexp_engine;
+typedef struct regexp_paren_ofs {
+    I32 *startp;
+    I32 *endp;
+} regexp_paren_ofs;
 
 typedef struct regexp {
-	I32 *startp;
+        I32 *startp;
 	I32 *endp;
+        regexp_paren_ofs *swap;
 	regnode *regstclass;
         struct reg_substr_data *substrs;
 	char *precomp;		/* pre-compilation regular expression */
diff --git a/t/op/pat.t b/t/op/pat.t
index 333165d185..358fbb08bc 100755
--- a/t/op/pat.t
+++ b/t/op/pat.t
@@ -3216,10 +3216,10 @@ $_ = "x"; s/x/func "in multiline subst"/em;
 #$_ = "x"; /x(?{func "in regexp"})/;
 #$_ = "x"; /x(?{func "in multiline regexp"})/m;
 
-# bug #19049
+# bug RT#19049
 $_="abcdef\n";
 @x = m/./g;
-ok("abcde" eq "$`", '# TODO #19049 - global match not setting $`');
+ok("abcde" eq "$`", 'RT#19049 - global match not setting $`');
 
 ok("123\x{100}" =~ /^.*1.*23\x{100}$/, 'uft8 + multiple floating substr');
 
@@ -4011,6 +4011,24 @@ for my $c ("z", "\0", "!", chr(254), chr(256)) {
 #-------------------------------------------------------------------
 
 # Keep the following tests last -- they may crash perl
+{   
+    # RT#19049 / RT#38869
+    my @list = (
+        'ab cdef', # matches regex
+        ( 'e' x 40000 ) .'ab c' # matches not, but 'ab c' matches part of it
+    );
+    my $y;
+    my $x;
+    foreach (@list) {
+        m/ab(.+)cd/i; # the ignore-case seems to be important
+        $y = $1; # use $1, which might not be from the last match!
+        $x = substr($list[0],$-[0],$+[0]-$-[0]);
+    }
+    iseq($y,' ',
+        'pattern in a loop, failure should not affect previous success');
+    iseq($x,'ab cd',
+        'pattern in a loop, failure should not affect previous success');
+}
 
 ok(("a" x (2**15 - 10)) =~ /^()(a|bb)*$/, "Recursive stack cracker: #24274")
     or print "# Unexpected outcome: should pass or crash perl\n";
@@ -4034,4 +4052,4 @@ ok((q(a)x 100) =~ /^(??{'(.)'x 100})/,
 # Put new tests above the line, not here.
 
 # Don't forget to update this!
-BEGIN { print "1..1345\n" };
+BEGIN { print "1..1347\n" };
diff --git a/t/op/re_tests b/t/op/re_tests
index 078caa94be..4279dd6843 100644
--- a/t/op/re_tests
+++ b/t/op/re_tests
@@ -1187,5 +1187,8 @@ a*(*F)	aaaab	n	-	-
 (A(A|B(*ACCEPT)|C)D)(E)	ACDE	y	$1$2$3	ACDCE
 
 (a)(?:(?-1)|(?+1))(b)	aab	y	$&-$1-$2	aab-a-b
-(a)(?:(?-1)|(?+1))(b)	abb	y	$&-$1-$2	abb-a-b
+(a)(?:(?-1)|(?+1))(b)	abb	y	$1-$2	a-b
 (a)(?:(?-1)|(?+1))(b)	acb	n	-	-
+
+(foo)(\R1)	foofoo	y	$1-$2	foo-foo
+(foo)(\R1)(foo)(\R1)	foofoofoofoo	y	$1-$2-$3-$4	foo-foo-foo-foo
author	Yves Orton <demerphq@gmail.com>	2006-11-15 13:29:39 +0100
committer	Rafael Garcia-Suarez <rgarciasuarez@gmail.com>	2006-11-15 12:41:24 +0000
commit	c74340f9cdee6010339b6bfd0e8b0dc8bc875344 (patch)
tree	461d7dee65931c649dec8616b2a6547652ba3777
parent	f81333e0586497e8dadbe01b840e0be9ee8313ee (diff)
download	perl-c74340f9cdee6010339b6bfd0e8b0dc8bc875344.tar.gz