[MERGE] various regex capture fixups

Improve the code and macros in S_regmatch() to make opening and closing captures (groups) more consistent and simpler. Shouldn't make any changes to behaviour apart from improved debugging output.
author: David Mitchell <davem@iabyn.com> 2018-08-26 21:10:16 +0100
committer: David Mitchell <davem@iabyn.com> 2018-08-26 21:10:16 +0100
commit: 1cfb6d7befa06ab0aba4adfd61117af3bf8693cb (patch)
tree: a48a5a82813b42d1688de0c2956cffa59e99c277
parent: 42f10b3ffcb09f2c3fb2fb2600565b3fe588f922 (diff)
parent: 0b9dad94ed37e484db3e29d315fc26305c88f250 (diff)
download: perl-1cfb6d7befa06ab0aba4adfd61117af3bf8693cb.tar.gz
7 files changed, 133 insertions, 126 deletions
diff --git a/pod/perlreapi.pod b/pod/perlreapi.pod
index 2df337e21a..c32171a9a9 100644
--- a/pod/perlreapi.pod
+++ b/pod/perlreapi.pod
@@ -710,9 +710,10 @@ used in the future for all engines for optimisations.
 
 =head2 C<nparens>, C<lastparen>, and C<lastcloseparen>
 
-These fields are used to keep track of how many paren groups could be matched
-in the pattern, which was the last open paren to be entered, and which was
-the last close paren to be entered.
+These fields are used to keep track of: how many paren capture groups
+there are in the pattern; which was the highest paren to be closed (see
+L<perlvar/$+>); and which was the most recent paren to be closed (see
+L<perlvar/$^N>).
 
 =head2 C<intflags>
 
diff --git a/pod/perlvar.pod b/pod/perlvar.pod
index c7b77120ef..114a7e0d12 100644
--- a/pod/perlvar.pod
+++ b/pod/perlvar.pod
@@ -1046,7 +1046,10 @@ This variable is read-only and dynamically-scoped.
 =item $+
 X<$+> X<$LAST_PAREN_MATCH>
 
-The text matched by the last bracket of the last successful search pattern.
+The text matched by the highest used capture group of the last
+successful search pattern.  It is logically equivalent to the highest
+numbered capture variable (C<$1>, C<$2>, ...) which has a defined value.
+
 This is useful if you don't know which one of a set of alternative patterns
 matched.  For example:
 
@@ -1063,7 +1066,15 @@ X<$^N> X<$LAST_SUBMATCH_RESULT>
 
 The text matched by the used group most-recently closed (i.e. the group
 with the rightmost closing parenthesis) of the last successful search
-pattern.
+pattern.  This is subtly different from C<$+>, which refers to the highest-numbered closed group rather than the most recently closed one.  For example, in
+
+    "ab" =~ /^((.)(.))$/
+
+we have
+
+    $1,$^N   have the value "ab"
+    $2       has  the value "a"
+    $3,$+    have the value "b"
 
 This is primarily used inside C<(?{...})> blocks for examining text
 recently matched.  For example, to effectively capture text to a variable
diff --git a/regcomp.sym b/regcomp.sym
index 368039539f..6c20e28b8a 100644
--- a/regcomp.sym
+++ b/regcomp.sym
@@ -255,7 +255,7 @@ WHILEM          A_pre,A_min,A_max,B_min,B_max:FAIL
 BRANCH          next:FAIL
 CURLYM          A,B:FAIL
 IFMATCH         A:FAIL
-CURLY           B_min_known,B_min,B_max:FAIL
+CURLY           B_min,B_max:FAIL
 COMMIT          next:FAIL
 MARKPOINT       next:FAIL
 SKIP            next:FAIL
diff --git a/regexec.c b/regexec.c
index c927abc611..16a230997e 100644
--- a/regexec.c
+++ b/regexec.c
@@ -328,7 +328,34 @@ S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen _pDEPTH)
     );                                                          \
     regcpblow(cp)
 
+/* set the start and end positions of capture ix */
+#define CLOSE_CAPTURE(ix, s, e)                                            \
+    rex->offs[ix].start = s;                                               \
+    rex->offs[ix].end = e;                                                 \
+    if (ix > rex->lastparen)                                               \
+        rex->lastparen = ix;                                               \
+    rex->lastcloseparen = ix;                                              \
+    DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_                            \
+        "CLOSE: rex=0x%" UVxf " offs=0x%" UVxf ": \\%" UVuf ": set %" IVdf "..%" IVdf " max: %" UVuf "\n", \
+        depth,                                                             \
+        PTR2UV(rex),                                                       \
+        PTR2UV(rex->offs),                                                 \
+        (UV)ix,                                                            \
+        (IV)rex->offs[ix].start,                                           \
+        (IV)rex->offs[ix].end,                                             \
+        (UV)rex->lastparen                                                 \
+    ))
+
 #define UNWIND_PAREN(lp, lcp)               \
+    DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_  \
+        "UNWIND_PAREN: rex=0x%" UVxf " offs=0x%" UVxf ": invalidate (%" UVuf "..%" UVuf "] set lcp: %" UVuf "\n", \
+        depth,                              \
+        PTR2UV(rex),                        \
+        PTR2UV(rex->offs),                  \
+        (UV)(lp),                           \
+        (UV)(rex->lastparen),               \
+        (UV)(lcp)                           \
+    ));                                     \
     for (n = rex->lastparen; n > lp; n--)   \
         rex->offs[n].end = -1;              \
     rex->lastparen = n;                     \
@@ -7584,26 +7611,11 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
             script_run_begin = (U8 *) locinput;
             break;
 
-/* XXX really need to log other places start/end are set too */
-#define CLOSE_CAPTURE                                                      \
-    rex->offs[n].start = rex->offs[n].start_tmp;                           \
-    rex->offs[n].end = locinput - reginfo->strbeg;                         \
-    DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_                            \
-        "CLOSE: rex=0x%" UVxf " offs=0x%" UVxf ": \\%" UVuf ": set %" IVdf "..%" IVdf "\n", \
-        depth,                                                             \
-        PTR2UV(rex),                                                       \
-        PTR2UV(rex->offs),                                                 \
-        (UV)n,                                                             \
-        (IV)rex->offs[n].start,                                            \
-        (IV)rex->offs[n].end                                               \
-    ))
 
 	case CLOSE:  /*  )  */
 	    n = ARG(scan);  /* which paren pair */
-	    CLOSE_CAPTURE;
-	    if (n > rex->lastparen)
-		rex->lastparen = n;
-	    rex->lastcloseparen = n;
+	    CLOSE_CAPTURE(n, rex->offs[n].start_tmp,
+                             locinput - reginfo->strbeg);
             if ( EVAL_CLOSE_PAREN_IS( cur_eval, n ) )
 	        goto fake_end;
 
@@ -7631,10 +7643,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
                     if ( OP(cursor)==CLOSE ){
                         n = ARG(cursor);
                         if ( n <= lastopen ) {
-			    CLOSE_CAPTURE;
-                            if (n > rex->lastparen)
-                                rex->lastparen = n;
-                            rex->lastcloseparen = n;
+			    CLOSE_CAPTURE(n, rex->offs[n].start_tmp,
+                                             locinput - reginfo->strbeg);
                             if ( n == ARG(scan) || EVAL_CLOSE_PAREN_IS(cur_eval, n) )
                                 break;
                         }
@@ -8260,14 +8270,11 @@ NULL
 
 	    if (ST.me->flags) {
 		/* emulate CLOSE: mark current A as captured */
-		I32 paren = ST.me->flags;
+		U32 paren = (U32)ST.me->flags;
 		if (ST.count) {
-		    rex->offs[paren].start
-			= HOPc(locinput, -ST.alen) - reginfo->strbeg;
-		    rex->offs[paren].end = locinput - reginfo->strbeg;
-		    if ((U32)paren > rex->lastparen)
-			rex->lastparen = paren;
-		    rex->lastcloseparen = paren;
+                    CLOSE_CAPTURE(paren,
+			HOPc(locinput, -ST.alen) - reginfo->strbeg,
+		        locinput - reginfo->strbeg);
 		}
 		else
 		    rex->offs[paren].end = -1;
@@ -8306,11 +8313,8 @@ NULL
 #define CURLY_SETPAREN(paren, success) \
     if (paren) { \
 	if (success) { \
-	    rex->offs[paren].start = HOPc(locinput, -1) - reginfo->strbeg; \
-	    rex->offs[paren].end = locinput - reginfo->strbeg; \
-	    if (paren > rex->lastparen) \
-		rex->lastparen = paren; \
-	    rex->lastcloseparen = paren; \
+            CLOSE_CAPTURE(paren, HOPc(locinput, -1) - reginfo->strbeg, \
+	                         locinput - reginfo->strbeg); \
 	} \
 	else { \
 	    rex->offs[paren].end = -1; \
@@ -8341,12 +8345,18 @@ NULL
 		maxopenparen = ST.paren;
 	    ST.min = ARG1(scan);  /* min to match */
 	    ST.max = ARG2(scan);  /* max to match */
+            scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
+
+            /* handle the single-char capture called as a GOSUB etc */
             if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren))
             {
-	        ST.min=1;
-	        ST.max=1;
+                char *li = locinput;
+                if (!regrepeat(rex, &li, scan, reginfo, 1))
+		    sayNO;
+                SET_locinput(li);
+                goto fake_end;
 	    }
-            scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
+
 	    goto repeat;
 
 	case CURLY:		/*  /A{m,n}B/ where A is width 1 char */
@@ -8462,24 +8472,41 @@ NULL
 	    }
 	    NOT_REACHED; /* NOTREACHED */
 
-	case CURLY_B_min_known_fail:
-	    /* failed to find B in a non-greedy match where c1,c2 valid */
+	case CURLY_B_min_fail:
+	    /* failed to find B in a non-greedy match.
+             * Handles both cases where c1,c2 valid or not */
 
 	    REGCP_UNWIND(ST.cp);
             if (ST.paren) {
                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
             }
-	    /* Couldn't or didn't -- move forward. */
-	    ST.oldloc = locinput;
-	    if (utf8_target)
-		locinput += UTF8SKIP(locinput);
-	    else
-		locinput++;
-	    ST.count++;
-	  curly_try_B_min_known:
-	     /* find the next place where 'B' could work, then call B */
-	    {
+
+            if (ST.c1 == CHRTEST_VOID) {
+                /* failed -- move forward one */
+                char *li = locinput;
+                if (!regrepeat(rex, &li, ST.A, reginfo, 1)) {
+                    sayNO;
+                }
+                locinput = li;
+                ST.count++;
+		if (!(   ST.count <= ST.max
+                        /* count overflow ? */
+                     || (ST.max == REG_INFTY && ST.count > 0))
+                )
+                    sayNO;
+            }
+            else {
 		int n;
+                /* Couldn't or didn't -- move forward. */
+                ST.oldloc = locinput;
+                if (utf8_target)
+                    locinput += UTF8SKIP(locinput);
+                else
+                    locinput++;
+                ST.count++;
+
+              curly_try_B_min_known:
+                /* find the next place where 'B' could work, then call B */
 		if (utf8_target) {
 		    n = (ST.oldloc == locinput) ? 0 : 1;
 		    if (ST.c1 == ST.c2) {
@@ -8558,47 +8585,16 @@ NULL
 			sayNO;
                     assert(n == REG_INFTY || locinput == li);
 		}
-		CURLY_SETPAREN(ST.paren, ST.count);
-                if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren))
-		    goto fake_end;
-		PUSH_STATE_GOTO(CURLY_B_min_known, ST.B, locinput);
 	    }
-	    NOT_REACHED; /* NOTREACHED */
-
-	case CURLY_B_min_fail:
-	    /* failed to find B in a non-greedy match where c1,c2 invalid */
 
-	    REGCP_UNWIND(ST.cp);
-            if (ST.paren) {
-                UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
-            }
-	    /* failed -- move forward one */
-            {
-                char *li = locinput;
-                if (!regrepeat(rex, &li, ST.A, reginfo, 1)) {
-                    sayNO;
-                }
-                locinput = li;
-            }
-            {
-		ST.count++;
-		if (ST.count <= ST.max || (ST.max == REG_INFTY &&
-			ST.count > 0)) /* count overflow ? */
-		{
-		  curly_try_B_min:
-		    CURLY_SETPAREN(ST.paren, ST.count);
-                    if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren))
-                        goto fake_end;
-		    PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput);
-		}
-	    }
-            sayNO;
+          curly_try_B_min:
+            CURLY_SETPAREN(ST.paren, ST.count);
+            PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput);
 	    NOT_REACHED; /* NOTREACHED */
 
+
           curly_try_B_max:
 	    /* a successful greedy match: now try to match B */
-            if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren))
-                goto fake_end;
 	    {
 		bool could_match = locinput < reginfo->strend;
 
diff --git a/regexp.h b/regexp.h
index 44409f0d9c..aa31846cbb 100644
--- a/regexp.h
+++ b/regexp.h
@@ -134,8 +134,8 @@ typedef struct regexp {
      * Data about the last/current match. These are modified during matching
      */
 
-    U32 lastparen;           /* last open paren matched */
-    U32 lastcloseparen;      /* last close paren matched */
+    U32 lastparen;           /* highest close paren matched ($+) */
+    U32 lastcloseparen;      /* last close paren matched ($^N) */
     regexp_paren_pair *offs; /* Array of offsets for (@-) and (@+) */
     char **recurse_locinput; /* used to detect infinite recursion, XXX: move to internal */
 
diff --git a/regnodes.h b/regnodes.h
index 69f3e38cdb..eeb5ce9f18 100644
--- a/regnodes.h
+++ b/regnodes.h
@@ -7,7 +7,7 @@
 /* Regops and State definitions */
 
 #define REGNODE_MAX           	97
-#define REGMATCH_STATE_MAX    	139
+#define REGMATCH_STATE_MAX    	137
 
 #define	END                   	0	/* 0000 End of program. */
 #define	SUCCEED               	1	/* 0x01 Return from a subroutine, basically. */
@@ -136,22 +136,20 @@
 #define	CURLYM_B_fail         	(REGNODE_MAX + 24)	/* state for CURLYM */
 #define	IFMATCH_A             	(REGNODE_MAX + 25)	/* state for IFMATCH */
 #define	IFMATCH_A_fail        	(REGNODE_MAX + 26)	/* state for IFMATCH */
-#define	CURLY_B_min_known     	(REGNODE_MAX + 27)	/* state for CURLY */
-#define	CURLY_B_min_known_fail	(REGNODE_MAX + 28)	/* state for CURLY */
-#define	CURLY_B_min           	(REGNODE_MAX + 29)	/* state for CURLY */
-#define	CURLY_B_min_fail      	(REGNODE_MAX + 30)	/* state for CURLY */
-#define	CURLY_B_max           	(REGNODE_MAX + 31)	/* state for CURLY */
-#define	CURLY_B_max_fail      	(REGNODE_MAX + 32)	/* state for CURLY */
-#define	COMMIT_next           	(REGNODE_MAX + 33)	/* state for COMMIT */
-#define	COMMIT_next_fail      	(REGNODE_MAX + 34)	/* state for COMMIT */
-#define	MARKPOINT_next        	(REGNODE_MAX + 35)	/* state for MARKPOINT */
-#define	MARKPOINT_next_fail   	(REGNODE_MAX + 36)	/* state for MARKPOINT */
-#define	SKIP_next             	(REGNODE_MAX + 37)	/* state for SKIP */
-#define	SKIP_next_fail        	(REGNODE_MAX + 38)	/* state for SKIP */
-#define	CUTGROUP_next         	(REGNODE_MAX + 39)	/* state for CUTGROUP */
-#define	CUTGROUP_next_fail    	(REGNODE_MAX + 40)	/* state for CUTGROUP */
-#define	KEEPS_next            	(REGNODE_MAX + 41)	/* state for KEEPS */
-#define	KEEPS_next_fail       	(REGNODE_MAX + 42)	/* state for KEEPS */
+#define	CURLY_B_min           	(REGNODE_MAX + 27)	/* state for CURLY */
+#define	CURLY_B_min_fail      	(REGNODE_MAX + 28)	/* state for CURLY */
+#define	CURLY_B_max           	(REGNODE_MAX + 29)	/* state for CURLY */
+#define	CURLY_B_max_fail      	(REGNODE_MAX + 30)	/* state for CURLY */
+#define	COMMIT_next           	(REGNODE_MAX + 31)	/* state for COMMIT */
+#define	COMMIT_next_fail      	(REGNODE_MAX + 32)	/* state for COMMIT */
+#define	MARKPOINT_next        	(REGNODE_MAX + 33)	/* state for MARKPOINT */
+#define	MARKPOINT_next_fail   	(REGNODE_MAX + 34)	/* state for MARKPOINT */
+#define	SKIP_next             	(REGNODE_MAX + 35)	/* state for SKIP */
+#define	SKIP_next_fail        	(REGNODE_MAX + 36)	/* state for SKIP */
+#define	CUTGROUP_next         	(REGNODE_MAX + 37)	/* state for CUTGROUP */
+#define	CUTGROUP_next_fail    	(REGNODE_MAX + 38)	/* state for CUTGROUP */
+#define	KEEPS_next            	(REGNODE_MAX + 39)	/* state for KEEPS */
+#define	KEEPS_next_fail       	(REGNODE_MAX + 40)	/* state for KEEPS */
 
 /* PL_regkind[] What type of regop or state is this. */
 
@@ -284,8 +282,6 @@ EXTCONST U8 PL_regkind[] = {
 	CURLYM,   	/* CURLYM_B_fail          */
 	IFMATCH,  	/* IFMATCH_A              */
 	IFMATCH,  	/* IFMATCH_A_fail         */
-	CURLY,    	/* CURLY_B_min_known      */
-	CURLY,    	/* CURLY_B_min_known_fail */
 	CURLY,    	/* CURLY_B_min            */
 	CURLY,    	/* CURLY_B_min_fail       */
 	CURLY,    	/* CURLY_B_max            */
@@ -645,22 +641,20 @@ EXTCONST char * const PL_reg_name[] = {
 	"CURLYM_B_fail",         	/* REGNODE_MAX +0x18 */
 	"IFMATCH_A",             	/* REGNODE_MAX +0x19 */
 	"IFMATCH_A_fail",        	/* REGNODE_MAX +0x1a */
-	"CURLY_B_min_known",     	/* REGNODE_MAX +0x1b */
-	"CURLY_B_min_known_fail",	/* REGNODE_MAX +0x1c */
-	"CURLY_B_min",           	/* REGNODE_MAX +0x1d */
-	"CURLY_B_min_fail",      	/* REGNODE_MAX +0x1e */
-	"CURLY_B_max",           	/* REGNODE_MAX +0x1f */
-	"CURLY_B_max_fail",      	/* REGNODE_MAX +0x20 */
-	"COMMIT_next",           	/* REGNODE_MAX +0x21 */
-	"COMMIT_next_fail",      	/* REGNODE_MAX +0x22 */
-	"MARKPOINT_next",        	/* REGNODE_MAX +0x23 */
-	"MARKPOINT_next_fail",   	/* REGNODE_MAX +0x24 */
-	"SKIP_next",             	/* REGNODE_MAX +0x25 */
-	"SKIP_next_fail",        	/* REGNODE_MAX +0x26 */
-	"CUTGROUP_next",         	/* REGNODE_MAX +0x27 */
-	"CUTGROUP_next_fail",    	/* REGNODE_MAX +0x28 */
-	"KEEPS_next",            	/* REGNODE_MAX +0x29 */
-	"KEEPS_next_fail",       	/* REGNODE_MAX +0x2a */
+	"CURLY_B_min",           	/* REGNODE_MAX +0x1b */
+	"CURLY_B_min_fail",      	/* REGNODE_MAX +0x1c */
+	"CURLY_B_max",           	/* REGNODE_MAX +0x1d */
+	"CURLY_B_max_fail",      	/* REGNODE_MAX +0x1e */
+	"COMMIT_next",           	/* REGNODE_MAX +0x1f */
+	"COMMIT_next_fail",      	/* REGNODE_MAX +0x20 */
+	"MARKPOINT_next",        	/* REGNODE_MAX +0x21 */
+	"MARKPOINT_next_fail",   	/* REGNODE_MAX +0x22 */
+	"SKIP_next",             	/* REGNODE_MAX +0x23 */
+	"SKIP_next_fail",        	/* REGNODE_MAX +0x24 */
+	"CUTGROUP_next",         	/* REGNODE_MAX +0x25 */
+	"CUTGROUP_next_fail",    	/* REGNODE_MAX +0x26 */
+	"KEEPS_next",            	/* REGNODE_MAX +0x27 */
+	"KEEPS_next_fail",       	/* REGNODE_MAX +0x28 */
 };
 #endif /* DOINIT */
 
diff --git a/t/re/re_tests b/t/re/re_tests
index 0e80a6c8fb..f4747f6315 100644
--- a/t/re/re_tests
+++ b/t/re/re_tests
@@ -1143,6 +1143,11 @@ X(?<=foo.)[YZ]	..XfooXY..	y	pos	8
 /(a)+((?1))(fox)/	aafox	y	$1-$2-$3	a-a-fox
 /(a){1,100}((?1))(fox)/	aafox	y	$1-$2-$3	a-a-fox
 /(a){0,100}((?1))(fox)/	aafox	y	$1-$2-$3	a-a-fox
+/(a)??((?1))(fox)/	aafox	y	$1-$2-$3	a-a-fox
+/(a)*?((?1))(fox)/	aafox	y	$1-$2-$3	a-a-fox
+/(a)+?((?1))(fox)/	aafox	y	$1-$2-$3	a-a-fox
+/(a){1,100}?((?1))(fox)/	aafox	y	$1-$2-$3	a-a-fox
+/(a){0,100}?((?1))(fox)/	aafox	y	$1-$2-$3	a-a-fox
 /(ab)?((?1))(fox)/	ababfox	y	$1-$2-$3	ab-ab-fox
 /(ab)*((?1))(fox)/	ababfox	y	$1-$2-$3	ab-ab-fox
 /(ab)+((?1))(fox)/	ababfox	y	$1-$2-$3	ab-ab-fox
author	David Mitchell <davem@iabyn.com>	2018-08-26 21:10:16 +0100
committer	David Mitchell <davem@iabyn.com>	2018-08-26 21:10:16 +0100
commit	1cfb6d7befa06ab0aba4adfd61117af3bf8693cb (patch)
tree	a48a5a82813b42d1688de0c2956cffa59e99c277
parent	42f10b3ffcb09f2c3fb2fb2600565b3fe588f922 (diff)
parent	0b9dad94ed37e484db3e29d315fc26305c88f250 (diff)
download	perl-1cfb6d7befa06ab0aba4adfd61117af3bf8693cb.tar.gz