summary refs log tree commit diff
diff options
context:
space:
mode:
author David Mitchell <davem@iabyn.com> 2018-08-26 21:10:16 +0100
committer David Mitchell <davem@iabyn.com> 2018-08-26 21:10:16 +0100
commit 1cfb6d7befa06ab0aba4adfd61117af3bf8693cb (patch)
tree a48a5a82813b42d1688de0c2956cffa59e99c277
parent 42f10b3ffcb09f2c3fb2fb2600565b3fe588f922 (diff)
parent 0b9dad94ed37e484db3e29d315fc26305c88f250 (diff)
download perl-1cfb6d7befa06ab0aba4adfd61117af3bf8693cb.tar.gz
[MERGE] various regex capture fixups
Improve the code and macros in S_regmatch() to make opening and closing captures (groups) more consistent and simpler. Shouldn't make any changes to behaviour apart from improved debugging output.
-rw-r--r-- pod/perlreapi.pod 7
-rw-r--r-- pod/perlvar.pod 15
-rw-r--r-- regcomp.sym 2
-rw-r--r-- regexec.c 162
-rw-r--r-- regexp.h 4
-rw-r--r-- regnodes.h 64
-rw-r--r-- t/re/re_tests 5
7 files changed, 133 insertions, 126 deletions
diff --git a/pod/perlreapi.pod b/pod/perlreapi.pod
index 2df337e21a..c32171a9a9 100644
--- a/pod/perlreapi.pod
+++ b/pod/perlreapi.pod
@@ -710,9 +710,10 @@ used in the future for all engines for optimisations.
=head2 C<nparens>, C<lastparen>, and C<lastcloseparen>
-These fields are used to keep track of how many paren groups could be matched
-in the pattern, which was the last open paren to be entered, and which was
-the last close paren to be entered.
+These fields are used to keep track of: how many paren capture groups
+there are in the pattern; which was the highest paren to be closed (see
+L<perlvar/$+>); and which was the most recent paren to be closed (see
+L<perlvar/$^N>).
=head2 C<intflags>
diff --git a/pod/perlvar.pod b/pod/perlvar.pod
index c7b77120ef..114a7e0d12 100644
--- a/pod/perlvar.pod
+++ b/pod/perlvar.pod
@@ -1046,7 +1046,10 @@ This variable is read-only and dynamically-scoped.
=item $+
X<$+> X<$LAST_PAREN_MATCH>
-The text matched by the last bracket of the last successful search pattern.
+The text matched by the highest used capture group of the last
+successful search pattern. It is logically equivalent to the highest
+numbered capture variable (C<$1>, C<$2>, ...) which has a defined value.
+
This is useful if you don't know which one of a set of alternative patterns
matched. For example:
@@ -1063,7 +1066,15 @@ X<$^N> X<$LAST_SUBMATCH_RESULT>
The text matched by the used group most-recently closed (i.e. the group
with the rightmost closing parenthesis) of the last successful search
-pattern.
+pattern. This is subtly different from C<$+>. For example in
+
+ "ab" =~ /^((.)(.))$/
+
+we have
+
+ $1,$^N have the value "ab"
+ $2 has the value "a"
+ $3,$+ have the value "b"
This is primarily used inside C<(?{...})> blocks for examining text
recently matched. For example, to effectively capture text to a variable
diff --git a/regcomp.sym b/regcomp.sym
index 368039539f..6c20e28b8a 100644
--- a/regcomp.sym
+++ b/regcomp.sym
@@ -255,7 +255,7 @@ WHILEM A_pre,A_min,A_max,B_min,B_max:FAIL
BRANCH next:FAIL
CURLYM A,B:FAIL
IFMATCH A:FAIL
-CURLY B_min_known,B_min,B_max:FAIL
+CURLY B_min,B_max:FAIL
COMMIT next:FAIL
MARKPOINT next:FAIL
SKIP next:FAIL
diff --git a/regexec.c b/regexec.c
index c927abc611..16a230997e 100644
--- a/regexec.c
+++ b/regexec.c
@@ -328,7 +328,34 @@ S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen _pDEPTH)
); \
regcpblow(cp)
+/* set the start and end positions of capture ix */
+#define CLOSE_CAPTURE(ix, s, e) \
+ rex->offs[ix].start = s; \
+ rex->offs[ix].end = e; \
+ if (ix > rex->lastparen) \
+ rex->lastparen = ix; \
+ rex->lastcloseparen = ix; \
+ DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_ \
+ "CLOSE: rex=0x%" UVxf " offs=0x%" UVxf ": \\%" UVuf ": set %" IVdf "..%" IVdf " max: %" UVuf "\n", \
+ depth, \
+ PTR2UV(rex), \
+ PTR2UV(rex->offs), \
+ (UV)ix, \
+ (IV)rex->offs[ix].start, \
+ (IV)rex->offs[ix].end, \
+ (UV)rex->lastparen \
+ ))
+
#define UNWIND_PAREN(lp, lcp) \
+ DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_ \
+ "UNWIND_PAREN: rex=0x%" UVxf " offs=0x%" UVxf ": invalidate (%" UVuf "..%" UVuf "] set lcp: %" UVuf "\n", \
+ depth, \
+ PTR2UV(rex), \
+ PTR2UV(rex->offs), \
+ (UV)(lp), \
+ (UV)(rex->lastparen), \
+ (UV)(lcp) \
+ )); \
for (n = rex->lastparen; n > lp; n--) \
rex->offs[n].end = -1; \
rex->lastparen = n; \
@@ -7584,26 +7611,11 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
script_run_begin = (U8 *) locinput;
break;
-/* XXX really need to log other places start/end are set too */
-#define CLOSE_CAPTURE \
- rex->offs[n].start = rex->offs[n].start_tmp; \
- rex->offs[n].end = locinput - reginfo->strbeg; \
- DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_ \
- "CLOSE: rex=0x%" UVxf " offs=0x%" UVxf ": \\%" UVuf ": set %" IVdf "..%" IVdf "\n", \
- depth, \
- PTR2UV(rex), \
- PTR2UV(rex->offs), \
- (UV)n, \
- (IV)rex->offs[n].start, \
- (IV)rex->offs[n].end \
- ))
case CLOSE: /* ) */
n = ARG(scan); /* which paren pair */
- CLOSE_CAPTURE;
- if (n > rex->lastparen)
- rex->lastparen = n;
- rex->lastcloseparen = n;
+ CLOSE_CAPTURE(n, rex->offs[n].start_tmp,
+ locinput - reginfo->strbeg);
if ( EVAL_CLOSE_PAREN_IS( cur_eval, n ) )
goto fake_end;
@@ -7631,10 +7643,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
if ( OP(cursor)==CLOSE ){
n = ARG(cursor);
if ( n <= lastopen ) {
- CLOSE_CAPTURE;
- if (n > rex->lastparen)
- rex->lastparen = n;
- rex->lastcloseparen = n;
+ CLOSE_CAPTURE(n, rex->offs[n].start_tmp,
+ locinput - reginfo->strbeg);
if ( n == ARG(scan) || EVAL_CLOSE_PAREN_IS(cur_eval, n) )
break;
}
@@ -8260,14 +8270,11 @@ NULL
if (ST.me->flags) {
/* emulate CLOSE: mark current A as captured */
- I32 paren = ST.me->flags;
+ U32 paren = (U32)ST.me->flags;
if (ST.count) {
- rex->offs[paren].start
- = HOPc(locinput, -ST.alen) - reginfo->strbeg;
- rex->offs[paren].end = locinput - reginfo->strbeg;
- if ((U32)paren > rex->lastparen)
- rex->lastparen = paren;
- rex->lastcloseparen = paren;
+ CLOSE_CAPTURE(paren,
+ HOPc(locinput, -ST.alen) - reginfo->strbeg,
+ locinput - reginfo->strbeg);
}
else
rex->offs[paren].end = -1;
@@ -8306,11 +8313,8 @@ NULL
#define CURLY_SETPAREN(paren, success) \
if (paren) { \
if (success) { \
- rex->offs[paren].start = HOPc(locinput, -1) - reginfo->strbeg; \
- rex->offs[paren].end = locinput - reginfo->strbeg; \
- if (paren > rex->lastparen) \
- rex->lastparen = paren; \
- rex->lastcloseparen = paren; \
+ CLOSE_CAPTURE(paren, HOPc(locinput, -1) - reginfo->strbeg, \
+ locinput - reginfo->strbeg); \
} \
else { \
rex->offs[paren].end = -1; \
@@ -8341,12 +8345,18 @@ NULL
maxopenparen = ST.paren;
ST.min = ARG1(scan); /* min to match */
ST.max = ARG2(scan); /* max to match */
+ scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
+
+ /* handle the single-char capture called as a GOSUB etc */
if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren))
{
- ST.min=1;
- ST.max=1;
+ char *li = locinput;
+ if (!regrepeat(rex, &li, scan, reginfo, 1))
+ sayNO;
+ SET_locinput(li);
+ goto fake_end;
}
- scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
+
goto repeat;
case CURLY: /* /A{m,n}B/ where A is width 1 char */
@@ -8462,24 +8472,41 @@ NULL
}
NOT_REACHED; /* NOTREACHED */
- case CURLY_B_min_known_fail:
- /* failed to find B in a non-greedy match where c1,c2 valid */
+ case CURLY_B_min_fail:
+ /* failed to find B in a non-greedy match.
+ * Handles both cases where c1,c2 valid or not */
REGCP_UNWIND(ST.cp);
if (ST.paren) {
UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
}
- /* Couldn't or didn't -- move forward. */
- ST.oldloc = locinput;
- if (utf8_target)
- locinput += UTF8SKIP(locinput);
- else
- locinput++;
- ST.count++;
- curly_try_B_min_known:
- /* find the next place where 'B' could work, then call B */
- {
+
+ if (ST.c1 == CHRTEST_VOID) {
+ /* failed -- move forward one */
+ char *li = locinput;
+ if (!regrepeat(rex, &li, ST.A, reginfo, 1)) {
+ sayNO;
+ }
+ locinput = li;
+ ST.count++;
+ if (!( ST.count <= ST.max
+ /* count overflow ? */
+ || (ST.max == REG_INFTY && ST.count > 0))
+ )
+ sayNO;
+ }
+ else {
int n;
+ /* Couldn't or didn't -- move forward. */
+ ST.oldloc = locinput;
+ if (utf8_target)
+ locinput += UTF8SKIP(locinput);
+ else
+ locinput++;
+ ST.count++;
+
+ curly_try_B_min_known:
+ /* find the next place where 'B' could work, then call B */
if (utf8_target) {
n = (ST.oldloc == locinput) ? 0 : 1;
if (ST.c1 == ST.c2) {
@@ -8558,47 +8585,16 @@ NULL
sayNO;
assert(n == REG_INFTY || locinput == li);
}
- CURLY_SETPAREN(ST.paren, ST.count);
- if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren))
- goto fake_end;
- PUSH_STATE_GOTO(CURLY_B_min_known, ST.B, locinput);
}
- NOT_REACHED; /* NOTREACHED */
-
- case CURLY_B_min_fail:
- /* failed to find B in a non-greedy match where c1,c2 invalid */
- REGCP_UNWIND(ST.cp);
- if (ST.paren) {
- UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
- }
- /* failed -- move forward one */
- {
- char *li = locinput;
- if (!regrepeat(rex, &li, ST.A, reginfo, 1)) {
- sayNO;
- }
- locinput = li;
- }
- {
- ST.count++;
- if (ST.count <= ST.max || (ST.max == REG_INFTY &&
- ST.count > 0)) /* count overflow ? */
- {
- curly_try_B_min:
- CURLY_SETPAREN(ST.paren, ST.count);
- if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren))
- goto fake_end;
- PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput);
- }
- }
- sayNO;
+ curly_try_B_min:
+ CURLY_SETPAREN(ST.paren, ST.count);
+ PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput);
NOT_REACHED; /* NOTREACHED */
+
curly_try_B_max:
/* a successful greedy match: now try to match B */
- if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren))
- goto fake_end;
{
bool could_match = locinput < reginfo->strend;
diff --git a/regexp.h b/regexp.h
index 44409f0d9c..aa31846cbb 100644
--- a/regexp.h
+++ b/regexp.h
@@ -134,8 +134,8 @@ typedef struct regexp {
* Data about the last/current match. These are modified during matching
*/
- U32 lastparen; /* last open paren matched */
- U32 lastcloseparen; /* last close paren matched */
+ U32 lastparen; /* highest close paren matched ($+) */
+ U32 lastcloseparen; /* last close paren matched ($^N) */
regexp_paren_pair *offs; /* Array of offsets for (@-) and (@+) */
char **recurse_locinput; /* used to detect infinite recursion, XXX: move to internal */
diff --git a/regnodes.h b/regnodes.h
index 69f3e38cdb..eeb5ce9f18 100644
--- a/regnodes.h
+++ b/regnodes.h
@@ -7,7 +7,7 @@
/* Regops and State definitions */
#define REGNODE_MAX 97
-#define REGMATCH_STATE_MAX 139
+#define REGMATCH_STATE_MAX 137
#define END 0 /* 0000 End of program. */
#define SUCCEED 1 /* 0x01 Return from a subroutine, basically. */
@@ -136,22 +136,20 @@
#define CURLYM_B_fail (REGNODE_MAX + 24) /* state for CURLYM */
#define IFMATCH_A (REGNODE_MAX + 25) /* state for IFMATCH */
#define IFMATCH_A_fail (REGNODE_MAX + 26) /* state for IFMATCH */
-#define CURLY_B_min_known (REGNODE_MAX + 27) /* state for CURLY */
-#define CURLY_B_min_known_fail (REGNODE_MAX + 28) /* state for CURLY */
-#define CURLY_B_min (REGNODE_MAX + 29) /* state for CURLY */
-#define CURLY_B_min_fail (REGNODE_MAX + 30) /* state for CURLY */
-#define CURLY_B_max (REGNODE_MAX + 31) /* state for CURLY */
-#define CURLY_B_max_fail (REGNODE_MAX + 32) /* state for CURLY */
-#define COMMIT_next (REGNODE_MAX + 33) /* state for COMMIT */
-#define COMMIT_next_fail (REGNODE_MAX + 34) /* state for COMMIT */
-#define MARKPOINT_next (REGNODE_MAX + 35) /* state for MARKPOINT */
-#define MARKPOINT_next_fail (REGNODE_MAX + 36) /* state for MARKPOINT */
-#define SKIP_next (REGNODE_MAX + 37) /* state for SKIP */
-#define SKIP_next_fail (REGNODE_MAX + 38) /* state for SKIP */
-#define CUTGROUP_next (REGNODE_MAX + 39) /* state for CUTGROUP */
-#define CUTGROUP_next_fail (REGNODE_MAX + 40) /* state for CUTGROUP */
-#define KEEPS_next (REGNODE_MAX + 41) /* state for KEEPS */
-#define KEEPS_next_fail (REGNODE_MAX + 42) /* state for KEEPS */
+#define CURLY_B_min (REGNODE_MAX + 27) /* state for CURLY */
+#define CURLY_B_min_fail (REGNODE_MAX + 28) /* state for CURLY */
+#define CURLY_B_max (REGNODE_MAX + 29) /* state for CURLY */
+#define CURLY_B_max_fail (REGNODE_MAX + 30) /* state for CURLY */
+#define COMMIT_next (REGNODE_MAX + 31) /* state for COMMIT */
+#define COMMIT_next_fail (REGNODE_MAX + 32) /* state for COMMIT */
+#define MARKPOINT_next (REGNODE_MAX + 33) /* state for MARKPOINT */
+#define MARKPOINT_next_fail (REGNODE_MAX + 34) /* state for MARKPOINT */
+#define SKIP_next (REGNODE_MAX + 35) /* state for SKIP */
+#define SKIP_next_fail (REGNODE_MAX + 36) /* state for SKIP */
+#define CUTGROUP_next (REGNODE_MAX + 37) /* state for CUTGROUP */
+#define CUTGROUP_next_fail (REGNODE_MAX + 38) /* state for CUTGROUP */
+#define KEEPS_next (REGNODE_MAX + 39) /* state for KEEPS */
+#define KEEPS_next_fail (REGNODE_MAX + 40) /* state for KEEPS */
/* PL_regkind[] What type of regop or state is this. */
@@ -284,8 +282,6 @@ EXTCONST U8 PL_regkind[] = {
CURLYM, /* CURLYM_B_fail */
IFMATCH, /* IFMATCH_A */
IFMATCH, /* IFMATCH_A_fail */
- CURLY, /* CURLY_B_min_known */
- CURLY, /* CURLY_B_min_known_fail */
CURLY, /* CURLY_B_min */
CURLY, /* CURLY_B_min_fail */
CURLY, /* CURLY_B_max */
@@ -645,22 +641,20 @@ EXTCONST char * const PL_reg_name[] = {
"CURLYM_B_fail", /* REGNODE_MAX +0x18 */
"IFMATCH_A", /* REGNODE_MAX +0x19 */
"IFMATCH_A_fail", /* REGNODE_MAX +0x1a */
- "CURLY_B_min_known", /* REGNODE_MAX +0x1b */
- "CURLY_B_min_known_fail", /* REGNODE_MAX +0x1c */
- "CURLY_B_min", /* REGNODE_MAX +0x1d */
- "CURLY_B_min_fail", /* REGNODE_MAX +0x1e */
- "CURLY_B_max", /* REGNODE_MAX +0x1f */
- "CURLY_B_max_fail", /* REGNODE_MAX +0x20 */
- "COMMIT_next", /* REGNODE_MAX +0x21 */
- "COMMIT_next_fail", /* REGNODE_MAX +0x22 */
- "MARKPOINT_next", /* REGNODE_MAX +0x23 */
- "MARKPOINT_next_fail", /* REGNODE_MAX +0x24 */
- "SKIP_next", /* REGNODE_MAX +0x25 */
- "SKIP_next_fail", /* REGNODE_MAX +0x26 */
- "CUTGROUP_next", /* REGNODE_MAX +0x27 */
- "CUTGROUP_next_fail", /* REGNODE_MAX +0x28 */
- "KEEPS_next", /* REGNODE_MAX +0x29 */
- "KEEPS_next_fail", /* REGNODE_MAX +0x2a */
+ "CURLY_B_min", /* REGNODE_MAX +0x1b */
+ "CURLY_B_min_fail", /* REGNODE_MAX +0x1c */
+ "CURLY_B_max", /* REGNODE_MAX +0x1d */
+ "CURLY_B_max_fail", /* REGNODE_MAX +0x1e */
+ "COMMIT_next", /* REGNODE_MAX +0x1f */
+ "COMMIT_next_fail", /* REGNODE_MAX +0x20 */
+ "MARKPOINT_next", /* REGNODE_MAX +0x21 */
+ "MARKPOINT_next_fail", /* REGNODE_MAX +0x22 */
+ "SKIP_next", /* REGNODE_MAX +0x23 */
+ "SKIP_next_fail", /* REGNODE_MAX +0x24 */
+ "CUTGROUP_next", /* REGNODE_MAX +0x25 */
+ "CUTGROUP_next_fail", /* REGNODE_MAX +0x26 */
+ "KEEPS_next", /* REGNODE_MAX +0x27 */
+ "KEEPS_next_fail", /* REGNODE_MAX +0x28 */
};
#endif /* DOINIT */
diff --git a/t/re/re_tests b/t/re/re_tests
index 0e80a6c8fb..f4747f6315 100644
--- a/t/re/re_tests
+++ b/t/re/re_tests
@@ -1143,6 +1143,11 @@ X(?<=foo.)[YZ] ..XfooXY.. y pos 8
/(a)+((?1))(fox)/ aafox y $1-$2-$3 a-a-fox
/(a){1,100}((?1))(fox)/ aafox y $1-$2-$3 a-a-fox
/(a){0,100}((?1))(fox)/ aafox y $1-$2-$3 a-a-fox
+/(a)??((?1))(fox)/ aafox y $1-$2-$3 a-a-fox
+/(a)*?((?1))(fox)/ aafox y $1-$2-$3 a-a-fox
+/(a)+?((?1))(fox)/ aafox y $1-$2-$3 a-a-fox
+/(a){1,100}?((?1))(fox)/ aafox y $1-$2-$3 a-a-fox
+/(a){0,100}?((?1))(fox)/ aafox y $1-$2-$3 a-a-fox
/(ab)?((?1))(fox)/ ababfox y $1-$2-$3 ab-ab-fox
/(ab)*((?1))(fox)/ ababfox y $1-$2-$3 ab-ab-fox
/(ab)+((?1))(fox)/ ababfox y $1-$2-$3 ab-ab-fox