diff options
author | David Mitchell <davem@iabyn.com> | 2018-08-26 21:10:16 +0100 |
---|---|---|
committer | David Mitchell <davem@iabyn.com> | 2018-08-26 21:10:16 +0100 |
commit | 1cfb6d7befa06ab0aba4adfd61117af3bf8693cb (patch) | |
tree | a48a5a82813b42d1688de0c2956cffa59e99c277 | |
parent | 42f10b3ffcb09f2c3fb2fb2600565b3fe588f922 (diff) | |
parent | 0b9dad94ed37e484db3e29d315fc26305c88f250 (diff) | |
download | perl-1cfb6d7befa06ab0aba4adfd61117af3bf8693cb.tar.gz |
[MERGE] various regex capture fixups
Improve the code and macros in S_regmatch() to make opening and closing
captures (groups) more consistent and simpler.
This shouldn't change any behaviour apart from improving the debugging
output.
-rw-r--r-- | pod/perlreapi.pod | 7 | ||||
-rw-r--r-- | pod/perlvar.pod | 15 | ||||
-rw-r--r-- | regcomp.sym | 2 | ||||
-rw-r--r-- | regexec.c | 162 | ||||
-rw-r--r-- | regexp.h | 4 | ||||
-rw-r--r-- | regnodes.h | 64 | ||||
-rw-r--r-- | t/re/re_tests | 5 |
7 files changed, 133 insertions, 126 deletions
diff --git a/pod/perlreapi.pod b/pod/perlreapi.pod index 2df337e21a..c32171a9a9 100644 --- a/pod/perlreapi.pod +++ b/pod/perlreapi.pod @@ -710,9 +710,10 @@ used in the future for all engines for optimisations. =head2 C<nparens>, C<lastparen>, and C<lastcloseparen> -These fields are used to keep track of how many paren groups could be matched -in the pattern, which was the last open paren to be entered, and which was -the last close paren to be entered. +These fields are used to keep track of: how many paren capture groups +there are in the pattern; which was the highest paren to be closed (see +L<perlvar/$+>); and which was the most recent paren to be closed (see +L<perlvar/$^N>). =head2 C<intflags> diff --git a/pod/perlvar.pod b/pod/perlvar.pod index c7b77120ef..114a7e0d12 100644 --- a/pod/perlvar.pod +++ b/pod/perlvar.pod @@ -1046,7 +1046,10 @@ This variable is read-only and dynamically-scoped. =item $+ X<$+> X<$LAST_PAREN_MATCH> -The text matched by the last bracket of the last successful search pattern. +The text matched by the highest used capture group of the last +successful search pattern. It is logically equivalent to the highest +numbered capture variable (C<$1>, C<$2>, ...) which has a defined value. + This is useful if you don't know which one of a set of alternative patterns matched. For example: @@ -1063,7 +1066,15 @@ X<$^N> X<$LAST_SUBMATCH_RESULT> The text matched by the used group most-recently closed (i.e. the group with the rightmost closing parenthesis) of the last successful search -pattern. +pattern. This is subtly different from C<$+>. For example in + + "ab" =~ /^((.)(.))$/ + +we have + + $1,$^N have the value "ab" + $2 has the value "a" + $3,$+ have the value "b" This is primarily used inside C<(?{...})> blocks for examining text recently matched. 
For example, to effectively capture text to a variable diff --git a/regcomp.sym b/regcomp.sym index 368039539f..6c20e28b8a 100644 --- a/regcomp.sym +++ b/regcomp.sym @@ -255,7 +255,7 @@ WHILEM A_pre,A_min,A_max,B_min,B_max:FAIL BRANCH next:FAIL CURLYM A,B:FAIL IFMATCH A:FAIL -CURLY B_min_known,B_min,B_max:FAIL +CURLY B_min,B_max:FAIL COMMIT next:FAIL MARKPOINT next:FAIL SKIP next:FAIL @@ -328,7 +328,34 @@ S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen _pDEPTH) ); \ regcpblow(cp) +/* set the start and end positions of capture ix */ +#define CLOSE_CAPTURE(ix, s, e) \ + rex->offs[ix].start = s; \ + rex->offs[ix].end = e; \ + if (ix > rex->lastparen) \ + rex->lastparen = ix; \ + rex->lastcloseparen = ix; \ + DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_ \ + "CLOSE: rex=0x%" UVxf " offs=0x%" UVxf ": \\%" UVuf ": set %" IVdf "..%" IVdf " max: %" UVuf "\n", \ + depth, \ + PTR2UV(rex), \ + PTR2UV(rex->offs), \ + (UV)ix, \ + (IV)rex->offs[ix].start, \ + (IV)rex->offs[ix].end, \ + (UV)rex->lastparen \ + )) + #define UNWIND_PAREN(lp, lcp) \ + DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_ \ + "UNWIND_PAREN: rex=0x%" UVxf " offs=0x%" UVxf ": invalidate (%" UVuf "..%" UVuf "] set lcp: %" UVuf "\n", \ + depth, \ + PTR2UV(rex), \ + PTR2UV(rex->offs), \ + (UV)(lp), \ + (UV)(rex->lastparen), \ + (UV)(lcp) \ + )); \ for (n = rex->lastparen; n > lp; n--) \ rex->offs[n].end = -1; \ rex->lastparen = n; \ @@ -7584,26 +7611,11 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) script_run_begin = (U8 *) locinput; break; -/* XXX really need to log other places start/end are set too */ -#define CLOSE_CAPTURE \ - rex->offs[n].start = rex->offs[n].start_tmp; \ - rex->offs[n].end = locinput - reginfo->strbeg; \ - DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_ \ - "CLOSE: rex=0x%" UVxf " offs=0x%" UVxf ": \\%" UVuf ": set %" IVdf "..%" IVdf "\n", \ - depth, \ - PTR2UV(rex), \ - PTR2UV(rex->offs), \ - (UV)n, \ - (IV)rex->offs[n].start, \ - 
(IV)rex->offs[n].end \ - )) case CLOSE: /* ) */ n = ARG(scan); /* which paren pair */ - CLOSE_CAPTURE; - if (n > rex->lastparen) - rex->lastparen = n; - rex->lastcloseparen = n; + CLOSE_CAPTURE(n, rex->offs[n].start_tmp, + locinput - reginfo->strbeg); if ( EVAL_CLOSE_PAREN_IS( cur_eval, n ) ) goto fake_end; @@ -7631,10 +7643,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) if ( OP(cursor)==CLOSE ){ n = ARG(cursor); if ( n <= lastopen ) { - CLOSE_CAPTURE; - if (n > rex->lastparen) - rex->lastparen = n; - rex->lastcloseparen = n; + CLOSE_CAPTURE(n, rex->offs[n].start_tmp, + locinput - reginfo->strbeg); if ( n == ARG(scan) || EVAL_CLOSE_PAREN_IS(cur_eval, n) ) break; } @@ -8260,14 +8270,11 @@ NULL if (ST.me->flags) { /* emulate CLOSE: mark current A as captured */ - I32 paren = ST.me->flags; + U32 paren = (U32)ST.me->flags; if (ST.count) { - rex->offs[paren].start - = HOPc(locinput, -ST.alen) - reginfo->strbeg; - rex->offs[paren].end = locinput - reginfo->strbeg; - if ((U32)paren > rex->lastparen) - rex->lastparen = paren; - rex->lastcloseparen = paren; + CLOSE_CAPTURE(paren, + HOPc(locinput, -ST.alen) - reginfo->strbeg, + locinput - reginfo->strbeg); } else rex->offs[paren].end = -1; @@ -8306,11 +8313,8 @@ NULL #define CURLY_SETPAREN(paren, success) \ if (paren) { \ if (success) { \ - rex->offs[paren].start = HOPc(locinput, -1) - reginfo->strbeg; \ - rex->offs[paren].end = locinput - reginfo->strbeg; \ - if (paren > rex->lastparen) \ - rex->lastparen = paren; \ - rex->lastcloseparen = paren; \ + CLOSE_CAPTURE(paren, HOPc(locinput, -1) - reginfo->strbeg, \ + locinput - reginfo->strbeg); \ } \ else { \ rex->offs[paren].end = -1; \ @@ -8341,12 +8345,18 @@ NULL maxopenparen = ST.paren; ST.min = ARG1(scan); /* min to match */ ST.max = ARG2(scan); /* max to match */ + scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE); + + /* handle the single-char capture called as a GOSUB etc */ if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren)) { - 
ST.min=1; - ST.max=1; + char *li = locinput; + if (!regrepeat(rex, &li, scan, reginfo, 1)) + sayNO; + SET_locinput(li); + goto fake_end; } - scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE); + goto repeat; case CURLY: /* /A{m,n}B/ where A is width 1 char */ @@ -8462,24 +8472,41 @@ NULL } NOT_REACHED; /* NOTREACHED */ - case CURLY_B_min_known_fail: - /* failed to find B in a non-greedy match where c1,c2 valid */ + case CURLY_B_min_fail: + /* failed to find B in a non-greedy match. + * Handles both cases where c1,c2 valid or not */ REGCP_UNWIND(ST.cp); if (ST.paren) { UNWIND_PAREN(ST.lastparen, ST.lastcloseparen); } - /* Couldn't or didn't -- move forward. */ - ST.oldloc = locinput; - if (utf8_target) - locinput += UTF8SKIP(locinput); - else - locinput++; - ST.count++; - curly_try_B_min_known: - /* find the next place where 'B' could work, then call B */ - { + + if (ST.c1 == CHRTEST_VOID) { + /* failed -- move forward one */ + char *li = locinput; + if (!regrepeat(rex, &li, ST.A, reginfo, 1)) { + sayNO; + } + locinput = li; + ST.count++; + if (!( ST.count <= ST.max + /* count overflow ? */ + || (ST.max == REG_INFTY && ST.count > 0)) + ) + sayNO; + } + else { int n; + /* Couldn't or didn't -- move forward. */ + ST.oldloc = locinput; + if (utf8_target) + locinput += UTF8SKIP(locinput); + else + locinput++; + ST.count++; + + curly_try_B_min_known: + /* find the next place where 'B' could work, then call B */ if (utf8_target) { n = (ST.oldloc == locinput) ? 
0 : 1; if (ST.c1 == ST.c2) { @@ -8558,47 +8585,16 @@ NULL sayNO; assert(n == REG_INFTY || locinput == li); } - CURLY_SETPAREN(ST.paren, ST.count); - if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren)) - goto fake_end; - PUSH_STATE_GOTO(CURLY_B_min_known, ST.B, locinput); } - NOT_REACHED; /* NOTREACHED */ - - case CURLY_B_min_fail: - /* failed to find B in a non-greedy match where c1,c2 invalid */ - REGCP_UNWIND(ST.cp); - if (ST.paren) { - UNWIND_PAREN(ST.lastparen, ST.lastcloseparen); - } - /* failed -- move forward one */ - { - char *li = locinput; - if (!regrepeat(rex, &li, ST.A, reginfo, 1)) { - sayNO; - } - locinput = li; - } - { - ST.count++; - if (ST.count <= ST.max || (ST.max == REG_INFTY && - ST.count > 0)) /* count overflow ? */ - { - curly_try_B_min: - CURLY_SETPAREN(ST.paren, ST.count); - if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren)) - goto fake_end; - PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput); - } - } - sayNO; + curly_try_B_min: + CURLY_SETPAREN(ST.paren, ST.count); + PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput); NOT_REACHED; /* NOTREACHED */ + curly_try_B_max: /* a successful greedy match: now try to match B */ - if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren)) - goto fake_end; { bool could_match = locinput < reginfo->strend; @@ -134,8 +134,8 @@ typedef struct regexp { * Data about the last/current match. 
These are modified during matching */ - U32 lastparen; /* last open paren matched */ - U32 lastcloseparen; /* last close paren matched */ + U32 lastparen; /* highest close paren matched ($+) */ + U32 lastcloseparen; /* last close paren matched ($^N) */ regexp_paren_pair *offs; /* Array of offsets for (@-) and (@+) */ char **recurse_locinput; /* used to detect infinite recursion, XXX: move to internal */ diff --git a/regnodes.h b/regnodes.h index 69f3e38cdb..eeb5ce9f18 100644 --- a/regnodes.h +++ b/regnodes.h @@ -7,7 +7,7 @@ /* Regops and State definitions */ #define REGNODE_MAX 97 -#define REGMATCH_STATE_MAX 139 +#define REGMATCH_STATE_MAX 137 #define END 0 /* 0000 End of program. */ #define SUCCEED 1 /* 0x01 Return from a subroutine, basically. */ @@ -136,22 +136,20 @@ #define CURLYM_B_fail (REGNODE_MAX + 24) /* state for CURLYM */ #define IFMATCH_A (REGNODE_MAX + 25) /* state for IFMATCH */ #define IFMATCH_A_fail (REGNODE_MAX + 26) /* state for IFMATCH */ -#define CURLY_B_min_known (REGNODE_MAX + 27) /* state for CURLY */ -#define CURLY_B_min_known_fail (REGNODE_MAX + 28) /* state for CURLY */ -#define CURLY_B_min (REGNODE_MAX + 29) /* state for CURLY */ -#define CURLY_B_min_fail (REGNODE_MAX + 30) /* state for CURLY */ -#define CURLY_B_max (REGNODE_MAX + 31) /* state for CURLY */ -#define CURLY_B_max_fail (REGNODE_MAX + 32) /* state for CURLY */ -#define COMMIT_next (REGNODE_MAX + 33) /* state for COMMIT */ -#define COMMIT_next_fail (REGNODE_MAX + 34) /* state for COMMIT */ -#define MARKPOINT_next (REGNODE_MAX + 35) /* state for MARKPOINT */ -#define MARKPOINT_next_fail (REGNODE_MAX + 36) /* state for MARKPOINT */ -#define SKIP_next (REGNODE_MAX + 37) /* state for SKIP */ -#define SKIP_next_fail (REGNODE_MAX + 38) /* state for SKIP */ -#define CUTGROUP_next (REGNODE_MAX + 39) /* state for CUTGROUP */ -#define CUTGROUP_next_fail (REGNODE_MAX + 40) /* state for CUTGROUP */ -#define KEEPS_next (REGNODE_MAX + 41) /* state for KEEPS */ -#define KEEPS_next_fail 
(REGNODE_MAX + 42) /* state for KEEPS */ +#define CURLY_B_min (REGNODE_MAX + 27) /* state for CURLY */ +#define CURLY_B_min_fail (REGNODE_MAX + 28) /* state for CURLY */ +#define CURLY_B_max (REGNODE_MAX + 29) /* state for CURLY */ +#define CURLY_B_max_fail (REGNODE_MAX + 30) /* state for CURLY */ +#define COMMIT_next (REGNODE_MAX + 31) /* state for COMMIT */ +#define COMMIT_next_fail (REGNODE_MAX + 32) /* state for COMMIT */ +#define MARKPOINT_next (REGNODE_MAX + 33) /* state for MARKPOINT */ +#define MARKPOINT_next_fail (REGNODE_MAX + 34) /* state for MARKPOINT */ +#define SKIP_next (REGNODE_MAX + 35) /* state for SKIP */ +#define SKIP_next_fail (REGNODE_MAX + 36) /* state for SKIP */ +#define CUTGROUP_next (REGNODE_MAX + 37) /* state for CUTGROUP */ +#define CUTGROUP_next_fail (REGNODE_MAX + 38) /* state for CUTGROUP */ +#define KEEPS_next (REGNODE_MAX + 39) /* state for KEEPS */ +#define KEEPS_next_fail (REGNODE_MAX + 40) /* state for KEEPS */ /* PL_regkind[] What type of regop or state is this. 
*/ @@ -284,8 +282,6 @@ EXTCONST U8 PL_regkind[] = { CURLYM, /* CURLYM_B_fail */ IFMATCH, /* IFMATCH_A */ IFMATCH, /* IFMATCH_A_fail */ - CURLY, /* CURLY_B_min_known */ - CURLY, /* CURLY_B_min_known_fail */ CURLY, /* CURLY_B_min */ CURLY, /* CURLY_B_min_fail */ CURLY, /* CURLY_B_max */ @@ -645,22 +641,20 @@ EXTCONST char * const PL_reg_name[] = { "CURLYM_B_fail", /* REGNODE_MAX +0x18 */ "IFMATCH_A", /* REGNODE_MAX +0x19 */ "IFMATCH_A_fail", /* REGNODE_MAX +0x1a */ - "CURLY_B_min_known", /* REGNODE_MAX +0x1b */ - "CURLY_B_min_known_fail", /* REGNODE_MAX +0x1c */ - "CURLY_B_min", /* REGNODE_MAX +0x1d */ - "CURLY_B_min_fail", /* REGNODE_MAX +0x1e */ - "CURLY_B_max", /* REGNODE_MAX +0x1f */ - "CURLY_B_max_fail", /* REGNODE_MAX +0x20 */ - "COMMIT_next", /* REGNODE_MAX +0x21 */ - "COMMIT_next_fail", /* REGNODE_MAX +0x22 */ - "MARKPOINT_next", /* REGNODE_MAX +0x23 */ - "MARKPOINT_next_fail", /* REGNODE_MAX +0x24 */ - "SKIP_next", /* REGNODE_MAX +0x25 */ - "SKIP_next_fail", /* REGNODE_MAX +0x26 */ - "CUTGROUP_next", /* REGNODE_MAX +0x27 */ - "CUTGROUP_next_fail", /* REGNODE_MAX +0x28 */ - "KEEPS_next", /* REGNODE_MAX +0x29 */ - "KEEPS_next_fail", /* REGNODE_MAX +0x2a */ + "CURLY_B_min", /* REGNODE_MAX +0x1b */ + "CURLY_B_min_fail", /* REGNODE_MAX +0x1c */ + "CURLY_B_max", /* REGNODE_MAX +0x1d */ + "CURLY_B_max_fail", /* REGNODE_MAX +0x1e */ + "COMMIT_next", /* REGNODE_MAX +0x1f */ + "COMMIT_next_fail", /* REGNODE_MAX +0x20 */ + "MARKPOINT_next", /* REGNODE_MAX +0x21 */ + "MARKPOINT_next_fail", /* REGNODE_MAX +0x22 */ + "SKIP_next", /* REGNODE_MAX +0x23 */ + "SKIP_next_fail", /* REGNODE_MAX +0x24 */ + "CUTGROUP_next", /* REGNODE_MAX +0x25 */ + "CUTGROUP_next_fail", /* REGNODE_MAX +0x26 */ + "KEEPS_next", /* REGNODE_MAX +0x27 */ + "KEEPS_next_fail", /* REGNODE_MAX +0x28 */ }; #endif /* DOINIT */ diff --git a/t/re/re_tests b/t/re/re_tests index 0e80a6c8fb..f4747f6315 100644 --- a/t/re/re_tests +++ b/t/re/re_tests @@ -1143,6 +1143,11 @@ X(?<=foo.)[YZ] ..XfooXY.. 
y pos 8 /(a)+((?1))(fox)/ aafox y $1-$2-$3 a-a-fox /(a){1,100}((?1))(fox)/ aafox y $1-$2-$3 a-a-fox /(a){0,100}((?1))(fox)/ aafox y $1-$2-$3 a-a-fox +/(a)??((?1))(fox)/ aafox y $1-$2-$3 a-a-fox +/(a)*?((?1))(fox)/ aafox y $1-$2-$3 a-a-fox +/(a)+?((?1))(fox)/ aafox y $1-$2-$3 a-a-fox +/(a){1,100}?((?1))(fox)/ aafox y $1-$2-$3 a-a-fox +/(a){0,100}?((?1))(fox)/ aafox y $1-$2-$3 a-a-fox /(ab)?((?1))(fox)/ ababfox y $1-$2-$3 ab-ab-fox /(ab)*((?1))(fox)/ ababfox y $1-$2-$3 ab-ab-fox /(ab)+((?1))(fox)/ ababfox y $1-$2-$3 ab-ab-fox |