diff options
-rw-r--r-- | pod/perldebguts.pod | 8 | ||||
-rw-r--r-- | pp_ctl.c | 4 | ||||
-rw-r--r-- | regcomp.c | 13 | ||||
-rw-r--r-- | regcomp.h | 24 | ||||
-rw-r--r-- | regcomp.sym | 8 | ||||
-rw-r--r-- | regcomp_debug.c | 2 | ||||
-rw-r--r-- | regexec.c | 55 | ||||
-rw-r--r-- | regexp.h | 11 | ||||
-rw-r--r-- | regnodes.h | 8 | ||||
-rw-r--r-- | t/re/pat.t | 32 | ||||
-rw-r--r-- | t/re/re_tests | 10 |
11 files changed, 132 insertions, 43 deletions
diff --git a/pod/perldebguts.pod b/pod/perldebguts.pod index 19d2a7d562..42db53944f 100644 --- a/pod/perldebguts.pod +++ b/pod/perldebguts.pod @@ -752,13 +752,13 @@ will be lost. PLUS node Match this (simple) thing 1 or more times: /A{1,}B/ where A is width 1 char - CURLY sv 2 Match this (simple) thing {n,m} times: + CURLY sv 4 Match this (simple) thing {n,m} times: /A{m,n}B/ where A is width 1 char - CURLYN no 2 Capture next-after-this simple thing: + CURLYN no 4 Capture next-after-this simple thing: /(A){m,n}B/ where A is width 1 char - CURLYM no 2 Capture this medium-complex thing {n,m} + CURLYM no 4 Capture this medium-complex thing {n,m} times: /(A){m,n}B/ where A is fixed-length - CURLYX sv 2 Match/Capture this complex thing {n,m} + CURLYX sv 4 Match/Capture this complex thing {n,m} times. # This terminator creates a loop structure for CURLYX @@ -381,9 +381,9 @@ Perl_rxres_save(pTHX_ void **rsp, REGEXP *rx) if (!p || p[1] < RX_NPARENS(rx)) { #ifdef PERL_ANY_COW - i = 7 + (RX_NPARENS(rx)+1) * 2; + i = 7 + (RX_NPARENS(rx)+1) * 4; #else - i = 6 + (RX_NPARENS(rx)+1) * 2; + i = 6 + (RX_NPARENS(rx)+1) * 4; #endif if (!p) Newx(p, i, UV); @@ -4603,6 +4603,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) const char * const origparse = RExC_parse; I32 min; I32 max = REG_INFTY; + I32 npar_before = RExC_npar-1; /* Save the original in case we change the emitted regop to a FAIL. */ const regnode_offset orig_emit = RExC_emit; @@ -4618,6 +4619,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) RETURN_FAIL_ON_RESTART_OR_FLAGS(flags, flagp, TRYAGAIN); FAIL2("panic: regatom returned failure, flags=%#" UVxf, (UV) flags); } + I32 npar_after = RExC_npar-1; op = *RExC_parse; switch (op) { @@ -4783,6 +4785,17 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) ARG1_SET(REGNODE_p(ret), min); ARG2_SET(REGNODE_p(ret), max); + /* if we had a npar_after then we need to increment npar_before, + * we want to track the range of parens we need to reset each iteration + */ + if (npar_after!=npar_before) { + ARG3_SET(REGNODE_p(ret), (U16)npar_before+1); + ARG4_SET(REGNODE_p(ret), (U16)npar_after); + } else { + ARG3_SET(REGNODE_p(ret), 0); + ARG4_SET(REGNODE_p(ret), 0); + } + done_main_op: /* Process any greediness modifiers */ @@ -217,7 +217,10 @@ struct regnode_2L { I32 arg2; }; -/* 'Two field' -- Two 32 bit signed args */ +/* 'Two field' -- Two 32 bit signed args. + * First fields must match regnode. Currently unused except to + * facilitate regnode_4 behavior. Not simplifying that as this + * node type could still be useful for other regops. */ struct regnode_2 { U8 flags; U8 type; @@ -226,6 +229,19 @@ struct regnode_2 { I32 arg2; }; +/* 'Four field' -- Two 32 bit signed args, Two 16 bit unsigned args + * Used for CURLY and CURLYX node types to track min/max and + * first_paren/last_paren. First fields must match regnode_2 */ +struct regnode_4 { + U8 flags; + U8 type; + U16 next_off; + I32 arg1; + I32 arg2; + U16 arg3; + U16 arg4; +}; + #define REGNODE_BBM_BITMAP_LEN \ /* 6 info bits requires 64 bits; 5 => 32 */ \ ((1 << (UTF_CONTINUATION_BYTE_INFO_BITS)) / CHARBITS) @@ -347,11 +363,15 @@ struct regnode_ssc { #define ARGp(p) ARGp_VALUE_inline(p) #define ARG1(p) ARG_VALUE(ARG1_LOC(p)) #define ARG2(p) ARG_VALUE(ARG2_LOC(p)) +#define ARG3(p) ARG_VALUE(ARG3_LOC(p)) +#define ARG4(p) ARG_VALUE(ARG4_LOC(p)) #define ARG2L(p) ARG_VALUE(ARG2L_LOC(p)) #define ARG_SET(p, val) ARG__SET(ARG_LOC(p), (val)) #define ARG1_SET(p, val) ARG__SET(ARG1_LOC(p), (val)) #define ARG2_SET(p, val) ARG__SET(ARG2_LOC(p), (val)) +#define ARG3_SET(p, val) ARG__SET(ARG3_LOC(p), (val)) +#define ARG4_SET(p, val) ARG__SET(ARG4_LOC(p), (val)) #define ARG2L_SET(p, val) ARG__SET(ARG2L_LOC(p), (val)) #define ARGp_SET(p, val) ARGp_SET_inline((p),(val)) @@ -437,6 +457,8 @@ struct regnode_ssc { #define ARGp_BYTES_LOC(p) (((struct regnode_p *)p)->arg1_sv_ptr_bytes) #define ARG1_LOC(p) (((struct regnode_2 *)p)->arg1) #define ARG2_LOC(p) (((struct regnode_2 *)p)->arg2) +#define ARG3_LOC(p) (((struct regnode_4 *)p)->arg3) +#define ARG4_LOC(p) (((struct regnode_4 *)p)->arg4) #define ARG2L_LOC(p) (((struct regnode_2L *)p)->arg2) diff --git a/regcomp.sym b/regcomp.sym index ddc8397daf..c0735aada9 100644 --- a/regcomp.sym +++ b/regcomp.sym @@ -217,10 +217,10 @@ TAIL NOTHING, no ; Match empty string. Can jump here from outsi STAR STAR, node 0 V ; Match this (simple) thing 0 or more times: /A{0,}B/ where A is width 1 char PLUS PLUS, node 0 V ; Match this (simple) thing 1 or more times: /A{1,}B/ where A is width 1 char -CURLY CURLY, sv 2 V ; Match this (simple) thing {n,m} times: /A{m,n}B/ where A is width 1 char -CURLYN CURLY, no 2 V ; Capture next-after-this simple thing: /(A){m,n}B/ where A is width 1 char -CURLYM CURLY, no 2 V ; Capture this medium-complex thing {n,m} times: /(A){m,n}B/ where A is fixed-length -CURLYX CURLY, sv 2 V ; Match/Capture this complex thing {n,m} times. +CURLY CURLY, sv 4 V ; Match this (simple) thing {n,m} times: /A{m,n}B/ where A is width 1 char +CURLYN CURLY, no 4 V ; Capture next-after-this simple thing: /(A){m,n}B/ where A is width 1 char +CURLYM CURLY, no 4 V ; Capture this medium-complex thing {n,m} times: /(A){m,n}B/ where A is fixed-length +CURLYX CURLY, sv 4 V ; Match/Capture this complex thing {n,m} times. #*This terminator creates a loop structure for CURLYX WHILEM WHILEM, no 0 V ; Do curly processing and see if rest matches. diff --git a/regcomp_debug.c b/regcomp_debug.c index b90fe92532..6410f5e2da 100644 --- a/regcomp_debug.c +++ b/regcomp_debug.c @@ -464,6 +464,8 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_ } } else if (k == CURLY) { U32 lo = ARG1(o), hi = ARG2(o); + if (ARG3(o) || ARG4(o)) + Perl_sv_catpvf(aTHX_ sv, "<%d:%d>", ARG3(o),ARG4(o)); /* paren before, paren after */ if (op == CURLYM || op == CURLYN || op == CURLYX) Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */ Perl_sv_catpvf(aTHX_ sv, "{%u,", (unsigned) lo); @@ -273,12 +273,12 @@ S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen _pDEPTH) I32 p; for (p = parenfloor + 1; p <= (I32)maxopenparen; p++) { Perl_re_exec_indentf(aTHX_ - " \\%" UVuf ": %" IVdf "(%" IVdf ")..%" IVdf "\n", + " \\%" UVuf " std %" IVdf " .. %" IVdf " tmp %" IVdf " (regcppush)\n", depth, (UV)p, (IV)rex->offs[p].start, - (IV)rex->offs[p].start_tmp, - (IV)rex->offs[p].end + (IV)rex->offs[p].end, + (IV)rex->offs[p].start_tmp ); } }); @@ -315,7 +315,7 @@ S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen _pDEPTH) /* set the start and end positions of capture ix */ #define CLOSE_ANY_CAPTURE(rex, ix, s, e) \ - (rex)->offs[(ix)].start = (s); \ + (rex)->offs[(ix)].start = (s); \ (rex)->offs[(ix)].end = (e) #define CLOSE_CAPTURE(rex, ix, s, e) \ @@ -344,8 +344,9 @@ S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen _pDEPTH) (UV)(rex->lastparen), \ (UV)(lcp) \ )); \ - for (n = rex->lastparen; n > lp; n--) \ + for (n = rex->lastparen; n > lp; n--) { \ rex->offs[n].end = -1; \ + } \ rex->lastparen = n; \ rex->lastcloseparen = lcp; @@ -399,12 +400,12 @@ S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p _pDEPTH) DEBUG_BUFFERS_r( for (; paren <= *maxopenparen_p; ++paren) { Perl_re_exec_indentf(aTHX_ - " \\%" UVuf ": %" IVdf "(%" IVdf ")..%" IVdf "%s\n", + " \\%" UVuf " std %" IVdf " .. %" IVdf " tmp %" IVdf "%s (regcppop)\n", depth, (UV)paren, (IV)rex->offs[paren].start, - (IV)rex->offs[paren].start_tmp, (IV)rex->offs[paren].end, + (IV)rex->offs[paren].start_tmp, (paren > rex->lastparen ? "(skipped)" : "")); } ); @@ -419,11 +420,12 @@ S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p _pDEPTH) * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/ * --jhi updated by dapm */ for (i = rex->lastparen + 1; i <= rex->nparens; i++) { - if (i > *maxopenparen_p) + if (i > *maxopenparen_p) { rex->offs[i].start = -1; + } rex->offs[i].end = -1; DEBUG_BUFFERS_r( Perl_re_exec_indentf( aTHX_ - " \\%" UVuf ": %s ..-1 undeffing\n", + " \\%" UVuf ": %s ..-1 undeffing (regcppop)\n", depth, (UV)i, (i > *maxopenparen_p) ? "-1" : " " @@ -3491,14 +3493,15 @@ S_reg_set_capture_string(pTHX_ REGEXP * const rx, && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */ && !(PL_sawampersand & SAWAMPERSAND_RIGHT) ) { /* don't copy $' part of string */ + SSize_t offs_end; U32 n = 0; max = -1; /* calculate the right-most part of the string covered * by a capture. Due to lookahead, this may be to * the right of $&, so we have to scan all captures */ while (n <= prog->lastparen) { - if (prog->offs[n].end > max) - max = prog->offs[n].end; + if ((offs_end = RXp_OFFS_END(prog,n)) > max) + max = offs_end; n++; } if (max == -1) @@ -6523,7 +6526,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) case KEEPS: /* \K */ /* update the startpoint */ - st->u.keeper.val = rex->offs[0].start; + st->u.keeper.val = RXp_OFFS_START(rex,0); rex->offs[0].start = locinput - reginfo->strbeg; PUSH_STATE_GOTO(KEEPS_next, next, locinput, loceol, script_run_begin); @@ -7928,21 +7931,29 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) if (rex->logical_to_parno) { n = rex->logical_to_parno[n]; do { - if (rex->lastparen < n || rex->offs[n].start == -1 || rex->offs[n].end == -1) { + if ( rex->lastparen < n || + RXp_OFFS_START(rex,n) == -1 || + RXp_OFFS_END(rex,n) == -1 + ) { n = rex->parno_to_logical_next[n]; } else { break; } } while(n); - if (!n) sayNO; + + if (!n) /* this means there is nothing that matched */ + sayNO; } do_nref_ref_common: + reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */ + if (rex->lastparen < n) + sayNO; + ln = rex->offs[n].start; endref = rex->offs[n].end; - reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */ - if (rex->lastparen < n || ln == -1 || endref == -1) + if (ln == -1 || endref == -1) sayNO; /* Do not match unless seen CLOSEn. */ if (ln == endref) break; @@ -8727,19 +8738,21 @@ NULL /* see the discussion above about CURLYX/WHILEM */ I32 n; int min, max; + // U16 first_paren, last_paren; regnode *A; assert(cur_curlyx); /* keep Coverity happy */ min = ARG1(cur_curlyx->u.curlyx.me); max = ARG2(cur_curlyx->u.curlyx.me); + // first_paren = ARG3(cur_curlyx->u.curlyx.me); + // last_paren = ARG4(cur_curlyx->u.curlyx.me); A = REGNODE_AFTER(cur_curlyx->u.curlyx.me); n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */ ST.save_lastloc = cur_curlyx->u.curlyx.lastloc; ST.cache_offset = 0; ST.cache_mask = 0; - DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_ "WHILEM: matched %ld out of %d..%d\n", depth, (long)n, min, max) ); @@ -12145,7 +12158,7 @@ Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv, const I32 paren) { struct regexp *const rx = ReANY(r); - I32 i; + I32 i,j; I32 s1, t1; I32 logical_nparens = rx->logical_nparens ? rx->logical_nparens : rx->nparens; @@ -12184,10 +12197,10 @@ Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv, case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */ case RX_BUFF_IDX_POSTMATCH: /* $' */ - if ( (i = RXp_OFFS_END(rx,0)) != -1 ) { - i = rx->sublen - i; + if ( (j = RXp_OFFS_END(rx,0)) != -1 ) { + i = rx->sublen - j; if (i > 0) { - s1 = rx->offs[0].end; + s1 = j; t1 = rx->sublen; goto getlen; } @@ -67,6 +67,7 @@ struct reg_substr_data { typedef struct regexp_paren_pair { SSize_t start; SSize_t end; + /* 'start_tmp' records a new opening position before the matching end * has been found, so that the old start and end values are still * valid, e.g. @@ -184,14 +185,16 @@ typedef struct regexp { } regexp; -#define RXp_PAREN_NAMES(rx) ((rx)->paren_names) +#define RXp_PAREN_NAMES(rx) ((rx)->paren_names) -#define RXp_OFFS_START(rx,n) ((rx)->offs[(n)].start) +#define RXp_OFFS_START(rx,n) \ + (rx)->offs[(n)].start -#define RXp_OFFS_END(rx,n) ((rx)->offs[(n)].end) +#define RXp_OFFS_END(rx,n) \ + (rx)->offs[(n)].end #define RXp_OFFS_VALID(rx,n) \ - ( (rx)->offs[(n)].end != -1 && (rx)->offs[(n)].start != -1 ) + ((rx)->offs[(n)].end >= 0 && (rx)->offs[(n)].start >= 0 ) #define RX_OFFS_START(rx_sv,n) RXp_OFFS_START(ReANY(rx_sv),n) #define RX_OFFS_END(rx_sv,n) RXp_OFFS_END(ReANY(rx_sv),n) diff --git a/regnodes.h b/regnodes.h index cc225b3ec2..b45c1261f5 100644 --- a/regnodes.h +++ b/regnodes.h @@ -34,10 +34,10 @@ typedef struct regnode_1 tregnode_BRANCHJ; typedef struct regnode_1 tregnode_CLOSE; typedef struct regnode tregnode_CLUMP; typedef struct regnode_1 tregnode_COMMIT; -typedef struct regnode_2 tregnode_CURLY; -typedef struct regnode_2 tregnode_CURLYM; -typedef struct regnode_2 tregnode_CURLYN; -typedef struct regnode_2 tregnode_CURLYX; +typedef struct regnode_4 tregnode_CURLY; +typedef struct regnode_4 tregnode_CURLYM; +typedef struct regnode_4 tregnode_CURLYN; +typedef struct regnode_4 tregnode_CURLYX; typedef struct regnode_1 tregnode_CUTGROUP; typedef struct regnode_1 tregnode_DEFINEP; typedef struct regnode tregnode_END; diff --git a/t/re/pat.t b/t/re/pat.t index c494434675..b837157c42 100644 --- a/t/re/pat.t +++ b/t/re/pat.t @@ -27,7 +27,7 @@ skip_all_without_unicode_tables(); my $has_locales = locales_enabled('LC_CTYPE'); -plan tests => 1231; # Update this when adding/deleting tests. +plan tests => 1240; # Update this when adding/deleting tests. run_tests() unless caller; @@ -2426,6 +2426,36 @@ SKIP: print "ok"; }, 'ok', {}, 'gh20826: test regex save stack overflow'); } + { + local $::TODO = "Not Yet Implemented"; + my ($x, $y); + ok( "aaa" =~ /(?:(a)?\1)+/, + "GH Issue #18865 'aaa' - pattern matches"); + $x = "($-[0],$+[0])"; + ok( "aaa" =~ /(?:((?{})a)?\1)+/, + "GH Issue #18865 'aaa' - deoptimized pattern matches"); + $y = "($-[0],$+[0])"; + is( $y, $x, + "GH Issue #18865 'aaa' - test optimization"); + + ok( "ababab" =~ /(?:(?:(ab))?\1)+/, + "GH Issue #18865 'ababab' - pattern matches"); + $x = "($-[0],$+[0])"; + ok( "ababab" =~ /(?:(?:((?{})ab))?\1)+/, + "GH Issue #18865 'ababab' - deoptimized pattern matches"); + $y = "($-[0],$+[0])"; + is( $y, $x, + "GH Issue #18865 'ababab' - test optimization"); + + ok( "XaaXbbXb" =~ /(?:X([ab])?\1)+/, + "GH Issue #18865 'XaaXbbXb' - pattern matches"); + $x = "($-[0],$+[0])"; + ok( "XaaXbbXb" =~ /(?:X((?{})[ab])?\1)+/, + "GH Issue #18865 'XaaXbbXb' - deoptimized pattern matches"); + $y = "($-[0],$+[0])"; + is( $y, $x, + "GH Issue #18865 'XaaXbbXb' - test optimization"); + } } # End of sub run_tests 1; diff --git a/t/re/re_tests b/t/re/re_tests index 2afc639313..7379a39787 100644 --- a/t/re/re_tests +++ b/t/re/re_tests @@ -478,7 +478,7 @@ a(?:b|c|d)+(.) acdbcdbe y $1 e a(?:b|c|d){2}(.) acdbcdbe y $1 b a(?:b|c|d){4,5}(.) acdbcdbe y $1 b a(?:b|c|d){4,5}?(.) acdbcdbe y $1 d -((foo)|(bar))* foobar y $1-$2-$3 bar-foo-bar +((foo)|(bar))* foobar Ty $1-$2-$3 bar--bar # was bar-foo-bar prior to 5.37.7 :(?: - c - Sequence (? incomplete a(?:b|c|d){6,7}(.) acdbcdbe y $1 e a(?:b|c|d){6,7}?(.) acdbcdbe y $1 e @@ -501,7 +501,7 @@ a(?:b|(c|e){1,2}?|d)+?(.) ace y $1$2 ce ((a{4})+) aaaaaaaaa y $1 aaaaaaaa (((aa){2})+) aaaaaaaaaa y $1 aaaaaaaa (((a{2}){2})+) aaaaaaaaaa y $1 aaaaaaaa -(?:(f)(o)(o)|(b)(a)(r))* foobar y $1:$2:$3:$4:$5:$6 f:o:o:b:a:r +(?:(f)(o)(o)|(b)(a)(r))* foobar Ty $1:$2:$3:$4:$5:$6 :::b:a:r (?<=a)b ab y $& b (?<=af?)b ab y $& b (?<=a)b cb n - - @@ -2126,6 +2126,7 @@ AB\s+\x{100} AB \x{100}X y - - ((?|(?<a>a)(?-1)|(?<b>b)(?-1)|(?<c>c)(?-1))) aa y $1 aa # GH 20653 ((?|(?<a>a)(?-1)|(?<b>b)(?-1)|(?<c>c)(?-1))) bb y $1 bb # GH 20653 ((?|(?<a>a)(?-1)|(?<b>b)(?-1)|(?<c>c)(?-1))) cc y $1 cc # GH 20653 + (?|(a)|(b)) b y $+ b # GH 20912 (?|(a)(?{$::plus_got=$+})|(b)(?{$::plus_got=$+})) b y $::plus_got b # GH 20912 (?|(a)|(b)) b y $^N b # GH 20912 @@ -2134,6 +2135,11 @@ AB\s+\x{100} AB \x{100}X y - - (?|(a)(?{$::plus_got=$+})|(b)(?{$::plus_got=$+})) a y $::plus_got a # GH 20912 (?|(a)|(b)) a y $^N a # GH 20912 (?|(a)(?{$::caret_n_got=$^N})|(b)(?{$::caret_n_got=$^N})) a y $::caret_n_got a # GH 20912 + +/(([ab]+)|([cd]+)|([ef]+))+/ ace y $1-$2-$3-$4=$& e---e=ace +/(([ab]+)|([cd]+)|([ef]+))+/ aceb Ty $1-$2-$3-$4=$& b-b--=aceb +/(([ab]+)|([cd]+)|([ef]+))+/ acebd Ty $1-$2-$3-$4=$& d--d-=acebd +/(([ab]+)|([cd]+)|([ef]+))+/ acebdf Ty $1-$2-$3-$4=$& f---f=acebdf # Keep these lines at the end of the file # pat string y/n/etc expr expected-expr skip-reason comment # vim: softtabstop=0 noexpandtab |