diff options
-rw-r--r-- | pod/perldebguts.pod | 10 | ||||
-rw-r--r-- | pod/perldelta.pod | 12 | ||||
-rw-r--r-- | pod/perldiag.pod | 7 | ||||
-rw-r--r-- | pod/perlre.pod | 72 | ||||
-rw-r--r-- | regcomp.c | 49 | ||||
-rw-r--r-- | regcomp.sym | 4 | ||||
-rw-r--r-- | regexec.c | 90 | ||||
-rw-r--r-- | regexp.h | 3 | ||||
-rw-r--r-- | regnodes.h | 4 | ||||
-rw-r--r-- | t/re/alpha_assertions.t | 1 | ||||
-rw-r--r-- | t/re/pat.t | 4 | ||||
-rw-r--r-- | t/re/re_tests | 14 | ||||
-rw-r--r-- | t/re/reg_mesg.t | 4 | ||||
-rw-r--r-- | t/re/regexp.t | 1 |
14 files changed, 214 insertions, 61 deletions
diff --git a/pod/perldebguts.pod b/pod/perldebguts.pod index ff2eaed89b..c0bcc1570b 100644 --- a/pod/perldebguts.pod +++ b/pod/perldebguts.pod @@ -746,11 +746,13 @@ will be lost. # Special Case Regops IFMATCH off 1 1 Succeeds if the following matches; non-zero - flags "f" means lookbehind assertion - starting "f" characters before current + flags "f", next_off "o" means lookbehind + assertion starting "f..(f-o)" characters + before current UNLESSM off 1 1 Fails if the following matches; non-zero - flags "f" means lookbehind assertion - starting "f" characters before current + flags "f", next_off "o" means lookbehind + assertion starting "f..(f-o)" characters + before current SUSPEND off 1 1 "Independent" sub-RE. IFTHEN off 1 1 Switch, should be preceded by switcher. GROUPP num 1 Whether the group matched. diff --git a/pod/perldelta.pod b/pod/perldelta.pod index 68f4ba9fac..ece46e6757 100644 --- a/pod/perldelta.pod +++ b/pod/perldelta.pod @@ -75,6 +75,18 @@ L</Selected Bug Fixes> section. [ List each security issue as a =head2 entry ] +=head2 Variable length lookbehind in regular expression pattern matching +is now experimentally supported + +Using a lookbehind assertion (like C<(?<=foo)> or C<(?<!bar)> previously +would generate an error and refuse to compile. Now it compiles (if the +maximum lookbehind is at most 255 characters), but raises a warning in +the new C<experimental::vlb> warnings category. This is to caution you +that the precise behavior is subject to change based on feedback from +use in the field. + +See L<perlre/(?<=pattern)> and L<perlre/(?<!pattern)>. + =head1 Incompatible Changes XXX For a release on a stable branch, this section aspires to be: diff --git a/pod/perldiag.pod b/pod/perldiag.pod index ec1edb6e70..4b618cf7ac 100644 --- a/pod/perldiag.pod +++ b/pod/perldiag.pod @@ -7504,7 +7504,12 @@ front of your variable. =item Variable length lookbehind not implemented in regex m/%s/ -(F) Lookbehind is allowed only for subexpressions whose length is fixed and +(F) B<This message no longer should be raised as of Perl 5.30.> It is +retained in this document as a convenience for people using an earlier +Perl version. + +In Perl 5.30 and earlier, lookbehind is allowed +only for subexpressions whose length is fixed and known at compile time. For positive lookbehind, you can use the C<\K> regex construct as a way to get the equivalent functionality. See L<(?<=pattern) and \K in perlre|perlre/\K>. diff --git a/pod/perlre.pod b/pod/perlre.pod index af66136d49..f9ea161700 100644 --- a/pod/perlre.pod +++ b/pod/perlre.pod @@ -1629,17 +1629,36 @@ X<look-behind, positive> X<lookbehind, positive> X<\K> A zero-width positive lookbehind assertion. For example, C</(?<=\t)\w+/> matches a word that follows a tab, without including the tab in C<$&>. -Works only for fixed-width lookbehind of up to 255 characters. Note -that a compilation error will be generated if the assertion contains a -multi-character match under C</i>, as that could match a single -character, or it could match two or three, and that makes it variable -length, which is forbidden. -However, there is a special form of this construct, called C<\K> +Prior to Perl 5.30, it worked only for fixed-width lookbehind, but +starting in that release, it can handle variable lengths from 1 to 255 +characters as an experimental feature. The feature is enabled +automatically if you use a variable length lookbehind assertion, but +will raise a warning at pattern compilation time, unless turned off, in +the C<experimental::vlb> category. This is to warn you that the exact +behavior is subject to change should feedback from actual use in the +field indicate to do so; or even complete removal if the problems found +are not practically surmountable. You can achieve close to pre-5.30 +behavior by fatalizing warnings in this category. + +There is a special form of this construct, called C<\K> (available since Perl 5.10.0), which causes the regex engine to "keep" everything it had matched prior to the C<\K> and -not include it in C<$&>. This effectively provides variable-length -lookbehind. +not include it in C<$&>. This effectively provides non-experimental +variable-length lookbehind of any length. + +And, there is a technique that can be used to handle variable length +lookbehinds on earlier releases, and longer than 255 characters. It is +described in +L<http://www.drregex.com/2019/02/variable-length-lookbehinds-actually.html>. + +Note that under C</i>, a few single characters match two or three other +characters. This makes them variable length, and the 255 length applies +to the maximum number of characters in the match. For +example C<qr/\N{LATIN SMALL LETTER SHARP S}/i> matches the sequence +C<"ss">. Your lookbehind assertion could contain 127 Sharp S +characters under C</i>, but adding a 128th would generate a compilation +error, as that could match 256 C<"s"> characters in a row. The use of C<\K> inside of another lookaround assertion is allowed, but the behaviour is currently not well defined. @@ -1655,6 +1674,9 @@ can be rewritten as the much more efficient s/foo\Kbar//g; +Use of the non-greedy modifier C<"?"> may not give you the expected +results if it is within a capturing group within the construct. + The alphabetic forms (not including C<\K> are experimental; using them yields a warning in the C<experimental::alpha_assertions> category. @@ -1669,15 +1691,35 @@ X<(*negative_lookbehind> X<look-behind, negative> X<lookbehind, negative> A zero-width negative lookbehind assertion. For example C</(?<!bar)foo/> -matches any occurrence of "foo" that does not follow "bar". Works -only for fixed-width lookbehind of up to 255 characters. Note that a -compilation error will be generated if the assertion contains a -multi-character match under C</i>, as that could match a single -character, or it could match two or three, and that makes it variable -length, which is forbidden. However, there is a technique that can be -used to handle variable length lookbehinds. It is described in +matches any occurrence of "foo" that does not follow "bar". + +Prior to Perl 5.30, it worked only for fixed-width lookbehind, but +starting in that release, it can handle variable lengths from 1 to 255 +characters as an experimental feature. The feature is enabled +automatically if you use a variable length lookbehind assertion, but +will raise a warning at pattern compilation time, unless turned off, in +the C<experimental::vlb> category. This is to warn you that the exact +behavior is subject to change should feedback from actual use in the +field indicate to do so; or even complete removal if the problems found +are not practically surmountable. You can achieve close to pre-5.30 +behavior by fatalizing warnings in this category. + +There is a technique that can be used to handle variable length +lookbehinds on earlier releases, and longer than 255 characters. It is +described in L<http://www.drregex.com/2019/02/variable-length-lookbehinds-actually.html>. +Note that under C</i>, a few single characters match two or three other +characters. This makes them variable length, and the 255 length applies +to the maximum number of characters in the match. For +example C<qr/\N{LATIN SMALL LETTER SHARP S}/i> matches the sequence +C<"ss">. Your lookbehind assertion could contain 127 Sharp S +characters under C</i>, but adding a 128th would generate a compilation +error, as that could match 256 C<"s"> characters in a row. + +Use of the non-greedy modifier C<"?"> may not give you the expected +results if it is within a capturing group within the construct. + The alphabetic forms are experimental; using them yields a warning in the C<experimental::alpha_assertions> category. @@ -5986,14 +5986,27 @@ Perl_re_printf( aTHX_ "LHS=%" UVuf " RHS=%" UVuf "\n", last, &data_fake, stopparen, recursed_depth, NULL, f, depth+1); if (scan->flags) { - if (deltanext) { - FAIL("Variable length lookbehind not implemented"); - } - else if (minnext > (I32)U8_MAX) { + if ( deltanext < 0 + || deltanext > (I32) U8_MAX + || minnext > (I32)U8_MAX + || minnext + deltanext > (I32)U8_MAX) + { FAIL2("Lookbehind longer than %" UVuf " not implemented", (UV)U8_MAX); } - scan->flags = (U8)minnext; + + /* The 'next_off' field has been repurposed to count the + * additional starting positions to try beyond the initial + * one. (This leaves it at 0 for non-variable length + * matches to avoid breakage for those not using this + * extension) */ + if (deltanext) { + scan->next_off = deltanext; + ckWARNexperimental(RExC_parse, + WARN_EXPERIMENTAL__VLB, + "Variable length lookbehind is experimental"); + } + scan->flags = (U8)minnext + deltanext; } if (data) { if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) @@ -6078,14 +6091,21 @@ Perl_re_printf( aTHX_ "LHS=%" UVuf " RHS=%" UVuf "\n", stopparen, recursed_depth, NULL, f, depth+1); if (scan->flags) { - if (deltanext) { - FAIL("Variable length lookbehind not implemented"); - } - else if (*minnextp > (I32)U8_MAX) { + assert(0); /* This code has never been tested since this + is normally not compiled */ + if ( deltanext < 0 + || deltanext > (I32) U8_MAX + || *minnextp > (I32)U8_MAX + || *minnextp + deltanext > (I32)U8_MAX) + { FAIL2("Lookbehind longer than %" UVuf " not implemented", (UV)U8_MAX); } - scan->flags = (U8)*minnextp; + + if (deltanext) { + scan->next_off = deltanext; + } + scan->flags = (U8)*minnextp + deltanext; } *minnextp += min; @@ -20432,8 +20452,13 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_ assert(FLAGS(o) < C_ARRAY_LENGTH(bounds)); sv_catpv(sv, bounds[FLAGS(o)]); } - else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH)) - Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags)); + else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH)) { + Perl_sv_catpvf(aTHX_ sv, "[%d", -(o->flags)); + if (o->next_off) { + Perl_sv_catpvf(aTHX_ sv, "..-%d", o->flags - o->next_off); + } + Perl_sv_catpvf(aTHX_ sv, "]"); + } else if (OP(o) == SBOL) Perl_sv_catpvf(aTHX_ sv, " /%s/", o->flags ? "\\A" : "^"); diff --git a/regcomp.sym b/regcomp.sym index 4b9a42c338..11cf43f429 100644 --- a/regcomp.sym +++ b/regcomp.sym @@ -177,8 +177,8 @@ LONGJMP LONGJMP, off 1 . 1 ; Jump far away. BRANCHJ BRANCHJ, off 1 V 1 ; BRANCH with long offset. #*Special Case Regops -IFMATCH BRANCHJ, off 1 . 1 ; Succeeds if the following matches; non-zero flags "f" means lookbehind assertion starting "f" characters before current -UNLESSM BRANCHJ, off 1 . 1 ; Fails if the following matches; non-zero flags "f" means lookbehind assertion starting "f" characters before current +IFMATCH BRANCHJ, off 1 . 1 ; Succeeds if the following matches; non-zero flags "f", next_off "o" means lookbehind assertion starting "f..(f-o)" characters before current +UNLESSM BRANCHJ, off 1 . 1 ; Fails if the following matches; non-zero flags "f", next_off "o" means lookbehind assertion starting "f..(f-o)" characters before current SUSPEND BRANCHJ, off 1 V 1 ; "Independent" sub-RE. IFTHEN BRANCHJ, off 1 V 1 ; Switch, should be preceded by switcher. GROUPP GROUPP, num 1 ; Whether the group matched. @@ -5626,6 +5626,16 @@ allocated, and is never freed until interpreter destruction. When the slab is full, a new one is allocated and chained to the end. At exit from regmatch(), slabs allocated since entry are freed. +In order to work with variable length lookbehinds, an upper limit is placed on +lookbehinds which is set to where the match position is at the end of where the +lookbehind would get to. Nothing in the lookbehind should match above that, +except we should be able to look beyond if for things like \b, which need the +next character in the string to be able to determine if this is a boundary or +not. We also can't match the end of string/line unless we are also at the end +of the entire string, so NEXTCHR_IS_EOS remains the same, and for those OPs +that match a width, we have to add a condition that they are within the legal +bounds of our window into the string. + */ /* returns -1 on failure, $+[0] on success */ @@ -8673,12 +8683,11 @@ NULL #undef ST #define ST st->u.ifmatch - { - char *newstart; - case SUSPEND: /* (?>A) */ ST.wanted = 1; - newstart = locinput; + ST.start = locinput; + ST.end = loceol; + ST.count = 1; goto do_ifmatch; case UNLESSM: /* -ve lookaround: (?!A), or with 'flags', (?<!A) */ @@ -8688,25 +8697,48 @@ NULL case IFMATCH: /* +ve lookaround: (?=A), or with 'flags', (?<=A) */ ST.wanted = 1; ifmatch_trivial_fail_test: - if (scan->flags) { - char * const s = HOPBACKc(locinput, scan->flags); - if (!s) { - /* trivial fail */ + ST.count = scan->next_off + 1; /* next_off repurposed to be + lookbehind count, requires + non-zero flags */ + if (! scan->flags) { /* 'flags' zero means lookahed */ + + /* Lookahead starts here and ends at the normal place */ + ST.start = locinput; + ST.end = loceol; + } + else { + PERL_UINT_FAST8_T back_count = scan->flags; + char * s; + + /* Lookbehind ends here */ + ST.end = locinput; + + /* ... and starts at the first place in the input that is in + * the range of the possible start positions */ + for (; ST.count > 0; ST.count--, back_count--) { + s = HOPBACKc(locinput, back_count); + if (s) { + ST.start = s; + goto do_ifmatch; + } + } + + /* If the lookbehind doesn't start in the actual string, is a + * trivial match failure */ if (logical) { logical = 0; sw = 1 - cBOOL(ST.wanted); } else if (ST.wanted) sayNO; + + /* Here, we didn't want it to match, so is actually success + * */ next = scan + ARG(scan); if (next == scan) next = NULL; break; - } - newstart = s; } - else - newstart = locinput; do_ifmatch: ST.me = scan; @@ -8714,20 +8746,35 @@ NULL logical = 0; /* XXX: reset state of logical once it has been saved into ST */ /* execute body of (?...A) */ - PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), newstart, loceol); + PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), ST.start, ST.end); NOT_REACHED; /* NOTREACHED */ - } + + { + bool matched; case IFMATCH_A_fail: /* body of (?...A) failed */ - ST.wanted = !ST.wanted; - /* FALLTHROUGH */ + if (! ST.logical && ST.count > 1) { + + /* It isn't a real failure until we've tried all starting + * positions. Move to the next starting position and retry */ + ST.count--; + ST.start = HOPc(ST.start, 1); + scan = ST.me; + logical = ST.logical; + goto do_ifmatch; + } + + /* Here, all starting positions have been tried. */ + matched = FALSE; + goto ifmatch_done; case IFMATCH_A: /* body of (?...A) succeeded */ - if (ST.logical) { - sw = cBOOL(ST.wanted); - } - else if (!ST.wanted) - sayNO; + matched = TRUE; + ifmatch_done: + sw = matched == ST.wanted; + if (! ST.logical && !sw) { + sayNO; + } if (OP(ST.me) != SUSPEND) { /* restore old position except for (?>...) */ @@ -8738,6 +8785,7 @@ NULL if (scan == ST.me) scan = NULL; continue; /* execute B */ + } #undef ST @@ -807,6 +807,9 @@ typedef struct regmatch_state { struct regmatch_state *prev_yes_state; I32 wanted; I32 logical; /* saved copy of 'logical' var */ + U8 count; /* number of beginning positions */ + char *start; + char *end; regnode *me; /* the IFMATCH/SUSPEND/UNLESSM node */ } ifmatch; /* and SUSPEND/UNLESSM */ diff --git a/regnodes.h b/regnodes.h index 3b53c1715f..803938ac48 100644 --- a/regnodes.h +++ b/regnodes.h @@ -83,8 +83,8 @@ #define NREFFA 69 /* 0x45 Match already matched string, using /aai rules. */ #define LONGJMP 70 /* 0x46 Jump far away. */ #define BRANCHJ 71 /* 0x47 BRANCH with long offset. */ -#define IFMATCH 72 /* 0x48 Succeeds if the following matches; non-zero flags "f" means lookbehind assertion starting "f" characters before current */ -#define UNLESSM 73 /* 0x49 Fails if the following matches; non-zero flags "f" means lookbehind assertion starting "f" characters before current */ +#define IFMATCH 72 /* 0x48 Succeeds if the following matches; non-zero flags "f", next_off "o" means lookbehind assertion starting "f..(f-o)" characters before current */ +#define UNLESSM 73 /* 0x49 Fails if the following matches; non-zero flags "f", next_off "o" means lookbehind assertion starting "f..(f-o)" characters before current */ #define SUSPEND 74 /* 0x4a "Independent" sub-RE. */ #define IFTHEN 75 /* 0x4b Switch, should be preceded by switcher. */ #define GROUPP 76 /* 0x4c Whether the group matched. */ diff --git a/t/re/alpha_assertions.t b/t/re/alpha_assertions.t index 3d28bbcdd2..f37ad890b0 100644 --- a/t/re/alpha_assertions.t +++ b/t/re/alpha_assertions.t @@ -3,6 +3,7 @@ use strict; use warnings; no warnings 'once'; +no warnings 'experimental::vlb'; # This tests that the alphabetic assertions, like '(*atomic:...) work # It just sets a flag and calls regexp.t which will run through its test diff --git a/t/re/pat.t b/t/re/pat.t index 79ed9149d3..57ef00c418 100644 --- a/t/re/pat.t +++ b/t/re/pat.t @@ -6,6 +6,7 @@ use strict; use warnings; +no warnings 'experimental::vlb'; use 5.010; sub run_tests; @@ -1992,7 +1993,8 @@ EOP fresh_perl_is('m m0*0+\Rm', "",{},"Undefined behavior in address sanitizer"); } { # [perl #133642] - fresh_perl_is('m/((?<=(0?)))/', "Variable length lookbehind not implemented in regex m/((?<=(0?)))/ at - line 1.",{},"Was getting 'Double free'"); + fresh_perl_is('no warnings "experimental::vlb"; + m/((?<=(0?)))/', "",{},"Was getting 'Double free'"); } { # [perl #133782] # this would panic on DEBUGGING builds diff --git a/t/re/re_tests b/t/re/re_tests index 9b615ea2c2..bd7fc8f80d 100644 --- a/t/re/re_tests +++ b/t/re/re_tests @@ -487,12 +487,19 @@ a(?:b|(c|e){1,2}?|d)+?(.) ace y $1$2 ce (((a{2}){2})+) aaaaaaaaaa y $1 aaaaaaaa (?:(f)(o)(o)|(b)(a)(r))* foobar y $1:$2:$3:$4:$5:$6 f:o:o:b:a:r (?<=a)b ab y $& b +(?<=af?)b ab y $& b (?<=a)b cb n - - +(?<=a(?:fo)?)b cb n - - (?<=a)b b n - - +(?<=a(?:foo)?)b b n - - (?<!c)b ab y $& b +(?<!c(?:foob)?)b ab y $& b (?<!c)b cb n - - +(?<!c(?:fooba)?)b cb n - - (?<!c)b b y - - +(?<!c(?:foobar)?)b b y - - (?<!c)b b y $& b +(?<!c(?:foobarb)?)b b y $& b (?<%)b - c - Group name must start with a non-digit word character (?:..)*a aba y $& aba (?:..)*?a aba y $& a @@ -559,7 +566,10 @@ x(~~)*(?:(?:F)?)? x~~ y - - ^a(?#xxx){3}c aaac y $& aaac '^a (?#xxx) (?#yyy) {3}c'x aaac y $& aaac (?<![cd])b dbcb n - - +(?<![cd]e{0,254})b dbcb n - - (?<![cd])[ab] dbaacb y $& a +(?<![cd]{1,2})[ab] dbaacb y $& a +#Why does this fail. I think it's confusing (?<![cd]{1,3})[ab] dbaacb y $& a (?<!(c|d))b dbcb n - - (?<!(c|d))[ab] dbaacb y $& a (?<!cd)[ab] cdaccb y $& b @@ -668,7 +678,7 @@ $(?<=^(a)) a y $1 a ((?>a+)b) aaab y $1 aaab (?>(a+))b aaab y $1 aaa ((?>[^()]+)|\([^()]*\))+ ((abc(ade)ufh()()x y $& abc(ade)ufh()()x -(?<=x+)y - c - Variable length lookbehind not implemented +(?<=x+)y - c - Lookbehind longer than 255 not implemented ((def){37,17})?ABC ABC y $& ABC \Z a\nb\n y $-[0] 3 \z a\nb\n y $-[0] 4 @@ -1349,6 +1359,7 @@ a*(*F) aaaab n - - (?<=abcd(?<=(aaaabcd))) ..aaaabcd.. y $1 aaaabcd (?=xy(?<=(aaxy))) ..aaxy.. y $1 aaxy +(?=xy(?<=(aaxyz?))) ..aaxy.. y $1 aaxy X(\w+)(?=\s)|X(\w+) Xab y [$1-$2] [-ab] @@ -2002,6 +2013,7 @@ AB\s+\x{100} AB \x{100}X y - - (?:(?^:(?{1}))[^0-9]) : y $& : # [perl #133348] /[\xdf-/i - c - Invalid [] range # [perl #133620] likely only fails under valgrind /\1a(b)/ bab n - - # This compiles but fails to match as \1 is not set when parsed. +/(?iu)(?<=\xdf)hbase/ sshbase y $& hbase # Keep these lines at the end of the file # vim: softtabstop=0 noexpandtab diff --git a/t/re/reg_mesg.t b/t/re/reg_mesg.t index e4c008be1a..e7251a9571 100644 --- a/t/re/reg_mesg.t +++ b/t/re/reg_mesg.t @@ -128,7 +128,7 @@ my @death = ( '/[[=foo=]]/' => 'POSIX syntax [= =] is reserved for future extensions {#} m/[[=foo=]{#}]/', - '/(?<= .*)/' => 'Variable length lookbehind not implemented in regex m/(?<= .*)/', + '/(?<= .*)/' => 'Lookbehind longer than 255 not implemented in regex m/(?<= .*)/', '/(?<= x{1000})/' => 'Lookbehind longer than 255 not implemented in regex m/(?<= x{1000})/', @@ -415,7 +415,7 @@ my @death_only_under_strict = ( # These need the character 'ネ' as a marker for mark_as_utf8() my @death_utf8 = mark_as_utf8( - '/ネ(?<= .*)/' => 'Variable length lookbehind not implemented in regex m/ネ(?<= .*)/', + '/ネ(?<= .*)/' => 'Lookbehind longer than 255 not implemented in regex m/ネ(?<= .*)/', '/(?<= ネ{1000})/' => 'Lookbehind longer than 255 not implemented in regex m/(?<= ネ{1000})/', diff --git a/t/re/regexp.t b/t/re/regexp.t index 037d7b7a48..1ad028bcd2 100644 --- a/t/re/regexp.t +++ b/t/re/regexp.t @@ -105,6 +105,7 @@ sub convert_from_ascii { use strict; use warnings FATAL=>"all"; +no warnings 'experimental::vlb'; our ($bang, $ffff, $nulnul); # used by the tests our ($qr, $skip_amp, $qr_embed, $qr_embed_thr, $regex_sets, $alpha_assertions, $no_null); # set by our callers |