diff options
author | Karl Williamson <khw@cpan.org> | 2018-02-18 21:30:17 -0700 |
---|---|---|
committer | Karl Williamson <khw@cpan.org> | 2018-02-18 22:00:33 -0700 |
commit | e7206367046d4f58fd12d34816a357cc5535a02f (patch) | |
tree | eb6db72dfaf6a5d0551d4c6af00a37648650e0ab | |
parent | d97906123bcd8c325c65db4f67e8c96e2cdafaec (diff) | |
download | perl-e7206367046d4f58fd12d34816a357cc5535a02f.tar.gz |
Add alphabetic synonyms for regex assertions
This commit came out of the committee formed with PCRE members as a
result of http://nntp.perl.org/group/perl.perl5.porters/246762
-rw-r--r-- | MANIFEST | 1 | ||||
-rw-r--r-- | pod/perldelta.pod | 18 | ||||
-rw-r--r-- | pod/perldiag.pod | 30 | ||||
-rw-r--r-- | pod/perlexperiment.pod | 8 | ||||
-rw-r--r-- | pod/perlre.pod | 55 | ||||
-rw-r--r-- | pod/perlreref.pod | 5 | ||||
-rw-r--r-- | pod/perlretut.pod | 12 | ||||
-rw-r--r-- | regcomp.c | 121 | ||||
-rw-r--r-- | t/re/alpha_assertions.t | 18 | ||||
-rw-r--r-- | t/re/reg_mesg.t | 1 | ||||
-rw-r--r-- | t/re/regexp.t | 47 |
11 files changed, 287 insertions, 29 deletions
@@ -5882,6 +5882,7 @@ t/porting/regen.t Check that regen.pl doesn't need running t/porting/ss_dup.t Check that sv.c:ss_dup handles everything t/porting/test_bootstrap.t Test that the instructions for test bootstrapping aren't accidentally overlooked. t/porting/utils.t Check that utility scripts still compile +t/re/alpha_assertions.t See if things like '(*postive_lookahed:...) work properly t/re/anyof.t See if bracketed char classes [...] compile properly t/re/charset.t See if regex modifiers like /d, /u work properly t/re/fold_grind.t See if case folding works properly diff --git a/pod/perldelta.pod b/pod/perldelta.pod index 6304fa888e..691135ef96 100644 --- a/pod/perldelta.pod +++ b/pod/perldelta.pod @@ -55,6 +55,24 @@ abbreviated form for it. The syntax is now either of: Previously a C<"+"> was used instead of the C<"*">. +=head2 Experimentally, there are now alphabetic synonyms for some +regular expression assertions + +If you find it difficult to remember how to write certain of the pattern +assertions, there are now alphabetic synonyms. + + CURRENT NEW SYNONYMS + ------ ------------ + (?=...) (*pla:...) or (*positive_lookahead:...) + (?!...) (*nla:...) or (*negative_lookahead:...) + (?<=...) (*plb:...) or (*positive_lookbehind:...) + (?<!...) (*nlb:...) or (*negative_lookbehind:...) + (?>...) (*atomic:...) + +These are considered experimental, so using any of these will raise +(unless turned off) a warning in the C<experimental::alpha_assertions> +category. + =head1 Security XXX Any security-related notices go here. In particular, any security diff --git a/pod/perldiag.pod b/pod/perldiag.pod index d070ba3d9a..3e69aaedf3 100644 --- a/pod/perldiag.pod +++ b/pod/perldiag.pod @@ -6100,6 +6100,12 @@ as a compiler directive. You may say only one of This is to prevent the problem of one module changing the array base out from under another module inadvertently. See L<perlvar/$[> and L<arybase>. +=item The alpha_assertions feature is experimental + +(S experimental::alpha_assertions) This feature is experimental +and its behavior may in any future release of perl. See +L<perlre/Extended Patterns>. + =item The crypt() function is unimplemented due to excessive paranoia. (F) Configure couldn't find the crypt() function on your machine, @@ -6696,15 +6702,21 @@ m/%s/ (F) The condition part of a (?(condition)if-clause|else-clause) construct is not known. The condition must be one of the following: - (1) (2) ... true if 1st, 2nd, etc., capture matched - (<NAME>) ('NAME') true if named capture matched - (?=...) (?<=...) true if subpattern matches - (?!...) (?<!...) true if subpattern fails to match - (?{ CODE }) true if code returns a true value - (R) true if evaluating inside recursion - (R1) (R2) ... true if directly inside capture group 1, 2, etc. - (R&NAME) true if directly inside named capture - (DEFINE) always false; for defining named subpatterns + (1) (2) ... true if 1st, 2nd, etc., capture matched + (<NAME>) ('NAME') true if named capture matched + (?=...) (?<=...) true if subpattern matches + (*pla:...) (*plb:...) true if subpattern matches; also + (*positive_lookahead:...) + (*positive_lookbehind:...) + (*nla:...) (*nlb:...) true if subpattern fails to match; also + (*negative_lookahead:...) + (*negative_lookbehind:...) + (?{ CODE }) true if code returns a true value + (R) true if evaluating inside recursion + (R1) (R2) ... true if directly inside capture group 1, 2, + etc. + (R&NAME) true if directly inside named capture + (DEFINE) always false; for defining named subpatterns The S<<-- HERE> shows whereabouts in the regular expression the problem was discovered. See L<perlre>. diff --git a/pod/perlexperiment.pod b/pod/perlexperiment.pod index 8c2c8f0ffb..7963c05ac7 100644 --- a/pod/perlexperiment.pod +++ b/pod/perlexperiment.pod @@ -127,6 +127,14 @@ C<experimental::script_run>. See also: L<perlre/Script Runs> +=item Alpabetic assertions + +Introduced in Perl 5.28.0 + +Using this feature triggers warnings in the category +C<experimental::alpha_assertions>. + +See also: L<perlre/Extended Patterns>. =back diff --git a/pod/perlre.pod b/pod/perlre.pod index e9a5e5f31f..b5d5517167 100644 --- a/pod/perlre.pod +++ b/pod/perlre.pod @@ -1570,13 +1570,30 @@ lookahead matches text following the current match position. =over 4 =item C<(?=pattern)> -X<(?=)> X<look-ahead, positive> X<lookahead, positive> + +=item C<(*pla:pattern)> + +=item C<(*positive_lookahead:pattern)> +X<(?=)> +X<(*pla> +X<(*positive_lookahead> +X<look-ahead, positive> X<lookahead, positive> A zero-width positive lookahead assertion. For example, C</\w+(?=\t)/> matches a word followed by a tab, without including the tab in C<$&>. +The alphabetic forms are experimental; using them yields a warning in the +C<experimental::alpha_assertions> category. + =item C<(?!pattern)> -X<(?!)> X<look-ahead, negative> X<lookahead, negative> + +=item C<(*nla:pattern)> + +=item C<(*negative_lookahead:pattern)> +X<(?!)> +X<(*nla> +X<(*negative_lookahead> +X<look-ahead, negative> X<lookahead, negative> A zero-width negative lookahead assertion. For example C</foo(?!bar)/> matches any occurrence of "foo" that isn't followed by "bar". Note @@ -1588,10 +1605,20 @@ will not do what you want. That's because the C<(?!foo)> is just saying that the next thing cannot be "foo"--and it's not, it's a "bar", so "foobar" will match. Use lookbehind instead (see below). +The alphabetic forms are experimental; using them yields a warning in the +C<experimental::alpha_assertions> category. + =item C<(?<=pattern)> =item C<\K> -X<(?<=)> X<look-behind, positive> X<lookbehind, positive> X<\K> + +=item C<(*plb:pattern)> + +=item C<(*positive_lookbehind:pattern)> +X<(?<=)> +X<(*plb> +X<(*positive_lookbehind> +X<look-behind, positive> X<lookbehind, positive> X<\K> A zero-width positive lookbehind assertion. For example, C</(?<=\t)\w+/> matches a word that follows a tab, without including the tab in C<$&>. @@ -1615,13 +1642,26 @@ can be rewritten as the much more efficient s/foo\Kbar//g; +The alphabetic forms (not including C<\K> are experimental; using them +yields a warning in the C<experimental::alpha_assertions> category. + =item C<(?<!pattern)> -X<(?<!)> X<look-behind, negative> X<lookbehind, negative> + +=item C<(*nlb:pattern)> + +=item C<(*negative_lookbehind:pattern)> +X<(?<!)> +X<(*nlb> +X<(*negative_lookbehind> +X<look-behind, negative> X<lookbehind, negative> A zero-width negative lookbehind assertion. For example C</(?<!bar)foo/> matches any occurrence of "foo" that does not follow "bar". Works only for fixed-width lookbehind. +The alphabetic forms are experimental; using them yields a warning in the +C<experimental::alpha_assertions> category. + =back =item C<< (?<NAME>pattern) >> @@ -2096,6 +2136,10 @@ compile the definitions with the C<qr//> operator, and later interpolate them in another pattern. =item C<< (?>pattern) >> + +=item C<< (*atomic:pattern) >> +X<(?E<gt>pattern)> +X<(*atomic> X<backtrack> X<backtracking> X<atomic> X<possessive> An "independent" subexpression, one which matches the substring @@ -2204,6 +2248,9 @@ to inside of one of these constructs. The following equivalences apply: PAT?+ (?>PAT?) PAT{min,max}+ (?>PAT{min,max}) +The alphabetic form (C<(*atomic:...)>) is experimental; using it +yields a warning in the C<experimental::alpha_assertions> category. + =item C<(?[ ])> See L<perlrecharclass/Extended Bracketed Character Classes>. diff --git a/pod/perlreref.pod b/pod/perlreref.pod index c9deafa1a6..aaac153607 100644 --- a/pod/perlreref.pod +++ b/pod/perlreref.pod @@ -234,10 +234,15 @@ There is no quantifier C<{,n}>. That's interpreted as a literal string. (?:...) Groups subexpressions without capturing (cluster) (?pimsx-imsx:...) Enable/disable option (as per m// modifiers) (?=...) Zero-width positive lookahead assertion + (?*pla:...) Same; avail experimentally starting in 5.28 (?!...) Zero-width negative lookahead assertion + (?*nla:...) Same; avail experimentally starting in 5.28 (?<=...) Zero-width positive lookbehind assertion + (?*plb:...) Same; avail experimentally starting in 5.28 (?<!...) Zero-width negative lookbehind assertion + (?*nlb:...) Same; avail experimentally starting in 5.28 (?>...) Grab what we can, prohibit backtracking + (?*atomic:...) Same; avail experimentally starting in 5.28 (?|...) Branch reset (?<name>...) Named capture (?'name'...) Named capture diff --git a/pod/perlretut.pod b/pod/perlretut.pod index 9c1671edfe..2f7670eca6 100644 --- a/pod/perlretut.pod +++ b/pod/perlretut.pod @@ -2355,6 +2355,18 @@ by looking ahead and behind: | (?<=-) (?=\S) # a '-' followed by any non-space /x, $str; # @toks = qw(one two - - - 6 - 8) +Starting in Perl 5.28, experimentally, alphabetic equivalents to these +assertions are added, so you can use whichever is most memorable for +your tastes. + + (?=...) (*pla:...) or (*positive_lookahead:...) + (?!...) (*nla:...) or (*negative_lookahead:...) + (?<=...) (*plb:...) or (*positive_lookbehind:...) + (?<!...) (*nlb:...) or (*negative_lookbehind:...) + (?>...) (*atomic:...) + +Using any of these will raise (unless turned off) a warning in the +C<experimental::alpha_assertions> category. =head2 Using independent subexpressions to prevent backtracking @@ -10812,6 +10812,40 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) RExC_seen |= REG_CUTGROUP_SEEN; } break; + case 'a': + if (memEQs(start_verb, verb_len, "atomic")) { + paren = 't'; /* AtOMIC */ + goto alpha_assertions; + } + break; + case 'p': + if ( memEQs(start_verb, verb_len, "plb") + || memEQs(start_verb, verb_len, "positive_lookbehind")) + { + paren = 'b'; + goto lookbehind_alpha_assertions; + } + else if ( memEQs(start_verb, verb_len, "pla") + || memEQs(start_verb, verb_len, "positive_lookahead")) + { + paren = 'a'; + goto alpha_assertions; + } + break; + case 'n': + if ( memEQs(start_verb, verb_len, "nlb") + || memEQs(start_verb, verb_len, "negative_lookbehind")) + { + paren = 'B'; + goto lookbehind_alpha_assertions; + } + else if ( memEQs(start_verb, verb_len, "nla") + || memEQs(start_verb, verb_len, "negative_lookahead")) + { + paren = 'A'; + goto alpha_assertions; + } + break; case 's': if ( memEQs(start_verb, verb_len, "sr") || memEQs(start_verb, verb_len, "script_run")) @@ -10851,6 +10885,36 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) break; + lookbehind_alpha_assertions: + RExC_seen |= REG_LOOKBEHIND_SEEN; + RExC_in_lookbehind++; + /*FALLTHROUGH*/ + + alpha_assertions: + + if (PASS2) { + Perl_ck_warner_d(aTHX_ + packWARN(WARN_EXPERIMENTAL__ALPHA_ASSERTIONS), + "The alpha_assertions feature is experimental" + REPORT_LOCATION, REPORT_LOCATION_ARGS(RExC_parse)); + } + + RExC_seen_zerolen++; + + if (! start_arg) { + goto no_colon; + } + + /* An empty negative lookahead assertion simply is failure */ + if (paren == 'A' && RExC_parse == start_arg) { + ret=reganode(pRExC_state, OPFAIL, 0); + nextchar(pRExC_state); + return ret; + } + + RExC_parse = start_arg; + goto parse_rest; + no_colon: vFAIL2utf8f( "'(*%" UTF8f "' requires a terminating ':'", @@ -11033,6 +11097,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) paren = 1; goto capturing_parens; } + RExC_seen |= REG_LOOKBEHIND_SEEN; RExC_in_lookbehind++; RExC_parse++; @@ -11263,12 +11328,37 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) { int is_define= 0; const int DEFINE_len = sizeof("DEFINE") - 1; - if (RExC_parse[0] == '?') { /* (?(?...)) */ - if ( RExC_parse < RExC_end - 1 - && ( RExC_parse[1] == '=' - || RExC_parse[1] == '!' - || RExC_parse[1] == '<' - || RExC_parse[1] == '{') + if ( RExC_parse < RExC_end - 1 + && ( ( RExC_parse[0] == '?' /* (?(?...)) */ + && ( RExC_parse[1] == '=' + || RExC_parse[1] == '!' + || RExC_parse[1] == '<' + || RExC_parse[1] == '{')) + || ( RExC_parse[0] == '*' /* (?(*...)) */ + && ( memBEGINs(RExC_parse +1, + (Size_t) (RExC_end - (RExC_parse + 1)), + "pla:") + || memBEGINs(RExC_parse +1, + (Size_t) (RExC_end - (RExC_parse + 1)), + "plb") + || memBEGINs(RExC_parse +1, + (Size_t) (RExC_end - (RExC_parse + 1)), + "nla") + || memBEGINs(RExC_parse +1, + (Size_t) (RExC_end - (RExC_parse + 1)), + "nlb") + || memBEGINs(RExC_parse +1, + (Size_t) (RExC_end - (RExC_parse + 1)), + "positive_lookahead") + || memBEGINs(RExC_parse +1, + (Size_t) (RExC_end - (RExC_parse + 1)), + "positive_lookbehind") + || memBEGINs(RExC_parse +1, + (Size_t) (RExC_end - (RExC_parse + 1)), + "negative_lookahead") + || memBEGINs(RExC_parse +1, + (Size_t) (RExC_end - (RExC_parse + 1)), + "negative_lookbehind")))) ) { /* Lookahead or eval. */ I32 flag; regnode *tail; @@ -11285,10 +11375,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) REGTAIL(pRExC_state, ret, tail); goto insert_if; } - /* Fall through to ‘Unknown switch condition’ at the - end of the if/else chain. */ - } - else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */ + else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */ || RExC_parse[0] == '\'' ) /* (?('NAME')...) */ { char ch = RExC_parse[0] == '<' ? '>' : '\''; @@ -11601,11 +11688,16 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) RExC_in_script_run = 0; break; case '<': + case 'a': + case 'A': + case 'b': + case 'B': case ',': case '=': case '!': *flagp &= ~HASWIDTH; /* FALLTHROUGH */ + case 't': /* aTomic */ case '>': ender = reg_node(pRExC_state, SUCCEED); break; @@ -11691,14 +11783,17 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) { const char *p; - static const char parens[] = "=!<,>"; + /* Even/odd or x=don't care: 010101x10x */ + static const char parens[] = "=!aA<,>Bbt"; + /* flag below is set to 0 up through 'A'; 1 for larger */ if (paren && (p = strchr(parens, paren))) { U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH; - int flag = (p - parens) > 1; + int flag = (p - parens) > 3; - if (paren == '>') + if (paren == '>' || paren == 't') { node = SUSPEND, flag = 0; + } reginsert(pRExC_state, node,ret, depth+1); Set_Node_Cur_Length(ret, parse_start); Set_Node_Offset(ret, parse_start + 1); diff --git a/t/re/alpha_assertions.t b/t/re/alpha_assertions.t new file mode 100644 index 0000000000..3d28bbcdd2 --- /dev/null +++ b/t/re/alpha_assertions.t @@ -0,0 +1,18 @@ +#!./perl + +use strict; +use warnings; +no warnings 'once'; + +# This tests that the alphabetic assertions, like '(*atomic:...) work +# It just sets a flag and calls regexp.t which will run through its test +# suite, modifiying the tests to use the alphabetic synonyms. + +BEGIN { $::alpha_assertions = 1; } +for my $file ('./re/regexp.t', './t/re/regexp.t', ':re:regexp.t') { + if (-r $file) { + do $file or die $@; + exit; + } +} +die "Cannot find ./re/regexp.t or ./t/re/regexp.t\n"; diff --git a/t/re/reg_mesg.t b/t/re/reg_mesg.t index aff5535ec3..1bab9dfe66 100644 --- a/t/re/reg_mesg.t +++ b/t/re/reg_mesg.t @@ -290,6 +290,7 @@ my @death = '/(*srfoo)/' => 'Unknown \'(*...)\' construct \'srfoo\' {#} m/(*srfoo){#}/', '/(*script_run)/' => '\'(*script_run\' requires a terminating \':\' {#} m/(*script_run{#})/', '/(*sr)/' => '\'(*sr\' requires a terminating \':\' {#} m/(*sr{#})/', + '/(*pla)/' => '\'(*pla\' requires a terminating \':\' {#} m/(*pla{#})/', '/(*script_run/' => 'Unterminated \'(*...\' construct {#} m/(*script_run{#}/', '/(*sr/' => 'Unterminated \'(*...\' construct {#} m/(*sr{#}/', '/(*script_run:foo/' => 'Unterminated \'(*...\' argument {#} m/(*script_run:foo{#}/', diff --git a/t/re/regexp.t b/t/re/regexp.t index cced1e0560..835cbdc668 100644 --- a/t/re/regexp.t +++ b/t/re/regexp.t @@ -99,7 +99,10 @@ sub convert_from_ascii { use strict; use warnings FATAL=>"all"; our ($bang, $ffff, $nulnul); # used by the tests -our ($qr, $skip_amp, $qr_embed, $qr_embed_thr, $regex_sets, $no_null); # set by our callers +our ($qr, $skip_amp, $qr_embed, $qr_embed_thr, $regex_sets, $alpha_assertions, $no_null); # set by our callers + +my $expanded_text = "expanded name from original test number"; +my $expanded_text_re = qr/$expanded_text/; if (!defined $file) { open TESTS, 're/re_tests' or die "Can't open re/re_tests: $!"; @@ -115,7 +118,6 @@ $nulnul = "\0" x 2; my $OP = $qr ? 'qr' : 'm'; $| = 1; -printf "1..%d\n# $iters iterations\n", scalar @tests; my $test; TEST: @@ -131,6 +133,7 @@ foreach (@tests) { chomp; s/\\n/\n/g unless $regex_sets; my ($pat, $subject, $result, $repl, $expect, $reason, $comment) = split(/\t/,$_,7); + $comment = "" unless defined $comment; if (!defined $subject) { die "Bad test definition on line $test: $_\n"; } @@ -180,7 +183,42 @@ foreach (@tests) { $comment=~s/^\s*(?:#\s*)?//; $testname .= " - $comment" if $comment; } - if (! $skip && $regex_sets) { + if (! $skip && $alpha_assertions) { + my $assertions_re = qr/ (?: \Q(?\E (?: > | <? [=>] ) ) /x; + if ($pat !~ $assertions_re && $comment !~ $expanded_text_re) { + $skip++; + $reason = "Pattern doesn't contain assertions"; + } + elsif ($comment !~ $expanded_text_re) { + my $expanded_pat = $pat; + + $pat =~ s/\( \? > /(*atomic:/xg; + + if ($pat =~ s/\( \? = /(*pla:/xg) { + $expanded_pat =~ s//(*positive_lookahead:/g; + } + if ($pat =~ s/\( \? ! /(*nla:/xg) { + $expanded_pat =~ s//(*negative_lookahead:/g; + } + if ($pat =~ s/\( \? <= /(*plb:/xg) { + $expanded_pat =~ s//(*positive_lookbehind:/g; + } + if ($pat =~ s/\( \? <! /(*nlb:/xg) { + $expanded_pat =~ s//(*negative_lookbehind:/g; + } + if ($expanded_pat ne $pat) { + $comment .= " $expanded_text $test"; + push @tests, join "\t", $expanded_pat, + $subject // "", + $result // "", + $repl // "", + $expect // "", + $reason // "", + $comment; + } + } + } + elsif (! $skip && $regex_sets) { # If testing regex sets, change the [bracketed] classes into # (?[bracketed]). But note that '\[' and '\c[' don't introduce such a @@ -412,6 +450,7 @@ EOFCODE EOFCODE } $code = "no warnings 'experimental::regex_sets';$code" if $regex_sets; + $code = "no warnings 'experimental::alpha_assertions';$code" if $alpha_assertions; #$code.=qq[\n\$expect="$expect";\n]; #use Devel::Peek; #die Dump($code) if $pat=~/\\h/ and $subject=~/\x{A0}/; @@ -465,4 +504,6 @@ EOFCODE print "ok $testname$todo\n"; } +printf "1..%d\n# $iters iterations\n", scalar @tests; + 1; |