diff options
author | Karl Williamson <khw@cpan.org> | 2018-02-19 20:40:20 -0700 |
---|---|---|
committer | Karl Williamson <khw@cpan.org> | 2018-02-19 20:48:45 -0700 |
commit | 3337229ca19b517df4e7e43fd4312f0ee9e5b6c0 (patch) | |
tree | f05bea57e3b1796702dd25b8c8ce2429e352546d | |
parent | 8638266fb1a6a7f9b6ce2567347bfba94f8b2414 (diff) | |
download | perl-3337229ca19b517df4e7e43fd4312f0ee9e5b6c0.tar.gz |
Add atomic script runs
This is an extension to the new script_run feature that is syntactic
sugar for the idiom espected to be most commonly used:
(*sr:(?>...)) can be written as (*asr:...)
-rw-r--r-- | pod/perldelta.pod | 8 | ||||
-rw-r--r-- | pod/perlre.pod | 9 | ||||
-rw-r--r-- | regcomp.c | 73 | ||||
-rw-r--r-- | t/re/script_run.t | 5 |
4 files changed, 85 insertions, 10 deletions
diff --git a/pod/perldelta.pod b/pod/perldelta.pod index 691135ef96..d1aa6bd3f4 100644 --- a/pod/perldelta.pod +++ b/pod/perldelta.pod @@ -55,6 +55,14 @@ abbreviated form for it. The syntax is now either of: Previously a C<"+"> was used instead of the C<"*">. +=head2 There is a new form for script runs which combines with +C<(?E<gt>...)> (or C<*atomic:...)>) + +C<(*asr:...> is an easier way to write C<(*sr:(?E<gt>...))>, +which is expected to be a commonly used idiom. +C<(*atomic_script_run:...> is also available. See +L<perlre/Script Runs>. + =head2 Experimentally, there are now alphabetic synonyms for some regular expression assertions diff --git a/pod/perlre.pod b/pod/perlre.pod index b5d5517167..29082a66d0 100644 --- a/pod/perlre.pod +++ b/pod/perlre.pod @@ -2469,6 +2469,7 @@ following match, see L</C<< (?>pattern) >>>. =head2 Script Runs X<(*script_run:...)> X<(sr:...)> +X<(*atomic_script_run:...)> X<(asr:...)> A script run is basically a sequence of characters, all from the same Unicode script (see L<perlunicode/Scripts>), such as Latin or Greek. In @@ -2499,10 +2500,16 @@ backtracking occurs until something all in the same script is found that matches, or all possibilities are exhausted. This can cause a lot of backtracking, but generally, only malicious input will result in this, though the slow down could cause a denial of service attack. If your -needs permit, it is best to make the pattern atomic. +needs permit, it is best to make the pattern atomic. This is so likely +to be what you want, that instead of writing this: (*script_run:(?>pattern)) +you can write either of these: + + (*atomic_script_run:pattern) + (*asr:pattern) + (See L</C<(?E<gt>pattern)>>.) In Taiwan, Japan, and Korea, it is common for text to have a mixture of @@ -10734,7 +10734,11 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) vFAIL("Unmatched ("); } - if ( *RExC_parse == '*') { /* (*VERB:ARG), (*construct:...) */ + if (paren == 'r') { /* Atomic script run */ + paren = '>'; + goto parse_rest; + } + else if ( *RExC_parse == '*') { /* (*VERB:ARG), (*construct:...) */ char *start_verb = RExC_parse + 1; STRLEN verb_len; char *start_arg = NULL; @@ -10841,7 +10845,13 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) } break; case 'a': - if (memEQs(start_verb, verb_len, "atomic")) { + if ( memEQs(start_verb, verb_len, "asr") + || memEQs(start_verb, verb_len, "atomic_script_run")) + { + paren = 'r'; /* Mnemonic: recursed run */ + goto script_run; + } + else if (memEQs(start_verb, verb_len, "atomic")) { paren = 't'; /* AtOMIC */ goto alpha_assertions; } @@ -10878,8 +10888,12 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) if ( memEQs(start_verb, verb_len, "sr") || memEQs(start_verb, verb_len, "script_run")) { + regnode * atomic; + paren = 's'; + script_run: + /* This indicates Unicode rules. */ REQUIRE_UNI_RULES(flagp, NULL); @@ -10889,6 +10903,30 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) RExC_parse = start_arg; + if (RExC_in_script_run) { + + /* Nested script runs are treated as no-ops, because + * if the nested one fails, the outer one must as + * well. It could fail sooner, and avoid (??{} with + * side effects, but that is explicitly documented as + * undefined behavior. */ + + ret = NULL; + + if (paren == 's') { + paren = ':'; + goto parse_rest; + } + + /* But, the atomic part of a nested atomic script run + * isn't a no-op, but can be treated just like a '(?>' + * */ + paren = '>'; + goto parse_rest; + } + + /* By doing this here, we avoid extra warnings for nested + * script runs */ if (PASS2) { Perl_ck_warner_d(aTHX_ packWARN(WARN_EXPERIMENTAL__SCRIPT_RUN), @@ -10897,17 +10935,35 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) } - if (RExC_in_script_run) { - paren = ':'; - ret = NULL; + if (paren == 's') { + /* Here, we're starting a new regular script run */ + ret = reg_node(pRExC_state, SROPEN); + RExC_in_script_run = 1; + is_open = 1; goto parse_rest; } - RExC_in_script_run = 1; + + /* Here, we are starting an atomic script run. This is + * handled by recursing to deal with the atomic portion + * separately, enclosed in SROPEN ... SRCLOSE nodes */ ret = reg_node(pRExC_state, SROPEN); - is_open = 1; - goto parse_rest; + RExC_in_script_run = 1; + + atomic = reg(pRExC_state, 'r', &flags, depth); + if (flags & (RESTART_PASS1|NEED_UTF8)) { + *flagp = flags & (RESTART_PASS1|NEED_UTF8); + return NULL; + } + + REGTAIL(pRExC_state, ret, atomic); + + REGTAIL(pRExC_state, atomic, + reg_node(pRExC_state, SRCLOSE)); + + RExC_in_script_run = 0; + return ret; } break; @@ -11806,6 +11862,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) if (paren == '>' || paren == 't') { node = SUSPEND, flag = 0; } + reginsert(pRExC_state, node,ret, depth+1); Set_Node_Cur_Length(ret, parse_start); Set_Node_Offset(ret, parse_start + 1); diff --git a/t/re/script_run.t b/t/re/script_run.t index 5005d637a4..ca234d9d4e 100644 --- a/t/re/script_run.t +++ b/t/re/script_run.t @@ -19,7 +19,7 @@ no warnings "experimental::script_run"; # Since there's so few tests currently, we can afford to try each syntax on # all of them -foreach my $type ('script_run', 'sr') { +foreach my $type ('script_run', 'sr', 'atomic_script_run', 'asr') { my $script_run; eval '$script_run = qr/ ^ (*$type: .* ) $ /x;'; @@ -89,4 +89,7 @@ foreach my $type ('script_run', 'sr') { # Until fixed, this was skipping the '[' unlike("abc]c", qr/^ (*sr:a(*sr:[bc]*)c) $/x, "Doesn't skip parts of exact matches"); + like("abc", qr/(*asr:a[bc]*c)/, "Outer asr works on a run"); + unlike("abc", qr/(*asr:a(*asr:[bc]*)c)/, "Nested asr works to exclude some things"); + done_testing(); |