summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKarl Williamson <khw@cpan.org>2018-02-19 20:40:20 -0700
committerKarl Williamson <khw@cpan.org>2018-02-19 20:48:45 -0700
commit3337229ca19b517df4e7e43fd4312f0ee9e5b6c0 (patch)
treef05bea57e3b1796702dd25b8c8ce2429e352546d
parent8638266fb1a6a7f9b6ce2567347bfba94f8b2414 (diff)
downloadperl-3337229ca19b517df4e7e43fd4312f0ee9e5b6c0.tar.gz
Add atomic script runs
This is an extension to the new script_run feature that is syntactic sugar for the idiom espected to be most commonly used: (*sr:(?>...)) can be written as (*asr:...)
-rw-r--r--pod/perldelta.pod8
-rw-r--r--pod/perlre.pod9
-rw-r--r--regcomp.c73
-rw-r--r--t/re/script_run.t5
4 files changed, 85 insertions, 10 deletions
diff --git a/pod/perldelta.pod b/pod/perldelta.pod
index 691135ef96..d1aa6bd3f4 100644
--- a/pod/perldelta.pod
+++ b/pod/perldelta.pod
@@ -55,6 +55,14 @@ abbreviated form for it. The syntax is now either of:
Previously a C<"+"> was used instead of the C<"*">.
+=head2 There is a new form for script runs which combines with
+C<(?E<gt>...)> (or C<*atomic:...)>)
+
+C<(*asr:...> is an easier way to write C<(*sr:(?E<gt>...))>,
+which is expected to be a commonly used idiom.
+C<(*atomic_script_run:...> is also available. See
+L<perlre/Script Runs>.
+
=head2 Experimentally, there are now alphabetic synonyms for some
regular expression assertions
diff --git a/pod/perlre.pod b/pod/perlre.pod
index b5d5517167..29082a66d0 100644
--- a/pod/perlre.pod
+++ b/pod/perlre.pod
@@ -2469,6 +2469,7 @@ following match, see L</C<< (?>pattern) >>>.
=head2 Script Runs
X<(*script_run:...)> X<(sr:...)>
+X<(*atomic_script_run:...)> X<(asr:...)>
A script run is basically a sequence of characters, all from the same
Unicode script (see L<perlunicode/Scripts>), such as Latin or Greek. In
@@ -2499,10 +2500,16 @@ backtracking occurs until something all in the same script is found that
matches, or all possibilities are exhausted. This can cause a lot of
backtracking, but generally, only malicious input will result in this,
though the slow down could cause a denial of service attack. If your
-needs permit, it is best to make the pattern atomic.
+needs permit, it is best to make the pattern atomic. This is so likely
+to be what you want, that instead of writing this:
(*script_run:(?>pattern))
+you can write either of these:
+
+ (*atomic_script_run:pattern)
+ (*asr:pattern)
+
(See L</C<(?E<gt>pattern)>>.)
In Taiwan, Japan, and Korea, it is common for text to have a mixture of
diff --git a/regcomp.c b/regcomp.c
index 13c4154192..3a10ba5831 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -10734,7 +10734,11 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
vFAIL("Unmatched (");
}
- if ( *RExC_parse == '*') { /* (*VERB:ARG), (*construct:...) */
+ if (paren == 'r') { /* Atomic script run */
+ paren = '>';
+ goto parse_rest;
+ }
+ else if ( *RExC_parse == '*') { /* (*VERB:ARG), (*construct:...) */
char *start_verb = RExC_parse + 1;
STRLEN verb_len;
char *start_arg = NULL;
@@ -10841,7 +10845,13 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
}
break;
case 'a':
- if (memEQs(start_verb, verb_len, "atomic")) {
+ if ( memEQs(start_verb, verb_len, "asr")
+ || memEQs(start_verb, verb_len, "atomic_script_run"))
+ {
+ paren = 'r'; /* Mnemonic: recursed run */
+ goto script_run;
+ }
+ else if (memEQs(start_verb, verb_len, "atomic")) {
paren = 't'; /* AtOMIC */
goto alpha_assertions;
}
@@ -10878,8 +10888,12 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
if ( memEQs(start_verb, verb_len, "sr")
|| memEQs(start_verb, verb_len, "script_run"))
{
+ regnode * atomic;
+
paren = 's';
+ script_run:
+
/* This indicates Unicode rules. */
REQUIRE_UNI_RULES(flagp, NULL);
@@ -10889,6 +10903,30 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
RExC_parse = start_arg;
+ if (RExC_in_script_run) {
+
+ /* Nested script runs are treated as no-ops, because
+ * if the nested one fails, the outer one must as
+ * well. It could fail sooner, and avoid (??{} with
+ * side effects, but that is explicitly documented as
+ * undefined behavior. */
+
+ ret = NULL;
+
+ if (paren == 's') {
+ paren = ':';
+ goto parse_rest;
+ }
+
+ /* But, the atomic part of a nested atomic script run
+ * isn't a no-op, but can be treated just like a '(?>'
+ * */
+ paren = '>';
+ goto parse_rest;
+ }
+
+ /* By doing this here, we avoid extra warnings for nested
+ * script runs */
if (PASS2) {
Perl_ck_warner_d(aTHX_
packWARN(WARN_EXPERIMENTAL__SCRIPT_RUN),
@@ -10897,17 +10935,35 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
}
- if (RExC_in_script_run) {
- paren = ':';
- ret = NULL;
+ if (paren == 's') {
+ /* Here, we're starting a new regular script run */
+ ret = reg_node(pRExC_state, SROPEN);
+ RExC_in_script_run = 1;
+ is_open = 1;
goto parse_rest;
}
- RExC_in_script_run = 1;
+
+ /* Here, we are starting an atomic script run. This is
+ * handled by recursing to deal with the atomic portion
+ * separately, enclosed in SROPEN ... SRCLOSE nodes */
ret = reg_node(pRExC_state, SROPEN);
- is_open = 1;
- goto parse_rest;
+ RExC_in_script_run = 1;
+
+ atomic = reg(pRExC_state, 'r', &flags, depth);
+ if (flags & (RESTART_PASS1|NEED_UTF8)) {
+ *flagp = flags & (RESTART_PASS1|NEED_UTF8);
+ return NULL;
+ }
+
+ REGTAIL(pRExC_state, ret, atomic);
+
+ REGTAIL(pRExC_state, atomic,
+ reg_node(pRExC_state, SRCLOSE));
+
+ RExC_in_script_run = 0;
+ return ret;
}
break;
@@ -11806,6 +11862,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
if (paren == '>' || paren == 't') {
node = SUSPEND, flag = 0;
}
+
reginsert(pRExC_state, node,ret, depth+1);
Set_Node_Cur_Length(ret, parse_start);
Set_Node_Offset(ret, parse_start + 1);
diff --git a/t/re/script_run.t b/t/re/script_run.t
index 5005d637a4..ca234d9d4e 100644
--- a/t/re/script_run.t
+++ b/t/re/script_run.t
@@ -19,7 +19,7 @@ no warnings "experimental::script_run";
# Since there's so few tests currently, we can afford to try each syntax on
# all of them
-foreach my $type ('script_run', 'sr') {
+foreach my $type ('script_run', 'sr', 'atomic_script_run', 'asr') {
my $script_run;
eval '$script_run = qr/ ^ (*$type: .* ) $ /x;';
@@ -89,4 +89,7 @@ foreach my $type ('script_run', 'sr') {
# Until fixed, this was skipping the '['
unlike("abc]c", qr/^ (*sr:a(*sr:[bc]*)c) $/x, "Doesn't skip parts of exact matches");
+ like("abc", qr/(*asr:a[bc]*c)/, "Outer asr works on a run");
+ unlike("abc", qr/(*asr:a(*asr:[bc]*)c)/, "Nested asr works to exclude some things");
+
done_testing();