From 536520b4a24d012a8aa0690f044f651300258171 Mon Sep 17 00:00:00 2001 From: Tony Cook Date: Mon, 14 Sep 2020 16:00:28 +1000 Subject: don't croak when the \K follows the lookaround assertion this also simplifies the flagging for these assertions, since this error is now the only thing using in_lookahead and in_lookbehind, they can be combined into a single in_lookaround. Rather than conditional increment/decrement as we recurse into S_reg I simply save the value of in_lookaround and restore it before returning. Some unsuccessful or restart paths don't do the restore, but they either result in a croak(), or a restart which reinitialises in_lookaround anyway. Also added tests to ensure that all the different zero-width assertions with content trigger the error. (cherry picked from commit 80f44cf4982e395989f886220e05dd2071bb205a) --- regcomp.c | 35 ++++++++++++----------------------- t/lib/croak/regcomp | 43 +++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 53 insertions(+), 25 deletions(-) diff --git a/regcomp.c b/regcomp.c index 2109b6a403..0da659cf80 100644 --- a/regcomp.c +++ b/regcomp.c @@ -242,8 +242,7 @@ struct RExC_state_t { U8 *study_chunk_recursed; /* bitmap of which subs we have moved through */ U32 study_chunk_recursed_bytes; /* bytes in bitmap */ - I32 in_lookbehind; - I32 in_lookahead; + I32 in_lookaround; I32 contains_locale; I32 override_recoding; I32 recode_x_to_native; @@ -330,8 +329,7 @@ struct RExC_state_t { #define RExC_study_chunk_recursed (pRExC_state->study_chunk_recursed) #define RExC_study_chunk_recursed_bytes \ (pRExC_state->study_chunk_recursed_bytes) -#define RExC_in_lookbehind (pRExC_state->in_lookbehind) -#define RExC_in_lookahead (pRExC_state->in_lookahead) +#define RExC_in_lookaround (pRExC_state->in_lookaround) #define RExC_contains_locale (pRExC_state->contains_locale) #define RExC_recode_x_to_native (pRExC_state->recode_x_to_native) @@ -7791,8 +7789,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, 
RExC_seen = 0; RExC_maxlen = 0; - RExC_in_lookbehind = 0; - RExC_in_lookahead = 0; + RExC_in_lookaround = 0; RExC_seen_zerolen = *exp == '^' ? -1 : 0; RExC_recode_x_to_native = 0; RExC_in_multi_char_class = 0; @@ -11180,6 +11177,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) I32 after_freeze = 0; I32 num; /* numeric backreferences */ SV * max_open; /* Max number of unclosed parens */ + I32 was_in_lookaround = RExC_in_lookaround; char * parse_start = RExC_parse; /* MJD */ char * const oregcomp_parse = RExC_parse; @@ -11201,13 +11199,6 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) *flagp = 0; /* Tentatively. */ - if (RExC_in_lookbehind) { - RExC_in_lookbehind++; - } - if (RExC_in_lookahead) { - RExC_in_lookahead++; - } - /* Having this true makes it feasible to have a lot fewer tests for the * parse pointer being in scope. For example, we can write * while(isFOO(*RExC_parse)) RExC_parse++; @@ -11461,11 +11452,11 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) lookbehind_alpha_assertions: RExC_seen |= REG_LOOKBEHIND_SEEN; - RExC_in_lookbehind++; /*FALLTHROUGH*/ alpha_assertions: + RExC_in_lookaround++; RExC_seen_zerolen++; if (! start_arg) { @@ -11668,7 +11659,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) } RExC_seen |= REG_LOOKBEHIND_SEEN; - RExC_in_lookbehind++; + RExC_in_lookaround++; RExC_parse++; if (RExC_parse >= RExC_end) { vFAIL("Sequence (?... not terminated"); @@ -11677,7 +11668,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) break; case '=': /* (?=...) */ RExC_seen_zerolen++; - RExC_in_lookahead++; + RExC_in_lookaround++; break; case '!': /* (?!...) */ RExC_seen_zerolen++; @@ -11689,6 +11680,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) nextchar(pRExC_state); return ret; } + RExC_in_lookaround++; break; case '|': /* (?|...) */ /* branch reset, behave like a (?:...) 
except that @@ -12509,14 +12501,11 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) NOT_REACHED; /* NOTREACHED */ } - if (RExC_in_lookbehind) { - RExC_in_lookbehind--; - } - if (RExC_in_lookahead) { - RExC_in_lookahead--; - } if (after_freeze > RExC_npar) RExC_npar = after_freeze; + + RExC_in_lookaround = was_in_lookaround; + return(ret); } @@ -13627,7 +13616,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) *flagp |= SIMPLE; goto finish_meta_pat; case 'K': - if (!RExC_in_lookbehind && !RExC_in_lookahead) { + if (!RExC_in_lookaround) { RExC_seen_zerolen++; ret = reg_node(pRExC_state, KEEPS); *flagp |= SIMPLE; diff --git a/t/lib/croak/regcomp b/t/lib/croak/regcomp index 476b239fa7..74e70ed363 100644 --- a/t/lib/croak/regcomp +++ b/t/lib/croak/regcomp @@ -77,15 +77,54 @@ EXPECT Too many nested open parens in regex; marked by <-- HERE in m/(( <-- HERE a))/ at - line 3. ######## # NAME \K not permitted in lookahead -$x =~ /(?=a\Ka)a/; +qr/(?=a\Ka)a/; EXPECT \K not permitted in lookahead/lookbehind in regex; marked by <-- HERE in m/(?=a\K <-- HERE a)a/ at - line 1. ######## +# NAME \K not permitted in lookahead (alpha) +no warnings 'experimental::alpha_assertions'; +qr/(*positive_lookahead:a\Ka)a/; +EXPECT +\K not permitted in lookahead/lookbehind in regex; marked by <-- HERE in m/(*positive_lookahead:a\K <-- HERE a)a/ at - line 2. +######## +# NAME \K not permitted in negative lookahead +qr/(?!a\Ka)a/; +EXPECT +\K not permitted in lookahead/lookbehind in regex; marked by <-- HERE in m/(?!a\K <-- HERE a)a/ at - line 1. +######## +# NAME \K not permitted in negative lookahead (alpha) +no warnings 'experimental::alpha_assertions'; +qr/(*negative_lookahead:a\Ka)a/; +EXPECT +\K not permitted in lookahead/lookbehind in regex; marked by <-- HERE in m/(*negative_lookahead:a\K <-- HERE a)a/ at - line 2. 
+######## # NAME \K not permitted in lookbehind -$x =~ /(?<=a\Ka)a/; +qr/(?<=a\Ka)a/; EXPECT \K not permitted in lookahead/lookbehind in regex; marked by <-- HERE in m/(?<=a\K <-- HERE a)a/ at - line 1. ######## +# NAME \K not permitted in lookbehind (alpha) +no warnings 'experimental::alpha_assertions'; +qr/(*positive_lookbehind:a\Ka)a/; +EXPECT +\K not permitted in lookahead/lookbehind in regex; marked by <-- HERE in m/(*positive_lookbehind:a\K <-- HERE a)a/ at - line 2. +######## +# NAME \K not permitted in negative lookbehind +qr/(?