summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKarl Williamson <khw@cpan.org>2020-10-23 07:54:53 -0600
committerSteve Hay <steve.m.hay@googlemail.com>2020-12-26 15:18:34 +0000
commit9e9893534814fa6362ae99273d3afbf1402502f9 (patch)
treecfb2b484f4b90967ff24c9c3c098c20af8ffb2f9
parente744a81b1cb8f9a3df9a4cd784cb2b74d256441a (diff)
downloadperl-9e9893534814fa6362ae99273d3afbf1402502f9.tar.gz
Fix GH #17278
This was an assertion failure in regexec.c under rare circumstances. A reduction of the fuzzed test case is now in pat_advanced.t. The root cause of this was that the pattern being compiled was encoded in UTF-8 and 'use locale' was in effect, equivalent to the /l charset, and then the charset was reset inside the pattern, to /d. But /d in a UTF-8 pattern is illegal, hence the later assertion failure. The solution is to reset instead to /u when the pattern is UTF-8. (cherry picked from commit bb58640a409949759318542317e353e2241cc408)
-rw-r--r--regcomp.c15
-rw-r--r--t/re/pat_advanced.t14
2 files changed, 24 insertions, 5 deletions
diff --git a/regcomp.c b/regcomp.c
index 7ee299a47a..bea1348d67 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -421,6 +421,11 @@ struct RExC_state_t {
} \
} STMT_END
+/* /u is to be chosen if we are supposed to use Unicode rules, or if the
+ * pattern is in UTF-8. This latter condition is in case the outermost rules
+ * are locale. See GH #17278 */
+#define toUSE_UNI_CHARSET_NOT_DEPENDS (RExC_uni_semantics || UTF)
+
/* Change from /d into /u rules, and restart the parse. RExC_uni_semantics is
* a flag that indicates we need to override /d with /u as a result of
* something in the pattern. It should only be used in regards to calling
@@ -7750,7 +7755,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
rx_flags = orig_rx_flags;
- if ( (UTF || RExC_uni_semantics)
+ if ( toUSE_UNI_CHARSET_NOT_DEPENDS
&& initial_charset == REGEX_DEPENDS_CHARSET)
{
@@ -10852,7 +10857,7 @@ S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
RExC_parse++;
has_use_defaults = TRUE;
STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
- cs = (RExC_uni_semantics)
+ cs = (toUSE_UNI_CHARSET_NOT_DEPENDS)
? REGEX_UNICODE_CHARSET
: REGEX_DEPENDS_CHARSET;
set_regex_charset(&RExC_flags, cs);
@@ -10860,7 +10865,7 @@ S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
else {
cs = get_regex_charset(RExC_flags);
if ( cs == REGEX_DEPENDS_CHARSET
- && RExC_uni_semantics)
+ && (toUSE_UNI_CHARSET_NOT_DEPENDS))
{
cs = REGEX_UNICODE_CHARSET;
}
@@ -10944,7 +10949,7 @@ S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
* pattern (or target, not known until runtime) are
* utf8, or something in the pattern indicates unicode
* semantics */
- cs = (RExC_uni_semantics)
+ cs = (toUSE_UNI_CHARSET_NOT_DEPENDS)
? REGEX_UNICODE_CHARSET
: REGEX_DEPENDS_CHARSET;
has_charset_modifier = DEPENDS_PAT_MOD;
@@ -12480,7 +12485,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
/* restore original flags, but keep (?p) and, if we've encountered
* something in the parse that changes /d rules into /u, keep the /u */
RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
- if (DEPENDS_SEMANTICS && RExC_uni_semantics) {
+ if (DEPENDS_SEMANTICS && toUSE_UNI_CHARSET_NOT_DEPENDS) {
set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
}
if (RExC_parse >= RExC_end || UCHARAT(RExC_parse) != ')') {
diff --git a/t/re/pat_advanced.t b/t/re/pat_advanced.t
index b5c7f1cab4..509a51570f 100644
--- a/t/re/pat_advanced.t
+++ b/t/re/pat_advanced.t
@@ -2562,6 +2562,20 @@ EOF
{}, "GH #17734");
}
+ { # GH #17278 assertion fails
+ fresh_perl_is('use locale;
+ my $A_grave = "\N{LATIN CAPITAL LETTER A WITH GRAVE}";
+ utf8::encode($A_grave);
+ my $a_grave = "\N{LATIN SMALL LETTER A WITH GRAVE}";
+ utf8::encode($a_grave);
+
+ my $z="q!$a_grave! =~ m!(?^i)[$A_grave]!";
+ utf8::decode($z);
+ print eval $z, "\n";',
+ 1,
+ {}, "GH #17278");
+ }
+
# !!! NOTE that tests that aren't at all likely to crash perl should go
# a ways above, above these last ones. There's a comment there that, like