diff options
author | Karl Williamson <public@khwilliamson.com> | 2011-05-03 11:44:28 -0600 |
---|---|---|
committer | Jesse Vincent <jesse@bestpractical.com> | 2011-05-03 17:14:06 -0400 |
commit | 827f5bb80b513fa181ae206648e6d58d9d82eb29 (patch) | |
tree | af4604d629d45908c5522e6bd0ba93dde7734a1c | |
parent | 36bb2ab64fa2ef022d7870082c0dcc6db902c86e (diff) | |
download | perl-827f5bb80b513fa181ae206648e6d58d9d82eb29.tar.gz |
PATCH: [perl #89750]: Unicode regex negated case-insensitivity
This patch causes inverted [bracketed] character classes to not handle
multi-character folds. The reason is that these can lead to very
counter-intuitive results (see bug discussion).
In an inverted character class, only single-char folds are now
generated. However, the fold for \xDF => ss is hard-coded in,
and it was too much trouble to pass flags down to the sub-sub routine
that does this, so another check is done at the point of storing the
list of multi-char folds. Since \xDF doesn't have a single-char fold,
this works.
-rw-r--r-- | regcomp.c | 22 | ||||
-rw-r--r-- | t/re/fold_grind.t | 2 | ||||
-rw-r--r-- | t/re/re_tests | 5 |
3 files changed, 28 insertions, 1 deletions
@@ -9552,6 +9552,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth) IV namedclass; char *rangebegin = NULL; bool need_class = 0; + bool allow_full_fold = TRUE; /* Assume wants multi-char folding */ SV *listsv = NULL; STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more than just initialized. */ @@ -9608,6 +9609,16 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth) RExC_parse++; if (!SIZE_ONLY) ANYOF_FLAGS(ret) |= ANYOF_INVERT; + + /* We have decided to not allow multi-char folds in inverted character + * classes, due to the confusion that can happen, even with classes + * that are designed for a non-Unicode world: You have the peculiar + * case that: + "s s" =~ /^[^\xDF]+$/i => Y + "ss" =~ /^[^\xDF]+$/i => N + * + * See [perl #89750] */ + allow_full_fold = FALSE; } if (SIZE_ONLY) { @@ -10136,7 +10147,8 @@ parseit: /* Get its fold */ U8 foldbuf[UTF8_MAXBYTES_CASE+1]; STRLEN foldlen; - const UV f = to_uni_fold(j, foldbuf, &foldlen); + const UV f = + _to_uni_fold_flags(j, foldbuf, &foldlen, allow_full_fold); if (foldlen > (STRLEN)UNISKIP(f)) { @@ -10437,10 +10449,18 @@ parseit: * used later (regexec.c:S_reginclass()). */ av_store(av, 0, listsv); av_store(av, 1, NULL); + + /* Store any computed multi-char folds only if we are allowing + * them */ + if (allow_full_fold) { av_store(av, 2, MUTABLE_SV(unicode_alternate)); if (unicode_alternate) { /* This node is variable length */ OP(ret) = ANYOFV; } + } + else { + av_store(av, 2, NULL); + } rv = newRV_noinc(MUTABLE_SV(av)); n = add_data(pRExC_state, 1, "s"); RExC_rxi->data->data[n] = (void*)rv; diff --git a/t/re/fold_grind.t b/t/re/fold_grind.t index 82ca6ad249..460d296644 100644 --- a/t/re/fold_grind.t +++ b/t/re/fold_grind.t @@ -452,6 +452,8 @@ foreach my $test (sort { numerically } keys %tests) { foreach my $bracketed (0, 1) { # Put rhs in [...], or not foreach my $inverted (0,1) { next if $inverted && ! $bracketed; # inversion only valid in [^...] 
+ next if $inverted && @target != 1; # [perl #89750] multi-char + # not valid in [^...] # In some cases, add an extra character that doesn't fold, and # looks ok in the output. diff --git a/t/re/re_tests b/t/re/re_tests index 9d5341b4c8..35a72203cd 100644 --- a/t/re/re_tests +++ b/t/re/re_tests @@ -1517,4 +1517,9 @@ abc\N{def - c - \\N{NAME} must be resolved by the lexer /s/aia S y $& S /(?aia:s)/ \x{17F} n - - /(?aia:s)/ S y $& S + +# Normally 1E9E generates a multi-char fold, but not in inverted class; +# See [perl #89750]. This makes sure that the simple fold gets generated +# in that case, to DF. +/[^\x{1E9E}]/i \x{DF} n - - # vim: softtabstop=0 noexpandtab |