summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Karl Williamson <public@khwilliamson.com> 2011-05-03 11:44:28 -0600
committer: Jesse Vincent <jesse@bestpractical.com> 2011-05-03 17:14:06 -0400
commit: 827f5bb80b513fa181ae206648e6d58d9d82eb29 (patch)
tree: af4604d629d45908c5522e6bd0ba93dde7734a1c
parent: 36bb2ab64fa2ef022d7870082c0dcc6db902c86e (diff)
download: perl-827f5bb80b513fa181ae206648e6d58d9d82eb29.tar.gz
PATCH: [perl #89750]: Unicode regex negated case-insensitivity
This patch causes inverted [bracketed] character classes to not handle multi-character folds. The reason is that these can lead to very counter-intuitive results (see the bug discussion). In an inverted character class, only single-char folds are now generated. However, the fold for \xDF => ss is hard-coded, and it was too much trouble to pass flags down to the sub-subroutine that does this, so another check is done at the point of storing the list of multi-char folds. Since \xDF doesn't have a single-char fold, this works.
-rw-r--r-- regcomp.c 22
-rw-r--r-- t/re/fold_grind.t 2
-rw-r--r-- t/re/re_tests 5
3 files changed, 28 insertions, 1 deletions
diff --git a/regcomp.c b/regcomp.c
index 085884174a..59397a2d99 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -9552,6 +9552,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
IV namedclass;
char *rangebegin = NULL;
bool need_class = 0;
+ bool allow_full_fold = TRUE; /* Assume wants multi-char folding */
SV *listsv = NULL;
STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
than just initialized. */
@@ -9608,6 +9609,16 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
RExC_parse++;
if (!SIZE_ONLY)
ANYOF_FLAGS(ret) |= ANYOF_INVERT;
+
+ /* We have decided to not allow multi-char folds in inverted character
+ * classes, due to the confusion that can happen, even with classes
+ * that are designed for a non-Unicode world: You have the peculiar
+ * case that:
+ "s s" =~ /^[^\xDF]+$/i => Y
+ "ss" =~ /^[^\xDF]+$/i => N
+ *
+ * See [perl #89750] */
+ allow_full_fold = FALSE;
}
if (SIZE_ONLY) {
@@ -10136,7 +10147,8 @@ parseit:
/* Get its fold */
U8 foldbuf[UTF8_MAXBYTES_CASE+1];
STRLEN foldlen;
- const UV f = to_uni_fold(j, foldbuf, &foldlen);
+ const UV f =
+ _to_uni_fold_flags(j, foldbuf, &foldlen, allow_full_fold);
if (foldlen > (STRLEN)UNISKIP(f)) {
@@ -10437,10 +10449,18 @@ parseit:
* used later (regexec.c:S_reginclass()). */
av_store(av, 0, listsv);
av_store(av, 1, NULL);
+
+ /* Store any computed multi-char folds only if we are allowing
+ * them */
+ if (allow_full_fold) {
av_store(av, 2, MUTABLE_SV(unicode_alternate));
if (unicode_alternate) { /* This node is variable length */
OP(ret) = ANYOFV;
}
+ }
+ else {
+ av_store(av, 2, NULL);
+ }
rv = newRV_noinc(MUTABLE_SV(av));
n = add_data(pRExC_state, 1, "s");
RExC_rxi->data->data[n] = (void*)rv;
diff --git a/t/re/fold_grind.t b/t/re/fold_grind.t
index 82ca6ad249..460d296644 100644
--- a/t/re/fold_grind.t
+++ b/t/re/fold_grind.t
@@ -452,6 +452,8 @@ foreach my $test (sort { numerically } keys %tests) {
foreach my $bracketed (0, 1) { # Put rhs in [...], or not
foreach my $inverted (0,1) {
next if $inverted && ! $bracketed; # inversion only valid in [^...]
+ next if $inverted && @target != 1; # [perl #89750] multi-char
+ # not valid in [^...]
# In some cases, add an extra character that doesn't fold, and
# looks ok in the output.
diff --git a/t/re/re_tests b/t/re/re_tests
index 9d5341b4c8..35a72203cd 100644
--- a/t/re/re_tests
+++ b/t/re/re_tests
@@ -1517,4 +1517,9 @@ abc\N{def - c - \\N{NAME} must be resolved by the lexer
/s/aia S y $& S
/(?aia:s)/ \x{17F} n - -
/(?aia:s)/ S y $& S
+
+# Normally 1E9E generates a multi-char fold, but not in inverted class;
+# See [perl #89750]. This makes sure that the simple fold gets generated
+# in that case, to DF.
+/[^\x{1E9E}]/i \x{DF} n - -
# vim: softtabstop=0 noexpandtab