summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Karl Williamson <public@khwilliamson.com> 2013-05-18 08:25:16 -0600
committer: Karl Williamson <public@khwilliamson.com> 2013-05-20 11:01:52 -0600
commit: 1ca267a56acf698557ec1deec44af651acc88696 (patch)
tree: 007f9d40b63b92728a095d5defdad663e239289c
parent: 519101418837cf0edacb710b2b38b42dad6e47c1 (diff)
download: perl-1ca267a56acf698557ec1deec44af651acc88696.tar.gz
Fix multi-char fold edge case
    use locale;
    fc("\N{LATIN CAPITAL LETTER SHARP S}") eq 2 x fc("\N{LATIN SMALL LETTER LONG S}")

should return true, as the SHARP S folds to two 's's in a row, and the LONG S is an antique variant of 's', and folds to s. Until this commit, the expression was false.

Similarly, the following should match, but didn't until this commit:

    "\N{LATIN SMALL LETTER SHARP S}" =~ /\N{LATIN SMALL LETTER LONG S}{2}/iaa

The reason these didn't work properly is that in both cases the actual fold to 's' is disallowed. In the first case because of locale; and in the second because of /aa. And the code wasn't smart enough to realize that these were legal.

The fix is to special case these so that the fold of sharp s (both capital and small) is two LONG S's under /aa; as is the fold of the capital sharp s under locale. The latter is user-visible, and the documentation of fc() now points that out.

I believe this is such an edge case that no mention of it need be done in perldelta.
-rw-r--r-- lib/locale.t 2
-rw-r--r-- pod/perlfunc.pod 12
-rw-r--r-- regcharclass.h 6
-rw-r--r-- regcomp.c 83
-rw-r--r-- regen/regcharclass_multi_char_folds.pl 23
-rw-r--r-- t/uni/fold.t 6
-rw-r--r-- utf8.c 61
7 files changed, 148 insertions, 45 deletions
diff --git a/lib/locale.t b/lib/locale.t
index a9a5a262e3..580613838d 100644
--- a/lib/locale.t
+++ b/lib/locale.t
@@ -1517,7 +1517,7 @@ setlocale(LC_ALL, "C");
$above_latin1_case_change_delta = -1;
}
else {
- @list = ("", "A", "\xC0", "\x{1E9E}", "\x{100}");
+ @list = ("", "A", "\xC0", "\x{17F}", "\x{100}");
$ascii_case_change_delta = +32;
$above_latin1_case_change_delta = +1;
}
diff --git a/pod/perlfunc.pod b/pod/perlfunc.pod
index 676644f732..08b9df9e82 100644
--- a/pod/perlfunc.pod
+++ b/pod/perlfunc.pod
@@ -2174,8 +2174,16 @@ Case Charts available at L<http://www.unicode.org/charts/case/>.
If EXPR is omitted, uses C<$_>.
-This function behaves the same way under various pragma, such as in a locale,
-as L</lc> does.
+This function behaves the same way under various pragmas, such as within
+S<C<"use feature 'unicode_strings'">>, as L</lc> does, with the single
+exception of C<fc> of LATIN CAPITAL LETTER SHARP S (U+1E9E) within the
+scope of S<C<use locale>>. The foldcase of this character would
+normally be C<"ss">, but as explained in the L</lc> section, case
+changes that cross the 255/256 boundary are problematic under locales,
+and are hence prohibited. Therefore, this function under locale returns
+instead the string C<"\x{17F}\x{17F}">, which is two LATIN SMALL LETTER
+LONG S characters. Since that character itself folds to C<"s">, the string
+of two of them together should be equivalent to a single U+1E9E when foldcased.
While the Unicode Standard defines two additional forms of casefolding,
one for Turkic languages and one that never maps one character into multiple
diff --git a/regcharclass.h b/regcharclass.h
index e51fe64af4..a0bd93ddad 100644
--- a/regcharclass.h
+++ b/regcharclass.h
@@ -643,6 +643,8 @@
( ( ( 0xCC == ((U8*)s)[1] ) && ( 0x88 == ((U8*)s)[2] ) ) ? 3 : 0 ) \
: ( 0x77 == ((U8*)s)[0] || 0x79 == ((U8*)s)[0] ) ? \
( ( ( 0xCC == ((U8*)s)[1] ) && ( 0x8A == ((U8*)s)[2] ) ) ? 3 : 0 ) \
+ : ( 0xC5 == ((U8*)s)[0] ) ? \
+ ( ( ( ( 0xBF == ((U8*)s)[1] ) && ( 0xC5 == ((U8*)s)[2] ) ) && ( 0xBF == ((U8*)s)[3] ) ) ? 4 : 0 )\
: ( 0xCA == ((U8*)s)[0] ) ? \
( ( ( 0xBC == ((U8*)s)[1] ) && ( 0x6E == ((U8*)s)[2] ) ) ? 3 : 0 ) \
: ( 0xCE == ((U8*)s)[0] ) ? \
@@ -699,6 +701,8 @@
( ( ( 0xCC == ((U8*)s)[1] ) && ( 0x88 == ((U8*)s)[2] ) ) ? 3 : 0 ) \
: ( 0x77 == ((U8*)s)[0] || 0x79 == ((U8*)s)[0] ) ? \
( ( ( 0xCC == ((U8*)s)[1] ) && ( 0x8A == ((U8*)s)[2] ) ) ? 3 : 0 ) \
+ : ( 0xC5 == ((U8*)s)[0] ) ? \
+ ( ( ( ( 0xBF == ((U8*)s)[1] ) && ( 0xC5 == ((U8*)s)[2] ) ) && ( 0xBF == ((U8*)s)[3] ) ) ? 4 : 0 )\
: ( 0xCA == ((U8*)s)[0] ) ? \
( ( ( 0xBC == ((U8*)s)[1] ) && ( 0x6E == ((U8*)s)[2] ) ) ? 3 : 0 ) \
: ( 0xCE == ((U8*)s)[0] ) ? \
@@ -776,6 +780,8 @@
( ( ( 0xCC == ((U8*)s)[1] ) && ( 0x88 == ((U8*)s)[2] ) ) ? 3 : 0 ) \
: ( 0x77 == ((U8*)s)[0] || 0x79 == ((U8*)s)[0] ) ? \
( ( ( 0xCC == ((U8*)s)[1] ) && ( 0x8A == ((U8*)s)[2] ) ) ? 3 : 0 ) \
+ : ( 0xC5 == ((U8*)s)[0] ) ? \
+ ( ( ( ( 0xBF == ((U8*)s)[1] ) && ( 0xC5 == ((U8*)s)[2] ) ) && ( 0xBF == ((U8*)s)[3] ) ) ? 4 : 0 )\
: ( 0xCA == ((U8*)s)[0] ) ? \
( ( ( 0xBC == ((U8*)s)[1] ) && ( 0x6E == ((U8*)s)[2] ) ) ? 3 : 0 ) \
: ( 0xCE == ((U8*)s)[0] ) ? \
diff --git a/regcomp.c b/regcomp.c
index dfe3aeaaea..d48510c3bf 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -2686,29 +2686,37 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode
* this file makes sure that in EXACTFU nodes, the sharp s gets folded to
* 'ss', even if the pattern isn't UTF-8. This avoids the issues
* described in the next item.
- * 4) A problem remains for the sharp s in EXACTF nodes. Whether it matches
- * 'ss' or not is not knowable at compile time. It will match iff the
- * target string is in UTF-8, unlike the EXACTFU nodes, where it always
- * matches; and the EXACTFL and EXACTFA nodes where it never does. Thus
- * it can't be folded to "ss" at compile time, unlike EXACTFU does (as
- * described in item 3). An assumption that the optimizer part of
- * regexec.c (probably unwittingly) makes is that a character in the
- * pattern corresponds to at most a single character in the target string.
- * (And I do mean character, and not byte here, unlike other parts of the
- * documentation that have never been updated to account for multibyte
- * Unicode.) This assumption is wrong only in this case, as all other
- * cases are either 1-1 folds when no UTF-8 is involved; or is true by
- * virtue of having this file pre-fold UTF-8 patterns. I'm
- * reluctant to try to change this assumption, so instead the code punts.
- * This routine examines EXACTF nodes for the sharp s, and returns a
- * boolean indicating whether or not the node is an EXACTF node that
- * contains a sharp s. When it is true, the caller sets a flag that later
- * causes the optimizer in this file to not set values for the floating
- * and fixed string lengths, and thus avoids the optimizer code in
- * regexec.c that makes the invalid assumption. Thus, there is no
- * optimization based on string lengths for EXACTF nodes that contain the
- * sharp s. This only happens for /id rules (which means the pattern
- * isn't in UTF-8).
+ * 4) A problem remains for the sharp s in EXACTF and EXACTFA nodes when the
+ * pattern isn't in UTF-8. (BTW, there cannot be an EXACTF node with a
+ * UTF-8 pattern.) An assumption that the optimizer part of regexec.c
+ * (probably unwittingly, in Perl_regexec_flags()) makes is that a
+ * character in the pattern corresponds to at most a single character in
+ * the target string. (And I do mean character, and not byte here, unlike
+ * other parts of the documentation that have never been updated to
+ * account for multibyte Unicode.) sharp s in EXACTF nodes can match the
+ * two character string 'ss'; in EXACTFA nodes it can match
+ * "\x{17F}\x{17F}". These violate the assumption, and they are the only
+ * instances where it is violated. I'm reluctant to try to change the
+ * assumption, as the code involved is impenetrable to me (khw), so
+ * instead the code here punts. This routine examines (when the pattern
+ * isn't UTF-8) EXACTF and EXACTFA nodes for the sharp s, and returns a
+ * boolean indicating whether or not the node contains a sharp s. When it
+ * is true, the caller sets a flag that later causes the optimizer in this
+ * file to not set values for the floating and fixed string lengths, and
+ * thus avoids the optimizer code in regexec.c that makes the invalid
+ * assumption. Thus, there is no optimization based on string lengths for
+ * non-UTF8-pattern EXACTF and EXACTFA nodes that contain the sharp s.
+ * (The reason the assumption is wrong only in these two cases is that all
+ * other non-UTF-8 folds are 1-1; and, for UTF-8 patterns, we pre-fold all
+ * other folds to their expanded versions. We can't prefold sharp s to
+ * 'ss' in EXACTF nodes because we don't know at compile time if it
+ * actually matches 'ss' or not. It will match iff the target string is
+ * in UTF-8, unlike the EXACTFU nodes, where it always matches; and
+ * EXACTFA and EXACTFL where it never does. In an EXACTFA node in a UTF-8
+ * pattern, sharp s is folded to "\x{17F}\x{17F}", avoiding the problem;
+ * but in a non-UTF8 pattern, folding it to that above-Latin1 string would
+ * require the pattern to be forced into UTF-8, the overhead of which we
+ * want to avoid.)
*/
#define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \
@@ -2899,13 +2907,30 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b
next_iteration: ;
}
}
- else if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) {
+ else if (OP(scan) == EXACTFA) {
- /* Here, the pattern is not UTF-8. Look for the multi-char folds
- * that are all ASCII. As in the above case, EXACTFL and EXACTFA
- * nodes can't have multi-char folds to this range (and there are
- * no existing ones in the upper latin1 range). In the EXACTF
- * case we look also for the sharp s, which can be in the final
+ /* Non-UTF-8 pattern, EXACTFA node. There can't be a multi-char
+ * fold to the ASCII range (and there are no existing ones in the
+ * upper latin1 range). But, as outlined in the comments preceding
+ * this function, we need to flag any occurrences of the sharp s */
+ while (s < s_end) {
+ if (*s == LATIN_SMALL_LETTER_SHARP_S) {
+ *has_exactf_sharp_s = TRUE;
+ break;
+ }
+ s++;
+ continue;
+ }
+ }
+ else if (OP(scan) != EXACTFL) {
+
+ /* Non-UTF-8 pattern, not EXACTFA nor EXACTFL node. Look for the
+ * multi-char folds that are all Latin1. (This code knows that
+ * there are no current multi-char folds possible with EXACTFL,
+ * relying on fold_grind.t to catch any errors if the very unlikely
+ * event happens that some get added in future Unicode versions.)
+ * As explained in the comments preceding this function, we look
+ * also for the sharp s in EXACTF nodes; it can be in the final
+ * position. Otherwise we can stop looking 1 byte earlier because we
* have to find at least two characters for a multi-fold */
const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1;
diff --git a/regen/regcharclass_multi_char_folds.pl b/regen/regcharclass_multi_char_folds.pl
index f0fd6b3a89..f04be85c58 100644
--- a/regen/regcharclass_multi_char_folds.pl
+++ b/regen/regcharclass_multi_char_folds.pl
@@ -104,6 +104,29 @@ sub multi_char_folds ($) {
}
}
+ # \x{17F} is the small LONG S, which folds to 's'. Both Capital and small
+ # LATIN SHARP S fold to 'ss'. Therefore, they should also match two 17F's
+ # in a row under regex /i matching. But under /iaa regex matching, all
+ # three folds to 's' are prohibited, but the sharp S's should still match
+ # two 17F's. This prohibition causes our regular regex algorithm that
+ # would ordinarily allow this match to fail. This is the only instance in
+ # all Unicode of this kind of issue. By adding a special case here, we
+ # can use the regular algorithm (with some other changes elsewhere as
+ # well).
+ #
+ # It would be possible to re-write the above code to automatically detect
+ # and handle this case, and any others that might eventually get added to
+ # the Unicode standard, but I (khw) don't think it's worth it. I believe
+ # that it's extremely unlikely that more folds to ASCII characters are
+ # going to be added, and if I'm wrong, fold_grind.t has the intelligence
+ # to detect them, and test that they work, at which point another special
+ # case could be added here if necessary.
+ #
+ # No combinations of this with 's' need be added, as any of these
+ # containing 's' are prohibited under /iaa.
+ push @folds, "\"\x{17F}\x{17F}\"";
+
+
return @folds;
}
diff --git a/t/uni/fold.t b/t/uni/fold.t
index 91356bbe18..6c06a2fdc2 100644
--- a/t/uni/fold.t
+++ b/t/uni/fold.t
@@ -444,6 +444,12 @@ foreach my $test_ref (@CF) {
}
}
+{
+ use feature qw( fc );
+ use locale;
+ is(fc("\x{1E9E}"), fc("\x{17F}\x{17F}"), 'fc("\x{1E9E}") eq fc("\x{17F}\x{17F}")');
+}
+
my $num_tests = curr_test() - 1;
plan($num_tests);
diff --git a/utf8.c b/utf8.c
index 0e8274f877..c3394e5450 100644
--- a/utf8.c
+++ b/utf8.c
@@ -1783,21 +1783,38 @@ UV
Perl__to_fold_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const unsigned int flags)
{
/* Corresponds to to_lower_latin1(); <flags> bits meanings:
+ * FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited
* FOLD_FLAGS_FULL iff full folding is to be used;
+ *
+ * Not to be used for locale folds
*/
UV converted;
PERL_ARGS_ASSERT__TO_FOLD_LATIN1;
+ assert (! (flags & FOLD_FLAGS_LOCALE));
+
if (c == MICRO_SIGN) {
converted = GREEK_SMALL_LETTER_MU;
}
else if ((flags & FOLD_FLAGS_FULL) && c == LATIN_SMALL_LETTER_SHARP_S) {
+
+ /* If we can't cross the 127/128 boundary, we can't return "ss"; instead
+ * return two U+017F characters, as fc("\xdf") should eq
+ * fc("\x{17f}\x{17f}") under those circumstances. */
+ if (flags & FOLD_FLAGS_NOMIX_ASCII) {
+ *lenp = 2 * sizeof(LATIN_SMALL_LETTER_LONG_S_UTF8) - 2;
+ Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8,
+ p, *lenp, U8);
+ return LATIN_SMALL_LETTER_LONG_S;
+ }
+ else {
*(p)++ = 's';
*p = 's';
*lenp = 2;
return 's';
+ }
}
else { /* In this range the fold of all other characters is their lower
case */
@@ -1832,11 +1849,7 @@ Perl__to_uni_fold_flags(pTHX_ UV c, U8* p, STRLEN *lenp, const U8 flags)
if (c < 256) {
UV result = _to_fold_latin1((U8) c, p, lenp,
- /* If ASCII-safe, don't allow full folding,
- * as that could include SHARP S => ss;
- * otherwise there is no crossing of
- * ascii/non-ascii in the latin1 range */
- (flags & FOLD_FLAGS_NOMIX_ASCII) ? 0 : flags & FOLD_FLAGS_FULL);
+ flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII));
/* It is illegal for the fold to cross the 255/256 boundary under
* locale; in this case return the original */
return (result > 256 && flags & FOLD_FLAGS_LOCALE)
@@ -2773,7 +2786,7 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags, b
}
else {
return _to_fold_latin1(*p, ustrp, lenp,
- flags & FOLD_FLAGS_FULL);
+ flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII));
}
}
else if UTF8_IS_DOWNGRADEABLE_START(*p) {
@@ -2783,18 +2796,22 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags, b
else {
return _to_fold_latin1(TWO_BYTE_UTF8_TO_UNI(*p, *(p+1)),
ustrp, lenp,
- /* If ASCII safe, don't allow full
- * folding, as that could include SHARP
- * S => ss; otherwise there is no
- * crossing of ascii/non-ascii in the
- * latin1 range */
- (flags & FOLD_FLAGS_NOMIX_ASCII) ? 0 : flags & FOLD_FLAGS_FULL);
+ flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII));
}
}
else { /* utf8, ord above 255 */
result = CALL_FOLD_CASE(p, ustrp, lenp, flags & FOLD_FLAGS_FULL);
- if ((flags & FOLD_FLAGS_LOCALE)) {
+ if (flags & FOLD_FLAGS_LOCALE) {
+
+ /* Special case this character, as what normally gets returned
+ * under locale doesn't work */
+ if (UTF8SKIP(p) == sizeof(LATIN_CAPITAL_LETTER_SHARP_S_UTF8) - 1
+ && memEQ((char *) p, LATIN_CAPITAL_LETTER_SHARP_S_UTF8,
+ sizeof(LATIN_CAPITAL_LETTER_SHARP_S_UTF8) - 1))
+ {
+ goto return_long_s;
+ }
return check_locale_boundary_crossing(p, result, ustrp, lenp);
}
else if (! (flags & FOLD_FLAGS_NOMIX_ASCII)) {
@@ -2815,6 +2832,12 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags, b
if (isASCII(*s)) {
/* Crossed, have to return the original */
original = valid_utf8_to_uvchr(p, lenp);
+
+ /* But in this one instance, there is an alternative we can
+ * return that is valid */
+ if (original == LATIN_CAPITAL_LETTER_SHARP_S) {
+ goto return_long_s;
+ }
Copy(p, ustrp, *lenp, char);
return original;
}
@@ -2841,6 +2864,18 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags, b
*tainted_ptr = TRUE;
}
return result;
+
+ return_long_s:
+ /* Certain folds to 'ss' are prohibited by the options, but they do allow
+ * folds to a string of two of these characters. By returning this
+ * instead, then, e.g.,
+ * fc("\x{1E9E}") eq fc("\x{17F}\x{17F}")
+ * works. */
+
+ *lenp = 2 * sizeof(LATIN_SMALL_LETTER_LONG_S_UTF8) - 2;
+ Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8,
+ ustrp, *lenp, U8);
+ return LATIN_SMALL_LETTER_LONG_S;
}
/* Note: