diff options
author | Karl Williamson <public@khwilliamson.com> | 2013-05-18 08:25:16 -0600 |
---|---|---|
committer | Karl Williamson <public@khwilliamson.com> | 2013-05-20 11:01:52 -0600 |
commit | 1ca267a56acf698557ec1deec44af651acc88696 (patch) | |
tree | 007f9d40b63b92728a095d5defdad663e239289c | |
parent | 519101418837cf0edacb710b2b38b42dad6e47c1 (diff) | |
download | perl-1ca267a56acf698557ec1deec44af651acc88696.tar.gz |
Fix multi-char fold edge case
use locale;
fc("\N{LATIN CAPITAL LETTER SHARP S}")
eq fc("\N{LATIN SMALL LETTER LONG S}") x 2
should return true, as the SHARP S folds to two 's's in a row, and the
LONG S is an antique variant of 's', and folds to s. Until this commit,
the expression was false.
Similarly, the following should match, but didn't until this commit:
"\N{LATIN SMALL LETTER SHARP S}" =~ /\N{LATIN SMALL LETTER LONG S}{2}/iaa
The reason these didn't work properly is that in both cases the actual
fold to 's' is disallowed: in the first case because of locale, and in
the second because of /aa. The code wasn't smart enough to realize
that these were nonetheless legal.
The fix is to special-case these so that the fold of sharp s (both
capital and small) is two LONG S's under /aa, as is the fold of the
capital sharp s under locale. The latter is user-visible, and the
documentation of fc() now points that out. I believe this is such an
edge case that no mention of it need be made in perldelta.
-rw-r--r-- | lib/locale.t | 2 | ||||
-rw-r--r-- | pod/perlfunc.pod | 12 | ||||
-rw-r--r-- | regcharclass.h | 6 | ||||
-rw-r--r-- | regcomp.c | 83 | ||||
-rw-r--r-- | regen/regcharclass_multi_char_folds.pl | 23 | ||||
-rw-r--r-- | t/uni/fold.t | 6 | ||||
-rw-r--r-- | utf8.c | 61 |
7 files changed, 148 insertions, 45 deletions
diff --git a/lib/locale.t b/lib/locale.t index a9a5a262e3..580613838d 100644 --- a/lib/locale.t +++ b/lib/locale.t @@ -1517,7 +1517,7 @@ setlocale(LC_ALL, "C"); $above_latin1_case_change_delta = -1; } else { - @list = ("", "A", "\xC0", "\x{1E9E}", "\x{100}"); + @list = ("", "A", "\xC0", "\x{17F}", "\x{100}"); $ascii_case_change_delta = +32; $above_latin1_case_change_delta = +1; } diff --git a/pod/perlfunc.pod b/pod/perlfunc.pod index 676644f732..08b9df9e82 100644 --- a/pod/perlfunc.pod +++ b/pod/perlfunc.pod @@ -2174,8 +2174,16 @@ Case Charts available at L<http://www.unicode.org/charts/case/>. If EXPR is omitted, uses C<$_>. -This function behaves the same way under various pragma, such as in a locale, -as L</lc> does. +This function behaves the same way under various pragma, such as within +S<C<"use feature 'unicode_strings">>, as L</lc> does, with the single +exception of C<fc> of LATIN CAPITAL LETTER SHARP S (U+1E9E) within the +scope of S<C<use locale>>. The foldcase of this character would +normally be C<"ss">, but as explained in the L</lc> section, case +changes that cross the 255/256 boundary are problematic under locales, +and are hence prohibited. Therefore, this function under locale returns +instead the string C<"\x{17F}\x{17F}">, which is the LATIN SMALL LETTER +LONG S. Since that character itself folds to C<"s">, the string of two +of them together should be equivalent to a single U+1E9E when foldcased. While the Unicode Standard defines two additional forms of casefolding, one for Turkic languages and one that never maps one character into multiple diff --git a/regcharclass.h b/regcharclass.h index e51fe64af4..a0bd93ddad 100644 --- a/regcharclass.h +++ b/regcharclass.h @@ -643,6 +643,8 @@ ( ( ( 0xCC == ((U8*)s)[1] ) && ( 0x88 == ((U8*)s)[2] ) ) ? 3 : 0 ) \ : ( 0x77 == ((U8*)s)[0] || 0x79 == ((U8*)s)[0] ) ? \ ( ( ( 0xCC == ((U8*)s)[1] ) && ( 0x8A == ((U8*)s)[2] ) ) ? 3 : 0 ) \ + : ( 0xC5 == ((U8*)s)[0] ) ? 
\ + ( ( ( ( 0xBF == ((U8*)s)[1] ) && ( 0xC5 == ((U8*)s)[2] ) ) && ( 0xBF == ((U8*)s)[3] ) ) ? 4 : 0 )\ : ( 0xCA == ((U8*)s)[0] ) ? \ ( ( ( 0xBC == ((U8*)s)[1] ) && ( 0x6E == ((U8*)s)[2] ) ) ? 3 : 0 ) \ : ( 0xCE == ((U8*)s)[0] ) ? \ @@ -699,6 +701,8 @@ ( ( ( 0xCC == ((U8*)s)[1] ) && ( 0x88 == ((U8*)s)[2] ) ) ? 3 : 0 ) \ : ( 0x77 == ((U8*)s)[0] || 0x79 == ((U8*)s)[0] ) ? \ ( ( ( 0xCC == ((U8*)s)[1] ) && ( 0x8A == ((U8*)s)[2] ) ) ? 3 : 0 ) \ + : ( 0xC5 == ((U8*)s)[0] ) ? \ + ( ( ( ( 0xBF == ((U8*)s)[1] ) && ( 0xC5 == ((U8*)s)[2] ) ) && ( 0xBF == ((U8*)s)[3] ) ) ? 4 : 0 )\ : ( 0xCA == ((U8*)s)[0] ) ? \ ( ( ( 0xBC == ((U8*)s)[1] ) && ( 0x6E == ((U8*)s)[2] ) ) ? 3 : 0 ) \ : ( 0xCE == ((U8*)s)[0] ) ? \ @@ -776,6 +780,8 @@ ( ( ( 0xCC == ((U8*)s)[1] ) && ( 0x88 == ((U8*)s)[2] ) ) ? 3 : 0 ) \ : ( 0x77 == ((U8*)s)[0] || 0x79 == ((U8*)s)[0] ) ? \ ( ( ( 0xCC == ((U8*)s)[1] ) && ( 0x8A == ((U8*)s)[2] ) ) ? 3 : 0 ) \ + : ( 0xC5 == ((U8*)s)[0] ) ? \ + ( ( ( ( 0xBF == ((U8*)s)[1] ) && ( 0xC5 == ((U8*)s)[2] ) ) && ( 0xBF == ((U8*)s)[3] ) ) ? 4 : 0 )\ : ( 0xCA == ((U8*)s)[0] ) ? \ ( ( ( 0xBC == ((U8*)s)[1] ) && ( 0x6E == ((U8*)s)[2] ) ) ? 3 : 0 ) \ : ( 0xCE == ((U8*)s)[0] ) ? \ @@ -2686,29 +2686,37 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode * this file makes sure that in EXACTFU nodes, the sharp s gets folded to * 'ss', even if the pattern isn't UTF-8. This avoids the issues * described in the next item. - * 4) A problem remains for the sharp s in EXACTF nodes. Whether it matches - * 'ss' or not is not knowable at compile time. It will match iff the - * target string is in UTF-8, unlike the EXACTFU nodes, where it always - * matches; and the EXACTFL and EXACTFA nodes where it never does. Thus - * it can't be folded to "ss" at compile time, unlike EXACTFU does (as - * described in item 3). 
An assumption that the optimizer part of - * regexec.c (probably unwittingly) makes is that a character in the - * pattern corresponds to at most a single character in the target string. - * (And I do mean character, and not byte here, unlike other parts of the - * documentation that have never been updated to account for multibyte - * Unicode.) This assumption is wrong only in this case, as all other - * cases are either 1-1 folds when no UTF-8 is involved; or is true by - * virtue of having this file pre-fold UTF-8 patterns. I'm - * reluctant to try to change this assumption, so instead the code punts. - * This routine examines EXACTF nodes for the sharp s, and returns a - * boolean indicating whether or not the node is an EXACTF node that - * contains a sharp s. When it is true, the caller sets a flag that later - * causes the optimizer in this file to not set values for the floating - * and fixed string lengths, and thus avoids the optimizer code in - * regexec.c that makes the invalid assumption. Thus, there is no - * optimization based on string lengths for EXACTF nodes that contain the - * sharp s. This only happens for /id rules (which means the pattern - * isn't in UTF-8). + * 4) A problem remains for the sharp s in EXACTF and EXACTFA nodes when the + * pattern isn't in UTF-8. (BTW, there cannot be an EXACTF node with a + * UTF-8 pattern.) An assumption that the optimizer part of regexec.c + * (probably unwittingly, in Perl_regexec_flags()) makes is that a + * character in the pattern corresponds to at most a single character in + * the target string. (And I do mean character, and not byte here, unlike + * other parts of the documentation that have never been updated to + * account for multibyte Unicode.) sharp s in EXACTF nodes can match the + * two character string 'ss'; in EXACTFA nodes it can match + * "\x{17F}\x{17F}". These violate the assumption, and they are the only + * instances where it is violated. 
I'm reluctant to try to change the + * assumption, as the code involved is impenetrable to me (khw), so + * instead the code here punts. This routine examines (when the pattern + * isn't UTF-8) EXACTF and EXACTFA nodes for the sharp s, and returns a + * boolean indicating whether or not the node contains a sharp s. When it + * is true, the caller sets a flag that later causes the optimizer in this + * file to not set values for the floating and fixed string lengths, and + * thus avoids the optimizer code in regexec.c that makes the invalid + * assumption. Thus, there is no optimization based on string lengths for + * non-UTF8-pattern EXACTF and EXACTFA nodes that contain the sharp s. + * (The reason the assumption is wrong only in these two cases is that all + * other non-UTF-8 folds are 1-1; and, for UTF-8 patterns, we pre-fold all + * other folds to their expanded versions. We can't prefold sharp s to + * 'ss' in EXACTF nodes because we don't know at compile time if it + * actually matches 'ss' or not. It will match iff the target string is + * in UTF-8, unlike the EXACTFU nodes, where it always matches; and + * EXACTFA and EXACTFL where it never does. In an EXACTFA node in a UTF-8 + * pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the problem; + * but in a non-UTF8 pattern, folding it to that above-Latin1 string would + * require the pattern to be forced into UTF-8, the overhead of which we + * want to avoid.) */ #define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \ @@ -2899,13 +2907,30 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b next_iteration: ; } } - else if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) { + else if (OP(scan) == EXACTFA) { - /* Here, the pattern is not UTF-8. Look for the multi-char folds - * that are all ASCII. As in the above case, EXACTFL and EXACTFA - * nodes can't have multi-char folds to this range (and there are - * no existing ones in the upper latin1 range). 
In the EXACTF - * case we look also for the sharp s, which can be in the final + /* Non-UTF-8 pattern, EXACTFA node. There can't be a multi-char + * fold to the ASCII range (and there are no existing ones in the + * upper latin1 range). But, as outlined in the comments preceding + * this function, we need to flag any occurrences of the sharp s */ + while (s < s_end) { + if (*s == LATIN_SMALL_LETTER_SHARP_S) { + *has_exactf_sharp_s = TRUE; + break; + } + s++; + continue; + } + } + else if (OP(scan) != EXACTFL) { + + /* Non-UTF-8 pattern, not EXACTFA nor EXACTFL node. Look for the + * multi-char folds that are all Latin1. (This code knows that + * there are no current multi-char folds possible with EXACTFL, + * relying on fold_grind.t to catch any errors if the very unlikely + * event happens that some get added in future Unicode versions.) + * As explained in the comments preceding this function, we look + * also for the sharp s in EXACTF nodes; it can be in the final * position. Otherwise we can stop looking 1 byte earlier because * have to find at least two characters for a multi-fold */ const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1; diff --git a/regen/regcharclass_multi_char_folds.pl b/regen/regcharclass_multi_char_folds.pl index f0fd6b3a89..f04be85c58 100644 --- a/regen/regcharclass_multi_char_folds.pl +++ b/regen/regcharclass_multi_char_folds.pl @@ -104,6 +104,29 @@ sub multi_char_folds ($) { } } + # \x17F is the small LONG S, which folds to 's'. Both Capital and small + # LATIN SHARP S fold to 'ss'. Therefore, they should also match two 17F's + # in a row under regex /i matching. But under /iaa regex matching, all + # three folds to 's' are prohibited, but the sharp S's should still match + # two 17F's. This prohibition causes our regular regex algorithm that + # would ordinarily allow this match to fail. This is the only instance in + # all Unicode of this kind of issue. 
By adding a special case here, we + # can use the regular algorithm (with some other changes elsewhere as + # well). + # + # It would be possible to re-write the above code to automatically detect + # and handle this case, and any others that might eventually get added to + # the Unicode standard, but I (khw) don't think it's worth it. I believe + # that it's extremely unlikely that more folds to ASCII characters are + # going to be added, and if I'm wrong, fold_grind.t has the intelligence + # to detect them, and test that they work, at which point another special + # case could be added here if necessary. + # + # No combinations of this with 's' need be added, as any of these + # containing 's' are prohibted under /iaa. + push @folds, "\"\x{17F}\x{17F}\""; + + return @folds; } diff --git a/t/uni/fold.t b/t/uni/fold.t index 91356bbe18..6c06a2fdc2 100644 --- a/t/uni/fold.t +++ b/t/uni/fold.t @@ -444,6 +444,12 @@ foreach my $test_ref (@CF) { } } +{ + use feature qw( fc ); + use locale; + is(fc("\x{1E9E}"), fc("\x{17F}\x{17F}"), 'fc("\x{1E9E}") eq fc("\x{17F}\x{17F}")'); +} + my $num_tests = curr_test() - 1; plan($num_tests); @@ -1783,21 +1783,38 @@ UV Perl__to_fold_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const unsigned int flags) { /* Corresponds to to_lower_latin1(); <flags> bits meanings: + * FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited * FOLD_FLAGS_FULL iff full folding is to be used; + * + * Not to be used for locale folds */ UV converted; PERL_ARGS_ASSERT__TO_FOLD_LATIN1; + assert (! (flags & FOLD_FLAGS_LOCALE)); + if (c == MICRO_SIGN) { converted = GREEK_SMALL_LETTER_MU; } else if ((flags & FOLD_FLAGS_FULL) && c == LATIN_SMALL_LETTER_SHARP_S) { + + /* If can't cross 127/128 boundary, can't return "ss"; instead return + * two U+017F characters, as fc("\df") should eq fc("\x{17f}\x{17f}") + * under those circumstances. 
*/ + if (flags & FOLD_FLAGS_NOMIX_ASCII) { + *lenp = 2 * sizeof(LATIN_SMALL_LETTER_LONG_S_UTF8) - 2; + Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8, + p, *lenp, U8); + return LATIN_SMALL_LETTER_LONG_S; + } + else { *(p)++ = 's'; *p = 's'; *lenp = 2; return 's'; + } } else { /* In this range the fold of all other characters is their lower case */ @@ -1832,11 +1849,7 @@ Perl__to_uni_fold_flags(pTHX_ UV c, U8* p, STRLEN *lenp, const U8 flags) if (c < 256) { UV result = _to_fold_latin1((U8) c, p, lenp, - /* If ASCII-safe, don't allow full folding, - * as that could include SHARP S => ss; - * otherwise there is no crossing of - * ascii/non-ascii in the latin1 range */ - (flags & FOLD_FLAGS_NOMIX_ASCII) ? 0 : flags & FOLD_FLAGS_FULL); + flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII)); /* It is illegal for the fold to cross the 255/256 boundary under * locale; in this case return the original */ return (result > 256 && flags & FOLD_FLAGS_LOCALE) @@ -2773,7 +2786,7 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags, b } else { return _to_fold_latin1(*p, ustrp, lenp, - flags & FOLD_FLAGS_FULL); + flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII)); } } else if UTF8_IS_DOWNGRADEABLE_START(*p) { @@ -2783,18 +2796,22 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags, b else { return _to_fold_latin1(TWO_BYTE_UTF8_TO_UNI(*p, *(p+1)), ustrp, lenp, - /* If ASCII safe, don't allow full - * folding, as that could include SHARP - * S => ss; otherwise there is no - * crossing of ascii/non-ascii in the - * latin1 range */ - (flags & FOLD_FLAGS_NOMIX_ASCII) ? 
0 : flags & FOLD_FLAGS_FULL); + flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII)); } } else { /* utf8, ord above 255 */ result = CALL_FOLD_CASE(p, ustrp, lenp, flags & FOLD_FLAGS_FULL); - if ((flags & FOLD_FLAGS_LOCALE)) { + if (flags & FOLD_FLAGS_LOCALE) { + + /* Special case this character, as what normally gets returned + * under locale doesn't work */ + if (UTF8SKIP(p) == sizeof(LATIN_CAPITAL_LETTER_SHARP_S_UTF8) - 1 + && memEQ((char *) p, LATIN_CAPITAL_LETTER_SHARP_S_UTF8, + sizeof(LATIN_CAPITAL_LETTER_SHARP_S_UTF8) - 1)) + { + goto return_long_s; + } return check_locale_boundary_crossing(p, result, ustrp, lenp); } else if (! (flags & FOLD_FLAGS_NOMIX_ASCII)) { @@ -2815,6 +2832,12 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags, b if (isASCII(*s)) { /* Crossed, have to return the original */ original = valid_utf8_to_uvchr(p, lenp); + + /* But in this one instance, there is an alternative we can + * return that is valid */ + if (original == LATIN_CAPITAL_LETTER_SHARP_S) { + goto return_long_s; + } Copy(p, ustrp, *lenp, char); return original; } @@ -2841,6 +2864,18 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags, b *tainted_ptr = TRUE; } return result; + + return_long_s: + /* Certain folds to 'ss' are prohibited by the options, but they do allow + * folds to a string of two of these characters. By returning this + * instead, then, e.g., + * fc("\x{1E9E}") eq fc("\x{17F}\x{17F}") + * works. */ + + *lenp = 2 * sizeof(LATIN_SMALL_LETTER_LONG_S_UTF8) - 2; + Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8, + ustrp, *lenp, U8); + return LATIN_SMALL_LETTER_LONG_S; } /* Note: |