diff options
author | Karl Williamson <public@khwilliamson.com> | 2013-05-18 08:25:16 -0600 |
---|---|---|
committer | Karl Williamson <public@khwilliamson.com> | 2013-05-20 11:01:52 -0600 |
commit | 1ca267a56acf698557ec1deec44af651acc88696 (patch) | |
tree | 007f9d40b63b92728a095d5defdad663e239289c /utf8.c | |
parent | 519101418837cf0edacb710b2b38b42dad6e47c1 (diff) | |
download | perl-1ca267a56acf698557ec1deec44af651acc88696.tar.gz |
Fix multi-char fold edge case
use locale;
fc("\N{LATIN CAPITAL LETTER SHARP S}")
eq 2 x fc("\N{LATIN SMALL LETTER LONG S}")
should return true, as the SHARP S folds to two 's's in a row, and the
LONG S is an antique variant of 's', and folds to s. Until this commit,
the expression was false.
Similarly, the following should match, but didn't until this commit:
"\N{LATIN SMALL LETTER SHARP S}" =~ /\N{LATIN SMALL LETTER LONG S}{2}/iaa
The reason these didn't work properly is that in both cases the actual
fold to 's' is disallowed. In the first case because of locale; and in
the second because of /aa. And the code wasn't smart enough to realize
that these were legal.
The fix is to special case these so that the fold of sharp s (both
capital and small) is two LONG S's under /aa; as is the fold of the
capital sharp s under locale. The latter is user-visible, and the
documentation of fc() now points that out. I believe this is such an
edge case that no mention of it need be done in perldelta.
Diffstat (limited to 'utf8.c')
-rw-r--r-- | utf8.c | 61 |
1 files changed, 48 insertions, 13 deletions
@@ -1783,21 +1783,38 @@ UV Perl__to_fold_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const unsigned int flags) { /* Corresponds to to_lower_latin1(); <flags> bits meanings: + * FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited * FOLD_FLAGS_FULL iff full folding is to be used; + * + * Not to be used for locale folds */ UV converted; PERL_ARGS_ASSERT__TO_FOLD_LATIN1; + assert (! (flags & FOLD_FLAGS_LOCALE)); + if (c == MICRO_SIGN) { converted = GREEK_SMALL_LETTER_MU; } else if ((flags & FOLD_FLAGS_FULL) && c == LATIN_SMALL_LETTER_SHARP_S) { + + /* If can't cross 127/128 boundary, can't return "ss"; instead return + * two U+017F characters, as fc("\df") should eq fc("\x{17f}\x{17f}") + * under those circumstances. */ + if (flags & FOLD_FLAGS_NOMIX_ASCII) { + *lenp = 2 * sizeof(LATIN_SMALL_LETTER_LONG_S_UTF8) - 2; + Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8, + p, *lenp, U8); + return LATIN_SMALL_LETTER_LONG_S; + } + else { *(p)++ = 's'; *p = 's'; *lenp = 2; return 's'; + } } else { /* In this range the fold of all other characters is their lower case */ @@ -1832,11 +1849,7 @@ Perl__to_uni_fold_flags(pTHX_ UV c, U8* p, STRLEN *lenp, const U8 flags) if (c < 256) { UV result = _to_fold_latin1((U8) c, p, lenp, - /* If ASCII-safe, don't allow full folding, - * as that could include SHARP S => ss; - * otherwise there is no crossing of - * ascii/non-ascii in the latin1 range */ - (flags & FOLD_FLAGS_NOMIX_ASCII) ? 0 : flags & FOLD_FLAGS_FULL); + flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII)); /* It is illegal for the fold to cross the 255/256 boundary under * locale; in this case return the original */ return (result > 256 && flags & FOLD_FLAGS_LOCALE) @@ -2773,7 +2786,7 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags, b } else { return _to_fold_latin1(*p, ustrp, lenp, - flags & FOLD_FLAGS_FULL); + flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII)); } } else if UTF8_IS_DOWNGRADEABLE_START(*p) { @@ -2783,18 +2796,22 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags, b else { return _to_fold_latin1(TWO_BYTE_UTF8_TO_UNI(*p, *(p+1)), ustrp, lenp, - /* If ASCII safe, don't allow full - * folding, as that could include SHARP - * S => ss; otherwise there is no - * crossing of ascii/non-ascii in the - * latin1 range */ - (flags & FOLD_FLAGS_NOMIX_ASCII) ? 0 : flags & FOLD_FLAGS_FULL); + flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII)); } } else { /* utf8, ord above 255 */ result = CALL_FOLD_CASE(p, ustrp, lenp, flags & FOLD_FLAGS_FULL); - if ((flags & FOLD_FLAGS_LOCALE)) { + if (flags & FOLD_FLAGS_LOCALE) { + + /* Special case this character, as what normally gets returned + * under locale doesn't work */ + if (UTF8SKIP(p) == sizeof(LATIN_CAPITAL_LETTER_SHARP_S_UTF8) - 1 + && memEQ((char *) p, LATIN_CAPITAL_LETTER_SHARP_S_UTF8, + sizeof(LATIN_CAPITAL_LETTER_SHARP_S_UTF8) - 1)) + { + goto return_long_s; + } return check_locale_boundary_crossing(p, result, ustrp, lenp); } else if (! (flags & FOLD_FLAGS_NOMIX_ASCII)) { @@ -2815,6 +2832,12 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags, b if (isASCII(*s)) { /* Crossed, have to return the original */ original = valid_utf8_to_uvchr(p, lenp); + + /* But in this one instance, there is an alternative we can + * return that is valid */ + if (original == LATIN_CAPITAL_LETTER_SHARP_S) { + goto return_long_s; + } Copy(p, ustrp, *lenp, char); return original; } @@ -2841,6 +2864,18 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags, b *tainted_ptr = TRUE; } return result; + + return_long_s: + /* Certain folds to 'ss' are prohibited by the options, but they do allow + * folds to a string of two of these characters. By returning this + * instead, then, e.g., + * fc("\x{1E9E}") eq fc("\x{17F}\x{17F}") + * works. */ + + *lenp = 2 * sizeof(LATIN_SMALL_LETTER_LONG_S_UTF8) - 2; + Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8, + ustrp, *lenp, U8); + return LATIN_SMALL_LETTER_LONG_S; } /* Note: |