summaryrefslogtreecommitdiff
path: root/utf8.c
diff options
context:
space:
mode:
authorKarl Williamson <public@khwilliamson.com>2013-05-18 08:25:16 -0600
committerKarl Williamson <public@khwilliamson.com>2013-05-20 11:01:52 -0600
commit1ca267a56acf698557ec1deec44af651acc88696 (patch)
tree007f9d40b63b92728a095d5defdad663e239289c /utf8.c
parent519101418837cf0edacb710b2b38b42dad6e47c1 (diff)
downloadperl-1ca267a56acf698557ec1deec44af651acc88696.tar.gz
Fix multi-char fold edge case
use locale; fc("\N{LATIN CAPITAL LETTER SHARP S}") eq 2 x fc("\N{LATIN SMALL LETTER LONG S}") should return true, as the SHARP S folds to two 's's in a row, and the LONG S is an antique variant of 's', and folds to s. Until this commit, the expression was false. Similarly, the following should match, but didn't until this commit: "\N{LATIN SMALL LETTER SHARP S}" =~ /\N{LATIN SMALL LETTER LONG S}{2}/iaa The reason these didn't work properly is that in both cases the actual fold to 's' is disallowed. In the first case because of locale; and in the second because of /aa. And the code wasn't smart enough to realize that these were legal. The fix is to special case these so that the fold of sharp s (both capital and small) is two LONG S's under /aa; as is the fold of the capital sharp s under locale. The latter is user-visible, and the documentation of fc() now points that out. I believe this is such an edge case that no mention of it need be done in perldelta.
Diffstat (limited to 'utf8.c')
-rw-r--r--utf8.c61
1 files changed, 48 insertions, 13 deletions
diff --git a/utf8.c b/utf8.c
index 0e8274f877..c3394e5450 100644
--- a/utf8.c
+++ b/utf8.c
@@ -1783,21 +1783,38 @@ UV
Perl__to_fold_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const unsigned int flags)
{
/* Corresponds to to_lower_latin1(); <flags> bits meanings:
+ * FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited
* FOLD_FLAGS_FULL iff full folding is to be used;
+ *
+ * Not to be used for locale folds
*/
UV converted;
PERL_ARGS_ASSERT__TO_FOLD_LATIN1;
+ assert (! (flags & FOLD_FLAGS_LOCALE));
+
if (c == MICRO_SIGN) {
converted = GREEK_SMALL_LETTER_MU;
}
else if ((flags & FOLD_FLAGS_FULL) && c == LATIN_SMALL_LETTER_SHARP_S) {
+
+ /* If can't cross 127/128 boundary, can't return "ss"; instead return
+ * two U+017F characters, as fc("\df") should eq fc("\x{17f}\x{17f}")
+ * under those circumstances. */
+ if (flags & FOLD_FLAGS_NOMIX_ASCII) {
+ *lenp = 2 * sizeof(LATIN_SMALL_LETTER_LONG_S_UTF8) - 2;
+ Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8,
+ p, *lenp, U8);
+ return LATIN_SMALL_LETTER_LONG_S;
+ }
+ else {
*(p)++ = 's';
*p = 's';
*lenp = 2;
return 's';
+ }
}
else { /* In this range the fold of all other characters is their lower
case */
@@ -1832,11 +1849,7 @@ Perl__to_uni_fold_flags(pTHX_ UV c, U8* p, STRLEN *lenp, const U8 flags)
if (c < 256) {
UV result = _to_fold_latin1((U8) c, p, lenp,
- /* If ASCII-safe, don't allow full folding,
- * as that could include SHARP S => ss;
- * otherwise there is no crossing of
- * ascii/non-ascii in the latin1 range */
- (flags & FOLD_FLAGS_NOMIX_ASCII) ? 0 : flags & FOLD_FLAGS_FULL);
+ flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII));
/* It is illegal for the fold to cross the 255/256 boundary under
* locale; in this case return the original */
return (result > 256 && flags & FOLD_FLAGS_LOCALE)
@@ -2773,7 +2786,7 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags, b
}
else {
return _to_fold_latin1(*p, ustrp, lenp,
- flags & FOLD_FLAGS_FULL);
+ flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII));
}
}
else if UTF8_IS_DOWNGRADEABLE_START(*p) {
@@ -2783,18 +2796,22 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags, b
else {
return _to_fold_latin1(TWO_BYTE_UTF8_TO_UNI(*p, *(p+1)),
ustrp, lenp,
- /* If ASCII safe, don't allow full
- * folding, as that could include SHARP
- * S => ss; otherwise there is no
- * crossing of ascii/non-ascii in the
- * latin1 range */
- (flags & FOLD_FLAGS_NOMIX_ASCII) ? 0 : flags & FOLD_FLAGS_FULL);
+ flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII));
}
}
else { /* utf8, ord above 255 */
result = CALL_FOLD_CASE(p, ustrp, lenp, flags & FOLD_FLAGS_FULL);
- if ((flags & FOLD_FLAGS_LOCALE)) {
+ if (flags & FOLD_FLAGS_LOCALE) {
+
+ /* Special case this character, as what normally gets returned
+ * under locale doesn't work */
+ if (UTF8SKIP(p) == sizeof(LATIN_CAPITAL_LETTER_SHARP_S_UTF8) - 1
+ && memEQ((char *) p, LATIN_CAPITAL_LETTER_SHARP_S_UTF8,
+ sizeof(LATIN_CAPITAL_LETTER_SHARP_S_UTF8) - 1))
+ {
+ goto return_long_s;
+ }
return check_locale_boundary_crossing(p, result, ustrp, lenp);
}
else if (! (flags & FOLD_FLAGS_NOMIX_ASCII)) {
@@ -2815,6 +2832,12 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags, b
if (isASCII(*s)) {
/* Crossed, have to return the original */
original = valid_utf8_to_uvchr(p, lenp);
+
+ /* But in this one instance, there is an alternative we can
+ * return that is valid */
+ if (original == LATIN_CAPITAL_LETTER_SHARP_S) {
+ goto return_long_s;
+ }
Copy(p, ustrp, *lenp, char);
return original;
}
@@ -2841,6 +2864,18 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags, b
*tainted_ptr = TRUE;
}
return result;
+
+ return_long_s:
+ /* Certain folds to 'ss' are prohibited by the options, but they do allow
+ * folds to a string of two of these characters. By returning this
+ * instead, then, e.g.,
+ * fc("\x{1E9E}") eq fc("\x{17F}\x{17F}")
+ * works. */
+
+ *lenp = 2 * sizeof(LATIN_SMALL_LETTER_LONG_S_UTF8) - 2;
+ Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8,
+ ustrp, *lenp, U8);
+ return LATIN_SMALL_LETTER_LONG_S;
}
/* Note: