Fix multi-char fold edge case

use locale; fc("\N{LATIN CAPITAL LETTER SHARP S}") eq 2 x fc("\N{LATIN SMALL LETTER LONG S}") should return true, as the SHARP S folds to two 's's in a row, and the LONG S is an antique variant of 's', and folds to s. Until this commit, the expression was false. Similarly, the following should match, but didn't until this commit: "\N{LATIN SMALL LETTER SHARP S}" =~ /\N{LATIN SMALL LETTER LONG S}{2}/iaa The reason these didn't work properly is that in both cases the actual fold to 's' is disallowed. In the first case because of locale; and in the second because of /aa. And the code wasn't smart enough to realize that these were legal. The fix is to special case these so that the fold of sharp s (both capital and small) is two LONG S's under /aa; as is the fold of the capital sharp s under locale. The latter is user-visible, and the documentation of fc() now points that out. I believe this is such an edge case that no mention of it need be done in perldelta.
author: Karl Williamson <public@khwilliamson.com> 2013-05-18 08:25:16 -0600
committer: Karl Williamson <public@khwilliamson.com> 2013-05-20 11:01:52 -0600
commit: 1ca267a56acf698557ec1deec44af651acc88696 (patch)
tree: 007f9d40b63b92728a095d5defdad663e239289c /utf8.c
parent: 519101418837cf0edacb710b2b38b42dad6e47c1 (diff)
download: perl-1ca267a56acf698557ec1deec44af651acc88696.tar.gz
1 files changed, 48 insertions, 13 deletions
diff --git a/utf8.c b/utf8.c
index 0e8274f877..c3394e5450 100644
--- a/utf8.c
+++ b/utf8.c
@@ -1783,21 +1783,38 @@ UV
 Perl__to_fold_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const unsigned int flags)
 {
     /* Corresponds to to_lower_latin1(); <flags> bits meanings:
+     *	    FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited
      *	    FOLD_FLAGS_FULL  iff full folding is to be used;
+     *
+     *	Not to be used for locale folds
      */
 
     UV converted;
 
     PERL_ARGS_ASSERT__TO_FOLD_LATIN1;
 
+    assert (! (flags & FOLD_FLAGS_LOCALE));
+
     if (c == MICRO_SIGN) {
 	converted = GREEK_SMALL_LETTER_MU;
     }
     else if ((flags & FOLD_FLAGS_FULL) && c == LATIN_SMALL_LETTER_SHARP_S) {
+
+        /* If can't cross 127/128 boundary, can't return "ss"; instead return
+         * two U+017F characters, as fc("\df") should eq fc("\x{17f}\x{17f}")
+         * under those circumstances. */
+        if (flags & FOLD_FLAGS_NOMIX_ASCII) {
+            *lenp = 2 * sizeof(LATIN_SMALL_LETTER_LONG_S_UTF8) - 2;
+            Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8,
+                 p, *lenp, U8);
+            return LATIN_SMALL_LETTER_LONG_S;
+        }
+        else {
 	*(p)++ = 's';
 	*p = 's';
 	*lenp = 2;
 	return 's';
+        }
     }
     else { /* In this range the fold of all other characters is their lower
               case */
@@ -1832,11 +1849,7 @@ Perl__to_uni_fold_flags(pTHX_ UV c, U8* p, STRLEN *lenp, const U8 flags)
 
     if (c < 256) {
 	UV result = _to_fold_latin1((U8) c, p, lenp,
-				   /* If ASCII-safe, don't allow full folding,
-				    * as that could include SHARP S => ss;
-				    * otherwise there is no crossing of
-				    * ascii/non-ascii in the latin1 range */
-			       (flags & FOLD_FLAGS_NOMIX_ASCII) ? 0 : flags & FOLD_FLAGS_FULL);
+			      flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII));
 	/* It is illegal for the fold to cross the 255/256 boundary under
 	 * locale; in this case return the original */
 	return (result > 256 && flags & FOLD_FLAGS_LOCALE)
@@ -2773,7 +2786,7 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags, b
 	}
 	else {
 	    return _to_fold_latin1(*p, ustrp, lenp,
-                            flags & FOLD_FLAGS_FULL);
+                            flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII));
 	}
     }
     else if UTF8_IS_DOWNGRADEABLE_START(*p) {
@@ -2783,18 +2796,22 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags, b
 	else {
 	    return _to_fold_latin1(TWO_BYTE_UTF8_TO_UNI(*p, *(p+1)),
                             ustrp, lenp,
-				       /* If ASCII safe, don't allow full
-					* folding, as that could include SHARP
-					* S => ss; otherwise there is no
-					* crossing of ascii/non-ascii in the
-					* latin1 range */
-			       (flags & FOLD_FLAGS_NOMIX_ASCII) ? 0 : flags & FOLD_FLAGS_FULL);
+                            flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII));
 	}
     }
     else {  /* utf8, ord above 255 */
 	result = CALL_FOLD_CASE(p, ustrp, lenp, flags & FOLD_FLAGS_FULL);
 
-	if ((flags & FOLD_FLAGS_LOCALE)) {
+	if (flags & FOLD_FLAGS_LOCALE) {
+
+            /* Special case this character, as what normally gets returned
+             * under locale doesn't work */
+            if (UTF8SKIP(p) == sizeof(LATIN_CAPITAL_LETTER_SHARP_S_UTF8) - 1
+                && memEQ((char *) p, LATIN_CAPITAL_LETTER_SHARP_S_UTF8,
+                          sizeof(LATIN_CAPITAL_LETTER_SHARP_S_UTF8) - 1))
+            {
+                goto return_long_s;
+            }
 	    return check_locale_boundary_crossing(p, result, ustrp, lenp);
 	}
 	else if (! (flags & FOLD_FLAGS_NOMIX_ASCII)) {
@@ -2815,6 +2832,12 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags, b
 		if (isASCII(*s)) {
 		    /* Crossed, have to return the original */
 		    original = valid_utf8_to_uvchr(p, lenp);
+
+                    /* But in this one instance, there is an alternative we can
+                     * return that is valid */
+                    if (original == LATIN_CAPITAL_LETTER_SHARP_S) {
+                        goto return_long_s;
+                    }
 		    Copy(p, ustrp, *lenp, char);
 		    return original;
 		}
@@ -2841,6 +2864,18 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags, b
 	*tainted_ptr = TRUE;
     }
     return result;
+
+  return_long_s:
+    /* Certain folds to 'ss' are prohibited by the options, but they do allow
+     * folds to a string of two of these characters.  By returning this
+     * instead, then, e.g.,
+     *      fc("\x{1E9E}") eq fc("\x{17F}\x{17F}")
+     * works. */
+
+    *lenp = 2 * sizeof(LATIN_SMALL_LETTER_LONG_S_UTF8) - 2;
+    Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8,
+        ustrp, *lenp, U8);
+    return LATIN_SMALL_LETTER_LONG_S;
 }
 
 /* Note:
author	Karl Williamson <public@khwilliamson.com>	2013-05-18 08:25:16 -0600
committer	Karl Williamson <public@khwilliamson.com>	2013-05-20 11:01:52 -0600
commit	1ca267a56acf698557ec1deec44af651acc88696 (patch)
tree	007f9d40b63b92728a095d5defdad663e239289c /utf8.c
parent	519101418837cf0edacb710b2b38b42dad6e47c1 (diff)
download	perl-1ca267a56acf698557ec1deec44af651acc88696.tar.gz