From a1dde8dee0a7efc73bdb7c985e74f1461e153f12 Mon Sep 17 00:00:00 2001
From: Karl Williamson <public@khwilliamson.com>
Date: Tue, 8 Nov 2011 22:16:39 -0700
Subject: utf8.c: Faster latin1 folding

This adds a function, similar to the ones for the other three
case-changing operations, that works on Latin-1 characters only and
avoids having to go out to swashes.  It changes to_uni_fold() and
to_utf8_fold() to call it on the appropriate input.
---
 embed.fnc |  1 +
 embed.h   |  1 +
 proto.h   |  6 ++++++
 utf8.c    | 48 +++++++++++++++++++++++++++++++++++++++++++++++-
 4 files changed, 55 insertions(+), 1 deletion(-)

diff --git a/embed.fnc b/embed.fnc
index e2911dd8a0..446faf5de9 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -603,6 +603,7 @@ Ap	|UV	|to_uni_title	|UV c|NN U8 *p|NN STRLEN *lenp
 #ifdef PERL_IN_UTF8_C
 sR	|U8	|to_lower_latin1|const U8 c|NULLOK U8 *p|NULLOK STRLEN *lenp
 p	|UV	|_to_upper_title_latin1|const U8 c|NN U8 *p|NN STRLEN *lenp|const char S_or_s
+p	|UV	|_to_fold_latin1|const U8 c|NN U8 *p|NN STRLEN *lenp|const U8 flags
 #endif
 Ap	|UV	|to_uni_lower	|UV c|NN U8 *p|NN STRLEN *lenp
 Amp	|UV	|to_uni_fold	|UV c|NN U8 *p|NN STRLEN *lenp
diff --git a/embed.h b/embed.h
index 3d985b5f82..8540031730 100644
--- a/embed.h
+++ b/embed.h
@@ -1571,6 +1571,7 @@
 #define isa_lookup(a,b,c,d)	S_isa_lookup(aTHX_ a,b,c,d)
 #  endif
 #  if defined(PERL_IN_UTF8_C)
+#define _to_fold_latin1(a,b,c,d)	Perl__to_fold_latin1(aTHX_ a,b,c,d)
 #define _to_upper_title_latin1(a,b,c,d)	Perl__to_upper_title_latin1(aTHX_ a,b,c,d)
 #define is_utf8_char_slow	S_is_utf8_char_slow
 #define is_utf8_common(a,b,c)	S_is_utf8_common(aTHX_ a,b,c)
diff --git a/proto.h b/proto.h
index 7f9621a47e..534fab8511 100644
--- a/proto.h
+++ b/proto.h
@@ -6983,6 +6983,12 @@ STATIC bool	S_isa_lookup(pTHX_ HV *stash, const char * const name, STRLEN len, U
 
 #endif
 #if defined(PERL_IN_UTF8_C)
+PERL_CALLCONV UV	Perl__to_fold_latin1(pTHX_ const U8 c, U8 *p, STRLEN *lenp, const U8 flags)
+			__attribute__nonnull__(pTHX_2)
+			__attribute__nonnull__(pTHX_3);
+#define PERL_ARGS_ASSERT__TO_FOLD_LATIN1	\
+	assert(p); assert(lenp)
+
 PERL_CALLCONV UV	Perl__to_upper_title_latin1(pTHX_ const U8 c, U8 *p, STRLEN *lenp, const char S_or_s)
 			__attribute__nonnull__(pTHX_2)
 			__attribute__nonnull__(pTHX_3);
diff --git a/utf8.c b/utf8.c
index 38f5c6c00c..9c55d10c25 100644
--- a/utf8.c
+++ b/utf8.c
@@ -1458,13 +1458,51 @@ Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
     return CALL_LOWER_CASE(p, p, lenp);
 }
 
+UV
+Perl__to_fold_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const U8 flags)
+{
+    UV converted;   /* code point that c folds to */
+    /* Fold c; store the fold's encoded bytes at p and their count in *lenp */
+    PERL_ARGS_ASSERT__TO_FOLD_LATIN1;
+
+    if (c == MICRO_SIGN) { /* folds to Greek mu rather than to its lower case */
+	converted = GREEK_SMALL_LETTER_MU;
+    }
+    else if (flags && c == LATIN_SMALL_LETTER_SHARP_S) { /* only if flags != 0 */
+	*(p)++ = 's';
+	*p = 's';
+	*lenp = 2;   /* the fold is the two characters "ss" */
+	return 's';  /* first character of that two-character fold */
+    }
+    else { /* In this range the fold of all other characters is their lower
+              case */
+	converted = toLOWER_LATIN1(c);
+    }
+
+    if (UNI_IS_INVARIANT(converted)) {  /* stored as a single byte */
+	*p = (U8) converted;
+	*lenp = 1;
+    }
+    else {  /* otherwise stored as a two-byte sequence */
+	*(p)++ = UTF8_TWO_BYTE_HI(converted);
+	*p = UTF8_TWO_BYTE_LO(converted);
+	*lenp = 2;
+    }
+
+    return converted;   /* the fold, as a code point */
+}
+
 UV
 Perl__to_uni_fold_flags(pTHX_ UV c, U8* p, STRLEN *lenp, U8 flags)
 {
     PERL_ARGS_ASSERT__TO_UNI_FOLD_FLAGS;
 
+    if (c < 256) {  /* fits in a U8: fold via the Latin-1 routine */
+	return _to_fold_latin1((U8) c, p, lenp, flags);
+    }
+
     uvchr_to_utf8(p, c);
-    return _to_utf8_fold_flags(p, p, lenp, flags);
+    return CALL_FOLD_CASE(p, p, lenp, flags);  /* p is passed as both source and destination */
 }
 
 /* for now these all assume no locale info available for Unicode > 255 */
@@ -2180,6 +2218,14 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags)
 
     PERL_ARGS_ASSERT__TO_UTF8_FOLD_FLAGS;
 
+    if (UTF8_IS_INVARIANT(*p)) {
+	return _to_fold_latin1(*p, ustrp, lenp, flags);
+    }
+    else if UTF8_IS_DOWNGRADEABLE_START(*p) {
+	return _to_fold_latin1(TWO_BYTE_UTF8_TO_UNI(*p, *(p+1)),
+		                                    ustrp, lenp, flags);
+    }
+
     return CALL_FOLD_CASE(p, ustrp, lenp, flags);
 }
 
-- 
cgit v1.2.1