From e1a8dbf543b3c24ee8aacaf571e19124bec0b7ae Mon Sep 17 00:00:00 2001
From: Karl Williamson <public@khwilliamson.com>
Date: Mon, 22 Aug 2011 09:26:09 -0600
Subject: pp.c: Use built-in case tables for ords < 256

Previously, all case changing on utf8-encoded strings used the tables on
disk, on the off-chance that there was a user-defined case change
override in effect.  Now that that feature has been removed, this can't
happen, so we can use the existing built-in tables.

This code has been present and ifdef'd out since 5.10.1.  New compiler
warnings forced a few other changes besides removing the #if statements.

Running some primitive benchmarks showed that this sped up upper-casing of
utf8 strings in the latin1 range by 2 orders of magnitude.
---
 pp.c | 31 ++++---------------------------
 1 file changed, 4 insertions(+), 27 deletions(-)

diff --git a/pp.c b/pp.c
index 5101eac3dc..54226dd078 100644
--- a/pp.c
+++ b/pp.c
@@ -3617,14 +3617,6 @@ PP(pp_ucfirst)
     else if (DO_UTF8(source)) {	/* Is the source utf8? */
 	doing_utf8 = TRUE;
 
-/* TODO: This is #ifdefd out because it has hard-coded the standard mappings,
- * and doesn't allow for the user to specify their own.  When code is added to
- * detect if there is a user-defined mapping in force here, and if so to use
- * that, then the code below can be compiled.  The detection would be a good
- * thing anyway, as currently the user-defined mappings only work on utf8
- * strings, and thus depend on the chosen internal storage method, which is a
- * bad thing */
-#ifdef GO_AHEAD_AND_BREAK_USER_DEFINED_CASE_MAPPINGS
 	if (UTF8_IS_INVARIANT(*s)) {
 
 	    /* An invariant source character is either ASCII or, in EBCDIC, an
@@ -3696,7 +3688,6 @@ PP(pp_ucfirst)
 	    }
 	}
 	else {
-#endif	/* end of dont want to break user-defined casing */
 
 	    /* Here, can't short-cut the general case */
 
@@ -3707,9 +3698,7 @@ PP(pp_ucfirst)
 	    /* we can't do in-place if the length changes.  */
 	    if (ulen != tculen) inplace = FALSE;
 	    need = slen + 1 - ulen + tculen;
-#ifdef GO_AHEAD_AND_BREAK_USER_DEFINED_CASE_MAPPINGS
 	}
-#endif
     }
     else { /* Non-zero length, non-UTF-8,  Need to consider locale and if
 	    * latin1 is treated as caseless.  Note that a locale takes
@@ -3966,10 +3955,6 @@ PP(pp_uc)
 		in_iota_subscript = FALSE;
 	    }
 
-
-/* See comments at the first instance in this file of this ifdef */
-#ifdef GO_AHEAD_AND_BREAK_USER_DEFINED_CASE_MAPPINGS
-
 	    /* If the UTF-8 character is invariant, then it is in the range
 	     * known by the standard macro; result is only one byte long */
 	    if (UTF8_IS_INVARIANT(*s)) {
@@ -3980,15 +3965,12 @@ PP(pp_uc)
 
 		/* Likewise, if it fits in a byte, its case change is in our
 		 * table */
-		U8 orig = TWO_BYTE_UTF8_TO_UNI(*s, *s++);
+		U8 orig = TWO_BYTE_UTF8_TO_UNI(*s, *(s+1));
 		U8 upper = toUPPER_LATIN1_MOD(orig);
 		CAT_TWO_BYTE_UNI_UPPER_MOD(d, orig, upper);
-		s++;
+		s += 2;
 	    }
 	    else {
-#else
-	    {
-#endif
 
 		/* Otherwise, need the general UTF-8 case.  Get the changed
 		 * case value and copy it to the output buffer */
@@ -4208,8 +4190,6 @@ PP(pp_lc)
 	U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
 
 	while (s < send) {
-/* See comments at the first instance in this file of this ifdef */
-#ifdef GO_AHEAD_AND_BREAK_USER_DEFINED_CASE_MAPPINGS
 	    if (UTF8_IS_INVARIANT(*s)) {
 
 		/* Invariant characters use the standard mappings compiled in.
@@ -4220,12 +4200,11 @@ PP(pp_lc)
 	    else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
 
 		/* As do the ones in the Latin1 range */
-		U8 lower = toLOWER_LATIN1(TWO_BYTE_UTF8_TO_UNI(*s, *s++));
+		U8 lower = toLOWER_LATIN1(TWO_BYTE_UTF8_TO_UNI(*s, *(s+1)));
 		CAT_UNI_TO_UTF8_TWO_BYTE(d, lower);
-		s++;
+		s += 2;
 	    }
 	    else {
-#endif
 		/* Here, is utf8 not in Latin-1 range, have to go out and get
 		 * the mappings from the tables. */
 
@@ -4326,9 +4305,7 @@ PP(pp_lc)
 		Copy(tmpbuf, d, ulen, U8);
 		d += ulen;
 		s += u;
-#ifdef GO_AHEAD_AND_BREAK_USER_DEFINED_CASE_MAPPINGS
 	    }
-#endif
 	}   /* End of looping through the source string */
 	SvUTF8_on(dest);
 	*d = '\0';
-- 
cgit v1.2.1