summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKarl Williamson <public@khwilliamson.com>2011-08-22 09:26:09 -0600
committerKarl Williamson <public@khwilliamson.com>2011-08-27 08:25:24 -0600
commite1a8dbf543b3c24ee8aacaf571e19124bec0b7ae (patch)
tree70be12101f45982804fdf324e7c35be389978b8e
parent9d2ba0fab0a13d87f06a6836d5bc65337cf31ab8 (diff)
downloadperl-e1a8dbf543b3c24ee8aacaf571e19124bec0b7ae.tar.gz
pp.c: Use built-in case tables for ords < 256
Previously, all case changing on utf8-encoded strings used the tables on disk, on the off chance that there was a user-defined case change override in effect. Now that that feature has been removed, this can't happen, so we can use the existing built-in tables. This code has been present and ifdef'd out since 5.10.1. New compiler warnings forced a few other changes besides removing the #if statements. Running some primitive benchmarks showed that this sped up upper-casing of utf8 strings in the latin1 range by 2 orders of magnitude.
-rw-r--r--pp.c31
1 files changed, 4 insertions, 27 deletions
diff --git a/pp.c b/pp.c
index 5101eac3dc..54226dd078 100644
--- a/pp.c
+++ b/pp.c
@@ -3617,14 +3617,6 @@ PP(pp_ucfirst)
else if (DO_UTF8(source)) { /* Is the source utf8? */
doing_utf8 = TRUE;
-/* TODO: This is #ifdefd out because it has hard-coded the standard mappings,
- * and doesn't allow for the user to specify their own. When code is added to
- * detect if there is a user-defined mapping in force here, and if so to use
- * that, then the code below can be compiled. The detection would be a good
- * thing anyway, as currently the user-defined mappings only work on utf8
- * strings, and thus depend on the chosen internal storage method, which is a
- * bad thing */
-#ifdef GO_AHEAD_AND_BREAK_USER_DEFINED_CASE_MAPPINGS
if (UTF8_IS_INVARIANT(*s)) {
/* An invariant source character is either ASCII or, in EBCDIC, an
@@ -3696,7 +3688,6 @@ PP(pp_ucfirst)
}
}
else {
-#endif /* end of dont want to break user-defined casing */
/* Here, can't short-cut the general case */
@@ -3707,9 +3698,7 @@ PP(pp_ucfirst)
/* we can't do in-place if the length changes. */
if (ulen != tculen) inplace = FALSE;
need = slen + 1 - ulen + tculen;
-#ifdef GO_AHEAD_AND_BREAK_USER_DEFINED_CASE_MAPPINGS
}
-#endif
}
else { /* Non-zero length, non-UTF-8, Need to consider locale and if
* latin1 is treated as caseless. Note that a locale takes
@@ -3966,10 +3955,6 @@ PP(pp_uc)
in_iota_subscript = FALSE;
}
-
-/* See comments at the first instance in this file of this ifdef */
-#ifdef GO_AHEAD_AND_BREAK_USER_DEFINED_CASE_MAPPINGS
-
/* If the UTF-8 character is invariant, then it is in the range
* known by the standard macro; result is only one byte long */
if (UTF8_IS_INVARIANT(*s)) {
@@ -3980,15 +3965,12 @@ PP(pp_uc)
/* Likewise, if it fits in a byte, its case change is in our
* table */
- U8 orig = TWO_BYTE_UTF8_TO_UNI(*s, *s++);
+ U8 orig = TWO_BYTE_UTF8_TO_UNI(*s, *(s+1));
U8 upper = toUPPER_LATIN1_MOD(orig);
CAT_TWO_BYTE_UNI_UPPER_MOD(d, orig, upper);
- s++;
+ s += 2;
}
else {
-#else
- {
-#endif
/* Otherwise, need the general UTF-8 case. Get the changed
* case value and copy it to the output buffer */
@@ -4208,8 +4190,6 @@ PP(pp_lc)
U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
while (s < send) {
-/* See comments at the first instance in this file of this ifdef */
-#ifdef GO_AHEAD_AND_BREAK_USER_DEFINED_CASE_MAPPINGS
if (UTF8_IS_INVARIANT(*s)) {
/* Invariant characters use the standard mappings compiled in.
@@ -4220,12 +4200,11 @@ PP(pp_lc)
else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
/* As do the ones in the Latin1 range */
- U8 lower = toLOWER_LATIN1(TWO_BYTE_UTF8_TO_UNI(*s, *s++));
+ U8 lower = toLOWER_LATIN1(TWO_BYTE_UTF8_TO_UNI(*s, *(s+1)));
CAT_UNI_TO_UTF8_TWO_BYTE(d, lower);
- s++;
+ s += 2;
}
else {
-#endif
/* Here, is utf8 not in Latin-1 range, have to go out and get
* the mappings from the tables. */
@@ -4326,9 +4305,7 @@ PP(pp_lc)
Copy(tmpbuf, d, ulen, U8);
d += ulen;
s += u;
-#ifdef GO_AHEAD_AND_BREAK_USER_DEFINED_CASE_MAPPINGS
}
-#endif
} /* End of looping through the source string */
SvUTF8_on(dest);
*d = '\0';