summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKarl Williamson <khw@cpan.org>2015-12-03 13:27:21 -0700
committerKarl Williamson <khw@cpan.org>2015-12-09 23:43:22 -0700
commit3bfc1e7044659f9ec4cc4f1bc9eea7a8b00061fb (patch)
treeec472afda925dc0c5b5f0f45b48aadb3e241061c
parent36eaa8111efe6b0ebe974f6b26ed667c1206dc9f (diff)
downloadperl-3bfc1e7044659f9ec4cc4f1bc9eea7a8b00061fb.tar.gz
Skip casing for high code points
As discussed in the previous commit, most code points in Unicode don't
change when upper-cased, lower-cased, etc. In fact, as of Unicode v8.0,
93% of the available code points are above the highest one that does
change. This commit skips trying to case these 93%. A regen/ script
keeps track of the highest changing code point in the current Unicode
release, and the casing code in utf8.c skips every code point above it.
Thus currently, casing emoji will be skipped.

Together with the previous commits that dealt with casing, the potential
for huge memory requirements for the swash hashes for casing is severely
limited.

If the following command is run on a perl compiled with -O2 and no
DEBUGGING:

    blead Porting/bench.pl --raw --perlargs="-Ilib -X" \
        --benchfile=plane1_case_perf \
        /path_to_prior_perl=before_this_commit \
        /path_to_new_perl=after

and the file 'plane1_case_perf' contains

    [
        'string::casing::emoji' => {
            desc  => 'yes swash vs no swash',
            setup => 'my $a = "\x{1F570}"',   # MANTELPIECE CLOCK
            code  => 'uc($a)'
        },
    ];

the following results are obtained:

    The numbers represent raw counts per loop iteration.

    string::casing::emoji
    yes swash vs no swash

           before_this_commit    after
           ------------------ --------
        Ir              981.0    306.0
        Dr              228.0     94.0
        Dw              100.0     45.0
      COND              137.0     49.0
       IND                7.0      4.0

    COND_m                5.5      0.0
     IND_m                4.0      2.0

     Ir_m1                0.1     -0.1
     Dr_m1                0.0      0.0
     Dw_m1                0.0      0.0

     Ir_mm                0.0      0.0
     Dr_mm                0.0      0.0
     Dw_mm                0.0      0.0
-rw-r--r--regen/unicode_constants.pl16
-rw-r--r--unicode_constants.h3
-rw-r--r--utf8.c10
3 files changed, 29 insertions, 0 deletions
diff --git a/regen/unicode_constants.pl b/regen/unicode_constants.pl
index acd1f91e73..baf25f1258 100644
--- a/regen/unicode_constants.pl
+++ b/regen/unicode_constants.pl
@@ -158,6 +158,22 @@ printf $out_fh "\n/* The number of code points not matching \\pC */\n"
. "#define NON_OTHER_COUNT_FOR_USE_ONLY_BY_REGCOMP_DOT_C %d\n",
0x110000 - $count;
+# If this release has both the CWCM and CWCF properties, find the highest code
+# point which changes under any case change. We can use this to short-circuit
+# the casing code for every code point above it.
+my @cwcm = prop_invlist('CWCM');
+if (@cwcm) {
+ my @cwcf = prop_invlist('CWCF');
+ if (@cwcf) {
+ my $max = ($cwcm[-1] < $cwcf[-1])
+ ? $cwcf[-1]
+ : $cwcm[-1];
+ printf $out_fh "\n/* The highest code point that has any type of case change */\n"
+ . "#define HIGHEST_CASE_CHANGING_CP_FOR_USE_ONLY_BY_UTF8_DOT_C 0x%X\n",
+ $max - 1;
+ }
+}
+
print $out_fh "\n#endif /* H_UNICODE_CONSTANTS */\n";
read_only_bottom_close_and_rename($out_fh);
diff --git a/unicode_constants.h b/unicode_constants.h
index 71755de7f6..1384873f19 100644
--- a/unicode_constants.h
+++ b/unicode_constants.h
@@ -182,6 +182,9 @@
/* The number of code points not matching \pC */
#define NON_OTHER_COUNT_FOR_USE_ONLY_BY_REGCOMP_DOT_C 120522
+/* The highest code point that has any type of case change */
+#define HIGHEST_CASE_CHANGING_CP_FOR_USE_ONLY_BY_UTF8_DOT_C 0x118DF
+
#endif /* H_UNICODE_CONSTANTS */
/* ex: set ro: */
diff --git a/utf8.c b/utf8.c
index 4c43bdec2f..fa1439bdfc 100644
--- a/utf8.c
+++ b/utf8.c
@@ -1997,6 +1997,16 @@ S__to_utf8_case(pTHX_ const UV uv1, const U8 *p, U8* ustrp, STRLEN *lenp,
}
goto cases_to_self;
}
+#ifdef HIGHEST_CASE_CHANGING_CP_FOR_USE_ONLY_BY_UTF8_DOT_C
+ if (UNLIKELY(uv1
+ > HIGHEST_CASE_CHANGING_CP_FOR_USE_ONLY_BY_UTF8_DOT_C))
+ {
+
+ /* As of this writing, this means we avoid swash creation
+ * for anything beyond low Plane 1 */
+ goto cases_to_self;
+ }
+#endif
}
}