diff options
-rw-r--r-- | regen/unicode_constants.pl | 16 | ||||
-rw-r--r-- | unicode_constants.h | 3 | ||||
-rw-r--r-- | utf8.c | 10 |
3 files changed, 29 insertions, 0 deletions
diff --git a/regen/unicode_constants.pl b/regen/unicode_constants.pl
index acd1f91e73..baf25f1258 100644
--- a/regen/unicode_constants.pl
+++ b/regen/unicode_constants.pl
@@ -158,6 +158,22 @@ printf $out_fh "\n/* The number of code points not matching \\pC */\n"
                 . "#define NON_OTHER_COUNT_FOR_USE_ONLY_BY_REGCOMP_DOT_C %d\n",
                     0x110000 - $count;
 
+# If this release has both the CWCM and CWCF properties, find the highest code
+# point which changes under any case change. We can use this to short-circuit
+# code
+my @cwcm = prop_invlist('CWCM');
+if (@cwcm) {
+    my @cwcf = prop_invlist('CWCF');
+    if (@cwcf) {
+        my $max = ($cwcm[-1] < $cwcf[-1])
+                  ? $cwcf[-1]
+                  : $cwcm[-1];
+        printf $out_fh "\n/* The highest code point that has any type of case change */\n"
+            . "#define HIGHEST_CASE_CHANGING_CP_FOR_USE_ONLY_BY_UTF8_DOT_C 0x%X\n",
+            $max - 1;
+    }
+}
+
 print $out_fh "\n#endif /* H_UNICODE_CONSTANTS */\n";
 
 read_only_bottom_close_and_rename($out_fh);
diff --git a/unicode_constants.h b/unicode_constants.h
index 71755de7f6..1384873f19 100644
--- a/unicode_constants.h
+++ b/unicode_constants.h
@@ -182,6 +182,9 @@
 /* The number of code points not matching \pC */
 #define NON_OTHER_COUNT_FOR_USE_ONLY_BY_REGCOMP_DOT_C 120522
 
+/* The highest code point that has any type of case change */
+#define HIGHEST_CASE_CHANGING_CP_FOR_USE_ONLY_BY_UTF8_DOT_C 0x118DF
+
 #endif /* H_UNICODE_CONSTANTS */
 
 /* ex: set ro: */
diff --git a/utf8.c b/utf8.c
--- a/utf8.c
+++ b/utf8.c
@@ -1997,6 +1997,16 @@ S__to_utf8_case(pTHX_ const UV uv1, const U8 *p, U8* ustrp, STRLEN *lenp,
                     }
                     goto cases_to_self;
                 }
+#ifdef HIGHEST_CASE_CHANGING_CP_FOR_USE_ONLY_BY_UTF8_DOT_C
+                if (UNLIKELY(uv1
+                    > HIGHEST_CASE_CHANGING_CP_FOR_USE_ONLY_BY_UTF8_DOT_C))
+                {
+
+                    /* As of this writing, this means we avoid swash creation
+                     * for anything beyond low Plane 1 */
+                    goto cases_to_self;
+                }
+#endif
             }
         }
 