summaryrefslogtreecommitdiff
path: root/utf8.c
diff options
context:
space:
mode:
authorKarl Williamson <khw@cpan.org>2021-05-09 15:46:01 -0600
committerKarl Williamson <khw@cpan.org>2021-08-23 08:02:35 -0600
commita2475cdc782ee260e97e2cd6266e12bd6b513613 (patch)
treecb30aa9bd51569a0e16e828aebc94ba8d7af79d9 /utf8.c
parent7b47c6158ac7e52929a1470911ccb63053ba2193 (diff)
downloadperl-a2475cdc782ee260e97e2cd6266e12bd6b513613.tar.gz
utf8.c: Rmv no longer needed speed-up code
The code this commit removes made sense when we were using swashes, and we had to go out to files on disk to find the answers. It used knowledge of the Unicode character database to skip swaths of scripts which are caseless. But now, all that information is stored in C arrays that will be paged in when accessed, which is done by a binary search. The information about those swaths is in those arrays. The conditionals removed here are better spent in executing iterations of the search in L1 cache.
Diffstat (limited to 'utf8.c')
-rw-r--r--utf8.c90
1 files changed, 12 insertions, 78 deletions
diff --git a/utf8.c b/utf8.c
index 3d0d355580..b372267506 100644
--- a/utf8.c
+++ b/utf8.c
@@ -3192,57 +3192,16 @@ S_to_case_cp_list(pTHX_ const UV original,
PERL_ARGS_ASSERT_TO_CASE_CP_LIST;
- /* Almost all results will be a single value */
- *remaining_count = 0;
-
- /* For code points that don't change case, we already know that the output
- * of this function is the unchanged input, so we can skip doing look-ups
- * for them. Unfortunately the case-changing code points are scattered
- * around. But there are some long consecutive ranges where there are no
- * case changing code points. By adding tests, we can eliminate the lookup
- * for all the ones in such ranges. This is currently done here only for
- * just a few cases where the scripts are in common use in modern commerce
- * (and scripts adjacent to those which can be included without additional
- * tests). */
-
- if (original >= 0x0590) {
- /* This keeps from needing further processing the code points most
- * likely to be used in the following non-cased scripts: Hebrew,
- * Arabic, Syriac, Thaana, NKo, Samaritan, Mandaic, Devanagari,
- * Bengali, Gurmukhi, Gujarati, Oriya, Tamil, Telugu, Kannada,
- * Malayalam, Sinhala, Thai, Lao, Tibetan, Myanmar */
- if (original < 0x10A0) {
- return original;
- }
+ /* 'index' is guaranteed to be non-negative, as this is an inversion
+ * map that covers all possible inputs. See [perl #133365] */
+ index = _invlist_search(invlist, original);
+ base = invmap[index];
- /* The following largish code point ranges also don't have case
- * changes, but khw didn't think they warranted extra tests to speed
- * them up (which would slightly slow down everything else above them):
- * 1100..139F Hangul Jamo, Ethiopic
- * 1400..1CFF Unified Canadian Aboriginal Syllabics, Ogham, Runic,
- * Tagalog, Hanunoo, Buhid, Tagbanwa, Khmer, Mongolian,
- * Limbu, Tai Le, New Tai Lue, Buginese, Tai Tham,
- * Combining Diacritical Marks Extended, Balinese,
- * Sundanese, Batak, Lepcha, Ol Chiki
- * 2000..206F General Punctuation
- */
-
- if (original >= 0x2D30) {
-
- /* This keeps the from needing further processing the code points
- * most likely to be used in the following non-cased major scripts:
- * CJK, Katakana, Hiragana, plus some less-likely scripts.
- *
- * (0x2D30 above might have to be changed to 2F00 in the unlikely
- * event that Unicode eventually allocates the unused block as of
- * v8.0 2FE0..2FEF to code points that are cased. khw has verified
- * that the test suite will start having failures to alert you
- * should that happen) */
- if (original < 0xA640) {
- return original;
- }
+ if (LIKELY(base == 0)) { /* 0 => original was unchanged by casing */
- if (original >= 0xAC00) {
+ /* At this bottom level routine is where we warn about illegal code
+ * points */
+ if (isUNICODE_POSSIBLY_PROBLEMATIC(original)) {
if (UNLIKELY(UNICODE_IS_SURROGATE(original))) {
if (ckWARN_d(WARN_SURROGATE)) {
const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
@@ -3250,16 +3209,8 @@ S_to_case_cp_list(pTHX_ const UV original,
"Operation \"%s\" returns its argument for"
" UTF-16 surrogate U+%04" UVXf, desc, original);
}
- return original;
- }
-
- /* AC00..FAFF Catches Hangul syllables and private use, plus
- * some others */
- if (original < 0xFB00) {
- return original;
}
-
- if (UNLIKELY(UNICODE_IS_SUPER(original))) {
+ else if (UNLIKELY(UNICODE_IS_SUPER(original))) {
if (UNLIKELY(original > MAX_LEGAL_CP)) {
Perl_croak(aTHX_ "%s", form_cp_too_large_msg(16, NULL, 0, original));
}
@@ -3269,35 +3220,18 @@ S_to_case_cp_list(pTHX_ const UV original,
"Operation \"%s\" returns its argument for"
" non-Unicode code point 0x%04" UVXf, desc, original);
}
- return original;
- }
-
-#ifdef HIGHEST_CASE_CHANGING_CP
-
- if (UNLIKELY(original > HIGHEST_CASE_CHANGING_CP)) {
- return original;
}
-#endif
- }
- }
-
/* Note that non-characters are perfectly legal, so no warning should
* be given. */
- }
-
- /* 'index' is guaranteed to be non-negative, as this is an inversion
- * map that covers all possible inputs. See [perl #133365] */
- index = _invlist_search(invlist, original);
- base = invmap[index];
+ }
- /* The data structures are set up so that if 'base' is non-negative,
- * the case change is 1-to-1; and if 0, the change is to itself */
- if (LIKELY(base == 0)) {
+ *remaining_count = 0;
return original;
}
if (LIKELY(base > 0)) {
+ *remaining_count = 0;
return base + original - invlist_array(invlist)[index];
}