summary | refs | log | tree | commit | diff
path: root/utf8.c
diff options
context:
space:
mode:
Diffstat (limited to 'utf8.c')
-rw-r--r-- utf8.c 143
1 files changed, 89 insertions, 54 deletions
diff --git a/utf8.c b/utf8.c
index 372b1f1200..3d0d355580 100644
--- a/utf8.c
+++ b/utf8.c
@@ -3166,26 +3166,34 @@ Perl__is_utf8_perl_idcont(pTHX_ const U8 *p, const U8 * const e)
}
STATIC UV
-S__to_utf8_case(pTHX_ const UV original, const U8 *p,
- U8* ustrp, STRLEN *lenp,
+S_to_case_cp_list(pTHX_ const UV original,
+ const U32 ** const remaining_list,
+ Size_t * remaining_count,
SV *invlist, const I32 * const invmap,
const U32 * const * const aux_tables,
const U8 * const aux_table_lengths,
const char * const normal)
{
- STRLEN len = 0;
-
- /* Change the case of code point 'original' whose UTF-8 representation (assumed
- * by this routine to be valid) begins at 'p'. 'normal' is a string to use
- * to name the new case in any generated messages, as a fallback if the
- * operation being used is not available. The new case is given by the
- * data structures in the remaining arguments.
+ SSize_t index;
+ I32 base;
+
+ /* Return the changed case of code point 'original'. The first code point of
+ * the changed case is returned; *remaining_count will be set to how many
+ * other code points are in the changed case. If it is non-zero,
+ * *remaining_list will point to a non-modifiable array containing them;
+ * if zero, *remaining_list is undefined.
*
- * On return 'ustrp' points to '*lenp' UTF-8 encoded bytes representing the
- * entire changed case string, and the return value is the first code point
- * in that string */
+ * 'normal' is a string to use to name the new case in any generated
+ * messages, as a fallback if the operation being used is not available.
+ *
+ * The casing to use is given by the data structures in the remaining
+ * arguments.
+ */
- PERL_ARGS_ASSERT__TO_UTF8_CASE;
+ PERL_ARGS_ASSERT_TO_CASE_CP_LIST;
+
+ /* Almost all results will be a single value */
+ *remaining_count = 0;
/* For code points that don't change case, we already know that the output
* of this function is the unchanged input, so we can skip doing look-ups
@@ -3204,7 +3212,7 @@ S__to_utf8_case(pTHX_ const UV original, const U8 *p,
* Bengali, Gurmukhi, Gujarati, Oriya, Tamil, Telugu, Kannada,
* Malayalam, Sinhala, Thai, Lao, Tibetan, Myanmar */
if (original < 0x10A0) {
- goto cases_to_self;
+ return original;
}
/* The following largish code point ranges also don't have case
@@ -3231,7 +3239,7 @@ S__to_utf8_case(pTHX_ const UV original, const U8 *p,
* that the test suite will start having failures to alert you
* should that happen) */
if (original < 0xA640) {
- goto cases_to_self;
+ return original;
}
if (original >= 0xAC00) {
@@ -3242,13 +3250,13 @@ S__to_utf8_case(pTHX_ const UV original, const U8 *p,
"Operation \"%s\" returns its argument for"
" UTF-16 surrogate U+%04" UVXf, desc, original);
}
- goto cases_to_self;
+ return original;
}
/* AC00..FAFF Catches Hangul syllables and private use, plus
* some others */
if (original < 0xFB00) {
- goto cases_to_self;
+ return original;
}
if (UNLIKELY(UNICODE_IS_SUPER(original))) {
@@ -3261,12 +3269,13 @@ S__to_utf8_case(pTHX_ const UV original, const U8 *p,
"Operation \"%s\" returns its argument for"
" non-Unicode code point 0x%04" UVXf, desc, original);
}
- goto cases_to_self;
+ return original;
}
+
#ifdef HIGHEST_CASE_CHANGING_CP
- if (UNLIKELY(original > HIGHEST_CASE_CHANGING_CP)) {
- goto cases_to_self;
+ if (UNLIKELY(original > HIGHEST_CASE_CHANGING_CP)) {
+ return original;
}
#endif
}
@@ -3276,64 +3285,90 @@ S__to_utf8_case(pTHX_ const UV original, const U8 *p,
* be given. */
}
- {
- unsigned int i;
- const U32 * cp_list;
- U8 * d;
/* 'index' is guaranteed to be non-negative, as this is an inversion
* map that covers all possible inputs. See [perl #133365] */
- SSize_t index = _invlist_search(invlist, original);
- I32 base = invmap[index];
+ index = _invlist_search(invlist, original);
+ base = invmap[index];
/* The data structures are set up so that if 'base' is non-negative,
* the case change is 1-to-1; and if 0, the change is to itself */
- if (base >= 0) {
- IV lc;
-
- if (base == 0) {
- goto cases_to_self;
+ if (LIKELY(base == 0)) {
+ return original;
}
- /* This computes, e.g. lc(H) as 'H - A + a', using the lc table */
- lc = base + original - invlist_array(invlist)[index];
- *lenp = uvchr_to_utf8(ustrp, lc) - ustrp;
- return lc;
+ if (LIKELY(base > 0)) {
+ return base + original - invlist_array(invlist)[index];
}
+
/* Here 'base' is negative. That means the mapping is 1-to-many, and
* requires an auxiliary table look up. abs(base) gives the index into
* a list of such tables which points to the proper aux table. And a
* parallel list gives the length of each corresponding aux table. */
- cp_list = aux_tables[-base];
+ base = -base;
+ *remaining_list = aux_tables[base] + 1;
+ *remaining_count = (Size_t) (aux_table_lengths[base] - 1);
- /* Create the string of UTF-8 from the mapped-to code points */
- d = ustrp;
- for (i = 0; i < aux_table_lengths[-base]; i++) {
- d = uvchr_to_utf8(d, cp_list[i]);
- }
- *d = '\0';
- *lenp = d - ustrp;
+ return (UV) aux_tables[base][0];
+}
- return cp_list[0];
- }
+STATIC UV
+S__to_utf8_case(pTHX_ const UV original, const U8 *p,
+ U8* ustrp, STRLEN *lenp,
+ SV *invlist, const I32 * const invmap,
+ const U32 * const * const aux_tables,
+ const U8 * const aux_table_lengths,
+ const char * const normal)
+{
+ /* Change the case of code point 'original'. If 'p' is non-NULL, it points to
+ * the beginning of the (assumed to be valid) UTF-8 representation of
+ * 'original'. 'normal' is a string to use to name the new case in any
+ * generated messages, as a fallback if the operation being used is not
+ * available. The new case is given by the data structures in the
+ * remaining arguments.
+ *
+ * On return 'ustrp' points to '*lenp' UTF-8 encoded bytes representing the
+ * entire changed case string, and the return value is the first code point
+ * in that string
+ *
+ * Note that the <ustrp> needs to be at least UTF8_MAXBYTES_CASE+1 bytes
+ * since the changed version may be longer than the original character. */
+
+ const U32 * remaining_list;
+ Size_t remaining_count;
+ UV first = to_case_cp_list(original,
+ &remaining_list, &remaining_count,
+ invlist, invmap,
+ aux_tables, aux_table_lengths,
+ normal);
+
+ PERL_ARGS_ASSERT__TO_UTF8_CASE;
+
+ /* If the code point maps to itself and we already have its representation,
+ * copy it instead of recalculating */
+ if (original == first && p) {
+ *lenp = UTF8SKIP(p);
- /* Here, there was no mapping defined, which means that the code point maps
- * to itself. Return the inputs */
- cases_to_self:
- if (p) {
- len = UTF8SKIP(p);
if (p != ustrp) { /* Don't copy onto itself */
- Copy(p, ustrp, len, U8);
+ Copy(p, ustrp, *lenp, U8);
}
- *lenp = len;
}
else {
- *lenp = uvchr_to_utf8(ustrp, original) - ustrp;
+ U8 * d = ustrp;
+ Size_t i;
+
+ d = uvchr_to_utf8(d, first);
+
+ for (i = 0; i < remaining_count; i++) {
+ d = uvchr_to_utf8(d, remaining_list[i]);
}
- return original;
+ *d = '\0';
+ *lenp = d - ustrp;
+ }
+ return first;
}
Size_t