diff options
author | Karl Williamson <khw@cpan.org> | 2021-06-27 02:19:19 -0600 |
---|---|---|
committer | Karl Williamson <khw@cpan.org> | 2021-08-07 05:14:43 -0600 |
commit | 8b5f2733d203f1d4c77d2a7e2250f5b63c17c9e9 (patch) | |
tree | 4d780adf97c32e8aca95b22269706a4d19ab62a8 /utf8.c | |
parent | d49e4ce1792863bdc3e344a52aef19110508e1ac (diff) | |
download | perl-8b5f2733d203f1d4c77d2a7e2250f5b63c17c9e9.tar.gz |
utf8.c: Generalize static fcn
I've always been uncomfortable with the input constraints this function
had. Now that it has been refactored into using a switch(), new cases
for full generality can be added without affecting performance, and
some conditionals removed before calling it.
The function is renamed to reflect its more generality
Diffstat (limited to 'utf8.c')
-rw-r--r-- | utf8.c | 65 |
1 files changed, 40 insertions, 25 deletions
@@ -608,7 +608,7 @@ S_is_utf8_cp_above_31_bits(const U8 * const s, * but for these huge code points, speed shouldn't be a consideration, and * the compiler does have enough information, since it's static to this * file, to optimize to just the needed parts.) */ - is_overlong = is_utf8_overlong_given_start_byte_ok(s, len); + is_overlong = is_utf8_overlong(s, len); /* If it isn't overlong, more than 31 bits are required. */ if (is_overlong == 0) { @@ -692,7 +692,7 @@ S_is_utf8_cp_above_31_bits(const U8 * const s, #endif PERL_STATIC_INLINE int -S_is_utf8_overlong_given_start_byte_ok(const U8 * const s, const STRLEN len) +S_is_utf8_overlong(const U8 * const s, const STRLEN len) { /* Returns an int indicating whether or not the UTF-8 sequence from 's' to * 's' + 'len' - 1 is an overlong. It returns 1 if it is an overlong; 0 if @@ -700,23 +700,15 @@ S_is_utf8_overlong_given_start_byte_ok(const U8 * const s, const STRLEN len) * return value can happen if the sequence is incomplete, missing some * trailing bytes that would form a complete character. If there are * enough bytes to make a definitive decision, this function does so. - * Usually 2 bytes sufficient. + * Usually 2 bytes are sufficient. * * Overlongs can occur whenever the number of continuation bytes changes. * That means whenever the number of leading 1 bits in a start byte * increases from the next lower start byte. That happens for start bytes - * C0, E0, F0, F8, FC, FE, and FF. On modern perls, the following illegal - * start bytes have already been excluded, so don't need to be tested here; - * ASCII platforms: C0, C1 - * EBCDIC platforms C0, C1, C2, C3, C4, E0 + * C0, E0, F0, F8, FC, FE, and FF. */ - U8 s1; - - PERL_ARGS_ASSERT_IS_UTF8_OVERLONG_GIVEN_START_BYTE_OK; - assert(len > 1 && UTF8_IS_START(*s)); - - s1 = NATIVE_UTF8_TO_I8(s[1]); + PERL_ARGS_ASSERT_IS_UTF8_OVERLONG; /* Each platform has overlongs after the start bytes given above (expressed * in I8 for EBCDIC). The values below were found by manually inspecting @@ -724,18 +716,43 @@ S_is_utf8_overlong_given_start_byte_ok(const U8 * const s, const STRLEN len) switch (NATIVE_UTF8_TO_I8(s[0])) { default: + assert(UTF8_IS_START(s[0])); return 0; -#ifndef EBCDIC /* the E0 overlong has already been excluded on EBCDIC - platforms. */ - case 0xE0: return s1 < 0xA0; + case 0xC0: + case 0xC1: + return 1; + +#ifdef EBCDIC + + case 0xC2: + case 0xC3: + case 0xC4: + case 0xE0: + return 1; +#else + case 0xE0: + return (len < 2) ? -1 : s[1] < 0xA0; #endif - case 0xF0: return s1 < UTF_MIN_CONTINUATION_BYTE + 0x10; - case 0xF8: return s1 < UTF_MIN_CONTINUATION_BYTE + 0x08; - case 0xFC: return s1 < UTF_MIN_CONTINUATION_BYTE + 0x04; - case 0xFE: return s1 < UTF_MIN_CONTINUATION_BYTE + 0x02; - case 0xFF: return isFF_overlong(s, len); + case 0xF0: + return (len < 2) + ? -1 + : NATIVE_UTF8_TO_I8(s[1]) < UTF_MIN_CONTINUATION_BYTE + 0x10; + case 0xF8: + return (len < 2) + ? -1 + : NATIVE_UTF8_TO_I8(s[1]) < UTF_MIN_CONTINUATION_BYTE + 0x08; + case 0xFC: + return (len < 2) + ? -1 + : NATIVE_UTF8_TO_I8(s[1]) < UTF_MIN_CONTINUATION_BYTE + 0x04; + case 0xFE: + return (len < 2) + ? -1 + : NATIVE_UTF8_TO_I8(s[1]) < UTF_MIN_CONTINUATION_BYTE + 0x02; + case 0xFF: + return isFF_overlong(s, len); } } @@ -1055,7 +1072,7 @@ Perl_is_utf8_char_helper(const U8 * const s, const U8 * e, const U32 flags) /* Here is syntactically valid. Next, make sure this isn't the start of an * overlong. */ - if (len > 1 && is_utf8_overlong_given_start_byte_ok(s, len) > 0) { + if (is_utf8_overlong(s, len) > 0) { return 0; } @@ -1733,9 +1750,7 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s, && UNLIKELY(expectlen > (STRLEN) OFFUNISKIP(uv))) || ( UNLIKELY(possible_problems) && ( UNLIKELY(! UTF8_IS_START(*s0)) - || ( curlen > 1 - && UNLIKELY(0 < is_utf8_overlong_given_start_byte_ok(s0, - s - s0)))))) + || (UNLIKELY(0 < is_utf8_overlong(s0, s - s0)))))) { possible_problems |= UTF8_GOT_LONG; |