utf8.c: Generalize static fcn

I've always been uncomfortable with the input constraints this function had. Now that it has been refactored to use a switch(), new cases for full generality can be added without affecting performance, and some of the conditionals that guarded calls to it can be removed. The function is renamed to reflect its greater generality.
author: Karl Williamson <khw@cpan.org> 2021-06-27 02:19:19 -0600
committer: Karl Williamson <khw@cpan.org> 2021-08-07 05:14:43 -0600
commit: 8b5f2733d203f1d4c77d2a7e2250f5b63c17c9e9 (patch)
tree: 4d780adf97c32e8aca95b22269706a4d19ab62a8
parent: d49e4ce1792863bdc3e344a52aef19110508e1ac (diff)
download: perl-8b5f2733d203f1d4c77d2a7e2250f5b63c17c9e9.tar.gz
4 files changed, 45 insertions, 30 deletions
diff --git a/embed.fnc b/embed.fnc
index 4a53b25d58..7d851bae01 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -2482,8 +2482,8 @@ EXp	|char *	|_byte_dump_string					\
 iTR	|int	|does_utf8_overflow|NN const U8 * const s		\
 				   |NN const U8 * e			\
 				   |const bool consider_overlongs
-iTR	|int	|is_utf8_overlong_given_start_byte_ok|NN const U8 * const s \
-						     |const STRLEN len
+iTR	|int	|is_utf8_overlong|NN const U8 * const s			\
+				|const STRLEN len
 iTR	|int	|isFF_overlong	|NN const U8 * const s|const STRLEN len
 SR	|char *	|unexpected_non_continuation_text			\
 		|NN const U8 * const s					\
diff --git a/embed.h b/embed.h
index ee85c75abd..e26c96149c 100644
--- a/embed.h
+++ b/embed.h
@@ -1991,7 +1991,7 @@
 #define does_utf8_overflow	S_does_utf8_overflow
 #define isFF_overlong		S_isFF_overlong
 #define is_utf8_common(a,b,c)	S_is_utf8_common(aTHX_ a,b,c)
-#define is_utf8_overlong_given_start_byte_ok	S_is_utf8_overlong_given_start_byte_ok
+#define is_utf8_overlong	S_is_utf8_overlong
 #define new_msg_hv(a,b,c)	S_new_msg_hv(aTHX_ a,b,c)
 #define to_lower_latin1		S_to_lower_latin1
 #define turkic_fc(a,b,c,d)	S_turkic_fc(aTHX_ a,b,c,d)
diff --git a/proto.h b/proto.h
index bb77d8307b..94fdfecbb3 100644
--- a/proto.h
+++ b/proto.h
@@ -6647,9 +6647,9 @@ PERL_STATIC_INLINE bool	S_is_utf8_common(pTHX_ const U8 *const p, const U8 *cons
 #endif
 
 #ifndef PERL_NO_INLINE_FUNCTIONS
-PERL_STATIC_INLINE int	S_is_utf8_overlong_given_start_byte_ok(const U8 * const s, const STRLEN len)
+PERL_STATIC_INLINE int	S_is_utf8_overlong(const U8 * const s, const STRLEN len)
 			__attribute__warn_unused_result__;
-#define PERL_ARGS_ASSERT_IS_UTF8_OVERLONG_GIVEN_START_BYTE_OK	\
+#define PERL_ARGS_ASSERT_IS_UTF8_OVERLONG	\
 	assert(s)
 #endif
 
diff --git a/utf8.c b/utf8.c
index cd8e84833e..1be95015cb 100644
--- a/utf8.c
+++ b/utf8.c
@@ -608,7 +608,7 @@ S_is_utf8_cp_above_31_bits(const U8 * const s,
      * but for these huge code points, speed shouldn't be a consideration, and
      * the compiler does have enough information, since it's static to this
      * file, to optimize to just the needed parts.) */
-    is_overlong = is_utf8_overlong_given_start_byte_ok(s, len);
+    is_overlong = is_utf8_overlong(s, len);
 
     /* If it isn't overlong, more than 31 bits are required. */
     if (is_overlong == 0) {
@@ -692,7 +692,7 @@ S_is_utf8_cp_above_31_bits(const U8 * const s,
 #endif
 
 PERL_STATIC_INLINE int
-S_is_utf8_overlong_given_start_byte_ok(const U8 * const s, const STRLEN len)
+S_is_utf8_overlong(const U8 * const s, const STRLEN len)
 {
     /* Returns an int indicating whether or not the UTF-8 sequence from 's' to
      * 's' + 'len' - 1 is an overlong.  It returns 1 if it is an overlong; 0 if
@@ -700,23 +700,15 @@ S_is_utf8_overlong_given_start_byte_ok(const U8 * const s, const STRLEN len)
      * return value can happen if the sequence is incomplete, missing some
      * trailing bytes that would form a complete character.  If there are
      * enough bytes to make a definitive decision, this function does so.
-     * Usually 2 bytes sufficient.
+     * Usually 2 bytes are sufficient.
      *
      * Overlongs can occur whenever the number of continuation bytes changes.
      * That means whenever the number of leading 1 bits in a start byte
      * increases from the next lower start byte.  That happens for start bytes
-     * C0, E0, F0, F8, FC, FE, and FF.  On modern perls, the following illegal
-     * start bytes have already been excluded, so don't need to be tested here;
-     * ASCII platforms: C0, C1
-     * EBCDIC platforms C0, C1, C2, C3, C4, E0
+     * C0, E0, F0, F8, FC, FE, and FF.
      */
 
-    U8 s1;
-
-    PERL_ARGS_ASSERT_IS_UTF8_OVERLONG_GIVEN_START_BYTE_OK;
-    assert(len > 1 && UTF8_IS_START(*s));
-
-    s1 = NATIVE_UTF8_TO_I8(s[1]);
+    PERL_ARGS_ASSERT_IS_UTF8_OVERLONG;
 
     /* Each platform has overlongs after the start bytes given above (expressed
      * in I8 for EBCDIC).  The values below were found by manually inspecting
@@ -724,18 +716,43 @@ S_is_utf8_overlong_given_start_byte_ok(const U8 * const s, const STRLEN len)
 
     switch (NATIVE_UTF8_TO_I8(s[0])) {
       default:
+        assert(UTF8_IS_START(s[0]));
         return 0;
-#ifndef EBCDIC /* the E0 overlong has already been excluded on EBCDIC
-                  platforms. */
-      case 0xE0: return s1 < 0xA0;
 
+      case 0xC0:
+      case 0xC1:
+        return 1;
+
+#ifdef EBCDIC
+
+      case 0xC2:
+      case 0xC3:
+      case 0xC4:
+      case 0xE0:
+        return 1;
+#else
+      case 0xE0:
+        return (len < 2) ? -1 : s[1] < 0xA0;
 #endif
 
-      case 0xF0: return s1 < UTF_MIN_CONTINUATION_BYTE + 0x10;
-      case 0xF8: return s1 < UTF_MIN_CONTINUATION_BYTE + 0x08;
-      case 0xFC: return s1 < UTF_MIN_CONTINUATION_BYTE + 0x04;
-      case 0xFE: return s1 < UTF_MIN_CONTINUATION_BYTE + 0x02;
-      case 0xFF: return isFF_overlong(s, len);
+      case 0xF0:
+        return (len < 2)
+               ? -1
+               : NATIVE_UTF8_TO_I8(s[1]) < UTF_MIN_CONTINUATION_BYTE + 0x10;
+      case 0xF8:
+        return (len < 2)
+               ? -1
+               : NATIVE_UTF8_TO_I8(s[1]) < UTF_MIN_CONTINUATION_BYTE + 0x08;
+      case 0xFC:
+        return (len < 2)
+               ? -1
+               : NATIVE_UTF8_TO_I8(s[1]) < UTF_MIN_CONTINUATION_BYTE + 0x04;
+      case 0xFE:
+        return (len < 2)
+               ? -1
+               : NATIVE_UTF8_TO_I8(s[1]) < UTF_MIN_CONTINUATION_BYTE + 0x02;
+      case 0xFF:
+        return isFF_overlong(s, len);
     }
 }
 
@@ -1055,7 +1072,7 @@ Perl_is_utf8_char_helper(const U8 * const s, const U8 * e, const U32 flags)
 
     /* Here is syntactically valid.  Next, make sure this isn't the start of an
      * overlong. */
-    if (len > 1 && is_utf8_overlong_given_start_byte_ok(s, len) > 0) {
+    if (is_utf8_overlong(s, len) > 0) {
         return 0;
     }
 
@@ -1733,9 +1750,7 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s,
               && UNLIKELY(expectlen > (STRLEN) OFFUNISKIP(uv)))
         || (       UNLIKELY(possible_problems)
             && (   UNLIKELY(! UTF8_IS_START(*s0))
-                || (   curlen > 1
-                    && UNLIKELY(0 < is_utf8_overlong_given_start_byte_ok(s0,
-                                                                s - s0))))))
+                || (UNLIKELY(0 < is_utf8_overlong(s0, s - s0))))))
     {
         possible_problems |= UTF8_GOT_LONG;
author	Karl Williamson <khw@cpan.org>	2021-06-27 02:19:19 -0600
committer	Karl Williamson <khw@cpan.org>	2021-08-07 05:14:43 -0600
commit	8b5f2733d203f1d4c77d2a7e2250f5b63c17c9e9 (patch)
tree	4d780adf97c32e8aca95b22269706a4d19ab62a8
parent	d49e4ce1792863bdc3e344a52aef19110508e1ac (diff)
download	perl-8b5f2733d203f1d4c77d2a7e2250f5b63c17c9e9.tar.gz