utf8.c: Refactor is_utf8_char_helper()

Now that the only callers of this function run the DFA first, which eliminates the need to check here for, e.g., wrong continuation bytes, this function can be refactored to use a switch statement, which makes it clearer, shorter, and faster. The name is changed to indicate its private nature.
author: Karl Williamson <khw@cpan.org> 2021-07-01 18:48:10 -0600
committer: Karl Williamson <khw@cpan.org> 2021-08-14 06:47:43 -0600
commit: 1aa501c28abd51b6253fb6da3caeee66320bf274 (patch)
tree: 34755745c3dffee562186389cedada7b54c88bc0
parent: 22f363ffd253b5142b1138438c30f34da9494d4a (diff)
download: perl-1aa501c28abd51b6253fb6da3caeee66320bf274.tar.gz
5 files changed, 88 insertions, 117 deletions
diff --git a/embed.fnc b/embed.fnc
index 5cea8260ce..834faea428 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -1115,7 +1115,7 @@ pR	|OP*	|cmpchain_finish|NN OP* ch
 ApR	|I32	|is_lvalue_sub
 : Used in cop.h
 XopR	|I32	|was_lvalue_sub
-CpRTP	|STRLEN	|is_utf8_char_helper|NN const U8 * const s|NN const U8 * e|const U32 flags
+CpRTP	|STRLEN	|is_utf8_char_helper_|NN const U8 * const s|NN const U8 * e|const U32 flags
 CpRTP	|Size_t	|is_utf8_FF_helper_|NN const U8 * const s0		    \
 				|NN const U8 * const e			    \
 				|const bool require_partial
diff --git a/embed.h b/embed.h
index f4e0043cf1..8e9b3779dc 100644
--- a/embed.h
+++ b/embed.h
@@ -277,7 +277,7 @@
 #ifndef NO_MATHOMS
 #define is_utf8_char		Perl_is_utf8_char
 #endif
-#define is_utf8_char_helper	Perl_is_utf8_char_helper
+#define is_utf8_char_helper_	Perl_is_utf8_char_helper_
 #define is_utf8_fixed_width_buf_loclen_flags	Perl_is_utf8_fixed_width_buf_loclen_flags
 #define is_utf8_invariant_string_loc	Perl_is_utf8_invariant_string_loc
 #define is_utf8_string_flags	Perl_is_utf8_string_flags
diff --git a/inline.h b/inline.h
index 9cfa445626..31c68bd485 100644
--- a/inline.h
+++ b/inline.h
@@ -2211,7 +2211,7 @@ Perl_isUTF8_CHAR_flags(const U8 * const s0, const U8 * const e, const U32 flags)
 
   check_success:
 
-    return is_utf8_char_helper(s0, e, flags);
+    return is_utf8_char_helper_(s0, e, flags);
 
 #ifdef HAS_EXTRA_LONG_UTF8
 
@@ -2303,7 +2303,7 @@ Perl_is_utf8_valid_partial_char_flags(const U8 * const s0, const U8 * const e, c
         return TRUE;
     }
 
-    return cBOOL(is_utf8_char_helper(s0, e, flags));
+    return cBOOL(is_utf8_char_helper_(s0, e, flags));
 
 #ifdef HAS_EXTRA_LONG_UTF8
 
diff --git a/proto.h b/proto.h
index 16080febb6..effb8ea2f2 100644
--- a/proto.h
+++ b/proto.h
@@ -1733,10 +1733,10 @@ PERL_CALLCONV STRLEN	Perl_is_utf8_char_buf(const U8 *buf, const U8 *buf_end);
 #define PERL_ARGS_ASSERT_IS_UTF8_CHAR_BUF	\
 	assert(buf); assert(buf_end)
 #endif
-PERL_CALLCONV STRLEN	Perl_is_utf8_char_helper(const U8 * const s, const U8 * e, const U32 flags)
+PERL_CALLCONV STRLEN	Perl_is_utf8_char_helper_(const U8 * const s, const U8 * e, const U32 flags)
 			__attribute__warn_unused_result__
 			__attribute__pure__;
-#define PERL_ARGS_ASSERT_IS_UTF8_CHAR_HELPER	\
+#define PERL_ARGS_ASSERT_IS_UTF8_CHAR_HELPER_	\
 	assert(s); assert(e)
 
 /* PERL_CALLCONV bool	is_utf8_fixed_width_buf_flags(const U8 * const s, STRLEN len, const U32 flags); */
diff --git a/utf8.c b/utf8.c
index 7f26c8645c..d9363b3860 100644
--- a/utf8.c
+++ b/utf8.c
@@ -776,149 +776,120 @@ S_does_utf8_overflow(const U8 * const s,
 #undef FF_OVERLONG_PREFIX
 
 STRLEN
-Perl_is_utf8_char_helper(const U8 * const s, const U8 * e, const U32 flags)
+Perl_is_utf8_char_helper_(const U8 * const s, const U8 * e, const U32 flags)
 {
-    STRLEN len;
-    const U8 *x;
+    SSize_t len, full_len;
 
-    /* A helper function that should not be called directly.
-     *
-     * This function returns non-zero if the string beginning at 's' and
-     * looking no further than 'e - 1' is well-formed Perl-extended-UTF-8 for a
-     * code point; otherwise it returns 0.  The examination stops after the
-     * first code point in 's' is validated, not looking at the rest of the
-     * input.  If 'e' is such that there are not enough bytes to represent a
-     * complete code point, this function will return non-zero anyway, if the
-     * bytes it does have are well-formed UTF-8 as far as they go, and aren't
-     * excluded by 'flags'.
-     *
-     * A non-zero return gives the number of bytes required to represent the
-     * code point.  Be aware that if the input is for a partial character, the
-     * return will be larger than 'e - s'.
-     *
-     * This function assumes that the code point represented is UTF-8 variant.
-     * The caller should have excluded the possibility of it being invariant
-     * before calling this function.
+    /* An internal helper function.
      *
+     * On input:
+     *  's' is a string, which is known to be syntactically valid UTF-8 as far
+     *      as (e - 1); e > s must hold.
+     *  'e' is the upper bound: this function may look at any byte from
+     *      's'...'e-1', but nowhere else.  The function has to cope as best
+     *      it can if that sequence does not form a full character.
      * 'flags' can be 0, or any combination of the UTF8_DISALLOW_foo flags
-     * accepted by L</utf8n_to_uvchr>.  If non-zero, this function will return
-     * 0 if the code point represented is well-formed Perl-extended-UTF-8, but
-     * disallowed by the flags.  If the input is only for a partial character,
-     * the function will return non-zero if there is any sequence of
-     * well-formed UTF-8 that, when appended to the input sequence, could
-     * result in an allowed code point; otherwise it returns 0.  Non characters
-     * cannot be determined based on partial character input.  But many  of the
-     * other excluded types can be determined with just the first one or two
-     * bytes.
+     *      accepted by L</utf8n_to_uvchr>.  If non-zero, this function returns
+     *      0 if it determines the input will match something disallowed.
+     * On output:
+     *  The return is the number of bytes required to represent the code point
+     *  if it isn't disallowed by 'flags'; 0 otherwise.  Be aware that if the
+     *  input is for a partial character, a successful return will be larger
+     *  than 'e - s'.
+     *
+     *  If *s..*(e-1) is only for a partial character, the function will return
+     *  non-zero if there is any sequence of well-formed UTF-8 that, when
+     *  appended to the input sequence, could result in an allowed code point;
+     *  otherwise it returns 0.  Noncharacters cannot be determined based on
+     *  partial character input.  But many of the other excluded types can be
+     *  determined with just the first one or two bytes.
      *
      */
 
-    PERL_ARGS_ASSERT_IS_UTF8_CHAR_HELPER;
+    PERL_ARGS_ASSERT_IS_UTF8_CHAR_HELPER_;
 
+    assert(e > s);
     assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
                           |UTF8_DISALLOW_PERL_EXTENDED)));
 
-    if (UTF8_IS_INVARIANT(*s)) {
-        return 1;
-    }
+    full_len = UTF8SKIP(s);
 
-    /* A variant char must begin with a start byte */
-    if (UNLIKELY(! UTF8_IS_START(*s))) {
-        return 0;
+    len = e - s;
+    if (len > full_len) {
+        e = s + full_len;
+        len = full_len;
     }
 
-    /* Examine a maximum of a single whole code point */
-    if (e - s > UTF8SKIP(s)) {
-        e = s + UTF8SKIP(s);
-    }
+    switch (full_len) {
+        bool is_super;
 
-    len = e - s;
+      default: /* Extended */
+        if (flags & UTF8_DISALLOW_PERL_EXTENDED) {
+            return 0;
+        }
 
-    if (flags && isUTF8_POSSIBLY_PROBLEMATIC(*s)) {
-        const U8 s0 = NATIVE_UTF8_TO_I8(s[0]);
-
-        /* Here, we are disallowing some set of largish code points, and the
-         * first byte indicates the sequence is for a code point that could be
-         * in the excluded set.  We generally don't have to look beyond this or
-         * the second byte to see if the sequence is actually for one of the
-         * excluded classes.  The code below is derived from this table:
-         *
-         *              UTF-8            UTF-EBCDIC I8
-         *   U+D800: \xED\xA0\x80      \xF1\xB6\xA0\xA0      First surrogate
-         *   U+DFFF: \xED\xBF\xBF      \xF1\xB7\xBF\xBF      Final surrogate
-         * U+110000: \xF4\x90\x80\x80  \xF9\xA2\xA0\xA0\xA0  First above Unicode
-         *
-         * Keep in mind that legal continuation bytes range between \x80..\xBF
-         * for UTF-8, and \xA0..\xBF for I8.  Anything above those aren't
-         * continuation bytes.  Hence, we don't have to test the upper edge
-         * because if any of those is encountered, the sequence is malformed,
-         * and would fail elsewhere in this function.
-         *
-         * The code here likewise assumes that there aren't other
-         * malformations; again the function should fail elsewhere because of
-         * these.  For example, an overlong beginning with FC doesn't actually
-         * have to be a super; it could actually represent a small code point,
-         * even U+0000.  But, since overlongs (and other malformations) are
-         * illegal, the function should return FALSE in either case.
-         */
+        /* FALLTHROUGH */
 
-        if (  (flags & UTF8_DISALLOW_SUPER)
-            && UNLIKELY(s0 > UTF_START_BYTE_110000_))
-        {
-            return 0;           /* Above Unicode */
+      case 6 + ONE_IF_EBCDIC_ZERO_IF_NOT:   /* above Unicode */
+      case 5 + ONE_IF_EBCDIC_ZERO_IF_NOT:   /* above Unicode */
+
+        if (flags & UTF8_DISALLOW_SUPER) {
+            return 0;                       /* Above Unicode */
         }
 
-        if (   (flags & UTF8_DISALLOW_PERL_EXTENDED)
-            &&  UNLIKELY(UTF8_IS_PERL_EXTENDED(s)))
+        return full_len;
+
+      case 4 + ONE_IF_EBCDIC_ZERO_IF_NOT:
+        is_super = (   UNLIKELY(NATIVE_UTF8_TO_I8(s[0]) > UTF_START_BYTE_110000_)
+                    || (   len > 1
+                        && NATIVE_UTF8_TO_I8(s[0]) == UTF_START_BYTE_110000_
+                        && NATIVE_UTF8_TO_I8(s[1])
+                                                >= UTF_FIRST_CONT_BYTE_110000_));
+        if (is_super) {
+            if (flags & UTF8_DISALLOW_SUPER) {
+                return 0;
+            }
+        }
+        else if (   (flags & UTF8_DISALLOW_NONCHAR)
+                 && len == full_len
+                 && UNLIKELY(is_LARGER_NON_CHARS_utf8(s)))
         {
             return 0;
         }
 
-        if (len > 1) {
-            if (   (flags & UTF8_DISALLOW_SUPER)
-                && NATIVE_UTF8_TO_I8(s[0]) >= UTF_START_BYTE_110000_
-                && NATIVE_UTF8_TO_I8(s[1]) >= UTF_FIRST_CONT_BYTE_110000_)
-            {
-                return 0;       /* Above Unicode */
-            }
+        return full_len;
 
-            if (   (flags & UTF8_DISALLOW_SURROGATE)
-                &&  UNLIKELY(is_SURROGATE_utf8(s)))
-            {
-                return 0;       /* Surrogate */
-            }
+      case 3 + ONE_IF_EBCDIC_ZERO_IF_NOT:
 
-            if (  (flags & UTF8_DISALLOW_NONCHAR)
-                && UNLIKELY(UTF8_IS_NONCHAR(s, e)))
-            {
-                return 0;       /* Noncharacter code point */
-            }
+        if (! isUTF8_POSSIBLY_PROBLEMATIC(s[0]) || len < 2) {
+            return full_len;
         }
-    }
 
-    /* Make sure that all that follows are continuation bytes */
-    for (x = s + 1; x < e; x++) {
-        if (UNLIKELY(! UTF8_IS_CONTINUATION(*x))) {
+        if (   (flags & UTF8_DISALLOW_SURROGATE)
+            &&  UNLIKELY(is_SURROGATE_utf8(s)))
+        {
+            return 0;       /* Surrogate */
+        }
+
+        if (  (flags & UTF8_DISALLOW_NONCHAR)
+            && len == full_len
+            && UNLIKELY(is_SHORTER_NON_CHARS_utf8(s)))
+        {
             return 0;
         }
-    }
 
-    /* Here is syntactically valid.  Next, make sure this isn't the start of an
-     * overlong. */
-    if (is_utf8_overlong(s, len) > 0) {
-        return 0;
-    }
+        return full_len;
 
-    /* And finally, that the code point represented fits in a word on this
-     * platform */
-    if (0 < does_utf8_overflow(s, e,
-                               0 /* Don't consider overlongs */
-                              ))
-    {
-        return 0;
-    }
+      /* The lower code points don't have any disallowable characters */
+#ifdef EBCDIC
+      case 3:
+        return full_len;
+#endif
 
-    return UTF8SKIP(s);
+      case 2:
+      case 1:
+        return full_len;
+    }
 }
 
 Size_t
author	Karl Williamson <khw@cpan.org>	2021-07-01 18:48:10 -0600
committer	Karl Williamson <khw@cpan.org>	2021-08-14 06:47:43 -0600
commit	1aa501c28abd51b6253fb6da3caeee66320bf274 (patch)
tree	34755745c3dffee562186389cedada7b54c88bc0
parent	22f363ffd253b5142b1138438c30f34da9494d4a (diff)
download	perl-1aa501c28abd51b6253fb6da3caeee66320bf274.tar.gz