Add helper function for longest UTF8 sequence

This specialized functionality is used to check the validity of Perl's extended-length UTF-8, which has some ideosyncratic characteristics from the shorter sequences. This means this function doesn't have to consider those differences. It will be used in the next commit to avoid some work, and to eventually enable is_utf8_char_helper() to be simplified.
author: Karl Williamson <khw@cpan.org> 2021-06-26 19:57:03 -0600
committer: Karl Williamson <khw@cpan.org> 2021-08-07 05:14:44 -0600
commit: 58b66e89ba80b4f91ff12da18da1301aa182a687 (patch)
tree: db91351c2e183774b4b816c78fd842d73872bc98 /utf8.c
parent: 527347e0493f4ee24d63df996fd6616806422d17 (diff)
download: perl-58b66e89ba80b4f91ff12da18da1301aa182a687.tar.gz
1 files changed, 64 insertions, 0 deletions
diff --git a/utf8.c b/utf8.c
index 5877add9f2..6b157dfaaa 100644
--- a/utf8.c
+++ b/utf8.c
@@ -956,6 +956,70 @@ Perl_is_utf8_char_helper(const U8 * const s, const U8 * e, const U32 flags)
     return UTF8SKIP(s);
 }
 
+Size_t
+Perl_is_utf8_FF_helper_(const U8 * const s0, const U8 * const e,
+                        const bool require_partial)
+{
+    /* This is called to determine if the UTF-8 sequence starting at s0 and
+     * continuing for up to one full character of bytes, but looking no further
+     * than 'e - 1', is legal.  *s0 must be 0xFF (or whatever the native
+     * equivalent of FF in I8 on EBCDIC platforms is).  This marks it as being
+     * for the largest code points recognized by Perl, the ones that require
+     * the most UTF-8 bytes per character to represent (somewhat less than
+     * twice the size of the next longest kind).  This sequence will only ever
+     * be Perl extended UTF-8.
+     *
+     * The routine returns 0 if the sequence is not fully valid, syntactically
+     * or semantically.  That means it checks that everything following the
+     * start byte is a continuation byte, and that it doesn't overflow, nor is
+     * an overlong representation.
+     *
+     * If 'require_partial' is FALSE, the routine returns non-zero only if the
+     * input (as far as 'e-1') is a full character.  The return is the count of
+     * the bytes in the character.
+     *
+     * If 'require_partial' is TRUE, the routine returns non-zero only if the
+     * input as far as 'e-1' is a partial, not full character, with no
+     * malformations found before position 'e'.  The return is either just
+     * FALSE, or TRUE.  */
+
+    const U8 *s = s0 + 1;
+    const U8 *send = e;
+
+    PERL_ARGS_ASSERT_IS_UTF8_FF_HELPER_;
+
+    assert(s0 < e);
+    assert(*s0 == I8_TO_NATIVE_UTF8(0xFF));
+
+    send = s + MIN(UTF8_MAXBYTES - 1, e - s);
+    while (s < send) {
+        if (! UTF8_IS_CONTINUATION(*s)) {
+            return 0;
+        }
+
+        s++;
+    }
+
+    if (0 < does_utf8_overflow(s0, e,
+                               FALSE /* Don't consider_overlongs */
+    )) {
+        return 0;
+    }
+
+    if (0 < isFF_overlong(s0, e - s0)) {
+        return 0;
+    }
+
+    /* Here, the character is valid as far as it got.  Check if got a partial
+     * character */
+    if (s - s0 < UTF8_MAXBYTES) {
+        return (require_partial) ? 1 : 0;
+    }
+
+    /* Here, got a full character */
+    return (require_partial) ? 0 : UTF8_MAXBYTES;
+}
+
 char *
 Perl__byte_dump_string(pTHX_ const U8 * const start, const STRLEN len, const bool format)
 {
author	Karl Williamson <khw@cpan.org>	2021-06-26 19:57:03 -0600
committer	Karl Williamson <khw@cpan.org>	2021-08-07 05:14:44 -0600
commit	58b66e89ba80b4f91ff12da18da1301aa182a687 (patch)
tree	db91351c2e183774b4b816c78fd842d73872bc98 /utf8.c
parent	527347e0493f4ee24d63df996fd6616806422d17 (diff)
download	perl-58b66e89ba80b4f91ff12da18da1301aa182a687.tar.gz