Add helper function for longest UTF8 sequence

This specialized functionality is used to check the validity of Perl's extended-length UTF-8, which has some ideosyncratic characteristics from the shorter sequences. This means this function doesn't have to consider those differences. It will be used in the next commit to avoid some work, and to eventually enable is_utf8_char_helper() to be simplified.
author: Karl Williamson <khw@cpan.org> 2021-06-26 19:57:03 -0600
committer: Karl Williamson <khw@cpan.org> 2021-08-07 05:14:44 -0600
commit: 58b66e89ba80b4f91ff12da18da1301aa182a687 (patch)
tree: db91351c2e183774b4b816c78fd842d73872bc98
parent: 527347e0493f4ee24d63df996fd6616806422d17 (diff)
download: perl-58b66e89ba80b4f91ff12da18da1301aa182a687.tar.gz
4 files changed, 74 insertions, 0 deletions
diff --git a/embed.fnc b/embed.fnc
index 8dba57ce12..ac8f93ab8a 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -1116,6 +1116,9 @@ ApR	|I32	|is_lvalue_sub
 : Used in cop.h
 XopR	|I32	|was_lvalue_sub
 CpRTP	|STRLEN	|is_utf8_char_helper|NN const U8 * const s|NN const U8 * e|const U32 flags
+CpRTP	|Size_t	|is_utf8_FF_helper_|NN const U8 * const s0		    \
+				|NN const U8 * const e			    \
+				|const bool require_partial
 Cp	|UV	|to_uni_upper	|UV c|NN U8 *p|NN STRLEN *lenp
 Cp	|UV	|to_uni_title	|UV c|NN U8 *p|NN STRLEN *lenp
 p	|void	|init_uniprops
diff --git a/embed.h b/embed.h
index cece9a798c..4578848f7a 100644
--- a/embed.h
+++ b/embed.h
@@ -272,6 +272,7 @@
 #define is_lvalue_sub()		Perl_is_lvalue_sub(aTHX)
 #define is_safe_syscall(a,b,c,d)	Perl_is_safe_syscall(aTHX_ a,b,c,d)
 #define is_strict_utf8_string_loclen	Perl_is_strict_utf8_string_loclen
+#define is_utf8_FF_helper_	Perl_is_utf8_FF_helper_
 #ifndef NO_MATHOMS
 #define is_utf8_char		Perl_is_utf8_char
 #endif
diff --git a/proto.h b/proto.h
index 7083f13135..26884b042f 100644
--- a/proto.h
+++ b/proto.h
@@ -1708,6 +1708,12 @@ PERL_STATIC_INLINE bool	Perl_is_strict_utf8_string_loclen(const U8 *s, STRLEN le
 #define PERL_ARGS_ASSERT_IS_STRICT_UTF8_STRING_LOCLEN	\
 	assert(s)
 #endif
+PERL_CALLCONV Size_t	Perl_is_utf8_FF_helper_(const U8 * const s0, const U8 * const e, const bool require_partial)
+			__attribute__warn_unused_result__
+			__attribute__pure__;
+#define PERL_ARGS_ASSERT_IS_UTF8_FF_HELPER_	\
+	assert(s0); assert(e)
+
 #ifndef NO_MATHOMS
 PERL_CALLCONV STRLEN	Perl_is_utf8_char(const U8 *s)
 			__attribute__deprecated__;
diff --git a/utf8.c b/utf8.c
index 5877add9f2..6b157dfaaa 100644
--- a/utf8.c
+++ b/utf8.c
@@ -956,6 +956,70 @@ Perl_is_utf8_char_helper(const U8 * const s, const U8 * e, const U32 flags)
     return UTF8SKIP(s);
 }
 
+Size_t
+Perl_is_utf8_FF_helper_(const U8 * const s0, const U8 * const e,
+                        const bool require_partial)
+{
+    /* This is called to determine if the UTF-8 sequence starting at s0 and
+     * continuing for up to one full character of bytes, but looking no further
+     * than 'e - 1', is legal.  *s0 must be 0xFF (or whatever the native
+     * equivalent of FF in I8 on EBCDIC platforms is).  This marks it as being
+     * for the largest code points recognized by Perl, the ones that require
+     * the most UTF-8 bytes per character to represent (somewhat less than
+     * twice the size of the next longest kind).  This sequence will only ever
+     * be Perl extended UTF-8.
+     *
+     * The routine returns 0 if the sequence is not fully valid, syntactically
+     * or semantically.  That means it checks that everything following the
+     * start byte is a continuation byte, and that it doesn't overflow, nor is
+     * an overlong representation.
+     *
+     * If 'require_partial' is FALSE, the routine returns non-zero only if the
+     * input (as far as 'e-1') is a full character.  The return is the count of
+     * the bytes in the character.
+     *
+     * If 'require_partial' is TRUE, the routine returns non-zero only if the
+     * input as far as 'e-1' is a partial, not full character, with no
+     * malformations found before position 'e'.  The return is either just
+     * FALSE, or TRUE.  */
+
+    const U8 *s = s0 + 1;
+    const U8 *send = e;
+
+    PERL_ARGS_ASSERT_IS_UTF8_FF_HELPER_;
+
+    assert(s0 < e);
+    assert(*s0 == I8_TO_NATIVE_UTF8(0xFF));
+
+    send = s + MIN(UTF8_MAXBYTES - 1, e - s);
+    while (s < send) {
+        if (! UTF8_IS_CONTINUATION(*s)) {
+            return 0;
+        }
+
+        s++;
+    }
+
+    if (0 < does_utf8_overflow(s0, e,
+                               FALSE /* Don't consider_overlongs */
+    )) {
+        return 0;
+    }
+
+    if (0 < isFF_overlong(s0, e - s0)) {
+        return 0;
+    }
+
+    /* Here, the character is valid as far as it got.  Check if got a partial
+     * character */
+    if (s - s0 < UTF8_MAXBYTES) {
+        return (require_partial) ? 1 : 0;
+    }
+
+    /* Here, got a full character */
+    return (require_partial) ? 0 : UTF8_MAXBYTES;
+}
+
 char *
 Perl__byte_dump_string(pTHX_ const U8 * const start, const STRLEN len, const bool format)
 {
author	Karl Williamson <khw@cpan.org>	2021-06-26 19:57:03 -0600
committer	Karl Williamson <khw@cpan.org>	2021-08-07 05:14:44 -0600
commit	58b66e89ba80b4f91ff12da18da1301aa182a687 (patch)
tree	db91351c2e183774b4b816c78fd842d73872bc98
parent	527347e0493f4ee24d63df996fd6616806422d17 (diff)
download	perl-58b66e89ba80b4f91ff12da18da1301aa182a687.tar.gz