4 files changed, 74 insertions, 0 deletions
diff --git a/embed.fnc b/embed.fnc
index 8dba57ce12..ac8f93ab8a 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -1116,6 +1116,9 @@ ApR	|I32	|is_lvalue_sub
 : Used in cop.h
 XopR	|I32	|was_lvalue_sub
 CpRTP	|STRLEN	|is_utf8_char_helper|NN const U8 * const s|NN const U8 * e|const U32 flags
+CpRTP	|Size_t	|is_utf8_FF_helper_|NN const U8 * const s0		    \
+				|NN const U8 * const e			    \
+				|const bool require_partial
 Cp	|UV	|to_uni_upper	|UV c|NN U8 *p|NN STRLEN *lenp
 Cp	|UV	|to_uni_title	|UV c|NN U8 *p|NN STRLEN *lenp
 p	|void	|init_uniprops
diff --git a/embed.h b/embed.h
index cece9a798c..4578848f7a 100644
--- a/embed.h
+++ b/embed.h
@@ -272,6 +272,7 @@
 #define is_lvalue_sub()		Perl_is_lvalue_sub(aTHX)
 #define is_safe_syscall(a,b,c,d)	Perl_is_safe_syscall(aTHX_ a,b,c,d)
 #define is_strict_utf8_string_loclen	Perl_is_strict_utf8_string_loclen
+#define is_utf8_FF_helper_	Perl_is_utf8_FF_helper_
 #ifndef NO_MATHOMS
 #define is_utf8_char		Perl_is_utf8_char
 #endif
diff --git a/proto.h b/proto.h
index 7083f13135..26884b042f 100644
--- a/proto.h
+++ b/proto.h
@@ -1708,6 +1708,12 @@ PERL_STATIC_INLINE bool	Perl_is_strict_utf8_string_loclen(const U8 *s, STRLEN le
 #define PERL_ARGS_ASSERT_IS_STRICT_UTF8_STRING_LOCLEN	\
 	assert(s)
 #endif
+PERL_CALLCONV Size_t	Perl_is_utf8_FF_helper_(const U8 * const s0, const U8 * const e, const bool require_partial)
+			__attribute__warn_unused_result__
+			__attribute__pure__;
+#define PERL_ARGS_ASSERT_IS_UTF8_FF_HELPER_	\
+	assert(s0); assert(e)
+
 #ifndef NO_MATHOMS
 PERL_CALLCONV STRLEN	Perl_is_utf8_char(const U8 *s)
 			__attribute__deprecated__;
diff --git a/utf8.c b/utf8.c
index 5877add9f2..6b157dfaaa 100644
--- a/utf8.c
+++ b/utf8.c
@@ -956,6 +956,70 @@ Perl_is_utf8_char_helper(const U8 * const s, const U8 * e, const U32 flags)
     return UTF8SKIP(s);
 }
 
+Size_t
+Perl_is_utf8_FF_helper_(const U8 * const s0, const U8 * const e,
+                        const bool require_partial)
+{
+    /* This is called to determine if the UTF-8 sequence starting at s0 and
+     * continuing for up to one full character of bytes, but looking no further
+     * than 'e - 1', is legal.  *s0 must be 0xFF (or whatever the native
+     * equivalent of FF in I8 on EBCDIC platforms is).  This marks it as being
+     * for the largest code points recognized by Perl, the ones that require
+     * the most UTF-8 bytes per character to represent (somewhat less than
+     * twice the size of the next longest kind).  This sequence will only ever
+     * be Perl extended UTF-8.
+     *
+     * The routine returns 0 if the sequence is not fully valid, syntactically
+     * or semantically.  That means it checks that everything following the
+     * start byte is a continuation byte, and that it doesn't overflow, nor is
+     * an overlong representation.
+     *
+     * If 'require_partial' is FALSE, the routine returns non-zero only if the
+     * input (as far as 'e-1') is a full character.  The return is the count of
+     * the bytes in the character.
+     *
+     * If 'require_partial' is TRUE, the routine returns non-zero only if the
+     * input as far as 'e-1' is a partial, not full character, with no
+     * malformations found before position 'e'.  The return is either just
+     * FALSE, or TRUE.  */
+
+    const U8 *s = s0 + 1;
+    const U8 *send = e;
+
+    PERL_ARGS_ASSERT_IS_UTF8_FF_HELPER_;
+
+    assert(s0 < e);
+    assert(*s0 == I8_TO_NATIVE_UTF8(0xFF));
+
+    send = s + MIN(UTF8_MAXBYTES - 1, e - s);
+    while (s < send) {
+        if (! UTF8_IS_CONTINUATION(*s)) {
+            return 0;
+        }
+
+        s++;
+    }
+
+    if (0 < does_utf8_overflow(s0, e,
+                               FALSE /* Don't consider_overlongs */
+    )) {
+        return 0;
+    }
+
+    if (0 < isFF_overlong(s0, e - s0)) {
+        return 0;
+    }
+
+    /* Here, the character is valid as far as it got.  Check if got a partial
+     * character */
+    if (s - s0 < UTF8_MAXBYTES) {
+        return (require_partial) ? 1 : 0;
+    }
+
+    /* Here, got a full character */
+    return (require_partial) ? 0 : UTF8_MAXBYTES;
+}
+
 char *
 Perl__byte_dump_string(pTHX_ const U8 * const start, const STRLEN len, const bool format)
 {