7 files changed, 136 insertions, 43 deletions
diff --git a/embed.fnc b/embed.fnc
index e2ac1272a0..08b248307b 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -678,7 +678,7 @@ ApR	|I32	|is_lvalue_sub
 : Used in cop.h
 XopR	|I32	|was_lvalue_sub
 #ifndef PERL_NO_INLINE_FUNCTIONS
-AiMRn	|STRLEN	|_is_utf8_char_slow|NN const U8 *s|NN const U8 *e
+ApMRnP	|STRLEN	|_is_utf8_char_slow|NN const U8 * const s|const STRLEN len
 #endif
 ADMpPR	|U32	|to_uni_upper_lc|U32 c
 ADMpPR	|U32	|to_uni_title_lc|U32 c
diff --git a/embed.h b/embed.h
index 9213a5df88..c9683ee32c 100644
--- a/embed.h
+++ b/embed.h
@@ -786,7 +786,7 @@
 #define my_popen(a,b)		Perl_my_popen(aTHX_ a,b)
 #endif
 #if !defined(PERL_NO_INLINE_FUNCTIONS)
-#define _is_utf8_char_slow	S__is_utf8_char_slow
+#define _is_utf8_char_slow	Perl__is_utf8_char_slow
 #define append_utf8_from_native_byte	S_append_utf8_from_native_byte
 #define av_top_index(a)		S_av_top_index(aTHX_ a)
 #define cx_popblock(a)		S_cx_popblock(aTHX_ a)
diff --git a/inline.h b/inline.h
index f709572ff6..0dcc733851 100644
--- a/inline.h
+++ b/inline.h
@@ -277,36 +277,6 @@ S_append_utf8_from_native_byte(const U8 byte, U8** dest)
 }
 
 /*
-
-A helper function for the macro isUTF8_CHAR(), which should be used instead of
-this function.  The macro will handle smaller code points directly saving time,
-using this function as a fall-back for higher code points.
-
-Tests if the first bytes of string C<s> form a valid UTF-8 character.  0 is
-returned if the bytes starting at C<s> up to but not including C<e> do not form a
-complete well-formed UTF-8 character; otherwise the number of bytes in the
-character is returned.
-
-Note that an INVARIANT (i.e. ASCII on non-EBCDIC) character is a valid UTF-8
-character.
-
-=cut */
-PERL_STATIC_INLINE STRLEN
-S__is_utf8_char_slow(const U8 *s, const U8 *e)
-{
-    dTHX;   /* The function called below requires thread context */
-
-    STRLEN actual_len;
-
-    PERL_ARGS_ASSERT__IS_UTF8_CHAR_SLOW;
-
-    assert(e >= s);
-    utf8n_to_uvchr(s, e - s, &actual_len, UTF8_CHECK_ONLY);
-
-    return (actual_len == (STRLEN) -1) ? 0 : actual_len;
-}
-
-/*
 =for apidoc valid_utf8_to_uvchr
 Like L</utf8_to_uvchr_buf>(), but should only be called when it is known that
 the next character in the input UTF-8 string C<s> is well-formed (I<e.g.>,
diff --git a/proto.h b/proto.h
index a0fd435e52..498589648e 100644
--- a/proto.h
+++ b/proto.h
@@ -3800,10 +3800,11 @@ STATIC SV *	S_incpush_if_exists(pTHX_ AV *const av, SV *dir, SV *const stem);
 #  endif
 #endif
 #if !defined(PERL_NO_INLINE_FUNCTIONS)
-PERL_STATIC_INLINE STRLEN	S__is_utf8_char_slow(const U8 *s, const U8 *e)
-			__attribute__warn_unused_result__;
+PERL_CALLCONV STRLEN	Perl__is_utf8_char_slow(const U8 * const s, const STRLEN len)
+			__attribute__warn_unused_result__
+			__attribute__pure__;
 #define PERL_ARGS_ASSERT__IS_UTF8_CHAR_SLOW	\
-	assert(s); assert(e)
+	assert(s)
 
 PERL_STATIC_INLINE void	S_append_utf8_from_native_byte(const U8 byte, U8** dest);
 #define PERL_ARGS_ASSERT_APPEND_UTF8_FROM_NATIVE_BYTE	\
diff --git a/t/lib/warnings/utf8 b/t/lib/warnings/utf8
index 4263c04958..947dea467f 100644
--- a/t/lib/warnings/utf8
+++ b/t/lib/warnings/utf8
@@ -756,7 +756,6 @@ Operation "uc" returns its argument for non-Unicode code point 0x7F+ at - line \
 Use of code point 0x80+ is deprecated; the permissible max is 0x7F+ at - line \d+.
 Operation "uc" returns its argument for non-Unicode code point 0x80+ at - line \d+.
 Code point 0x7F+ is not Unicode, may not be portable in print at - line \d+.
-Use of code point 0x80+ is deprecated; the permissible max is 0x7F+ in print at - line \d+.
 ########
 # NAME  [perl #127262]
 BEGIN{
diff --git a/utf8.c b/utf8.c
index bd9b0c3b1c..f24402de4e 100644
--- a/utf8.c
+++ b/utf8.c
@@ -337,6 +337,118 @@ Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
 
 /*
 
+A helper function for the macro isUTF8_CHAR(), which should be used instead of
+this function.  The macro will handle smaller code points directly saving time,
+using this function as a fall-back for higher code points.  This function
+assumes that it is not called with an invariant character, and that
+'s + len - 1' is within bounds of the string 's'.
+
+Tests if the string C<s> of at least length 'len' is a valid variant UTF-8
+character.  0 is returned if not, otherwise, 'len' is returned.
+
+*/
+
+STRLEN
+Perl__is_utf8_char_slow(const U8 * const s, const STRLEN len)
+{
+    const U8 *e;
+    const U8 *x, *y;
+
+    PERL_ARGS_ASSERT__IS_UTF8_CHAR_SLOW;
+
+    if (UNLIKELY(! UTF8_IS_START(*s))) {
+        return 0;
+    }
+
+    e = s + len;
+
+    for (x = s + 1; x < e; x++) {
+        if (UNLIKELY(! UTF8_IS_CONTINUATION(*x))) {
+            return 0;
+        }
+    }
+
+#ifndef EBCDIC
+
+    /* Here is syntactically valid.  Make sure this isn't the start of an
+     * overlong.  These values were found by manually inspecting the UTF-8
+     * patterns.  See the tables in utf8.h and utfebcdic.h */
+
+    /* This is not needed on modern perls where C0 and C1 are not considered
+     * start bytes. */
+#if 0
+    if (UNLIKELY(*s < 0xC2)) {
+        return 0;
+    }
+#endif
+
+    if (len > 1) {
+        if (   (*s == 0xE0 && UNLIKELY(s[1] < 0xA0))
+            || (*s == 0xF0 && UNLIKELY(s[1] < 0x90))
+            || (*s == 0xF8 && UNLIKELY(s[1] < 0x88))
+            || (*s == 0xFC && UNLIKELY(s[1] < 0x84))
+            || (*s == 0xFE && UNLIKELY(s[1] < 0x82)))
+        {
+            return 0;
+        }
+        if ((len > 6 && UNLIKELY(*s == 0xFF) && UNLIKELY(s[6] < 0x81))) {
+            return 0;
+        }
+    }
+
+#else   /* For EBCDIC, we use I8, which is the same on all code pages */
+    {
+        const U8 s0 = NATIVE_UTF8_TO_I8(*s);
+
+        /* On modern perls C0-C4 aren't considered start bytes */
+        if ( /* s0 < 0xC5 || */ s0 == 0xE0) {
+            return 0;
+        }
+
+        if (len >= 1) {
+            const U8 s1 = NATIVE_UTF8_TO_I8(s[1]);
+
+            if (   (s0 == 0xF0 && UNLIKELY(s1 < 0xB0))
+                || (s0 == 0xF8 && UNLIKELY(s1 < 0xA8))
+                || (s0 == 0xFC && UNLIKELY(s1 < 0xA4))
+                || (s0 == 0xFE && UNLIKELY(s1 < 0x82)))
+            {
+                return 0;
+            }
+            if ((len > 7 && UNLIKELY(s0 == 0xFF) && UNLIKELY(s[7] < 0xA1))) {
+                return 0;
+            }
+        }
+    }
+
+#endif
+
+    /* Now see if this would overflow a UV on this platform.  See if the UTF8
+     * for this code point is larger than that for the highest representable
+     * code point */
+    y = (const U8 *) HIGHEST_REPRESENTABLE_UTF8;
+
+    for (x = s; x < e; x++, y++) {
+
+        /* If the same at this byte, go on to the next */
+        if (UNLIKELY(NATIVE_UTF8_TO_I8(*x) == *y)) {
+            continue;
+        }
+
+        /* If this is larger, it overflows */
+        if (UNLIKELY(NATIVE_UTF8_TO_I8(*x) > *y)) {
+            return 0;
+        }
+
+        /* But if smaller, it won't */
+        break;
+    }
+
+    return len;
+}
+
+/*
+
 =for apidoc utf8n_to_uvchr
 
 THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
diff --git a/utf8.h b/utf8.h
index 29b052cd64..b940eaf3ee 100644
--- a/utf8.h
+++ b/utf8.h
@@ -826,13 +826,24 @@ case any call to string overloading updates the internal UTF-8 encoding flag.
 
 =for apidoc Am|STRLEN|isUTF8_CHAR|const U8 *s|const U8 *e
 
-Returns the number of bytes beginning at C<s> which form a legal UTF-8 (or
-UTF-EBCDIC) encoded character, looking no further than S<C<e - s>> bytes into
-C<s>.  Returns 0 if the sequence starting at C<s> through S<C<e - 1>> is not
-well-formed UTF-8.
+Evaluates to non-zero if the first few bytes of the string starting at C<s> and
+looking no further than S<C<e - 1>> are well-formed UTF-8 that represents some
+code point; otherwise it evaluates to 0.  If non-zero, the value gives how many
+many bytes starting at C<s> comprise the code point's representation.
 
-Note that an INVARIANT character (i.e. ASCII on non-EBCDIC
-machines) is a valid UTF-8 character.
+The code point can be any that will fit in a UV on this machine, using Perl's
+extension to official UTF-8 to represent those higher than the Unicode maximum
+of 0x10FFFF.  That means that this macro is used to efficiently decide if the
+next few bytes in C<s> is legal UTF-8 for a single character.  Use
+L</is_utf8_string>(), L</is_utf8_string_loclen>(), and
+L</is_utf8_string_loc>() to check entire strings.
+
+Note that it is deprecated to use code points higher than what will fit in an
+IV.  This macro does not raise any warnings for such code points, treating them
+as valid.
+
+Note also that a UTF-8 INVARIANT character (i.e. ASCII on non-EBCDIC machines)
+is a valid UTF-8 character.
 
 =cut
 */
@@ -845,7 +856,7 @@ machines) is a valid UTF-8 character.
                                  ? 0                                        \
                                  : LIKELY(IS_UTF8_CHAR_FAST(UTF8SKIP(s)))   \
                                    ? is_UTF8_CHAR_utf8_no_length_checks(s)  \
-                                   : _is_utf8_char_slow(s, e))
+                                   : _is_utf8_char_slow(s, UTF8SKIP(s)))
 
 #define is_utf8_char_buf(buf, buf_end) isUTF8_CHAR(buf, buf_end)