7 files changed, 177 insertions, 127 deletions
diff --git a/embed.fnc b/embed.fnc
index 1e661e4f3f..20f2987bfe 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -744,7 +744,9 @@ ADMpR	|bool	|isALNUM_lazy	|NN const char* p
 #ifdef PERL_IN_UTF8_C
 snR	|U8	|to_lower_latin1|const U8 c|NULLOK U8 *p|NULLOK STRLEN *lenp  \
 		|const char dummy
+#  ifndef UV_IS_QUAD
 inR	|bool	|is_utf8_cp_above_31_bits|NN const U8 * const s|NN const U8 * const e
+#  endif
 #endif
 #if defined(PERL_IN_UTF8_C) || defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_REGEXEC_C)
 EXp	|UV        |_to_fold_latin1|const U8 c|NN U8 *p|NN STRLEN *lenp|const unsigned int flags
diff --git a/embed.h b/embed.h
index a74458d7fd..608d252e54 100644
--- a/embed.h
+++ b/embed.h
@@ -1472,6 +1472,11 @@
 #define mulexp10		S_mulexp10
 #    endif
 #  endif
+#  if !defined(UV_IS_QUAD)
+#    if defined(PERL_IN_UTF8_C)
+#define is_utf8_cp_above_31_bits	S_is_utf8_cp_above_31_bits
+#    endif
+#  endif
 #  if !defined(WIN32)
 #define do_exec3(a,b,c)		Perl_do_exec3(aTHX_ a,b,c)
 #  endif
@@ -1844,7 +1849,6 @@
 #define isFF_OVERLONG		S_isFF_OVERLONG
 #define is_utf8_common(a,b,c,d)	S_is_utf8_common(aTHX_ a,b,c,d)
 #define is_utf8_common_with_len(a,b,c,d,e)	S_is_utf8_common_with_len(aTHX_ a,b,c,d,e)
-#define is_utf8_cp_above_31_bits	S_is_utf8_cp_above_31_bits
 #define is_utf8_overlong_given_start_byte_ok	S_is_utf8_overlong_given_start_byte_ok
 #define swash_scan_list_line(a,b,c,d,e,f,g)	S_swash_scan_list_line(aTHX_ a,b,c,d,e,f,g)
 #define swatch_get(a,b,c)	S_swatch_get(aTHX_ a,b,c)
diff --git a/ext/XS-APItest/t/utf8_warn_base.pl b/ext/XS-APItest/t/utf8_warn_base.pl
index 94df88e813..6c88f5c308 100644
--- a/ext/XS-APItest/t/utf8_warn_base.pl
+++ b/ext/XS-APItest/t/utf8_warn_base.pl
@@ -28,6 +28,7 @@ local $SIG{__WARN__} = sub { my @copy = @_;
                              push @warnings_gotten, map { chomp; $_ } @copy;
                            };
 
+my $highest_non_extended_utf8_cp = (isASCII) ? 0x7FFFFFFF : 0x3FFFFFFF;
 my $native_lowest_continuation_chr = I8_to_native(chr $::lowest_continuation);
 
 sub requires_extended_utf8($) {
@@ -36,8 +37,7 @@ sub requires_extended_utf8($) {
     # into 31 bits, subject to the convention that a negative code point
     # stands for one that overflows the word size, so won't fit in 31 bits.
 
-    my $cp = shift;
-    return $cp > 0x7FFFFFFF;
+    return shift > $highest_non_extended_utf8_cp;
 }
 
 my @tests;
@@ -286,7 +286,6 @@ my @tests;
             : I8_to_native(
                 "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0"),
             0x80000000,
-            (isASCII) ? 1 : 8,
         ],
         [ "highest 32 bit code point",
             (isASCII)
@@ -294,7 +293,6 @@ my @tests;
             : I8_to_native(
                "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa3\xbf\xbf\xbf\xbf\xbf\xbf"),
             0xFFFFFFFF,
-            (isASCII) ? 1 : 8,
         ],
         [ "Lowest 33 bit code point",
             (isASCII)
@@ -340,7 +338,6 @@ my @tests;
                 [ "Lowest code point requiring 13 bytes to represent",
                     "\xff\x80\x80\x80\x80\x80\x81\x80\x80\x80\x80\x80\x80",
                     0x1000000000,
-                    1,
                 ],
                 [ "overflow that old algorithm failed to detect",
                     "\xff\x80\x90\x90\x90\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf",
@@ -355,37 +352,31 @@ my @tests;
                     I8_to_native(
                     "\xff\xa0\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
                     0x800000000,
-                    7,
                 ],
                 [ "requires at least 32 bits",
                     I8_to_native(
                     "\xff\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
                     0x10000000000,
-                    6,
                 ],
                 [ "requires at least 32 bits",
                     I8_to_native(
                     "\xff\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
                     0x200000000000,
-                    5,
                 ],
                 [ "requires at least 32 bits",
                     I8_to_native(
                     "\xff\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
                     0x4000000000000,
-                    4,
                 ],
                 [ "requires at least 32 bits",
                     I8_to_native(
                     "\xff\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
                     0x80000000000000,
-                    3,
                 ],
                 [ "requires at least 32 bits",
                     I8_to_native(
                     "\xff\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
                     0x1000000000000000,
-                    2,
                 ];
         }
     }
@@ -587,6 +578,11 @@ foreach my $test (@tests) {
                                 # contain a code point.  (This is a result of
                                 # some sort of malformation that means we
                                 # can't get an exact code poin
+    my $extended_cp_message_qr = qr/\QCode point 0x$uv_string is not Unicode,\E
+                        \Q requires a Perl extension, and so is not\E
+                        \Q portable\E/x;
+    my $extended_non_cp_trailing_text
+                        = "is a Perl extension, and so is not portable";
 
     # Is this test malformed from the beginning?  If so, we know to generally
     # expect that the tests will show it isn't valid.
@@ -619,9 +615,9 @@ foreach my $test (@tests) {
             $initially_malformed = 1;
         }
         elsif (requires_extended_utf8($allowed_uv)) {
-            $cp_message_qr = qr/\QCode point 0x$uv_string is not Unicode,\E
-                                \Q and not portable\E/x;
-            $non_cp_trailing_text = "is for a non-Unicode code point, and is not portable";
+            $cp_message_qr = $extended_cp_message_qr;
+            $non_cp_trailing_text = $extended_non_cp_trailing_text;
+            $needed_to_discern_len = 1 unless defined $needed_to_discern_len;
         }
         else {
             $cp_message_qr = qr/\QCode point 0x$uv_string is not Unicode,\E
@@ -870,6 +866,9 @@ foreach my $test (@tests) {
           # maximum length, so skip if we're already at that length.
           next if $overlong && $length >= $::max_bytes;
 
+          my $this_cp_message_qr = $cp_message_qr;
+          my $this_non_cp_trailing_text = $non_cp_trailing_text;
+
           foreach my $malformed_allow_type (0..2) {
             # 0 don't allow this malformation; ignored if no malformation
             # 1 allow, with REPLACEMENT CHARACTER returned
@@ -899,6 +898,8 @@ foreach my $test (@tests) {
             # combinations of on/off are tested for.  It's either all are
             # allowed, or none are.
             my $allow_flags = 0;
+            my $overlong_is_in_perl_extended_utf8 = 0;
+            my $dont_use_overlong_cp = 0;
 
             if ($overlong) {
                 my $new_expected_len;
@@ -929,8 +930,20 @@ foreach my $test (@tests) {
                 else {  # Must use extended UTF-8.  On ASCII platforms, we
                         # could express some overlongs here starting with
                         # \xFE, but there's no real reason to do so.
+                    $overlong_is_in_perl_extended_utf8 = 1;
                     $start_byte = I8_to_native("\xFF");
                     $new_expected_len = $::max_bytes;
+                    $this_cp_message_qr = $extended_cp_message_qr;
+
+                    # The warning that gets raised doesn't include the code
+                    # point in the message if the code point can be expressed
+                    # without using extended UTF-8, but the particular
+                    # overlong sequence used is in extended UTF-8.  To do
+                    # otherwise would be confusing to the user, as it would
+                    # claim the code point requires extended, when it doesn't.
+                    $dont_use_overlong_cp = 1
+                                    unless requires_extended_utf8($allowed_uv);
+                    $this_non_cp_trailing_text = $extended_non_cp_trailing_text;
                 }
 
                 # Splice in the revise continuation byte, preceded by the
@@ -1152,12 +1165,20 @@ foreach my $test (@tests) {
                     # on all the other flags.  That makes sure that they all
                     # are independent of this flag, and so we don't need to
                     # test them individually.
-                    my $this_warning_flags  = ($use_warn_flag)
-                                              ? $this_utf8n_flag_to_warn
-                                              : $utf8n_flag_to_warn_complement;
-                    my $this_disallow_flags = ($do_disallow)
-                                          ? $this_utf8n_flag_to_disallow
-                                          : $utf8n_flag_to_disallow_complement;
+                    my $this_warning_flags
+                            = ($use_warn_flag)
+                              ? $this_utf8n_flag_to_warn
+                              : ($overlong_is_in_perl_extended_utf8
+                                ? ($utf8n_flag_to_warn_complement
+                                    & ~$::UTF8_WARN_PERL_EXTENDED)
+                                :  $utf8n_flag_to_warn_complement);
+                    my $this_disallow_flags
+                            = ($do_disallow)
+                              ? $this_utf8n_flag_to_disallow
+                              : ($overlong_is_in_perl_extended_utf8
+                                 ? ($utf8n_flag_to_disallow_complement
+                                    & ~$::UTF8_DISALLOW_PERL_EXTENDED)
+                                 :  $utf8n_flag_to_disallow_complement);
                     my $expected_uv = $allowed_uv;
                     my $this_uv_string = $uv_string;
 
@@ -1216,19 +1237,21 @@ foreach my $test (@tests) {
 
                         # So far the array contains warnings generated by
                         # malformations.  Add the expected regular one.
-                        unshift @expected_warnings, $cp_message_qr;
+                        unshift @expected_warnings, $this_cp_message_qr;
 
                         # But it may need to be modified, because either of
                         # these malformations means we can't determine the
                         # expected code point.
-                        if ($short || $unexpected_noncont) {
+                        if (   $short || $unexpected_noncont
+                            || $dont_use_overlong_cp)
+                        {
                             my $first_byte = substr($this_bytes, 0, 1);
                             $expected_warnings[0] = display_bytes(
                                     substr($this_bytes, 0, $this_expected_len));
                             $expected_warnings[0]
                                 = qr/[Aa]\Qny UTF-8 sequence that starts with\E
                                      \Q $expected_warnings[0]\E
-                                     \Q $non_cp_trailing_text\E/x;
+                                     \Q $this_non_cp_trailing_text\E/x;
                         }
                     }
 
diff --git a/proto.h b/proto.h
index d9ef91cd57..a8f6de868c 100644
--- a/proto.h
+++ b/proto.h
@@ -4078,6 +4078,17 @@ PERL_CALLCONV int	Perl_my_sprintf(char *buffer, const char *pat, ...);
 STATIC NV	S_mulexp10(NV value, I32 exponent);
 #  endif
 #endif
+#if !defined(UV_IS_QUAD)
+#  if defined(PERL_IN_UTF8_C)
+#ifndef PERL_NO_INLINE_FUNCTIONS
+PERL_STATIC_INLINE bool	S_is_utf8_cp_above_31_bits(const U8 * const s, const U8 * const e)
+			__attribute__warn_unused_result__;
+#define PERL_ARGS_ASSERT_IS_UTF8_CP_ABOVE_31_BITS	\
+	assert(s); assert(e)
+#endif
+
+#  endif
+#endif
 #if !defined(WIN32)
 PERL_CALLCONV bool	Perl_do_exec3(pTHX_ const char *incmd, int fd, int do_report);
 #define PERL_ARGS_ASSERT_DO_EXEC3	\
@@ -5835,13 +5846,6 @@ PERL_STATIC_INLINE bool	S_is_utf8_common_with_len(pTHX_ const U8 *const p, const
 #endif
 
 #ifndef PERL_NO_INLINE_FUNCTIONS
-PERL_STATIC_INLINE bool	S_is_utf8_cp_above_31_bits(const U8 * const s, const U8 * const e)
-			__attribute__warn_unused_result__;
-#define PERL_ARGS_ASSERT_IS_UTF8_CP_ABOVE_31_BITS	\
-	assert(s); assert(e)
-#endif
-
-#ifndef PERL_NO_INLINE_FUNCTIONS
 PERL_STATIC_INLINE bool	S_is_utf8_overlong_given_start_byte_ok(const U8 * const s, const STRLEN len)
 			__attribute__warn_unused_result__;
 #define PERL_ARGS_ASSERT_IS_UTF8_OVERLONG_GIVEN_START_BYTE_OK	\
diff --git a/utf8.c b/utf8.c
index 88c2b32211..4be3bb734b 100644
--- a/utf8.c
+++ b/utf8.c
@@ -122,8 +122,9 @@ const char nonchar_cp_format[]   = "Unicode non-character U+%04" UVXf
                                    " is not recommended for open interchange";
 const char super_cp_format[]     = "Code point 0x%" UVXf " is not Unicode,"
                                    " may not be portable";
-const char above_31_bit_cp_format[] = "Code point 0x%" UVXf " is not"
-                                      " Unicode, and not portable";
+const char perl_extended_cp_format[] = "Code point 0x%" UVXf " is not"        \
+                                       " Unicode, requires a Perl extension," \
+                                       " and so is not portable";
 
 #define HANDLE_UNICODE_SURROGATE(uv, flags)                         \
     STMT_START {                                                    \
@@ -220,7 +221,7 @@ Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, const UV flags)
 
               /* Choose the more dire applicable warning */
               (UNICODE_IS_PERL_EXTENDED(uv))
-              ? above_31_bit_cp_format
+              ? perl_extended_cp_format
               : super_cp_format,
              uv);
         }
@@ -362,30 +363,27 @@ defined in
 L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
 See L<perlunicode/Noncharacter code points>.
 
-Code points above 0x7FFF_FFFF (2**31 - 1) were never specified in any standard,
-so using them is more problematic than other above-Unicode code points.  Perl
-invented an extension to UTF-8 to represent the ones above 2**36-1, so it is
-likely that non-Perl languages will not be able to read files that contain
-these that written by the perl interpreter; nor would Perl understand files
-written by something that uses a different extension.  For these reasons, there
-is a separate set of flags that can warn and/or disallow these extremely high
-code points, even if other above-Unicode ones are accepted.  These are the
-C<UNICODE_WARN_ABOVE_31_BIT> and C<UNICODE_DISALLOW_ABOVE_31_BIT> flags.  These
-are entirely independent from the deprecation warning for code points above
-C<IV_MAX>.  On 32-bit machines, it will eventually be forbidden to have any
-code point that needs more than 31 bits to represent.  When that happens,
-effectively the C<UNICODE_DISALLOW_ABOVE_31_BIT> flag will always be set on
-32-bit machines.  (Of course C<UNICODE_DISALLOW_SUPER> will treat all
-above-Unicode code points, including these, as malformations; and
-C<UNICODE_WARN_SUPER> warns on these.)
-
-On EBCDIC platforms starting in Perl v5.24, the Perl extension for representing
-extremely high code points kicks in at 0x3FFF_FFFF (2**30 -1), which is lower
-than on ASCII.  Prior to that, code points 2**31 and higher were simply
-unrepresentable, and a different, incompatible method was used to represent
-code points between 2**30 and 2**31 - 1.  The flags C<UNICODE_WARN_ABOVE_31_BIT>
-and C<UNICODE_DISALLOW_ABOVE_31_BIT> have the same function as on ASCII
-platforms, warning and disallowing 2**31 and higher.
+Extremely high code points were never specified in any standard, and require an
+extension to UTF-8 to express, which Perl does.  It is likely that programs
+written in something other than Perl would not be able to read files that
+contain these; nor would Perl understand files written by something that uses a
+different extension.  For these reasons, there is a separate set of flags that
+can warn and/or disallow these extremely high code points, even if other
+above-Unicode ones are accepted.  They are the C<UNICODE_WARN_PERL_EXTENDED>
+and C<UNICODE_DISALLOW_PERL_EXTENDED> flags.  For more information see
+L</C<UTF8_GOT_PERL_EXTENDED>>.  Of course C<UNICODE_DISALLOW_SUPER> will
+treat all above-Unicode code points, including these, as malformations.  (Note
+that the Unicode standard considers anything above 0x10FFFF to be illegal, but
+there are standards predating it that allow up to 0x7FFF_FFFF (2**31 -1))
+
+A somewhat misleadingly named synonym for C<UNICODE_WARN_PERL_EXTENDED> is
+retained for backward compatibility: C<UNICODE_WARN_ABOVE_31_BIT>.  Similarly,
+C<UNICODE_DISALLOW_ABOVE_31_BIT> is usable instead of the more accurately named
+C<UNICODE_DISALLOW_PERL_EXTENDED>.  The names are misleading because these
+flags can apply to code points that actually do fit in 31 bits.  This happens
+on EBCDIC platforms, and sometimes when the L<overlong
+malformation|/C<UTF8_GOT_LONG>> is also present.  The new names accurately
+describe the situation in all cases.
 
 =cut
 */
@@ -399,6 +397,8 @@ Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
     return uvchr_to_utf8_flags(d, uv, flags);
 }
 
+#ifndef UV_IS_QUAD
+
 PERL_STATIC_INLINE bool
 S_is_utf8_cp_above_31_bits(const U8 * const s, const U8 * const e)
 {
@@ -480,6 +480,8 @@ S_is_utf8_cp_above_31_bits(const U8 * const s, const U8 * const e)
 
 }
 
+#endif
+
 /* Anything larger than this will overflow the word if it were converted into a UV */
 #if defined(UV_IS_QUAD)
 #  ifdef EBCDIC     /* Actually is I8 */
@@ -730,10 +732,12 @@ Perl__is_utf8_char_helper(const U8 * const s, const U8 * e, const U32 flags)
 #  define IS_UTF8_2_BYTE_SURROGATE(s0, s1)       ((s0) == 0xF1              \
                                                        /* B6 and B7 */      \
                                               && ((s1) & 0xFE ) == 0xB6)
+#  define isUTF8_PERL_EXTENDED(s)   (*s == I8_TO_NATIVE_UTF8(0xFF))
 #else
 #  define FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER  0xF5
 #  define IS_UTF8_2_BYTE_SUPER(s0, s1)           ((s0) == 0xF4 && (s1) >= 0x90)
 #  define IS_UTF8_2_BYTE_SURROGATE(s0, s1)       ((s0) == 0xED && (s1) >= 0xA0)
+#  define isUTF8_PERL_EXTENDED(s)   (*s >= 0xFE)
 #endif
 
         if (  (flags & UTF8_DISALLOW_SUPER)
@@ -743,9 +747,9 @@ Perl__is_utf8_char_helper(const U8 * const s, const U8 * e, const U32 flags)
         }
 
         if (   (flags & UTF8_DISALLOW_PERL_EXTENDED)
-            &&  UNLIKELY(is_utf8_cp_above_31_bits(s, e)))
+            &&  UNLIKELY(isUTF8_PERL_EXTENDED(s)))
         {
-            return 0;           /* Above 31 bits */
+            return 0;
         }
 
         if (len > 1) {
@@ -954,36 +958,34 @@ a malformation and raise a warning, specify both the WARN and DISALLOW flags.
 (But note that warnings are not raised if lexically disabled nor if
 C<UTF8_CHECK_ONLY> is also specified.)
 
+Extremely high code points were never specified in any standard, and require an
+extension to UTF-8 to express, which Perl does.  It is likely that programs
+written in something other than Perl would not be able to read files that
+contain these; nor would Perl understand files written by something that uses a
+different extension.  For these reasons, there is a separate set of flags that
+can warn and/or disallow these extremely high code points, even if other
+above-Unicode ones are accepted.  They are the C<UTF8_WARN_PERL_EXTENDED> and
+C<UTF8_DISALLOW_PERL_EXTENDED> flags.  For more information see
+L</C<UTF8_GOT_PERL_EXTENDED>>.  Of course C<UTF8_DISALLOW_SUPER> will treat all
+above-Unicode code points, including these, as malformations.
+(Note that the Unicode standard considers anything above 0x10FFFF to be
+illegal, but there are standards predating it that allow up to 0x7FFF_FFFF
+(2**31 -1))
+
+A somewhat misleadingly named synonym for C<UTF8_WARN_PERL_EXTENDED> is
+retained for backward compatibility: C<UTF8_WARN_ABOVE_31_BIT>.  Similarly,
+C<UTF8_DISALLOW_ABOVE_31_BIT> is usable instead of the more accurately named
+C<UTF8_DISALLOW_PERL_EXTENDED>.  The names are misleading because these flags
+can apply to code points that actually do fit in 31 bits.  This happens on
+EBCDIC platforms, and sometimes when the L<overlong
+malformation|/C<UTF8_GOT_LONG>> is also present.  The new names accurately
+describe the situation in all cases.
+
 It is now deprecated to have very high code points (above C<IV_MAX> on the
 platforms) and this function will raise a deprecation warning for these (unless
 such warnings are turned off).  This value is typically 0x7FFF_FFFF (2**31 -1)
 in a 32-bit word.
 
-Code points above 0x7FFF_FFFF (2**31 - 1) were never specified in any standard,
-so using them is more problematic than other above-Unicode code points.  Perl
-invented an extension to UTF-8 to represent the ones above 2**36-1, so it is
-likely that non-Perl languages will not be able to read files that contain
-these; nor would Perl understand files
-written by something that uses a different extension.  For these reasons, there
-is a separate set of flags that can warn and/or disallow these extremely high
-code points, even if other above-Unicode ones are accepted.  These are the
-C<UTF8_WARN_ABOVE_31_BIT> and C<UTF8_DISALLOW_ABOVE_31_BIT> flags.  These
-are entirely independent from the deprecation warning for code points above
-C<IV_MAX>.  On 32-bit machines, it will eventually be forbidden to have any
-code point that needs more than 31 bits to represent.  When that happens,
-effectively the C<UTF8_DISALLOW_ABOVE_31_BIT> flag will always be set on
-32-bit machines.  (Of course C<UTF8_DISALLOW_SUPER> will treat all
-above-Unicode code points, including these, as malformations; and
-C<UTF8_WARN_SUPER> warns on these.)
-
-On EBCDIC platforms starting in Perl v5.24, the Perl extension for representing
-extremely high code points kicks in at 0x3FFF_FFFF (2**30 -1), which is lower
-than on ASCII.  Prior to that, code points 2**31 and higher were simply
-unrepresentable, and a different, incompatible method was used to represent
-code points between 2**30 and 2**31 - 1.  The flags C<UTF8_WARN_ABOVE_31_BIT>
-and C<UTF8_DISALLOW_ABOVE_31_BIT> have the same function as on ASCII
-platforms, warning and disallowing 2**31 and higher.
-
 All other code points corresponding to Unicode characters, including private
 use and those yet to be assigned, are never considered malformed and never
 warn.
@@ -1026,12 +1028,36 @@ exceptions are noted:
 
 =over 4
 
-=item C<UTF8_GOT_ABOVE_31_BIT>
+=item C<UTF8_GOT_PERL_EXTENDED>
 
-The code point represented by the input UTF-8 sequence occupies more than 31
-bits.
-This bit is set only if the input C<flags> parameter contains either the
-C<UTF8_DISALLOW_ABOVE_31_BIT> or the C<UTF8_WARN_ABOVE_31_BIT> flags.
+The input sequence is not standard UTF-8, but a Perl extension.  This bit is
+set only if the input C<flags> parameter contains either the
+C<UTF8_DISALLOW_PERL_EXTENDED> or the C<UTF8_WARN_PERL_EXTENDED> flags.
+
+Code points above 0x7FFF_FFFF (2**31 - 1) were never specified in any standard,
+and so some extension must be used to express them.  Perl uses a natural
+extension to UTF-8 to represent the ones up to 2**36-1, and invented a further
+extension to represent even higher ones, so that any code point that fits in a
+64-bit word can be represented.  Text using these extensions is not likely to
+be portable to non-Perl code.  We lump both of these extensions together and
+refer to them as Perl extended UTF-8.  There exist other extensions that people
+have invented, incompatible with Perl's.
+
+On EBCDIC platforms starting in Perl v5.24, the Perl extension for representing
+extremely high code points kicks in at 0x3FFF_FFFF (2**30 -1), which is lower
+than on ASCII.  Prior to that, code points 2**31 and higher were simply
+unrepresentable, and a different, incompatible method was used to represent
+code points between 2**30 and 2**31 - 1.
+
+On both platforms, ASCII and EBCDIC, C<UTF8_GOT_PERL_EXTENDED> is set if
+Perl extended UTF-8 is used.
+
+In earlier Perls, this bit was named C<UTF8_GOT_ABOVE_31_BIT>, which you still
+may use for backward compatibility.  That name is misleading, as this flag may
+be set when the code point actually does fit in 31 bits.  This happens on
+EBCDIC platforms, and sometimes when the L<overlong
+malformation|/C<UTF8_GOT_LONG>> is also present.  The new name accurately
+describes the situation in all cases.
 
 =item C<UTF8_GOT_CONTINUATION>
 
@@ -1119,8 +1145,6 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
      * too short one.  Otherwise the first two are set to 's0' and 'send', and
      * the third not used at all */
     U8 * adjusted_s0 = (U8 *) s0;
-    U8 * adjusted_send = NULL;  /* (Initialized to silence compilers' wrong
-                                   warning) */
     U8 temp_char_buf[UTF8_MAXBYTES + 1]; /* Used to avoid a Newx in this
                                             routine; see [perl #130921] */
     UV uv_so_far = 0;   /* (Initialized to silence compilers' wrong warning) */
@@ -1212,7 +1236,6 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
     else {
         send += expectlen;
     }
-    adjusted_send = send;
 
     /* Now, loop through the remaining bytes in the character's sequence,
      * accumulating each into the working value as we go. */
@@ -1297,7 +1320,7 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
             }
 
             adjusted_s0 = temp_char_buf;
-            adjusted_send = uvoffuni_to_utf8_flags(adjusted_s0, min_uv, 0);
+            (void) uvoffuni_to_utf8_flags(adjusted_s0, min_uv, 0);
         }
     }
 
@@ -1316,7 +1339,8 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
                            * and we deal with those in the overflow handling
                            * code */
                 && LIKELY(! (possible_problems & UTF8_GOT_OVERFLOW))
-                && isUTF8_POSSIBLY_PROBLEMATIC(*adjusted_s0)))
+                && (   isUTF8_POSSIBLY_PROBLEMATIC(*adjusted_s0)
+                    || UNLIKELY(isUTF8_PERL_EXTENDED(s0)))))
 	&& ((flags & ( UTF8_DISALLOW_NONCHAR
                       |UTF8_DISALLOW_SURROGATE
                       |UTF8_DISALLOW_SUPER
@@ -1396,13 +1420,6 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
      *                      end, based on how many bytes the start byte tells
      *                      us should be in it, but no further than s0 +
      *                      avail_len
-     * adjusted_s0          normally is the same as s0, but in case of an
-     *                      overlong for which the UTF-8 matters below, it is
-     *                      the first byte of the shortest form representation
-     *                      of the input.
-     * adjusted_send        normally is the same as 'send', but if adjusted_s0
-     *                      is set to something other than s0, this points one
-     *                      beyond its end
      */
 
     if (UNLIKELY(possible_problems)) {
@@ -1603,39 +1620,35 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
                     }
                 }
 
-                /* The maximum code point ever specified by a standard was
-                 * 2**31 - 1.  Anything larger than that is a Perl extension
-                 * that very well may not be understood by other applications
-                 * (including earlier perl versions on EBCDIC platforms).  We
-                 * test for these after the regular SUPER ones, and before
-                 * possibly bailing out, so that the slightly more dire warning
-                 * will override the regular one. */
-                if (   (flags & (UTF8_WARN_PERL_EXTENDED
-                                |UTF8_WARN_SUPER
-                                |UTF8_DISALLOW_PERL_EXTENDED))
-                    && (   (   UNLIKELY(orig_problems & UTF8_GOT_TOO_SHORT)
-                            && UNLIKELY(is_utf8_cp_above_31_bits(
-                                                adjusted_s0,
-                                                adjusted_send)))
-                        || (   LIKELY(! (orig_problems & UTF8_GOT_TOO_SHORT))
-                            && UNLIKELY(UNICODE_IS_PERL_EXTENDED(uv)))))
-                {
+                /* Test for Perl's extended UTF-8 after the regular SUPER ones,
+                 * and before possibly bailing out, so that the more dire
+                 * warning will override the regular one. */
+                if (UNLIKELY(isUTF8_PERL_EXTENDED(s0))) {
                     if (  ! (flags & UTF8_CHECK_ONLY)
                         &&  (flags & (UTF8_WARN_PERL_EXTENDED|UTF8_WARN_SUPER))
                         &&  ckWARN_d(WARN_NON_UNICODE))
                     {
                         pack_warn = packWARN(WARN_NON_UNICODE);
 
-                        if (orig_problems & UTF8_GOT_TOO_SHORT) {
+                        /* If it is an overlong that evaluates to a code point
+                         * that doesn't have to use the Perl extended UTF-8, it
+                         * still used it, and so we output a message that
+                         * doesn't refer to the code point.  The same is true
+                         * if there was a SHORT malformation where the code
+                         * point is not valid.  In that case, 'uv' will have
+                         * been set to the REPLACEMENT CHAR, and the message
+                         * below without the code point in it will be selected
+                         * */
+                        if (UNICODE_IS_PERL_EXTENDED(uv)) {
                             message = Perl_form(aTHX_
-                                        "Any UTF-8 sequence that starts with"
-                                        " \"%s\" is for a non-Unicode code"
-                                        " point, and is not portable",
-                                        _byte_dump_string(s0, curlen, 0));
+                                            perl_extended_cp_format, uv);
                         }
                         else {
                             message = Perl_form(aTHX_
-                                            above_31_bit_cp_format, uv);
+                                        "Any UTF-8 sequence that starts with"
+                                        " \"%s\" is a Perl extension, and"
+                                        " so is not portable",
+                                        _byte_dump_string(s0, curlen, 0));
                         }
                     }
 
diff --git a/utf8.h b/utf8.h
index c880375e1d..0f29817757 100644
--- a/utf8.h
+++ b/utf8.h
@@ -407,6 +407,8 @@ C<cp> is Unicode if above 255; otherwise is platform-native.
     ( LIKELY( ( ( ( ((const U8*)s)[1] & 0xC0 ) == 0x80 ) && ( ( ((const U8*)s)[2] & 0xC0 ) == 0x80 ) ) && ( ( ((const U8*)s)[3] & 0xC0 ) == 0x80 ) ) ? 4 : 0 )\
 : LIKELY( ( ( ( 0xF4 == ((const U8*)s)[0] ) && ( ( ((const U8*)s)[1] & 0xF0 ) == 0x80 ) ) && ( ( ((const U8*)s)[2] & 0xC0 ) == 0x80 ) ) && ( ( ((const U8*)s)[3] & 0xC0 ) == 0x80 ) ) ? 4 : 0 )
 
+#define UNICODE_IS_PERL_EXTENDED(uv)    UNLIKELY((UV) (uv) > 0x7FFFFFFF)
+
 #endif /* EBCDIC vs ASCII */
 
 /* 2**UTF_ACCUMULATION_SHIFT - 1 */
@@ -781,7 +783,8 @@ case any call to string overloading updates the internal UTF-8 encoding flag.
 #define UTF8_GOT_PERL_EXTENDED          UTF8_DISALLOW_PERL_EXTENDED
 #define UTF8_WARN_PERL_EXTENDED         0x8000
 
-/* For back compat, these old names are misleading for UTF_EBCDIC */
+/* For back compat, these old names are misleading for overlongs and
+ * UTF_EBCDIC. */
 #define UTF8_DISALLOW_ABOVE_31_BIT      UTF8_DISALLOW_PERL_EXTENDED
 #define UTF8_GOT_ABOVE_31_BIT           UTF8_GOT_PERL_EXTENDED
 #define UTF8_WARN_ABOVE_31_BIT          UTF8_WARN_PERL_EXTENDED
@@ -958,7 +961,6 @@ point's representation.
          && UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER(uv)))
 
 #define UNICODE_IS_SUPER(uv)    ((UV) (uv) > PERL_UNICODE_MAX)
-#define UNICODE_IS_PERL_EXTENDED(uv)    ((UV) (uv) > 0x7FFFFFFF)
 
 #define LATIN_SMALL_LETTER_SHARP_S      LATIN_SMALL_LETTER_SHARP_S_NATIVE
 #define LATIN_SMALL_LETTER_Y_WITH_DIAERESIS                                  \
diff --git a/utfebcdic.h b/utfebcdic.h
index 0f81d1ffee..c2f0788cc4 100644
--- a/utfebcdic.h
+++ b/utfebcdic.h
@@ -511,6 +511,8 @@ explicitly forbidden, and the shortest possible encoding should always be used
  * has this start byte (expressed in I8) as the maximum */
 #define _IS_UTF8_CHAR_HIGHEST_START_BYTE 0xF9
 
+#define UNICODE_IS_PERL_EXTENDED(uv)    UNLIKELY((UV) (uv) > 0x3FFFFFFF)
+
 /*
  * ex: set ts=8 sts=4 sw=4 et:
  */