4 files changed, 53 insertions, 34 deletions
diff --git a/ext/XS-APItest/t/utf8.t b/ext/XS-APItest/t/utf8.t
index 5f1c9c94c7..bc5a7ed0c1 100644
--- a/ext/XS-APItest/t/utf8.t
+++ b/ext/XS-APItest/t/utf8.t
@@ -166,7 +166,7 @@ my @tests = (
         # This code point is chosen so that it is representable in a UV on
         # 32-bit machines
         $UTF8_WARN_FE_FF, $UTF8_DISALLOW_FE_FF, 'utf8', 0x80000000, 7,
-        qr/Code point beginning with byte .* is not Unicode, and not portable/
+        qr/Code point 0x80000000 is not Unicode, and not portable/
     ],
     [ "overflow with FE/FF",
         # This tests the interaction of WARN_FE_FF/DISALLOW_FE_FF with
@@ -178,9 +178,12 @@ my @tests = (
         ($has_quad)
             ? "\xff\x80\x90\x90\x90\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf"
             : "\xfe\x86\x80\x80\x80\x80\x80",
-        $UTF8_WARN_FE_FF, $UTF8_DISALLOW_FE_FF, 'utf8', 0,
+
+        # We include both warning categories to make sure the FE_FF one has
+        # precedence
+        "$UTF8_WARN_FE_FF|$UTF8_WARN_SUPER", "$UTF8_DISALLOW_FE_FF", 'utf8', 0,
         ($has_quad) ? 13 : 7,
-        qr/Code point beginning with byte .* is not Unicode, and not portable/
+        qr/overflow at byte .*, after start byte 0xf/
     ],
 );
 
@@ -188,7 +191,7 @@ if ($has_quad) {    # All FF's will overflow on 32 bit
     push @tests,
         [ "begins with FF", "\xff\x80\x80\x80\x80\x80\x81\x80\x80\x80\x80\x80\x80",
             $UTF8_WARN_FE_FF, $UTF8_DISALLOW_FE_FF, 'utf8', $FF_ret, 13,
-            qr/Code point beginning with byte .* is not Unicode, and not portable/
+            qr/Code point 0x.* is not Unicode, and not portable/
         ];
 }
 
diff --git a/pod/perldelta.pod b/pod/perldelta.pod
index e36ae85fcc..86f057025c 100644
--- a/pod/perldelta.pod
+++ b/pod/perldelta.pod
@@ -376,9 +376,23 @@ The public API newATTRSUB was previously a macro to the private
 function Perl_newATTRSUB. Function Perl_newATTRSUB has been removed. newATTRSUB
 is now macro to a different internal function.
 
-=item XXX
-
-XXX
+=item Changes in warnings raised by C<utf8n_to_uvchr()>
+
+This bottom level function decodes the first character of a UTF-8 string
+into a code point.  It is accessible to C<XS> level code, but it's
+discouraged from using it directly.  There are higher level functions
+that call this that should be used instead, such as
+L<perlapi/utf8_to_uvchr_buf>.  For completeness though, this documents
+some changes to it.  Now, tests for malformations are done before any
+tests for other potential issues.  One of those issues involves code
+points so large that they have never appeared in any official standard
+(the current standard has scaled back the highest acceptable code point
+from earlier versions).  It is possible (though not done in CPAN) to
+warn and/or forbid these code points, while accepting smaller code
+points that are still above the legal Unicode maximum.  The warning
+message for this now includes the code point if representable on the
+machine.  Previously it always displayed raw bytes, which is what it
+still does for non-representable code points.
 
 =back
 
diff --git a/utf8.c b/utf8.c
index 41e2c4caa6..8dc69bb33d 100644
--- a/utf8.c
+++ b/utf8.c
@@ -778,32 +778,8 @@ Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
 	}
     }
 
-#ifndef EBCDIC	/* EBCDIC allows FE, FF, can't overflow */
-    if ((*s0 & 0xFE) == 0xFE	/* matches both FE, FF */
-	&& (flags & (UTF8_WARN_FE_FF|UTF8_DISALLOW_FE_FF)))
-    {
-	/* By adding UTF8_CHECK_ONLY to the test, we avoid unnecessary
-	 * generation of the sv, since no warnings are raised under CHECK */
-	if ((flags & (UTF8_WARN_FE_FF|UTF8_CHECK_ONLY)) == UTF8_WARN_FE_FF
-	    && ckWARN_d(WARN_UTF8))
-	{
-	    /* This message is deliberately not of the same syntax as the other
-	     * messages for malformations, for backwards compatibility in the
-	     * unlikely event that code is relying on its precise earlier text
-	     */
-	    sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s Code point beginning with byte 0x%02X is not Unicode, and not portable", malformed_text, *s0));
-	    pack_warn = packWARN(WARN_UTF8);
-	}
-	if (flags & UTF8_DISALLOW_FE_FF) {
-	    goto malformed;
-	}
-    }
+#ifndef EBCDIC	/* EBCDIC can't overflow */
     if (UNLIKELY(overflowed)) {
-
-	/* If the first byte is FF, it will overflow a 32-bit word.  If the
-	 * first byte is FE, it will overflow a signed 32-bit word.  The
-	 * above preserves backward compatibility, since its message was used
-	 * in earlier versions of this code in preference to overflow */
 	sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (overflow at byte 0x%02x, after start byte 0x%02x)", malformed_text, overflow_byte, *s0));
 	goto malformed;
     }
@@ -830,6 +806,9 @@ Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
 		     |UTF8_WARN_ILLEGAL_INTERCHANGE)))
     {
 	if (UNICODE_IS_SURROGATE(uv)) {
+
+            /* By adding UTF8_CHECK_ONLY to the test, we avoid unnecessary
+             * generation of the sv, since no warnings are raised under CHECK */
 	    if ((flags & (UTF8_WARN_SURROGATE|UTF8_CHECK_ONLY)) == UTF8_WARN_SURROGATE
 		&& ckWARN_d(WARN_SURROGATE))
 	    {
@@ -842,11 +821,32 @@ Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
 	}
 	else if ((uv > PERL_UNICODE_MAX)) {
 	    if ((flags & (UTF8_WARN_SUPER|UTF8_CHECK_ONLY)) == UTF8_WARN_SUPER
-		&& ckWARN_d(WARN_NON_UNICODE))
+                && ckWARN_d(WARN_NON_UNICODE))
 	    {
 		sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Code point 0x%04"UVXf" is not Unicode, may not be portable", uv));
 		pack_warn = packWARN(WARN_NON_UNICODE);
 	    }
+#ifndef EBCDIC	/* EBCDIC always allows FE, FF */
+
+            /* The first byte being 0xFE or 0xFF is a subset of the SUPER code
+             * points.  We test for these after the regular SUPER ones, and
+             * before possibly bailing out, so that the more dire warning
+             * overrides the regular one, if applicable */
+            if ((*s0 & 0xFE) == 0xFE	/* matches both FE, FF */
+                && (flags & (UTF8_WARN_FE_FF|UTF8_DISALLOW_FE_FF)))
+            {
+                if ((flags & (UTF8_WARN_FE_FF|UTF8_CHECK_ONLY))
+                                                            == UTF8_WARN_FE_FF
+                    && ckWARN_d(WARN_UTF8))
+                {
+                    sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Code point 0x%"UVXf" is not Unicode, and not portable", uv));
+                    pack_warn = packWARN(WARN_UTF8);
+                }
+                if (flags & UTF8_DISALLOW_FE_FF) {
+                    goto disallowed;
+                }
+            }
+#endif
 	    if (flags & UTF8_DISALLOW_SUPER) {
 		goto disallowed;
 	    }
diff --git a/utf8.h b/utf8.h
index 2d4877552b..f72a2433cc 100644
--- a/utf8.h
+++ b/utf8.h
@@ -447,7 +447,9 @@ Perl's extended UTF-8 means we can have start bytes up to FF.
 #define UTF8_WARN_SUPER		        0x0400	/* points above the legal max */
 
 /* Code points which never were part of the original UTF-8 standard, the first
- * byte of which is a FE or FF on ASCII platforms. */
+ * byte of which is a FE or FF on ASCII platforms. If the first byte is FF, it
+ * will overflow a 32-bit word.  If the first byte is FE, it will overflow a
+ * signed 32-bit word. */
 #define UTF8_DISALLOW_FE_FF		0x0800
 #define UTF8_WARN_FE_FF		        0x1000