summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--ext/XS-APItest/t/utf8.t11
-rw-r--r--pod/perldelta.pod20
-rw-r--r--utf8.c52
-rw-r--r--utf8.h4
4 files changed, 53 insertions, 34 deletions
diff --git a/ext/XS-APItest/t/utf8.t b/ext/XS-APItest/t/utf8.t
index 5f1c9c94c7..bc5a7ed0c1 100644
--- a/ext/XS-APItest/t/utf8.t
+++ b/ext/XS-APItest/t/utf8.t
@@ -166,7 +166,7 @@ my @tests = (
# This code point is chosen so that it is representable in a UV on
# 32-bit machines
$UTF8_WARN_FE_FF, $UTF8_DISALLOW_FE_FF, 'utf8', 0x80000000, 7,
- qr/Code point beginning with byte .* is not Unicode, and not portable/
+ qr/Code point 0x80000000 is not Unicode, and not portable/
],
[ "overflow with FE/FF",
# This tests the interaction of WARN_FE_FF/DISALLOW_FE_FF with
@@ -178,9 +178,12 @@ my @tests = (
($has_quad)
? "\xff\x80\x90\x90\x90\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf"
: "\xfe\x86\x80\x80\x80\x80\x80",
- $UTF8_WARN_FE_FF, $UTF8_DISALLOW_FE_FF, 'utf8', 0,
+
+ # We include both warning categories to make sure the FE_FF one has
+ # precedence
+ "$UTF8_WARN_FE_FF|$UTF8_WARN_SUPER", "$UTF8_DISALLOW_FE_FF", 'utf8', 0,
($has_quad) ? 13 : 7,
- qr/Code point beginning with byte .* is not Unicode, and not portable/
+ qr/overflow at byte .*, after start byte 0xf/
],
);
@@ -188,7 +191,7 @@ if ($has_quad) { # All FF's will overflow on 32 bit
push @tests,
[ "begins with FF", "\xff\x80\x80\x80\x80\x80\x81\x80\x80\x80\x80\x80\x80",
$UTF8_WARN_FE_FF, $UTF8_DISALLOW_FE_FF, 'utf8', $FF_ret, 13,
- qr/Code point beginning with byte .* is not Unicode, and not portable/
+ qr/Code point 0x.* is not Unicode, and not portable/
];
}
diff --git a/pod/perldelta.pod b/pod/perldelta.pod
index e36ae85fcc..86f057025c 100644
--- a/pod/perldelta.pod
+++ b/pod/perldelta.pod
@@ -376,9 +376,23 @@ The public API newATTRSUB was previously a macro to the private
function Perl_newATTRSUB. Function Perl_newATTRSUB has been removed. newATTRSUB
is now macro to a different internal function.
-=item XXX
-
-XXX
+=item Changes in warnings raised by C<utf8n_to_uvchr()>
+
+This bottom level function decodes the first character of a UTF-8 string
+into a code point. It is accessible to C<XS> level code, but it's
+discouraged from using it directly. There are higher level functions
+that call this that should be used instead, such as
+L<perlapi/utf8_to_uvchr_buf>. For completeness though, this documents
+some changes to it. Now, tests for malformations are done before any
+tests for other potential issues. One of those issues involves code
+points so large that they have never appeared in any official standard
+(the current standard has scaled back the highest acceptable code point
+from earlier versions). It is possible (though not done in CPAN) to
+warn and/or forbid these code points, while accepting smaller code
+points that are still above the legal Unicode maximum. The warning
+message for this now includes the code point if representable on the
+machine. Previously it always displayed raw bytes, which is what it
+still does for non-representable code points.
=back
diff --git a/utf8.c b/utf8.c
index 41e2c4caa6..8dc69bb33d 100644
--- a/utf8.c
+++ b/utf8.c
@@ -778,32 +778,8 @@ Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
}
}
-#ifndef EBCDIC /* EBCDIC allows FE, FF, can't overflow */
- if ((*s0 & 0xFE) == 0xFE /* matches both FE, FF */
- && (flags & (UTF8_WARN_FE_FF|UTF8_DISALLOW_FE_FF)))
- {
- /* By adding UTF8_CHECK_ONLY to the test, we avoid unnecessary
- * generation of the sv, since no warnings are raised under CHECK */
- if ((flags & (UTF8_WARN_FE_FF|UTF8_CHECK_ONLY)) == UTF8_WARN_FE_FF
- && ckWARN_d(WARN_UTF8))
- {
- /* This message is deliberately not of the same syntax as the other
- * messages for malformations, for backwards compatibility in the
- * unlikely event that code is relying on its precise earlier text
- */
- sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s Code point beginning with byte 0x%02X is not Unicode, and not portable", malformed_text, *s0));
- pack_warn = packWARN(WARN_UTF8);
- }
- if (flags & UTF8_DISALLOW_FE_FF) {
- goto malformed;
- }
- }
+#ifndef EBCDIC /* EBCDIC can't overflow */
if (UNLIKELY(overflowed)) {
-
- /* If the first byte is FF, it will overflow a 32-bit word. If the
- * first byte is FE, it will overflow a signed 32-bit word. The
- * above preserves backward compatibility, since its message was used
- * in earlier versions of this code in preference to overflow */
sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (overflow at byte 0x%02x, after start byte 0x%02x)", malformed_text, overflow_byte, *s0));
goto malformed;
}
@@ -830,6 +806,9 @@ Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
|UTF8_WARN_ILLEGAL_INTERCHANGE)))
{
if (UNICODE_IS_SURROGATE(uv)) {
+
+ /* By adding UTF8_CHECK_ONLY to the test, we avoid unnecessary
+ * generation of the sv, since no warnings are raised under CHECK */
if ((flags & (UTF8_WARN_SURROGATE|UTF8_CHECK_ONLY)) == UTF8_WARN_SURROGATE
&& ckWARN_d(WARN_SURROGATE))
{
@@ -842,11 +821,32 @@ Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
}
else if ((uv > PERL_UNICODE_MAX)) {
if ((flags & (UTF8_WARN_SUPER|UTF8_CHECK_ONLY)) == UTF8_WARN_SUPER
- && ckWARN_d(WARN_NON_UNICODE))
+ && ckWARN_d(WARN_NON_UNICODE))
{
sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Code point 0x%04"UVXf" is not Unicode, may not be portable", uv));
pack_warn = packWARN(WARN_NON_UNICODE);
}
+#ifndef EBCDIC /* EBCDIC always allows FE, FF */
+
+ /* The first byte being 0xFE or 0xFF is a subset of the SUPER code
+ * points. We test for these after the regular SUPER ones, and
+ * before possibly bailing out, so that the more dire warning
+ * overrides the regular one, if applicable */
+ if ((*s0 & 0xFE) == 0xFE /* matches both FE, FF */
+ && (flags & (UTF8_WARN_FE_FF|UTF8_DISALLOW_FE_FF)))
+ {
+ if ((flags & (UTF8_WARN_FE_FF|UTF8_CHECK_ONLY))
+ == UTF8_WARN_FE_FF
+ && ckWARN_d(WARN_UTF8))
+ {
+ sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Code point 0x%"UVXf" is not Unicode, and not portable", uv));
+ pack_warn = packWARN(WARN_UTF8);
+ }
+ if (flags & UTF8_DISALLOW_FE_FF) {
+ goto disallowed;
+ }
+ }
+#endif
if (flags & UTF8_DISALLOW_SUPER) {
goto disallowed;
}
diff --git a/utf8.h b/utf8.h
index 2d4877552b..f72a2433cc 100644
--- a/utf8.h
+++ b/utf8.h
@@ -447,7 +447,9 @@ Perl's extended UTF-8 means we can have start bytes up to FF.
#define UTF8_WARN_SUPER 0x0400 /* points above the legal max */
/* Code points which never were part of the original UTF-8 standard, the first
- * byte of which is a FE or FF on ASCII platforms. */
+ * byte of which is a FE or FF on ASCII platforms. If the first byte is FF, it
+ * will overflow a 32-bit word. If the first byte is FE, it will overflow a
+ * signed 32-bit word. */
#define UTF8_DISALLOW_FE_FF 0x0800
#define UTF8_WARN_FE_FF 0x1000