diff options
-rw-r--r-- | ext/XS-APItest/t/utf8.t | 11 | ||||
-rw-r--r-- | pod/perldelta.pod | 20 | ||||
-rw-r--r-- | utf8.c | 52 | ||||
-rw-r--r-- | utf8.h | 4 |
4 files changed, 53 insertions, 34 deletions
diff --git a/ext/XS-APItest/t/utf8.t b/ext/XS-APItest/t/utf8.t index 5f1c9c94c7..bc5a7ed0c1 100644 --- a/ext/XS-APItest/t/utf8.t +++ b/ext/XS-APItest/t/utf8.t @@ -166,7 +166,7 @@ my @tests = ( # This code point is chosen so that it is representable in a UV on # 32-bit machines $UTF8_WARN_FE_FF, $UTF8_DISALLOW_FE_FF, 'utf8', 0x80000000, 7, - qr/Code point beginning with byte .* is not Unicode, and not portable/ + qr/Code point 0x80000000 is not Unicode, and not portable/ ], [ "overflow with FE/FF", # This tests the interaction of WARN_FE_FF/DISALLOW_FE_FF with @@ -178,9 +178,12 @@ my @tests = ( ($has_quad) ? "\xff\x80\x90\x90\x90\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf" : "\xfe\x86\x80\x80\x80\x80\x80", - $UTF8_WARN_FE_FF, $UTF8_DISALLOW_FE_FF, 'utf8', 0, + + # We include both warning categories to make sure the FE_FF one has + # precedence + "$UTF8_WARN_FE_FF|$UTF8_WARN_SUPER", "$UTF8_DISALLOW_FE_FF", 'utf8', 0, ($has_quad) ? 13 : 7, - qr/Code point beginning with byte .* is not Unicode, and not portable/ + qr/overflow at byte .*, after start byte 0xf/ ], ); @@ -188,7 +191,7 @@ if ($has_quad) { # All FF's will overflow on 32 bit push @tests, [ "begins with FF", "\xff\x80\x80\x80\x80\x80\x81\x80\x80\x80\x80\x80\x80", $UTF8_WARN_FE_FF, $UTF8_DISALLOW_FE_FF, 'utf8', $FF_ret, 13, - qr/Code point beginning with byte .* is not Unicode, and not portable/ + qr/Code point 0x.* is not Unicode, and not portable/ ]; } diff --git a/pod/perldelta.pod b/pod/perldelta.pod index e36ae85fcc..86f057025c 100644 --- a/pod/perldelta.pod +++ b/pod/perldelta.pod @@ -376,9 +376,23 @@ The public API newATTRSUB was previously a macro to the private function Perl_newATTRSUB. Function Perl_newATTRSUB has been removed. newATTRSUB is now macro to a different internal function. -=item XXX - -XXX +=item Changes in warnings raised by C<utf8n_to_uvchr()> + +This bottom level function decodes the first character of a UTF-8 string +into a code point. It is accessible to C<XS> level code, but it's +discouraged from using it directly. There are higher level functions +that call this that should be used instead, such as +L<perlapi/utf8_to_uvchr_buf>. For completeness though, this documents +some changes to it. Now, tests for malformations are done before any +tests for other potential issues. One of those issues involves code +points so large that they have never appeared in any official standard +(the current standard has scaled back the highest acceptable code point +from earlier versions). It is possible (though not done in CPAN) to +warn and/or forbid these code points, while accepting smaller code +points that are still above the legal Unicode maximum. The warning +message for this now includes the code point if representable on the +machine. Previously it always displayed raw bytes, which is what it +still does for non-representable code points. =back @@ -778,32 +778,8 @@ Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags) } } -#ifndef EBCDIC /* EBCDIC allows FE, FF, can't overflow */ - if ((*s0 & 0xFE) == 0xFE /* matches both FE, FF */ - && (flags & (UTF8_WARN_FE_FF|UTF8_DISALLOW_FE_FF))) - { - /* By adding UTF8_CHECK_ONLY to the test, we avoid unnecessary - * generation of the sv, since no warnings are raised under CHECK */ - if ((flags & (UTF8_WARN_FE_FF|UTF8_CHECK_ONLY)) == UTF8_WARN_FE_FF - && ckWARN_d(WARN_UTF8)) - { - /* This message is deliberately not of the same syntax as the other - * messages for malformations, for backwards compatibility in the - * unlikely event that code is relying on its precise earlier text - */ - sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s Code point beginning with byte 0x%02X is not Unicode, and not portable", malformed_text, *s0)); - pack_warn = packWARN(WARN_UTF8); - } - if (flags & UTF8_DISALLOW_FE_FF) { - goto malformed; - } - } +#ifndef EBCDIC /* EBCDIC can't overflow */ if (UNLIKELY(overflowed)) { - - /* If the first byte is FF, it will overflow a 32-bit word. If the - * first byte is FE, it will overflow a signed 32-bit word. The - * above preserves backward compatibility, since its message was used - * in earlier versions of this code in preference to overflow */ sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (overflow at byte 0x%02x, after start byte 0x%02x)", malformed_text, overflow_byte, *s0)); goto malformed; } @@ -830,6 +806,9 @@ Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags) |UTF8_WARN_ILLEGAL_INTERCHANGE))) { if (UNICODE_IS_SURROGATE(uv)) { + + /* By adding UTF8_CHECK_ONLY to the test, we avoid unnecessary + * generation of the sv, since no warnings are raised under CHECK */ if ((flags & (UTF8_WARN_SURROGATE|UTF8_CHECK_ONLY)) == UTF8_WARN_SURROGATE && ckWARN_d(WARN_SURROGATE)) { @@ -842,11 +821,32 @@ Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags) } else if ((uv > PERL_UNICODE_MAX)) { if ((flags & (UTF8_WARN_SUPER|UTF8_CHECK_ONLY)) == UTF8_WARN_SUPER - && ckWARN_d(WARN_NON_UNICODE)) + && ckWARN_d(WARN_NON_UNICODE)) { sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Code point 0x%04"UVXf" is not Unicode, may not be portable", uv)); pack_warn = packWARN(WARN_NON_UNICODE); } +#ifndef EBCDIC /* EBCDIC always allows FE, FF */ + + /* The first byte being 0xFE or 0xFF is a subset of the SUPER code + * points. We test for these after the regular SUPER ones, and + * before possibly bailing out, so that the more dire warning + * overrides the regular one, if applicable */ + if ((*s0 & 0xFE) == 0xFE /* matches both FE, FF */ + && (flags & (UTF8_WARN_FE_FF|UTF8_DISALLOW_FE_FF))) + { + if ((flags & (UTF8_WARN_FE_FF|UTF8_CHECK_ONLY)) + == UTF8_WARN_FE_FF + && ckWARN_d(WARN_UTF8)) + { + sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Code point 0x%"UVXf" is not Unicode, and not portable", uv)); + pack_warn = packWARN(WARN_UTF8); + } + if (flags & UTF8_DISALLOW_FE_FF) { + goto disallowed; + } + } +#endif if (flags & UTF8_DISALLOW_SUPER) { goto disallowed; } @@ -447,7 +447,9 @@ Perl's extended UTF-8 means we can have start bytes up to FF. #define UTF8_WARN_SUPER 0x0400 /* points above the legal max */ /* Code points which never were part of the original UTF-8 standard, the first - * byte of which is a FE or FF on ASCII platforms. */ + * byte of which is a FE or FF on ASCII platforms. If the first byte is FF, it + * will overflow a 32-bit word. If the first byte is FE, it will overflow a + * signed 32-bit word. */ #define UTF8_DISALLOW_FE_FF 0x0800 #define UTF8_WARN_FE_FF 0x1000 |