diff options
-rw-r--r-- | embed.fnc | 1 | ||||
-rw-r--r-- | embed.h | 1 | ||||
-rw-r--r-- | ext/XS-APItest/t/utf8.t | 24 | ||||
-rw-r--r-- | proto.h | 6 | ||||
-rw-r--r-- | utf8.c | 44 |
5 files changed, 57 insertions, 19 deletions
@@ -1683,6 +1683,7 @@ ApdD |UV |to_utf8_case |NN const U8 *p \ #if defined(PERL_IN_UTF8_C) inRP |bool |does_utf8_overflow|NN const U8 * const s|NN const U8 * e inRP |bool |is_utf8_overlong_given_start_byte_ok|NN const U8 * const s|const STRLEN len +inRP |bool |isFF_OVERLONG |NN const U8 * const s|const STRLEN len sMR |char * |unexpected_non_continuation_text \ |NN const U8 * const s \ |STRLEN print_len \ @@ -1832,6 +1832,7 @@ #define _to_utf8_case(a,b,c,d,e,f,g) S__to_utf8_case(aTHX_ a,b,c,d,e,f,g) #define check_locale_boundary_crossing(a,b,c,d) S_check_locale_boundary_crossing(aTHX_ a,b,c,d) #define does_utf8_overflow S_does_utf8_overflow +#define isFF_OVERLONG S_isFF_OVERLONG #define is_utf8_common(a,b,c,d) S_is_utf8_common(aTHX_ a,b,c,d) #define is_utf8_cp_above_31_bits S_is_utf8_cp_above_31_bits #define is_utf8_overlong_given_start_byte_ok S_is_utf8_overlong_given_start_byte_ok diff --git a/ext/XS-APItest/t/utf8.t b/ext/XS-APItest/t/utf8.t index 37a1e32185..121c6efac5 100644 --- a/ext/XS-APItest/t/utf8.t +++ b/ext/XS-APItest/t/utf8.t @@ -1731,6 +1731,15 @@ my @tests = ( 'utf8', 0x80000000, (isASCII) ? 7 : $max_bytes, nonportable_regex(0x80000000) ], + [ "highest 32 bit code point", + (isASCII) + ? "\xfe\x83\xbf\xbf\xbf\xbf\xbf" + : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa3\xbf\xbf\xbf\xbf\xbf\xbf"), + $UTF8_WARN_ABOVE_31_BIT, $UTF8_DISALLOW_ABOVE_31_BIT, + $UTF8_GOT_ABOVE_31_BIT, + 'utf8', 0xFFFFFFFF, (isASCII) ? 7 : $max_bytes, + nonportable_regex(0xffffffff) + ], [ "requires at least 32 bits, and use SUPER-type flags, instead of ABOVE_31_BIT", (isASCII) ? "\xfe\x82\x80\x80\x80\x80\x80" @@ -1764,7 +1773,20 @@ my @tests = ( ], ); -if ($is64bit) { +if (! $is64bit) { + if (isASCII) { + no warnings qw{portable overflow}; + push @tests, + [ "Lowest 33 bit code point: overflow", + "\xFE\x84\x80\x80\x80\x80\x80", + $UTF8_WARN_ABOVE_31_BIT, $UTF8_DISALLOW_ABOVE_31_BIT, + $UTF8_GOT_ABOVE_31_BIT, + 'utf8', 0x100000000, 7, + qr/and( is)? not portable/ + ]; + } +} +else { no warnings qw{portable overflow}; push @tests, [ "More than 32 bits", @@ -5605,6 +5605,12 @@ PERL_STATIC_INLINE bool S_does_utf8_overflow(const U8 * const s, const U8 * e) #define PERL_ARGS_ASSERT_DOES_UTF8_OVERFLOW \ assert(s); assert(e) +PERL_STATIC_INLINE bool S_isFF_OVERLONG(const U8 * const s, const STRLEN len) + __attribute__warn_unused_result__ + __attribute__pure__; +#define PERL_ARGS_ASSERT_ISFF_OVERLONG \ + assert(s) + PERL_STATIC_INLINE bool S_is_utf8_common(pTHX_ const U8 *const p, SV **swash, const char * const swashname, SV* const invlist) __attribute__warn_unused_result__; #define PERL_ARGS_ASSERT_IS_UTF8_COMMON \ @@ -442,7 +442,20 @@ S_does_utf8_overflow(const U8 * const s, const U8 * e) * that could result in a non-overflowing code point */ PERL_ARGS_ASSERT_DOES_UTF8_OVERFLOW; - assert(s + UTF8SKIP(s) >= e); + assert(s <= e && s + UTF8SKIP(s) >= e); + +#if ! defined(UV_IS_QUAD) && ! defined(EBCDIC) + + /* On 32 bit ASCII machines, many overlongs that start with FF don't + * overflow */ + + if (isFF_OVERLONG(s, e - s)) { + const U8 max_32_bit_overlong[] = "\xFF\x80\x80\x80\x80\x80\x80\x84"; + return memGE(s, max_32_bit_overlong, + MIN(e - s, sizeof(max_32_bit_overlong))); + } + +#endif for (x = s; x < e; x++, y++) { @@ -521,27 +534,22 @@ S_is_utf8_overlong_given_start_byte_ok(const U8 * const s, const STRLEN len) return TRUE; } -# if defined(UV_IS_QUAD) || defined(EBCDIC) + /* Check for the FF overlong */ + return isFF_OVERLONG(s, len); +} + +PERL_STATIC_INLINE bool +S_isFF_OVERLONG(const U8 * const s, const STRLEN len) +{ + PERL_ARGS_ASSERT_ISFF_OVERLONG; /* Check for the FF overlong. This happens only if all these bytes match; * what comes after them doesn't matter. See tables in utf8.h, - * utfebcdic.h. (Can't happen on ASCII 32-bit platforms, as overflows - * instead.) */ - - if ( len >= sizeof(FF_OVERLONG_PREFIX) - 1 - && UNLIKELY(memEQ(s, FF_OVERLONG_PREFIX, - sizeof(FF_OVERLONG_PREFIX) - 1))) - { - return TRUE; - } - -#else - - PERL_UNUSED_ARG(len); - -#endif + * utfebcdic.h. */ - return FALSE; + return len >= sizeof(FF_OVERLONG_PREFIX) - 1 + && UNLIKELY(memEQ(s, FF_OVERLONG_PREFIX, + sizeof(FF_OVERLONG_PREFIX) - 1)); } #undef F0_ABOVE_OVERLONG |