diff options
-rw-r--r-- | embed.fnc | 2 | ||||
-rw-r--r-- | embed.h | 6 | ||||
-rw-r--r-- | ext/XS-APItest/t/utf8_warn_base.pl | 69 | ||||
-rw-r--r-- | proto.h | 18 | ||||
-rw-r--r-- | utf8.c | 201 | ||||
-rw-r--r-- | utf8.h | 6 | ||||
-rw-r--r-- | utfebcdic.h | 2 |
7 files changed, 177 insertions, 127 deletions
@@ -744,7 +744,9 @@ ADMpR |bool |isALNUM_lazy |NN const char* p #ifdef PERL_IN_UTF8_C snR |U8 |to_lower_latin1|const U8 c|NULLOK U8 *p|NULLOK STRLEN *lenp \ |const char dummy +# ifndef UV_IS_QUAD inR |bool |is_utf8_cp_above_31_bits|NN const U8 * const s|NN const U8 * const e +# endif #endif #if defined(PERL_IN_UTF8_C) || defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_REGEXEC_C) EXp |UV |_to_fold_latin1|const U8 c|NN U8 *p|NN STRLEN *lenp|const unsigned int flags @@ -1472,6 +1472,11 @@ #define mulexp10 S_mulexp10 # endif # endif +# if !defined(UV_IS_QUAD) +# if defined(PERL_IN_UTF8_C) +#define is_utf8_cp_above_31_bits S_is_utf8_cp_above_31_bits +# endif +# endif # if !defined(WIN32) #define do_exec3(a,b,c) Perl_do_exec3(aTHX_ a,b,c) # endif @@ -1844,7 +1849,6 @@ #define isFF_OVERLONG S_isFF_OVERLONG #define is_utf8_common(a,b,c,d) S_is_utf8_common(aTHX_ a,b,c,d) #define is_utf8_common_with_len(a,b,c,d,e) S_is_utf8_common_with_len(aTHX_ a,b,c,d,e) -#define is_utf8_cp_above_31_bits S_is_utf8_cp_above_31_bits #define is_utf8_overlong_given_start_byte_ok S_is_utf8_overlong_given_start_byte_ok #define swash_scan_list_line(a,b,c,d,e,f,g) S_swash_scan_list_line(aTHX_ a,b,c,d,e,f,g) #define swatch_get(a,b,c) S_swatch_get(aTHX_ a,b,c) diff --git a/ext/XS-APItest/t/utf8_warn_base.pl b/ext/XS-APItest/t/utf8_warn_base.pl index 94df88e813..6c88f5c308 100644 --- a/ext/XS-APItest/t/utf8_warn_base.pl +++ b/ext/XS-APItest/t/utf8_warn_base.pl @@ -28,6 +28,7 @@ local $SIG{__WARN__} = sub { my @copy = @_; push @warnings_gotten, map { chomp; $_ } @copy; }; +my $highest_non_extended_utf8_cp = (isASCII) ? 0x7FFFFFFF : 0x3FFFFFFF; my $native_lowest_continuation_chr = I8_to_native(chr $::lowest_continuation); sub requires_extended_utf8($) { @@ -36,8 +37,7 @@ sub requires_extended_utf8($) { # into 31 bits, subject to the convention that a negative code point # stands for one that overflows the word size, so won't fit in 31 bits. - my $cp = shift; - return $cp > 0x7FFFFFFF; + return shift > $highest_non_extended_utf8_cp; } my @tests; @@ -286,7 +286,6 @@ my @tests; : I8_to_native( "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0"), 0x80000000, - (isASCII) ? 1 : 8, ], [ "highest 32 bit code point", (isASCII) @@ -294,7 +293,6 @@ my @tests; : I8_to_native( "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa3\xbf\xbf\xbf\xbf\xbf\xbf"), 0xFFFFFFFF, - (isASCII) ? 1 : 8, ], [ "Lowest 33 bit code point", (isASCII) @@ -340,7 +338,6 @@ my @tests; [ "Lowest code point requiring 13 bytes to represent", "\xff\x80\x80\x80\x80\x80\x81\x80\x80\x80\x80\x80\x80", 0x1000000000, - 1, ], [ "overflow that old algorithm failed to detect", "\xff\x80\x90\x90\x90\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf", @@ -355,37 +352,31 @@ my @tests; I8_to_native( "\xff\xa0\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0"), 0x800000000, - 7, ], [ "requires at least 32 bits", I8_to_native( "\xff\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"), 0x10000000000, - 6, ], [ "requires at least 32 bits", I8_to_native( "\xff\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"), 0x200000000000, - 5, ], [ "requires at least 32 bits", I8_to_native( "\xff\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"), 0x4000000000000, - 4, ], [ "requires at least 32 bits", I8_to_native( "\xff\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"), 0x80000000000000, - 3, ], [ "requires at least 32 bits", I8_to_native( "\xff\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"), 0x1000000000000000, - 2, ]; } } @@ -587,6 +578,11 @@ foreach my $test (@tests) { # contain a code point. (This is a result of # some sort of malformation that means we # can't get an exact code poin + my $extended_cp_message_qr = qr/\QCode point 0x$uv_string is not Unicode,\E + \Q requires a Perl extension, and so is not\E + \Q portable\E/x; + my $extended_non_cp_trailing_text + = "is a Perl extension, and so is not portable"; # Is this test malformed from the beginning? If so, we know to generally # expect that the tests will show it isn't valid. @@ -619,9 +615,9 @@ foreach my $test (@tests) { $initially_malformed = 1; } elsif (requires_extended_utf8($allowed_uv)) { - $cp_message_qr = qr/\QCode point 0x$uv_string is not Unicode,\E - \Q and not portable\E/x; - $non_cp_trailing_text = "is for a non-Unicode code point, and is not portable"; + $cp_message_qr = $extended_cp_message_qr; + $non_cp_trailing_text = $extended_non_cp_trailing_text; + $needed_to_discern_len = 1 unless defined $needed_to_discern_len; } else { $cp_message_qr = qr/\QCode point 0x$uv_string is not Unicode,\E @@ -870,6 +866,9 @@ foreach my $test (@tests) { # maximum length, so skip if we're already at that length. next if $overlong && $length >= $::max_bytes; + my $this_cp_message_qr = $cp_message_qr; + my $this_non_cp_trailing_text = $non_cp_trailing_text; + foreach my $malformed_allow_type (0..2) { # 0 don't allow this malformation; ignored if no malformation # 1 allow, with REPLACEMENT CHARACTER returned @@ -899,6 +898,8 @@ foreach my $test (@tests) { # combinations of on/off are tested for. It's either all are # allowed, or none are. my $allow_flags = 0; + my $overlong_is_in_perl_extended_utf8 = 0; + my $dont_use_overlong_cp = 0; if ($overlong) { my $new_expected_len; @@ -929,8 +930,20 @@ foreach my $test (@tests) { else { # Must use extended UTF-8. On ASCII platforms, we # could express some overlongs here starting with # \xFE, but there's no real reason to do so. + $overlong_is_in_perl_extended_utf8 = 1; $start_byte = I8_to_native("\xFF"); $new_expected_len = $::max_bytes; + $this_cp_message_qr = $extended_cp_message_qr; + + # The warning that gets raised doesn't include the code + # point in the message if the code point can be expressed + # without using extended UTF-8, but the particular + # overlong sequence used is in extended UTF-8. To do + # otherwise would be confusing to the user, as it would + # claim the code point requires extended, when it doesn't. + $dont_use_overlong_cp = 1 + unless requires_extended_utf8($allowed_uv); + $this_non_cp_trailing_text = $extended_non_cp_trailing_text; } # Splice in the revise continuation byte, preceded by the @@ -1152,12 +1165,20 @@ foreach my $test (@tests) { # on all the other flags. That makes sure that they all # are independent of this flag, and so we don't need to # test them individually. - my $this_warning_flags = ($use_warn_flag) - ? $this_utf8n_flag_to_warn - : $utf8n_flag_to_warn_complement; - my $this_disallow_flags = ($do_disallow) - ? $this_utf8n_flag_to_disallow - : $utf8n_flag_to_disallow_complement; + my $this_warning_flags + = ($use_warn_flag) + ? $this_utf8n_flag_to_warn + : ($overlong_is_in_perl_extended_utf8 + ? ($utf8n_flag_to_warn_complement + & ~$::UTF8_WARN_PERL_EXTENDED) + : $utf8n_flag_to_warn_complement); + my $this_disallow_flags + = ($do_disallow) + ? $this_utf8n_flag_to_disallow + : ($overlong_is_in_perl_extended_utf8 + ? ($utf8n_flag_to_disallow_complement + & ~$::UTF8_DISALLOW_PERL_EXTENDED) + : $utf8n_flag_to_disallow_complement); my $expected_uv = $allowed_uv; my $this_uv_string = $uv_string; @@ -1216,19 +1237,21 @@ foreach my $test (@tests) { # So far the array contains warnings generated by # malformations. Add the expected regular one. - unshift @expected_warnings, $cp_message_qr; + unshift @expected_warnings, $this_cp_message_qr; # But it may need to be modified, because either of # these malformations means we can't determine the # expected code point. - if ($short || $unexpected_noncont) { + if ( $short || $unexpected_noncont + || $dont_use_overlong_cp) + { my $first_byte = substr($this_bytes, 0, 1); $expected_warnings[0] = display_bytes( substr($this_bytes, 0, $this_expected_len)); $expected_warnings[0] = qr/[Aa]\Qny UTF-8 sequence that starts with\E \Q $expected_warnings[0]\E - \Q $non_cp_trailing_text\E/x; + \Q $this_non_cp_trailing_text\E/x; } } @@ -4078,6 +4078,17 @@ PERL_CALLCONV int Perl_my_sprintf(char *buffer, const char *pat, ...); STATIC NV S_mulexp10(NV value, I32 exponent); # endif #endif +#if !defined(UV_IS_QUAD) +# if defined(PERL_IN_UTF8_C) +#ifndef PERL_NO_INLINE_FUNCTIONS +PERL_STATIC_INLINE bool S_is_utf8_cp_above_31_bits(const U8 * const s, const U8 * const e) + __attribute__warn_unused_result__; +#define PERL_ARGS_ASSERT_IS_UTF8_CP_ABOVE_31_BITS \ + assert(s); assert(e) +#endif + +# endif +#endif #if !defined(WIN32) PERL_CALLCONV bool Perl_do_exec3(pTHX_ const char *incmd, int fd, int do_report); #define PERL_ARGS_ASSERT_DO_EXEC3 \ @@ -5835,13 +5846,6 @@ PERL_STATIC_INLINE bool S_is_utf8_common_with_len(pTHX_ const U8 *const p, const #endif #ifndef PERL_NO_INLINE_FUNCTIONS -PERL_STATIC_INLINE bool S_is_utf8_cp_above_31_bits(const U8 * const s, const U8 * const e) - __attribute__warn_unused_result__; -#define PERL_ARGS_ASSERT_IS_UTF8_CP_ABOVE_31_BITS \ - assert(s); assert(e) -#endif - -#ifndef PERL_NO_INLINE_FUNCTIONS PERL_STATIC_INLINE bool S_is_utf8_overlong_given_start_byte_ok(const U8 * const s, const STRLEN len) __attribute__warn_unused_result__; #define PERL_ARGS_ASSERT_IS_UTF8_OVERLONG_GIVEN_START_BYTE_OK \ @@ -122,8 +122,9 @@ const char nonchar_cp_format[] = "Unicode non-character U+%04" UVXf " is not recommended for open interchange"; const char super_cp_format[] = "Code point 0x%" UVXf " is not Unicode," " may not be portable"; -const char above_31_bit_cp_format[] = "Code point 0x%" UVXf " is not" - " Unicode, and not portable"; +const char perl_extended_cp_format[] = "Code point 0x%" UVXf " is not" \ + " Unicode, requires a Perl extension," \ + " and so is not portable"; #define HANDLE_UNICODE_SURROGATE(uv, flags) \ STMT_START { \ @@ -220,7 +221,7 @@ Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, const UV flags) /* Choose the more dire applicable warning */ (UNICODE_IS_PERL_EXTENDED(uv)) - ? above_31_bit_cp_format + ? perl_extended_cp_format : super_cp_format, uv); } @@ -362,30 +363,27 @@ defined in L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>. See L<perlunicode/Noncharacter code points>. -Code points above 0x7FFF_FFFF (2**31 - 1) were never specified in any standard, -so using them is more problematic than other above-Unicode code points. Perl -invented an extension to UTF-8 to represent the ones above 2**36-1, so it is -likely that non-Perl languages will not be able to read files that contain -these that written by the perl interpreter; nor would Perl understand files -written by something that uses a different extension. For these reasons, there -is a separate set of flags that can warn and/or disallow these extremely high -code points, even if other above-Unicode ones are accepted. These are the -C<UNICODE_WARN_ABOVE_31_BIT> and C<UNICODE_DISALLOW_ABOVE_31_BIT> flags. These -are entirely independent from the deprecation warning for code points above -C<IV_MAX>. On 32-bit machines, it will eventually be forbidden to have any -code point that needs more than 31 bits to represent. When that happens, -effectively the C<UNICODE_DISALLOW_ABOVE_31_BIT> flag will always be set on -32-bit machines. (Of course C<UNICODE_DISALLOW_SUPER> will treat all -above-Unicode code points, including these, as malformations; and -C<UNICODE_WARN_SUPER> warns on these.) - -On EBCDIC platforms starting in Perl v5.24, the Perl extension for representing -extremely high code points kicks in at 0x3FFF_FFFF (2**30 -1), which is lower -than on ASCII. Prior to that, code points 2**31 and higher were simply -unrepresentable, and a different, incompatible method was used to represent -code points between 2**30 and 2**31 - 1. The flags C<UNICODE_WARN_ABOVE_31_BIT> -and C<UNICODE_DISALLOW_ABOVE_31_BIT> have the same function as on ASCII -platforms, warning and disallowing 2**31 and higher. +Extremely high code points were never specified in any standard, and require an +extension to UTF-8 to express, which Perl does. It is likely that programs +written in something other than Perl would not be able to read files that +contain these; nor would Perl understand files written by something that uses a +different extension. For these reasons, there is a separate set of flags that +can warn and/or disallow these extremely high code points, even if other +above-Unicode ones are accepted. They are the C<UNICODE_WARN_PERL_EXTENDED> +and C<UNICODE_DISALLOW_PERL_EXTENDED> flags. For more information see +L</C<UTF8_GOT_PERL_EXTENDED>>. Of course C<UNICODE_DISALLOW_SUPER> will +treat all above-Unicode code points, including these, as malformations. (Note +that the Unicode standard considers anything above 0x10FFFF to be illegal, but +there are standards predating it that allow up to 0x7FFF_FFFF (2**31 -1)) + +A somewhat misleadingly named synonym for C<UNICODE_WARN_PERL_EXTENDED> is +retained for backward compatibility: C<UNICODE_WARN_ABOVE_31_BIT>. Similarly, +C<UNICODE_DISALLOW_ABOVE_31_BIT> is usable instead of the more accurately named +C<UNICODE_DISALLOW_PERL_EXTENDED>. The names are misleading because these +flags can apply to code points that actually do fit in 31 bits. This happens +on EBCDIC platforms, and sometimes when the L<overlong +malformation|/C<UTF8_GOT_LONG>> is also present. The new names accurately +describe the situation in all cases. =cut */ @@ -399,6 +397,8 @@ Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags) return uvchr_to_utf8_flags(d, uv, flags); } +#ifndef UV_IS_QUAD + PERL_STATIC_INLINE bool S_is_utf8_cp_above_31_bits(const U8 * const s, const U8 * const e) { @@ -480,6 +480,8 @@ S_is_utf8_cp_above_31_bits(const U8 * const s, const U8 * const e) } +#endif + /* Anything larger than this will overflow the word if it were converted into a UV */ #if defined(UV_IS_QUAD) # ifdef EBCDIC /* Actually is I8 */ @@ -730,10 +732,12 @@ Perl__is_utf8_char_helper(const U8 * const s, const U8 * e, const U32 flags) # define IS_UTF8_2_BYTE_SURROGATE(s0, s1) ((s0) == 0xF1 \ /* B6 and B7 */ \ && ((s1) & 0xFE ) == 0xB6) +# define isUTF8_PERL_EXTENDED(s) (*s == I8_TO_NATIVE_UTF8(0xFF)) #else # define FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER 0xF5 # define IS_UTF8_2_BYTE_SUPER(s0, s1) ((s0) == 0xF4 && (s1) >= 0x90) # define IS_UTF8_2_BYTE_SURROGATE(s0, s1) ((s0) == 0xED && (s1) >= 0xA0) +# define isUTF8_PERL_EXTENDED(s) (*s >= 0xFE) #endif if ( (flags & UTF8_DISALLOW_SUPER) @@ -743,9 +747,9 @@ Perl__is_utf8_char_helper(const U8 * const s, const U8 * e, const U32 flags) } if ( (flags & UTF8_DISALLOW_PERL_EXTENDED) - && UNLIKELY(is_utf8_cp_above_31_bits(s, e))) + && UNLIKELY(isUTF8_PERL_EXTENDED(s))) { - return 0; /* Above 31 bits */ + return 0; } if (len > 1) { @@ -954,36 +958,34 @@ a malformation and raise a warning, specify both the WARN and DISALLOW flags. (But note that warnings are not raised if lexically disabled nor if C<UTF8_CHECK_ONLY> is also specified.) +Extremely high code points were never specified in any standard, and require an +extension to UTF-8 to express, which Perl does. It is likely that programs +written in something other than Perl would not be able to read files that +contain these; nor would Perl understand files written by something that uses a +different extension. For these reasons, there is a separate set of flags that +can warn and/or disallow these extremely high code points, even if other +above-Unicode ones are accepted. They are the C<UTF8_WARN_PERL_EXTENDED> and +C<UTF8_DISALLOW_PERL_EXTENDED> flags. For more information see +L</C<UTF8_GOT_PERL_EXTENDED>>. Of course C<UTF8_DISALLOW_SUPER> will treat all +above-Unicode code points, including these, as malformations. +(Note that the Unicode standard considers anything above 0x10FFFF to be +illegal, but there are standards predating it that allow up to 0x7FFF_FFFF +(2**31 -1)) + +A somewhat misleadingly named synonym for C<UTF8_WARN_PERL_EXTENDED> is +retained for backward compatibility: C<UTF8_WARN_ABOVE_31_BIT>. Similarly, +C<UTF8_DISALLOW_ABOVE_31_BIT> is usable instead of the more accurately named +C<UTF8_DISALLOW_PERL_EXTENDED>. The names are misleading because these flags +can apply to code points that actually do fit in 31 bits. This happens on +EBCDIC platforms, and sometimes when the L<overlong +malformation|/C<UTF8_GOT_LONG>> is also present. The new names accurately +describe the situation in all cases. + It is now deprecated to have very high code points (above C<IV_MAX> on the platforms) and this function will raise a deprecation warning for these (unless such warnings are turned off). This value is typically 0x7FFF_FFFF (2**31 -1) in a 32-bit word. -Code points above 0x7FFF_FFFF (2**31 - 1) were never specified in any standard, -so using them is more problematic than other above-Unicode code points. Perl -invented an extension to UTF-8 to represent the ones above 2**36-1, so it is -likely that non-Perl languages will not be able to read files that contain -these; nor would Perl understand files -written by something that uses a different extension. For these reasons, there -is a separate set of flags that can warn and/or disallow these extremely high -code points, even if other above-Unicode ones are accepted. These are the -C<UTF8_WARN_ABOVE_31_BIT> and C<UTF8_DISALLOW_ABOVE_31_BIT> flags. These -are entirely independent from the deprecation warning for code points above -C<IV_MAX>. On 32-bit machines, it will eventually be forbidden to have any -code point that needs more than 31 bits to represent. When that happens, -effectively the C<UTF8_DISALLOW_ABOVE_31_BIT> flag will always be set on -32-bit machines. (Of course C<UTF8_DISALLOW_SUPER> will treat all -above-Unicode code points, including these, as malformations; and -C<UTF8_WARN_SUPER> warns on these.) - -On EBCDIC platforms starting in Perl v5.24, the Perl extension for representing -extremely high code points kicks in at 0x3FFF_FFFF (2**30 -1), which is lower -than on ASCII. Prior to that, code points 2**31 and higher were simply -unrepresentable, and a different, incompatible method was used to represent -code points between 2**30 and 2**31 - 1. The flags C<UTF8_WARN_ABOVE_31_BIT> -and C<UTF8_DISALLOW_ABOVE_31_BIT> have the same function as on ASCII -platforms, warning and disallowing 2**31 and higher. - All other code points corresponding to Unicode characters, including private use and those yet to be assigned, are never considered malformed and never warn. @@ -1026,12 +1028,36 @@ exceptions are noted: =over 4 -=item C<UTF8_GOT_ABOVE_31_BIT> +=item C<UTF8_GOT_PERL_EXTENDED> -The code point represented by the input UTF-8 sequence occupies more than 31 -bits. -This bit is set only if the input C<flags> parameter contains either the -C<UTF8_DISALLOW_ABOVE_31_BIT> or the C<UTF8_WARN_ABOVE_31_BIT> flags. +The input sequence is not standard UTF-8, but a Perl extension. This bit is +set only if the input C<flags> parameter contains either the +C<UTF8_DISALLOW_PERL_EXTENDED> or the C<UTF8_WARN_PERL_EXTENDED> flags. + +Code points above 0x7FFF_FFFF (2**31 - 1) were never specified in any standard, +and so some extension must be used to express them. Perl uses a natural +extension to UTF-8 to represent the ones up to 2**36-1, and invented a further +extension to represent even higher ones, so that any code point that fits in a +64-bit word can be represented. Text using these extensions is not likely to +be portable to non-Perl code. We lump both of these extensions together and +refer to them as Perl extended UTF-8. There exist other extensions that people +have invented, incompatible with Perl's. + +On EBCDIC platforms starting in Perl v5.24, the Perl extension for representing +extremely high code points kicks in at 0x3FFF_FFFF (2**30 -1), which is lower +than on ASCII. Prior to that, code points 2**31 and higher were simply +unrepresentable, and a different, incompatible method was used to represent +code points between 2**30 and 2**31 - 1. + +On both platforms, ASCII and EBCDIC, C<UTF8_GOT_PERL_EXTENDED> is set if +Perl extended UTF-8 is used. + +In earlier Perls, this bit was named C<UTF8_GOT_ABOVE_31_BIT>, which you still +may use for backward compatibility. That name is misleading, as this flag may +be set when the code point actually does fit in 31 bits. This happens on +EBCDIC platforms, and sometimes when the L<overlong +malformation|/C<UTF8_GOT_LONG>> is also present. The new name accurately +describes the situation in all cases. =item C<UTF8_GOT_CONTINUATION> @@ -1119,8 +1145,6 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, * too short one. Otherwise the first two are set to 's0' and 'send', and * the third not used at all */ U8 * adjusted_s0 = (U8 *) s0; - U8 * adjusted_send = NULL; /* (Initialized to silence compilers' wrong - warning) */ U8 temp_char_buf[UTF8_MAXBYTES + 1]; /* Used to avoid a Newx in this routine; see [perl #130921] */ UV uv_so_far = 0; /* (Initialized to silence compilers' wrong warning) */ @@ -1212,7 +1236,6 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, else { send += expectlen; } - adjusted_send = send; /* Now, loop through the remaining bytes in the character's sequence, * accumulating each into the working value as we go. */ @@ -1297,7 +1320,7 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, } adjusted_s0 = temp_char_buf; - adjusted_send = uvoffuni_to_utf8_flags(adjusted_s0, min_uv, 0); + (void) uvoffuni_to_utf8_flags(adjusted_s0, min_uv, 0); } } @@ -1316,7 +1339,8 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, * and we deal with those in the overflow handling * code */ && LIKELY(! (possible_problems & UTF8_GOT_OVERFLOW)) - && isUTF8_POSSIBLY_PROBLEMATIC(*adjusted_s0))) + && ( isUTF8_POSSIBLY_PROBLEMATIC(*adjusted_s0) + || UNLIKELY(isUTF8_PERL_EXTENDED(s0))))) && ((flags & ( UTF8_DISALLOW_NONCHAR |UTF8_DISALLOW_SURROGATE |UTF8_DISALLOW_SUPER @@ -1396,13 +1420,6 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, * end, based on how many bytes the start byte tells * us should be in it, but no further than s0 + * avail_len - * adjusted_s0 normally is the same as s0, but in case of an - * overlong for which the UTF-8 matters below, it is - * the first byte of the shortest form representation - * of the input. - * adjusted_send normally is the same as 'send', but if adjusted_s0 - * is set to something other than s0, this points one - * beyond its end */ if (UNLIKELY(possible_problems)) { @@ -1603,39 +1620,35 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, } } - /* The maximum code point ever specified by a standard was - * 2**31 - 1. Anything larger than that is a Perl extension - * that very well may not be understood by other applications - * (including earlier perl versions on EBCDIC platforms). We - * test for these after the regular SUPER ones, and before - * possibly bailing out, so that the slightly more dire warning - * will override the regular one. */ - if ( (flags & (UTF8_WARN_PERL_EXTENDED - |UTF8_WARN_SUPER - |UTF8_DISALLOW_PERL_EXTENDED)) - && ( ( UNLIKELY(orig_problems & UTF8_GOT_TOO_SHORT) - && UNLIKELY(is_utf8_cp_above_31_bits( - adjusted_s0, - adjusted_send))) - || ( LIKELY(! (orig_problems & UTF8_GOT_TOO_SHORT)) - && UNLIKELY(UNICODE_IS_PERL_EXTENDED(uv))))) - { + /* Test for Perl's extended UTF-8 after the regular SUPER ones, + * and before possibly bailing out, so that the more dire + * warning will override the regular one. */ + if (UNLIKELY(isUTF8_PERL_EXTENDED(s0))) { if ( ! (flags & UTF8_CHECK_ONLY) && (flags & (UTF8_WARN_PERL_EXTENDED|UTF8_WARN_SUPER)) && ckWARN_d(WARN_NON_UNICODE)) { pack_warn = packWARN(WARN_NON_UNICODE); - if (orig_problems & UTF8_GOT_TOO_SHORT) { + /* If it is an overlong that evaluates to a code point + * that doesn't have to use the Perl extended UTF-8, it + * still used it, and so we output a message that + * doesn't refer to the code point. The same is true + * if there was a SHORT malformation where the code + * point is not valid. In that case, 'uv' will have + * been set to the REPLACEMENT CHAR, and the message + * below without the code point in it will be selected + * */ + if (UNICODE_IS_PERL_EXTENDED(uv)) { message = Perl_form(aTHX_ - "Any UTF-8 sequence that starts with" - " \"%s\" is for a non-Unicode code" - " point, and is not portable", - _byte_dump_string(s0, curlen, 0)); + perl_extended_cp_format, uv); } else { message = Perl_form(aTHX_ - above_31_bit_cp_format, uv); + "Any UTF-8 sequence that starts with" + " \"%s\" is a Perl extension, and" + " so is not portable", + _byte_dump_string(s0, curlen, 0)); } } @@ -407,6 +407,8 @@ C<cp> is Unicode if above 255; otherwise is platform-native. ( LIKELY( ( ( ( ((const U8*)s)[1] & 0xC0 ) == 0x80 ) && ( ( ((const U8*)s)[2] & 0xC0 ) == 0x80 ) ) && ( ( ((const U8*)s)[3] & 0xC0 ) == 0x80 ) ) ? 4 : 0 )\ : LIKELY( ( ( ( 0xF4 == ((const U8*)s)[0] ) && ( ( ((const U8*)s)[1] & 0xF0 ) == 0x80 ) ) && ( ( ((const U8*)s)[2] & 0xC0 ) == 0x80 ) ) && ( ( ((const U8*)s)[3] & 0xC0 ) == 0x80 ) ) ? 4 : 0 ) +#define UNICODE_IS_PERL_EXTENDED(uv) UNLIKELY((UV) (uv) > 0x7FFFFFFF) + #endif /* EBCDIC vs ASCII */ /* 2**UTF_ACCUMULATION_SHIFT - 1 */ @@ -781,7 +783,8 @@ case any call to string overloading updates the internal UTF-8 encoding flag. #define UTF8_GOT_PERL_EXTENDED UTF8_DISALLOW_PERL_EXTENDED #define UTF8_WARN_PERL_EXTENDED 0x8000 -/* For back compat, these old names are misleading for UTF_EBCDIC */ +/* For back compat, these old names are misleading for overlongs and + * UTF_EBCDIC. */ #define UTF8_DISALLOW_ABOVE_31_BIT UTF8_DISALLOW_PERL_EXTENDED #define UTF8_GOT_ABOVE_31_BIT UTF8_GOT_PERL_EXTENDED #define UTF8_WARN_ABOVE_31_BIT UTF8_WARN_PERL_EXTENDED @@ -958,7 +961,6 @@ point's representation. && UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER(uv))) #define UNICODE_IS_SUPER(uv) ((UV) (uv) > PERL_UNICODE_MAX) -#define UNICODE_IS_PERL_EXTENDED(uv) ((UV) (uv) > 0x7FFFFFFF) #define LATIN_SMALL_LETTER_SHARP_S LATIN_SMALL_LETTER_SHARP_S_NATIVE #define LATIN_SMALL_LETTER_Y_WITH_DIAERESIS \ diff --git a/utfebcdic.h b/utfebcdic.h index 0f81d1ffee..c2f0788cc4 100644 --- a/utfebcdic.h +++ b/utfebcdic.h @@ -511,6 +511,8 @@ explicitly forbidden, and the shortest possible encoding should always be used * has this start byte (expressed in I8) as the maximum */ #define _IS_UTF8_CHAR_HIGHEST_START_BYTE 0xF9 +#define UNICODE_IS_PERL_EXTENDED(uv) UNLIKELY((UV) (uv) > 0x3FFFFFFF) + /* * ex: set ts=8 sts=4 sw=4 et: */ |