diff options
author | Karl Williamson <khw@cpan.org> | 2017-06-27 14:46:26 -0600 |
---|---|---|
committer | Karl Williamson <khw@cpan.org> | 2017-07-12 21:14:25 -0600 |
commit | 57ff5f598ddf7ce8834832a15ba1a4628b5932c4 (patch) | |
tree | f39ae0ce8116b6ee8a13b1014a562f4b350aa3a4 /ext/XS-APItest/t | |
parent | d044b7a780a1f1916e96ed7d255bb0b7dad54713 (diff) | |
download | perl-57ff5f598ddf7ce8834832a15ba1a4628b5932c4.tar.gz |
utf8n_to_uvchr() Properly test for extended UTF-8
It somehow dawned on me that the code is incorrect for
warning/disallowing very high code points. What is really wanted in the
API is to catch UTF-8 that is not necessarily portable. There are
several classes of this, but I'm referring here to just the code points
that are above the Unicode-defined maximum of 0x10FFFF. These can be
considered non-portable, and there is a mechanism in the API to
warn/disallow these.
However an earlier standard defined UTF-8 to handle code points up to
2**31-1. Anything above that is using an extension to UTF-8 that has
never been officially recognized. Perl does use such an extension, and
the API is supposed to have a different mechanism to warn/disallow on
this.
Thus there are two classes of warning/disallowing for above-Unicode code
points. One for things that have some non-Unicode official recognition,
and the other for things that have never had official recognition.
UTF-EBCDIC differs somewhat in this, and since Perl 5.24, we have had a
Perl extension that allows it to handle any code point that fits in a
64-bit word. This kicks in at code points above 2**30-1, a number
different than UTF-8 extended kicks in on ASCII platforms.
Things are also complicated by the fact that the API has provisions for
accepting the overlong UTF-8 malformation. It is possible to use
extended UTF-8 to represent code points smaller than 31-bit ones.
Until this commit, the extended warning/disallowing was based on the
resultant code point, and only when that code point did not fit into 31
bits.
But what is really wanted is if extended UTF-8 was used to represent a
code point, no matter how large the resultant code point is. This
differs from the previous definition, but only for EBCDIC platforms, or
when the overlong malformation was also present. So it does not affect
very many real-world cases.
This commit fixes that. It turns out that it is easier to tell if
something is using extended-UTF8. One just looks at the first byte of a
sequence.
The trailing part of the warning message that gets raised is slightly
changed to be clearer. It's not significant enough to affect perldiag.
Diffstat (limited to 'ext/XS-APItest/t')
-rw-r--r-- | ext/XS-APItest/t/utf8_warn_base.pl | 69 |
1 files changed, 46 insertions, 23 deletions
diff --git a/ext/XS-APItest/t/utf8_warn_base.pl b/ext/XS-APItest/t/utf8_warn_base.pl index 94df88e813..6c88f5c308 100644 --- a/ext/XS-APItest/t/utf8_warn_base.pl +++ b/ext/XS-APItest/t/utf8_warn_base.pl @@ -28,6 +28,7 @@ local $SIG{__WARN__} = sub { my @copy = @_; push @warnings_gotten, map { chomp; $_ } @copy; }; +my $highest_non_extended_utf8_cp = (isASCII) ? 0x7FFFFFFF : 0x3FFFFFFF; my $native_lowest_continuation_chr = I8_to_native(chr $::lowest_continuation); sub requires_extended_utf8($) { @@ -36,8 +37,7 @@ sub requires_extended_utf8($) { # into 31 bits, subject to the convention that a negative code point # stands for one that overflows the word size, so won't fit in 31 bits. - my $cp = shift; - return $cp > 0x7FFFFFFF; + return shift > $highest_non_extended_utf8_cp; } my @tests; @@ -286,7 +286,6 @@ my @tests; : I8_to_native( "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0"), 0x80000000, - (isASCII) ? 1 : 8, ], [ "highest 32 bit code point", (isASCII) @@ -294,7 +293,6 @@ my @tests; : I8_to_native( "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa3\xbf\xbf\xbf\xbf\xbf\xbf"), 0xFFFFFFFF, - (isASCII) ? 1 : 8, ], [ "Lowest 33 bit code point", (isASCII) @@ -340,7 +338,6 @@ my @tests; [ "Lowest code point requiring 13 bytes to represent", "\xff\x80\x80\x80\x80\x80\x81\x80\x80\x80\x80\x80\x80", 0x1000000000, - 1, ], [ "overflow that old algorithm failed to detect", "\xff\x80\x90\x90\x90\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf", @@ -355,37 +352,31 @@ my @tests; I8_to_native( "\xff\xa0\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0"), 0x800000000, - 7, ], [ "requires at least 32 bits", I8_to_native( "\xff\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"), 0x10000000000, - 6, ], [ "requires at least 32 bits", I8_to_native( "\xff\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"), 0x200000000000, - 5, ], [ "requires at least 32 bits", I8_to_native( "\xff\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"), 0x4000000000000, - 4, ], [ "requires at least 32 bits", I8_to_native( "\xff\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"), 0x80000000000000, - 3, ], [ "requires at least 32 bits", I8_to_native( "\xff\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"), 0x1000000000000000, - 2, ]; } } @@ -587,6 +578,11 @@ foreach my $test (@tests) { # contain a code point. (This is a result of # some sort of malformation that means we # can't get an exact code poin + my $extended_cp_message_qr = qr/\QCode point 0x$uv_string is not Unicode,\E + \Q requires a Perl extension, and so is not\E + \Q portable\E/x; + my $extended_non_cp_trailing_text + = "is a Perl extension, and so is not portable"; # Is this test malformed from the beginning? If so, we know to generally # expect that the tests will show it isn't valid. @@ -619,9 +615,9 @@ foreach my $test (@tests) { $initially_malformed = 1; } elsif (requires_extended_utf8($allowed_uv)) { - $cp_message_qr = qr/\QCode point 0x$uv_string is not Unicode,\E - \Q and not portable\E/x; - $non_cp_trailing_text = "is for a non-Unicode code point, and is not portable"; + $cp_message_qr = $extended_cp_message_qr; + $non_cp_trailing_text = $extended_non_cp_trailing_text; + $needed_to_discern_len = 1 unless defined $needed_to_discern_len; } else { $cp_message_qr = qr/\QCode point 0x$uv_string is not Unicode,\E @@ -870,6 +866,9 @@ foreach my $test (@tests) { # maximum length, so skip if we're already at that length. next if $overlong && $length >= $::max_bytes; + my $this_cp_message_qr = $cp_message_qr; + my $this_non_cp_trailing_text = $non_cp_trailing_text; + foreach my $malformed_allow_type (0..2) { # 0 don't allow this malformation; ignored if no malformation # 1 allow, with REPLACEMENT CHARACTER returned @@ -899,6 +898,8 @@ foreach my $test (@tests) { # combinations of on/off are tested for. It's either all are # allowed, or none are. my $allow_flags = 0; + my $overlong_is_in_perl_extended_utf8 = 0; + my $dont_use_overlong_cp = 0; if ($overlong) { my $new_expected_len; @@ -929,8 +930,20 @@ foreach my $test (@tests) { else { # Must use extended UTF-8. On ASCII platforms, we # could express some overlongs here starting with # \xFE, but there's no real reason to do so. + $overlong_is_in_perl_extended_utf8 = 1; $start_byte = I8_to_native("\xFF"); $new_expected_len = $::max_bytes; + $this_cp_message_qr = $extended_cp_message_qr; + + # The warning that gets raised doesn't include the code + # point in the message if the code point can be expressed + # without using extended UTF-8, but the particular + # overlong sequence used is in extended UTF-8. To do + # otherwise would be confusing to the user, as it would + # claim the code point requires extended, when it doesn't. + $dont_use_overlong_cp = 1 + unless requires_extended_utf8($allowed_uv); + $this_non_cp_trailing_text = $extended_non_cp_trailing_text; } # Splice in the revise continuation byte, preceded by the @@ -1152,12 +1165,20 @@ foreach my $test (@tests) { # on all the other flags. That makes sure that they all # are independent of this flag, and so we don't need to # test them individually. - my $this_warning_flags = ($use_warn_flag) - ? $this_utf8n_flag_to_warn - : $utf8n_flag_to_warn_complement; - my $this_disallow_flags = ($do_disallow) - ? $this_utf8n_flag_to_disallow - : $utf8n_flag_to_disallow_complement; + my $this_warning_flags + = ($use_warn_flag) + ? $this_utf8n_flag_to_warn + : ($overlong_is_in_perl_extended_utf8 + ? ($utf8n_flag_to_warn_complement + & ~$::UTF8_WARN_PERL_EXTENDED) + : $utf8n_flag_to_warn_complement); + my $this_disallow_flags + = ($do_disallow) + ? $this_utf8n_flag_to_disallow + : ($overlong_is_in_perl_extended_utf8 + ? ($utf8n_flag_to_disallow_complement + & ~$::UTF8_DISALLOW_PERL_EXTENDED) + : $utf8n_flag_to_disallow_complement); my $expected_uv = $allowed_uv; my $this_uv_string = $uv_string; @@ -1216,19 +1237,21 @@ foreach my $test (@tests) { # So far the array contains warnings generated by # malformations. Add the expected regular one. - unshift @expected_warnings, $cp_message_qr; + unshift @expected_warnings, $this_cp_message_qr; # But it may need to be modified, because either of # these malformations means we can't determine the # expected code point. - if ($short || $unexpected_noncont) { + if ( $short || $unexpected_noncont + || $dont_use_overlong_cp) + { my $first_byte = substr($this_bytes, 0, 1); $expected_warnings[0] = display_bytes( substr($this_bytes, 0, $this_expected_len)); $expected_warnings[0] = qr/[Aa]\Qny UTF-8 sequence that starts with\E \Q $expected_warnings[0]\E - \Q $non_cp_trailing_text\E/x; + \Q $this_non_cp_trailing_text\E/x; } } |