diff options
author | Karl Williamson <khw@cpan.org> | 2017-02-19 14:14:35 -0700 |
---|---|---|
committer | Karl Williamson <khw@cpan.org> | 2017-02-20 01:07:01 -0700 |
commit | 6aa905cfe4f92b6e895d3bf5a22daa52e36ceb63 (patch) | |
tree | 2240bfd2e789c058b57011d161cf80ebe8634afc /ext | |
parent | 934272c1a42c040b4bd39f09eb0eeced3388942e (diff) | |
download | perl-6aa905cfe4f92b6e895d3bf5a22daa52e36ceb63.tar.gz |
Split XS-APItest/t/utf8.t
This test file is one of the longest running ones. It has three main
semi-independent parts. Two of them are split off into 2 files with a
common file required. The other part is still long running, so it is
split so that a common file is used to run the tests, but it is called
with a chunk number and it only executes based on that chunk. The
number of chunks is based on the environment variable TEST_JOBS, up to
10. Each chunk executes 1/TEST_JOBS of the total test. If TEST_JOBS is
not set, it reverts to 1 chunk. The alternative would be to revert to
10, but since there is overhead associated with each new chunk, I chose,
for now, 1.
There may be a better solution later on, but I think this is good enough
for now.
Diffstat (limited to 'ext')
-rw-r--r-- | ext/XS-APItest/t/utf8.t | 1578 | ||||
-rw-r--r-- | ext/XS-APItest/t/utf8_malformed.t | 418 | ||||
-rw-r--r-- | ext/XS-APItest/t/utf8_setup.pl | 98 | ||||
-rw-r--r-- | ext/XS-APItest/t/utf8_warn0.t | 7 | ||||
-rw-r--r-- | ext/XS-APItest/t/utf8_warn1.t | 7 | ||||
-rw-r--r-- | ext/XS-APItest/t/utf8_warn2.t | 7 | ||||
-rw-r--r-- | ext/XS-APItest/t/utf8_warn3.t | 7 | ||||
-rw-r--r-- | ext/XS-APItest/t/utf8_warn4.t | 7 | ||||
-rw-r--r-- | ext/XS-APItest/t/utf8_warn5.t | 7 | ||||
-rw-r--r-- | ext/XS-APItest/t/utf8_warn6.t | 7 | ||||
-rw-r--r-- | ext/XS-APItest/t/utf8_warn7.t | 7 | ||||
-rw-r--r-- | ext/XS-APItest/t/utf8_warn8.t | 7 | ||||
-rw-r--r-- | ext/XS-APItest/t/utf8_warn9.t | 7 | ||||
-rw-r--r-- | ext/XS-APItest/t/utf8_warn_base.pl | 1059 |
14 files changed, 1684 insertions, 1539 deletions
diff --git a/ext/XS-APItest/t/utf8.t b/ext/XS-APItest/t/utf8.t index ab3c21829d..c7a032efde 100644 --- a/ext/XS-APItest/t/utf8.t +++ b/ext/XS-APItest/t/utf8.t @@ -3,9 +3,12 @@ use strict; use Test::More; +# This file tests various functions and macros in the API related to UTF-8. + BEGIN { use_ok('XS::APItest'); require 'charset_tools.pl'; + require './t/utf8_setup.pl'; }; $|=1; @@ -14,117 +17,32 @@ no warnings 'deprecated'; # Some of the below are above IV_MAX on 32 bit # machines, and that is tested elsewhere use XS::APItest; -use Data::Dumper; -my $pound_sign = chr utf8::unicode_to_native(163); -sub isASCII { ord "A" == 65 } - -sub display_bytes { - use bytes; - my $string = shift; - return '"' - . join("", map { sprintf("\\x%02x", ord $_) } split "", $string) - . '"'; -} - -sub output_warnings(@) { - diag "The warnings were:\n" . join("", @_); -} +my $pound_sign = chr utf8::unicode_to_native(163); # This test file can't use byte_utf8a_to_utf8n() from t/charset_tools.pl # because that uses the same functions we are testing here. So UTF-EBCDIC # strings are hard-coded as I8 strings in this file instead, and we use the # translation functions to/from I8 from that file instead. -sub start_byte_to_cont($) { - - # Extract the code point information from the input UTF-8 start byte, and - # return a continuation byte containing the same information. This is - # used in constructing an overlong malformation from valid input. - - my $byte = shift; - my $len = test_UTF8_SKIP($byte); - if ($len < 2) { - die "start_byte_to_cont() is expecting a UTF-8 variant"; - } - - $byte = ord native_to_I8($byte); - - # Copied from utf8.h. This gets rid of the leading 1 bits. - $byte &= ((($len) >= 7) ? 0x00 : (0x1F >> (($len)-2))); - - $byte |= (isASCII) ? 0x80 : 0xA0; - return I8_to_native(chr $byte); -} - -my $is64bit = length sprintf("%x", ~0) > 8; - - -# Test utf8n_to_uvchr_error(). These provide essentially complete code -# coverage. Copied from utf8.h -my $UTF8_ALLOW_EMPTY = 0x0001; -my $UTF8_GOT_EMPTY = $UTF8_ALLOW_EMPTY; -my $UTF8_ALLOW_CONTINUATION = 0x0002; -my $UTF8_GOT_CONTINUATION = $UTF8_ALLOW_CONTINUATION; -my $UTF8_ALLOW_NON_CONTINUATION = 0x0004; -my $UTF8_GOT_NON_CONTINUATION = $UTF8_ALLOW_NON_CONTINUATION; -my $UTF8_ALLOW_SHORT = 0x0008; -my $UTF8_GOT_SHORT = $UTF8_ALLOW_SHORT; -my $UTF8_ALLOW_LONG = 0x0010; -my $UTF8_ALLOW_LONG_AND_ITS_VALUE = $UTF8_ALLOW_LONG|0x0020; -my $UTF8_GOT_LONG = $UTF8_ALLOW_LONG; -my $UTF8_ALLOW_OVERFLOW = 0x0080; -my $UTF8_GOT_OVERFLOW = $UTF8_ALLOW_OVERFLOW; -my $UTF8_DISALLOW_SURROGATE = 0x0100; -my $UTF8_GOT_SURROGATE = $UTF8_DISALLOW_SURROGATE; -my $UTF8_WARN_SURROGATE = 0x0200; -my $UTF8_DISALLOW_NONCHAR = 0x0400; -my $UTF8_GOT_NONCHAR = $UTF8_DISALLOW_NONCHAR; -my $UTF8_WARN_NONCHAR = 0x0800; -my $UTF8_DISALLOW_SUPER = 0x1000; -my $UTF8_GOT_SUPER = $UTF8_DISALLOW_SUPER; -my $UTF8_WARN_SUPER = 0x2000; -my $UTF8_DISALLOW_ABOVE_31_BIT = 0x4000; -my $UTF8_GOT_ABOVE_31_BIT = $UTF8_DISALLOW_ABOVE_31_BIT; -my $UTF8_WARN_ABOVE_31_BIT = 0x8000; -my $UTF8_CHECK_ONLY = 0x10000; -my $UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE - = $UTF8_DISALLOW_SUPER|$UTF8_DISALLOW_SURROGATE; -my $UTF8_DISALLOW_ILLEGAL_INTERCHANGE - = $UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE|$UTF8_DISALLOW_NONCHAR; -my $UTF8_WARN_ILLEGAL_C9_INTERCHANGE - = $UTF8_WARN_SUPER|$UTF8_WARN_SURROGATE; -my $UTF8_WARN_ILLEGAL_INTERCHANGE - = $UTF8_WARN_ILLEGAL_C9_INTERCHANGE|$UTF8_WARN_NONCHAR; - -# Test uvchr_to_utf8(). -my $UNICODE_WARN_SURROGATE = 0x0001; -my $UNICODE_WARN_NONCHAR = 0x0002; -my $UNICODE_WARN_SUPER = 0x0004; -my $UNICODE_WARN_ABOVE_31_BIT = 0x0008; -my $UNICODE_DISALLOW_SURROGATE = 0x0010; -my $UNICODE_DISALLOW_NONCHAR = 0x0020; -my $UNICODE_DISALLOW_SUPER = 0x0040; -my $UNICODE_DISALLOW_ABOVE_31_BIT = 0x0080; - my $look_for_everything_utf8n_to - = $UTF8_DISALLOW_SURROGATE - | $UTF8_WARN_SURROGATE - | $UTF8_DISALLOW_NONCHAR - | $UTF8_WARN_NONCHAR - | $UTF8_DISALLOW_SUPER - | $UTF8_WARN_SUPER - | $UTF8_DISALLOW_ABOVE_31_BIT - | $UTF8_WARN_ABOVE_31_BIT; + = $::UTF8_DISALLOW_SURROGATE + | $::UTF8_WARN_SURROGATE + | $::UTF8_DISALLOW_NONCHAR + | $::UTF8_WARN_NONCHAR + | $::UTF8_DISALLOW_SUPER + | $::UTF8_WARN_SUPER + | $::UTF8_DISALLOW_ABOVE_31_BIT + | $::UTF8_WARN_ABOVE_31_BIT; my $look_for_everything_uvchr_to - = $UNICODE_DISALLOW_SURROGATE - | $UNICODE_WARN_SURROGATE - | $UNICODE_DISALLOW_NONCHAR - | $UNICODE_WARN_NONCHAR - | $UNICODE_DISALLOW_SUPER - | $UNICODE_WARN_SUPER - | $UNICODE_DISALLOW_ABOVE_31_BIT - | $UNICODE_WARN_ABOVE_31_BIT; + = $::UNICODE_DISALLOW_SURROGATE + | $::UNICODE_WARN_SURROGATE + | $::UNICODE_DISALLOW_NONCHAR + | $::UNICODE_WARN_NONCHAR + | $::UNICODE_DISALLOW_SUPER + | $::UNICODE_WARN_SUPER + | $::UNICODE_DISALLOW_ABOVE_31_BIT + | $::UNICODE_WARN_ABOVE_31_BIT; foreach ([0, '', '', 'empty'], [0, 'N', 'N', '1 char'], @@ -474,7 +392,7 @@ my %code_points = ( : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa3\xbf\xbf\xbf\xbf\xbf\xbf"), ); -if ($is64bit) { +if ($::is64bit) { no warnings qw(overflow portable); $code_points{0x100000000} = (isASCII) @@ -531,21 +449,17 @@ while ($cp < ((isASCII) ? 128 : 160)) { # This is from the definition of # continuation bytes can be in, and what the lowest start byte can be. So we # cycle through them. -my $first_continuation = (isASCII) ? 0x80 : 0xA0; my $final_continuation = 0xBF; my $start = (isASCII) ? 0xC2 : 0xC5; -my $max_bytes = (isASCII) ? 13 : 14; # Max number of bytes in a UTF-8 sequence - # representing a single code point - -my $continuation = $first_continuation - 1; +my $continuation = $::first_continuation - 1; while ($cp < 255) { if (++$continuation > $final_continuation) { # Wrap to the next start byte when we reach the final continuation # byte possible - $continuation = $first_continuation; + $continuation = $::first_continuation; $start++; } $code_points{$cp} = I8_to_native(chr($start) . chr($continuation)); @@ -586,8 +500,8 @@ for my $u (sort { utf8::unicode_to_native($a) <=> utf8::unicode_to_native($b) } $u < 0x10000 ? 3 : $u < 0x200000 ? 4 : $u < 0x4000000 ? 5 : - $u < 0x80000000 ? 6 : (($is64bit) - ? ($u < 0x1000000000 ? 7 : $max_bytes) + $u < 0x80000000 ? 6 : (($::is64bit) + ? ($u < 0x1000000000 ? 7 : $::max_bytes) : 7) ) : ($u < 0xA0 ? 1 : @@ -596,7 +510,7 @@ for my $u (sort { utf8::unicode_to_native($a) <=> utf8::unicode_to_native($b) } $u < 0x40000 ? 4 : $u < 0x400000 ? 5 : $u < 0x4000000 ? 6 : - $u < 0x40000000 ? 7 : $max_bytes ); + $u < 0x40000000 ? 7 : $::max_bytes ); } # If this test fails, subsequent ones are meaningless. @@ -708,21 +622,21 @@ for my $u (sort { utf8::unicode_to_native($a) <=> utf8::unicode_to_native($b) } my $valid_under_c9strict = 1; my $valid_for_fits_in_31_bits = 1; if ($n > 0x10FFFF) { - $this_utf8_flags &= ~($UTF8_DISALLOW_SUPER|$UTF8_WARN_SUPER); + $this_utf8_flags &= ~($::UTF8_DISALLOW_SUPER|$::UTF8_WARN_SUPER); $valid_under_strict = 0; $valid_under_c9strict = 0; if ($n > 2 ** 31 - 1) { $this_utf8_flags &= - ~($UTF8_DISALLOW_ABOVE_31_BIT|$UTF8_WARN_ABOVE_31_BIT); + ~($::UTF8_DISALLOW_ABOVE_31_BIT|$::UTF8_WARN_ABOVE_31_BIT); $valid_for_fits_in_31_bits = 0; } } elsif (($n >= 0xFDD0 && $n <= 0xFDEF) || ($n & 0xFFFE) == 0xFFFE) { - $this_utf8_flags &= ~($UTF8_DISALLOW_NONCHAR|$UTF8_WARN_NONCHAR); + $this_utf8_flags &= ~($::UTF8_DISALLOW_NONCHAR|$::UTF8_WARN_NONCHAR); $valid_under_strict = 0; } elsif ($n >= 0xD800 && $n <= 0xDFFF) { - $this_utf8_flags &= ~($UTF8_DISALLOW_SURROGATE|$UTF8_WARN_SURROGATE); + $this_utf8_flags &= ~($::UTF8_DISALLOW_SURROGATE|$::UTF8_WARN_SURROGATE); $valid_under_c9strict = 0; $valid_under_strict = 0; } @@ -812,7 +726,7 @@ for my $u (sort { utf8::unicode_to_native($a) <=> utf8::unicode_to_native($b) } undef @warnings; $ret = test_isUTF8_CHAR_flags($bytes, $len, - $UTF8_DISALLOW_ILLEGAL_INTERCHANGE); + $::UTF8_DISALLOW_ILLEGAL_INTERCHANGE); is($ret, $expected_len, "Verify isUTF8_CHAR_flags('DISALLOW_ILLEGAL_INTERCHANGE')" . " acts like isSTRICT_UTF8_CHAR"); @@ -845,7 +759,7 @@ for my $u (sort { utf8::unicode_to_native($a) <=> utf8::unicode_to_native($b) } undef @warnings; $ret = test_isUTF8_CHAR_flags($bytes, $len, - $UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE); + $::UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE); is($ret, $expected_len, "Verify isUTF8_CHAR_flags('DISALLOW_ILLEGAL_C9_INTERCHANGE')" ." acts like isC9_STRICT_UTF8_CHAR"); @@ -870,17 +784,17 @@ for my $u (sort { utf8::unicode_to_native($a) <=> utf8::unicode_to_native($b) } my $this_uvchr_flags = $look_for_everything_uvchr_to; if ($n > 2 ** 31 - 1) { $this_uvchr_flags &= - ~($UNICODE_DISALLOW_ABOVE_31_BIT|$UNICODE_WARN_ABOVE_31_BIT); + ~($::UNICODE_DISALLOW_ABOVE_31_BIT|$::UNICODE_WARN_ABOVE_31_BIT); } if ($n > 0x10FFFF) { - $this_uvchr_flags &= ~($UNICODE_DISALLOW_SUPER|$UNICODE_WARN_SUPER); + $this_uvchr_flags &= ~($::UNICODE_DISALLOW_SUPER|$::UNICODE_WARN_SUPER); } elsif (($n >= 0xFDD0 && $n <= 0xFDEF) || ($n & 0xFFFE) == 0xFFFE) { - $this_uvchr_flags &= ~($UNICODE_DISALLOW_NONCHAR|$UNICODE_WARN_NONCHAR); + $this_uvchr_flags &= ~($::UNICODE_DISALLOW_NONCHAR|$::UNICODE_WARN_NONCHAR); } elsif ($n >= 0xD800 && $n <= 0xDFFF) { $this_uvchr_flags - &= ~($UNICODE_DISALLOW_SURROGATE|$UNICODE_WARN_SURROGATE); + &= ~($::UNICODE_DISALLOW_SURROGATE|$::UNICODE_WARN_SURROGATE); } $display_flags = sprintf "0x%x", $this_uvchr_flags; @@ -1088,13 +1002,13 @@ for my $restriction (sort keys %restriction_types) { $this_name .= "($restriction)"; if ($restriction eq "c9strict") { $test - .= ", $UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE"; + .= ", $::UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE"; } elsif ($restriction eq "strict") { - $test .= ", $UTF8_DISALLOW_ILLEGAL_INTERCHANGE"; + $test .= ", $::UTF8_DISALLOW_ILLEGAL_INTERCHANGE"; } elsif ($restriction eq "fits_in_31_bits") { - $test .= ", $UTF8_DISALLOW_ABOVE_31_BIT"; + $test .= ", $::UTF8_DISALLOW_ABOVE_31_BIT"; } else { fail("Internal test error: Unknown restriction " @@ -1168,1420 +1082,6 @@ for my $restriction (sort keys %restriction_types) { } } -my $REPLACEMENT = 0xFFFD; - -# Now test the malformations. All these raise category utf8 warnings. -my @malformations = ( - # ($testname, $bytes, $length, $allow_flags, $expected_error_flags, - # $allowed_uv, $expected_len, $needed_to_discern_len, $message ) - -# Now considered a program bug, and asserted against - #[ "zero length string malformation", "", 0, - # $UTF8_ALLOW_EMPTY, $UTF8_GOT_EMPTY, $REPLACEMENT, 0, 0, - # qr/empty string/ - #], - [ "orphan continuation byte malformation", I8_to_native("${I8c}a"), 2, - $UTF8_ALLOW_CONTINUATION, $UTF8_GOT_CONTINUATION, $REPLACEMENT, - 1, 1, - qr/unexpected continuation byte/ - ], - [ "premature next character malformation (immediate)", - (isASCII) ? "\xc2\xc2\x80" : I8_to_native("\xc5\xc5\xa0"), - 3, - $UTF8_ALLOW_NON_CONTINUATION, $UTF8_GOT_NON_CONTINUATION, $REPLACEMENT, - 1, 2, - qr/unexpected non-continuation byte.*immediately after start byte/ - ], - [ "premature next character malformation (non-immediate)", - I8_to_native("\xef${I8c}a"), 3, - $UTF8_ALLOW_NON_CONTINUATION, $UTF8_GOT_NON_CONTINUATION, $REPLACEMENT, - 2, 3, - qr/unexpected non-continuation byte .* 2 bytes after start byte/ - ], - [ "too short malformation", I8_to_native("\xf1${I8c}a"), 2, - # Having the 'a' after this, but saying there are only 2 bytes also - # tests that we pay attention to the passed in length - $UTF8_ALLOW_SHORT, $UTF8_GOT_SHORT, $REPLACEMENT, - 2, 2, - qr/2 bytes available, need 4/ - ], - [ "overlong malformation, lowest 2-byte", - (isASCII) ? "\xc0\x80" : I8_to_native("\xc0\xa0"), - 2, - $UTF8_ALLOW_LONG, $UTF8_GOT_LONG, - 0, # NUL - 2, 1, - qr/overlong/ - ], - [ "overlong malformation, highest 2-byte", - (isASCII) ? "\xc1\xbf" : I8_to_native("\xc4\xbf"), - 2, - $UTF8_ALLOW_LONG, $UTF8_GOT_LONG, - (isASCII) ? 0x7F : utf8::unicode_to_native(0x9F), - 2, 1, - qr/overlong/ - ], - [ "overlong malformation, lowest 3-byte", - (isASCII) ? "\xe0\x80\x80" : I8_to_native("\xe0\xa0\xa0"), - 3, - $UTF8_ALLOW_LONG, $UTF8_GOT_LONG, - 0, # NUL - 3, (isASCII) ? 2 : 1, - qr/overlong/ - ], - [ "overlong malformation, highest 3-byte", - (isASCII) ? "\xe0\x9f\xbf" : I8_to_native("\xe0\xbf\xbf"), - 3, - $UTF8_ALLOW_LONG, $UTF8_GOT_LONG, - (isASCII) ? 0x7FF : 0x3FF, - 3, (isASCII) ? 2 : 1, - qr/overlong/ - ], - [ "overlong malformation, lowest 4-byte", - (isASCII) ? "\xf0\x80\x80\x80" : I8_to_native("\xf0\xa0\xa0\xa0"), - 4, - $UTF8_ALLOW_LONG, $UTF8_GOT_LONG, - 0, # NUL - 4, 2, - qr/overlong/ - ], - [ "overlong malformation, highest 4-byte", - (isASCII) ? "\xf0\x8F\xbf\xbf" : I8_to_native("\xf0\xaf\xbf\xbf"), - 4, - $UTF8_ALLOW_LONG, $UTF8_GOT_LONG, - (isASCII) ? 0xFFFF : 0x3FFF, - 4, 2, - qr/overlong/ - ], - [ "overlong malformation, lowest 5-byte", - (isASCII) - ? "\xf8\x80\x80\x80\x80" - : I8_to_native("\xf8\xa0\xa0\xa0\xa0"), - 5, - $UTF8_ALLOW_LONG, $UTF8_GOT_LONG, - 0, # NUL - 5, 2, - qr/overlong/ - ], - [ "overlong malformation, highest 5-byte", - (isASCII) - ? "\xf8\x87\xbf\xbf\xbf" - : I8_to_native("\xf8\xa7\xbf\xbf\xbf"), - 5, - $UTF8_ALLOW_LONG, $UTF8_GOT_LONG, - (isASCII) ? 0x1FFFFF : 0x3FFFF, - 5, 2, - qr/overlong/ - ], - [ "overlong malformation, lowest 6-byte", - (isASCII) - ? "\xfc\x80\x80\x80\x80\x80" - : I8_to_native("\xfc\xa0\xa0\xa0\xa0\xa0"), - 6, - $UTF8_ALLOW_LONG, $UTF8_GOT_LONG, - 0, # NUL - 6, 2, - qr/overlong/ - ], - [ "overlong malformation, highest 6-byte", - (isASCII) - ? "\xfc\x83\xbf\xbf\xbf\xbf" - : I8_to_native("\xfc\xa3\xbf\xbf\xbf\xbf"), - 6, - $UTF8_ALLOW_LONG, $UTF8_GOT_LONG, - (isASCII) ? 0x3FFFFFF : 0x3FFFFF, - 6, 2, - qr/overlong/ - ], - [ "overlong malformation, lowest 7-byte", - (isASCII) - ? "\xfe\x80\x80\x80\x80\x80\x80" - : I8_to_native("\xfe\xa0\xa0\xa0\xa0\xa0\xa0"), - 7, - $UTF8_ALLOW_LONG, $UTF8_GOT_LONG, - 0, # NUL - 7, 2, - qr/overlong/ - ], - [ "overlong malformation, highest 7-byte", - (isASCII) - ? "\xfe\x81\xbf\xbf\xbf\xbf\xbf" - : I8_to_native("\xfe\xa1\xbf\xbf\xbf\xbf\xbf"), - 7, - $UTF8_ALLOW_LONG, $UTF8_GOT_LONG, - (isASCII) ? 0x7FFFFFFF : 0x3FFFFFF, - 7, 2, - qr/overlong/ - ], -); - -if (isASCII && ! $is64bit) { # 32-bit ASCII platform - no warnings 'portable'; - push @malformations, - [ "overflow malformation", - "\xfe\x84\x80\x80\x80\x80\x80", # Represents 2**32 - 7, - $UTF8_ALLOW_OVERFLOW, $UTF8_GOT_OVERFLOW, - $REPLACEMENT, - 7, 2, - qr/overflows/ - ], - [ "overflow malformation", - "\xff\x80\x80\x80\x80\x80\x81\x80\x80\x80\x80\x80\x80", - $max_bytes, - $UTF8_ALLOW_OVERFLOW, $UTF8_GOT_OVERFLOW, - $REPLACEMENT, - $max_bytes, 1, - qr/overflows/ - ]; -} -else { # 64-bit ASCII, or EBCDIC of any size. - # On EBCDIC platforms, another overlong test is needed even on 32-bit - # systems, whereas it doesn't happen on ASCII except on 64-bit ones. - - no warnings 'portable'; - no warnings 'overflow'; # Doesn't run on 32-bit systems, but compiles - push @malformations, - [ "overlong malformation, lowest max-byte", - (isASCII) - ? "\xff\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80" - : I8_to_native( - "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"), - $max_bytes, - $UTF8_ALLOW_LONG, $UTF8_GOT_LONG, - 0, # NUL - $max_bytes, (isASCII) ? 7 : 8, - qr/overlong/, - ], - [ "overlong malformation, highest max-byte", - (isASCII) # 2**36-1 on ASCII; 2**30-1 on EBCDIC - ? "\xff\x80\x80\x80\x80\x80\x80\xbf\xbf\xbf\xbf\xbf\xbf" - : I8_to_native( - "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xbf\xbf\xbf\xbf\xbf\xbf"), - $max_bytes, - $UTF8_ALLOW_LONG, $UTF8_GOT_LONG, - (isASCII) ? 0xFFFFFFFFF : 0x3FFFFFFF, - $max_bytes, (isASCII) ? 7 : 8, - qr/overlong/, - ]; - - if (! $is64bit) { # 32-bit EBCDIC - push @malformations, - [ "overflow malformation", - I8_to_native( - "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa4\xa0\xa0\xa0\xa0\xa0\xa0"), - $max_bytes, - $UTF8_ALLOW_OVERFLOW, $UTF8_GOT_OVERFLOW, - $REPLACEMENT, - $max_bytes, 8, - qr/overflows/ - ]; - } - else { # 64-bit, either ASCII or EBCDIC - push @malformations, - [ "overflow malformation", - (isASCII) - ? "\xff\x80\x90\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0" - : I8_to_native( - "\xff\xb0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"), - $max_bytes, - $UTF8_ALLOW_OVERFLOW, $UTF8_GOT_OVERFLOW, - $REPLACEMENT, - $max_bytes, (isASCII) ? 3 : 2, - qr/overflows/ - ]; - } -} - -# For each overlong malformation in the list, we modify it, so that there are -# two tests. The first one returns the replacement character given the input -# flags, and the second test adds a flag that causes the actual code point the -# malformation represents to be returned. -my @added_overlongs; -foreach my $test (@malformations) { - my ($testname, $bytes, $length, $allow_flags, $expected_error_flags, - $allowed_uv, $expected_len, $needed_to_discern_len, $message ) = @$test; - next unless $testname =~ /overlong/; - - $test->[0] .= "; use REPLACEMENT CHAR"; - $test->[5] = $REPLACEMENT; - - push @added_overlongs, - [ $testname . "; use actual value", - $bytes, $length, - $allow_flags | $UTF8_ALLOW_LONG_AND_ITS_VALUE, - $expected_error_flags, $allowed_uv, $expected_len, - $needed_to_discern_len, $message - ]; -} -push @malformations, @added_overlongs; - -foreach my $test (@malformations) { - my ($testname, $bytes, $length, $allow_flags, $expected_error_flags, - $allowed_uv, $expected_len, $needed_to_discern_len, $message ) = @$test; - - if (length($bytes) < $length) { - fail("Internal test error: actual buffer length (" . length($bytes) - . ") must be at least as high as how far we are allowed to read" - . " into it ($length)"); - diag($testname); - next; - } - - undef @warnings; - - my $ret = test_isUTF8_CHAR($bytes, $length); - is($ret, 0, "$testname: isUTF8_CHAR returns 0"); - is(scalar @warnings, 0, "$testname: isUTF8_CHAR() generated no warnings") - or output_warnings(@warnings); - - undef @warnings; - - $ret = test_isUTF8_CHAR_flags($bytes, $length, 0); - is($ret, 0, "$testname: isUTF8_CHAR_flags returns 0"); - is(scalar @warnings, 0, "$testname: isUTF8_CHAR_flags() generated no" - . " warnings") - or output_warnings(@warnings); - - $ret = test_isSTRICT_UTF8_CHAR($bytes, $length); - is($ret, 0, "$testname: isSTRICT_UTF8_CHAR returns 0"); - is(scalar @warnings, 0, - "$testname: isSTRICT_UTF8_CHAR() generated no warnings") - or output_warnings(@warnings); - - $ret = test_isC9_STRICT_UTF8_CHAR($bytes, $length); - is($ret, 0, "$testname: isC9_STRICT_UTF8_CHAR returns 0"); - is(scalar @warnings, 0, - "$testname: isC9_STRICT_UTF8_CHAR() generated no warnings") - or output_warnings(@warnings); - - for my $j (1 .. $length - 1) { - my $partial = substr($bytes, 0, $j); - - undef @warnings; - - $ret = test_is_utf8_valid_partial_char_flags($bytes, $j, 0); - - my $ret_should_be = 0; - my $comment = ""; - if ($j < $needed_to_discern_len) { - $ret_should_be = 1; - $comment = ", but need $needed_to_discern_len bytes to discern:"; - } - - is($ret, $ret_should_be, "$testname: is_utf8_valid_partial_char_flags(" - . display_bytes($partial) - . ")$comment returns $ret_should_be"); - is(scalar @warnings, 0, - "$testname: is_utf8_valid_partial_char_flags() generated" - . " no warnings") - or output_warnings(@warnings); - } - - - # Test what happens when this malformation is not allowed - undef @warnings; - my $ret_ref = test_utf8n_to_uvchr_error($bytes, $length, 0); - is($ret_ref->[0], 0, "$testname: disallowed: Returns 0"); - is($ret_ref->[1], $expected_len, - "$testname: utf8n_to_uvchr_error(), disallowed: Returns expected" - . " length: $expected_len"); - if (is(scalar @warnings, 1, - "$testname: disallowed: Got a single warning ")) - { - like($warnings[0], $message, - "$testname: disallowed: Got expected warning"); - } - else { - if (scalar @warnings) { - output_warnings(@warnings); - } - } - is($ret_ref->[2], $expected_error_flags, - "$testname: utf8n_to_uvchr_error(), disallowed:" - . " Returns expected error"); - - { # Next test when disallowed, and warnings are off. - undef @warnings; - no warnings 'utf8'; - my $ret_ref = test_utf8n_to_uvchr_error($bytes, $length, 0); - is($ret_ref->[0], 0, - "$testname: utf8n_to_uvchr_error(), disallowed: no warnings 'utf8':" - . " Returns 0"); - is($ret_ref->[1], $expected_len, - "$testname: utf8n_to_uvchr_error(), disallowed: no warnings 'utf8':" - . " Returns expected length: $expected_len"); - if (!is(scalar @warnings, 0, - "$testname: utf8n_to_uvchr_error(), disallowed: no warnings 'utf8':" - . " no warnings generated")) - { - output_warnings(@warnings); - } - is($ret_ref->[2], $expected_error_flags, - "$testname: utf8n_to_uvchr_error(), disallowed: Returns" - . " expected error"); - } - - # Test with CHECK_ONLY - undef @warnings; - $ret_ref = test_utf8n_to_uvchr_error($bytes, $length, $UTF8_CHECK_ONLY); - is($ret_ref->[0], 0, "$testname: CHECK_ONLY: Returns 0"); - is($ret_ref->[1], -1, "$testname: CHECK_ONLY: returns -1 for length"); - if (! is(scalar @warnings, 0, - "$testname: CHECK_ONLY: no warnings generated")) - { - output_warnings(@warnings); - } - is($ret_ref->[2], $expected_error_flags, - "$testname: utf8n_to_uvchr_error(), disallowed: Returns expected" - . " error"); - - next if $allow_flags == 0; # Skip if can't allow this malformation - - # Test when the malformation is allowed - undef @warnings; - $ret_ref = test_utf8n_to_uvchr_error($bytes, $length, $allow_flags); - is($ret_ref->[0], $allowed_uv, - "$testname: utf8n_to_uvchr_error(), allowed: Returns expected uv: " - . sprintf("0x%04X", $allowed_uv)); - is($ret_ref->[1], $expected_len, - "$testname: utf8n_to_uvchr_error(), allowed: Returns expected length:" - . " $expected_len"); - if (!is(scalar @warnings, 0, - "$testname: utf8n_to_uvchr_error(), allowed: no warnings" - . " generated")) - { - output_warnings(@warnings); - } - is($ret_ref->[2], $expected_error_flags, - "$testname: utf8n_to_uvchr_error(), disallowed: Returns" - . " expected error"); -} - -sub nonportable_regex ($) { - - # Returns a pattern that matches the non-portable message raised either - # for the specific input code point, or the one generated when there - # is some malformation that precludes the message containing the specific - # code point - - my $code_point = shift; - - my $string = sprintf '(Code point 0x%X is not Unicode, and' - . '|Any UTF-8 sequence that starts with' - . ' "(\\\x[[:xdigit:]]{2})+" is for a' - . ' non-Unicode code point, and is) not portable', - $code_point; - return qr/$string/; -} - -# Now test the cases where a legal code point is generated, but may or may not -# be allowed/warned on. -my @tests = ( - # ($testname, $bytes, $warn_flags, $disallow_flags, $expected_error_flags, - # $category, $allowed_uv, $expected_len, $needed_to_discern_len, $message ) - [ "lowest surrogate", - (isASCII) ? "\xed\xa0\x80" : I8_to_native("\xf1\xb6\xa0\xa0"), - $UTF8_WARN_SURROGATE, $UTF8_DISALLOW_SURROGATE, $UTF8_GOT_SURROGATE, - 'surrogate', 0xD800, - (isASCII) ? 3 : 4, - 2, - qr/surrogate/ - ], - [ "a middle surrogate", - (isASCII) ? "\xed\xa4\x8d" : I8_to_native("\xf1\xb6\xa8\xad"), - $UTF8_WARN_SURROGATE, $UTF8_DISALLOW_SURROGATE, $UTF8_GOT_SURROGATE, - 'surrogate', 0xD90D, - (isASCII) ? 3 : 4, - 2, - qr/surrogate/ - ], - [ "highest surrogate", - (isASCII) ? "\xed\xbf\xbf" : I8_to_native("\xf1\xb7\xbf\xbf"), - $UTF8_WARN_SURROGATE, $UTF8_DISALLOW_SURROGATE, $UTF8_GOT_SURROGATE, - 'surrogate', 0xDFFF, - (isASCII) ? 3 : 4, - 2, - qr/surrogate/ - ], - [ "first non_unicode", - (isASCII) ? "\xf4\x90\x80\x80" : I8_to_native("\xf9\xa2\xa0\xa0\xa0"), - $UTF8_WARN_SUPER, $UTF8_DISALLOW_SUPER, $UTF8_GOT_SUPER, - 'non_unicode', 0x110000, - (isASCII) ? 4 : 5, - 2, - qr/(not Unicode|for a non-Unicode code point).* may not be portable/ - ], - [ "non_unicode whose first byte tells that", - (isASCII) ? "\xf5\x80\x80\x80" : I8_to_native("\xfa\xa0\xa0\xa0\xa0"), - $UTF8_WARN_SUPER, $UTF8_DISALLOW_SUPER, $UTF8_GOT_SUPER, - 'non_unicode', - (isASCII) ? 0x140000 : 0x200000, - (isASCII) ? 4 : 5, - 1, - qr/(not Unicode|for a non-Unicode code point).* may not be portable/ - ], - [ "first of 32 consecutive non-character code points", - (isASCII) ? "\xef\xb7\x90" : I8_to_native("\xf1\xbf\xae\xb0"), - $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR, - 'nonchar', 0xFDD0, - (isASCII) ? 3 : 4, - (isASCII) ? 3 : 4, - qr/Unicode non-character.*is not recommended for open interchange/ - ], - [ "a mid non-character code point of the 32 consecutive ones", - (isASCII) ? "\xef\xb7\xa0" : I8_to_native("\xf1\xbf\xaf\xa0"), - $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR, - 'nonchar', 0xFDE0, - (isASCII) ? 3 : 4, - (isASCII) ? 3 : 4, - qr/Unicode non-character.*is not recommended for open interchange/ - ], - [ "final of 32 consecutive non-character code points", - (isASCII) ? "\xef\xb7\xaf" : I8_to_native("\xf1\xbf\xaf\xaf"), - $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR, - 'nonchar', 0xFDEF, - (isASCII) ? 3 : 4, - (isASCII) ? 3 : 4, - qr/Unicode non-character.*is not recommended for open interchange/ - ], - [ "non-character code point U+FFFE", - (isASCII) ? "\xef\xbf\xbe" : I8_to_native("\xf1\xbf\xbf\xbe"), - $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR, - 'nonchar', 0xFFFE, - (isASCII) ? 3 : 4, - (isASCII) ? 3 : 4, - qr/Unicode non-character.*is not recommended for open interchange/ - ], - [ "non-character code point U+FFFF", - (isASCII) ? "\xef\xbf\xbf" : I8_to_native("\xf1\xbf\xbf\xbf"), - $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR, - 'nonchar', 0xFFFF, - (isASCII) ? 3 : 4, - (isASCII) ? 3 : 4, - qr/Unicode non-character.*is not recommended for open interchange/ - ], - [ "non-character code point U+1FFFE", - (isASCII) ? "\xf0\x9f\xbf\xbe" : I8_to_native("\xf3\xbf\xbf\xbe"), - $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR, - 'nonchar', 0x1FFFE, - 4, 4, - qr/Unicode non-character.*is not recommended for open interchange/ - ], - [ "non-character code point U+1FFFF", - (isASCII) ? "\xf0\x9f\xbf\xbf" : I8_to_native("\xf3\xbf\xbf\xbf"), - $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR, - 'nonchar', 0x1FFFF, - 4, 4, - qr/Unicode non-character.*is not recommended for open interchange/ - ], - [ "non-character code point U+2FFFE", - (isASCII) ? "\xf0\xaf\xbf\xbe" : I8_to_native("\xf5\xbf\xbf\xbe"), - $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR, - 'nonchar', 0x2FFFE, - 4, 4, - qr/Unicode non-character.*is not recommended for open interchange/ - ], - [ "non-character code point U+2FFFF", - (isASCII) ? "\xf0\xaf\xbf\xbf" : I8_to_native("\xf5\xbf\xbf\xbf"), - $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR, - 'nonchar', 0x2FFFF, - 4, 4, - qr/Unicode non-character.*is not recommended for open interchange/ - ], - [ "non-character code point U+3FFFE", - (isASCII) ? "\xf0\xbf\xbf\xbe" : I8_to_native("\xf7\xbf\xbf\xbe"), - $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR, - 'nonchar', 0x3FFFE, - 4, 4, - qr/Unicode non-character.*is not recommended for open interchange/ - ], - [ "non-character code point U+3FFFF", - (isASCII) ? "\xf0\xbf\xbf\xbf" : I8_to_native("\xf7\xbf\xbf\xbf"), - $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR, - 'nonchar', 0x3FFFF, - 4, 4, - qr/Unicode non-character.*is not recommended for open interchange/ - ], - [ "non-character code point U+4FFFE", - (isASCII) ? "\xf1\x8f\xbf\xbe" : I8_to_native("\xf8\xa9\xbf\xbf\xbe"), - $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR, - 'nonchar', 0x4FFFE, - (isASCII) ? 4 : 5, - (isASCII) ? 4 : 5, - qr/Unicode non-character.*is not recommended for open interchange/ - ], - [ "non-character code point U+4FFFF", - (isASCII) ? "\xf1\x8f\xbf\xbf" : I8_to_native("\xf8\xa9\xbf\xbf\xbf"), - $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR, - 'nonchar', 0x4FFFF, - (isASCII) ? 4 : 5, - (isASCII) ? 4 : 5, - qr/Unicode non-character.*is not recommended for open interchange/ - ], - [ "non-character code point U+5FFFE", - (isASCII) ? "\xf1\x9f\xbf\xbe" : I8_to_native("\xf8\xab\xbf\xbf\xbe"), - $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR, - 'nonchar', 0x5FFFE, - (isASCII) ? 4 : 5, - (isASCII) ? 4 : 5, - qr/Unicode non-character.*is not recommended for open interchange/ - ], - [ "non-character code point U+5FFFF", - (isASCII) ? "\xf1\x9f\xbf\xbf" : I8_to_native("\xf8\xab\xbf\xbf\xbf"), - $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR, - 'nonchar', 0x5FFFF, - (isASCII) ? 4 : 5, - (isASCII) ? 4 : 5, - qr/Unicode non-character.*is not recommended for open interchange/ - ], - [ "non-character code point U+6FFFE", - (isASCII) ? "\xf1\xaf\xbf\xbe" : I8_to_native("\xf8\xad\xbf\xbf\xbe"), - $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR, - 'nonchar', 0x6FFFE, - (isASCII) ? 4 : 5, - (isASCII) ? 4 : 5, - qr/Unicode non-character.*is not recommended for open interchange/ - ], - [ "non-character code point U+6FFFF", - (isASCII) ? "\xf1\xaf\xbf\xbf" : I8_to_native("\xf8\xad\xbf\xbf\xbf"), - $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR, - 'nonchar', 0x6FFFF, - (isASCII) ? 4 : 5, - (isASCII) ? 4 : 5, - qr/Unicode non-character.*is not recommended for open interchange/ - ], - [ "non-character code point U+7FFFE", - (isASCII) ? "\xf1\xbf\xbf\xbe" : I8_to_native("\xf8\xaf\xbf\xbf\xbe"), - $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR, - 'nonchar', 0x7FFFE, - (isASCII) ? 4 : 5, - (isASCII) ? 4 : 5, - qr/Unicode non-character.*is not recommended for open interchange/ - ], - [ "non-character code point U+7FFFF", - (isASCII) ? "\xf1\xbf\xbf\xbf" : I8_to_native("\xf8\xaf\xbf\xbf\xbf"), - $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR, - 'nonchar', 0x7FFFF, - (isASCII) ? 4 : 5, - (isASCII) ? 4 : 5, - qr/Unicode non-character.*is not recommended for open interchange/ - ], - [ "non-character code point U+8FFFE", - (isASCII) ? "\xf2\x8f\xbf\xbe" : I8_to_native("\xf8\xb1\xbf\xbf\xbe"), - $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR, - 'nonchar', 0x8FFFE, - (isASCII) ? 4 : 5, - (isASCII) ? 4 : 5, - qr/Unicode non-character.*is not recommended for open interchange/ - ], - [ "non-character code point U+8FFFF", - (isASCII) ? "\xf2\x8f\xbf\xbf" : I8_to_native("\xf8\xb1\xbf\xbf\xbf"), - $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR, - 'nonchar', 0x8FFFF, - (isASCII) ? 4 : 5, - (isASCII) ? 4 : 5, - qr/Unicode non-character.*is not recommended for open interchange/ - ], - [ "non-character code point U+9FFFE", - (isASCII) ? "\xf2\x9f\xbf\xbe" : I8_to_native("\xf8\xb3\xbf\xbf\xbe"), - $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR, - 'nonchar', 0x9FFFE, - (isASCII) ? 4 : 5, - (isASCII) ? 4 : 5, - qr/Unicode non-character.*is not recommended for open interchange/ - ], - [ "non-character code point U+9FFFF", - (isASCII) ? "\xf2\x9f\xbf\xbf" : I8_to_native("\xf8\xb3\xbf\xbf\xbf"), - $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR, - 'nonchar', 0x9FFFF, - (isASCII) ? 4 : 5, - (isASCII) ? 4 : 5, - qr/Unicode non-character.*is not recommended for open interchange/ - ], - [ "non-character code point U+AFFFE", - (isASCII) ? "\xf2\xaf\xbf\xbe" : I8_to_native("\xf8\xb5\xbf\xbf\xbe"), - $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR, - 'nonchar', 0xAFFFE, - (isASCII) ? 4 : 5, - (isASCII) ? 4 : 5, - qr/Unicode non-character.*is not recommended for open interchange/ - ], - [ "non-character code point U+AFFFF", - (isASCII) ? "\xf2\xaf\xbf\xbf" : I8_to_native("\xf8\xb5\xbf\xbf\xbf"), - $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR, - 'nonchar', 0xAFFFF, - (isASCII) ? 4 : 5, - (isASCII) ? 4 : 5, - qr/Unicode non-character.*is not recommended for open interchange/ - ], - [ "non-character code point U+BFFFE", - (isASCII) ? "\xf2\xbf\xbf\xbe" : I8_to_native("\xf8\xb7\xbf\xbf\xbe"), - $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR, - 'nonchar', 0xBFFFE, - (isASCII) ? 4 : 5, - (isASCII) ? 4 : 5, - qr/Unicode non-character.*is not recommended for open interchange/ - ], - [ "non-character code point U+BFFFF", - (isASCII) ? "\xf2\xbf\xbf\xbf" : I8_to_native("\xf8\xb7\xbf\xbf\xbf"), - $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR, - 'nonchar', 0xBFFFF, - (isASCII) ? 4 : 5, - (isASCII) ? 4 : 5, - qr/Unicode non-character.*is not recommended for open interchange/ - ], - [ "non-character code point U+CFFFE", - (isASCII) ? "\xf3\x8f\xbf\xbe" : I8_to_native("\xf8\xb9\xbf\xbf\xbe"), - $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR, - 'nonchar', 0xCFFFE, - (isASCII) ? 4 : 5, - (isASCII) ? 4 : 5, - qr/Unicode non-character.*is not recommended for open interchange/ - ], - [ "non-character code point U+CFFFF", - (isASCII) ? "\xf3\x8f\xbf\xbf" : I8_to_native("\xf8\xb9\xbf\xbf\xbf"), - $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR, - 'nonchar', 0xCFFFF, - (isASCII) ? 4 : 5, - (isASCII) ? 4 : 5, - qr/Unicode non-character.*is not recommended for open interchange/ - ], - [ "non-character code point U+DFFFE", - (isASCII) ? "\xf3\x9f\xbf\xbe" : I8_to_native("\xf8\xbb\xbf\xbf\xbe"), - $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR, - 'nonchar', 0xDFFFE, - (isASCII) ? 4 : 5, - (isASCII) ? 4 : 5, - qr/Unicode non-character.*is not recommended for open interchange/ - ], - [ "non-character code point U+DFFFF", - (isASCII) ? "\xf3\x9f\xbf\xbf" : I8_to_native("\xf8\xbb\xbf\xbf\xbf"), - $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR, - 'nonchar', 0xDFFFF, - (isASCII) ? 4 : 5, - (isASCII) ? 4 : 5, - qr/Unicode non-character.*is not recommended for open interchange/ - ], - [ "non-character code point U+EFFFE", - (isASCII) ? "\xf3\xaf\xbf\xbe" : I8_to_native("\xf8\xbd\xbf\xbf\xbe"), - $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR, - 'nonchar', 0xEFFFE, - (isASCII) ? 4 : 5, - (isASCII) ? 4 : 5, - qr/Unicode non-character.*is not recommended for open interchange/ - ], - [ "non-character code point U+EFFFF", - (isASCII) ? "\xf3\xaf\xbf\xbf" : I8_to_native("\xf8\xbd\xbf\xbf\xbf"), - $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR, - 'nonchar', 0xEFFFF, - (isASCII) ? 4 : 5, - (isASCII) ? 4 : 5, - qr/Unicode non-character.*is not recommended for open interchange/ - ], - [ "non-character code point U+FFFFE", - (isASCII) ? "\xf3\xbf\xbf\xbe" : I8_to_native("\xf8\xbf\xbf\xbf\xbe"), - $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR, - 'nonchar', 0xFFFFE, - (isASCII) ? 4 : 5, - (isASCII) ? 4 : 5, - qr/Unicode non-character.*is not recommended for open interchange/ - ], - [ "non-character code point U+FFFFF", - (isASCII) ? "\xf3\xbf\xbf\xbf" : I8_to_native("\xf8\xbf\xbf\xbf\xbf"), - $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR, - 'nonchar', 0xFFFFF, - (isASCII) ? 4 : 5, - (isASCII) ? 4 : 5, - qr/Unicode non-character.*is not recommended for open interchange/ - ], - [ "non-character code point U+10FFFE", - (isASCII) ? "\xf4\x8f\xbf\xbe" : I8_to_native("\xf9\xa1\xbf\xbf\xbe"), - $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR, - 'nonchar', 0x10FFFE, - (isASCII) ? 4 : 5, - (isASCII) ? 4 : 5, - qr/Unicode non-character.*is not recommended for open interchange/ - ], - [ "non-character code point U+10FFFF", - (isASCII) ? "\xf4\x8f\xbf\xbf" : I8_to_native("\xf9\xa1\xbf\xbf\xbf"), - $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR, - 'nonchar', 0x10FFFF, - (isASCII) ? 4 : 5, - (isASCII) ? 4 : 5, - qr/Unicode non-character.*is not recommended for open interchange/ - ], - [ "requires at least 32 bits", - (isASCII) - ? "\xfe\x82\x80\x80\x80\x80\x80" - : I8_to_native( - "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0"), - # This code point is chosen so that it is representable in a UV on - # 32-bit machines - $UTF8_WARN_ABOVE_31_BIT, $UTF8_DISALLOW_ABOVE_31_BIT, - $UTF8_GOT_ABOVE_31_BIT, - 'utf8', 0x80000000, - (isASCII) ? 7 : $max_bytes, - (isASCII) ? 1 : 8, - nonportable_regex(0x80000000) - ], - [ "highest 32 bit code point", - (isASCII) - ? "\xfe\x83\xbf\xbf\xbf\xbf\xbf" - : I8_to_native( - "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa3\xbf\xbf\xbf\xbf\xbf\xbf"), - $UTF8_WARN_ABOVE_31_BIT, $UTF8_DISALLOW_ABOVE_31_BIT, - $UTF8_GOT_ABOVE_31_BIT, - 'utf8', 0xFFFFFFFF, - (isASCII) ? 7 : $max_bytes, - (isASCII) ? 1 : 8, - nonportable_regex(0xffffffff) - ], - [ "requires at least 32 bits, and use SUPER-type flags, instead of" - . " ABOVE_31_BIT", - (isASCII) - ? "\xfe\x82\x80\x80\x80\x80\x80" - : I8_to_native( - "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0"), - $UTF8_WARN_SUPER, $UTF8_DISALLOW_SUPER, $UTF8_GOT_SUPER, - 'utf8', 0x80000000, - (isASCII) ? 7 : $max_bytes, - 1, - nonportable_regex(0x80000000) - ], - [ "overflow with warnings/disallow for more than 31 bits", - # This tests the interaction of WARN_ABOVE_31_BIT/DISALLOW_ABOVE_31_BIT - # with overflow. The overflow malformation is never allowed, so - # preventing it takes precedence if the ABOVE_31_BIT options would - # otherwise allow in an overflowing value. The ASCII code points (1 - # for 32-bits; 1 for 64) were chosen because the old overflow - # detection algorithm did not catch them; this means this test also - # checks for that fix. The EBCDIC are arbitrary overflowing ones - # since we have no reports of failures with it. - (($is64bit) - ? ((isASCII) - ? "\xff\x80\x90\x90\x90\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf" - : I8_to_native( - "\xff\xB0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0")) - : ((isASCII) - ? "\xfe\x86\x80\x80\x80\x80\x80" - : I8_to_native( - "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa4\xa0\xa0\xa0\xa0\xa0\xa0"))), - $UTF8_WARN_ABOVE_31_BIT, - $UTF8_DISALLOW_ABOVE_31_BIT, - $UTF8_GOT_ABOVE_31_BIT, - 'utf8', 0, - (! isASCII || $is64bit) ? $max_bytes : 7, - (isASCII || $is64bit) ? 2 : 8, - qr/overflows/ - ], -); - -if (! $is64bit) { - if (isASCII) { - no warnings qw{portable overflow}; - push @tests, - [ "Lowest 33 bit code point: overflow", - "\xFE\x84\x80\x80\x80\x80\x80", - $UTF8_WARN_ABOVE_31_BIT, $UTF8_DISALLOW_ABOVE_31_BIT, - $UTF8_GOT_ABOVE_31_BIT, - 'utf8', 0x100000000, - 7, 1, - qr/and( is)? not portable/ - ]; - } -} -else { - no warnings qw{portable overflow}; - push @tests, - [ "More than 32 bits", - (isASCII) - ? "\xff\x80\x80\x80\x80\x80\x81\x80\x80\x80\x80\x80\x80" - : I8_to_native( - "\xff\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0\xa0"), - $UTF8_WARN_ABOVE_31_BIT, $UTF8_DISALLOW_ABOVE_31_BIT, - $UTF8_GOT_ABOVE_31_BIT, - 'utf8', 0x1000000000, - $max_bytes, (isASCII) ? 1 : 7, - qr/and( is)? not portable/ - ]; - if (! isASCII) { - push @tests, # These could falsely show wrongly in a naive - # implementation - [ "requires at least 32 bits", - I8_to_native( - "\xff\xa0\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0"), - $UTF8_WARN_ABOVE_31_BIT,$UTF8_DISALLOW_ABOVE_31_BIT, - $UTF8_GOT_ABOVE_31_BIT, - 'utf8', 0x800000000, - $max_bytes, 7, - nonportable_regex(0x80000000) - ], - [ "requires at least 32 bits", - I8_to_native( - "\xff\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"), - $UTF8_WARN_ABOVE_31_BIT,$UTF8_DISALLOW_ABOVE_31_BIT, - $UTF8_GOT_ABOVE_31_BIT, - 'utf8', 0x10000000000, - $max_bytes, 6, - nonportable_regex(0x10000000000) - ], - [ "requires at least 32 bits", - I8_to_native( - "\xff\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"), - $UTF8_WARN_ABOVE_31_BIT,$UTF8_DISALLOW_ABOVE_31_BIT, - $UTF8_GOT_ABOVE_31_BIT, - 'utf8', 0x200000000000, - $max_bytes, 5, - nonportable_regex(0x20000000000) - ], - [ "requires at least 32 bits", - I8_to_native( - "\xff\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"), - $UTF8_WARN_ABOVE_31_BIT,$UTF8_DISALLOW_ABOVE_31_BIT, - $UTF8_GOT_ABOVE_31_BIT, - 'utf8', 0x4000000000000, - $max_bytes, 4, - nonportable_regex(0x4000000000000) - ], - [ "requires at least 32 bits", - I8_to_native( - "\xff\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"), - $UTF8_WARN_ABOVE_31_BIT,$UTF8_DISALLOW_ABOVE_31_BIT, - $UTF8_GOT_ABOVE_31_BIT, - 'utf8', 0x80000000000000, - $max_bytes, 3, - nonportable_regex(0x80000000000000) - ], - [ "requires at least 32 bits", - I8_to_native( - "\xff\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"), - $UTF8_WARN_ABOVE_31_BIT,$UTF8_DISALLOW_ABOVE_31_BIT, - $UTF8_GOT_ABOVE_31_BIT, - 'utf8', 0x1000000000000000, - $max_bytes, 2, - nonportable_regex(0x1000000000000000) - ]; - } -} - -foreach my $test (@tests) { - my ($testname, $bytes, $warn_flags, $disallow_flags, $expected_error_flags, - $category, $allowed_uv, $expected_len, $needed_to_discern_len, $message - ) = @$test; - - my $length = length $bytes; - my $will_overflow = $testname =~ /overflow/ ? 'overflow' : ""; - - { - use warnings; - undef @warnings; - my $ret = test_isUTF8_CHAR($bytes, $length); - my $ret_flags = test_isUTF8_CHAR_flags($bytes, $length, 0); - if ($will_overflow) { - is($ret, 0, "isUTF8_CHAR() $testname: returns 0"); - is($ret_flags, 0, "isUTF8_CHAR_flags() $testname: returns 0"); - } - else { - is($ret, $length, - "isUTF8_CHAR() $testname: returns expected length: $length"); - is($ret_flags, $length, "isUTF8_CHAR_flags(...,0) $testname:" - . " returns expected length: $length"); - } - is(scalar @warnings, 0, - "isUTF8_CHAR() and isUTF8_CHAR()_flags $testname: generated" - . " no warnings") - or output_warnings(@warnings); - - undef @warnings; - $ret = test_isSTRICT_UTF8_CHAR($bytes, $length); - if ($will_overflow) { - is($ret, 0, "isSTRICT_UTF8_CHAR() $testname: returns 0"); - } - else { - my $expected_ret = ( $testname =~ /surrogate|non-character/ - || $allowed_uv > 0x10FFFF) - ? 0 - : $length; - is($ret, $expected_ret, "isSTRICT_UTF8_CHAR() $testname: returns" - . " expected length: $expected_ret"); - $ret = test_isUTF8_CHAR_flags($bytes, $length, - $UTF8_DISALLOW_ILLEGAL_INTERCHANGE); - is($ret, $expected_ret, - "isUTF8_CHAR_flags('DISALLOW_ILLEGAL_INTERCHANGE')" - . " acts like isSTRICT_UTF8_CHAR"); - } - is(scalar @warnings, 0, - "isSTRICT_UTF8_CHAR() and isUTF8_CHAR_flags $testname:" - . " generated no warnings") - or output_warnings(@warnings); - - undef @warnings; - $ret = test_isC9_STRICT_UTF8_CHAR($bytes, $length); - if ($will_overflow) { - is($ret, 0, "isC9_STRICT_UTF8_CHAR() $testname: returns 0"); - } - else { - my $expected_ret = ( $testname =~ /surrogate/ - || $allowed_uv > 0x10FFFF) - ? 0 - : $length; - is($ret, $expected_ret, "isC9_STRICT_UTF8_CHAR() $testname:" - ." returns expected length: $expected_ret"); - $ret = test_isUTF8_CHAR_flags($bytes, $length, - $UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE); - is($ret, $expected_ret, - "isUTF8_CHAR_flags('DISALLOW_ILLEGAL_C9_INTERCHANGE')" - . " acts like isC9_STRICT_UTF8_CHAR"); - } - is(scalar @warnings, 0, - "isC9_STRICT_UTF8_CHAR() and isUTF8_CHAR_flags $testname:" - . " generated no warnings") - or output_warnings(@warnings); - - # Test partial character handling, for each byte not a full character - for my $j (1.. $length - 1) { - - # Skip the test for the interaction between overflow and above-31 - # bit. It is really testing other things than the partial - # character tests, for which other tests in this file are - # sufficient - last if $testname =~ /overflow/; - - foreach my $disallow_flag (0, $disallow_flags) { - my $partial = substr($bytes, 0, $j); - my $ret_should_be; - my $comment; - if ($disallow_flag) { - $ret_should_be = 0; - $comment = "disallowed"; - if ($j < $needed_to_discern_len) { - $ret_should_be = 1; - $comment .= ", but need $needed_to_discern_len bytes" - . " to discern:"; - } - } - else { - $ret_should_be = 1; - $comment = "allowed"; - } - - undef @warnings; - - $ret = test_is_utf8_valid_partial_char_flags($partial, $j, - $disallow_flag); - is($ret, $ret_should_be, - "$testname: is_utf8_valid_partial_char_flags(" - . display_bytes($partial) - . "), $comment: returns $ret_should_be"); - is(scalar @warnings, 0, - "$testname: is_utf8_valid_partial_char_flags()" - . " generated no warnings") - or output_warnings(@warnings); - } - } - } - - # This is more complicated than the malformations tested earlier, as there - # are several orthogonal variables involved. We test all the subclasses - # of utf8 warnings to verify they work with and without the utf8 class, - # and don't have effects on other sublass warnings - foreach my $warning ('utf8', 'surrogate', 'nonchar', 'non_unicode') { - foreach my $warn_flag (0, $warn_flags) { - foreach my $disallow_flag (0, $disallow_flags) { - foreach my $do_warning (0, 1) { - - # We try each of the above with various combinations of - # malformations that can occur on the same input sequence. - foreach my $short ("", "short") { - foreach my $unexpected_noncont ("", - "unexpected non-continuation") - { - foreach my $overlong ("", "overlong") { - - # If we're already at the longest possible, we - # can't create an overlong (which would be longer) - # can't handle anything larger. - next if $overlong && $expected_len >= $max_bytes; - - my @malformations; - my @expected_errors; - push @malformations, $short if $short; - push @malformations, $unexpected_noncont - if $unexpected_noncont; - push @malformations, $overlong if $overlong; - - # The overflow malformation test in the input - # array is coerced into being treated like one of - # the others. - if ($will_overflow) { - push @malformations, 'overflow'; - push @expected_errors, $UTF8_GOT_OVERFLOW; - } - - my $malformations_name = join "/", @malformations; - $malformations_name .= " malformation" - if $malformations_name; - $malformations_name .= "s" if @malformations > 1; - my $this_bytes = $bytes; - my $this_length = $length; - my $expected_uv = $allowed_uv; - my $this_expected_len = $expected_len; - my $this_needed_to_discern_len = $needed_to_discern_len; - if ($malformations_name) { - $expected_uv = 0; - - # Coerce the input into the desired - # malformation - if ($malformations_name =~ /overlong/) { - - # For an overlong, we convert the original - # start byte into a continuation byte with - # the same data bits as originally. ... - substr($this_bytes, 0, 1) - = start_byte_to_cont(substr($this_bytes, - 0, 1)); - - # ... Then we prepend it with a known - # overlong sequence. This should evaluate - # to the exact same code point as the - # original. - $this_bytes - = I8_to_native("\xff") - . (I8_to_native(chr $first_continuation) - x ( $max_bytes - 1 - length($this_bytes))) - . $this_bytes; - $this_length = length($this_bytes); - $this_needed_to_discern_len - = $max_bytes - ($this_expected_len - - $this_needed_to_discern_len); - $this_expected_len = $max_bytes; - push @expected_errors, $UTF8_GOT_LONG; - } - if ($malformations_name =~ /short/) { - - # Just tell the test to not look far - # enough into the input. - $this_length--; - $this_expected_len--; - push @expected_errors, $UTF8_GOT_SHORT; - } - if ($malformations_name - =~ /non-continuation/) - { - # Change the final continuation byte into - # a non one. - my $pos = ($short) ? -2 : -1; - substr($this_bytes, $pos, 1) = '?'; - $this_expected_len--; - push @expected_errors, - $UTF8_GOT_NON_CONTINUATION; - } - } - - my $eval_warn = $do_warning - ? "use warnings '$warning'" - : $warning eq "utf8" - ? "no warnings 'utf8'" - : ( "use warnings 'utf8';" - . " no warnings '$warning'"); - - # Is effectively disallowed if we've set up a - # malformation, even if the flag indicates it is - # allowed. Fix up test name to indicate this as - # well - my $disallowed = $disallow_flag - || $malformations_name; - my $this_name = "utf8n_to_uvchr_error() $testname: " - . (($disallow_flag) - ? 'disallowed' - : $disallowed - ? $disallowed - : 'allowed'); - $this_name .= ", $eval_warn"; - $this_name .= ", " . (($warn_flag) - ? 'with warning flag' - : 'no warning flag'); - - undef @warnings; - my $ret_ref; - my $display_bytes = display_bytes($this_bytes); - my $call = " Call was: $eval_warn; \$ret_ref" - . " = test_utf8n_to_uvchr_error(" - . "'$display_bytes', $this_length," - . "$warn_flag" - . "|$disallow_flag)"; - my $eval_text = "$eval_warn; \$ret_ref" - . " = test_utf8n_to_uvchr_error(" - . "'$this_bytes'," - . " $this_length, $warn_flag" - . "|$disallow_flag)"; - eval "$eval_text"; - if (! ok ("$@ eq ''", - "$this_name: eval succeeded")) - { - diag "\$!='$!'; eval'd=\"$call\""; - next; - } - if ($disallowed) { - is($ret_ref->[0], 0, "$this_name: Returns 0") - or diag $call; - } - else { - is($ret_ref->[0], $expected_uv, - "$this_name: Returns expected uv: " - . sprintf("0x%04X", $expected_uv)) - or diag $call; - } - is($ret_ref->[1], $this_expected_len, - "$this_name: Returns expected length:" - . " $this_expected_len") - or diag $call; - - my $errors = $ret_ref->[2]; - - for (my $i = @expected_errors - 1; $i >= 0; $i--) { - if (ok($expected_errors[$i] & $errors, - "Expected and got error bit return" - . " for $malformations[$i] malformation")) - { - $errors &= ~$expected_errors[$i]; - } - splice @expected_errors, $i, 1; - } - is(scalar @expected_errors, 0, - "Got all the expected malformation errors") - or diag Dumper \@expected_errors; - - if ( $this_expected_len >= $this_needed_to_discern_len - && ($warn_flag || $disallow_flag)) - { - is($errors, $expected_error_flags, - "Got the correct error flag") - or diag $call; - } - else { - is($errors, 0, "Got no other error flag"); - } - - if (@malformations) { - if (! $do_warning && $warning eq 'utf8') { - goto no_warnings_expected; - } - - # Check that each malformation generates a - # warning, removing that warning if found - MALFORMATION: - foreach my $malformation (@malformations) { - foreach (my $i = 0; $i < @warnings; $i++) { - if ($warnings[$i] =~ /$malformation/) { - pass("Expected and got" - . "'$malformation' warning"); - splice @warnings, $i, 1; - next MALFORMATION; - } - } - fail("Expected '$malformation' warning" - . " but didn't get it"); - - } - } - - # Any overflow will override any super or above-31 - # warnings. - goto no_warnings_expected - if $will_overflow || $this_expected_len - < $this_needed_to_discern_len; - - if ( ! $do_warning - && ( $warning eq 'utf8' - || $warning eq $category)) - { - goto no_warnings_expected; - } - elsif ($warn_flag) { - if (is(scalar @warnings, 1, - "$this_name: Got a single warning ")) - { - like($warnings[0], $message, - "$this_name: Got expected warning") - or diag $call; - } - else { - diag $call; - if (scalar @warnings) { - output_warnings(@warnings); - } - } - } - else { - no_warnings_expected: - unless (is(scalar @warnings, 0, - "$this_name: Got no warnings")) - { - diag $call; - output_warnings(@warnings); - } - } - - # Check CHECK_ONLY results when the input is - # disallowed. Do this when actually disallowed, - # not just when the $disallow_flag is set - if ($disallowed) { - undef @warnings; - $ret_ref = test_utf8n_to_uvchr_error( - $this_bytes, $this_length, - $disallow_flag|$UTF8_CHECK_ONLY); - is($ret_ref->[0], 0, - "$this_name, CHECK_ONLY: Returns 0") - or diag $call; - is($ret_ref->[1], -1, - "$this_name: CHECK_ONLY: returns -1 for length") - or diag $call; - if (! is(scalar @warnings, 0, - "$this_name, CHECK_ONLY: no warnings" - . " generated")) - { - diag $call; - output_warnings(@warnings); - } - } - - # Now repeat some of the above, but for - # uvchr_to_utf8_flags(). Since this comes from an - # existing code point, it hasn't overflowed, and - # isn't malformed. - next if @malformations; - - # The warning and disallow flags passed in are for - # utf8n_to_uvchr_error(). Convert them for - # uvchr_to_utf8_flags(). - my $uvchr_warn_flag = 0; - my $uvchr_disallow_flag = 0; - if ($warn_flag) { - if ($warn_flag == $UTF8_WARN_SURROGATE) { - $uvchr_warn_flag = $UNICODE_WARN_SURROGATE - } - elsif ($warn_flag == $UTF8_WARN_NONCHAR) { - $uvchr_warn_flag = $UNICODE_WARN_NONCHAR - } - elsif ($warn_flag == $UTF8_WARN_SUPER) { - $uvchr_warn_flag = $UNICODE_WARN_SUPER - } - elsif ($warn_flag == $UTF8_WARN_ABOVE_31_BIT) { - $uvchr_warn_flag - = $UNICODE_WARN_ABOVE_31_BIT; - } - else { - fail(sprintf "Unexpected warn flag: %x", - $warn_flag); - next; - } - } - if ($disallow_flag) { - if ($disallow_flag == $UTF8_DISALLOW_SURROGATE) - { - $uvchr_disallow_flag - = $UNICODE_DISALLOW_SURROGATE; - } - elsif ($disallow_flag == $UTF8_DISALLOW_NONCHAR) - { - $uvchr_disallow_flag - = $UNICODE_DISALLOW_NONCHAR; - } - elsif ($disallow_flag == $UTF8_DISALLOW_SUPER) { - $uvchr_disallow_flag - = $UNICODE_DISALLOW_SUPER; - } - elsif ($disallow_flag - == $UTF8_DISALLOW_ABOVE_31_BIT) - { - $uvchr_disallow_flag = - $UNICODE_DISALLOW_ABOVE_31_BIT; - } - else { - fail(sprintf "Unexpected disallow flag: %x", - $disallow_flag); - next; - } - } - - $disallowed = $uvchr_disallow_flag; - - $this_name = "uvchr_to_utf8_flags() $testname: " - . (($uvchr_disallow_flag) - ? 'disallowed' - : ($disallowed) - ? 'ABOVE_31_BIT allowed' - : 'allowed'); - $this_name .= ", $eval_warn"; - $this_name .= ", " . (($uvchr_warn_flag) - ? 'with warning flag' - : 'no warning flag'); - - undef @warnings; - my $ret; - my $warn_flag = sprintf "0x%x", $uvchr_warn_flag; - my $disallow_flag = sprintf "0x%x", - $uvchr_disallow_flag; - $call = sprintf(" Call was: $eval_warn; \$ret" - . " = test_uvchr_to_utf8_flags(" - . " 0x%x, $warn_flag|$disallow_flag)", - $allowed_uv); - $eval_text = "$eval_warn; \$ret =" - . " test_uvchr_to_utf8_flags(" - . "$allowed_uv, $warn_flag|" - . "$disallow_flag)"; - eval "$eval_text"; - if (! ok ("$@ eq ''", "$this_name: eval succeeded")) - { - diag "\$!='$!'; eval'd=\"$eval_text\""; - next; - } - if ($disallowed) { - is($ret, undef, "$this_name: Returns undef") - or diag $call; - } - else { - is($ret, $bytes, "$this_name: Returns expected string") - or diag $call; - } - if (! $do_warning - && ($warning eq 'utf8' || $warning eq $category)) - { - if (!is(scalar @warnings, 0, - "$this_name: No warnings generated")) - { - diag $call; - output_warnings(@warnings); - } - } - elsif ( $uvchr_warn_flag - && ( $warning eq 'utf8' - || $warning eq $category)) - { - if (is(scalar @warnings, 1, - "$this_name: Got a single warning ")) - { - like($warnings[0], $message, - "$this_name: Got expected warning") - or diag $call; - } - else { - diag $call; - output_warnings(@warnings) - if scalar @warnings; - } - } - } - } - } - } - } - } - } -} - SKIP: { isASCII diff --git a/ext/XS-APItest/t/utf8_malformed.t b/ext/XS-APItest/t/utf8_malformed.t new file mode 100644 index 0000000000..16c5b7f437 --- /dev/null +++ b/ext/XS-APItest/t/utf8_malformed.t @@ -0,0 +1,418 @@ +#!perl -w + +# Test handling of various UTF-8 malformations + +use strict; +use Test::More; + +BEGIN { + use_ok('XS::APItest'); + require 'charset_tools.pl'; + require './t/utf8_setup.pl'; +}; + +$|=1; + +no warnings 'deprecated'; # Some of the below are above IV_MAX on 32 bit + # machines, and that is tested elsewhere + +use XS::APItest; + +my @warnings; + +use warnings 'utf8'; +local $SIG{__WARN__} = sub { push @warnings, @_ }; + +my $I8c = $::I8c; + +my $REPLACEMENT = 0xFFFD; + +# Now test the malformations. All these raise category utf8 warnings. +my @malformations = ( + # ($testname, $bytes, $length, $allow_flags, $expected_error_flags, + # $allowed_uv, $expected_len, $needed_to_discern_len, $message ) + +# Now considered a program bug, and asserted against + #[ "zero length string malformation", "", 0, + # $::UTF8_ALLOW_EMPTY, $::UTF8_GOT_EMPTY, $REPLACEMENT, 0, 0, + # qr/empty string/ + #], + [ "orphan continuation byte malformation", I8_to_native("${I8c}a"), 2, + $::UTF8_ALLOW_CONTINUATION, $::UTF8_GOT_CONTINUATION, $REPLACEMENT, + 1, 1, + qr/unexpected continuation byte/ + ], + [ "premature next character malformation (immediate)", + (isASCII) ? "\xc2\xc2\x80" : I8_to_native("\xc5\xc5\xa0"), + 3, + $::UTF8_ALLOW_NON_CONTINUATION, $::UTF8_GOT_NON_CONTINUATION, $REPLACEMENT, + 1, 2, + qr/unexpected non-continuation byte.*immediately after start byte/ + ], + [ "premature next character malformation (non-immediate)", + I8_to_native("\xef${I8c}a"), 3, + $::UTF8_ALLOW_NON_CONTINUATION, $::UTF8_GOT_NON_CONTINUATION, $REPLACEMENT, + 2, 3, + qr/unexpected non-continuation byte .* 2 bytes after start byte/ + ], + [ "too short malformation", I8_to_native("\xf1${I8c}a"), 2, + # Having the 'a' after this, but saying there are only 2 bytes also + # tests that we pay attention to the passed in length + $::UTF8_ALLOW_SHORT, $::UTF8_GOT_SHORT, $REPLACEMENT, + 2, 2, + qr/2 bytes available, need 4/ + ], + [ "overlong malformation, lowest 2-byte", + (isASCII) ? "\xc0\x80" : I8_to_native("\xc0\xa0"), + 2, + $::UTF8_ALLOW_LONG, $::UTF8_GOT_LONG, + 0, # NUL + 2, 1, + qr/overlong/ + ], + [ "overlong malformation, highest 2-byte", + (isASCII) ? "\xc1\xbf" : I8_to_native("\xc4\xbf"), + 2, + $::UTF8_ALLOW_LONG, $::UTF8_GOT_LONG, + (isASCII) ? 0x7F : utf8::unicode_to_native(0x9F), + 2, 1, + qr/overlong/ + ], + [ "overlong malformation, lowest 3-byte", + (isASCII) ? "\xe0\x80\x80" : I8_to_native("\xe0\xa0\xa0"), + 3, + $::UTF8_ALLOW_LONG, $::UTF8_GOT_LONG, + 0, # NUL + 3, (isASCII) ? 2 : 1, + qr/overlong/ + ], + [ "overlong malformation, highest 3-byte", + (isASCII) ? "\xe0\x9f\xbf" : I8_to_native("\xe0\xbf\xbf"), + 3, + $::UTF8_ALLOW_LONG, $::UTF8_GOT_LONG, + (isASCII) ? 0x7FF : 0x3FF, + 3, (isASCII) ? 2 : 1, + qr/overlong/ + ], + [ "overlong malformation, lowest 4-byte", + (isASCII) ? "\xf0\x80\x80\x80" : I8_to_native("\xf0\xa0\xa0\xa0"), + 4, + $::UTF8_ALLOW_LONG, $::UTF8_GOT_LONG, + 0, # NUL + 4, 2, + qr/overlong/ + ], + [ "overlong malformation, highest 4-byte", + (isASCII) ? "\xf0\x8F\xbf\xbf" : I8_to_native("\xf0\xaf\xbf\xbf"), + 4, + $::UTF8_ALLOW_LONG, $::UTF8_GOT_LONG, + (isASCII) ? 0xFFFF : 0x3FFF, + 4, 2, + qr/overlong/ + ], + [ "overlong malformation, lowest 5-byte", + (isASCII) + ? "\xf8\x80\x80\x80\x80" + : I8_to_native("\xf8\xa0\xa0\xa0\xa0"), + 5, + $::UTF8_ALLOW_LONG, $::UTF8_GOT_LONG, + 0, # NUL + 5, 2, + qr/overlong/ + ], + [ "overlong malformation, highest 5-byte", + (isASCII) + ? "\xf8\x87\xbf\xbf\xbf" + : I8_to_native("\xf8\xa7\xbf\xbf\xbf"), + 5, + $::UTF8_ALLOW_LONG, $::UTF8_GOT_LONG, + (isASCII) ? 0x1FFFFF : 0x3FFFF, + 5, 2, + qr/overlong/ + ], + [ "overlong malformation, lowest 6-byte", + (isASCII) + ? "\xfc\x80\x80\x80\x80\x80" + : I8_to_native("\xfc\xa0\xa0\xa0\xa0\xa0"), + 6, + $::UTF8_ALLOW_LONG, $::UTF8_GOT_LONG, + 0, # NUL + 6, 2, + qr/overlong/ + ], + [ "overlong malformation, highest 6-byte", + (isASCII) + ? "\xfc\x83\xbf\xbf\xbf\xbf" + : I8_to_native("\xfc\xa3\xbf\xbf\xbf\xbf"), + 6, + $::UTF8_ALLOW_LONG, $::UTF8_GOT_LONG, + (isASCII) ? 0x3FFFFFF : 0x3FFFFF, + 6, 2, + qr/overlong/ + ], + [ "overlong malformation, lowest 7-byte", + (isASCII) + ? "\xfe\x80\x80\x80\x80\x80\x80" + : I8_to_native("\xfe\xa0\xa0\xa0\xa0\xa0\xa0"), + 7, + $::UTF8_ALLOW_LONG, $::UTF8_GOT_LONG, + 0, # NUL + 7, 2, + qr/overlong/ + ], + [ "overlong malformation, highest 7-byte", + (isASCII) + ? "\xfe\x81\xbf\xbf\xbf\xbf\xbf" + : I8_to_native("\xfe\xa1\xbf\xbf\xbf\xbf\xbf"), + 7, + $::UTF8_ALLOW_LONG, $::UTF8_GOT_LONG, + (isASCII) ? 0x7FFFFFFF : 0x3FFFFFF, + 7, 2, + qr/overlong/ + ], +); + +if (isASCII && ! $::is64bit) { # 32-bit ASCII platform + no warnings 'portable'; + push @malformations, + [ "overflow malformation", + "\xfe\x84\x80\x80\x80\x80\x80", # Represents 2**32 + 7, + $::UTF8_ALLOW_OVERFLOW, $::UTF8_GOT_OVERFLOW, + $REPLACEMENT, + 7, 2, + qr/overflows/ + ], + [ "overflow malformation", + "\xff\x80\x80\x80\x80\x80\x81\x80\x80\x80\x80\x80\x80", + $::max_bytes, + $::UTF8_ALLOW_OVERFLOW, $::UTF8_GOT_OVERFLOW, + $REPLACEMENT, + $::max_bytes, 1, + qr/overflows/ + ]; +} +else { # 64-bit ASCII, or EBCDIC of any size. + # On EBCDIC platforms, another overlong test is needed even on 32-bit + # systems, whereas it doesn't happen on ASCII except on 64-bit ones. + + no warnings 'portable'; + no warnings 'overflow'; # Doesn't run on 32-bit systems, but compiles + push @malformations, + [ "overlong malformation, lowest max-byte", + (isASCII) + ? "\xff\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80" + : I8_to_native( + "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"), + $::max_bytes, + $::UTF8_ALLOW_LONG, $::UTF8_GOT_LONG, + 0, # NUL + $::max_bytes, (isASCII) ? 7 : 8, + qr/overlong/, + ], + [ "overlong malformation, highest max-byte", + (isASCII) # 2**36-1 on ASCII; 2**30-1 on EBCDIC + ? "\xff\x80\x80\x80\x80\x80\x80\xbf\xbf\xbf\xbf\xbf\xbf" + : I8_to_native( + "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xbf\xbf\xbf\xbf\xbf\xbf"), + $::max_bytes, + $::UTF8_ALLOW_LONG, $::UTF8_GOT_LONG, + (isASCII) ? 0xFFFFFFFFF : 0x3FFFFFFF, + $::max_bytes, (isASCII) ? 7 : 8, + qr/overlong/, + ]; + + if (! $::is64bit) { # 32-bit EBCDIC + push @malformations, + [ "overflow malformation", + I8_to_native( + "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa4\xa0\xa0\xa0\xa0\xa0\xa0"), + $::max_bytes, + $::UTF8_ALLOW_OVERFLOW, $::UTF8_GOT_OVERFLOW, + $REPLACEMENT, + $::max_bytes, 8, + qr/overflows/ + ]; + } + else { # 64-bit, either ASCII or EBCDIC + push @malformations, + [ "overflow malformation", + (isASCII) + ? "\xff\x80\x90\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0" + : I8_to_native( + "\xff\xb0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"), + $::max_bytes, + $::UTF8_ALLOW_OVERFLOW, $::UTF8_GOT_OVERFLOW, + $REPLACEMENT, + $::max_bytes, (isASCII) ? 3 : 2, + qr/overflows/ + ]; + } +} + +# For each overlong malformation in the list, we modify it, so that there are +# two tests. The first one returns the replacement character given the input +# flags, and the second test adds a flag that causes the actual code point the +# malformation represents to be returned. +my @added_overlongs; +foreach my $test (@malformations) { + my ($testname, $bytes, $length, $allow_flags, $expected_error_flags, + $allowed_uv, $expected_len, $needed_to_discern_len, $message ) = @$test; + next unless $testname =~ /overlong/; + + $test->[0] .= "; use REPLACEMENT CHAR"; + $test->[5] = $REPLACEMENT; + + push @added_overlongs, + [ $testname . "; use actual value", + $bytes, $length, + $allow_flags | $::UTF8_ALLOW_LONG_AND_ITS_VALUE, + $expected_error_flags, $allowed_uv, $expected_len, + $needed_to_discern_len, $message + ]; +} +push @malformations, @added_overlongs; + +foreach my $test (@malformations) { + my ($testname, $bytes, $length, $allow_flags, $expected_error_flags, + $allowed_uv, $expected_len, $needed_to_discern_len, $message ) = @$test; + + if (length($bytes) < $length) { + fail("Internal test error: actual buffer length (" . length($bytes) + . ") must be at least as high as how far we are allowed to read" + . " into it ($length)"); + diag($testname); + next; + } + + undef @warnings; + + my $ret = test_isUTF8_CHAR($bytes, $length); + is($ret, 0, "$testname: isUTF8_CHAR returns 0"); + is(scalar @warnings, 0, "$testname: isUTF8_CHAR() generated no warnings") + or output_warnings(@warnings); + + undef @warnings; + + $ret = test_isUTF8_CHAR_flags($bytes, $length, 0); + is($ret, 0, "$testname: isUTF8_CHAR_flags returns 0"); + is(scalar @warnings, 0, "$testname: isUTF8_CHAR_flags() generated no" + . " warnings") + or output_warnings(@warnings); + + $ret = test_isSTRICT_UTF8_CHAR($bytes, $length); + is($ret, 0, "$testname: isSTRICT_UTF8_CHAR returns 0"); + is(scalar @warnings, 0, + "$testname: isSTRICT_UTF8_CHAR() generated no warnings") + or output_warnings(@warnings); + + $ret = test_isC9_STRICT_UTF8_CHAR($bytes, $length); + is($ret, 0, "$testname: isC9_STRICT_UTF8_CHAR returns 0"); + is(scalar @warnings, 0, + "$testname: isC9_STRICT_UTF8_CHAR() generated no warnings") + or output_warnings(@warnings); + + for my $j (1 .. $length - 1) { + my $partial = substr($bytes, 0, $j); + + undef @warnings; + + $ret = test_is_utf8_valid_partial_char_flags($bytes, $j, 0); + + my $ret_should_be = 0; + my $comment = ""; + if ($j < $needed_to_discern_len) { + $ret_should_be = 1; + $comment = ", but need $needed_to_discern_len bytes to discern:"; + } + + is($ret, $ret_should_be, "$testname: is_utf8_valid_partial_char_flags(" + . display_bytes($partial) + . ")$comment returns $ret_should_be"); + is(scalar @warnings, 0, + "$testname: is_utf8_valid_partial_char_flags() generated" + . " no warnings") + or output_warnings(@warnings); + } + + + # Test what happens when this malformation is not allowed + undef @warnings; + my $ret_ref = test_utf8n_to_uvchr_error($bytes, $length, 0); + is($ret_ref->[0], 0, "$testname: disallowed: Returns 0"); + is($ret_ref->[1], $expected_len, + "$testname: utf8n_to_uvchr_error(), disallowed: Returns expected" + . " length: $expected_len"); + if (is(scalar @warnings, 1, + "$testname: disallowed: Got a single warning ")) + { + like($warnings[0], $message, + "$testname: disallowed: Got expected warning"); + } + else { + if (scalar @warnings) { + output_warnings(@warnings); + } + } + is($ret_ref->[2], $expected_error_flags, + "$testname: utf8n_to_uvchr_error(), disallowed:" + . " Returns expected error"); + + { # Next test when disallowed, and warnings are off. + undef @warnings; + no warnings 'utf8'; + my $ret_ref = test_utf8n_to_uvchr_error($bytes, $length, 0); + is($ret_ref->[0], 0, + "$testname: utf8n_to_uvchr_error(), disallowed: no warnings 'utf8':" + . " Returns 0"); + is($ret_ref->[1], $expected_len, + "$testname: utf8n_to_uvchr_error(), disallowed: no warnings 'utf8':" + . " Returns expected length: $expected_len"); + if (!is(scalar @warnings, 0, + "$testname: utf8n_to_uvchr_error(), disallowed: no warnings 'utf8':" + . " no warnings generated")) + { + output_warnings(@warnings); + } + is($ret_ref->[2], $expected_error_flags, + "$testname: utf8n_to_uvchr_error(), disallowed: Returns" + . " expected error"); + } + + # Test with CHECK_ONLY + undef @warnings; + $ret_ref = test_utf8n_to_uvchr_error($bytes, $length, $::UTF8_CHECK_ONLY); + is($ret_ref->[0], 0, "$testname: CHECK_ONLY: Returns 0"); + is($ret_ref->[1], -1, "$testname: CHECK_ONLY: returns -1 for length"); + if (! is(scalar @warnings, 0, + "$testname: CHECK_ONLY: no warnings generated")) + { + output_warnings(@warnings); + } + is($ret_ref->[2], $expected_error_flags, + "$testname: utf8n_to_uvchr_error(), disallowed: Returns expected" + . " error"); + + next if $allow_flags == 0; # Skip if can't allow this malformation + + # Test when the malformation is allowed + undef @warnings; + $ret_ref = test_utf8n_to_uvchr_error($bytes, $length, $allow_flags); + is($ret_ref->[0], $allowed_uv, + "$testname: utf8n_to_uvchr_error(), allowed: Returns expected uv: " + . sprintf("0x%04X", $allowed_uv)); + is($ret_ref->[1], $expected_len, + "$testname: utf8n_to_uvchr_error(), allowed: Returns expected length:" + . " $expected_len"); + if (!is(scalar @warnings, 0, + "$testname: utf8n_to_uvchr_error(), allowed: no warnings" + . " generated")) + { + output_warnings(@warnings); + } + is($ret_ref->[2], $expected_error_flags, + "$testname: utf8n_to_uvchr_error(), disallowed: Returns" + . " expected error"); +} + +done_testing; diff --git a/ext/XS-APItest/t/utf8_setup.pl b/ext/XS-APItest/t/utf8_setup.pl new file mode 100644 index 0000000000..094390016b --- /dev/null +++ b/ext/XS-APItest/t/utf8_setup.pl @@ -0,0 +1,98 @@ +# Common subroutines and constants, called by .t files in this directory that +# deal with UTF-8 + +# The test files can't use byte_utf8a_to_utf8n() from t/charset_tools.pl +# because that uses the same functions we are testing here. So UTF-EBCDIC +# strings are hard-coded as I8 strings in this file instead, and we use the +# translation functions to/from I8 from that file instead. + +sub isASCII { ord "A" == 65 } + +sub display_bytes { + use bytes; + my $string = shift; + return '"' + . join("", map { sprintf("\\x%02x", ord $_) } split "", $string) + . '"'; +} + +sub output_warnings(@) { + diag "The warnings were:\n" . join("", @_); +} + +sub start_byte_to_cont($) { + + # Extract the code point information from the input UTF-8 start byte, and + # return a continuation byte containing the same information. This is + # used in constructing an overlong malformation from valid input. + + my $byte = shift; + my $len = test_UTF8_SKIP($byte); + if ($len < 2) { + die "start_byte_to_cont() is expecting a UTF-8 variant"; + } + + $byte = ord native_to_I8($byte); + + # Copied from utf8.h. This gets rid of the leading 1 bits. + $byte &= ((($len) >= 7) ? 0x00 : (0x1F >> (($len)-2))); + + $byte |= (isASCII) ? 0x80 : 0xA0; + return I8_to_native(chr $byte); +} + +$::is64bit = length sprintf("%x", ~0) > 8; + +$::first_continuation = (isASCII) ? 0x80 : 0xA0; + +$::I8c = (isASCII) ? "\x80" : "\xa0"; # A continuation byte + + +$::max_bytes = (isASCII) ? 13 : 14; # Max number of bytes in a UTF-8 sequence + # representing a single code point + +# Copied from utf8.h +$::UTF8_ALLOW_EMPTY = 0x0001; +$::UTF8_GOT_EMPTY = $UTF8_ALLOW_EMPTY; +$::UTF8_ALLOW_CONTINUATION = 0x0002; +$::UTF8_GOT_CONTINUATION = $UTF8_ALLOW_CONTINUATION; +$::UTF8_ALLOW_NON_CONTINUATION = 0x0004; +$::UTF8_GOT_NON_CONTINUATION = $UTF8_ALLOW_NON_CONTINUATION; +$::UTF8_ALLOW_SHORT = 0x0008; +$::UTF8_GOT_SHORT = $UTF8_ALLOW_SHORT; +$::UTF8_ALLOW_LONG = 0x0010; +$::UTF8_ALLOW_LONG_AND_ITS_VALUE = $UTF8_ALLOW_LONG|0x0020; +$::UTF8_GOT_LONG = $UTF8_ALLOW_LONG; +$::UTF8_ALLOW_OVERFLOW = 0x0080; +$::UTF8_GOT_OVERFLOW = $UTF8_ALLOW_OVERFLOW; +$::UTF8_DISALLOW_SURROGATE = 0x0100; +$::UTF8_GOT_SURROGATE = $UTF8_DISALLOW_SURROGATE; +$::UTF8_WARN_SURROGATE = 0x0200; +$::UTF8_DISALLOW_NONCHAR = 0x0400; +$::UTF8_GOT_NONCHAR = $UTF8_DISALLOW_NONCHAR; +$::UTF8_WARN_NONCHAR = 0x0800; +$::UTF8_DISALLOW_SUPER = 0x1000; +$::UTF8_GOT_SUPER = $UTF8_DISALLOW_SUPER; +$::UTF8_WARN_SUPER = 0x2000; +$::UTF8_DISALLOW_ABOVE_31_BIT = 0x4000; +$::UTF8_GOT_ABOVE_31_BIT = $UTF8_DISALLOW_ABOVE_31_BIT; +$::UTF8_WARN_ABOVE_31_BIT = 0x8000; +$::UTF8_CHECK_ONLY = 0x10000; +$::UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE + = $UTF8_DISALLOW_SUPER|$UTF8_DISALLOW_SURROGATE; +$::UTF8_DISALLOW_ILLEGAL_INTERCHANGE + = $UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE|$UTF8_DISALLOW_NONCHAR; +$::UTF8_WARN_ILLEGAL_C9_INTERCHANGE + = $UTF8_WARN_SUPER|$UTF8_WARN_SURROGATE; +$::UTF8_WARN_ILLEGAL_INTERCHANGE + = $UTF8_WARN_ILLEGAL_C9_INTERCHANGE|$UTF8_WARN_NONCHAR; + +# Test uvchr_to_utf8(). +$::UNICODE_WARN_SURROGATE = 0x0001; +$::UNICODE_WARN_NONCHAR = 0x0002; +$::UNICODE_WARN_SUPER = 0x0004; +$::UNICODE_WARN_ABOVE_31_BIT = 0x0008; +$::UNICODE_DISALLOW_SURROGATE = 0x0010; +$::UNICODE_DISALLOW_NONCHAR = 0x0020; +$::UNICODE_DISALLOW_SUPER = 0x0040; +$::UNICODE_DISALLOW_ABOVE_31_BIT = 0x0080; diff --git a/ext/XS-APItest/t/utf8_warn0.t b/ext/XS-APItest/t/utf8_warn0.t new file mode 100644 index 0000000000..3f91bf5a4e --- /dev/null +++ b/ext/XS-APItest/t/utf8_warn0.t @@ -0,0 +1,7 @@ +use strict; +use warnings; +no warnings 'once'; + +$::TEST_CHUNK = 0; + +do './t/utf8_warn_base.pl'; diff --git a/ext/XS-APItest/t/utf8_warn1.t b/ext/XS-APItest/t/utf8_warn1.t new file mode 100644 index 0000000000..beb4faf634 --- /dev/null +++ b/ext/XS-APItest/t/utf8_warn1.t @@ -0,0 +1,7 @@ +use strict; +use warnings; +no warnings 'once'; + +$::TEST_CHUNK = 1; + +do './t/utf8_warn_base.pl'; diff --git a/ext/XS-APItest/t/utf8_warn2.t b/ext/XS-APItest/t/utf8_warn2.t new file mode 100644 index 0000000000..d6d3e7a73d --- /dev/null +++ b/ext/XS-APItest/t/utf8_warn2.t @@ -0,0 +1,7 @@ +use strict; +use warnings; +no warnings 'once'; + +$::TEST_CHUNK = 2; + +do './t/utf8_warn_base.pl'; diff --git a/ext/XS-APItest/t/utf8_warn3.t b/ext/XS-APItest/t/utf8_warn3.t new file mode 100644 index 0000000000..ae0da886c9 --- /dev/null +++ b/ext/XS-APItest/t/utf8_warn3.t @@ -0,0 +1,7 @@ +use strict; +use warnings; +no warnings 'once'; + +$::TEST_CHUNK = 3; + +do './t/utf8_warn_base.pl'; diff --git a/ext/XS-APItest/t/utf8_warn4.t b/ext/XS-APItest/t/utf8_warn4.t new file mode 100644 index 0000000000..52e8250851 --- /dev/null +++ b/ext/XS-APItest/t/utf8_warn4.t @@ -0,0 +1,7 @@ +use strict; +use warnings; +no warnings 'once'; + +$::TEST_CHUNK = 4; + +do './t/utf8_warn_base.pl'; diff --git a/ext/XS-APItest/t/utf8_warn5.t b/ext/XS-APItest/t/utf8_warn5.t new file mode 100644 index 0000000000..94f8f0cd45 --- /dev/null +++ b/ext/XS-APItest/t/utf8_warn5.t @@ -0,0 +1,7 @@ +use strict; +use warnings; +no warnings 'once'; + +$::TEST_CHUNK = 5; + +do './t/utf8_warn_base.pl'; diff --git a/ext/XS-APItest/t/utf8_warn6.t b/ext/XS-APItest/t/utf8_warn6.t new file mode 100644 index 0000000000..5995db6906 --- /dev/null +++ b/ext/XS-APItest/t/utf8_warn6.t @@ -0,0 +1,7 @@ +use strict; +use warnings; +no warnings 'once'; + +$::TEST_CHUNK = 6; + +do './t/utf8_warn_base.pl'; diff --git a/ext/XS-APItest/t/utf8_warn7.t b/ext/XS-APItest/t/utf8_warn7.t new file mode 100644 index 0000000000..27dc96bc2f --- /dev/null +++ b/ext/XS-APItest/t/utf8_warn7.t @@ -0,0 +1,7 @@ +use strict; +use warnings; +no warnings 'once'; + +$::TEST_CHUNK = 7; + +do './t/utf8_warn_base.pl'; diff --git a/ext/XS-APItest/t/utf8_warn8.t b/ext/XS-APItest/t/utf8_warn8.t new file mode 100644 index 0000000000..01de3b67a3 --- /dev/null +++ b/ext/XS-APItest/t/utf8_warn8.t @@ -0,0 +1,7 @@ +use strict; +use warnings; +no warnings 'once'; + +$::TEST_CHUNK = 8; + +do './t/utf8_warn_base.pl'; diff --git a/ext/XS-APItest/t/utf8_warn9.t b/ext/XS-APItest/t/utf8_warn9.t new file mode 100644 index 0000000000..aa4c42f912 --- /dev/null +++ b/ext/XS-APItest/t/utf8_warn9.t @@ -0,0 +1,7 @@ +use strict; +use warnings; +no warnings 'once'; + +$::TEST_CHUNK = 9; + +do './t/utf8_warn_base.pl'; diff --git a/ext/XS-APItest/t/utf8_warn_base.pl b/ext/XS-APItest/t/utf8_warn_base.pl new file mode 100644 index 0000000000..66f6f3df6a --- /dev/null +++ b/ext/XS-APItest/t/utf8_warn_base.pl @@ -0,0 +1,1059 @@ +#!perl -w + +# This is a base file to be used by various .t's in its directory + +use strict; +use Test::More; + +BEGIN { + use_ok('XS::APItest'); + require 'charset_tools.pl'; + require './t/utf8_setup.pl'; +}; + +$|=1; + +no warnings 'deprecated'; # Some of the below are above IV_MAX on 32 bit + # machines, and that is tested elsewhere + +use XS::APItest; +use Data::Dumper; + +my @warnings; + +use warnings 'utf8'; +local $SIG{__WARN__} = sub { push @warnings, @_ }; + +sub nonportable_regex ($) { + + # Returns a pattern that matches the non-portable message raised either + # for the specific input code point, or the one generated when there + # is some malformation that precludes the message containing the specific + # code point + + my $code_point = shift; + + my $string = sprintf '(Code point 0x%X is not Unicode, and' + . '|Any UTF-8 sequence that starts with' + . ' "(\\\x[[:xdigit:]]{2})+" is for a' + . ' non-Unicode code point, and is) not portable', + $code_point; + return qr/$string/; +} + +# Now test the cases where a legal code point is generated, but may or may not +# be allowed/warned on. +my @tests = ( + # ($testname, $bytes, $warn_flags, $disallow_flags, $expected_error_flags, + # $category, $allowed_uv, $expected_len, $needed_to_discern_len, $message ) + [ "lowest surrogate", + (isASCII) ? "\xed\xa0\x80" : I8_to_native("\xf1\xb6\xa0\xa0"), + $::UTF8_WARN_SURROGATE, $::UTF8_DISALLOW_SURROGATE, $::UTF8_GOT_SURROGATE, + 'surrogate', 0xD800, + (isASCII) ? 3 : 4, + 2, + qr/surrogate/ + ], + [ "a middle surrogate", + (isASCII) ? "\xed\xa4\x8d" : I8_to_native("\xf1\xb6\xa8\xad"), + $::UTF8_WARN_SURROGATE, $::UTF8_DISALLOW_SURROGATE, $::UTF8_GOT_SURROGATE, + 'surrogate', 0xD90D, + (isASCII) ? 3 : 4, + 2, + qr/surrogate/ + ], + [ "highest surrogate", + (isASCII) ? "\xed\xbf\xbf" : I8_to_native("\xf1\xb7\xbf\xbf"), + $::UTF8_WARN_SURROGATE, $::UTF8_DISALLOW_SURROGATE, $::UTF8_GOT_SURROGATE, + 'surrogate', 0xDFFF, + (isASCII) ? 3 : 4, + 2, + qr/surrogate/ + ], + [ "first non_unicode", + (isASCII) ? "\xf4\x90\x80\x80" : I8_to_native("\xf9\xa2\xa0\xa0\xa0"), + $::UTF8_WARN_SUPER, $::UTF8_DISALLOW_SUPER, $::UTF8_GOT_SUPER, + 'non_unicode', 0x110000, + (isASCII) ? 4 : 5, + 2, + qr/(not Unicode|for a non-Unicode code point).* may not be portable/ + ], + [ "non_unicode whose first byte tells that", + (isASCII) ? "\xf5\x80\x80\x80" : I8_to_native("\xfa\xa0\xa0\xa0\xa0"), + $::UTF8_WARN_SUPER, $::UTF8_DISALLOW_SUPER, $::UTF8_GOT_SUPER, + 'non_unicode', + (isASCII) ? 0x140000 : 0x200000, + (isASCII) ? 4 : 5, + 1, + qr/(not Unicode|for a non-Unicode code point).* may not be portable/ + ], + [ "first of 32 consecutive non-character code points", + (isASCII) ? "\xef\xb7\x90" : I8_to_native("\xf1\xbf\xae\xb0"), + $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR, + 'nonchar', 0xFDD0, + (isASCII) ? 3 : 4, + (isASCII) ? 3 : 4, + qr/Unicode non-character.*is not recommended for open interchange/ + ], + [ "a mid non-character code point of the 32 consecutive ones", + (isASCII) ? "\xef\xb7\xa0" : I8_to_native("\xf1\xbf\xaf\xa0"), + $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR, + 'nonchar', 0xFDE0, + (isASCII) ? 3 : 4, + (isASCII) ? 3 : 4, + qr/Unicode non-character.*is not recommended for open interchange/ + ], + [ "final of 32 consecutive non-character code points", + (isASCII) ? "\xef\xb7\xaf" : I8_to_native("\xf1\xbf\xaf\xaf"), + $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR, + 'nonchar', 0xFDEF, + (isASCII) ? 3 : 4, + (isASCII) ? 3 : 4, + qr/Unicode non-character.*is not recommended for open interchange/ + ], + [ "non-character code point U+FFFE", + (isASCII) ? "\xef\xbf\xbe" : I8_to_native("\xf1\xbf\xbf\xbe"), + $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR, + 'nonchar', 0xFFFE, + (isASCII) ? 3 : 4, + (isASCII) ? 3 : 4, + qr/Unicode non-character.*is not recommended for open interchange/ + ], + [ "non-character code point U+FFFF", + (isASCII) ? "\xef\xbf\xbf" : I8_to_native("\xf1\xbf\xbf\xbf"), + $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR, + 'nonchar', 0xFFFF, + (isASCII) ? 3 : 4, + (isASCII) ? 3 : 4, + qr/Unicode non-character.*is not recommended for open interchange/ + ], + [ "non-character code point U+1FFFE", + (isASCII) ? "\xf0\x9f\xbf\xbe" : I8_to_native("\xf3\xbf\xbf\xbe"), + $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR, + 'nonchar', 0x1FFFE, + 4, 4, + qr/Unicode non-character.*is not recommended for open interchange/ + ], + [ "non-character code point U+1FFFF", + (isASCII) ? "\xf0\x9f\xbf\xbf" : I8_to_native("\xf3\xbf\xbf\xbf"), + $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR, + 'nonchar', 0x1FFFF, + 4, 4, + qr/Unicode non-character.*is not recommended for open interchange/ + ], + [ "non-character code point U+2FFFE", + (isASCII) ? "\xf0\xaf\xbf\xbe" : I8_to_native("\xf5\xbf\xbf\xbe"), + $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR, + 'nonchar', 0x2FFFE, + 4, 4, + qr/Unicode non-character.*is not recommended for open interchange/ + ], + [ "non-character code point U+2FFFF", + (isASCII) ? "\xf0\xaf\xbf\xbf" : I8_to_native("\xf5\xbf\xbf\xbf"), + $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR, + 'nonchar', 0x2FFFF, + 4, 4, + qr/Unicode non-character.*is not recommended for open interchange/ + ], + [ "non-character code point U+3FFFE", + (isASCII) ? "\xf0\xbf\xbf\xbe" : I8_to_native("\xf7\xbf\xbf\xbe"), + $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR, + 'nonchar', 0x3FFFE, + 4, 4, + qr/Unicode non-character.*is not recommended for open interchange/ + ], + [ "non-character code point U+3FFFF", + (isASCII) ? "\xf0\xbf\xbf\xbf" : I8_to_native("\xf7\xbf\xbf\xbf"), + $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR, + 'nonchar', 0x3FFFF, + 4, 4, + qr/Unicode non-character.*is not recommended for open interchange/ + ], + [ "non-character code point U+4FFFE", + (isASCII) ? "\xf1\x8f\xbf\xbe" : I8_to_native("\xf8\xa9\xbf\xbf\xbe"), + $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR, + 'nonchar', 0x4FFFE, + (isASCII) ? 4 : 5, + (isASCII) ? 4 : 5, + qr/Unicode non-character.*is not recommended for open interchange/ + ], + [ "non-character code point U+4FFFF", + (isASCII) ? "\xf1\x8f\xbf\xbf" : I8_to_native("\xf8\xa9\xbf\xbf\xbf"), + $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR, + 'nonchar', 0x4FFFF, + (isASCII) ? 4 : 5, + (isASCII) ? 4 : 5, + qr/Unicode non-character.*is not recommended for open interchange/ + ], + [ "non-character code point U+5FFFE", + (isASCII) ? "\xf1\x9f\xbf\xbe" : I8_to_native("\xf8\xab\xbf\xbf\xbe"), + $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR, + 'nonchar', 0x5FFFE, + (isASCII) ? 4 : 5, + (isASCII) ? 4 : 5, + qr/Unicode non-character.*is not recommended for open interchange/ + ], + [ "non-character code point U+5FFFF", + (isASCII) ? "\xf1\x9f\xbf\xbf" : I8_to_native("\xf8\xab\xbf\xbf\xbf"), + $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR, + 'nonchar', 0x5FFFF, + (isASCII) ? 4 : 5, + (isASCII) ? 4 : 5, + qr/Unicode non-character.*is not recommended for open interchange/ + ], + [ "non-character code point U+6FFFE", + (isASCII) ? "\xf1\xaf\xbf\xbe" : I8_to_native("\xf8\xad\xbf\xbf\xbe"), + $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR, + 'nonchar', 0x6FFFE, + (isASCII) ? 4 : 5, + (isASCII) ? 4 : 5, + qr/Unicode non-character.*is not recommended for open interchange/ + ], + [ "non-character code point U+6FFFF", + (isASCII) ? "\xf1\xaf\xbf\xbf" : I8_to_native("\xf8\xad\xbf\xbf\xbf"), + $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR, + 'nonchar', 0x6FFFF, + (isASCII) ? 4 : 5, + (isASCII) ? 4 : 5, + qr/Unicode non-character.*is not recommended for open interchange/ + ], + [ "non-character code point U+7FFFE", + (isASCII) ? "\xf1\xbf\xbf\xbe" : I8_to_native("\xf8\xaf\xbf\xbf\xbe"), + $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR, + 'nonchar', 0x7FFFE, + (isASCII) ? 4 : 5, + (isASCII) ? 4 : 5, + qr/Unicode non-character.*is not recommended for open interchange/ + ], + [ "non-character code point U+7FFFF", + (isASCII) ? "\xf1\xbf\xbf\xbf" : I8_to_native("\xf8\xaf\xbf\xbf\xbf"), + $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR, + 'nonchar', 0x7FFFF, + (isASCII) ? 4 : 5, + (isASCII) ? 4 : 5, + qr/Unicode non-character.*is not recommended for open interchange/ + ], + [ "non-character code point U+8FFFE", + (isASCII) ? "\xf2\x8f\xbf\xbe" : I8_to_native("\xf8\xb1\xbf\xbf\xbe"), + $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR, + 'nonchar', 0x8FFFE, + (isASCII) ? 4 : 5, + (isASCII) ? 4 : 5, + qr/Unicode non-character.*is not recommended for open interchange/ + ], + [ "non-character code point U+8FFFF", + (isASCII) ? "\xf2\x8f\xbf\xbf" : I8_to_native("\xf8\xb1\xbf\xbf\xbf"), + $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR, + 'nonchar', 0x8FFFF, + (isASCII) ? 4 : 5, + (isASCII) ? 4 : 5, + qr/Unicode non-character.*is not recommended for open interchange/ + ], + [ "non-character code point U+9FFFE", + (isASCII) ? "\xf2\x9f\xbf\xbe" : I8_to_native("\xf8\xb3\xbf\xbf\xbe"), + $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR, + 'nonchar', 0x9FFFE, + (isASCII) ? 4 : 5, + (isASCII) ? 4 : 5, + qr/Unicode non-character.*is not recommended for open interchange/ + ], + [ "non-character code point U+9FFFF", + (isASCII) ? "\xf2\x9f\xbf\xbf" : I8_to_native("\xf8\xb3\xbf\xbf\xbf"), + $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR, + 'nonchar', 0x9FFFF, + (isASCII) ? 4 : 5, + (isASCII) ? 4 : 5, + qr/Unicode non-character.*is not recommended for open interchange/ + ], + [ "non-character code point U+AFFFE", + (isASCII) ? "\xf2\xaf\xbf\xbe" : I8_to_native("\xf8\xb5\xbf\xbf\xbe"), + $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR, + 'nonchar', 0xAFFFE, + (isASCII) ? 4 : 5, + (isASCII) ? 4 : 5, + qr/Unicode non-character.*is not recommended for open interchange/ + ], + [ "non-character code point U+AFFFF", + (isASCII) ? "\xf2\xaf\xbf\xbf" : I8_to_native("\xf8\xb5\xbf\xbf\xbf"), + $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR, + 'nonchar', 0xAFFFF, + (isASCII) ? 4 : 5, + (isASCII) ? 4 : 5, + qr/Unicode non-character.*is not recommended for open interchange/ + ], + [ "non-character code point U+BFFFE", + (isASCII) ? "\xf2\xbf\xbf\xbe" : I8_to_native("\xf8\xb7\xbf\xbf\xbe"), + $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR, + 'nonchar', 0xBFFFE, + (isASCII) ? 4 : 5, + (isASCII) ? 4 : 5, + qr/Unicode non-character.*is not recommended for open interchange/ + ], + [ "non-character code point U+BFFFF", + (isASCII) ? "\xf2\xbf\xbf\xbf" : I8_to_native("\xf8\xb7\xbf\xbf\xbf"), + $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR, + 'nonchar', 0xBFFFF, + (isASCII) ? 4 : 5, + (isASCII) ? 4 : 5, + qr/Unicode non-character.*is not recommended for open interchange/ + ], + [ "non-character code point U+CFFFE", + (isASCII) ? "\xf3\x8f\xbf\xbe" : I8_to_native("\xf8\xb9\xbf\xbf\xbe"), + $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR, + 'nonchar', 0xCFFFE, + (isASCII) ? 4 : 5, + (isASCII) ? 4 : 5, + qr/Unicode non-character.*is not recommended for open interchange/ + ], + [ "non-character code point U+CFFFF", + (isASCII) ? "\xf3\x8f\xbf\xbf" : I8_to_native("\xf8\xb9\xbf\xbf\xbf"), + $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR, + 'nonchar', 0xCFFFF, + (isASCII) ? 4 : 5, + (isASCII) ? 4 : 5, + qr/Unicode non-character.*is not recommended for open interchange/ + ], + [ "non-character code point U+DFFFE", + (isASCII) ? "\xf3\x9f\xbf\xbe" : I8_to_native("\xf8\xbb\xbf\xbf\xbe"), + $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR, + 'nonchar', 0xDFFFE, + (isASCII) ? 4 : 5, + (isASCII) ? 4 : 5, + qr/Unicode non-character.*is not recommended for open interchange/ + ], + [ "non-character code point U+DFFFF", + (isASCII) ? "\xf3\x9f\xbf\xbf" : I8_to_native("\xf8\xbb\xbf\xbf\xbf"), + $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR, + 'nonchar', 0xDFFFF, + (isASCII) ? 4 : 5, + (isASCII) ? 4 : 5, + qr/Unicode non-character.*is not recommended for open interchange/ + ], + [ "non-character code point U+EFFFE", + (isASCII) ? "\xf3\xaf\xbf\xbe" : I8_to_native("\xf8\xbd\xbf\xbf\xbe"), + $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR, + 'nonchar', 0xEFFFE, + (isASCII) ? 4 : 5, + (isASCII) ? 4 : 5, + qr/Unicode non-character.*is not recommended for open interchange/ + ], + [ "non-character code point U+EFFFF", + (isASCII) ? "\xf3\xaf\xbf\xbf" : I8_to_native("\xf8\xbd\xbf\xbf\xbf"), + $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR, + 'nonchar', 0xEFFFF, + (isASCII) ? 4 : 5, + (isASCII) ? 4 : 5, + qr/Unicode non-character.*is not recommended for open interchange/ + ], + [ "non-character code point U+FFFFE", + (isASCII) ? "\xf3\xbf\xbf\xbe" : I8_to_native("\xf8\xbf\xbf\xbf\xbe"), + $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR, + 'nonchar', 0xFFFFE, + (isASCII) ? 4 : 5, + (isASCII) ? 4 : 5, + qr/Unicode non-character.*is not recommended for open interchange/ + ], + [ "non-character code point U+FFFFF", + (isASCII) ? "\xf3\xbf\xbf\xbf" : I8_to_native("\xf8\xbf\xbf\xbf\xbf"), + $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR, + 'nonchar', 0xFFFFF, + (isASCII) ? 4 : 5, + (isASCII) ? 4 : 5, + qr/Unicode non-character.*is not recommended for open interchange/ + ], + [ "non-character code point U+10FFFE", + (isASCII) ? "\xf4\x8f\xbf\xbe" : I8_to_native("\xf9\xa1\xbf\xbf\xbe"), + $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR, + 'nonchar', 0x10FFFE, + (isASCII) ? 4 : 5, + (isASCII) ? 4 : 5, + qr/Unicode non-character.*is not recommended for open interchange/ + ], + [ "non-character code point U+10FFFF", + (isASCII) ? "\xf4\x8f\xbf\xbf" : I8_to_native("\xf9\xa1\xbf\xbf\xbf"), + $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR, + 'nonchar', 0x10FFFF, + (isASCII) ? 4 : 5, + (isASCII) ? 4 : 5, + qr/Unicode non-character.*is not recommended for open interchange/ + ], + [ "requires at least 32 bits", + (isASCII) + ? "\xfe\x82\x80\x80\x80\x80\x80" + : I8_to_native( + "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0"), + # This code point is chosen so that it is representable in a UV on + # 32-bit machines + $::UTF8_WARN_ABOVE_31_BIT, $::UTF8_DISALLOW_ABOVE_31_BIT, + $::UTF8_GOT_ABOVE_31_BIT, + 'utf8', 0x80000000, + (isASCII) ? 7 : $::max_bytes, + (isASCII) ? 1 : 8, + nonportable_regex(0x80000000) + ], + [ "highest 32 bit code point", + (isASCII) + ? "\xfe\x83\xbf\xbf\xbf\xbf\xbf" + : I8_to_native( + "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa3\xbf\xbf\xbf\xbf\xbf\xbf"), + $::UTF8_WARN_ABOVE_31_BIT, $::UTF8_DISALLOW_ABOVE_31_BIT, + $::UTF8_GOT_ABOVE_31_BIT, + 'utf8', 0xFFFFFFFF, + (isASCII) ? 7 : $::max_bytes, + (isASCII) ? 1 : 8, + nonportable_regex(0xffffffff) + ], + [ "requires at least 32 bits, and use SUPER-type flags, instead of" + . " ABOVE_31_BIT", + (isASCII) + ? "\xfe\x82\x80\x80\x80\x80\x80" + : I8_to_native( + "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0"), + $::UTF8_WARN_SUPER, $::UTF8_DISALLOW_SUPER, $::UTF8_GOT_SUPER, + 'utf8', 0x80000000, + (isASCII) ? 7 : $::max_bytes, + 1, + nonportable_regex(0x80000000) + ], + [ "overflow with warnings/disallow for more than 31 bits", + # This tests the interaction of WARN_ABOVE_31_BIT/DISALLOW_ABOVE_31_BIT + # with overflow. The overflow malformation is never allowed, so + # preventing it takes precedence if the ABOVE_31_BIT options would + # otherwise allow in an overflowing value. The ASCII code points (1 + # for 32-bits; 1 for 64) were chosen because the old overflow + # detection algorithm did not catch them; this means this test also + # checks for that fix. The EBCDIC are arbitrary overflowing ones + # since we have no reports of failures with it. + (($::is64bit) + ? ((isASCII) + ? "\xff\x80\x90\x90\x90\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf" + : I8_to_native( + "\xff\xB0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0")) + : ((isASCII) + ? "\xfe\x86\x80\x80\x80\x80\x80" + : I8_to_native( + "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa4\xa0\xa0\xa0\xa0\xa0\xa0"))), + $::UTF8_WARN_ABOVE_31_BIT, + $::UTF8_DISALLOW_ABOVE_31_BIT, + $::UTF8_GOT_ABOVE_31_BIT, + 'utf8', 0, + (! isASCII || $::is64bit) ? $::max_bytes : 7, + (isASCII || $::is64bit) ? 2 : 8, + qr/overflows/ + ], +); + +if (! $::is64bit) { + if (isASCII) { + no warnings qw{portable overflow}; + push @tests, + [ "Lowest 33 bit code point: overflow", + "\xFE\x84\x80\x80\x80\x80\x80", + $::UTF8_WARN_ABOVE_31_BIT, $::UTF8_DISALLOW_ABOVE_31_BIT, + $::UTF8_GOT_ABOVE_31_BIT, + 'utf8', 0x100000000, + 7, 1, + qr/and( is)? not portable/ + ]; + } +} +else { + no warnings qw{portable overflow}; + push @tests, + [ "More than 32 bits", + (isASCII) + ? "\xff\x80\x80\x80\x80\x80\x81\x80\x80\x80\x80\x80\x80" + : I8_to_native( + "\xff\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0\xa0"), + $::UTF8_WARN_ABOVE_31_BIT, $::UTF8_DISALLOW_ABOVE_31_BIT, + $::UTF8_GOT_ABOVE_31_BIT, + 'utf8', 0x1000000000, + $::max_bytes, (isASCII) ? 1 : 7, + qr/and( is)? not portable/ + ]; + if (! isASCII) { + push @tests, # These could falsely show wrongly in a naive + # implementation + [ "requires at least 32 bits", + I8_to_native( + "\xff\xa0\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0"), + $::UTF8_WARN_ABOVE_31_BIT,$::UTF8_DISALLOW_ABOVE_31_BIT, + $::UTF8_GOT_ABOVE_31_BIT, + 'utf8', 0x800000000, + $::max_bytes, 7, + nonportable_regex(0x80000000) + ], + [ "requires at least 32 bits", + I8_to_native( + "\xff\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"), + $::UTF8_WARN_ABOVE_31_BIT,$::UTF8_DISALLOW_ABOVE_31_BIT, + $::UTF8_GOT_ABOVE_31_BIT, + 'utf8', 0x10000000000, + $::max_bytes, 6, + nonportable_regex(0x10000000000) + ], + [ "requires at least 32 bits", + I8_to_native( + "\xff\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"), + $::UTF8_WARN_ABOVE_31_BIT,$::UTF8_DISALLOW_ABOVE_31_BIT, + $::UTF8_GOT_ABOVE_31_BIT, + 'utf8', 0x200000000000, + $::max_bytes, 5, + nonportable_regex(0x20000000000) + ], + [ "requires at least 32 bits", + I8_to_native( + "\xff\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"), + $::UTF8_WARN_ABOVE_31_BIT,$::UTF8_DISALLOW_ABOVE_31_BIT, + $::UTF8_GOT_ABOVE_31_BIT, + 'utf8', 0x4000000000000, + $::max_bytes, 4, + nonportable_regex(0x4000000000000) + ], + [ "requires at least 32 bits", + I8_to_native( + "\xff\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"), + $::UTF8_WARN_ABOVE_31_BIT,$::UTF8_DISALLOW_ABOVE_31_BIT, + $::UTF8_GOT_ABOVE_31_BIT, + 'utf8', 0x80000000000000, + $::max_bytes, 3, + nonportable_regex(0x80000000000000) + ], + [ "requires at least 32 bits", + I8_to_native( + "\xff\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"), + $::UTF8_WARN_ABOVE_31_BIT,$::UTF8_DISALLOW_ABOVE_31_BIT, + $::UTF8_GOT_ABOVE_31_BIT, + 'utf8', 0x1000000000000000, + $::max_bytes, 2, + nonportable_regex(0x1000000000000000) + ]; + } +} + +# This test is split into this number of files. +my $num_test_files = $ENV{TEST_JOBS} || 1; +$num_test_files = 10 if $num_test_files > 10; + +my $test_count = -1; +foreach my $test (@tests) { + $test_count++; + next if $test_count % $num_test_files != $::TEST_CHUNK; + + my ($testname, $bytes, $warn_flags, $disallow_flags, $expected_error_flags, + $category, $allowed_uv, $expected_len, $needed_to_discern_len, $message + ) = @$test; + + my $length = length $bytes; + my $will_overflow = $testname =~ /overflow/ ? 'overflow' : ""; + + { + use warnings; + undef @warnings; + my $ret = test_isUTF8_CHAR($bytes, $length); + my $ret_flags = test_isUTF8_CHAR_flags($bytes, $length, 0); + if ($will_overflow) { + is($ret, 0, "isUTF8_CHAR() $testname: returns 0"); + is($ret_flags, 0, "isUTF8_CHAR_flags() $testname: returns 0"); + } + else { + is($ret, $length, + "isUTF8_CHAR() $testname: returns expected length: $length"); + is($ret_flags, $length, "isUTF8_CHAR_flags(...,0) $testname:" + . " returns expected length: $length"); + } + is(scalar @warnings, 0, + "isUTF8_CHAR() and isUTF8_CHAR()_flags $testname: generated" + . " no warnings") + or output_warnings(@warnings); + + undef @warnings; + $ret = test_isSTRICT_UTF8_CHAR($bytes, $length); + if ($will_overflow) { + is($ret, 0, "isSTRICT_UTF8_CHAR() $testname: returns 0"); + } + else { + my $expected_ret = ( $testname =~ /surrogate|non-character/ + || $allowed_uv > 0x10FFFF) + ? 0 + : $length; + is($ret, $expected_ret, "isSTRICT_UTF8_CHAR() $testname: returns" + . " expected length: $expected_ret"); + $ret = test_isUTF8_CHAR_flags($bytes, $length, + $::UTF8_DISALLOW_ILLEGAL_INTERCHANGE); + is($ret, $expected_ret, + "isUTF8_CHAR_flags('DISALLOW_ILLEGAL_INTERCHANGE')" + . " acts like isSTRICT_UTF8_CHAR"); + } + is(scalar @warnings, 0, + "isSTRICT_UTF8_CHAR() and isUTF8_CHAR_flags $testname:" + . " generated no warnings") + or output_warnings(@warnings); + + undef @warnings; + $ret = test_isC9_STRICT_UTF8_CHAR($bytes, $length); + if ($will_overflow) { + is($ret, 0, "isC9_STRICT_UTF8_CHAR() $testname: returns 0"); + } + else { + my $expected_ret = ( $testname =~ /surrogate/ + || $allowed_uv > 0x10FFFF) + ? 0 + : $length; + is($ret, $expected_ret, "isC9_STRICT_UTF8_CHAR() $testname:" + ." returns expected length: $expected_ret"); + $ret = test_isUTF8_CHAR_flags($bytes, $length, + $::UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE); + is($ret, $expected_ret, + "isUTF8_CHAR_flags('DISALLOW_ILLEGAL_C9_INTERCHANGE')" + . " acts like isC9_STRICT_UTF8_CHAR"); + } + is(scalar @warnings, 0, + "isC9_STRICT_UTF8_CHAR() and isUTF8_CHAR_flags $testname:" + . " generated no warnings") + or output_warnings(@warnings); + + # Test partial character handling, for each byte not a full character + for my $j (1.. $length - 1) { + + # Skip the test for the interaction between overflow and above-31 + # bit. It is really testing other things than the partial + # character tests, for which other tests in this file are + # sufficient + last if $testname =~ /overflow/; + + foreach my $disallow_flag (0, $disallow_flags) { + my $partial = substr($bytes, 0, $j); + my $ret_should_be; + my $comment; + if ($disallow_flag) { + $ret_should_be = 0; + $comment = "disallowed"; + if ($j < $needed_to_discern_len) { + $ret_should_be = 1; + $comment .= ", but need $needed_to_discern_len bytes" + . " to discern:"; + } + } + else { + $ret_should_be = 1; + $comment = "allowed"; + } + + undef @warnings; + + $ret = test_is_utf8_valid_partial_char_flags($partial, $j, + $disallow_flag); + is($ret, $ret_should_be, + "$testname: is_utf8_valid_partial_char_flags(" + . display_bytes($partial) + . "), $comment: returns $ret_should_be"); + is(scalar @warnings, 0, + "$testname: is_utf8_valid_partial_char_flags()" + . " generated no warnings") + or output_warnings(@warnings); + } + } + } + + # This is more complicated than the malformations tested earlier, as there + # are several orthogonal variables involved. We test all the subclasses + # of utf8 warnings to verify they work with and without the utf8 class, + # and don't have effects on other sublass warnings + foreach my $warning ('utf8', 'surrogate', 'nonchar', 'non_unicode') { + foreach my $warn_flag (0, $warn_flags) { + foreach my $disallow_flag (0, $disallow_flags) { + foreach my $do_warning (0, 1) { + + # We try each of the above with various combinations of + # malformations that can occur on the same input sequence. + foreach my $short ("", "short") { + foreach my $unexpected_noncont ("", + "unexpected non-continuation") + { + foreach my $overlong ("", "overlong") { + + # If we're already at the longest possible, we + # can't create an overlong (which would be longer) + # can't handle anything larger. + next if $overlong && $expected_len >= $::max_bytes; + + my @malformations; + my @expected_errors; + push @malformations, $short if $short; + push @malformations, $unexpected_noncont + if $unexpected_noncont; + push @malformations, $overlong if $overlong; + + # The overflow malformation test in the input + # array is coerced into being treated like one of + # the others. + if ($will_overflow) { + push @malformations, 'overflow'; + push @expected_errors, $::UTF8_GOT_OVERFLOW; + } + + my $malformations_name = join "/", @malformations; + $malformations_name .= " malformation" + if $malformations_name; + $malformations_name .= "s" if @malformations > 1; + my $this_bytes = $bytes; + my $this_length = $length; + my $expected_uv = $allowed_uv; + my $this_expected_len = $expected_len; + my $this_needed_to_discern_len = $needed_to_discern_len; + if ($malformations_name) { + $expected_uv = 0; + + # Coerce the input into the desired + # malformation + if ($malformations_name =~ /overlong/) { + + # For an overlong, we convert the original + # start byte into a continuation byte with + # the same data bits as originally. ... + substr($this_bytes, 0, 1) + = start_byte_to_cont(substr($this_bytes, + 0, 1)); + + # ... Then we prepend it with a known + # overlong sequence. This should evaluate + # to the exact same code point as the + # original. + $this_bytes + = I8_to_native("\xff") + . (I8_to_native(chr $::first_continuation) + x ( $::max_bytes - 1 - length($this_bytes))) + . $this_bytes; + $this_length = length($this_bytes); + $this_needed_to_discern_len + = $::max_bytes - ($this_expected_len + - $this_needed_to_discern_len); + $this_expected_len = $::max_bytes; + push @expected_errors, $::UTF8_GOT_LONG; + } + if ($malformations_name =~ /short/) { + + # Just tell the test to not look far + # enough into the input. + $this_length--; + $this_expected_len--; + push @expected_errors, $::UTF8_GOT_SHORT; + } + if ($malformations_name + =~ /non-continuation/) + { + # Change the final continuation byte into + # a non one. + my $pos = ($short) ? -2 : -1; + substr($this_bytes, $pos, 1) = '?'; + $this_expected_len--; + push @expected_errors, + $::UTF8_GOT_NON_CONTINUATION; + } + } + + my $eval_warn = $do_warning + ? "use warnings '$warning'" + : $warning eq "utf8" + ? "no warnings 'utf8'" + : ( "use warnings 'utf8';" + . " no warnings '$warning'"); + + # Is effectively disallowed if we've set up a + # malformation, even if the flag indicates it is + # allowed. Fix up test name to indicate this as + # well + my $disallowed = $disallow_flag + || $malformations_name; + my $this_name = "utf8n_to_uvchr_error() $testname: " + . (($disallow_flag) + ? 'disallowed' + : $disallowed + ? $disallowed + : 'allowed'); + $this_name .= ", $eval_warn"; + $this_name .= ", " . (($warn_flag) + ? 'with warning flag' + : 'no warning flag'); + + undef @warnings; + my $ret_ref; + my $display_bytes = display_bytes($this_bytes); + my $call = " Call was: $eval_warn; \$ret_ref" + . " = test_utf8n_to_uvchr_error(" + . "'$display_bytes', $this_length," + . "$warn_flag" + . "|$disallow_flag)"; + my $eval_text = "$eval_warn; \$ret_ref" + . " = test_utf8n_to_uvchr_error(" + . "'$this_bytes'," + . " $this_length, $warn_flag" + . "|$disallow_flag)"; + eval "$eval_text"; + if (! ok ("$@ eq ''", + "$this_name: eval succeeded")) + { + diag "\$!='$!'; eval'd=\"$call\""; + next; + } + if ($disallowed) { + is($ret_ref->[0], 0, "$this_name: Returns 0") + or diag $call; + } + else { + is($ret_ref->[0], $expected_uv, + "$this_name: Returns expected uv: " + . sprintf("0x%04X", $expected_uv)) + or diag $call; + } + is($ret_ref->[1], $this_expected_len, + "$this_name: Returns expected length:" + . " $this_expected_len") + or diag $call; + + my $errors = $ret_ref->[2]; + + for (my $i = @expected_errors - 1; $i >= 0; $i--) { + if (ok($expected_errors[$i] & $errors, + "Expected and got error bit return" + . " for $malformations[$i] malformation")) + { + $errors &= ~$expected_errors[$i]; + } + splice @expected_errors, $i, 1; + } + is(scalar @expected_errors, 0, + "Got all the expected malformation errors") + or diag Dumper \@expected_errors; + + if ( $this_expected_len >= $this_needed_to_discern_len + && ($warn_flag || $disallow_flag)) + { + is($errors, $expected_error_flags, + "Got the correct error flag") + or diag $call; + } + else { + is($errors, 0, "Got no other error flag"); + } + + if (@malformations) { + if (! $do_warning && $warning eq 'utf8') { + goto no_warnings_expected; + } + + # Check that each malformation generates a + # warning, removing that warning if found + MALFORMATION: + foreach my $malformation (@malformations) { + foreach (my $i = 0; $i < @warnings; $i++) { + if ($warnings[$i] =~ /$malformation/) { + pass("Expected and got" + . "'$malformation' warning"); + splice @warnings, $i, 1; + next MALFORMATION; + } + } + fail("Expected '$malformation' warning" + . " but didn't get it"); + + } + } + + # Any overflow will override any super or above-31 + # warnings. + goto no_warnings_expected + if $will_overflow || $this_expected_len + < $this_needed_to_discern_len; + + if ( ! $do_warning + && ( $warning eq 'utf8' + || $warning eq $category)) + { + goto no_warnings_expected; + } + elsif ($warn_flag) { + if (is(scalar @warnings, 1, + "$this_name: Got a single warning ")) + { + like($warnings[0], $message, + "$this_name: Got expected warning") + or diag $call; + } + else { + diag $call; + if (scalar @warnings) { + output_warnings(@warnings); + } + } + } + else { + no_warnings_expected: + unless (is(scalar @warnings, 0, + "$this_name: Got no warnings")) + { + diag $call; + output_warnings(@warnings); + } + } + + # Check CHECK_ONLY results when the input is + # disallowed. Do this when actually disallowed, + # not just when the $disallow_flag is set + if ($disallowed) { + undef @warnings; + $ret_ref = test_utf8n_to_uvchr_error( + $this_bytes, $this_length, + $disallow_flag|$::UTF8_CHECK_ONLY); + is($ret_ref->[0], 0, + "$this_name, CHECK_ONLY: Returns 0") + or diag $call; + is($ret_ref->[1], -1, + "$this_name: CHECK_ONLY: returns -1 for length") + or diag $call; + if (! is(scalar @warnings, 0, + "$this_name, CHECK_ONLY: no warnings" + . " generated")) + { + diag $call; + output_warnings(@warnings); + } + } + + # Now repeat some of the above, but for + # uvchr_to_utf8_flags(). Since this comes from an + # existing code point, it hasn't overflowed, and + # isn't malformed. + next if @malformations; + + # The warning and disallow flags passed in are for + # utf8n_to_uvchr_error(). Convert them for + # uvchr_to_utf8_flags(). + my $uvchr_warn_flag = 0; + my $uvchr_disallow_flag = 0; + if ($warn_flag) { + if ($warn_flag == $::UTF8_WARN_SURROGATE) { + $uvchr_warn_flag = $::UNICODE_WARN_SURROGATE + } + elsif ($warn_flag == $::UTF8_WARN_NONCHAR) { + $uvchr_warn_flag = $::UNICODE_WARN_NONCHAR + } + elsif ($warn_flag == $::UTF8_WARN_SUPER) { + $uvchr_warn_flag = $::UNICODE_WARN_SUPER + } + elsif ($warn_flag == $::UTF8_WARN_ABOVE_31_BIT) { + $uvchr_warn_flag + = $::UNICODE_WARN_ABOVE_31_BIT; + } + else { + fail(sprintf "Unexpected warn flag: %x", + $warn_flag); + next; + } + } + if ($disallow_flag) { + if ($disallow_flag == $::UTF8_DISALLOW_SURROGATE) + { + $uvchr_disallow_flag + = $::UNICODE_DISALLOW_SURROGATE; + } + elsif ($disallow_flag == $::UTF8_DISALLOW_NONCHAR) + { + $uvchr_disallow_flag + = $::UNICODE_DISALLOW_NONCHAR; + } + elsif ($disallow_flag == $::UTF8_DISALLOW_SUPER) { + $uvchr_disallow_flag + = $::UNICODE_DISALLOW_SUPER; + } + elsif ($disallow_flag + == $::UTF8_DISALLOW_ABOVE_31_BIT) + { + $uvchr_disallow_flag = + $::UNICODE_DISALLOW_ABOVE_31_BIT; + } + else { + fail(sprintf "Unexpected disallow flag: %x", + $disallow_flag); + next; + } + } + + $disallowed = $uvchr_disallow_flag; + + $this_name = "uvchr_to_utf8_flags() $testname: " + . (($uvchr_disallow_flag) + ? 'disallowed' + : ($disallowed) + ? 'ABOVE_31_BIT allowed' + : 'allowed'); + $this_name .= ", $eval_warn"; + $this_name .= ", " . (($uvchr_warn_flag) + ? 'with warning flag' + : 'no warning flag'); + + undef @warnings; + my $ret; + my $warn_flag = sprintf "0x%x", $uvchr_warn_flag; + my $disallow_flag = sprintf "0x%x", + $uvchr_disallow_flag; + $call = sprintf(" Call was: $eval_warn; \$ret" + . " = test_uvchr_to_utf8_flags(" + . " 0x%x, $warn_flag|$disallow_flag)", + $allowed_uv); + $eval_text = "$eval_warn; \$ret =" + . " test_uvchr_to_utf8_flags(" + . "$allowed_uv, $warn_flag|" + . "$disallow_flag)"; + eval "$eval_text"; + if (! ok ("$@ eq ''", "$this_name: eval succeeded")) + { + diag "\$!='$!'; eval'd=\"$eval_text\""; + next; + } + if ($disallowed) { + is($ret, undef, "$this_name: Returns undef") + or diag $call; + } + else { + is($ret, $bytes, "$this_name: Returns expected string") + or diag $call; + } + if (! $do_warning + && ($warning eq 'utf8' || $warning eq $category)) + { + if (!is(scalar @warnings, 0, + "$this_name: No warnings generated")) + { + diag $call; + output_warnings(@warnings); + } + } + elsif ( $uvchr_warn_flag + && ( $warning eq 'utf8' + || $warning eq $category)) + { + if (is(scalar @warnings, 1, + "$this_name: Got a single warning ")) + { + like($warnings[0], $message, + "$this_name: Got expected warning") + or diag $call; + } + else { + diag $call; + output_warnings(@warnings) + if scalar @warnings; + } + } + } + } + } + } + } + } + } +} + +done_testing; |