diff options
-rw-r--r-- | lib/Unicode/UCD.pm | 92 | ||||
-rw-r--r-- | lib/Unicode/UCD.t | 117 | ||||
-rw-r--r-- | lib/unicore/mktables | 118 | ||||
-rw-r--r-- | pod/perldelta.pod | 43 | ||||
-rw-r--r-- | pod/perldiag.pod | 57 | ||||
-rw-r--r-- | pod/perlrecharclass.pod | 18 | ||||
-rw-r--r-- | pod/perlunicode.pod | 140 | ||||
-rw-r--r-- | regcomp.c | 53 | ||||
-rw-r--r-- | regexec.c | 2 | ||||
-rw-r--r-- | t/lib/warnings/utf8 | 451 | ||||
-rw-r--r-- | t/porting/diag.t | 1 | ||||
-rw-r--r-- | t/re/pat.t | 10 |
12 files changed, 439 insertions, 663 deletions
diff --git a/lib/Unicode/UCD.pm b/lib/Unicode/UCD.pm index 14752ae2b1..e4ae34e270 100644 --- a/lib/Unicode/UCD.pm +++ b/lib/Unicode/UCD.pm @@ -5,7 +5,7 @@ use warnings; no warnings 'surrogate'; # surrogates can be inputs to this use charnames (); -our $VERSION = '0.55'; +our $VERSION = '0.56'; require Exporter; @@ -2138,21 +2138,10 @@ too high for some operations to work; you may wish to use a smaller number for your purposes.) Note that the inversion lists returned by this function can possibly include -non-Unicode code points, that is anything above 0x10FFFF. This is in -contrast to Perl regular expression matches on those code points, in which a -non-Unicode code point always fails to match. For example, both of these have -the same result: - - chr(0x110000) =~ \p{ASCII_Hex_Digit=True} # Fails. - chr(0x110000) =~ \p{ASCII_Hex_Digit=False} # Fails! - -And both raise a warning that a Unicode property is being used on a -non-Unicode code point. It is arguable as to which is the correct thing to do -here. This function has chosen the way opposite to the Perl regular -expression behavior. This allows you to easily flip to the Perl regular -expression way (for you to go in the other direction would be far harder). -Simply add 0x110000 at the end of the non-empty returned list if it isn't -already that value; and pop that value if it is; like: +non-Unicode code points, that is anything above 0x10FFFF. Unicode properties +are not defined on such code points. You might wish to change the output to +not include these. Simply add 0x110000 at the end of the non-empty returned +list if it isn't already that value; and pop that value if it is; like: my @list = prop_invlist("foo"); if (@list) { @@ -2261,19 +2250,18 @@ sub prop_invlist ($;$) { if (defined $hex_end) { # The next item starts with the code point 1 # beyond the end of the range. 
- push @invlist, hex($hex_end) + 1; + no warnings 'portable'; + my $end = hex $hex_end; + last if $end == $Unicode::UCD::MAX_CP; + push @invlist, $end + 1; } else { # No end of range, is a single code point. push @invlist, $begin + 1; } } - require "unicore/UCD.pl"; - my $FIRST_NON_UNICODE = $MAX_UNICODE_CODEPOINT + 1; - # Could need to be inverted: add or subtract a 0 at the beginning of the - # list. And to keep it from matching non-Unicode, add or subtract the - # first non-unicode code point. + # list. if ($swash->{'INVERT_IT'}) { if (@invlist && $invlist[0] == 0) { shift @invlist; @@ -2281,46 +2269,6 @@ sub prop_invlist ($;$) { else { unshift @invlist, 0; } - if (@invlist && $invlist[-1] == $FIRST_NON_UNICODE) { - pop @invlist; - } - else { - push @invlist, $FIRST_NON_UNICODE; - } - } - - # Here, the list is set up to include only Unicode code points. But, if - # the table is the default one for the property, it should contain all - # non-Unicode code points. First calculate the loose name for the - # property. This is done even for strict-name properties, as the data - # structure that mktables generates for us is set up so that we don't have - # to worry about that. The property-value needs to be split if compound, - # as the loose rules need to be independently calculated on each part. We - # know that it is syntactically valid, or SWASHNEW would have failed. - - $prop = lc $prop; - my ($prop_only, $table) = split /\s*[:=]\s*/, $prop; - if ($table) { - - # May have optional prefixed 'is' - $prop = utf8::_loose_name($prop_only) =~ s/^is//r; - $prop = $utf8::loose_property_name_of{$prop}; - $prop .= "=" . utf8::_loose_name($table); - } - else { - $prop = utf8::_loose_name($prop); - } - if (exists $loose_defaults{$prop}) { - - # Here, is the default table. If a range ended with 10ffff, instead - # continue that range to infinity, by popping the 110000; otherwise, - # add the range from 11000 to infinity - if (! 
@invlist || $invlist[-1] != $FIRST_NON_UNICODE) { - push @invlist, $FIRST_NON_UNICODE; - } - else { - pop @invlist; - } } return @invlist; @@ -2349,8 +2297,8 @@ or even better, C<"Gc=LC">). Many Unicode properties have more than one name (or alias). C<prop_invmap> understands all of these, including Perl extensions to them. Ambiguities are resolved as described above for L</prop_aliases()>. The Perl internal -property "Perl_Decimal_Digit, described below, is also accepted. C<undef> is -returned if the property name is unknown. +property "Perl_Decimal_Digit, described below, is also accepted. An empty +list is returned if the property name is unknown. See L<perluniprops/Properties accessible through Unicode::UCD> for the properties acceptable as inputs to this function. @@ -3252,6 +3200,7 @@ RETRY: # Find the beginning and end of the range on the line my ($hex_begin, $hex_end, $map) = split "\t", $range; my $begin = hex $hex_begin; + no warnings 'portable'; my $end = (defined $hex_end && $hex_end ne "") ? hex $hex_end : $begin; @@ -3375,7 +3324,7 @@ RETRY: # to the default value. If there is no gap, the next iteration will # pop this, unless there is no next iteration, and we have filled all # of the Unicode code space, so check for that and skip. - if ($end < $MAX_UNICODE_CODEPOINT) { + if ($end < $Unicode::UCD::MAX_CP) { push @invlist, $end + 1; push @invmap, $missing; } @@ -3388,10 +3337,15 @@ RETRY: push @invmap, $missing; } - # And add in standard element that all non-Unicode code points map to: - # $missing - push @invlist, $MAX_UNICODE_CODEPOINT + 1; - push @invmap, $missing; + # The final element is always for just the above-Unicode code points. If + # not already there, add it. It merely splits the current final range + # that extends to infinity into two elements, each with the same map. + # (This is to conform with the API that says the final element is for + # $MAX_UNICODE_CODEPOINT + 1 .. INFINITY.) 
+ if ($invlist[-1] != $MAX_UNICODE_CODEPOINT + 1) { + push @invmap, $invmap[-1]; + push @invlist, $MAX_UNICODE_CODEPOINT + 1; + } # The second component of the map are those values that require # non-standard specification, stored in SPECIALS. These override any diff --git a/lib/Unicode/UCD.t b/lib/Unicode/UCD.t index c4b5a85098..b2caf8934c 100644 --- a/lib/Unicode/UCD.t +++ b/lib/Unicode/UCD.t @@ -1058,25 +1058,12 @@ foreach my $set_of_tables (\%utf8::stricter_to_file_of, \%utf8::loose_to_file_of # If we are to test against an inverted file, it is easier to invert # our array than the file. - # The file only is valid for Unicode code points, while the inversion - # list is valid for all possible code points. Therefore, we must test - # just the Unicode part against the file. Later we will test for - # the non-Unicode part. - - my $before_invert; # Saves the pre-inverted table. if ($invert) { - $before_invert = dclone \@tested; if (@tested && $tested[0] == 0) { shift @tested; } else { unshift @tested, 0; } - if (@tested && $tested[-1] == 0x110000) { - pop @tested; - } - else { - push @tested, 0x110000; - } } # Now construct a string from the list that should match the file. @@ -1091,9 +1078,11 @@ foreach my $set_of_tables (\%utf8::stricter_to_file_of, \%utf8::loose_to_file_of # otherwise don't get reflected in the file. my $tested = ""; my $i = 0; - for (; $i < @tested - 1; $i += 2) { + for (; $i < @tested; $i += 2) { my $start = $tested[$i]; - my $end = $tested[$i+1] - 1; + my $end = ($i + 1 < @tested) + ? $tested[$i+1] - 1 + : $Unicode::UCD::MAX_CP; if ($start == $end) { $tested .= sprintf("%X\n", $start); } @@ -1102,12 +1091,6 @@ foreach my $set_of_tables (\%utf8::stricter_to_file_of, \%utf8::loose_to_file_of } } - # As mentioned earlier, the disk files only go up through Unicode, - # whereas the prop_invlist() ones go as high as necessary. The - # comparison is only valid through max Unicode. 
- if ($i == @tested - 1 && $tested[$i] <= 0x10FFFF) { - $tested .= sprintf("%X\t10FFFF\n", $tested[$i]); - } local $/ = "\n"; chomp $tested; $/ = $input_record_separator; @@ -1116,50 +1099,6 @@ foreach my $set_of_tables (\%utf8::stricter_to_file_of, \%utf8::loose_to_file_of next; } - # Here, it matched the table. Now need to check for if it is correct - # for beyond Unicode. First, calculate if is the default table or - # not. This is the same algorithm as used internally in - # prop_invlist(), so if it is wrong there, this test won't catch it. - my $prop = lc $table; - ($prop_only, $table) = split /\s*[:=]\s*/, $prop; - if (defined $table) { - - # May have optional prefixed 'is' - $prop = &utf8::_loose_name($prop_only) =~ s/^is//r; - $prop = $utf8::loose_property_name_of{$prop}; - $prop .= "=" . &utf8::_loose_name($table); - } - else { - $prop = &utf8::_loose_name($prop); - } - my $is_default = exists $Unicode::UCD::loose_defaults{$prop}; - - @tested = @$before_invert if $invert; # Use the original - if (@tested % 2 == 0) { - - # If there are an even number of elements, the final one starts a - # range (going to infinity) of code points that are not in the - # list. - if ($is_default) { - fail("prop_invlist('$mod_table')"); - diag("default table doesn't goto infinity"); - use Data::Dumper; - diag Dumper \@tested; - next; - } - } - else { - # An odd number of elements means the final one starts a range - # (going to infinity of code points that are in the list. - if (! 
$is_default) { - fail("prop_invlist('$mod_table')"); - diag("non-default table needs to stop in the Unicode range"); - use Data::Dumper; - diag Dumper \@tested; - next; - } - } - pass("prop_invlist('$mod_table')"); } } @@ -1391,7 +1330,35 @@ foreach my $prop (sort(keys %props), sort keys %legacy_props) { diag("The last inversion list element is not 0x110000"); next PROPERTY; } - if ($invmap_ref->[-1] ne $missing) { + + my $upper_limit_subtract; + + # prop_invmap() adds an extra element not present in the disk files for + # the above-Unicode code points. For almost all properties, that will be + # to $missing. In that case we don't look further at it when comparing + # with the disk files. + if ($invmap_ref->[-1] eq $missing) { + $upper_limit_subtract = 1; + } + elsif ($invmap_ref->[-1] eq 'Y' && ! grep { $_ !~ /[YN]/ } @$invmap_ref) { + + # But that's not true for a few binary properties like 'Unassigned' + # that are Perl extensions (in this case for Gc=Unassigned) which + # match above-Unicode code points (hence the 'Y' in the test above). + # For properties where it isn't $missing, we're going to want to look + # at the whole thing when comparing with the disk file. + $upper_limit_subtract = 0; + + # In those properties like 'Unassigned, the final element should be + # just a repetition of the next-to-last element, and won't be in the + # disk file, so remove it for the comparison. Otherwise, we will + # compare the whole of the array with the whole of the disk file. 
+ if ($invlist_ref->[-2] <= 0x10FFFF && $invmap_ref->[-2] eq 'Y') { + pop @$invlist_ref; + pop @$invmap_ref; + } + } + else { fail("prop_invmap('$display_prop')"); diag("The last inversion list element is '$invmap_ref->[-1]', and should be '$missing'"); next PROPERTY; @@ -1705,9 +1672,10 @@ foreach my $prop (sort(keys %props), sort keys %legacy_props) { # it's an error my %specials = %$specials_ref if $specials_ref; - # The extra -1 is because the final element has been tested above to - # be for anything above Unicode. The file doesn't go that high. - for (my $i = 0; $i < @$invlist_ref - 1; $i++) { + # The extra -$upper_limit_subtract is because the final element may + # have been tested above to be for anything above Unicode, in which + # case the file may not go that high. + for (my $i = 0; $i < @$invlist_ref - $upper_limit_subtract; $i++) { # If the map element is a reference, have to stringify it (but # don't do so if the format doesn't allow references, so that an @@ -1899,7 +1867,9 @@ foreach my $prop (sort(keys %props), sort keys %legacy_props) { # Finally have figured out what the map column in the file should # be. Append the line to the running string. my $start = $invlist_ref->[$i]; - my $end = $invlist_ref->[$i+1] - 1; + my $end = (defined $invlist_ref->[$i+1]) + ? $invlist_ref->[$i+1] - 1 + : $Unicode::UCD::MAX_CP; $end = ($start == $end) ? "" : sprintf($file_range_format, $end); if ($invmap_ref->[$i] ne "") { $tested_map .= sprintf "$file_range_format\t%s\t%s\n", @@ -1999,7 +1969,7 @@ foreach my $prop (sort(keys %props), sort keys %legacy_props) { my @code_point_in_names = @Unicode::UCD::code_points_ending_in_code_point; - for my $i (0 .. @$invlist_ref - 1 - 1) { + for my $i (0 .. 
@$invlist_ref - 1 - $upper_limit_subtract) { my $start = $invlist_ref->[$i]; my $end = $invlist_ref->[$i+1] - 1; if ($invmap_ref->[$i] eq $missing) { @@ -2105,10 +2075,7 @@ foreach my $prop (sort(keys %props), sort keys %legacy_props) { my %maps; my $previous_map; - # (The extra -1 is to not look at the final element in the loop, which - # we know is the one that starts just beyond Unicode and goes to - # infinity.) - for my $i (0 .. @$invlist_ref - 1 - 1) { + for my $i (0 .. @$invlist_ref - 1 - $upper_limit_subtract) { my $range_start = $invlist_ref->[$i]; # Because we are sorting into buckets, things could be diff --git a/lib/unicore/mktables b/lib/unicore/mktables index f3d5c83d58..b94433fe2c 100644 --- a/lib/unicore/mktables +++ b/lib/unicore/mktables @@ -1200,6 +1200,18 @@ my $MAX_UNICODE_CODEPOINT_STRING = "10FFFF"; my $MAX_UNICODE_CODEPOINT = hex $MAX_UNICODE_CODEPOINT_STRING; my $MAX_UNICODE_CODEPOINTS = $MAX_UNICODE_CODEPOINT + 1; +# We work with above-Unicode code points, up to UV_MAX. But when you get +# that high, above IV_MAX, some operations don't work, and you can easily get +# overflow. Therefore for internal use, we use a much smaller number, +# translating it to UV_MAX only for output. The exact number is immaterial +# (all Unicode code points are treated exactly the same), but the algorithm +# requires it to be at least 2 * $MAX_UNICODE_CODEPOINTS + 1; +my $MAX_WORKING_CODEPOINTS= $MAX_UNICODE_CODEPOINT * 8; +my $MAX_WORKING_CODEPOINT = $MAX_WORKING_CODEPOINTS - 1; +my $MAX_WORKING_CODEPOINT_STRING = sprintf("%X", $MAX_WORKING_CODEPOINT); + +my $MAX_PLATFORM_CODEPOINT = ~0; + # Matches legal code point. 4-6 hex numbers, If there are 6, the first # two must be 10; if there are 5, the first must not be a 0. Written this way # to decrease backtracking. 
The first regex allows the code point to be at @@ -1531,7 +1543,8 @@ my $UNASSIGNED_TYPE = -2; my $PRIVATE_USE_TYPE = -3; my $NONCHARACTER_TYPE = -4; my $CONTROL_TYPE = -5; -my $UNKNOWN_TYPE = -6; # Used only if there is a bug in this program +my $ABOVE_UNICODE_TYPE = -6; +my $UNKNOWN_TYPE = -7; # Used only if there is a bug in this program sub populate_char_info ($) { # Used only with the $annotate option. Populates the arrays with the @@ -1562,7 +1575,13 @@ sub populate_char_info ($) { my $end; if (! $viacode[$i]) { my $nonchar; - if ($gc-> table('Private_use')->contains($i)) { + if ($i > $MAX_UNICODE_CODEPOINT) { + $viacode[$i] = 'Above-Unicode'; + $annotate_char_type[$i] = $ABOVE_UNICODE_TYPE; + $printable[$i] = 0; + $end = $MAX_WORKING_CODEPOINT; + } + elsif ($gc-> table('Private_use')->contains($i)) { $viacode[$i] = 'Private Use'; $annotate_char_type[$i] = $PRIVATE_USE_TYPE; $printable[$i] = 0; @@ -1715,7 +1734,15 @@ sub clarify_code_point_count ($) { # This is like clarify_number(), but the input is assumed to be a count of # code points, rather than a generic number. - return clarify_number(shift); + my $append = ""; + + my $number = shift; + if ($number > $MAX_UNICODE_CODEPOINTS) { + $number -= ($MAX_WORKING_CODEPOINTS - $MAX_UNICODE_CODEPOINTS); + return "All above-Unicode code points" if $number == 0; + $append = " + all above-Unicode code points"; + } + return clarify_number($number) . $append; } package Carp; @@ -3450,7 +3477,7 @@ sub trace { return main::trace(@_); } # If the range list is empty, return a large value that isn't adjacent # to any that could be in the range list, for simpler tests - return $MAX_UNICODE_CODEPOINT + 2 unless scalar @{$ranges{$addr}}; + return $MAX_WORKING_CODEPOINT + 2 unless scalar @{$ranges{$addr}}; return $ranges{$addr}->[0]->start; } @@ -3729,9 +3756,6 @@ sub trace { return main::trace(@_); } Carp::my_carp_bug("$owner_name_of{$addr}End of range (" . sprintf("%04X", $end) . ") must not be before start (" . 
sprintf("%04X", $start) . "). No action taken."); return; } - if ($end > $MAX_UNICODE_CODEPOINT && $operation eq '+') { - Carp::my_carp("$owner_name_of{$addr}Warning: Range '" . sprintf("%04X..%04X", $start, $end) . ") is above the Unicode maximum of " . sprintf("%04X", $MAX_UNICODE_CODEPOINT) . ". Adding it anyway"); - } #local $to_trace = 1 if main::DEBUG; if ($operation eq '-') { @@ -4529,8 +4553,8 @@ sub trace { return main::trace(@_); } # And finally, add the gap from the end of the table to the max # possible code point - if ($max < $MAX_UNICODE_CODEPOINT) { - $new->add_range($max + 1, $MAX_UNICODE_CODEPOINT); + if ($max < $MAX_WORKING_CODEPOINT) { + $new->add_range($max + 1, $MAX_WORKING_CODEPOINT); } return $new; } @@ -4819,6 +4843,7 @@ sub trace { return main::trace(@_); } # range. my $end = $set->end; return $end if is_code_point_usable($end, $try_hard); + $end = $MAX_UNICODE_CODEPOINT + 1 if $end > $MAX_UNICODE_CODEPOINT; # End point didn't, work. Start at the beginning and try # every one until find one that does work. 
@@ -5722,6 +5747,7 @@ END my $next_end; my $next_value; my $offset = 0; + my $invlist_count = 0; my $output_value_in_hex = $self->isa('Map_Table') && ($self->format eq $HEX_ADJUST_FORMAT @@ -5855,9 +5881,16 @@ END # If there is a range if ($start != $end) { - push @OUT, sprintf "$hex_format\t$hex_format", - $start, $end; - if ($value ne "") { + if ($end == $MAX_WORKING_CODEPOINT) { + push @OUT, sprintf "$hex_format\t$hex_format", + $start, + $MAX_PLATFORM_CODEPOINT; + } + else { + push @OUT, sprintf "$hex_format\t$hex_format", + $start, $end; + } + if (length $value) { if ($convert_map_to_from_hex) { $OUT[-1] .= sprintf "\t$hex_format\n", $value; } @@ -5958,8 +5991,15 @@ END } if ($i != $start || $range_end < $end) { - $annotation = sprintf "%04X..%04X", - $i, $range_end; + if ($range_end < $MAX_WORKING_CODEPOINT) + { + $annotation = sprintf "%04X..%04X", + $i, $range_end; + } + else { + $annotation = sprintf "%04X..INFINITY", + $i; + } } else { # Indent if not displaying code points $annotation = " " x 4; @@ -7696,7 +7736,18 @@ END # Get the number of code points matched by each of the tables in this # file, and add underscores for clarity. my $count = $leader->count; - my $string_count = main::clarify_code_point_count($count); + my $unicode_count; + my $non_unicode_string; + if ($count > $MAX_UNICODE_CODEPOINTS) { + $unicode_count = $count - ($MAX_WORKING_CODEPOINT + - $MAX_UNICODE_CODEPOINT); + $non_unicode_string = "All above-Unicode code points match as well, and are also returned"; + } + else { + $unicode_count = $count; + $non_unicode_string = ""; + } + my $string_count = main::clarify_code_point_count($unicode_count); my $loose_count = 0; # how many aliases loosely matched my $compound_name = ""; # ? 
Are any names compound?, and if so, an @@ -7894,11 +7945,13 @@ END } } # End of looping through all tables + $matches_comment .= "\n$non_unicode_string\n" if $non_unicode_string; + my $code_points; my $match; my $any_of_these; - if ($count == 1) { + if ($unicode_count == 1) { $match = 'matches'; $code_points = 'single code point'; } @@ -12999,7 +13052,7 @@ END # This fills in any missing values with the default. It's not # necessary to do this with binary properties, as the default # is defined completely in terms of the Y table. - $property->add_map(0, $MAX_UNICODE_CODEPOINT, + $property->add_map(0, $MAX_WORKING_CODEPOINT, $default_map, Replace => $NO); } } @@ -13211,8 +13264,9 @@ sub compile_perl() { # 'All' is all code points. As an error check, instead of just setting it # to be that, construct it to be the union of all the major categories $All = $perl->add_match_table('All', - Description => "[\\x{0000}-\\x{$MAX_UNICODE_CODEPOINT_STRING}]", - Matches_All => 1); + Description + => "All code points, including those above Unicode. Same as qr/./s", + Matches_All => 1); foreach my $major_table ($gc->tables) { @@ -13222,10 +13276,10 @@ sub compile_perl() { $All += $major_table; } - if ($All->max != $MAX_UNICODE_CODEPOINT) { + if ($All->max != $MAX_WORKING_CODEPOINT) { Carp::my_carp_bug("Generated highest code point (" . sprintf("%X", $All->max) - . ") doesn't match expected value $MAX_UNICODE_CODEPOINT_STRING.") + . 
") doesn't match expected value $MAX_WORKING_CODEPOINT_STRING.") } if ($All->range_count != 1 || $All->min != 0) { Carp::my_carp_bug("Generated table 'All' doesn't match all code points.") @@ -14284,8 +14338,9 @@ END $unassigned_sans_noncharacters &= $nonchars->table('N'); } - for (my $i = 0; $i <= $MAX_UNICODE_CODEPOINT; $i++ ) { + for (my $i = 0; $i <= $MAX_UNICODE_CODEPOINT + 1; $i++ ) { $i = populate_char_info($i); # Note sets $i so may cause skips + } } @@ -14937,7 +14992,18 @@ sub make_re_pod_entries($) { my $full_name = $property->full_name; my $count = $input_table->count; - my $string_count = clarify_number($count); + my $unicode_count; + my $non_unicode_string; + if ($count > $MAX_UNICODE_CODEPOINTS) { + $unicode_count = $count - ($MAX_WORKING_CODEPOINT + - $MAX_UNICODE_CODEPOINT); + $non_unicode_string = " plus all above-Unicode code points"; + } + else { + $unicode_count = $count; + $non_unicode_string = ""; + } + my $string_count = clarify_number($unicode_count) . $non_unicode_string; my $status = $input_table->status; my $status_info = $input_table->status_info; my $caseless_equivalent = $input_table->caseless_equivalent; @@ -16840,13 +16906,13 @@ sub write_all_tables() { my $count = $table->count; if ($expected_full) { - if ($count != $MAX_UNICODE_CODEPOINTS) { + if ($count != $MAX_WORKING_CODEPOINTS) { Carp::my_carp("$table matches only " . clarify_number($count) . " Unicode code points but should match " - . clarify_number($MAX_UNICODE_CODEPOINTS) + . clarify_number($MAX_WORKING_CODEPOINTS) . " (off by " - . clarify_number(abs($MAX_UNICODE_CODEPOINTS - $count)) + . clarify_number(abs($MAX_WORKING_CODEPOINTS - $count)) . "). Proceeding anyway."); } diff --git a/pod/perldelta.pod b/pod/perldelta.pod index 96e9163075..e36ae85fcc 100644 --- a/pod/perldelta.pod +++ b/pod/perldelta.pod @@ -40,19 +40,40 @@ L</Selected Bug Fixes> section. 
=head1 Incompatible Changes -XXX For a release on a stable branch, this section aspires to be: - - There are no changes intentionally incompatible with 5.XXX.XXX - If any exist, they are bugs, and we request that you submit a - report. See L</Reporting Bugs> below. - -[ List each incompatible change as a =head2 entry ] - =head2 C<do> can no longer be used to call subroutines The C<do SUBROUTINE(LIST)> form has resulted in a deprecation warning since Perl v5.0.0, and is now a syntax error. +=head2 C<\p{}>, C<\P{}> matching has changed for non-Unicode code +points. + +C<\p{}> and C<\P{}> are defined by Unicode only on Unicode-defined code +points (C<U+0000> through C<U+10FFFF>). Their behavior on matching +these legal Unicode code points is unchanged, but there are changes for +code points C<0x110000> and above. Previously, Perl treated the result +of matching C<\p{}> and C<\P{}> against these as C<undef>, which +translates into "false". For C<\P{}>, this was then complemented into +"true". A warning was supposed to be raised when this happened. +However, various optimizations could prevent the warning, and the +results were often counter-intuitive, with both a match and its seeming +complement being false. Now all non-Unicode code points are treated as +typical unassigned Unicode code points. This generally is more +Do-What-I-Mean. A warning is raised only if the results are arguably +different from a strict Unicode approach, and from what Perl used to do. +Code that needs to be strictly Unicode compliant can make this warning +fatal, and then Perl always raises the warning. + +Details are in L<perlunicode/Beyond Unicode code points>. + +=head2 C<\p{All}> has been expanded to match all possible code points + +The Perl-defined regular expression pattern element C<\p{All}>, unused +on CPAN, used to match just the Unicode code points; now it matches all +possible code points; that is, it is equivalent to C<qr/./s>. 
Thus +C<\p{All}> is no longer synonymous with C<\p{Any}>, which continues to +match just the Unicode code points, as Unicode says it should. + =head1 Deprecations XXX Any deprecated features, syntax, modules etc. should be listed here. @@ -200,6 +221,12 @@ XXX L<message|perldiag/"message"> XXX L<message|perldiag/"message"> +=item * + +L<Matched non-Unicode code point 0x%X against Unicode property; may not be portable|perldiag/"Matched non-Unicode code point 0x%X against Unicode property; may not be portable">. +This replaces the message "Code point 0x%X is not Unicode, all \p{} +matches fail; all \P{} matches succeed". + =back =head2 Changes to Existing Diagnostics diff --git a/pod/perldiag.pod b/pod/perldiag.pod index 0b3c096392..61d144ab2e 100644 --- a/pod/perldiag.pod +++ b/pod/perldiag.pod @@ -1426,12 +1426,9 @@ This subroutine cannot be called. (F) You had a (sub-)template that ends with a '/'. There must be another template code following the slash. See L<perlfunc/pack>. -=item Code point 0x%X is not Unicode, all \p{} matches fail; all \P{} matches -succeed - =item Code point 0x%X is not Unicode, may not be portable -(S utf8, non_unicode) You had a code point above the Unicode maximum +(S non_unicode) You had a code point above the Unicode maximum of U+10FFFF. Perl allows strings to contain a superset of Unicode code points, up @@ -1441,27 +1438,6 @@ it was legal in some standards to have code points up to 0x7FFF_FFFF, but not higher. Code points above 0xFFFF_FFFF require larger than a 32 bit word. -None of the Unicode or Perl-defined properties will match a non-Unicode -code point. For example, - - chr(0x7FF_FFFF) =~ /\p{Any}/ - -will not match, because the code point is not in Unicode. But - - chr(0x7FF_FFFF) =~ /\P{Any}/ - -will match. - -This may be counterintuitive at times, as both these fail: - - chr(0x110000) =~ /\p{ASCII_Hex_Digit=True}/ # Fails. - chr(0x110000) =~ /\p{ASCII_Hex_Digit=False}/ # Also fails! 
- -and both these succeed: - - chr(0x110000) =~ /\P{ASCII_Hex_Digit=True}/ # Succeeds. - chr(0x110000) =~ /\P{ASCII_Hex_Digit=False}/ # Also succeeds! - =item %s: Command not found (A) You've accidentally run your script through B<csh> or another shell @@ -2920,6 +2896,37 @@ rules and perl was unable to guess how to make more progress. (F) Perl thought it was reading UTF-16 encoded character data but while doing it Perl met a malformed Unicode surrogate. +=item Matched non-Unicode code point 0x%X against Unicode property; may +not be portable + +(S non_unicode) Perl allows strings to contain a superset of +Unicode code points; each code point may be as large as what is storable +in an unsigned integer on your system, but these may not be accepted by +other languages/systems. This message occurs when you matched a string +containing such a code point against a regular expression pattern, and +the code point was matched against a Unicode property, C<\p{...}> or +C<\P{...}>. Unicode properties are only defined on Unicode code points, +so the result of this match is undefined by Unicode, but Perl (starting +in v5.20) treats non-Unicode code points as if they were typical +unassigned Unicode ones, and matched this one accordingly. Whether a +given property matches these code points or not is specified in +L<perluniprops/Properties accessible through \p{} and \P{}>. + +This message is suppressed (unless it has been made fatal) if it is +immaterial to the results of the match if the code point is Unicode or +not. For example, the property C<\p{ASCII_Hex_Digit}> only can match +the 22 characters C<[0-9A-Fa-f]>, so obviously all other code points, +Unicode or not, won't match it. (And C<\P{ASCII_Hex_Digit}> will match +every code point except these 22.) + +Getting this message indicates that the outcome of the match arguably +should have been the opposite of what actually happened. 
If you think +that is the case, you may wish to make the C<non_unicode> warnings +category fatal; if you agree with Perl's decision, you may wish to turn +off this category. + +See L<perlunicode/Beyond Unicode code points> for more information. + =item %s matches null string many times in regex; marked by S<<-- HERE> in m/%s/ diff --git a/pod/perlrecharclass.pod b/pod/perlrecharclass.pod index a8ee854d15..ee033634e8 100644 --- a/pod/perlrecharclass.pod +++ b/pod/perlrecharclass.pod @@ -389,15 +389,19 @@ It is also possible to define your own properties. This is discussed in L<perlunicode/User-Defined Character Properties>. Unicode properties are defined (surprise!) only on Unicode code points. -A warning is raised and all matches fail on non-Unicode code points -(those above the legal Unicode maximum of 0x10FFFF). This can be -somewhat surprising, +Starting in v5.20, when matching against C<\p> and C<\P>, Perl treats +non-Unicode code points (those above the legal Unicode maximum of +0x10FFFF) as if they were typical unassigned Unicode code points. - chr(0x110000) =~ \p{ASCII_Hex_Digit=True} # Fails. - chr(0x110000) =~ \p{ASCII_Hex_Digit=False} # Also fails! +Prior to v5.20, Perl raised a warning and made all matches fail on +non-Unicode code points. This could be somewhat surprising: -Even though these two matches might be thought of as complements, they -are so only on Unicode code points. + chr(0x110000) =~ \p{ASCII_Hex_Digit=True} # Fails on Perls < v5.20. + chr(0x110000) =~ \p{ASCII_Hex_Digit=False} # Also fails on Perls + # < v5.20 + +Even though these two matches might be thought of as complements, until +v5.20 they were so only on Unicode code points. =head4 Examples diff --git a/pod/perlunicode.pod b/pod/perlunicode.pod index a198d00191..01b94c5604 100644 --- a/pod/perlunicode.pod +++ b/pod/perlunicode.pod @@ -371,13 +371,8 @@ of which under C</i> match C<PosixAlpha>. 
numerals, come in both upper and lower case so they are C<Cased>, but aren't considered letters, so they aren't C<Cased_Letter>s.) -The result is undefined if you try to match a non-Unicode code point -(that is, one above 0x10FFFF) against a Unicode property. Currently, a -warning is raised, and the match will fail. In some cases, this is -counterintuitive, as both these fail: - - chr(0x110000) =~ \p{ASCII_Hex_Digit=True} # Fails. - chr(0x110000) =~ \p{ASCII_Hex_Digit=False} # Fails! +See L</Beyond Unicode code points> for special considerations when +matching Unicode properties against non-Unicode code points. =head3 B<General_Category> @@ -634,8 +629,10 @@ L<Unicode Standard|http://www.unicode.org/reports/tr44>. =item B<C<\p{All}>> -This matches any of the 1_114_112 Unicode code points. It is a synonym for -C<\p{Any}>. +This matches every possible code point. It is equivalent to C<qr/./s>. +Unlike all the other non-user-defined C<\p{}> property matches, no +warning is ever generated if this property is matched against a +non-Unicode code point (see L</Beyond Unicode code points> below). =item B<C<\p{Alnum}>> @@ -643,8 +640,8 @@ This matches any C<\p{Alphabetic}> or C<\p{Decimal_Number}> character. =item B<C<\p{Any}>> -This matches any of the 1_114_112 Unicode code points. It is a synonym for -C<\p{All}>. +This matches any of the 1_114_112 Unicode code points. It is a synonym +for C<\p{Unicode}>. =item B<C<\p{ASCII}>> @@ -796,6 +793,11 @@ C<\p{General Category=Titlecase_Letter}> (C<\p{gc=lt}>). The difference is that under C</i> caseless matching, these match the same as C<\p{Cased}>, whereas C<\p{gc=lt}> matches C<\p{Cased_Letter>). +=item B<C<\p{Unicode}>> + +This matches any of the 1_114_112 Unicode code points. It is a synonym +for C<\p{Any}>. + =item B<C<\p{VertSpace}>> This is the same as C<\v>: A character that changes the spacing vertically. @@ -956,9 +958,9 @@ by two (or more) classes. 
It's important to remember not to use C<"&"> for the first set; that would be intersecting with nothing, resulting in an empty set. -(Note that official Unicode properties differ from these in that they -automatically exclude non-Unicode code points and a warning is raised if -a match is attempted on one of those.) +Unlike non-user-defined C<\p{}> property matches, no warning is ever +generated if these properties are matched against a non-Unicode code +point (see L</Beyond Unicode code points> below). =head2 User-Defined Case Mappings (for serious hackers only) @@ -1311,10 +1313,112 @@ operations on code points up through that. But Perl works on code points up to the maximum permissible unsigned number available on the platform. However, Perl will not accept these from input streams unless lax rules are being used, and will warn (using the warning category -"non_unicode", which is a sub-category of "utf8") if an attempt is made to -operate on or output them. For example, C<uc(0x11_0000)> will generate -this warning, returning the input parameter as its result, as the upper -case of every non-Unicode code point is the code point itself. +C<"non_unicode">, which is a sub-category of C<"utf8">) if any are output. + +Since Unicode rules are not defined on these code points, if a +Unicode-defined operation is done on them, Perl uses what we believe are +sensible rules, while generally warning, using the C<"non_unicode"> +category. For example, C<uc("\x{11_0000}")> will generate such a +warning, returning the input parameter as its result, since Perl defines +the uppercase of every non-Unicode code point to be the code point +itself. In fact, all the case changing operations, not just +uppercasing, work this way. + +The situation with matching Unicode properties in regular expressions, +the C<\p{}> and C<\P{}> constructs, against these code points is not as +clear cut, and how these are handled has changed as we've gained +experience. 
+ +One possibility is to treat any match against these code points as +undefined. But since Perl doesn't have the concept of a match being +undefined, it converts this to failing or C<FALSE>. This is almost, but +not quite, what Perl did from v5.14 (when use of these code points +became generally reliable) through v5.18. The difference is that Perl +treated all C<\p{}> matches as failing, but all C<\P{}> matches as +succeeding. + +One problem with this is that it leads to unexpected, and confusing +results in some cases: + + chr(0x110000) =~ \p{ASCII_Hex_Digit=True} # Failed on <= v5.18 + chr(0x110000) =~ \p{ASCII_Hex_Digit=False} # Failed! on <= v5.18 + +That is, it treated both matches as undefined, and converted that to +false (raising a warning on each). The first case is the expected +result, but the second is likely counterintuitive: "How could both be +false when they are complements?" Another problem was that the +implementation optimized many Unicode property matches down to already +existing simpler, faster operations, which don't raise the warning. We +chose to not forgo those optimizations, which help the vast majority of +matches, just to generate a warning for the unlikely event that an +above-Unicode code point is being matched against. + +As a result of these problems, starting in v5.20, what Perl does is +to treat non-Unicode code points as just typical unassigned Unicode +characters, and matches accordingly. (Note: Unicode has atypical +unassigned code points. For example, it has non-character code points, +and ones that, when they do get assigned, are destined to be written +Right-to-left, as Arabic and Hebrew are. Perl assumes that no +non-Unicode code point has any atypical properties.) + +Perl, in most cases, will raise a warning when matching an above-Unicode +code point against a Unicode property when the result is C<TRUE> for +C<\p{}>, and C<FALSE> for C<\P{}>.
For example: + + chr(0x110000) =~ \p{ASCII_Hex_Digit=True} # Fails, no warning + chr(0x110000) =~ \p{ASCII_Hex_Digit=False} # Succeeds, with warning + +In both these examples, the character being matched is non-Unicode, so +Unicode doesn't define how it should match. It clearly isn't an ASCII +hex digit, so the first example clearly should fail, and so it does, +with no warning. But it is arguable that the second example should have +an undefined, hence C<FALSE>, result. So a warning is raised for it. + +Thus the warning is raised for many fewer cases than in earlier Perls, +and only when what the result is could be arguable. It turns out that +none of the optimizations made by Perl (or are ever likely to be made) +cause the warning to be skipped, so it solves both problems of Perl's +earlier approach. The most commonly used property that is affected by +this change is C<\p{Unassigned}> which is a short form for +C<\p{General_Category=Unassigned}>. Starting in v5.20, all non-Unicode +code points are considered C<Unassigned>. In earlier releases the +matches failed because the result was considered undefined. + +The only place where the warning is not raised when it arguably ought to +have been is if optimizations cause the whole pattern match to not even +be attempted. For example, Perl may figure out that for a string to +match a certain regular expression pattern, the string has to contain +the substring C<"foobar">. Before attempting the match, Perl may look +for that substring, and if not found, immediately fail the match without +actually trying it; so no warning gets generated even if the string +contains an above-Unicode code point. + +This behavior is more "Do what I mean" than in earlier Perls for most +applications. But it catches fewer issues for code that needs to be +strictly Unicode compliant. Therefore there is an additional mode of +operation available to accommodate such code.
This mode is enabled if a +regular expression pattern is compiled within the lexical scope where +the C<"non_unicode"> warning class has been made fatal, say by: + + use warnings FATAL => "non_unicode" + +(see L<perllexwarn>). In this mode of operation, Perl will raise the +warning for all matches against a non-Unicode code point (not just the +arguable ones), and it skips the optimizations that might cause the +warning to not be output. (It currently still won't warn if the match +isn't even attempted, like in the C<"foobar"> example above.) + +In summary, Perl now normally treats non-Unicode code points as typical +Unicode unassigned code points for regular expression matches, raising a +warning only when it is arguable what the result should be. However, if +this warning has been made fatal, it isn't skipped. + +There is one exception to all this. C<\p{All}> looks like a Unicode +property, but it is a Perl extension that is defined to be true for all +possible code points, Unicode or not, so no warning is ever generated +when matching this against a non-Unicode code point. (Prior to v5.20, +it was an exact synonym for C<\p{Any}>, matching code points C<0> +through C<0x10FFFF>.) =head2 Security Implications of Unicode @@ -8606,38 +8606,12 @@ Perl__invlist_invert(pTHX_ SV* const invlist) void Perl__invlist_invert_prop(pTHX_ SV* const invlist) { - /* Complement the input inversion list (which must be a Unicode property, - * all of which don't match above the Unicode maximum code point.) And - * Perl has chosen to not have the inversion match above that either. This - * adds a 0x110000 if the list didn't end with it, and removes it if it did - */ - - UV len; - UV* array; + /* Complement the input inversion list (which must be a Unicode property). + * Starting in v5.20, this is no different than any invert. 
*/ PERL_ARGS_ASSERT__INVLIST_INVERT_PROP; _invlist_invert(invlist); - - len = _invlist_len(invlist); - - if (len != 0) { /* If empty do nothing */ - array = invlist_array(invlist); - if (array[len - 1] != PERL_UNICODE_MAX + 1) { - /* Add 0x110000. First, grow if necessary */ - len++; - if (invlist_max(invlist) < len) { - invlist_extend(invlist, len); - array = invlist_array(invlist); - } - invlist_set_len(invlist, len, *get_invlist_offset_addr(invlist)); - array[len - 1] = PERL_UNICODE_MAX + 1; - } - else { /* Remove the 0x110000 */ - invlist_set_len(invlist, len - 1, *get_invlist_offset_addr(invlist)); - } - } - return; } #endif @@ -12899,6 +12873,8 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, * Unicode range? */ bool runtime_posix_matches_above_Unicode = FALSE; + bool warn_super = ALWAYS_WARN_SUPER; + regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in case we need to change the emitted regop to an EXACT. */ const char * orig_parse = RExC_parse; @@ -13173,9 +13149,23 @@ parseit: /* Here, did get the swash and its inversion list. If * the swash is from a user-defined property, then this * whole character class should be regarded as such */ - has_user_defined_property = - (swash_init_flags - & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY); + if (swash_init_flags + & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY) + { + has_user_defined_property = TRUE; + } + else if + /* We warn on matching an above-Unicode code point + * if the match would return true, except don't + * warn for \p{All}, which has exactly one element + * = 0 */ + (_invlist_contains_cp(invlist, 0x110000) + && (! (_invlist_len(invlist) == 1 + && *invlist_array(invlist) == 0))) + { + warn_super = TRUE; + } + /* Invert if asking for the complement */ if (value == 'P') { @@ -14409,7 +14399,6 @@ parseit: * <depends_list>, because having a Unicode property forces Unicode * semantics */ if (properties) { - bool warn_super = ! 
has_user_defined_property; if (cp_list) { /* If it matters to the final outcome, see if a non-property @@ -7545,7 +7545,7 @@ S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const && ckWARN_d(WARN_NON_UNICODE)) { Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE), - "Code point 0x%04"UVXf" is not Unicode, all \\p{} matches fail; all \\P{} matches succeed", c); + "Matched non-Unicode code point 0x%04"UVXf" against Unicode property; may not be portable", c); } } diff --git a/t/lib/warnings/utf8 b/t/lib/warnings/utf8 index d6032d4905..9004731cc6 100644 --- a/t/lib/warnings/utf8 +++ b/t/lib/warnings/utf8 @@ -157,6 +157,7 @@ Operation "ucfirst" returns its argument for UTF-16 surrogate U+D800 at - line 3 Operation "ucfirst" returns its argument for UTF-16 surrogate U+DFFF at - line 4. Operation "ucfirst" returns its argument for non-Unicode code point 0x110000 at - line 14. ######## +# NAME Matching \p{} against above-Unicode use warnings 'utf8'; chr(0xD7FF) =~ /\p{Any}/; chr(0xD800) =~ /\p{Any}/; @@ -170,72 +171,20 @@ chr(0x10000) =~ /\p{Any}/; chr(0x100000) =~ /\p{Any}/; chr(0x10FFFE) =~ /\p{Any}/; chr(0x10FFFF) =~ /\p{Any}/; -chr(0x110000) =~ /[\w\p{Any}]/; -chr(0x110010) =~ /[\w\p{PosixWord}]/; -chr(0x110011) =~ /[\w\P{PosixWord}]/; -chr(0x110012) =~ /[\w\p{XPosixWord}]/; -chr(0x110013) =~ /[\w\P{XPosixWord}]/; -chr(0x110014) =~ /[\w\p{PosixAlnum}]/; -chr(0x110015) =~ /[\w\P{PosixAlnum}]/; -chr(0x110016) =~ /[\w\p{XPosixAlnum}]/; -chr(0x110017) =~ /[\w\P{XPosixAlnum}]/; -chr(0x110018) =~ /[\w\p{PosixSpace}]/; -chr(0x110019) =~ /[\w\P{PosixSpace}]/; -chr(0x11001A) =~ /[\w\p{XPosixSpace}]/; -chr(0x11001B) =~ /[\w\P{XPosixSpace}]/; -chr(0x11001C) =~ /[\w\p{PosixDigit}]/; -chr(0x11001D) =~ /[\w\P{PosixDigit}]/; -chr(0x11001E) =~ /[\w\p{XPosixDigit}]/; -chr(0x11001F) =~ /[\w\P{XPosixDigit}]/; -chr(0x110020) =~ /[\w\p{PosixAlpha}]/; -chr(0x110021) =~ /[\w\P{PosixAlpha}]/; -chr(0x110022) =~ /[\w\p{XPosixAlpha}]/; -chr(0x110023) =~ /[\w\P{XPosixAlpha}]/; 
-chr(0x110024) =~ /[\w\p{Ascii}]/; -chr(0x110025) =~ /[\w\P{Ascii}]/; -chr(0x110026) =~ /[\w\p{PosixCntrl}]/; -chr(0x110027) =~ /[\w\P{PosixCntrl}]/; -chr(0x110028) =~ /[\w\p{XPosixCntrl}]/; -chr(0x110029) =~ /[\w\P{XPosixCntrl}]/; -chr(0x11002A) =~ /[\w\p{PosixGraph}]/; -chr(0x11002B) =~ /[\w\P{PosixGraph}]/; -chr(0x11002C) =~ /[\w\p{XPosixGraph}]/; -chr(0x11002D) =~ /[\w\P{XPosixGraph}]/; -chr(0x11002E) =~ /[\w\p{PosixLower}]/; -chr(0x11002F) =~ /[\w\P{PosixLower}]/; -chr(0x110030) =~ /[\w\p{XPosixLower}]/; -chr(0x110031) =~ /[\w\P{XPosixLower}]/; -chr(0x110032) =~ /[\w\p{PosixPrint}]/; -chr(0x110033) =~ /[\w\P{PosixPrint}]/; -chr(0x110034) =~ /[\w\p{XPosixPrint}]/; -chr(0x110035) =~ /[\w\P{XPosixPrint}]/; -chr(0x110036) =~ /[\w\p{PosixPunct}]/; -chr(0x110037) =~ /[\w\P{PosixPunct}]/; -chr(0x110038) =~ /[\w\p{XPosixPunct}]/; -chr(0x110039) =~ /[\w\P{XPosixPunct}]/; -chr(0x11003A) =~ /[\w\p{PosixUpper}]/; -chr(0x11003B) =~ /[\w\P{PosixUpper}]/; -chr(0x11003C) =~ /[\w\p{XPosixUpper}]/; -chr(0x11003D) =~ /[\w\P{XPosixUpper}]/; -chr(0x11003E) =~ /[\w\p{PosixXdigit}]/; -chr(0x11003F) =~ /[\w\P{PosixXdigit}]/; -chr(0x110040) =~ /[\w\p{XPosixXdigit}]/; -chr(0x110041) =~ /[\w\P{XPosixXdigit}]/; -chr(0x110042) =~ /[\w\p{PerlSpace}]/; -chr(0x110043) =~ /[\w\P{PerlSpace}]/; -chr(0x110044) =~ /[\w\p{XPerlSpace}]/; -chr(0x110045) =~ /[\w\P{XPerlSpace}]/; -chr(0x110046) =~ /[\w\p{PosixBlank}]/; -chr(0x110047) =~ /[\w\P{PosixBlank}]/; -chr(0x110048) =~ /[\w\p{XPosixBlank}]/; -chr(0x110049) =~ /[\w\P{XPosixBlank}]/; -# Currently some warnings from the above are output twice -# Only Unicode properties give non-Unicode warnings, and not when something -# else in the class matches above Unicode. 
Below we test three ways where -# something outside the property may match non-Unicode: a code point above it, -# a class \S that we know at compile time doesn't, and a class \W whose values -# aren't (at the time of this writing) specified at compile time, but which -# wouldn't match +chr(0x110000) =~ /[\p{Any}]/; +chr(0x110001) =~ /[\w\p{Any}]/; +chr(0x10FFFF) =~ /\p{All}/; +chr(0x110002) =~ /[\w\p{All}]/; +chr(0x110003) =~ /[\p{XPosixWord}]/; +chr(0x110004) =~ /[\P{XPosixWord}]/; +chr(0x110005) =~ /^[\p{Unassigned}]/; +chr(0x110006) =~ /^[\P{Unassigned}]/; +# Only Unicode properties give non-Unicode warnings, and only those properties +# which do match above Unicode; and not when something else in the class +# matches above Unicode. Below we test three ways where something outside the +# property may match non-Unicode: a code point above it, a class \S that we +# know at compile time doesn't, and a class \W whose values aren't (at the time +# of this writing) specified at compile time, but which wouldn't match chr(0x110050) =~ /\w/; chr(0x110051) =~ /\W/; chr(0x110052) =~ /\d/; @@ -270,183 +219,12 @@ chr(0x11006E) =~ /[[:xdigit:]]/; chr(0x11006F) =~ /[[:^xdigit:]]/; chr(0x110070) =~ /[[:blank:]]/; chr(0x110071) =~ /[[:^blank:]]/; -chr(0x111000) =~ /[\W\p{Any}]/; -chr(0x111010) =~ /[\W\p{PosixWord}]/; -chr(0x111011) =~ /[\W\P{PosixWord}]/; -chr(0x111012) =~ /[\W\p{XPosixWord}]/; -chr(0x111013) =~ /[\W\P{XPosixWord}]/; -chr(0x111014) =~ /[\W\p{PosixAlnum}]/; -chr(0x111015) =~ /[\W\P{PosixAlnum}]/; -chr(0x111016) =~ /[\W\p{XPosixAlnum}]/; -chr(0x111017) =~ /[\W\P{XPosixAlnum}]/; -chr(0x111018) =~ /[\W\p{PosixSpace}]/; -chr(0x111019) =~ /[\W\P{PosixSpace}]/; -chr(0x11101A) =~ /[\W\p{XPosixSpace}]/; -chr(0x11101B) =~ /[\W\P{XPosixSpace}]/; -chr(0x11101C) =~ /[\W\p{PosixDigit}]/; -chr(0x11101D) =~ /[\W\P{PosixDigit}]/; -chr(0x11101E) =~ /[\W\p{XPosixDigit}]/; -chr(0x11101F) =~ /[\W\P{XPosixDigit}]/; -chr(0x111020) =~ /[\W\p{PosixAlpha}]/; -chr(0x111021) =~ 
/[\W\P{PosixAlpha}]/; -chr(0x111022) =~ /[\W\p{XPosixAlpha}]/; -chr(0x111023) =~ /[\W\P{XPosixAlpha}]/; -chr(0x111024) =~ /[\W\p{Ascii}]/; -chr(0x111025) =~ /[\W\P{Ascii}]/; -chr(0x111026) =~ /[\W\p{PosixCntrl}]/; -chr(0x111027) =~ /[\W\P{PosixCntrl}]/; -chr(0x111028) =~ /[\W\p{XPosixCntrl}]/; -chr(0x111029) =~ /[\W\P{XPosixCntrl}]/; -chr(0x11102A) =~ /[\W\p{PosixGraph}]/; -chr(0x11102B) =~ /[\W\P{PosixGraph}]/; -chr(0x11102C) =~ /[\W\p{XPosixGraph}]/; -chr(0x11102D) =~ /[\W\P{XPosixGraph}]/; -chr(0x11102E) =~ /[\W\p{PosixLower}]/; -chr(0x11102F) =~ /[\W\P{PosixLower}]/; -chr(0x111030) =~ /[\W\p{XPosixLower}]/; -chr(0x111031) =~ /[\W\P{XPosixLower}]/; -chr(0x111032) =~ /[\W\p{PosixPrint}]/; -chr(0x111033) =~ /[\W\P{PosixPrint}]/; -chr(0x111034) =~ /[\W\p{XPosixPrint}]/; -chr(0x111035) =~ /[\W\P{XPosixPrint}]/; -chr(0x111036) =~ /[\W\p{PosixPunct}]/; -chr(0x111037) =~ /[\W\P{PosixPunct}]/; -chr(0x111038) =~ /[\W\p{XPosixPunct}]/; -chr(0x111039) =~ /[\W\P{XPosixPunct}]/; -chr(0x11103A) =~ /[\W\p{PosixUpper}]/; -chr(0x11103B) =~ /[\W\P{PosixUpper}]/; -chr(0x11103C) =~ /[\W\p{XPosixUpper}]/; -chr(0x11103D) =~ /[\W\P{XPosixUpper}]/; -chr(0x11103E) =~ /[\W\p{PosixXdigit}]/; -chr(0x11103F) =~ /[\W\P{PosixXdigit}]/; -chr(0x111040) =~ /[\W\p{XPosixXdigit}]/; -chr(0x111041) =~ /[\W\P{XPosixXdigit}]/; -chr(0x111042) =~ /[\W\p{PerlSpace}]/; -chr(0x111043) =~ /[\W\P{PerlSpace}]/; -chr(0x111044) =~ /[\W\p{XPerlSpace}]/; -chr(0x111045) =~ /[\W\P{XPerlSpace}]/; -chr(0x111046) =~ /[\W\p{PosixBlank}]/; -chr(0x111047) =~ /[\W\P{PosixBlank}]/; -chr(0x111048) =~ /[\W\p{XPosixBlank}]/; -chr(0x111049) =~ /[\W\P{XPosixBlank}]/; -chr(0x112000) =~ /[\S\p{Any}]/; -chr(0x112010) =~ /[\S\p{PosixWord}]/; -chr(0x112011) =~ /[\S\P{PosixWord}]/; -chr(0x112012) =~ /[\S\p{XPosixWord}]/; -chr(0x112013) =~ /[\S\P{XPosixWord}]/; -chr(0x112014) =~ /[\S\p{PosixAlnum}]/; -chr(0x112015) =~ /[\S\P{PosixAlnum}]/; -chr(0x112016) =~ /[\S\p{XPosixAlnum}]/; -chr(0x112017) =~ /[\S\P{XPosixAlnum}]/; -chr(0x112018) 
=~ /[\S\p{PosixSpace}]/; -chr(0x112019) =~ /[\S\P{PosixSpace}]/; -chr(0x11201A) =~ /[\S\p{XPosixSpace}]/; -chr(0x11201B) =~ /[\S\P{XPosixSpace}]/; -chr(0x11201C) =~ /[\S\p{PosixDigit}]/; -chr(0x11201D) =~ /[\S\P{PosixDigit}]/; -chr(0x11201E) =~ /[\S\p{XPosixDigit}]/; -chr(0x11201F) =~ /[\S\P{XPosixDigit}]/; -chr(0x112020) =~ /[\S\p{PosixAlpha}]/; -chr(0x112021) =~ /[\S\P{PosixAlpha}]/; -chr(0x112022) =~ /[\S\p{XPosixAlpha}]/; -chr(0x112023) =~ /[\S\P{XPosixAlpha}]/; -chr(0x112024) =~ /[\S\p{Ascii}]/; -chr(0x112025) =~ /[\S\P{Ascii}]/; -chr(0x112026) =~ /[\S\p{PosixCntrl}]/; -chr(0x112027) =~ /[\S\P{PosixCntrl}]/; -chr(0x112028) =~ /[\S\p{XPosixCntrl}]/; -chr(0x112029) =~ /[\S\P{XPosixCntrl}]/; -chr(0x11202A) =~ /[\S\p{PosixGraph}]/; -chr(0x11202B) =~ /[\S\P{PosixGraph}]/; -chr(0x11202C) =~ /[\S\p{XPosixGraph}]/; -chr(0x11202D) =~ /[\S\P{XPosixGraph}]/; -chr(0x11202E) =~ /[\S\p{PosixLower}]/; -chr(0x11202F) =~ /[\S\P{PosixLower}]/; -chr(0x112030) =~ /[\S\p{XPosixLower}]/; -chr(0x112031) =~ /[\S\P{XPosixLower}]/; -chr(0x112032) =~ /[\S\p{PosixPrint}]/; -chr(0x112033) =~ /[\S\P{PosixPrint}]/; -chr(0x112034) =~ /[\S\p{XPosixPrint}]/; -chr(0x112035) =~ /[\S\P{XPosixPrint}]/; -chr(0x112036) =~ /[\S\p{PosixPunct}]/; -chr(0x112037) =~ /[\S\P{PosixPunct}]/; -chr(0x112038) =~ /[\S\p{XPosixPunct}]/; -chr(0x112039) =~ /[\S\P{XPosixPunct}]/; -chr(0x11203A) =~ /[\S\p{PosixUpper}]/; -chr(0x11203B) =~ /[\S\P{PosixUpper}]/; -chr(0x11203C) =~ /[\S\p{XPosixUpper}]/; -chr(0x11203D) =~ /[\S\P{XPosixUpper}]/; -chr(0x11203E) =~ /[\S\p{PosixXdigit}]/; -chr(0x11203F) =~ /[\S\P{PosixXdigit}]/; -chr(0x112040) =~ /[\S\p{XPosixXdigit}]/; -chr(0x112041) =~ /[\S\P{XPosixXdigit}]/; -chr(0x112042) =~ /[\S\p{PerlSpace}]/; -chr(0x112043) =~ /[\S\P{PerlSpace}]/; -chr(0x112044) =~ /[\S\p{XPerlSpace}]/; -chr(0x112045) =~ /[\S\P{XPerlSpace}]/; -chr(0x112046) =~ /[\S\p{PosixBlank}]/; -chr(0x112047) =~ /[\S\P{PosixBlank}]/; -chr(0x112048) =~ /[\S\p{XPosixBlank}]/; -chr(0x112049) =~ /[\S\P{XPosixBlank}]/; 
-chr(0x113000) =~ /[\x{110000}\p{Any}]/; -chr(0x113010) =~ /[\x{110000}\p{PosixWord}]/; -chr(0x113011) =~ /[\x{110000}\P{PosixWord}]/; -chr(0x113012) =~ /[\x{110000}\p{XPosixWord}]/; -chr(0x113013) =~ /[\x{110000}\P{XPosixWord}]/; -chr(0x113014) =~ /[\x{110000}\p{PosixAlnum}]/; -chr(0x113015) =~ /[\x{110000}\P{PosixAlnum}]/; -chr(0x113016) =~ /[\x{110000}\p{XPosixAlnum}]/; -chr(0x113017) =~ /[\x{110000}\P{XPosixAlnum}]/; -chr(0x113018) =~ /[\x{110000}\p{PosixSpace}]/; -chr(0x113019) =~ /[\x{110000}\P{PosixSpace}]/; -chr(0x11301A) =~ /[\x{110000}\p{XPosixSpace}]/; -chr(0x11301B) =~ /[\x{110000}\P{XPosixSpace}]/; -chr(0x11301C) =~ /[\x{110000}\p{PosixDigit}]/; -chr(0x11301D) =~ /[\x{110000}\P{PosixDigit}]/; -chr(0x11301E) =~ /[\x{110000}\p{XPosixDigit}]/; -chr(0x11301F) =~ /[\x{110000}\P{XPosixDigit}]/; -chr(0x113020) =~ /[\x{110000}\p{PosixAlpha}]/; -chr(0x113021) =~ /[\x{110000}\P{PosixAlpha}]/; -chr(0x113022) =~ /[\x{110000}\p{XPosixAlpha}]/; -chr(0x113023) =~ /[\x{110000}\P{XPosixAlpha}]/; -chr(0x113024) =~ /[\x{110000}\p{Ascii}]/; -chr(0x113025) =~ /[\x{110000}\P{Ascii}]/; -chr(0x113026) =~ /[\x{110000}\p{PosixCntrl}]/; -chr(0x113027) =~ /[\x{110000}\P{PosixCntrl}]/; -chr(0x113028) =~ /[\x{110000}\p{XPosixCntrl}]/; -chr(0x113029) =~ /[\x{110000}\P{XPosixCntrl}]/; -chr(0x11302A) =~ /[\x{110000}\p{PosixGraph}]/; -chr(0x11302B) =~ /[\x{110000}\P{PosixGraph}]/; -chr(0x11302C) =~ /[\x{110000}\p{XPosixGraph}]/; -chr(0x11302D) =~ /[\x{110000}\P{XPosixGraph}]/; -chr(0x11302E) =~ /[\x{110000}\p{PosixLower}]/; -chr(0x11302F) =~ /[\x{110000}\P{PosixLower}]/; -chr(0x113030) =~ /[\x{110000}\p{XPosixLower}]/; -chr(0x113031) =~ /[\x{110000}\P{XPosixLower}]/; -chr(0x113032) =~ /[\x{110000}\p{PosixPrint}]/; -chr(0x113033) =~ /[\x{110000}\P{PosixPrint}]/; -chr(0x113034) =~ /[\x{110000}\p{XPosixPrint}]/; -chr(0x113035) =~ /[\x{110000}\P{XPosixPrint}]/; -chr(0x113036) =~ /[\x{110000}\p{PosixPunct}]/; -chr(0x113037) =~ /[\x{110000}\P{PosixPunct}]/; -chr(0x113038) =~ 
/[\x{110000}\p{XPosixPunct}]/; -chr(0x113039) =~ /[\x{110000}\P{XPosixPunct}]/; -chr(0x11303A) =~ /[\x{110000}\p{PosixUpper}]/; -chr(0x11303B) =~ /[\x{110000}\P{PosixUpper}]/; -chr(0x11303C) =~ /[\x{110000}\p{XPosixUpper}]/; -chr(0x11303D) =~ /[\x{110000}\P{XPosixUpper}]/; -chr(0x11303E) =~ /[\x{110000}\p{PosixXdigit}]/; -chr(0x11303F) =~ /[\x{110000}\P{PosixXdigit}]/; -chr(0x113040) =~ /[\x{110000}\p{XPosixXdigit}]/; -chr(0x113041) =~ /[\x{110000}\P{XPosixXdigit}]/; -chr(0x113042) =~ /[\x{110000}\p{PerlSpace}]/; -chr(0x113043) =~ /[\x{110000}\P{PerlSpace}]/; -chr(0x113044) =~ /[\x{110000}\p{XPerlSpace}]/; -chr(0x113045) =~ /[\x{110000}\P{XPerlSpace}]/; -chr(0x113046) =~ /[\x{110000}\p{PosixBlank}]/; -chr(0x113047) =~ /[\x{110000}\P{PosixBlank}]/; -chr(0x113048) =~ /[\x{110000}\p{XPosixBlank}]/; -chr(0x113049) =~ /[\x{110000}\P{XPosixBlank}]/; +chr(0x111010) =~ /[\W\p{Unassigned}]/; +chr(0x111011) =~ /[\W\P{Unassigned}]/; +chr(0x112010) =~ /[\S\p{Unassigned}]/; +chr(0x112011) =~ /[\S\P{Unassigned}]/; +chr(0x113010) =~ /[\x{110000}\p{Unassigned}]/; +chr(0x113011) =~ /[\x{110000}\P{Unassigned}]/; no warnings 'utf8'; chr(0xD7FF) =~ /\p{Any}/; chr(0xD800) =~ /\p{Any}/; @@ -460,65 +238,14 @@ chr(0x10000) =~ /\p{Any}/; chr(0x100000) =~ /\p{Any}/; chr(0x10FFFE) =~ /\p{Any}/; chr(0x10FFFF) =~ /\p{Any}/; -chr(0x110000) =~ /\p{Any}/; -chr(0x110010) =~ /\p{PosixWord}/; -chr(0x110011) =~ /\P{PosixWord}/; -chr(0x110012) =~ /\p{XPosixWord}/; -chr(0x110013) =~ /\P{XPosixWord}/; -chr(0x110014) =~ /\p{PosixAlnum}/; -chr(0x110015) =~ /\P{PosixAlnum}/; -chr(0x110016) =~ /\p{XPosixAlnum}/; -chr(0x110017) =~ /\P{XPosixAlnum}/; -chr(0x110018) =~ /\p{PosixSpace}/; -chr(0x110019) =~ /\P{PosixSpace}/; -chr(0x11001A) =~ /\p{XPosixSpace}/; -chr(0x11001B) =~ /\P{XPosixSpace}/; -chr(0x11001C) =~ /\p{PosixDigit}/; -chr(0x11001D) =~ /\P{PosixDigit}/; -chr(0x11001E) =~ /\p{XPosixDigit}/; -chr(0x11001F) =~ /\P{XPosixDigit}/; -chr(0x110020) =~ /\p{PosixAlpha}/; -chr(0x110021) =~ /\P{PosixAlpha}/; 
-chr(0x110022) =~ /\p{XPosixAlpha}/; -chr(0x110023) =~ /\P{XPosixAlpha}/; -chr(0x110024) =~ /\p{Ascii}/; -chr(0x110025) =~ /\P{Ascii}/; -chr(0x110026) =~ /\p{PosixCntrl}/; -chr(0x110027) =~ /\P{PosixCntrl}/; -chr(0x110028) =~ /\p{XPosixCntrl}/; -chr(0x110029) =~ /\P{XPosixCntrl}/; -chr(0x11002A) =~ /\p{PosixGraph}/; -chr(0x11002B) =~ /\P{PosixGraph}/; -chr(0x11002C) =~ /\p{XPosixGraph}/; -chr(0x11002D) =~ /\P{XPosixGraph}/; -chr(0x11002E) =~ /\p{PosixLower}/; -chr(0x11002F) =~ /\P{PosixLower}/; -chr(0x110030) =~ /\p{XPosixLower}/; -chr(0x110031) =~ /\P{XPosixLower}/; -chr(0x110032) =~ /\p{PosixPrint}/; -chr(0x110033) =~ /\P{PosixPrint}/; -chr(0x110034) =~ /\p{XPosixPrint}/; -chr(0x110035) =~ /\P{XPosixPrint}/; -chr(0x110036) =~ /\p{PosixPunct}/; -chr(0x110037) =~ /\P{PosixPunct}/; -chr(0x110038) =~ /\p{XPosixPunct}/; -chr(0x110039) =~ /\P{XPosixPunct}/; -chr(0x11003A) =~ /\p{PosixUpper}/; -chr(0x11003B) =~ /\P{PosixUpper}/; -chr(0x11003C) =~ /\p{XPosixUpper}/; -chr(0x11003D) =~ /\P{XPosixUpper}/; -chr(0x11003E) =~ /\p{PosixXdigit}/; -chr(0x11003F) =~ /\P{PosixXdigit}/; -chr(0x110040) =~ /\p{XPosixXdigit}/; -chr(0x110041) =~ /\P{XPosixXdigit}/; -chr(0x110042) =~ /\p{PerlSpace}/; -chr(0x110043) =~ /\P{PerlSpace}/; -chr(0x110044) =~ /\p{XPerlSpace}/; -chr(0x110045) =~ /\P{XPerlSpace}/; -chr(0x110046) =~ /\p{PosixBlank}/; -chr(0x110047) =~ /\P{PosixBlank}/; -chr(0x110048) =~ /\p{XPosixBlank}/; -chr(0x110049) =~ /\P{XPosixBlank}/; +chr(0x110000) =~ /[\p{Any}]/; +chr(0x110001) =~ /[\w\p{Any}]/; +chr(0x10FFFF) =~ /\p{All}/; +chr(0x110002) =~ /[\w\p{All}]/; +chr(0x110003) =~ /[\p{XPosixWord}]/; +chr(0x110004) =~ /[\P{XPosixWord}]/; +chr(0x110005) =~ /^[\p{Unassigned}]/; +chr(0x110006) =~ /^[\P{Unassigned}]/; chr(0x110050) =~ /\w/; chr(0x110051) =~ /\W/; chr(0x110052) =~ /\d/; @@ -553,126 +280,60 @@ chr(0x11006E) =~ /[[:xdigit:]]/; chr(0x11006F) =~ /[[:^xdigit:]]/; chr(0x110070) =~ /[[:blank:]]/; chr(0x110071) =~ /[[:^blank:]]/; +chr(0x111010) =~ /[\W\p{Unassigned}]/; 
+chr(0x111011) =~ /[\W\P{Unassigned}]/; +chr(0x112010) =~ /[\S\p{Unassigned}]/; +chr(0x112011) =~ /[\S\P{Unassigned}]/; +chr(0x113010) =~ /[\x{110000}\p{Unassigned}]/; +chr(0x113011) =~ /[\x{110000}\P{Unassigned}]/; EXPECT -Code point 0x110000 is not Unicode, all \p{} matches fail; all \P{} matches succeed at - line 14. -Code point 0x110010 is not Unicode, all \p{} matches fail; all \P{} matches succeed at - line 15. -Code point 0x110011 is not Unicode, all \p{} matches fail; all \P{} matches succeed at - line 16. -Code point 0x110011 is not Unicode, all \p{} matches fail; all \P{} matches succeed at - line 16. -Code point 0x110012 is not Unicode, all \p{} matches fail; all \P{} matches succeed at - line 17. -Code point 0x110013 is not Unicode, all \p{} matches fail; all \P{} matches succeed at - line 18. -Code point 0x110013 is not Unicode, all \p{} matches fail; all \P{} matches succeed at - line 18. -Code point 0x110014 is not Unicode, all \p{} matches fail; all \P{} matches succeed at - line 19. -Code point 0x110015 is not Unicode, all \p{} matches fail; all \P{} matches succeed at - line 20. -Code point 0x110015 is not Unicode, all \p{} matches fail; all \P{} matches succeed at - line 20. -Code point 0x110016 is not Unicode, all \p{} matches fail; all \P{} matches succeed at - line 21. -Code point 0x110017 is not Unicode, all \p{} matches fail; all \P{} matches succeed at - line 22. -Code point 0x110017 is not Unicode, all \p{} matches fail; all \P{} matches succeed at - line 22. -Code point 0x110018 is not Unicode, all \p{} matches fail; all \P{} matches succeed at - line 23. -Code point 0x110019 is not Unicode, all \p{} matches fail; all \P{} matches succeed at - line 24. -Code point 0x110019 is not Unicode, all \p{} matches fail; all \P{} matches succeed at - line 24. -Code point 0x11001A is not Unicode, all \p{} matches fail; all \P{} matches succeed at - line 25. 
-Code point 0x11001B is not Unicode, all \p{} matches fail; all \P{} matches succeed at - line 26. -Code point 0x11001B is not Unicode, all \p{} matches fail; all \P{} matches succeed at - line 26. -Code point 0x11001C is not Unicode, all \p{} matches fail; all \P{} matches succeed at - line 27. -Code point 0x11001D is not Unicode, all \p{} matches fail; all \P{} matches succeed at - line 28. -Code point 0x11001D is not Unicode, all \p{} matches fail; all \P{} matches succeed at - line 28. -Code point 0x11001E is not Unicode, all \p{} matches fail; all \P{} matches succeed at - line 29. -Code point 0x11001F is not Unicode, all \p{} matches fail; all \P{} matches succeed at - line 30. -Code point 0x11001F is not Unicode, all \p{} matches fail; all \P{} matches succeed at - line 30. -Code point 0x110020 is not Unicode, all \p{} matches fail; all \P{} matches succeed at - line 31. -Code point 0x110021 is not Unicode, all \p{} matches fail; all \P{} matches succeed at - line 32. -Code point 0x110021 is not Unicode, all \p{} matches fail; all \P{} matches succeed at - line 32. -Code point 0x110022 is not Unicode, all \p{} matches fail; all \P{} matches succeed at - line 33. -Code point 0x110023 is not Unicode, all \p{} matches fail; all \P{} matches succeed at - line 34. -Code point 0x110023 is not Unicode, all \p{} matches fail; all \P{} matches succeed at - line 34. -Code point 0x110024 is not Unicode, all \p{} matches fail; all \P{} matches succeed at - line 35. -Code point 0x110025 is not Unicode, all \p{} matches fail; all \P{} matches succeed at - line 36. -Code point 0x110025 is not Unicode, all \p{} matches fail; all \P{} matches succeed at - line 36. -Code point 0x110026 is not Unicode, all \p{} matches fail; all \P{} matches succeed at - line 37. -Code point 0x110027 is not Unicode, all \p{} matches fail; all \P{} matches succeed at - line 38. -Code point 0x110027 is not Unicode, all \p{} matches fail; all \P{} matches succeed at - line 38. 
-Code point 0x110028 is not Unicode, all \p{} matches fail; all \P{} matches succeed at - line 39. -Code point 0x110029 is not Unicode, all \p{} matches fail; all \P{} matches succeed at - line 40. -Code point 0x110029 is not Unicode, all \p{} matches fail; all \P{} matches succeed at - line 40. -Code point 0x11002A is not Unicode, all \p{} matches fail; all \P{} matches succeed at - line 41. -Code point 0x11002B is not Unicode, all \p{} matches fail; all \P{} matches succeed at - line 42. -Code point 0x11002B is not Unicode, all \p{} matches fail; all \P{} matches succeed at - line 42. -Code point 0x11002C is not Unicode, all \p{} matches fail; all \P{} matches succeed at - line 43. -Code point 0x11002D is not Unicode, all \p{} matches fail; all \P{} matches succeed at - line 44. -Code point 0x11002D is not Unicode, all \p{} matches fail; all \P{} matches succeed at - line 44. -Code point 0x11002E is not Unicode, all \p{} matches fail; all \P{} matches succeed at - line 45. -Code point 0x11002F is not Unicode, all \p{} matches fail; all \P{} matches succeed at - line 46. -Code point 0x11002F is not Unicode, all \p{} matches fail; all \P{} matches succeed at - line 46. -Code point 0x110030 is not Unicode, all \p{} matches fail; all \P{} matches succeed at - line 47. -Code point 0x110031 is not Unicode, all \p{} matches fail; all \P{} matches succeed at - line 48. -Code point 0x110031 is not Unicode, all \p{} matches fail; all \P{} matches succeed at - line 48. -Code point 0x110032 is not Unicode, all \p{} matches fail; all \P{} matches succeed at - line 49. -Code point 0x110033 is not Unicode, all \p{} matches fail; all \P{} matches succeed at - line 50. -Code point 0x110033 is not Unicode, all \p{} matches fail; all \P{} matches succeed at - line 50. -Code point 0x110034 is not Unicode, all \p{} matches fail; all \P{} matches succeed at - line 51. -Code point 0x110035 is not Unicode, all \p{} matches fail; all \P{} matches succeed at - line 52. 
-Code point 0x110035 is not Unicode, all \p{} matches fail; all \P{} matches succeed at - line 52. -Code point 0x110036 is not Unicode, all \p{} matches fail; all \P{} matches succeed at - line 53. -Code point 0x110037 is not Unicode, all \p{} matches fail; all \P{} matches succeed at - line 54. -Code point 0x110037 is not Unicode, all \p{} matches fail; all \P{} matches succeed at - line 54. -Code point 0x110038 is not Unicode, all \p{} matches fail; all \P{} matches succeed at - line 55. -Code point 0x110039 is not Unicode, all \p{} matches fail; all \P{} matches succeed at - line 56. -Code point 0x110039 is not Unicode, all \p{} matches fail; all \P{} matches succeed at - line 56. -Code point 0x11003A is not Unicode, all \p{} matches fail; all \P{} matches succeed at - line 57. -Code point 0x11003B is not Unicode, all \p{} matches fail; all \P{} matches succeed at - line 58. -Code point 0x11003B is not Unicode, all \p{} matches fail; all \P{} matches succeed at - line 58. -Code point 0x11003C is not Unicode, all \p{} matches fail; all \P{} matches succeed at - line 59. -Code point 0x11003D is not Unicode, all \p{} matches fail; all \P{} matches succeed at - line 60. -Code point 0x11003D is not Unicode, all \p{} matches fail; all \P{} matches succeed at - line 60. -Code point 0x11003E is not Unicode, all \p{} matches fail; all \P{} matches succeed at - line 61. -Code point 0x11003F is not Unicode, all \p{} matches fail; all \P{} matches succeed at - line 62. -Code point 0x11003F is not Unicode, all \p{} matches fail; all \P{} matches succeed at - line 62. -Code point 0x110040 is not Unicode, all \p{} matches fail; all \P{} matches succeed at - line 63. -Code point 0x110041 is not Unicode, all \p{} matches fail; all \P{} matches succeed at - line 64. -Code point 0x110041 is not Unicode, all \p{} matches fail; all \P{} matches succeed at - line 64. -Code point 0x110042 is not Unicode, all \p{} matches fail; all \P{} matches succeed at - line 65. 
-Code point 0x110043 is not Unicode, all \p{} matches fail; all \P{} matches succeed at - line 66. -Code point 0x110043 is not Unicode, all \p{} matches fail; all \P{} matches succeed at - line 66. -Code point 0x110044 is not Unicode, all \p{} matches fail; all \P{} matches succeed at - line 67. -Code point 0x110045 is not Unicode, all \p{} matches fail; all \P{} matches succeed at - line 68. -Code point 0x110045 is not Unicode, all \p{} matches fail; all \P{} matches succeed at - line 68. -Code point 0x110046 is not Unicode, all \p{} matches fail; all \P{} matches succeed at - line 69. -Code point 0x110047 is not Unicode, all \p{} matches fail; all \P{} matches succeed at - line 70. -Code point 0x110047 is not Unicode, all \p{} matches fail; all \P{} matches succeed at - line 70. -Code point 0x110048 is not Unicode, all \p{} matches fail; all \P{} matches succeed at - line 71. -Code point 0x110049 is not Unicode, all \p{} matches fail; all \P{} matches succeed at - line 72. -Code point 0x110049 is not Unicode, all \p{} matches fail; all \P{} matches succeed at - line 72. +Matched non-Unicode code point 0x110005 against Unicode property; may not be portable at - line 20. +Matched non-Unicode code point 0x110006 against Unicode property; may not be portable at - line 21. ######## # NAME Matching Unicode property against above-Unicode code point outputs a warning even if optimizer rejects the match (in synthetic start class) -use warnings 'non_unicode'; +# Now have to make FATAL to guarantee being output +use warnings FATAL => 'non_unicode'; "\x{110000}" =~ /b?\p{Space}/; EXPECT -Code point 0x110000 is not Unicode, all \p{} matches fail; all \P{} matches succeed at - line 2. +Matched non-Unicode code point 0x110000 against Unicode property; may not be portable at - line 3. 
######## # NAME Matching POSIX class property against above-Unicode code point doesn't output a warning use warnings 'non_unicode'; +use warnings FATAL => 'non_unicode'; "\x{110000}" =~ /b?[[:space:]]/; EXPECT ######## use warnings 'utf8'; chr(0x110000) =~ /\p{Any}/; +######## +# NAME utf8, non_unicode warnings categories work on Matched non-Unicode code point warning +use warnings qw(utf8 non_unicode); +chr(0x110000) =~ /^\p{Unassigned}/; no warnings 'non_unicode'; -chr(0x110000) =~ /\p{Any}/; +chr(0x110001) =~ /\p{Unassigned}/; +use warnings 'non_unicode'; +no warnings 'utf8'; +chr(0x110002) =~ /\p{Unassigned}/; EXPECT -Code point 0x110000 is not Unicode, all \p{} matches fail; all \P{} matches succeed at - line 2. +Matched non-Unicode code point 0x110000 against Unicode property; may not be portable at - line 2. ######## # NAME optimizable regnode should still give non_unicode warnings when fatalized use warnings 'utf8'; use warnings FATAL => 'non_unicode'; chr(0x110000) =~ /\p{lb=cr}/; EXPECT -Code point 0x110000 is not Unicode, all \p{} matches fail; all \P{} matches succeed at - line 3. +Matched non-Unicode code point 0x110000 against Unicode property; may not be portable at - line 3. 
######## # NAME optimizable regnode should not give non_unicode warnings when warnings are off no warnings 'non_unicode'; chr(0x110000) =~ /\p{lb=cr}/; EXPECT ######## +# NAME 'All' matches above-Unicode without any warning +use warnings qw(utf8 non_unicode); +chr(0x110000) =~ /\p{All}/; +EXPECT +######## require "../test.pl"; use warnings 'utf8'; sub Is_Super { return '!utf8::Any' } diff --git a/t/porting/diag.t b/t/porting/diag.t index 95099bc0e5..6119aeda98 100644 --- a/t/porting/diag.t +++ b/t/porting/diag.t @@ -672,7 +672,6 @@ Wrong syntax (suid) fd script name "%s" 'X' outside of string in unpack __CATEGORIES__ -Code point 0x%X is not Unicode, all \p{} matches fail; all \P{} matches succeed Code point 0x%X is not Unicode, may not be portable Illegal character \%o (carriage return) Missing argument in %s diff --git a/t/re/pat.t b/t/re/pat.t index 905c242988..5676422439 100644 --- a/t/re/pat.t +++ b/t/re/pat.t @@ -1217,12 +1217,10 @@ sub run_tests { local $SIG{__WARN__} = sub {}; my $str = "\x{110000}"; - # No non-unicode code points match any Unicode property, even inverse - # ones - unlike($str, qr/\p{ASCII_Hex_Digit=True}/, "Non-Unicode doesn't match \\p{}"); - unlike($str, qr/\p{ASCII_Hex_Digit=False}/, "Non-Unicode doesn't match \\p{}"); - like($str, qr/\P{ASCII_Hex_Digit=True}/, "Non-Unicode matches \\P{}"); - like($str, qr/\P{ASCII_Hex_Digit=False}/, "Non-Unicode matches \\P{}"); + unlike($str, qr/\p{ASCII_Hex_Digit=True}/, "Non-Unicode doesn't match \\p{AHEX=True}"); + like($str, qr/\p{ASCII_Hex_Digit=False}/, "Non-Unicode matches \\p{AHEX=False}"); + like($str, qr/\P{ASCII_Hex_Digit=True}/, "Non-Unicode matches \\P{AHEX=True}"); + unlike($str, qr/\P{ASCII_Hex_Digit=False}/, "Non-Unicode doesn't match \\P{AHEX=False}"); } { |