Change \p{} matching for above-Unicode code points

The thread that led to this commit begins at http://markmail.org/message/eod7ukhbbh5tnll4. This commit revises the handling of \p{} and \P{} to treat above-Unicode code points as typical Unicode unassigned ones, and to output a warning during matching only when the answer is arguable under strict Unicode rules (that is, "matched" for \p{}, and "didn't match" for \P{}). The exception is when the warning category has been made fatal, in which case it tries hard to always output the warning. The definition of \p{All} is changed to be qr/./s, and no warning is issued at all for matching it against above-Unicode code points.
author: Karl Williamson <public@khwilliamson.com> 2013-12-23 20:35:54 -0700
committer: Karl Williamson <public@khwilliamson.com> 2013-12-31 08:27:23 -0700
commit: 2d88a86a5910c97496b47b7b7c223f2c9a14b57c (patch)
tree: c0125ea6a9b6175c93245c4048773ae82e0f4efc /lib
parent: f215ab38f4d9ea2dca08fc71b38db0eb650d5107 (diff)
download: perl-2d88a86a5910c97496b47b7b7c223f2c9a14b57c.tar.gz
3 files changed, 157 insertions, 170 deletions
diff --git a/lib/Unicode/UCD.pm b/lib/Unicode/UCD.pm
index 14752ae2b1..e4ae34e270 100644
--- a/lib/Unicode/UCD.pm
+++ b/lib/Unicode/UCD.pm
@@ -5,7 +5,7 @@ use warnings;
 no warnings 'surrogate';    # surrogates can be inputs to this
 use charnames ();
 
-our $VERSION = '0.55';
+our $VERSION = '0.56';
 
 require Exporter;
 
@@ -2138,21 +2138,10 @@ too high for some operations to work; you may wish to use a smaller number for
 your purposes.)
 
 Note that the inversion lists returned by this function can possibly include
-non-Unicode code points, that is anything above 0x10FFFF.  This is in
-contrast to Perl regular expression matches on those code points, in which a
-non-Unicode code point always fails to match.  For example, both of these have
-the same result:
-
- chr(0x110000) =~ \p{ASCII_Hex_Digit=True}      # Fails.
- chr(0x110000) =~ \p{ASCII_Hex_Digit=False}     # Fails!
-
-And both raise a warning that a Unicode property is being used on a
-non-Unicode code point.  It is arguable as to which is the correct thing to do
-here.  This function has chosen the way opposite to the Perl regular
-expression behavior.  This allows you to easily flip to the Perl regular
-expression way (for you to go in the other direction would be far harder).
-Simply add 0x110000 at the end of the non-empty returned list if it isn't
-already that value; and pop that value if it is; like:
+non-Unicode code points, that is anything above 0x10FFFF.  Unicode properties
+are not defined on such code points.  You might wish to change the output to
+not include these.  Simply add 0x110000 at the end of the non-empty returned
+list if it isn't already that value; and pop that value if it is; like:
 
  my @list = prop_invlist("foo");
  if (@list) {
@@ -2261,19 +2250,18 @@ sub prop_invlist ($;$) {
 
         if (defined $hex_end) { # The next item starts with the code point 1
                                 # beyond the end of the range.
-            push @invlist, hex($hex_end) + 1;
+            no warnings 'portable';
+            my $end = hex $hex_end;
+            last if $end == $Unicode::UCD::MAX_CP;
+            push @invlist, $end + 1;
         }
         else {  # No end of range, is a single code point.
             push @invlist, $begin + 1;
         }
     }
 
-    require "unicore/UCD.pl";
-    my $FIRST_NON_UNICODE = $MAX_UNICODE_CODEPOINT + 1;
-
     # Could need to be inverted: add or subtract a 0 at the beginning of the
-    # list.  And to keep it from matching non-Unicode, add or subtract the
-    # first non-unicode code point.
+    # list.
     if ($swash->{'INVERT_IT'}) {
         if (@invlist && $invlist[0] == 0) {
             shift @invlist;
@@ -2281,46 +2269,6 @@ sub prop_invlist ($;$) {
         else {
             unshift @invlist, 0;
         }
-        if (@invlist && $invlist[-1] == $FIRST_NON_UNICODE) {
-            pop @invlist;
-        }
-        else {
-            push @invlist, $FIRST_NON_UNICODE;
-        }
-    }
-
-    # Here, the list is set up to include only Unicode code points.  But, if
-    # the table is the default one for the property, it should contain all
-    # non-Unicode code points.  First calculate the loose name for the
-    # property.  This is done even for strict-name properties, as the data
-    # structure that mktables generates for us is set up so that we don't have
-    # to worry about that.  The property-value needs to be split if compound,
-    # as the loose rules need to be independently calculated on each part.  We
-    # know that it is syntactically valid, or SWASHNEW would have failed.
-
-    $prop = lc $prop;
-    my ($prop_only, $table) = split /\s*[:=]\s*/, $prop;
-    if ($table) {
-
-        # May have optional prefixed 'is'
-        $prop = utf8::_loose_name($prop_only) =~ s/^is//r;
-        $prop = $utf8::loose_property_name_of{$prop};
-        $prop .= "=" . utf8::_loose_name($table);
-    }
-    else {
-        $prop = utf8::_loose_name($prop);
-    }
-    if (exists $loose_defaults{$prop}) {
-
-        # Here, is the default table.  If a range ended with 10ffff, instead
-        # continue that range to infinity, by popping the 110000; otherwise,
-        # add the range from 11000 to infinity
-        if (! @invlist || $invlist[-1] != $FIRST_NON_UNICODE) {
-            push @invlist, $FIRST_NON_UNICODE;
-        }
-        else {
-            pop @invlist;
-        }
     }
 
     return @invlist;
@@ -2349,8 +2297,8 @@ or even better, C<"Gc=LC">).
 Many Unicode properties have more than one name (or alias).  C<prop_invmap>
 understands all of these, including Perl extensions to them.  Ambiguities are
 resolved as described above for L</prop_aliases()>.  The Perl internal
-property "Perl_Decimal_Digit, described below, is also accepted.  C<undef> is
-returned if the property name is unknown.
+property "Perl_Decimal_Digit", described below, is also accepted.  An empty
+list is returned if the property name is unknown.
 See L<perluniprops/Properties accessible through Unicode::UCD> for the
 properties acceptable as inputs to this function.
 
@@ -3252,6 +3200,7 @@ RETRY:
         # Find the beginning and end of the range on the line
         my ($hex_begin, $hex_end, $map) = split "\t", $range;
         my $begin = hex $hex_begin;
+        no warnings 'portable';
         my $end = (defined $hex_end && $hex_end ne "")
                   ? hex $hex_end
                   : $begin;
@@ -3375,7 +3324,7 @@ RETRY:
         # to the default value.  If there is no gap, the next iteration will
         # pop this, unless there is no next iteration, and we have filled all
         # of the Unicode code space, so check for that and skip.
-        if ($end < $MAX_UNICODE_CODEPOINT) {
+        if ($end < $Unicode::UCD::MAX_CP) {
             push @invlist, $end + 1;
             push @invmap, $missing;
         }
@@ -3388,10 +3337,15 @@ RETRY:
         push @invmap, $missing;
     }
 
-    # And add in standard element that all non-Unicode code points map to:
-    # $missing
-    push @invlist, $MAX_UNICODE_CODEPOINT + 1;
-    push @invmap, $missing;
+    # The final element is always for just the above-Unicode code points.  If
+    # not already there, add it.  It merely splits the current final range
+    # that extends to infinity into two elements, each with the same map.
+    # (This is to conform with the API that says the final element is for
+    # $MAX_UNICODE_CODEPOINT + 1 .. INFINITY.)
+    if ($invlist[-1] != $MAX_UNICODE_CODEPOINT + 1) {
+        push @invmap, $invmap[-1];
+        push @invlist, $MAX_UNICODE_CODEPOINT + 1;
+    }
 
     # The second component of the map are those values that require
     # non-standard specification, stored in SPECIALS.  These override any
diff --git a/lib/Unicode/UCD.t b/lib/Unicode/UCD.t
index c4b5a85098..b2caf8934c 100644
--- a/lib/Unicode/UCD.t
+++ b/lib/Unicode/UCD.t
@@ -1058,25 +1058,12 @@ foreach my $set_of_tables (\%utf8::stricter_to_file_of, \%utf8::loose_to_file_of
 
         # If we are to test against an inverted file, it is easier to invert
         # our array than the file.
-        # The file only is valid for Unicode code points, while the inversion
-        # list is valid for all possible code points.  Therefore, we must test
-        # just the Unicode part against the file.  Later we will test for
-        # the non-Unicode part.
-
-        my $before_invert;  # Saves the pre-inverted table.
         if ($invert) {
-            $before_invert = dclone \@tested;
             if (@tested && $tested[0] == 0) {
                 shift @tested;
             } else {
                 unshift @tested, 0;
             }
-            if (@tested && $tested[-1] == 0x110000) {
-                pop @tested;
-            }
-            else {
-                push @tested, 0x110000;
-            }
         }
 
         # Now construct a string from the list that should match the file.
@@ -1091,9 +1078,11 @@ foreach my $set_of_tables (\%utf8::stricter_to_file_of, \%utf8::loose_to_file_of
         # otherwise don't get reflected in the file.
         my $tested = "";
         my $i = 0;
-        for (; $i < @tested - 1; $i += 2) {
+        for (; $i < @tested; $i += 2) {
             my $start = $tested[$i];
-            my $end = $tested[$i+1] - 1;
+            my $end = ($i + 1 < @tested)
+                      ? $tested[$i+1] - 1
+                      : $Unicode::UCD::MAX_CP;
             if ($start == $end) {
                 $tested .= sprintf("%X\n", $start);
             }
@@ -1102,12 +1091,6 @@ foreach my $set_of_tables (\%utf8::stricter_to_file_of, \%utf8::loose_to_file_of
             }
         }
 
-        # As mentioned earlier, the disk files only go up through Unicode,
-        # whereas the prop_invlist() ones go as high as necessary.  The
-        # comparison is only valid through max Unicode.
-        if ($i == @tested - 1 && $tested[$i] <= 0x10FFFF) {
-            $tested .= sprintf("%X\t10FFFF\n", $tested[$i]);
-        }
         local $/ = "\n";
         chomp $tested;
         $/ = $input_record_separator;
@@ -1116,50 +1099,6 @@ foreach my $set_of_tables (\%utf8::stricter_to_file_of, \%utf8::loose_to_file_of
             next;
         }
 
-        # Here, it matched the table.  Now need to check for if it is correct
-        # for beyond Unicode.  First, calculate if is the default table or
-        # not.  This is the same algorithm as used internally in
-        # prop_invlist(), so if it is wrong there, this test won't catch it.
-        my $prop = lc $table;
-        ($prop_only, $table) = split /\s*[:=]\s*/, $prop;
-        if (defined $table) {
-
-            # May have optional prefixed 'is'
-            $prop = &utf8::_loose_name($prop_only) =~ s/^is//r;
-            $prop = $utf8::loose_property_name_of{$prop};
-            $prop .= "=" . &utf8::_loose_name($table);
-        }
-        else {
-            $prop = &utf8::_loose_name($prop);
-        }
-        my $is_default = exists $Unicode::UCD::loose_defaults{$prop};
-
-        @tested = @$before_invert if $invert;    # Use the original
-        if (@tested % 2 == 0) {
-
-            # If there are an even number of elements, the final one starts a
-            # range (going to infinity) of code points that are not in the
-            # list.
-            if ($is_default) {
-                fail("prop_invlist('$mod_table')");
-                diag("default table doesn't goto infinity");
-                use Data::Dumper;
-                diag Dumper \@tested;
-                next;
-            }
-        }
-        else {
-            # An odd number of elements means the final one starts a range
-            # (going to infinity of code points that are in the list.
-            if (! $is_default) {
-                fail("prop_invlist('$mod_table')");
-                diag("non-default table needs to stop in the Unicode range");
-                use Data::Dumper;
-                diag Dumper \@tested;
-                next;
-            }
-        }
-
         pass("prop_invlist('$mod_table')");
     }
 }
@@ -1391,7 +1330,35 @@ foreach my $prop (sort(keys %props), sort keys %legacy_props) {
         diag("The last inversion list element is not 0x110000");
         next PROPERTY;
     }
-    if ($invmap_ref->[-1] ne $missing) {
+
+    my $upper_limit_subtract;
+
+    # prop_invmap() adds an extra element not present in the disk files for
+    # the above-Unicode code points.  For almost all properties, that element
+    # maps to $missing.  In that case we don't look further at it when
+    # comparing with the disk files.
+    if ($invmap_ref->[-1] eq $missing) {
+        $upper_limit_subtract = 1;
+    }
+    elsif ($invmap_ref->[-1] eq 'Y' && ! grep { $_ !~ /[YN]/ } @$invmap_ref) {
+
+        # But that's not true for a few binary properties like 'Unassigned'
+        # that are Perl extensions (in this case for Gc=Unassigned) which
+        # match above-Unicode code points (hence the 'Y' in the test above).
+        # For properties where it isn't $missing, we're going to want to look
+        # at the whole thing when comparing with the disk file.
+        $upper_limit_subtract = 0;
+
+        # In those properties like 'Unassigned', the final element should be
+        # just a repetition of the next-to-last element, and won't be in the
+        # disk file, so remove it for the comparison.  Otherwise, we will
+        # compare the whole of the array with the whole of the disk file.
+        if ($invlist_ref->[-2] <= 0x10FFFF && $invmap_ref->[-2] eq 'Y') {
+            pop @$invlist_ref;
+            pop @$invmap_ref;
+        }
+    }
+    else {
         fail("prop_invmap('$display_prop')");
         diag("The last inversion list element is '$invmap_ref->[-1]', and should be '$missing'");
         next PROPERTY;
@@ -1705,9 +1672,10 @@ foreach my $prop (sort(keys %props), sort keys %legacy_props) {
         # it's an error
         my %specials = %$specials_ref if $specials_ref;
 
-        # The extra -1 is because the final element has been tested above to
-        # be for anything above Unicode.  The file doesn't go that high.
-        for (my $i = 0; $i <  @$invlist_ref - 1; $i++) {
+        # The extra -$upper_limit_subtract is because the final element may
+        # have been tested above to be for anything above Unicode, in which
+        # case the file may not go that high.
+        for (my $i = 0; $i < @$invlist_ref - $upper_limit_subtract; $i++) {
 
             # If the map element is a reference, have to stringify it (but
             # don't do so if the format doesn't allow references, so that an
@@ -1899,7 +1867,9 @@ foreach my $prop (sort(keys %props), sort keys %legacy_props) {
             # Finally have figured out what the map column in the file should
             # be.  Append the line to the running string.
             my $start = $invlist_ref->[$i];
-            my $end = $invlist_ref->[$i+1] - 1;
+            my $end = (defined $invlist_ref->[$i+1])
+                      ? $invlist_ref->[$i+1] - 1
+                      : $Unicode::UCD::MAX_CP;
             $end = ($start == $end) ? "" : sprintf($file_range_format, $end);
             if ($invmap_ref->[$i] ne "") {
                 $tested_map .= sprintf "$file_range_format\t%s\t%s\n",
@@ -1999,7 +1969,7 @@ foreach my $prop (sort(keys %props), sort keys %legacy_props) {
         my @code_point_in_names =
                                @Unicode::UCD::code_points_ending_in_code_point;
 
-        for my $i (0 .. @$invlist_ref - 1 - 1) {
+        for my $i (0 .. @$invlist_ref - 1 - $upper_limit_subtract) {
             my $start = $invlist_ref->[$i];
             my $end = $invlist_ref->[$i+1] - 1;
             if ($invmap_ref->[$i] eq $missing) {
@@ -2105,10 +2075,7 @@ foreach my $prop (sort(keys %props), sort keys %legacy_props) {
         my %maps;
         my $previous_map;
 
-        # (The extra -1 is to not look at the final element in the loop, which
-        # we know is the one that starts just beyond Unicode and goes to
-        # infinity.)
-        for my $i (0 .. @$invlist_ref - 1 - 1) {
+        for my $i (0 .. @$invlist_ref - 1 - $upper_limit_subtract) {
             my $range_start = $invlist_ref->[$i];
 
             # Because we are sorting into buckets, things could be
diff --git a/lib/unicore/mktables b/lib/unicore/mktables
index f3d5c83d58..b94433fe2c 100644
--- a/lib/unicore/mktables
+++ b/lib/unicore/mktables
@@ -1200,6 +1200,18 @@ my $MAX_UNICODE_CODEPOINT_STRING = "10FFFF";
 my $MAX_UNICODE_CODEPOINT = hex $MAX_UNICODE_CODEPOINT_STRING;
 my $MAX_UNICODE_CODEPOINTS = $MAX_UNICODE_CODEPOINT + 1;
 
+# We work with above-Unicode code points, up to UV_MAX.  But when you get
+# that high, above IV_MAX, some operations don't work, and you can easily get
+# overflow.  Therefore for internal use, we use a much smaller number,
+# translating it to UV_MAX only for output.  The exact number is immaterial
+# (all above-Unicode code points are treated exactly the same), but the
+# algorithm requires it to be at least 2 * $MAX_UNICODE_CODEPOINTS + 1.
+my $MAX_WORKING_CODEPOINTS= $MAX_UNICODE_CODEPOINT * 8;
+my $MAX_WORKING_CODEPOINT = $MAX_WORKING_CODEPOINTS - 1;
+my $MAX_WORKING_CODEPOINT_STRING = sprintf("%X", $MAX_WORKING_CODEPOINT);
+
+my $MAX_PLATFORM_CODEPOINT = ~0;
+
 # Matches legal code point.  4-6 hex numbers, If there are 6, the first
 # two must be 10; if there are 5, the first must not be a 0.  Written this way
 # to decrease backtracking.  The first regex allows the code point to be at
@@ -1531,7 +1543,8 @@ my $UNASSIGNED_TYPE = -2;
 my $PRIVATE_USE_TYPE = -3;
 my $NONCHARACTER_TYPE = -4;
 my $CONTROL_TYPE = -5;
-my $UNKNOWN_TYPE = -6;  # Used only if there is a bug in this program
+my $ABOVE_UNICODE_TYPE = -6;
+my $UNKNOWN_TYPE = -7;  # Used only if there is a bug in this program
 
 sub populate_char_info ($) {
     # Used only with the $annotate option.  Populates the arrays with the
@@ -1562,7 +1575,13 @@ sub populate_char_info ($) {
     my $end;
     if (! $viacode[$i]) {
         my $nonchar;
-        if ($gc-> table('Private_use')->contains($i)) {
+        if ($i > $MAX_UNICODE_CODEPOINT) {
+            $viacode[$i] = 'Above-Unicode';
+            $annotate_char_type[$i] = $ABOVE_UNICODE_TYPE;
+            $printable[$i] = 0;
+            $end = $MAX_WORKING_CODEPOINT;
+        }
+        elsif ($gc-> table('Private_use')->contains($i)) {
             $viacode[$i] = 'Private Use';
             $annotate_char_type[$i] = $PRIVATE_USE_TYPE;
             $printable[$i] = 0;
@@ -1715,7 +1734,15 @@ sub clarify_code_point_count ($) {
     # This is like clarify_number(), but the input is assumed to be a count of
     # code points, rather than a generic number.
 
-    return clarify_number(shift);
+    my $append = "";
+
+    my $number = shift;
+    if ($number > $MAX_UNICODE_CODEPOINTS) {
+        $number -= ($MAX_WORKING_CODEPOINTS - $MAX_UNICODE_CODEPOINTS);
+        return "All above-Unicode code points" if $number == 0;
+        $append = " + all above-Unicode code points";
+    }
+    return clarify_number($number) . $append;
 }
 
 package Carp;
@@ -3450,7 +3477,7 @@ sub trace { return main::trace(@_); }
 
         # If the range list is empty, return a large value that isn't adjacent
         # to any that could be in the range list, for simpler tests
-        return $MAX_UNICODE_CODEPOINT + 2 unless scalar @{$ranges{$addr}};
+        return $MAX_WORKING_CODEPOINT + 2 unless scalar @{$ranges{$addr}};
         return $ranges{$addr}->[0]->start;
     }
 
@@ -3729,9 +3756,6 @@ sub trace { return main::trace(@_); }
             Carp::my_carp_bug("$owner_name_of{$addr}End of range (" . sprintf("%04X", $end) . ") must not be before start (" . sprintf("%04X", $start) . ").  No action taken.");
             return;
         }
-        if ($end > $MAX_UNICODE_CODEPOINT && $operation eq '+') {
-            Carp::my_carp("$owner_name_of{$addr}Warning: Range '" . sprintf("%04X..%04X", $start, $end) . ") is above the Unicode maximum of " . sprintf("%04X", $MAX_UNICODE_CODEPOINT) . ".  Adding it anyway");
-        }
         #local $to_trace = 1 if main::DEBUG;
 
         if ($operation eq '-') {
@@ -4529,8 +4553,8 @@ sub trace { return main::trace(@_); }
 
         # And finally, add the gap from the end of the table to the max
         # possible code point
-        if ($max < $MAX_UNICODE_CODEPOINT) {
-            $new->add_range($max + 1, $MAX_UNICODE_CODEPOINT);
+        if ($max < $MAX_WORKING_CODEPOINT) {
+            $new->add_range($max + 1, $MAX_WORKING_CODEPOINT);
         }
         return $new;
     }
@@ -4819,6 +4843,7 @@ sub trace { return main::trace(@_); }
                 # range.
                 my $end = $set->end;
                 return $end if is_code_point_usable($end, $try_hard);
+                $end = $MAX_UNICODE_CODEPOINT + 1 if $end > $MAX_UNICODE_CODEPOINT;
 
                 # End point didn't, work.  Start at the beginning and try
                 # every one until find one that does work.
@@ -5722,6 +5747,7 @@ END
             my $next_end;
             my $next_value;
             my $offset = 0;
+            my $invlist_count = 0;
 
             my $output_value_in_hex = $self->isa('Map_Table')
                                 && ($self->format eq $HEX_ADJUST_FORMAT
@@ -5855,9 +5881,16 @@ END
 
                     # If there is a range
                     if ($start != $end) {
-                        push @OUT, sprintf "$hex_format\t$hex_format",
-                                             $start,       $end;
-                        if ($value ne "") {
+                        if ($end == $MAX_WORKING_CODEPOINT) {
+                            push @OUT, sprintf "$hex_format\t$hex_format",
+                                                $start,
+                                                $MAX_PLATFORM_CODEPOINT;
+                        }
+                        else {
+                            push @OUT, sprintf "$hex_format\t$hex_format",
+                                                $start,       $end;
+                        }
+                        if (length $value) {
                             if ($convert_map_to_from_hex) {
                                 $OUT[-1] .= sprintf "\t$hex_format\n", $value;
                             }
@@ -5958,8 +5991,15 @@ END
                                 }
 
                                 if ($i != $start || $range_end < $end) {
-                                    $annotation = sprintf "%04X..%04X",
-                                                           $i,   $range_end;
+                                    if ($range_end < $MAX_WORKING_CODEPOINT)
+                                    {
+                                        $annotation = sprintf "%04X..%04X",
+                                                              $i,   $range_end;
+                                    }
+                                    else {
+                                        $annotation = sprintf "%04X..INFINITY",
+                                                               $i;
+                                    }
                                 }
                                 else { # Indent if not displaying code points
                                     $annotation = " " x 4;
@@ -7696,7 +7736,18 @@ END
         # Get the number of code points matched by each of the tables in this
         # file, and add underscores for clarity.
         my $count = $leader->count;
-        my $string_count = main::clarify_code_point_count($count);
+        my $unicode_count;
+        my $non_unicode_string;
+        if ($count > $MAX_UNICODE_CODEPOINTS) {
+            $unicode_count = $count - ($MAX_WORKING_CODEPOINT
+                                       - $MAX_UNICODE_CODEPOINT);
+            $non_unicode_string = "All above-Unicode code points match as well, and are also returned";
+        }
+        else {
+            $unicode_count = $count;
+            $non_unicode_string = "";
+        }
+        my $string_count = main::clarify_code_point_count($unicode_count);
 
         my $loose_count = 0;        # how many aliases loosely matched
         my $compound_name = "";     # ? Are any names compound?, and if so, an
@@ -7894,11 +7945,13 @@ END
             }
         } # End of looping through all tables
 
+        $matches_comment .= "\n$non_unicode_string\n" if $non_unicode_string;
+
 
         my $code_points;
         my $match;
         my $any_of_these;
-        if ($count == 1) {
+        if ($unicode_count == 1) {
             $match = 'matches';
             $code_points = 'single code point';
         }
@@ -12999,7 +13052,7 @@ END
                 # This fills in any missing values with the default.  It's not
                 # necessary to do this with binary properties, as the default
                 # is defined completely in terms of the Y table.
-                $property->add_map(0, $MAX_UNICODE_CODEPOINT,
+                $property->add_map(0, $MAX_WORKING_CODEPOINT,
                                    $default_map, Replace => $NO);
             }
         }
@@ -13211,8 +13264,9 @@ sub compile_perl() {
     # 'All' is all code points.  As an error check, instead of just setting it
     # to be that, construct it to be the union of all the major categories
     $All = $perl->add_match_table('All',
-            Description  => "[\\x{0000}-\\x{$MAX_UNICODE_CODEPOINT_STRING}]",
-            Matches_All => 1);
+      Description
+        => "All code points, including those above Unicode.  Same as qr/./s",
+      Matches_All => 1);
 
     foreach my $major_table ($gc->tables) {
 
@@ -13222,10 +13276,10 @@ sub compile_perl() {
         $All += $major_table;
     }
 
-    if ($All->max != $MAX_UNICODE_CODEPOINT) {
+    if ($All->max != $MAX_WORKING_CODEPOINT) {
         Carp::my_carp_bug("Generated highest code point ("
            . sprintf("%X", $All->max)
-           . ") doesn't match expected value $MAX_UNICODE_CODEPOINT_STRING.")
+           . ") doesn't match expected value $MAX_WORKING_CODEPOINT_STRING.")
     }
     if ($All->range_count != 1 || $All->min != 0) {
      Carp::my_carp_bug("Generated table 'All' doesn't match all code points.")
@@ -14284,8 +14338,9 @@ END
             $unassigned_sans_noncharacters &= $nonchars->table('N');
         }
 
-        for (my $i = 0; $i <= $MAX_UNICODE_CODEPOINT; $i++ ) {
+        for (my $i = 0; $i <= $MAX_UNICODE_CODEPOINT + 1; $i++ ) {
             $i = populate_char_info($i);    # Note sets $i so may cause skips
+
         }
     }
 
@@ -14937,7 +14992,18 @@ sub make_re_pod_entries($) {
     my $full_name = $property->full_name;
 
     my $count = $input_table->count;
-    my $string_count = clarify_number($count);
+    my $unicode_count;
+    my $non_unicode_string;
+    if ($count > $MAX_UNICODE_CODEPOINTS) {
+        $unicode_count = $count - ($MAX_WORKING_CODEPOINT
+                                    - $MAX_UNICODE_CODEPOINT);
+        $non_unicode_string = " plus all above-Unicode code points";
+    }
+    else {
+        $unicode_count = $count;
+        $non_unicode_string = "";
+    }
+    my $string_count = clarify_number($unicode_count) . $non_unicode_string;
     my $status = $input_table->status;
     my $status_info = $input_table->status_info;
     my $caseless_equivalent = $input_table->caseless_equivalent;
@@ -16840,13 +16906,13 @@ sub write_all_tables() {
 
             my $count = $table->count;
             if ($expected_full) {
-                if ($count != $MAX_UNICODE_CODEPOINTS) {
+                if ($count != $MAX_WORKING_CODEPOINTS) {
                     Carp::my_carp("$table matches only "
                     . clarify_number($count)
                     . " Unicode code points but should match "
-                    . clarify_number($MAX_UNICODE_CODEPOINTS)
+                    . clarify_number($MAX_WORKING_CODEPOINTS)
                     . " (off by "
-                    .  clarify_number(abs($MAX_UNICODE_CODEPOINTS - $count))
+                    .  clarify_number(abs($MAX_WORKING_CODEPOINTS - $count))
                     . ").  Proceeding anyway.");
                 }
author	Karl Williamson <public@khwilliamson.com>	2013-12-23 20:35:54 -0700
committer	Karl Williamson <public@khwilliamson.com>	2013-12-31 08:27:23 -0700
commit	2d88a86a5910c97496b47b7b7c223f2c9a14b57c (patch)
tree	c0125ea6a9b6175c93245c4048773ae82e0f4efc /lib
parent	f215ab38f4d9ea2dca08fc71b38db0eb650d5107 (diff)
download	perl-2d88a86a5910c97496b47b7b7c223f2c9a14b57c.tar.gz