mktables: Generate some delta tables

This commit changes the non-legacy tables for the lc, uc, tc, and fc properties to use maps of deltas from the code points instead of the code points themselves, thus shortening the tables significantly and hence reducing the time required to search through them. Note that these tables are new, and are currently used only by Unicode::UCD. A future commit will change the Perl core to use them.
author: Karl Williamson <public@khwilliamson.com> 2012-01-28 09:51:58 -0700
committer: Karl Williamson <public@khwilliamson.com> 2012-02-04 16:29:31 -0700
commit: bf7fe2df5e62a989d05ef5bc74eeddc134b7594e (patch)
tree: 09fb278472ed8eca5a6839ff8842726cc111783c
parent: ed307795c2d094385cb96962549a9d4d1aeb5f70 (diff)
download: perl-bf7fe2df5e62a989d05ef5bc74eeddc134b7594e.tar.gz
3 files changed, 107 insertions, 75 deletions
diff --git a/lib/Unicode/UCD.pm b/lib/Unicode/UCD.pm
index c137af3c47..6fb5cfc552 100644
--- a/lib/Unicode/UCD.pm
+++ b/lib/Unicode/UCD.pm
@@ -407,15 +407,21 @@ sub charinfo {
 
     %SIMPLE_UPPER = _read_table("unicore/To/Uc.pl", "use_hash")
                                                            unless %SIMPLE_UPPER;
-    $prop{'upper'} = $SIMPLE_UPPER{$code} // "";
+    $prop{'upper'} = (defined $SIMPLE_UPPER{$code})
+                     ? sprintf("%04X", $SIMPLE_UPPER{$code} + $code)
+                     : "";
 
     %SIMPLE_LOWER = _read_table("unicore/To/Lc.pl", "use_hash")
                                                            unless %SIMPLE_LOWER;
-    $prop{'lower'} = $SIMPLE_LOWER{$code} // "";
+    $prop{'lower'} = (defined $SIMPLE_LOWER{$code})
+                     ? sprintf("%04X", $SIMPLE_LOWER{$code} + $code)
+                     : "";
 
     %SIMPLE_TITLE = _read_table("unicore/To/Tc.pl", "use_hash")
                                                            unless %SIMPLE_TITLE;
-    $prop{'title'} = $SIMPLE_TITLE{$code} // "";
+    $prop{'title'} = (defined $SIMPLE_TITLE{$code})
+                     ? sprintf("%04X", $SIMPLE_TITLE{$code} + $code)
+                     : "";
 
     $prop{block}  = charblock($code);
     $prop{script} = charscript($code);
@@ -2326,10 +2332,9 @@ C<"r">.
 
 =item B<C<c>>
 
-is like C<s> in that all the map array elements are scalars, but some of them
-are the special string S<C<"E<lt>code pointE<gt>">>, meaning that the map of
-each code point in the corresponding range in the inversion list is the code
-point itself.  For example, in:
+is like C<s> in that all the map array elements are scalars, but here they are
+restricted to all being integers, and each has to be tweaked to get the correct
+result by adding the code point number to it.  For example, in:
 
  my ($uppers_ranges_ref, $uppers_maps_ref, $format)
                           = prop_invmap("Simple_Uppercase_Mapping");
@@ -2337,24 +2342,22 @@ point itself.  For example, in:
 the returned arrays look like this:
 
  @$uppers_ranges_ref    @$uppers_maps_ref   Note
-       0                 "<code point>"
-      97                     65          'a' maps to 'A'
-      98                     66          'b' => 'B'
-      99                     67          'c' => 'C'
-      ...
-     120                     88          'x' => 'X'
-     121                     89          'y' => 'Y'
-     122                     90          'z' => 'Z'
-     123                "<code point>"
-     181                    924          MICRO SIGN => Greek Cap MU
-     182                "<code point>"
+       0                      0
+      97                    -32          'a' maps to 'A', b => B ...
+     123                      0
+     181                    743          MICRO SIGN => Greek Cap MU
+     182                      0
      ...
 
-The first line means that the uppercase of code point 0 is 0;
-the uppercase of code point 1 is 1; ...  of code point 96 is 96.  Without the
-C<"E<lt>code_pointE<gt>"> notation, every code point would have to have an
-entry.  This would mean that the arrays would each have more than a million
-entries to list just the legal Unicode code points!
+The first line means that the uppercase of code point 0 is 0+0; the uppercase
+of code point 1 is 1+0; ...  of code point 96 is 96+0.  In other words, the
+uppercase of each of the code points 0..96 is itself.  The second line
+means that code point 97 maps to 97-32 (=65) or the uppercase of 'a' is 'A';
+98 => 98-32 (=66) or the uppercase of 'b' is 'B'; ... 122 => 122-32 (=90) or
+the uppercase of 'z' is 'Z'.
+
+Requiring the caller to add the code point to the returned result makes the
+arrays significantly smaller.
 
 =item B<C<cl>>
 
@@ -2368,17 +2371,15 @@ For example, in:
 the returned arrays look like this:
 
  @$uppers_ranges_ref    @$uppers_maps_ref
-       0                 "<code point>"
-      97                     65
-     ...
-     122                     90
-     123                "<code point>"
-     181                    924
-     182                "<code point>"
+       0                      0
+      97                    -32
+     123                      0
+     181                    743
+     182                      0
      ...
     0x0149              [ 0x02BC 0x004E ]
-    0x014A              "<code point>"
-    0x014B                 0x014A
+    0x014A                    0
+    0x014B                   -1
      ...
 
 This is the full Uppercase_Mapping property (as opposed to the
@@ -2388,9 +2389,18 @@ difference between the two in the ranges shown is that the code point at
 characters, 0x02BC (MODIFIER LETTER APOSTROPHE) followed by 0x004E (LATIN
 CAPITAL LETTER N).
 
+Yes, there is an inconsistency here.  When the map is a single element the
+correct value must be derived by adding the code point number to it; when the
+map is a list of code points, they are the final correct values.  The reason
+for forcing the addition is to make the returned map array significantly more
+compact.  There is no such advantage to doing the same thing to the elements
+that are lists, and the addition is extra work.
+
 =item B<C<cle>>
 
-means that some of the map array elements have the forms given by C<cl>, and
+is like C<cl> except that, as an interim measure, the map returned for simple
+scalars is the final value and the code point should NOT be added to it.
+Also, some of the map array elements have the forms given by C<cl>, and
 the rest are the empty string.  The property C<NFKC_Casefold> has this form.
 An example slice is:
 
@@ -2434,8 +2444,8 @@ contained in the C<Name_Alias> property.)
 
 =item B<C<d>>
 
-means the Decomposition_Mapping property.  This property is like C<cl>
-properties, except it has an additional entry type:
+means the Decomposition_Mapping property.  This property is like C<cle>
+properties, except it has no empties, and it has an additional entry type:
 
  <hangul syllable>
 
@@ -3058,7 +3068,7 @@ RETRY:
     elsif ($format eq 'x') {
 
         # All hex-valued properties are really to code points
-        $format = 'c';
+        $format = 'i';
     }
     elsif ($format eq 'dm') {
         $format = 'd';
diff --git a/lib/Unicode/UCD.t b/lib/Unicode/UCD.t
index 8dd977fd81..dd23b48aec 100644
--- a/lib/Unicode/UCD.t
+++ b/lib/Unicode/UCD.t
@@ -873,9 +873,9 @@ use Unicode::UCD qw(prop_invlist prop_invmap MAX_CP);
 my $prop = "uc";
 my ($invlist_ref, $invmap_ref, $format, $missing) = prop_invmap($prop);
 is($format, 'cl', "prop_invmap() format of '$prop' is 'cl'");
-is($missing, '<code point>', "prop_invmap() missing of '$prop' is '<code point>'");
+is($missing, '0', "prop_invmap() missing of '$prop' is '0'");
 is($invlist_ref->[1], 0x61, "prop_invmap('$prop') list[1] is 0x61");
-is($invmap_ref->[1], 0x41, "prop_invmap('$prop') map[1] is 0x41");
+is($invmap_ref->[1], -32, "prop_invmap('$prop') map[1] is -32");
 
 $prop = "upper";
 ($invlist_ref, $invmap_ref, $format, $missing) = prop_invmap($prop);
@@ -894,9 +894,9 @@ is($invmap_ref->[1], 'Y', "prop_invmap('$prop') map[1] is 'Y'");
 $prop = "lc";
 ($invlist_ref, $invmap_ref, $format, $missing) = prop_invmap($prop);
 is($format, 'cl', "prop_invmap() format of '$prop' is 'cl'");
-is($missing, '<code point>', "prop_invmap() missing of '$prop' is '<code point>'");
+is($missing, '0', "prop_invmap() missing of '$prop' is '0'");
 is($invlist_ref->[1], 0x41, "prop_invmap('$prop') list[1] is 0x41");
-is($invmap_ref->[1], 0x61, "prop_invmap('$prop') map[1] is 0x61");
+is($invmap_ref->[1], 32, "prop_invmap('$prop') map[1] is 32");
 
 # This property is stable and small, so can test all of it
 $prop = "ASCII_Hex_Digit";
@@ -1259,7 +1259,21 @@ foreach my $prop (keys %props) {
             next PROPERTY;
         }
     }
-    elsif ($format =~ /^ [cd] /x) {
+    elsif ($name eq 'nfkccf') {   # This one has an atypical $missing
+        if ($missing ne "<code point>") {
+            fail("prop_invmap('$mod_prop')");
+            diag("The missings should be '<code point>'; got '$missing'");
+            next PROPERTY;
+        }
+    }
+    elsif ($format =~ /^ c /x) {
+        if ($missing ne "0") {
+            fail("prop_invmap('$mod_prop')");
+            diag("The missings should be '0'; got '$missing'");
+            next PROPERTY;
+        }
+    }
+    elsif ($format =~ /^ d /x) {
         if ($missing ne "<code point>") {
             fail("prop_invmap('$mod_prop')");
             diag("The missings should be '<code point>'; got '$missing'");
@@ -1374,9 +1388,10 @@ foreach my $prop (keys %props) {
         chomp $official;
 
         # If there are any special elements, get a reference to them.
-        my $specials_ref = $utf8::file_to_swash_name{$base_file};
-        if ($specials_ref) {
-            $specials_ref = $utf8::SwashInfo{$specials_ref}{'specials_name'};
+        my $swash_name = $utf8::file_to_swash_name{$base_file};
+        my $specials_ref;
+        if ($swash_name) {
+            $specials_ref = $utf8::SwashInfo{$swash_name}{'specials_name'};
             if ($specials_ref) {
 
                 # Convert from the name to the actual reference.
@@ -1404,12 +1419,7 @@ foreach my $prop (keys %props) {
                                                 \s* ( \# .* )?
                                                 $ /x;
                 $end = $start if $end eq "";
-                if ($end ne $start) {
-                    fail("prop_invmap('$mod_prop')");
-                    diag("This test is expecting only single code point ranges in $file.pl");
-                    next PROPERTY;
-                }
-                push @list, [ hex $start, $value ];
+                push @list, [ hex $start, hex $end, $value ];
             }
 
             # For these mappings, the file contains all the simple mappings,
@@ -1444,7 +1454,16 @@ foreach my $prop (keys %props) {
 
             # Here, have gone through all the specials, modifying @list as
             # needed.  Turn it back into what the file should look like.
-            $official = join "\n", map { sprintf "%04X\t\t%s", @$_ } @list;
+            $official = "";
+            for my $element (@list) {
+                $official .= "\n" if $official;
+                if ($element->[1] == $element->[0]) {
+                    $official .= sprintf "%04X\t\t%s", $element->[0], $element->[2];
+                }
+                else {
+                    $official .= sprintf "%04X\t%04X\t%s", $element->[0], $element->[1], $element->[2];
+                }
+            }
         }
         elsif ($full_name =~ /Simple_(Case_Folding|(Lower|Title|Upper)case_Mapping)/)
         {
@@ -1453,6 +1472,18 @@ foreach my $prop (keys %props) {
             # specials are superfluous.
             undef $specials_ref;
         }
+        elsif ($name eq 'bmg') {
+
+            # For this property, the file is output using hex notation for the
+            # map, with all ranges equal to length 1.  Convert from hex to
+            # decimal.
+            my @lines = split "\n", $official;
+            foreach my $line (@lines) {
+                my ($code_point, $map) = split "\t\t", $line;
+                $line = $code_point . "\t\t" . hex $map;
+            }
+            $official = join "\n", @lines;
+        }
 
         # Here, in $official, we have what the file looks like, or should like
         # if we've had to fix it up.  Now take the invmap() output and reverse
@@ -1598,9 +1629,13 @@ foreach my $prop (keys %props) {
                 next;
             }
 
-            # 'c'-type and 'd' properties have the mapping expressed in hex in
-            # the file
-            if ($format =~ /^ [cd] /x) {
+            # The 'd' property and 'c' properties whose underlying format is
+            # hexadecimal have the mapping expressed in hex in the file
+            if ($format eq 'd'
+                || ($format =~ /^c/
+                    && $swash_name
+                    && $utf8::SwashInfo{$swash_name}{'format'} eq 'x'))
+            {
 
                 # The d property has one entry which isn't in the file.
                 # Ignore it, but make sure it is in order.
diff --git a/lib/unicore/mktables b/lib/unicore/mktables
index e27297463c..98898e8250 100644
--- a/lib/unicore/mktables
+++ b/lib/unicore/mktables
@@ -5986,8 +5986,14 @@ sub trace { return main::trace(@_); }
 
         # But do want to output string ones.  All the ones that remain to
         # be dealt with (i.e. which haven't explicitly been set to external)
-        # are for internal Perl use only.
-        return $INTERNAL_MAP if $type == $STRING;
+        # are for internal Perl use only.  The default for those that map to
+        # $CODE_POINT and haven't been restricted to a single element range
+        # is to use the delta form.
+        if ($type == $STRING) {
+            return $INTERNAL_MAP if $self->range_size_1
+                                    || $default_map{$addr} ne $CODE_POINT;
+            return $OUTPUT_DELTAS;
+        }
 
         # Otherwise is an $ENUM, do output it, for Perl's purposes
         return $INTERNAL_MAP;
@@ -6527,12 +6533,6 @@ END
 
         $self->_set_format($format);
 
-        # Core Perl has a different definition of mapping ranges than we do,
-        # that is applicable mainly to mapping code points, so for tables
-        # where it is possible that core Perl could be used to read it,
-        # make it range size 1 to prevent possible confusion
-        $self->set_range_size_1(1) if $format eq $HEX_FORMAT;
-
         return $self->SUPER::write(
             $output_deltas,
             ($self->property == $block)
@@ -8772,19 +8772,6 @@ sub finish_property_setup {
         $ccc->set_directory(File::Spec->curdir());
     }
 
-    # utf8.c has a different meaning for non range-size-1 for map properties
-    # that this program doesn't currently handle; and even if it were changed
-    # to do so, some other code may be using them expecting range size 1.
-    foreach my $property (qw {
-                                Case_Folding
-                                Lowercase_Mapping
-                                Titlecase_Mapping
-                                Uppercase_Mapping
-                            })
-    {
-        property_ref($property)->set_range_size_1(1);
-    }
-
     # These two properties aren't actually used in the core, but unfortunately
     # the names just above that are in the core interfere with these, so
     # choose different names.  These aren't a problem unless the map tables
author	Karl Williamson <public@khwilliamson.com>	2012-01-28 09:51:58 -0700
committer	Karl Williamson <public@khwilliamson.com>	2012-02-04 16:29:31 -0700
commit	bf7fe2df5e62a989d05ef5bc74eeddc134b7594e (patch)
tree	09fb278472ed8eca5a6839ff8842726cc111783c
parent	ed307795c2d094385cb96962549a9d4d1aeb5f70 (diff)
download	perl-bf7fe2df5e62a989d05ef5bc74eeddc134b7594e.tar.gz