Unicode::UCD::prop_invmap(): Return deltas for the 'dm' property

Earlier commits caused the return of prop_invmap() for certain properties to return deltas from code points instead of the code points themselves, for compactness of storage and speed of searching. This causes the same for the 'dm' property, for consistency with the others, even though the space savings is not large for this one; essentially the same code can be used for the two types now; instead of an application having to have special cases.
author: Karl Williamson <public@khwilliamson.com> 2012-01-28 17:39:10 -0700
committer: Karl Williamson <public@khwilliamson.com> 2012-02-04 16:29:32 -0700
commit: bea2c146352c47a938243d84d1a4fa99f7a328bb (patch)
tree: 989b074c648dbd41cc21cfb20b5067f1068b6656 /lib
parent: a9d188b3497f7a6fe7fe9b203df96d3017dda4a4 (diff)
download: perl-bea2c146352c47a938243d84d1a4fa99f7a328bb.tar.gz
2 files changed, 122 insertions, 25 deletions
diff --git a/lib/Unicode/UCD.pm b/lib/Unicode/UCD.pm
index 6fb5cfc552..de62e5035a 100644
--- a/lib/Unicode/UCD.pm
+++ b/lib/Unicode/UCD.pm
@@ -2444,16 +2444,17 @@ contained in the C<Name_Alias> property.)
 
 =item B<C<d>>
 
-means the Decomposition_Mapping property.  This property is like C<cle>
-properties, except it has no empties, and it has an additional entry type:
+means the Decomposition_Mapping property.  This property is like C<cl>
+properties, except that one of the scalar elements is of the form:
 
  <hangul syllable>
 
-for those code points whose decomposition is algorithmically calculated.  (The
-C<n> format has this same entry.)  These can be generated via the function
+This signifies that this entry should be replaced by the decompositions for
+all the code points whose decomposition is algorithmically calculated.  (All
+of them are currently in one range and likely to remain so; the C<n> format
+has this same entry.)  These can be generated via the function
 L<Unicode::Normalize::NFD()|Unicode::Normalize>.
 
-
 Note that the mapping is the one that is specified in the Unicode data files,
 and to get the final decomposition, it may need to be applied recursively.
 
@@ -2719,7 +2720,8 @@ RETRY:
             }
             else {
                 $decomps{'TYPE'} = "ToDm";
-                $utf8::SwashInfo{'ToDm'}{'missing'} = "<code point>";
+                $utf8::SwashInfo{'ToDm'}{'missing'} = "0";
+                $utf8::SwashInfo{'ToDm'}{'format'} = 'i';
 
                 # Use a special internal-to-this_routine format, 'dm', to
                 # distinguish from 'd', meaning decimal.
@@ -2735,6 +2737,7 @@ RETRY:
                 my ($hex_lower, $hex_upper, $type_and_map) = split "\t", $line;
                 my $code_point = hex $hex_lower;
                 my $value;
+                my $redo = 0;
 
                 # The type, enclosed in <...>, precedes the mapping separated
                 # by blanks
@@ -2746,6 +2749,35 @@ RETRY:
                              ? "Canonical" :
                              $type_and_map;
                 }
+                if ($second_try eq 'dm') {
+                    my @map = map { hex } split " ", $value;
+
+                    if (@map == 1) {
+
+                        # Single character maps are converted to deltas, as
+                        # this file is stored, for backwards compatibility,
+                        # not using them.
+                        $value = $map[0] - $code_point;
+
+                        # If this is a multi-char range, process the rest of
+                        # it by doing a 'redo' after this line is done.  Fix
+                        # up the line to contain the rest of the range for
+                        # that redo.
+                        if ($hex_upper ne "" && hex $hex_upper != $code_point) {
+                            $line = sprintf("%04X\t%s\t%s",
+                                            $code_point + 1,
+                                            $hex_upper,
+                                            $type_and_map);
+                            $redo = 1;
+
+                            # Pretend that this is a single element range.
+                            $hex_upper = $hex_lower;
+                        }
+                    }
+                    else {
+                        $value = join " ", @map;
+                    }
+                }
 
                 # Insert the hangul range at the appropriate spot.
                 if (! $done_hangul && $code_point > $HANGUL_BEGIN) {
@@ -2761,6 +2793,8 @@ RETRY:
 
                 # And append this to our constructed LIST.
                 $decomps{'LIST'} .= "$hex_lower\t$hex_upper\t$value\n";
+
+                redo if $redo;
             }
             $swash = \%decomps;
         }
@@ -2906,7 +2940,7 @@ RETRY:
                 push @invmap, $map;
             }
             else {
-                my @map = map { hex } split " ", $map;
+                my @map = split " ", $map;
                 if (@map == 1) {
                     push @invmap, $map[0];
                 }
diff --git a/lib/Unicode/UCD.t b/lib/Unicode/UCD.t
index dd23b48aec..45573de07c 100644
--- a/lib/Unicode/UCD.t
+++ b/lib/Unicode/UCD.t
@@ -1274,7 +1274,7 @@ foreach my $prop (keys %props) {
         }
     }
     elsif ($format =~ /^ d /x) {
-        if ($missing ne "<code point>") {
+        if ($missing ne "0") {
             fail("prop_invmap('$mod_prop')");
             diag("The missings should be '<code point>'; got '$missing'");
             next PROPERTY;
@@ -1381,9 +1381,28 @@ foreach my $prop (keys %props) {
             # Get rid of any trailing space and comments in the file.
             $official =~ s/\s*(#.*)?$//mg;
 
-            # Decomposition.pl also has the <compatible> types in it, which
-            # should be removed.
-            $official =~ s/<.*?> //mg if $format eq 'd';
+            if ($format eq 'd') {
+                my @official = split /\n/, $official;
+                $official = "";
+                foreach my $line (@official) {
+                    my ($start, $end, $value)
+                                    = $line =~ / ^ (.+?) \t (.*?) \t (.+?)
+                                                \s* ( \# .* )? $ /x;
+                    # Decomposition.pl also has the <compatible> types in it,
+                    # which should be removed.
+                    $value =~ s/<.*?> //;
+                    $official .= "$start\t\t$value\n";
+
+                    # If this is a multi-char range, we turn it into as many
+                    # single character ranges as necessary.  This makes things
+                    # easier below.
+                    if ($end ne "") {
+                        for my $i (hex($start) + 1 .. hex $end) {
+                            $official .= sprintf "%04X\t\t%s\n", $i, $value;
+                        }
+                    }
+                }
+            }
         }
         chomp $official;
 
@@ -1416,8 +1435,7 @@ foreach my $prop (keys %props) {
             my @list;
             for (split "\n", $official) {
                 my ($start, $end, $value) = / ^ (.+?) \t (.*?) \t (.+?)
-                                                \s* ( \# .* )?
-                                                $ /x;
+                                                \s* ( \# .* )? $ /x;
                 $end = $start if $end eq "";
                 push @list, [ hex $start, hex $end, $value ];
             }
@@ -1434,22 +1452,41 @@ foreach my $prop (keys %props) {
                 # Find the spot in the @list of simple mappings that this
                 # special applies to; uses a linear search.
                 while ($i < @list -1 ) {
-                    last if  $cp <= $list[$i][0];
+                    last if  $cp <= $list[$i][1];
                     $i++;
                 }
 
-                #note  $i-1 . ": " . join " => ", @{$list[$i-1]};
-                #note  $i-0 . ": " . join " => ", @{$list[$i-0]};
-                #note  $i+1 . ": " . join " => ", @{$list[$i+1]};
+                # Here $i is such that it points to the first range which ends
+                # at or above cp, and hence is the only range that could
+                # possibly contain it.
+
+                # If not in this range, no range contains it: nothing to
+                # remove.
+                next if $cp < $list[$i][0];
 
-                # Then, remove any existing entry for this code point.
-                next if $cp != $list[$i][0];
-                splice @list, $i, 1;
+                # Otherwise, remove the existing entry.  If it is the first
+                # element of the range...
+                if ($cp == $list[$i][0]) {
+
+                    # ... and there are other elements in the range, just shorten
+                    # the range to exclude this code point.
+                    if ($list[$i][1] > $list[$i][0]) {
+                        $list[$i][0]++;
+                    }
 
-                #note __LINE__ . ": $cp";
-                #note  $i-1 . ": " . join " => ", @{$list[$i-1]};
-                #note  $i-0 . ": " . join " => ", @{$list[$i-0]};
-                #note  $i+1 . ": " . join " => ", @{$list[$i+1]};
+                    # ... but if it is the only element in the range, remove
+                    # it entirely.
+                    else {
+                        splice @list, $i, 1;
+                    }
+                }
+                else { # Is somewhere in the middle of the range
+                    # Split the range into two, excluding this one in the
+                    # middle
+                    splice @list, $i, 1,
+                           [ $list[$i][0], $cp - 1, $list[$i][2] ],
+                           [ $cp + 1, $list[$i][1], $list[$i][2] ];
+                }
             }
 
             # Here, have gone through all the specials, modifying @list as
@@ -1500,7 +1537,7 @@ foreach my $prop (keys %props) {
 
         # The extra -1 is because the final element has been tested above to
         # be for anything above Unicode.  The file doesn't go that high.
-        for my $i (0 .. @$invlist_ref - 1 - 1) {
+        for (my $i = 0; $i <  @$invlist_ref - 1; $i++) {
 
             # If the map element is a reference, have to stringify it (but
             # don't do so if the format doesn't allow references, so that an
@@ -1582,6 +1619,32 @@ foreach my $prop (keys %props) {
                     next PROPERTY;
                 }
             }
+            elsif ($format eq 'd') {
+
+                # The numerics in the map are stored as deltas.  The defaults
+                # are 0, and don't appear in $official, and are excluded
+                # later, but the elements must be converted back to their real
+                # code point values before comparing with $official, as that
+                # file, for backwards compatibility, is not stored as deltas
+                if ($invmap_ref->[$i] =~ / ^ -? \d+ $ /x
+                    && $invmap_ref->[$i] != 0)
+                {
+                    my $delta = $invmap_ref->[$i];
+                    $invmap_ref->[$i] += $invlist_ref->[$i];
+
+                    # If there are other elements with this same delta, they
+                    # must individually be re-mapped.  Do this by splicing in
+                    # a new element into the list and the map containing the
+                    # remainder of the range.  Next time through we will look
+                    # at that (possibly splicing again until the whole range
+                    # is processed).
+                    if ($invlist_ref->[$i+1] > $invlist_ref->[$i] + 1) {
+                        splice @$invlist_ref, $i+1, 0,
+                                $invlist_ref->[$i] + 1;
+                        splice @$invmap_ref, $i+1, 0, $delta;
+                    }
+                }
+            }
             elsif ($format eq 'cle' && $invmap_ref->[$i] eq "") {
 
                 # cle properties have maps to the empty string that also
author	Karl Williamson <public@khwilliamson.com>	2012-01-28 17:39:10 -0700
committer	Karl Williamson <public@khwilliamson.com>	2012-02-04 16:29:32 -0700
commit	bea2c146352c47a938243d84d1a4fa99f7a328bb (patch)
tree	989b074c648dbd41cc21cfb20b5067f1068b6656 /lib
parent	a9d188b3497f7a6fe7fe9b203df96d3017dda4a4 (diff)
download	perl-bea2c146352c47a938243d84d1a4fa99f7a328bb.tar.gz