2 files changed, 122 insertions, 25 deletions
diff --git a/lib/Unicode/UCD.pm b/lib/Unicode/UCD.pm
index 6fb5cfc552..de62e5035a 100644
--- a/lib/Unicode/UCD.pm
+++ b/lib/Unicode/UCD.pm
@@ -2444,16 +2444,17 @@ contained in the C<Name_Alias> property.)
 
 =item B<C<d>>
 
-means the Decomposition_Mapping property.  This property is like C<cle>
-properties, except it has no empties, and it has an additional entry type:
+means the Decomposition_Mapping property.  This property is like C<cl>
+properties, except that one of the scalar elements is of the form:
 
  <hangul syllable>
 
-for those code points whose decomposition is algorithmically calculated.  (The
-C<n> format has this same entry.)  These can be generated via the function
+This signifies that this entry should be replaced by the decompositions for
+all the code points whose decomposition is algorithmically calculated.  (All
+of them are currently in one range and likely to remain so; the C<n> format
+has this same entry.)  These can be generated via the function
 L<Unicode::Normalize::NFD()|Unicode::Normalize>.
 
-
 Note that the mapping is the one that is specified in the Unicode data files,
 and to get the final decomposition, it may need to be applied recursively.
 
@@ -2719,7 +2720,8 @@ RETRY:
             }
             else {
                 $decomps{'TYPE'} = "ToDm";
-                $utf8::SwashInfo{'ToDm'}{'missing'} = "<code point>";
+                $utf8::SwashInfo{'ToDm'}{'missing'} = "0";
+                $utf8::SwashInfo{'ToDm'}{'format'} = 'i';
 
                 # Use a special internal-to-this_routine format, 'dm', to
                 # distinguish from 'd', meaning decimal.
@@ -2735,6 +2737,7 @@ RETRY:
                 my ($hex_lower, $hex_upper, $type_and_map) = split "\t", $line;
                 my $code_point = hex $hex_lower;
                 my $value;
+                my $redo = 0;
 
                 # The type, enclosed in <...>, precedes the mapping separated
                 # by blanks
@@ -2746,6 +2749,35 @@ RETRY:
                              ? "Canonical" :
                              $type_and_map;
                 }
+                if ($second_try eq 'dm') {
+                    my @map = map { hex } split " ", $value;
+
+                    if (@map == 1) {
+
+                        # Single character maps are converted to deltas, as
+                        # this file is stored, for backwards compatibility,
+                        # not using them.
+                        $value = $map[0] - $code_point;
+
+                        # If this is a multi-char range, process the rest of
+                        # it by doing a 'redo' after this line is done.  Fix
+                        # up the line to contain the rest of the range for
+                        # that redo.
+                        if ($hex_upper ne "" && hex $hex_upper != $code_point) {
+                            $line = sprintf("%04X\t%s\t%s",
+                                            $code_point + 1,
+                                            $hex_upper,
+                                            $type_and_map);
+                            $redo = 1;
+
+                            # Pretend that this is a single element range.
+                            $hex_upper = $hex_lower;
+                        }
+                    }
+                    else {
+                        $value = join " ", @map;
+                    }
+                }
 
                 # Insert the hangul range at the appropriate spot.
                 if (! $done_hangul && $code_point > $HANGUL_BEGIN) {
@@ -2761,6 +2793,8 @@ RETRY:
 
                 # And append this to our constructed LIST.
                 $decomps{'LIST'} .= "$hex_lower\t$hex_upper\t$value\n";
+
+                redo if $redo;
             }
             $swash = \%decomps;
         }
@@ -2906,7 +2940,7 @@ RETRY:
                 push @invmap, $map;
             }
             else {
-                my @map = map { hex } split " ", $map;
+                my @map = split " ", $map;
                 if (@map == 1) {
                     push @invmap, $map[0];
                 }
diff --git a/lib/Unicode/UCD.t b/lib/Unicode/UCD.t
index dd23b48aec..45573de07c 100644
--- a/lib/Unicode/UCD.t
+++ b/lib/Unicode/UCD.t
@@ -1274,7 +1274,7 @@ foreach my $prop (keys %props) {
         }
     }
     elsif ($format =~ /^ d /x) {
-        if ($missing ne "<code point>") {
+        if ($missing ne "0") {
             fail("prop_invmap('$mod_prop')");
             diag("The missings should be '<code point>'; got '$missing'");
             next PROPERTY;
@@ -1381,9 +1381,28 @@ foreach my $prop (keys %props) {
             # Get rid of any trailing space and comments in the file.
             $official =~ s/\s*(#.*)?$//mg;
 
-            # Decomposition.pl also has the <compatible> types in it, which
-            # should be removed.
-            $official =~ s/<.*?> //mg if $format eq 'd';
+            if ($format eq 'd') {
+                my @official = split /\n/, $official;
+                $official = "";
+                foreach my $line (@official) {
+                    my ($start, $end, $value)
+                                    = $line =~ / ^ (.+?) \t (.*?) \t (.+?)
+                                                \s* ( \# .* )? $ /x;
+                    # Decomposition.pl also has the <compatible> types in it,
+                    # which should be removed.
+                    $value =~ s/<.*?> //;
+                    $official .= "$start\t\t$value\n";
+
+                    # If this is a multi-char range, we turn it into as many
+                    # single character ranges as necessary.  This makes things
+                    # easier below.
+                    if ($end ne "") {
+                        for my $i (hex($start) + 1 .. hex $end) {
+                            $official .= sprintf "%04X\t\t%s\n", $i, $value;
+                        }
+                    }
+                }
+            }
         }
         chomp $official;
 
@@ -1416,8 +1435,7 @@ foreach my $prop (keys %props) {
             my @list;
             for (split "\n", $official) {
                 my ($start, $end, $value) = / ^ (.+?) \t (.*?) \t (.+?)
-                                                \s* ( \# .* )?
-                                                $ /x;
+                                                \s* ( \# .* )? $ /x;
                 $end = $start if $end eq "";
                 push @list, [ hex $start, hex $end, $value ];
             }
@@ -1434,22 +1452,41 @@ foreach my $prop (keys %props) {
                 # Find the spot in the @list of simple mappings that this
                 # special applies to; uses a linear search.
                 while ($i < @list -1 ) {
-                    last if  $cp <= $list[$i][0];
+                    last if  $cp <= $list[$i][1];
                     $i++;
                 }
 
-                #note  $i-1 . ": " . join " => ", @{$list[$i-1]};
-                #note  $i-0 . ": " . join " => ", @{$list[$i-0]};
-                #note  $i+1 . ": " . join " => ", @{$list[$i+1]};
+                # Here $i is such that it points to the first range which ends
+                # at or above cp, and hence is the only range that could
+                # possibly contain it.
+
+                # If not in this range, no range contains it: nothing to
+                # remove.
+                next if $cp < $list[$i][0];
 
-                # Then, remove any existing entry for this code point.
-                next if $cp != $list[$i][0];
-                splice @list, $i, 1;
+                # Otherwise, remove the existing entry.  If it is the first
+                # element of the range...
+                if ($cp == $list[$i][0]) {
+
+                    # ... and there are other elements in the range, just shorten
+                    # the range to exclude this code point.
+                    if ($list[$i][1] > $list[$i][0]) {
+                        $list[$i][0]++;
+                    }
 
-                #note __LINE__ . ": $cp";
-                #note  $i-1 . ": " . join " => ", @{$list[$i-1]};
-                #note  $i-0 . ": " . join " => ", @{$list[$i-0]};
-                #note  $i+1 . ": " . join " => ", @{$list[$i+1]};
+                    # ... but if it is the only element in the range, remove
+                    # it entirely.
+                    else {
+                        splice @list, $i, 1;
+                    }
+                }
+                else { # Is somewhere in the middle of the range
+                    # Split the range into two, excluding this one in the
+                    # middle
+                    splice @list, $i, 1,
+                           [ $list[$i][0], $cp - 1, $list[$i][2] ],
+                           [ $cp + 1, $list[$i][1], $list[$i][2] ];
+                }
             }
 
             # Here, have gone through all the specials, modifying @list as
@@ -1500,7 +1537,7 @@ foreach my $prop (keys %props) {
 
         # The extra -1 is because the final element has been tested above to
         # be for anything above Unicode.  The file doesn't go that high.
-        for my $i (0 .. @$invlist_ref - 1 - 1) {
+        for (my $i = 0; $i <  @$invlist_ref - 1; $i++) {
 
             # If the map element is a reference, have to stringify it (but
             # don't do so if the format doesn't allow references, so that an
@@ -1582,6 +1619,32 @@ foreach my $prop (keys %props) {
                     next PROPERTY;
                 }
             }
+            elsif ($format eq 'd') {
+
+                # The numerics in the map are stored as deltas.  The defaults
+                # are 0, and don't appear in $official, and are excluded
+                # later, but the elements must be converted back to their real
+                # code point values before comparing with $official, as that
+                # file, for backwards compatibility, is not stored as deltas
+                if ($invmap_ref->[$i] =~ / ^ -? \d+ $ /x
+                    && $invmap_ref->[$i] != 0)
+                {
+                    my $delta = $invmap_ref->[$i];
+                    $invmap_ref->[$i] += $invlist_ref->[$i];
+
+                    # If there are other elements with this same delta, they
+                    # must individually be re-mapped.  Do this by splicing in
+                    # a new element into the list and the map containing the
+                    # remainder of the range.  Next time through we will look
+                    # at that (possibly splicing again until the whole range
+                    # is processed).
+                    if ($invlist_ref->[$i+1] > $invlist_ref->[$i] + 1) {
+                        splice @$invlist_ref, $i+1, 0,
+                                $invlist_ref->[$i] + 1;
+                        splice @$invmap_ref, $i+1, 0, $delta;
+                    }
+                }
+            }
             elsif ($format eq 'cle' && $invmap_ref->[$i] eq "") {
 
                 # cle properties have maps to the empty string that also