diff options
Diffstat (limited to 'lib')
-rw-r--r-- | lib/Unicode/UCD.pm | 48 | ||||
-rw-r--r-- | lib/Unicode/UCD.t | 99 |
2 files changed, 122 insertions, 25 deletions
diff --git a/lib/Unicode/UCD.pm b/lib/Unicode/UCD.pm index 6fb5cfc552..de62e5035a 100644 --- a/lib/Unicode/UCD.pm +++ b/lib/Unicode/UCD.pm @@ -2444,16 +2444,17 @@ contained in the C<Name_Alias> property.) =item B<C<d>> -means the Decomposition_Mapping property. This property is like C<cle> -properties, except it has no empties, and it has an additional entry type: +means the Decomposition_Mapping property. This property is like C<cl> +properties, except that one of the scalar elements is of the form: <hangul syllable> -for those code points whose decomposition is algorithmically calculated. (The -C<n> format has this same entry.) These can be generated via the function +This signifies that this entry should be replaced by the decompositions for +all the code points whose decomposition is algorithmically calculated. (All +of them are currently in one range and likely to remain so; the C<n> format +has this same entry.) These can be generated via the function L<Unicode::Normalize::NFD()|Unicode::Normalize>. - Note that the mapping is the one that is specified in the Unicode data files, and to get the final decomposition, it may need to be applied recursively. @@ -2719,7 +2720,8 @@ RETRY: } else { $decomps{'TYPE'} = "ToDm"; - $utf8::SwashInfo{'ToDm'}{'missing'} = "<code point>"; + $utf8::SwashInfo{'ToDm'}{'missing'} = "0"; + $utf8::SwashInfo{'ToDm'}{'format'} = 'i'; # Use a special internal-to-this_routine format, 'dm', to # distinguish from 'd', meaning decimal. @@ -2735,6 +2737,7 @@ RETRY: my ($hex_lower, $hex_upper, $type_and_map) = split "\t", $line; my $code_point = hex $hex_lower; my $value; + my $redo = 0; # The type, enclosed in <...>, precedes the mapping separated # by blanks @@ -2746,6 +2749,35 @@ RETRY: ? "Canonical" : $type_and_map; } + if ($second_try eq 'dm') { + my @map = map { hex } split " ", $value; + + if (@map == 1) { + + # Single character maps are converted to deltas, as + # this file is stored, for backwards compatibility, + # not using them. + $value = $map[0] - $code_point; + + # If this is a multi-char range, process the rest of + # it by doing a 'redo' after this line is done. Fix + # up the line to contain the rest of the range for + # that redo. + if ($hex_upper ne "" && hex $hex_upper != $code_point) { + $line = sprintf("%04X\t%s\t%s", + $code_point + 1, + $hex_upper, + $type_and_map); + $redo = 1; + + # Pretend that this is a single element range. + $hex_upper = $hex_lower; + } + } + else { + $value = join " ", @map; + } + } # Insert the hangul range at the appropriate spot. if (! $done_hangul && $code_point > $HANGUL_BEGIN) { @@ -2761,6 +2793,8 @@ RETRY: # And append this to our constructed LIST. $decomps{'LIST'} .= "$hex_lower\t$hex_upper\t$value\n"; + + redo if $redo; } $swash = \%decomps; } @@ -2906,7 +2940,7 @@ RETRY: push @invmap, $map; } else { - my @map = map { hex } split " ", $map; + my @map = split " ", $map; if (@map == 1) { push @invmap, $map[0]; } diff --git a/lib/Unicode/UCD.t b/lib/Unicode/UCD.t index dd23b48aec..45573de07c 100644 --- a/lib/Unicode/UCD.t +++ b/lib/Unicode/UCD.t @@ -1274,7 +1274,7 @@ foreach my $prop (keys %props) { } } elsif ($format =~ /^ d /x) { - if ($missing ne "<code point>") { + if ($missing ne "0") { fail("prop_invmap('$mod_prop')"); diag("The missings should be '<code point>'; got '$missing'"); next PROPERTY; @@ -1381,9 +1381,28 @@ foreach my $prop (keys %props) { # Get rid of any trailing space and comments in the file. $official =~ s/\s*(#.*)?$//mg; - # Decomposition.pl also has the <compatible> types in it, which - # should be removed. - $official =~ s/<.*?> //mg if $format eq 'd'; + if ($format eq 'd') { + my @official = split /\n/, $official; + $official = ""; + foreach my $line (@official) { + my ($start, $end, $value) + = $line =~ / ^ (.+?) \t (.*?) \t (.+?) + \s* ( \# .* )? $ /x; + # Decomposition.pl also has the <compatible> types in it, + # which should be removed. + $value =~ s/<.*?> //; + $official .= "$start\t\t$value\n"; + + # If this is a multi-char range, we turn it into as many + # single character ranges as necessary. This makes things + # easier below. + if ($end ne "") { + for my $i (hex($start) + 1 .. hex $end) { + $official .= sprintf "%04X\t\t%s\n", $i, $value; + } + } + } + } } chomp $official; @@ -1416,8 +1435,7 @@ foreach my $prop (keys %props) { my @list; for (split "\n", $official) { my ($start, $end, $value) = / ^ (.+?) \t (.*?) \t (.+?) - \s* ( \# .* )? - $ /x; + \s* ( \# .* )? $ /x; $end = $start if $end eq ""; push @list, [ hex $start, hex $end, $value ]; } @@ -1434,22 +1452,41 @@ foreach my $prop (keys %props) { # Find the spot in the @list of simple mappings that this # special applies to; uses a linear search. while ($i < @list -1 ) { - last if $cp <= $list[$i][0]; + last if $cp <= $list[$i][1]; $i++; } - #note $i-1 . ": " . join " => ", @{$list[$i-1]}; - #note $i-0 . ": " . join " => ", @{$list[$i-0]}; - #note $i+1 . ": " . join " => ", @{$list[$i+1]}; + # Here $i is such that it points to the first range which ends + # at or above cp, and hence is the only range that could + # possibly contain it. + + # If not in this range, no range contains it: nothing to + # remove. + next if $cp < $list[$i][0]; - # Then, remove any existing entry for this code point. - next if $cp != $list[$i][0]; - splice @list, $i, 1; + # Otherwise, remove the existing entry. If it is the first + # element of the range... + if ($cp == $list[$i][0]) { + + # ... and there are other elements in the range, just shorten + # the range to exclude this code point. + if ($list[$i][1] > $list[$i][0]) { + $list[$i][0]++; + } - #note __LINE__ . ": $cp"; - #note $i-1 . ": " . join " => ", @{$list[$i-1]}; - #note $i-0 . ": " . join " => ", @{$list[$i-0]}; - #note $i+1 . ": " . join " => ", @{$list[$i+1]}; + # ... but if it is the only element in the range, remove + # it entirely. + else { + splice @list, $i, 1; + } + } + else { # Is somewhere in the middle of the range + # Split the range into two, excluding this one in the + # middle + splice @list, $i, 1, + [ $list[$i][0], $cp - 1, $list[$i][2] ], + [ $cp + 1, $list[$i][1], $list[$i][2] ]; + } } # Here, have gone through all the specials, modifying @list as @@ -1500,7 +1537,7 @@ foreach my $prop (keys %props) { # The extra -1 is because the final element has been tested above to # be for anything above Unicode. The file doesn't go that high. - for my $i (0 .. @$invlist_ref - 1 - 1) { + for (my $i = 0; $i < @$invlist_ref - 1; $i++) { # If the map element is a reference, have to stringify it (but # don't do so if the format doesn't allow references, so that an @@ -1582,6 +1619,32 @@ foreach my $prop (keys %props) { next PROPERTY; } } + elsif ($format eq 'd') { + + # The numerics in the map are stored as deltas. The defaults + # are 0, and don't appear in $official, and are excluded + # later, but the elements must be converted back to their real + # code point values before comparing with $official, as that + # file, for backwards compatibility, is not stored as deltas + if ($invmap_ref->[$i] =~ / ^ -? \d+ $ /x + && $invmap_ref->[$i] != 0) + { + my $delta = $invmap_ref->[$i]; + $invmap_ref->[$i] += $invlist_ref->[$i]; + + # If there are other elements with this same delta, they + # must individually be re-mapped. Do this by splicing in + # a new element into the list and the map containing the + # remainder of the range. Next time through we will look + # at that (possibly splicing again until the whole range + # is processed). + if ($invlist_ref->[$i+1] > $invlist_ref->[$i] + 1) { + splice @$invlist_ref, $i+1, 0, + $invlist_ref->[$i] + 1; + splice @$invmap_ref, $i+1, 0, $delta; + } + } + } elsif ($format eq 'cle' && $invmap_ref->[$i] eq "") { # cle properties have maps to the empty string that also |