diff options
author | Karl Williamson <public@khwilliamson.com> | 2012-01-30 18:17:11 -0700 |
---|---|---|
committer | Karl Williamson <public@khwilliamson.com> | 2012-02-04 16:29:32 -0700 |
commit | 34132297113975a3522f23d745e0ccf336803994 (patch) | |
tree | 46168c9561d1fe5fc4ebe13d5077e514a14dea98 | |
parent | bea2c146352c47a938243d84d1a4fa99f7a328bb (diff) | |
download | perl-34132297113975a3522f23d745e0ccf336803994.tar.gz |
Unicode::UCD::prop_invmap(): Make the NFKCCF property return deltas
The file for this property is stored in the old-style format for
backward compatibility with any applications that might be reading it
directly. But the values should be returned through the Unicode::UCD
API as deltas for consistency with other, similar properties.
-rw-r--r-- | lib/Unicode/UCD.pm | 46 | ||||
-rw-r--r-- | lib/Unicode/UCD.t | 27 |
2 files changed, 48 insertions, 25 deletions
diff --git a/lib/Unicode/UCD.pm b/lib/Unicode/UCD.pm
index de62e5035a..3473ecbe1a 100644
--- a/lib/Unicode/UCD.pm
+++ b/lib/Unicode/UCD.pm
@@ -2398,20 +2398,18 @@ that are lists, and the addition is extra work.
 
 =item B<C<cle>>
 
-is like C<cl> except that, for the time being, as an interim measure, the map
-returned for simple scalars is the correct value and the code point should NOT
-be added to it. Also, some of the map array elements have the forms given by C<cl>, and
+means that some of the map array elements have the forms given by C<cl>, and
 the rest are the empty string. The property C<NFKC_Casefold> has this form.
 An example slice is:
 
  @$ranges_ref  @$maps_ref         Note
     ...
-   0x00AA     0x0061              FEMININE ORDINAL INDICATOR => 'a'
-   0x00AB     <code point>
+   0x00AA     -73                 FEMININE ORDINAL INDICATOR => 'a'
+   0x00AB     0
    0x00AD                         SOFT HYPHEN => ""
-   0x00AE     <code point>
+   0x00AE     0
    0x00AF     [ 0x0020, 0x0304 ]  MACRON => SPACE . COMBINING MACRON
-   0x00B0     <code point>
+   0x00B0     0
    ...
 
 =item B<C<n>>
@@ -2576,8 +2574,9 @@ RETRY:
     # new-style, and this routine is supposed to return old-style block names.
     # The Name table is valid, but we need to execute the special code below
     # to add in the algorithmic-defined name entries.
+    # And NFKCCF needs conversion, so handle that here too.
     if (ref $swash eq ""
-        || $swash->{'TYPE'} =~ / ^ To (?: Blk | Na) $ /x)
+        || $swash->{'TYPE'} =~ / ^ To (?: Blk | Na | NFKCCF ) $ /x)
     {
 
         # Get the short name of the input property, in standard form
@@ -2798,6 +2797,35 @@ RETRY:
                 }
                 $swash = \%decomps;
             }
+            elsif ($second_try eq 'nfkccf') {
+
+                # This property is stored in the old format for backwards
+                # compatibility for any applications that read its file directly.
+                # So here we convert it to delta format for compatibility with the
+                # other properties similar to it.
+                my %nfkccf;
+
+                # Create a new LIST with deltas instead of code points.
+                my $list = "";
+                foreach my $range (split "\n", $swash->{'LIST'}) {
+                    my ($hex_begin, $hex_end, $map) = split "\t", $range;
+                    my $begin = hex $hex_begin;
+                    my $end = (defined $hex_end && $hex_end ne "")
+                              ? hex $hex_end
+                              : $begin;
+                    my $decimal_map = hex $map;
+                    foreach my $code_point ($begin .. $end) {
+                        $list .= sprintf("%04X\t\t%d\n", $code_point, $decimal_map - $code_point);
+                    }
+                }
+
+                $nfkccf{'LIST'} = $list;
+                $nfkccf{'TYPE'} = "ToNFKCCF";
+                $nfkccf{'SPECIALS'} = $swash->{'SPECIALS'};
+                $swash = \%nfkccf;
+                $utf8::SwashInfo{'ToNFKCCF'}{'missing'} = 0;
+                $utf8::SwashInfo{'ToNFKCCF'}{'format'} = 'i';
+            }
             else { # Don't know this property. Fail.
                 return;
             }
@@ -2809,7 +2837,7 @@ RETRY:
     }
 
     # Here, have a valid swash return. Examine it.
-    my $returned_prop = $swash->{TYPE};
+    my $returned_prop = $swash->{'TYPE'};
 
     # All properties but binary ones should have 'missing' and 'format'
     # entries
diff --git a/lib/Unicode/UCD.t b/lib/Unicode/UCD.t
index 45573de07c..530c548694 100644
--- a/lib/Unicode/UCD.t
+++ b/lib/Unicode/UCD.t
@@ -1259,13 +1259,6 @@ foreach my $prop (keys %props) {
             next PROPERTY;
         }
     }
-    elsif ($name eq 'nfkccf') { # This one has an atypical $missing
-        if ($missing ne "<code point>") {
-            fail("prop_invmap('$mod_prop')");
-            diag("The missings should be \"\"; got '$missing'");
-            next PROPERTY;
-        }
-    }
     elsif ($format =~ /^ c /x) {
         if ($missing ne "0") {
             fail("prop_invmap('$mod_prop')");
@@ -1619,13 +1612,15 @@ foreach my $prop (keys %props) {
                     next PROPERTY;
                 }
             }
-            elsif ($format eq 'd') {
-
-                # The numerics in the map are stored as deltas. The defaults
-                # are 0, and don't appear in $official, and are excluded
-                # later, but the elements must be converted back to their real
-                # code point values before comparing with $official, as that
-                # file, for backwards compatibility, is not stored as deltas
+            elsif ($format eq 'd' || $format eq 'cle') {
+
+                # The numerics in the returned map are stored as deltas. The
+                # defaults are 0, and don't appear in $official, and are
+                # excluded later, but the elements must be converted back to
+                # their real code point values before comparing with
+                # $official, as these files, for backwards compatibility, are
+                # not stored as deltas. (There currently is only one cle
+                # property, nfkccf. If that changed this would also have to.)
                 if ($invmap_ref->[$i] =~ / ^ -? \d+ $ /x
                     && $invmap_ref->[$i] != 0)
                 {
@@ -1644,8 +1639,7 @@ foreach my $prop (keys %props) {
                         splice @$invmap_ref, $i+1, 0, $delta;
                     }
                 }
-            }
-            elsif ($format eq 'cle' && $invmap_ref->[$i] eq "") {
+                if ($format eq 'cle' && $invmap_ref->[$i] eq "") {
 
                 # cle properties have maps to the empty string that also
                 # should be in the specials hash, with the key the packed code
@@ -1673,6 +1667,7 @@ foreach my $prop (keys %props) {
                     next PROPERTY;
                 }
                 next;
+                }
             }
             elsif ($is_binary) { # These binary files don't have an explicit Y
                 $invmap_ref->[$i] =~ s/Y//;