diff options
author | Karl Williamson <public@khwilliamson.com> | 2011-11-06 14:31:26 -0700 |
---|---|---|
committer | Karl Williamson <public@khwilliamson.com> | 2011-11-08 08:09:36 -0700 |
commit | 62b3b855a6b9268ee171e2c384362d719ea21537 (patch) | |
tree | fd7b7068c8888430d4813e67cdd86c4483079eaf /lib/Unicode | |
parent | a33a1c99e77db13418959b16b072f82ae531372b (diff) | |
download | perl-62b3b855a6b9268ee171e2c384362d719ea21537.tar.gz |
Unicode::UCD: Add prop_invmap()
Diffstat (limited to 'lib/Unicode')
-rw-r--r-- | lib/Unicode/UCD.pm | 976 | ||||
-rw-r--r-- | lib/Unicode/UCD.t | 827 |
2 files changed, 1800 insertions, 3 deletions
diff --git a/lib/Unicode/UCD.pm b/lib/Unicode/UCD.pm index ef46c29e21..09ea439919 100644 --- a/lib/Unicode/UCD.pm +++ b/lib/Unicode/UCD.pm @@ -26,6 +26,7 @@ our @EXPORT_OK = qw(charinfo prop_aliases prop_value_aliases prop_invlist + prop_invmap MAX_CP ); @@ -75,6 +76,10 @@ Unicode::UCD - Unicode character database use Unicode::UCD 'prop_invlist'; my @puncts = prop_invlist("gc=punctuation"); + use Unicode::UCD 'prop_invmap'; + my ($list_ref, $map_ref, $format, $missing) + = prop_invmap("General Category"); + use Unicode::UCD 'compexcl'; my $compexcl = compexcl($codepoint); @@ -657,6 +662,9 @@ as the keys, and the code point ranges (see L</charblock()>) as the values. The names are in the old-style (see L</Old-style versus new-style block names>). +L<prop_invmap("block")|/prop_invmap()> can be used to get this same data in a +different type of data structure. + See also L</Blocks versus Scripts>. =cut @@ -676,6 +684,9 @@ charscripts() returns a reference to a hash with the known script names as the keys, and the code point ranges (see L</charscript()>) as the values. +L<prop_invmap("script")|/prop_invmap()> can be used to get this same data in a +different type of data structure. + See also L</Blocks versus Scripts>. =cut @@ -2033,6 +2044,971 @@ sub prop_invlist ($) { return @invlist; } +sub _search_invlist { + # Find the range in the inversion list which contains a code point; that + # is, find i such that l[i] <= code_point < l[i+1] + + # If this is ever made public, could use to speed up .t specials. Would + # need to use code point argument, as in other functions in this pm + + my $list_ref = shift; + my $code_point = shift; + # Verify non-neg numeric XXX + + my $max_element = @$list_ref - 1; + return if ! $max_element < 0; # Undef if list is empty. + + # Short cut something at the far-end of the table. This also allows us to + # refer to element [$i+1] without fear of being out-of-bounds in the loop + # below. + return $max_element if $code_point >= $list_ref->[$max_element]; + + use integer; # want integer division + + my $i = $max_element / 2; + + my $lower = 0; + my $upper = $max_element; + while (1) { + + if ($code_point >= $list_ref->[$i]) { + + # Here we have met the lower constraint. We can quit if we + # also meet the upper one. + last if $code_point < $list_ref->[$i+1]; + + $lower = $i; # Still too low. + + } + else { + + # Here, $code_point < $list_ref[$i], so look lower down. + $upper = $i; + } + + # Split search domain in half to try again. + my $temp = ($upper + $lower) / 2; + + # No point in continuing unless $i changes for next time + # in the loop. + return $i if $temp == $i; + $i = $temp; + } # End of while loop + + # Here we have found the offset + return $i; +} + +=pod + +=head2 B<prop_invmap()> + + use Unicode::UCD 'prop_invmap'; + my ($list_ref, $map_ref, $format, $missing) + = prop_invmap("General Category"); + +C<prop_invmap> is used to get the complete mapping definition for a property, +in the form of an inversion map. An inversion map consists of two parallel +arrays. One is an ordered list of code points that mark range beginnings, and +the other gives the value (or mapping) that all code points in the +corresponding range have. + +C<prop_invmap> is called with the name of the desired property. The name is +loosely matched, meaning that differences in case, white-space, hyphens, and +underscores are not meaningful (except for the trailing underscore in the +old-form grandfathered-in property C<"L_">, which is better written as C<"LC">, +or even better, C<"Gc=LC">). + +Many Unicode properties have more than one name (or alias). C<prop_invmap> +understands all of these, including Perl extensions to them. Ambiguities are +resolved as described above for L</prop_aliases()>. The Perl internal +property "Perl_Decimal_Digit, described below, is also accepted. C<undef> is +returned if the property name is unknown. + +It is a fatal error to call this function except in list context. + +In addition to the the two arrays that form the inversion map, C<prop_invmap> +returns two other values; one is a scalar that gives some details as to the +format of the entries of the map array; the other is used for specialized +purposes, described at the end of this section. + +This means that C<prop_invmap> returns a 4 element list. For example, + + my ($blocks_ranges_ref, $blocks_maps_ref, $format, $default) + = prop_invmap("Block"); + +In this call, the two arrays will be populated as shown below (for Unicode +6.0): + + Index @blocks_ranges @blocks_maps + 0 0x0000 Basic Latin + 1 0x0080 Latin-1 Supplement + 2 0x0100 Latin Extended-A + 3 0x0180 Latin Extended-B + 4 0x0250 IPA Extensions + 5 0x02B0 Spacing Modifier Letters + 6 0x0300 Combining Diacritical Marks + 7 0x0370 Greek and Coptic + 8 0x0400 Cyrillic + ... + 233 0x2B820 No_Block + 234 0x2F800 CJK Compatibility Ideographs Supplement + 235 0x2FA20 No_Block + 236 0xE0000 Tags + 237 0xE0080 No_Block + 238 0xE0100 Variation Selectors Supplement + 239 0xE01F0 No_Block + 240 0xF0000 Supplementary Private Use Area-A + 241 0x100000 Supplementary Private Use Area-B + 242 0x110000 No_Block + +The first line (with Index [0]) means that the value for code point 0 is "Basic +Latin". The entry "0x0080" in the @blocks_ranges column in the second line +means that the value from the first line, "Basic Latin", extends to all code +points in the range from 0 up to but not including 0x0080, that is, through +255. In other words, the code points from 0 to 255 are all in the "Basic +Latin" block. Similarly, all code points in the range from 0x0080 up to (but +not including) 0x0100 are in the block named "Latin-1 Supplement", etc. +(Notice that the return is the old-style block names; see L</Old-style versus +new-style block names>). + +The final line (with Index [242]) means that the value for all code points above +the legal Unicode maximum code point have the value "No_Block", which is the +term Unicode uses for a non-existing block. + +The arrays completely specify the mappings for all possible code points. +The final element in an inversion map returned by this function will always be +for the range that consists of all the code points that aren't legal Unicode, +but that are expressible on the platform. (That is, it starts with code point +0x110000, the first code point above the legal Unicode maximum, and extends to +infinity.) The value for that range will be the same that any typical +unassigned code point has for the specified property. (Certain unassigned +code points are not "typical"; for example the non-character code points, or +those in blocks that are to be written right-to-left. The above-Unicode +range's value is not based on these atypical code points.) It could be argued +that, instead of treating these as unassigned Unicode code points, the value +for this range should be C<undef>. If you wish, you can change the returned +arrays accordingly. + +The maps are almost always simple scalars that should be interpreted as-is. +These values are those given in the Unicode-supplied data files, which may be +inconsistent as to capitalization and as to which synonym for a property-value +is given. The results may be normalized by using the L</prop_value_aliases()> +function. + +There are exceptions to the simple scalar maps. Some properties have some +elements in their map list that are themselves lists of scalars; and some +special strings are returned that are not to be interpreted as-is. Element +[2] (placed into C<$format> in the example above) of the returned four element +list tells you if the map has any of these special elements, as follows: + +=over + +=item C<s> + +means all the elements of the map array are simple scalars, with no special +elements. Almost all properties are like this, like the C<block> example +above. + +=item C<sl> + +means that some of the map array elements have the form given by C<s>, and +the rest are lists of scalars. For example, here is a portion of the output +of calling C<prop_invmap>() with the "Script Extensions" property: + + @scripts_ranges @scripts_maps + ... + 0x0953 Deva + 0x0964 [ Beng Deva Guru Orya ] + 0x0966 Deva + 0x0970 Common + +Here, the code points 0x964 and 0x965 are used in the Bengali, +Devanagari, Gurmukhi, and Oriya scripts. + +=item C<r> + +means that all the elements of the map array are either rational numbers or +the string C<"NaN">, meaning "Not a Number". A rational number is either an +integer, or two integers separated by a solidus (C<"/">). The second integer +represents the denominator of the division implied by the solidus, and is +guaranteed not to be 0. If you want to convert them to scalar numbers, you +can use something like this: + + my ($invlist_ref, $invmap_ref, $format) = prop_invmap($property); + if ($format && $format eq "r") { + map { $_ = eval $_ } @$invmap_ref; + } + +Here's some entries from the output of the property "Nv", which has format +C<"r">. + + @numerics_ranges @numerics_maps Note + 0x00 "NaN" + 0x30 0 DIGIT 0 + 0x31 1 + 0x32 2 + ... + 0x37 7 + 0x38 8 + 0x39 9 DIGIT 9 + 0x3A "NaN" + 0xB2 2 SUPERSCRIPT 2 + 0xB3 3 SUPERSCRIPT 2 + 0xB4 "NaN" + 0xB9 1 SUPERSCRIPT 1 + 0xBA "NaN" + 0xBC 1/4 VULGAR FRACTION 1/4 + 0xBD 1/2 VULGAR FRACTION 1/2 + 0xBE 3/4 VULGAR FRACTION 3/4 + 0xBF "NaN" + 0x660 0 ARABIC-INDIC DIGIT ZERO + +=item C<c> + +is like C<s> in that all the map array elements are scalars, but some of them +are the special string S<C<"E<lt>code pointE<gt>">>, meaning that the map of +each code point in the corresponding range in the inversion list is the code +point itself. For example, in: + + my ($uppers_ranges_ref, $uppers_maps_ref, $format) + = prop_invmap("Simple_Uppercase_Mapping"); + +the returned arrays look like this: + + @$uppers_ranges_ref @$uppers_maps_ref Note + 0 "<code point>" + 97 65 'a' maps to 'A' + 98 66 'b' => 'B' + 99 67 'c' => 'C' + ... + 120 88 'x' => 'X' + 121 89 'y' => 'Y' + 122 90 'z' => 'Z' + 123 "<code point>" + 181 924 MICRO SIGN => Greek Cap MU + 182 "<code point>" + ... + +The first line means that the uppercase of code point 0 is 0; +the uppercase of code point 1 is 1; ... of code point 96 is 96. Without the +C<"E<lt>code_pointE<gt>"> notation, every code point would have to have an +entry. This would mean that the arrays would each have more than a million +entries to list just the legal Unicode code points! + +=item C<cl> + +means that some of the map array elements have the form given by C<c>, and +the rest are ordered lists of code points. +For example, in: + + my ($uppers_ranges_ref, $uppers_maps_ref, $format) + = prop_invmap("Uppercase_Mapping"); + +the returned arrays look like this: + + @$uppers_ranges_ref @$uppers_maps_ref + 0 "<code point>" + 97 65 + ... + 122 90 + 123 "<code point>" + 181 924 + 182 "<code point>" + ... + 0x0149 [ 0x02BC 0x004E ] + 0x014A "<code point>" + 0x014B 0x014A + ... + +This is the full Uppercase_Mapping property (as opposed to the +Simple_Uppercase_Mapping given in the example for format C<"c">). The only +difference between the two in the ranges shown is that the code point at +0x0149 (LATIN SMALL LETTER N PRECEDED BY APOSTROPHE) maps to a string of two +characters, 0x02BC (MODIFIER LETTER APOSTROPHE) followed by 0x004E (LATIN +CAPITAL LETTER N). + +=item C<cle> + +means that some of the map array elements have the forms given by C<cl>, and +the rest are the empty string. The property C<NFKC_Casefold> has this form. +An example slice is: + + @$ranges_ref @$maps_ref Note + ... + 0x00AA 0x0061 FEMININE ORDINAL INDICATOR => 'a' + 0x00AB <code point> + 0x00AD SOFT HYPHEN => "" + 0x00AE <code point> + 0x00AF [ 0x0020, 0x0304 ] MACRON => SPACE . COMBINING MACRON + 0x00B0 <code point> + ... + +=item C<n> + +means the Name property. All the elements of the map array are simple +scalars, but some of them contain special strings that require more work to +get the actual name. + +Entries such as: + + CJK UNIFIED IDEOGRAPH-<code point> + +mean that the name for the code point is "CJK UNIFIED IDEOGRAPH-" +with the code point (expressed in hexadecimal) appended to it, like "CJK +UNIFIED IDEOGRAPH-3403" (similarly for C<CJK COMPATIBILITY IDEOGRAPH-E<lt>code +pointE<gt>>). + +Also, entries like + + <hangul syllable> + +means that the name is algorithmically calculated. This is easily done by +the function L<charnames/charnames::viacode(code)>. + +Note that for control characters (C<Gc=cc>), Unicode's data files have the +string "C<E<lt>controlE<gt>>", but the real name of each of these characters is the empty +string. This function returns that real name, the empty string. + +=item C<d> + +means the Decomposition_Mapping property. This property is like C<cl> +properties, except it has an additional entry type: + + <hangul syllable> + +for those code points whose decomposition is algorithmically calculated. (The +C<n> format has this same entry.) These can be generated via the function +L<Unicode::Normalize::NFD()|Unicode::Normalize>. + + +Note that the mapping is the one that is specified in the Unicode data files, +and to get the final decomposition, it may need to be applied recursively. + +=back + +A binary search can be used to quickly find a code point in the inversion +list, and hence its corresponding mapping. + +The final element (index [3], assigned to C<$default> in the "block" example) in +the four element list returned by this function may be useful for applications +that wish to convert the returned inversion map data structure into some +other, such as a hash. It gives the mapping that most code points map to +under the property. If you establish the convention that any code point not +explicitly listed in your data structure maps to this value, you can +potentially make your data structure much smaller. As you construct your data +structure from the one returned by this function, simply ignore those ranges +that map to this value, generally called the "default" value. For example, to +convert to the data structure searchable by L</charinrange()>, you can follow +this recipe: + + my ($list_ref, $map_ref, $format, $missing) = prop_invmap($property); + my @range_list; + for my $i (0 .. @$list_ref - 2) { + next if $map_ref->[$i] eq $missing; + push @range_list, [ $list_ref->[$i], + $list_ref->[$i+1], + $map_ref->[$i] + ]; + } + + print charinrange(\@range_list, $code_point), "\n"; + + +With this, C<charinrange()> will return C<undef> if its input code point maps +to C<$missing>. You can avoid this by omitting the C<next> statement, and adding +a line after the loop to handle the final element of the inversion map. + +One internal Perl property is accessible by this function. +"Perl_Decimal_Digit" returns an inversion map in which all the Unicode decimal +digits map to their numeric values, and everything else to the empty string, +like so: + + @digits @values + 0x0000 "" + 0x0030 0 + 0x0031 1 + 0x0032 2 + 0x0033 3 + 0x0034 4 + 0x0035 5 + 0x0036 6 + 0x0037 7 + 0x0038 8 + 0x0039 9 + 0x003A "" + 0x0660 0 + 0x0661 1 + ... + +Note that the inversion maps returned for the C<Case_Folding> and +C<Simple_Case_Folding> properties do not include the Turkic-locale mappings. +Use L</casefold()> for these. + +The C<Name_Alias> property is potentially undergoing signficant revision by +Unicode at the time of this writing. The format of the values returned for it +may change substantially in future Unicode versions. + +C<prop_invmap> does not know about any user-defined properties, and will +return C<undef> if called with one of those. + +=cut + +# User-defined properties could be handled with some changes to utf8_heavy.pl; +# if done, consideration should be given to the fact that the user subroutine +# could return different results with each call, which could lead to some +# security issues. + +# One could store things in memory so they don't have to be recalculated, but +# it is unlikely this will be called often, and some properties would take up +# significant memory. + +# These are created by mktables for this routine and stored in unicore/UCD.pl +# where their structures are described. +our @algorithmic_named_code_points; +our $HANGUL_BEGIN; +our $HANGUL_COUNT; + +sub prop_invmap ($) { + + croak __PACKAGE__, "::prop_invmap: must be called in list context" unless wantarray; + + my $prop = $_[0]; + return unless defined $prop; + + # Fail internal properties + return if $prop =~ /^_/; + + # The values returned by this function. + my (@invlist, @invmap, $format, $missing); + + # The swash has two components we look at, the base list, and a hash, + # named 'SPECIALS', containing any additional members whose mappings don't + # fit into the the base list scheme of things. These generally 'override' + # any value in the base list for the same code point. + my $overrides; + + require "utf8_heavy.pl"; + require "unicore/UCD.pl"; + +RETRY: + + # Try to get the map swash for the property. They have 'To' prepended to + # the property name, and 32 means we will accept 32 bit return values. + my $swash = utf8::SWASHNEW(__PACKAGE__, "To$prop", undef, 32, 0); + + # If didn't find it, could be because needs a proxy. And if was the + # 'Block' or 'Name' property, use a proxy even if did find it. Finding it + # would be the result of the installation changing mktables to output the + # Block or Name tables. The Block table gives block names in the + # new-style, and this routine is supposed to return old-style block names. + # The Name table is valid, but we need to execute the special code below + # to add in the algorithmic-defined name entries. + if (ref $swash eq "" + || $swash->{'TYPE'} eq 'ToBlk' + || $swash->{'TYPE'} eq 'ToNa') + { + + # Get the short name of the input property, in standard form + my ($second_try) = prop_aliases($prop); + return unless $second_try; + $second_try = utf8::_loose_name(lc $second_try); + + if ($second_try eq "in") { + + # This property is identical to age for inversion map purposes + $prop = "age"; + goto RETRY; + } + elsif ($second_try eq 'scf') { + + # This property uses just the LIST part of cf which includes the + # simple folds that are otherwise overridden by the SPECIALS. So + # all we need do is to not look at the SPECIALS; set $overrides to + # indicate that + $overrides = -1; + $prop = "cf"; + goto RETRY; + } + elsif ($second_try =~ / ^ s[ltu]c $ /x) { + + # Because some applications may be reading the full mapping + # equivalent files directly, they haven't been changed to include + # the simple mappings as well, as was done with the cf file (which + # doesn't have those backward compatibility issues) in 5.14. + # Instead, separate internal-only files were created that + # contain just the simple mappings that get overridden by the + # SPECIALS. Thus, these simple case mappings use the LIST part of + # their full mapping equivalents; plus the ones that are in those + # additional files. These special files are used by other + # functions in this module, so use the same hashes that those + # functions use. + my $file; + if ($second_try eq "suc") { + $file = '_suc.pl'; + $overrides = \%SIMPLE_UPPER; + } + elsif ($second_try eq "slc") { + $file = '_slc.pl'; + $overrides = \%SIMPLE_LOWER; + } + else { + $file = '_stc.pl'; + $overrides = \%SIMPLE_TITLE; + } + + # The files are already handled by the _read_table() function. + # Don't read them in if already done. + %$overrides =_read_table("unicore/To/$file", 'use_hash') + unless %$overrides; + + # Convert to the full mapping name, and go handle that; e.g., + # suc => uc. + $prop = $second_try =~ s/^s//r; + goto RETRY; + } + elsif ($second_try eq "blk") { + + # We use the old block names. Just create a fake swash from its + # data. + _charblocks(); + my %blocks; + $blocks{'LIST'} = ""; + $blocks{'TYPE'} = "ToBlk"; + $utf8::SwashInfo{ToBlk}{'missing'} = "No_Block"; + $utf8::SwashInfo{ToBlk}{'format'} = "s"; + + foreach my $block (@BLOCKS) { + $blocks{'LIST'} .= sprintf "%x\t%x\t%s\n", + $block->[0], + $block->[1], + $block->[2]; + } + $swash = \%blocks; + } + elsif ($second_try eq "na") { + + # Use the combo file that has all the Name-type properties in it, + # extracting just the ones that are for the actual 'Name' + # property. And create a fake swash from it. + my %names; + $names{'LIST'} = ""; + my $original = do "unicore/Name.pl"; + my $previous_hex_code_point = ""; + my $algorithm_names = \@algorithmic_named_code_points; + + # We hold off on adding the next entry to the list until we know, + # that the next line isn't for the same code point. We only + # output the final line. That one is the original Name property + # value. The others are the Name_Alias corrections, which are + # listed first in the file. + my $staging = ""; + + my $i = 0; + foreach my $line (split "\n", $original) { + my ($hex_code_point, $name) = split "\t", $line; + + # Weeds out all comments, blank lines, and named sequences + next if $hex_code_point =~ /\P{ASCII_HEX_DIGIT}/; + + my $code_point = hex $hex_code_point; + + # The name of all controls is the default: the empty string. + # The set of controls is immutable, so these hard-coded + # constants work. + next if $code_point <= 0x9F + && ($code_point <= 0x1F || $code_point >= 0x7F); + + # Output the last iteration's result, but only output the + # final name if a code point has more than one. + $names{'LIST'} .= $staging + if $hex_code_point ne $previous_hex_code_point; + + # If we are beyond where one of the special lines needs to + # be inserted ... + if ($i < @$algorithm_names + && $code_point > $algorithm_names->[$i]->{'low'}) + { + + # ... then insert it, ahead of what we were about to + # output + $staging = sprintf "%x\t%x\t%s\n", + $algorithm_names->[$i]->{'low'}, + $algorithm_names->[$i]->{'high'}, + $algorithm_names->[$i]->{'name'}; + + # And pretend that what we last saw was the final code + # point of the inserted range. + $previous_hex_code_point = sprintf "%04X", + $algorithm_names->[$i]->{'high'}; + + # Done with this range. + $i++; + + # Except we actually need to output the inserted line. + redo; + } + + # Normal name. + $staging = sprintf "%x\t\t%s\n", $code_point, $name; + $previous_hex_code_point = $hex_code_point; + } + + # Add the name from the final iteration + $names{'LIST'} .= $staging; + + $names{'TYPE'} = "ToNa"; + $utf8::SwashInfo{ToNa}{'missing'} = ""; + $utf8::SwashInfo{ToNa}{'format'} = "n"; + $swash = \%names; + } + elsif ($second_try =~ / ^ ( d [mt] ) $ /x) { + + # The file is a combination of dt and dm properties. Create a + # fake swash from the portion that we want. + my $original = do "unicore/Decomposition.pl"; + my %decomps; + + if ($second_try eq 'dt') { + $decomps{'TYPE'} = "ToDt"; + $utf8::SwashInfo{'ToDt'}{'missing'} = "None"; + $utf8::SwashInfo{'ToDt'}{'format'} = "s"; + } + else { + $decomps{'TYPE'} = "ToDm"; + $utf8::SwashInfo{'ToDm'}{'missing'} = "<code point>"; + + # Use a special internal-to-this_routine format, 'dm', to + # distinguish from 'd', meaning decimal. + $utf8::SwashInfo{'ToDm'}{'format'} = "dm"; + } + + $decomps{'LIST'} = ""; + + # This property has one special range not in the file: for the + # hangul syllables + my $done_hangul = 0; # Have we done the hangul range. + foreach my $line (split "\n", $original) { + my ($hex_lower, $hex_upper, $type_and_map) = split "\t", $line; + my $code_point = hex $hex_lower; + my $value; + + # The type, enclosed in <...>, precedes the mapping separated + # by blanks + if ($type_and_map =~ / ^ < ( .* ) > \s+ (.*) $ /x) { + $value = ($second_try eq 'dt') ? $1 : $2 + } + else { # If there is no type specified, it's canonical + $value = ($second_try eq 'dt') + ? "Canonical" : + $type_and_map; + } + + # Insert the hangul range at the appropriate spot. + if (! $done_hangul && $code_point > $HANGUL_BEGIN) { + $done_hangul = 1; + $decomps{'LIST'} .= + sprintf "%x\t%x\t%s\n", + $HANGUL_BEGIN, + $HANGUL_BEGIN + $HANGUL_COUNT - 1, + ($second_try eq 'dt') + ? "Canonical" + : "<hangul syllable>"; + } + + # And append this to our constructed LIST. + $decomps{'LIST'} .= "$hex_lower\t$hex_upper\t$value\n"; + } + $swash = \%decomps; + } + else { # Don't know this property. Fail. + return; + } + } + + if ($swash->{'EXTRAS'}) { + carp __PACKAGE__, "::prop_invmap: swash returned for $prop unexpectedly has EXTRAS magic"; + return; + } + + # Here, have a valid swash return. Examine it. + my $returned_prop = $swash->{TYPE}; + + # All properties but binary ones should have 'missing' and 'format' + # entries + $missing = $utf8::SwashInfo{$returned_prop}{'missing'}; + $missing = 'N' unless defined $missing; + + $format = $utf8::SwashInfo{$returned_prop}{'format'}; + $format = 'b' unless defined $format; + + # The LIST input lines look like: + # ... + # 0374\t\tCommon + # 0375\t0377\tGreek # [3] + # 037A\t037D\tGreek # [4] + # 037E\t\tCommon + # 0384\t\tGreek + # ... + # + # Convert them to like + # 0374 => Common + # 0375 => Greek + # 0378 => $missing + # 037A => Greek + # 037E => Common + # 037F => $missing + # 0384 => Greek + # + # For binary properties, the final non-comment column is absent, and + # assumed to be 'Y'. + + foreach my $range (split "\n", $swash->{'LIST'}) { + $range =~ s/ \s* (?: \# .* )? $ //xg; # rmv trailing space, comments + + # Find the beginning and end of the range on the line + my ($hex_begin, $hex_end, $map) = split "\t", $range; + my $begin = hex $hex_begin; + my $end = (defined $hex_end && $hex_end ne "") + ? hex $hex_end + : $begin; + + # If the property doesn't have a range that begins at 0, add one that + # maps to the default value (for missing ranges). + if (! @invlist) { + if ($begin != 0) { + push @invlist, 0; + push @invmap, $missing; + } + } + elsif ($invlist[-1] == $begin) { + + # If the input isn't in the most compact form, so that there are + # two adjacent ranges that map to the same thing, they should be + # combined. This happens in our constructed dt mapping, as + # Element [-2] is the map for the latest range so far processed. + # Just set the beginning point of the map to $missing (in + # invlist[-1]) to 1 beyond where this range ends. For example, in + # 12\t13\tXYZ + # 14\t17\tXYZ + # we have set it up so that it looks like + # 12 => XYZ + # 14 => $missing + # + # We now see that it should be + # 12 => XYZ + # 18 => $missing + if (@invlist > 1 && $invmap[-2] eq $map) { + $invlist[-1] = $end + 1; + next; + } + + # Here, the range started in the previous iteration that maps to + # $missing starts at the same code point as this range. That + # means there is no gap to fill that that range was intended for, + # so we just pop it off the parallel arrays. + pop @invlist; + pop @invmap; + } + + # Add the range beginning, and the range's map. + push @invlist, $begin; + if ($format eq 'dm') { + + # The decomposition maps are either a line like <hangul syllable> + # which are to be taken as is; or a sequence of code points in hex + # and separated by blanks. Convert them to decimal, and if there + # is more than one, use an anonymous array as the map. + if ($map =~ /^ < /x) { + push @invmap, $map; + } + else { + my @map = map { hex } split " ", $map; + if (@map == 1) { + push @invmap, $map[0]; + } + else { + push @invmap, \@map; + } + } + } + else { + + # Otherwise, convert hex formatted list entries to decimal; add a + # 'Y' map for the missing value in binary properties, or + # otherwise, use the input map unchanged. + $map = ($format eq 'x') + ? hex $map + : $format eq 'b' + ? 'Y' + : $map; + push @invmap, $map; + } + + # We just started a range. It ends with $end. The gap between it and + # the next element in the list must be filled with a range that maps + # to the default value. If there is no gap, the next iteration will + # pop this, unless there is no next iteration, and we have filled all + # of the Unicode code space, so check for that and skip. + if ($end < $MAX_UNICODE_CODEPOINT) { + push @invlist, $end + 1; + push @invmap, $missing; + } + } + + # If the property is empty, make all code points use the value for missing + # ones. + if (! @invlist) { + push @invlist, 0; + push @invmap, $missing; + } + + # And add in standard element that all non-Unicode code points map to + # $missing + push @invlist, $MAX_UNICODE_CODEPOINT + 1; + push @invmap, $missing; + + # The second component of the map are those values that require + # non-standard specification, stored in SPECIALS. These override any + # duplicate code points in LIST. If we are using a proxy, we may have + # already set $overrides based on the proxy. + $overrides = $swash->{'SPECIALS'} unless defined $overrides; + if ($overrides) { + + # A negative $overrides implies that the SPECIALS should be ignored, + # and a simple 'c' list is the value. + if ($overrides < 0) { + $format = 'c'; + } + else { + + # Currently, all overrides are for properties that normally map to + # single code points, but now some will map to lists of code + # points (but there is an exception case handled below). + $format = 'cl'; + + # Look through the overrides. + foreach my $cp_maybe_utf8 (keys %$overrides) { + my $cp; + my @map; + + # If the overrides came from SPECIALS, the code point keys are + # packed UTF-8. + if ($overrides == $swash->{'SPECIALS'}) { + $cp = unpack("C0U", $cp_maybe_utf8); + @map = unpack "U0U*", $swash->{'SPECIALS'}{$cp_maybe_utf8}; + + # The empty string will show up unpacked as an empty + # array. + $format = 'cle' if @map == 0; + } + else { + + # But if we generated the overrides, we didn't bother to + # pack them, and we, so far, do this only for properties + # that are 'c' ones. + $cp = $cp_maybe_utf8; + @map = hex $overrides->{$cp}; + $format = 'c'; + } + + # Find the range that the override applies to. + my $i = _search_invlist(\@invlist, $cp); + if ($cp < $invlist[$i] || $cp >= $invlist[$i + 1]) { + croak __PACKAGE__, "wrong_range, cp=$cp; i=$i, current=$invlist[$i]; next=$invlist[$i + 1]" + } + + # And what that range currently maps to + my $cur_map = $invmap[$i]; + + # If there is a gap between the next range and the code point + # we are overriding, we have to add elements to both arrays to + # fill that gap, using the map that applies to it, which is + # $cur_map, since it is part of the current range. + if ($invlist[$i + 1] > $cp + 1) { + #use feature 'say'; + #say "Before splice:"; + #say 'i-2=[', $i-2, ']', sprintf("%04X maps to %s", $invlist[$i-2], $invmap[$i-2]) if $i >= 2; + #say 'i-1=[', $i-1, ']', sprintf("%04X maps to %s", $invlist[$i-1], $invmap[$i-1]) if $i >= 1; + #say 'i =[', $i, ']', sprintf("%04X maps to %s", $invlist[$i], $invmap[$i]); + #say 'i+1=[', $i+1, ']', sprintf("%04X maps to %s", $invlist[$i+1], $invmap[$i+1]) if $i < @invlist + 1; + #say 'i+2=[', $i+2, ']', sprintf("%04X maps to %s", $invlist[$i+2], $invmap[$i+2]) if $i < @invlist + 2; + + splice @invlist, $i + 1, 0, $cp + 1; + splice @invmap, $i + 1, 0, $cur_map; + + #say "After splice:"; + #say 'i-2=[', $i-2, ']', sprintf("%04X maps to %s", $invlist[$i-2], $invmap[$i-2]) if $i >= 2; + #say 'i-1=[', $i-1, ']', sprintf("%04X maps to %s", $invlist[$i-1], $invmap[$i-1]) if $i >= 1; + #say 'i =[', $i, ']', sprintf("%04X maps to %s", $invlist[$i], $invmap[$i]); + #say 'i+1=[', $i+1, ']', sprintf("%04X maps to %s", $invlist[$i+1], $invmap[$i+1]) if $i < @invlist + 1; + #say 'i+2=[', $i+2, ']', sprintf("%04X maps to %s", $invlist[$i+2], $invmap[$i+2]) if $i < @invlist + 2; + } + + # If the remaining portion of the range is multiple code + # points (ending with the one we are replacing, guaranteed by + # the earlier splice). We must split it into two + if ($invlist[$i] < $cp) { + $i++; # Compensate for the new element + + #use feature 'say'; + #say "Before splice:"; + #say 'i-2=[', $i-2, ']', sprintf("%04X maps to %s", $invlist[$i-2], $invmap[$i-2]) if $i >= 2; + #say 'i-1=[', $i-1, ']', sprintf("%04X maps to %s", $invlist[$i-1], $invmap[$i-1]) if $i >= 1; + #say 'i =[', $i, ']', sprintf("%04X maps to %s", $invlist[$i], $invmap[$i]); + #say 'i+1=[', $i+1, ']', sprintf("%04X maps to %s", $invlist[$i+1], $invmap[$i+1]) if $i < @invlist + 1; + #say 'i+2=[', $i+2, ']', sprintf("%04X maps to %s", $invlist[$i+2], $invmap[$i+2]) if $i < @invlist + 2; + + splice @invlist, $i, 0, $cp; + splice @invmap, $i, 0, 'dummy'; + + #say "After splice:"; + #say 'i-2=[', $i-2, ']', sprintf("%04X maps to %s", $invlist[$i-2], $invmap[$i-2]) if $i >= 2; + #say 'i-1=[', $i-1, ']', sprintf("%04X maps to %s", $invlist[$i-1], $invmap[$i-1]) if $i >= 1; + #say 'i =[', $i, ']', sprintf("%04X maps to %s", $invlist[$i], $invmap[$i]); + #say 'i+1=[', $i+1, ']', sprintf("%04X maps to %s", $invlist[$i+1], $invmap[$i+1]) if $i < @invlist + 1; + #say 'i+2=[', $i+2, ']', sprintf("%04X maps to %s", $invlist[$i+2], $invmap[$i+2]) if $i < @invlist + 2; + } + + # Here, the range we are overriding contains a single code + # point. The result could be the empty string, a single + # value, or a list. If the last case, we use an anonymous + # array. + $invmap[$i] = (scalar @map == 0) + ? "" + : (scalar @map > 1) + ? \@map + : $map[0]; + } + } + } + elsif ($format eq 'x') { + + # All hex-valued properties are really to code points + $format = 'c'; + } + elsif ($format eq 'dm') { + $format = 'd'; + } + elsif ($format eq 'sw') { # blank-separated elements to form a list. + map { $_ = [ split " ", $_ ] if $_ =~ / / } @invmap; + $format = 'sl'; + } + elsif ($returned_prop eq 'ToNameAlias') { + + # This property currently doesn't have any lists, but theoretically + # could + $format = 'sl'; + } + elsif ($format ne 'n' && $format ne 'r') { + + # All others are simple scalars + $format = 's'; + } + + return (\@invlist, \@invmap, $format, $missing); +} + =head2 Unicode::UCD::UnicodeVersion This returns the version of the Unicode Character Database, in other words, the diff --git a/lib/Unicode/UCD.t b/lib/Unicode/UCD.t index bdb9812396..3903d45e16 100644 --- a/lib/Unicode/UCD.t +++ b/lib/Unicode/UCD.t @@ -851,7 +851,60 @@ undef %pva_tested; no warnings 'once'; # We use some values once from 'required' modules. -use Unicode::UCD qw(prop_invlist MAX_CP); +use Unicode::UCD qw(prop_invlist prop_invmap MAX_CP); + +# There were some problems with caching interfering with prop_invlist() vs +# prop_invmap() on binary properties, and also between the 3 properties where +# Perl used the same 'To' name as another property (see utf8_heavy.pl). +# So, before testing all of prop_invlist(), +# 1) call prop_invmap() to try both orders of these name issues. This uses +# up two of the 3 properties; the third will be left so that invlist() +# on it gets called before invmap() +# 2) call prop_invmap() on a generic binary property, ahead of invlist(). +# This should test that the caching works in both directions. + +# These properties are not stable between Unicode versions, but the first few +# elements are; just look at the first element to see if are getting the +# distinction right. The general inversion map testing below will test the +# whole thing. +my $prop = "uc"; +my ($invlist_ref, $invmap_ref, $format, $missing) = prop_invmap($prop); +is($format, 'cl', "prop_invmap() format of '$prop' is 'cl'"); +is($missing, '<code point>', "prop_invmap() missing of '$prop' is '<code point>'"); +is($invlist_ref->[1], 0x61, "prop_invmap('$prop') list[1] is 0x61"); +is($invmap_ref->[1], 0x41, "prop_invmap('$prop') map[1] is 0x41"); + +$prop = "upper"; +($invlist_ref, $invmap_ref, $format, $missing) = prop_invmap($prop); +is($format, 's', "prop_invmap() format of '$prop' is 'cl'"); +is($missing, 'N', "prop_invmap() missing of '$prop' is '<code point>'"); +is($invlist_ref->[1], 0x41, "prop_invmap('$prop') list[1] is 0x41"); +is($invmap_ref->[1], 'Y', "prop_invmap('$prop') map[1] is 'Y'"); + +$prop = "lower"; +($invlist_ref, $invmap_ref, $format, $missing) = prop_invmap($prop); +is($format, 's', "prop_invmap() format of '$prop' is 'cl'"); +is($missing, 'N', "prop_invmap() missing of '$prop' is '<code point>'"); +is($invlist_ref->[1], 0x61, "prop_invmap('$prop') list[1] is 0x61"); +is($invmap_ref->[1], 'Y', "prop_invmap('$prop') map[1] is 'Y'"); + +$prop = "lc"; +($invlist_ref, $invmap_ref, $format, $missing) = prop_invmap($prop); +is($format, 'cl', "prop_invmap() format of '$prop' is 'cl'"); +is($missing, '<code point>', "prop_invmap() missing of '$prop' is '<code point>'"); +is($invlist_ref->[1], 0x41, "prop_invmap('$prop') list[1] is 0x41"); +is($invmap_ref->[1], 0x61, "prop_invmap('$prop') map[1] is 0x61"); + +# This property is stable and small, so can test all of it +$prop = "ASCII_Hex_Digit"; +($invlist_ref, $invmap_ref, $format, $missing) = prop_invmap($prop); +is($format, 's', "prop_invmap() format of '$prop' is 's'"); +is($missing, 'N', "prop_invmap() missing of '$prop' is 'N'"); +is_deeply($invlist_ref, [ 0x0000, 0x0030, 0x003A, 0x0041, + 0x0047, 0x0061, 0x0067, 0x110000 ], + "prop_invmap('$prop') code point list is correct"); +is_deeply($invmap_ref, [ 'N', 'Y', 'N', 'Y', 'N', 'Y', 'N', 'N' ] , + "prop_invmap('$prop') map list is correct"); is(prop_invlist("Unknown property"), undef, "prop_invlist(<Unknown property>) returns undef"); is(prop_invlist(undef), undef, "prop_invlist(undef) returns undef"); @@ -864,11 +917,11 @@ use Storable qw(dclone); is(prop_invlist("InKana"), undef, "prop_invlist(<user-defined property returns undef>)"); -# The way both the tests for invlist work is that they take the +# The way both the tests for invlist and invmap work is that they take the # lists returned by the functions and construct from them what the original # file should look like, which are then compared with the file. If they are # identical, the test passes. What this tests isn't that the results are -# correct, but that invlist hasn't introduced errors beyond what +# correct, but that invlist and invmap haven't introduced errors beyond what # are there in the files. As a small hedge against that, test some # prop_invlist() tables fully with the known correct result. We choose # ASCII_Hex_Digit again, as it is stable. @@ -1085,4 +1138,772 @@ foreach my $set_of_tables (\%utf8::stricter_to_file_of, \%utf8::loose_to_file_of } } +# Now test prop_invmap(). + +@list = prop_invmap("Unknown property"); +is (@list, 0, "prop_invmap(<Unknown property>) returns an empty list"); +@list = prop_invmap(undef); +is (@list, 0, "prop_invmap(undef) returns an empty list"); +ok (! eval "prop_invmap('gc')" && $@ ne "", + "prop_invmap('gc') dies in scalar context"); +@list = prop_invmap("_X_Begin"); +is (@list, 0, "prop_invmap(<internal property>) returns an empty list"); +@list = prop_invmap("InKana"); +is(@list, 0, "prop_invmap(<user-defined property returns undef>)"); +@list = prop_invmap("Perl_Decomposition_Mapping"), undef, +is(@list, 0, "prop_invmap('Perl_Decomposition_Mapping') returns <undef> since internal-Perl-only"); +@list = prop_invmap("Perl_Charnames"), undef, +is(@list, 0, "prop_invmap('Perl_Charnames') returns <undef> since internal-Perl-only"); +@list = prop_invmap("Is_Is_Any"); +is(@list, 0, "prop_invmap('Is_Is_Any') returns <undef> since two is's"); + +# The set of properties to test on has already been compiled into %props by +# the prop_aliases() tests. + +my %tested_invmaps; + +# Like prop_invlist(), prop_invmap() is tested by comparing the results +# returned by the function with the tables that mktables generates. Some of +# these tables are directly stored as files on disk, in either the unicore or +# unicore/To directories, and most should be listed in the mktables generated +# hash %utf8::loose_property_to_file_of, with a few additional ones that this +# handles specially. For these, the files are read in directly, massaged, and +# compared with what invmap() returns. The SPECIALS hash in some of these +# files overrides values in the main part of the file. +# +# The other properties are tested indirectly by generating all the possible +# inversion lists for the property, and seeing if those match the inversion +# lists returned by prop_invlist(), which has already been tested. + +PROPERTY: +foreach my $prop (keys %props) { + my $loose_prop = utf8::_loose_name(lc $prop); + my $suppressed = grep { $_ eq $loose_prop } + @Unicode::UCD::suppressed_properties; + + # Find the short and full names that this property goes by + my ($name, $full_name) = prop_aliases($prop); + if (! $name) { + if (! $suppressed) { + fail("prop_invmap('$prop')"); + diag("is unknown to prop_aliases(), and we need it in order to test prop_invmap"); + } + next PROPERTY; + } + + # Normalize the short name, as it is stored in the hashes under the + # normalized version. + $name = utf8::_loose_name(lc $name); + + # Add in the characters that are supposed to be ignored to test loose + # matching, which the tested function applies to all properties + my $mod_prop = "$extra_chars$prop"; + + my ($invlist_ref, $invmap_ref, $format, $missing) = prop_invmap($mod_prop); + my $return_ref = [ $invlist_ref, $invmap_ref, $format, $missing ]; + + # If have already tested this property under a different name, merely + # compare the return from now with the saved one from before. + if (exists $tested_invmaps{$name}) { + is_deeply($return_ref, $tested_invmaps{$name}, "prop_invmap('$mod_prop') gave same results as its synonym, '$name'"); + next PROPERTY; + } + $tested_invmaps{$name} = dclone $return_ref; + + # If prop_invmap() returned nothing, is ok iff is a property whose file is + # not generated. + if ($suppressed) { + if (defined $format) { + fail("prop_invmap('$mod_prop')"); + diag("did not return undef for suppressed property $prop"); + } + next PROPERTY; + } + elsif (!defined $format) { + fail("prop_invmap('$mod_prop')"); + diag("'$prop' is unknown to prop_invmap()"); + next PROPERTY; + } + + # The two parallel arrays must have the same number of elements. + if (@$invlist_ref != @$invmap_ref) { + fail("prop_invmap('$mod_prop')"); + diag("invlist has " + . scalar @$invlist_ref + . " while invmap has " + . scalar @$invmap_ref + . " elements"); + next PROPERTY; + } + + # The last element must be for the above-Unicode code points, and must be + # for the default value. + if ($invlist_ref->[-1] != 0x110000) { + fail("prop_invmap('$mod_prop')"); + diag("The last inversion list element is not 0x110000"); + next PROPERTY; + } + if ($invmap_ref->[-1] ne $missing) { + fail("prop_invmap('$mod_prop')"); + diag("The last inversion list element is '$invmap_ref->[-1]', and should be '$missing'"); + next PROPERTY; + } + + if ($name eq 'bmg') { # This one has an atypical $missing + if ($missing ne "") { + fail("prop_invmap('$mod_prop')"); + diag("The missings should be \"\"; got '$missing'"); + next PROPERTY; + } + } + elsif ($format =~ /^ [cd] /x) { + if ($missing ne "<code point>") { + fail("prop_invmap('$mod_prop')"); + diag("The missings should be '<code point>'; got '$missing'"); + next PROPERTY; + } + } + elsif ($missing =~ /[<>]/) { + fail("prop_invmap('$mod_prop')"); + diag("The missings should NOT be something with <...>'"); + next PROPERTY; + + # I don't want to hard code in what all the missings should be, so + # those don't get fully tested. + } + + # Certain properties don't have their own files, but must be constructed + # using proxies. + my $proxy_prop = $name; + if ($full_name eq 'Present_In') { + $proxy_prop = "age"; # The maps for these two props are identical + } + elsif ($full_name eq 'Simple_Case_Folding' + || $full_name =~ /Simple_ (.) .*? case_Mapping /x) + { + if ($full_name eq 'Simple_Case_Folding') { + $proxy_prop = 'cf'; + } + else { + # We captured the U, L, or T, leading to uc, lc, or tc. + $proxy_prop = lc $1 . "c"; + } + if ($format ne "c") { + fail("prop_invmap('$mod_prop')"); + diag("The format should be 'c'; got '$format'"); + next PROPERTY; + } + } + + my $base_file; + my $official; + + # Handle the properties that have full disk files for them (except the + # Name property which is structurally enough different that it is handled + # separately below.) + if ($name ne 'na' + && ($name eq 'blk' + || defined + ($base_file = $utf8::loose_property_to_file_of{$proxy_prop}) + || exists $utf8::loose_to_file_of{$proxy_prop} + || $name eq "dm")) + { + # In the above, blk is done unconditionally, as we need to test that + # the old-style block names are returned, even if mktables has + # generated a file for the new-style; the test for dm comes afterward, + # so that if a file has been generated for it explicitly, we use that + # file (which is valid, unlike blk) instead of the combo + # Decomposition.pl files. + my $file; + my $is_binary = 0; + if ($name eq 'blk') { + + # The blk property is special. The original file with old block + # names is retained, and the default is to not write out a + # new-name file. What we do is get the old names into a data + # structure, and from that create what the new file would look + # like. $base_file is needed to be defined, just to avoid a + # message below. + $base_file = "This is a dummy name"; + my $blocks_ref = charblocks(); + $official = ""; + for my $range (sort { $a->[0][0] <=> $b->[0][0] } + values %$blocks_ref) + { + # Translate the charblocks() data structure to what the file + # would like. + $official .= sprintf"%04X\t%04X\t%s\n", + $range->[0][0], + $range->[0][1], + $range->[0][2]; + } + } + else { + + # Above leaves $base_file undefined only if it came from the hash + # below. This should happen only when it is a binary property + # (and are accessing via a single-form name, like 'In_Latin1'), + # and so it is stored in a different directory than the To ones. + # XXX Currently, the only cases where it is complemented are the + # ones that have no code points. And it works out for these that + # 1) complementing them, and then 2) adding or subtracting the + # initial 0 and final 110000 cancel each other out. But further + # work would be needed in the unlikely event that an inverted + # property comes along without these characteristics + if (!defined $base_file) { + $base_file = $utf8::loose_to_file_of{$proxy_prop}; + $is_binary = ($base_file =~ s/^!//) ? -1 : 1; + $base_file = "lib/$base_file"; + } + + # Read in the file + $base_file = "Decomposition" if $format eq 'd'; + $file = "unicore/$base_file.pl"; + $official = do $file; + + # Get rid of any trailing space and comments in the file. + $official =~ s/\s*(#.*)?$//mg; + + # Decomposition.pl also has the <compatible> types in it, which + # should be removed. + $official =~ s/<.*?> //mg if $format eq 'd'; + } + chomp $official; + + # If there are any special elements, get a reference to them. + my $specials_ref = $utf8::file_to_swash_name{$base_file}; + if ($specials_ref) { + $specials_ref = $utf8::SwashInfo{$specials_ref}{'specials_name'}; + if ($specials_ref) { + + # Convert from the name to the actual reference. + no strict 'refs'; + $specials_ref = \%{$specials_ref}; + } + } + + # Certain of the proxy properties have to be adjusted to match the + # real ones. + if (($proxy_prop ne $name && $full_name =~ 'Mapping') + || $full_name eq 'Case_Folding') + { + + # Here we have either + # 1) Case_Folding; or + # 2) a proxy that is a full mapping, which means that what the + # real property is is the equivalent simple mapping. + # In both cases, the file will have a standard list containing + # simple mappings (to a single code point), and a specials hash + # which contains all the mappings that are to multiple code + # points. First, extract a list containing all the file's simple + # mappings. + my @list; + for (split "\n", $official) { + my ($start, $end, $value) = / ^ (.+?) \t (.*?) \t (.+?) + \s* ( \# .* )? + $ /x; + $end = $start if $end eq ""; + if ($end ne $start) { + fail("prop_invmap('$mod_prop')"); + diag("This test is expecting only single code point ranges in $file.pl"); + next PROPERTY; + } + push @list, [ hex $start, $value ]; + } + + # For Case_Folding, the file contains all the simple mappings, + # including the ones that are overridden by the specials. These + # need to be removed as the list is for just the full ones. For + # the other files, the proxy is missing the simple mappings that + # are overridden by the specials, so we need to add them. + + # For the missing simples, we get the correct values by calling + # charinfo(). Set up which element of the hash returned by + # charinfo to look at + my $charinfo_element; + if ($full_name =~ / ^ Simple_ (Lower | Upper | Title) case_Mapping/x) + { + $charinfo_element = lc $1; # e.g. Upper is referred to by the + # key 'upper' in the charinfo() + # returned hash + } + + # Go through any special mappings one by one. They are packed. + my $i = 0; + foreach my $utf8_cp (sort keys %$specials_ref) { + my $cp = unpack("C0U", $utf8_cp); + + # Get what the simple value for this should be; either nothing + # for Case_Folding, or what charinfo returns for the others. + my $simple = ($full_name eq "Case_Folding") + ? "" + : charinfo($cp)->{$charinfo_element}; + + # And create an entry to add to the list, if appropriate + my $replacement; + $replacement = [ $cp, $simple ] if $simple ne ""; + + # Find the spot in the @list of simple mappings that this + # special applies to; uses a linear search. + while ($i < @list -1 ) { + last if $cp <= $list[$i][0]; + $i++; + } + + #note $i-1 . ": " . join " => ", @{$list[$i-1]}; + #note $i-0 . ": " . join " => ", @{$list[$i-0]}; + #note $i+1 . ": " . join " => ", @{$list[$i+1]}; + + if (! defined $replacement) { + + # Here, are to remove any existing entry for this code + # point. + next if $cp != $list[$i][0]; + splice @list, $i, 1; + } + elsif ($cp == $list[$i][0]) { + + # Here, are to add something, but there is an existing + # entry, so this just replaces it. + $list[$i] = $replacement; + } + else { + + # Here, are to add something, and there isn't an existing + # entry. + splice @list, $i, 0, $replacement; + } + + #note __LINE__ . ": $cp"; + #note $i-1 . ": " . join " => ", @{$list[$i-1]}; + #note $i-0 . ": " . join " => ", @{$list[$i-0]}; + #note $i+1 . ": " . join " => ", @{$list[$i+1]}; + } + + # Here, have gone through all the specials, modifying @list as + # needed. Turn it back into what the file should look like. + $official = join "\n", map { sprintf "%04X\t\t%s", @$_ } @list; + + # And, no longer need the specials for the simple mappings, as are + # all incorporated into $official + undef $specials_ref if $full_name ne 'Case_Folding'; + } + elsif ($full_name eq 'Simple_Case_Folding') { + + # This property has everything in the regular array, and the + # specials are superfluous. + undef $specials_ref if $full_name ne 'Case_Folding'; + } + + # Here, in $official, we have what the file looks like, or should like + # if we've had to fix it up. Now take the invmap() output and reverse + # engineer from that what the file should look like. Each iteration + # appends the next line to the running string. + my $tested_map = ""; + + # Create a copy of the file's specials hash. (It has been undef'd if + # we know it isn't relevant to this property, so if it exists, it's an + # error or is relevant). As we go along, we delete from that copy. + # If a delete fails, or something is left over after we are done, + # it's an error + my %specials = %$specials_ref if $specials_ref; + + # The extra -1 is because the final element has been tested above to + # be for anything above Unicode. The file doesn't go that high. + for my $i (0 .. @$invlist_ref - 1 - 1) { + + # If the map element is a reference, have to stringify it (but + # don't do so if the format doesn't allow references, so that an + # improper format will generate an error. + if (ref $invmap_ref->[$i] + && ($format eq 'd' || $format =~ /^ . l /x)) + { + # The stringification depends on the format. At the time of + # this writing, all 'sl' formats are space separated. + if ($format eq 'sl') { + $invmap_ref->[$i] = join " ", @{$invmap_ref->[$i]}; + } + elsif ($format =~ / ^ cl e? $/x) { + + # For a cl property, the stringified result should be in + # the specials hash. The key is the packed code point, + # and the value is the packed map. + my $value; + if (! defined ($value = delete $specials{pack("C0U", $invlist_ref->[$i]) })) { + fail("prop_invmap('$mod_prop')"); + diag(sprintf "There was no specials element for %04X", $invlist_ref->[$i]); + next PROPERTY; + } + my $packed = pack "U*", @{$invmap_ref->[$i]}; + if ($value ne $packed) { + fail("prop_invmap('$mod_prop')"); + diag(sprintf "For %04X, expected the mapping to be '$packed', but got '$value'"); + next PROPERTY; + } + + # As this doesn't get tested when we later compare with + # the actual file, it could be out of order and we + # wouldn't know it. + if (($i > 0 && $invlist_ref->[$i] <= $invlist_ref->[$i-1]) + || $invlist_ref->[$i] >= $invlist_ref->[$i+1]) + { + fail("prop_invmap('$mod_prop')"); + diag(sprintf "Range beginning at %04X is out-of-order.", $invlist_ref->[$i]); + next PROPERTY; + } + next; + } + elsif ($format eq 'd') { + + # The decomposition mapping file has the code points as + # a string of space-separated hex constants. + $invmap_ref->[$i] = join " ", map { sprintf "%04X", $_ } @{$invmap_ref->[$i]}; + } + else { + fail("prop_invmap('$mod_prop')"); + diag("Can't handle format '$format'"); + next PROPERTY; + } + } + elsif ($format eq 'cle' && $invmap_ref->[$i] eq "") { + + # cle properties have maps to the empty string that also + # should be in the specials hash, with the key the packed code + # point, and the map just empty. + my $value; + if (! defined ($value = delete $specials{pack("C0U", $invlist_ref->[$i]) })) { + fail("prop_invmap('$mod_prop')"); + diag(sprintf "There was no specials element for %04X", $invlist_ref->[$i]); + next PROPERTY; + } + if ($value ne "") { + fail("prop_invmap('$mod_prop')"); + diag(sprintf "For %04X, expected the mapping to be \"\", but got '$value'", $invlist_ref->[$i]); + next PROPERTY; + } + + # As this doesn't get tested when we later compare with + # the actual file, it could be out of order and we + # wouldn't know it. + if (($i > 0 && $invlist_ref->[$i] <= $invlist_ref->[$i-1]) + || $invlist_ref->[$i] >= $invlist_ref->[$i+1]) + { + fail("prop_invmap('$mod_prop')"); + diag(sprintf "Range beginning at %04X is out-of-order.", $invlist_ref->[$i]); + next PROPERTY; + } + next; + } + elsif ($is_binary) { # These binary files don't have an explicit Y + $invmap_ref->[$i] =~ s/Y//; + } + + # The file doesn't include entries that map to $missing, so don't + # include it in the built-up string. But make sure that it is in + # the correct order in the input. + if ($invmap_ref->[$i] eq $missing) { + if (($i > 0 && $invlist_ref->[$i] <= $invlist_ref->[$i-1]) + || $invlist_ref->[$i] >= $invlist_ref->[$i+1]) + { + fail("prop_invmap('$mod_prop')"); + diag(sprintf "Range beginning at %04X is out-of-order.", $invlist_ref->[$i]); + next PROPERTY; + } + next; + } + + # 'c'-type and 'd' properties have the mapping expressed in hex in + # the file + if ($format =~ /^ [cd] /x) { + + # The d property has one entry which isn't in the file. + # Ignore it, but make sure it is in order. + if ($format eq 'd' + && $invmap_ref->[$i] eq '<hangul syllable>' + && $invlist_ref->[$i] == 0xAC00) + { + if (($i > 0 && $invlist_ref->[$i] <= $invlist_ref->[$i-1]) + || $invlist_ref->[$i] >= $invlist_ref->[$i+1]) + { + fail("prop_invmap('$mod_prop')"); + diag(sprintf "Range beginning at %04X is out-of-order.", $invlist_ref->[$i]); + next PROPERTY; + } + next; + } + $invmap_ref->[$i] = sprintf("%04X", $invmap_ref->[$i]) + if $invmap_ref->[$i] =~ / ^ [A-Fa-f0-9]+ $/x; + } + + # Finally have figured out what the map column in the file should + # be. Append the line to the running string. + my $start = $invlist_ref->[$i]; + my $end = $invlist_ref->[$i+1] - 1; + $end = ($start == $end) ? "" : sprintf("%04X", $end); + if ($invmap_ref->[$i] ne "") { + $tested_map .= sprintf "%04X\t%s\t%s\n", $start, $end, $invmap_ref->[$i]; + } + elsif ($end ne "") { + $tested_map .= sprintf "%04X\t%s\n", $start, $end; + } + else { + $tested_map .= sprintf "%04X\n", $start; + } + } # End of looping over all elements. + + # Here are done with generating what the file should look like + + chomp $tested_map; + + # And compare. + if ($tested_map ne $official) { + fail_with_diff($mod_prop, $official, $tested_map, "prop_invmap"); + next PROPERTY; + } + + # There shouldn't be any specials unaccounted for. + if (keys %specials) { + fail("prop_invmap('$mod_prop')"); + diag("Unexpected specials: " . join ", ", keys %specials); + next PROPERTY; + } + } + elsif ($format eq 'n') { + + # Handle the Name property similar to the above. But the file is + # sufficiently different that it is more convenient to make a special + # case for it. + + if ($missing ne "") { + fail("prop_invmap('$mod_prop')"); + diag("The missings should be \"\"; got \"missing\""); + next PROPERTY; + } + + $official = do "unicore/Name.pl"; + + # Get rid of the named sequences portion of the file. These don't + # have a tab before the first blank on a line. + $official =~ s/ ^ [^\t]+ \ .*? \n //xmg; + + # And get rid of the controls. These are named in the file, but + # shouldn't be in the property. + $official =~ s/ 00000 \t .* 0001F .*? \n//xs; + $official =~ s/ 0007F \t .* 0009F .*? \n//xs; + + # This is slow; it gets rid of the aliases. We look for lines that + # are for the same code point as the previous line. The previous line + # will be a name_alias; and the current line will be the name. Get + # rid of the name_alias line. This won't work if there are multiple + # aliases for a given name. + my @temp_names = split "\n", $official; + my $previous_cp = ""; + for (my $i = 0; $i < @temp_names - 1; $i++) { + $temp_names[$i] =~ /^ (.*)? \t /x; + my $current_cp = $1; + if ($current_cp eq $previous_cp) { + splice @temp_names, $i - 1, 1; + redo; + } + else { + $previous_cp = $current_cp; + } + } + $official = join "\n", @temp_names; + undef @temp_names; + chomp $official; + + # Here have adjusted the file. We also have to adjust the returned + # inversion map by checking and deleting all the lines in it that + # won't be in the file. These are the lines that have generated + # things, like <hangul syllable>. + my $tested_map = ""; # Current running string + my @code_point_in_names = + @Unicode::UCD::code_points_ending_in_code_point; + + for my $i (0 .. @$invlist_ref - 1 - 1) { + my $start = $invlist_ref->[$i]; + my $end = $invlist_ref->[$i+1] - 1; + if ($invmap_ref->[$i] eq $missing) { + if (($i > 0 && $invlist_ref->[$i] <= $invlist_ref->[$i-1]) + || $invlist_ref->[$i] >= $invlist_ref->[$i+1]) + { + fail("prop_invmap('$mod_prop')"); + diag(sprintf "Range beginning at %04X is out-of-order.", $invlist_ref->[$i]); + next PROPERTY; + } + next; + } + if ($invmap_ref->[$i] =~ / (.*) ( < .*? > )/x) { + my $name = $1; + my $type = $2; + if (($i > 0 && $invlist_ref->[$i] <= $invlist_ref->[$i-1]) + || $invlist_ref->[$i] >= $invlist_ref->[$i+1]) + { + fail("prop_invmap('$mod_prop')"); + diag(sprintf "Range beginning at %04X is out-of-order.", $invlist_ref->[$i]); + next PROPERTY; + } + if ($type eq "<hangul syllable>") { + if ($name ne "") { + fail("prop_invmap('$mod_prop')"); + diag("Unexpected text in $invmap_ref->[$i]"); + next PROPERTY; + } + if ($start != 0xAC00) { + fail("prop_invmap('$mod_prop')"); + diag(sprintf("<hangul syllables> should begin at 0xAC00, got %04X", $start)); + next PROPERTY; + } + if ($end != $start + 11172 - 1) { + fail("prop_invmap('$mod_prop')"); + diag(sprintf("<hangul syllables> should end at %04X, got %04X", $start + 11172 -1, $end)); + next PROPERTY; + } + } + elsif ($type ne "<code point>") { + fail("prop_invmap('$mod_prop')"); + diag("Unexpected text '$type' in $invmap_ref->[$i]"); + next PROPERTY; + } + else { + + # Look through the array of names that end in code points, + # and look for this start and end. If not found is an + # error. If found, delete it, and at the end, make sure + # have deleted everything. + for my $i (0 .. @code_point_in_names - 1) { + my $hash = $code_point_in_names[$i]; + if ($hash->{'low'} == $start + && $hash->{'high'} == $end + && "$hash->{'name'}-" eq $name) + { + splice @code_point_in_names, $i, 1; + last; + } + else { + fail("prop_invmap('$mod_prop')"); + diag("Unexpected code-point-in-name line '$invmap_ref->[$i]'"); + next PROPERTY; + } + } + } + + next; + } + + # Have adjusted the map, as needed. Append to running string. + $end = ($start == $end) ? "" : sprintf("%05X", $end); + $tested_map .= sprintf "%05X\t%s\n", $start, $invmap_ref->[$i]; + } + + # Finished creating the string from the inversion map. Can compare + # with what the file is. + chomp $tested_map; + if ($tested_map ne $official) { + fail_with_diff($mod_prop, $official, $tested_map, "prop_invmap"); + next PROPERTY; + } + if (@code_point_in_names) { + fail("prop_invmap('$mod_prop')"); + use Data::Dumper; + diag("Missing code-point-in-name line(s)" . Dumper \@code_point_in_names); + next PROPERTY; + } + } + elsif ($format eq 's' || $format eq 'r') { + + # Here the map is not more or less directly from a file stored on + # disk. We try a different tack. These should all be properties that + # have just a few possible values (most of them are binary). We go + # through the map list, sorting each range into buckets, one for each + # map value. Thus for binary properties there will be a bucket for Y + # and one for N. The buckets are inversion lists. We compare each + # constructed inversion list with what we would get for it using + # prop_invlist(), which has already been tested. If they all match, + # the whole map must have matched. + my %maps; + my $previous_map; + + # (The extra -1 is to not look at the final element in the loop, which + # we know is the one that starts just beyond Unicode and goes to + # infinity.) + for my $i (0 .. @$invlist_ref - 1 - 1) { + my $range_start = $invlist_ref->[$i]; + + # Because we are sorting into buckets, things could be + # out-of-order here, and still be in the correct order in the + # bucket, and hence wouldn't show up as an error; so have to + # check. + if (($i > 0 && $range_start <= $invlist_ref->[$i-1]) + || $range_start >= $invlist_ref->[$i+1]) + { + fail("prop_invmap('$mod_prop')"); + diag(sprintf "Range beginning at %04X is out-of-order.", $invlist_ref->[$i]); + next PROPERTY; + } + + # This new range closes out the range started in the previous + # iteration. + push @{$maps{$previous_map}}, $range_start if defined $previous_map; + + # And starts a range which will be closed in the next iteration. + $previous_map = $invmap_ref->[$i]; + push @{$maps{$previous_map}}, $range_start; + } + + # The range we just started hasn't been closed, and we didn't look at + # the final element of the loop. If that range is for the default + # value, it shouldn't be closed, as it is to extend to infinity. But + # otherwise, it should end at the final Unicode code point, and the + # list that maps to the default value should have another element that + # does go to infinity for every above Unicode code point. + + if (@$invlist_ref > 1) { + my $penultimate_map = $invmap_ref->[-2]; + if ($penultimate_map ne $missing) { + + # The -1th element contains the first non-Unicode code point. + push @{$maps{$penultimate_map}}, $invlist_ref->[-1]; + push @{$maps{$missing}}, $invlist_ref->[-1]; + } + } + + # Here, we have the buckets (inversion lists) all constructed. Go + # through each and verify that matches what prop_invlist() returns. + # We could use is_deeply() for the comparison, but would get multiple + # messages for each $prop. + foreach my $map (keys %maps) { + my @off_invlist = prop_invlist("$prop = $map"); + my $min = (@off_invlist >= @{$maps{$map}}) + ? @off_invlist + : @{$maps{$map}}; + for my $i (0 .. $min- 1) { + if ($i > @off_invlist - 1) { + fail("prop_invmap('$mod_prop')"); + diag("There is no element [$i] for $prop=$map from prop_invlist(), while [$i] in the implicit one constructed from prop_invmap() is '$maps{$map}[$i]'"); + next PROPERTY; + } + elsif ($i > @{$maps{$map}} - 1) { + fail("prop_invmap('$mod_prop')"); + diag("There is no element [$i] from the implicit $prop=$map constructed from prop_invmap(), while [$i] in the one from prop_invlist() is '$off_invlist[$i]'"); + next PROPERTY; + } + elsif ($maps{$map}[$i] ne $off_invlist[$i]) { + fail("prop_invmap('$mod_prop')"); + diag("Element [$i] of the implicit $prop=$map constructed from prop_invmap() is '$maps{$map}[$i]', and the one from prop_invlist() is '$off_invlist[$i]'"); + next PROPERTY; + } + } + } + } + else { # Don't know this property nor format. + + fail("prop_invmap('$mod_prop')"); + diag("Unknown format '$format'"); + } + + pass("prop_invmap('$mod_prop')"); +} + done_testing(); |