summaryrefslogtreecommitdiff
path: root/lib/Unicode
diff options
context:
space:
mode:
authorKarl Williamson <public@khwilliamson.com>2011-11-06 14:31:26 -0700
committerKarl Williamson <public@khwilliamson.com>2011-11-08 08:09:36 -0700
commit62b3b855a6b9268ee171e2c384362d719ea21537 (patch)
treefd7b7068c8888430d4813e67cdd86c4483079eaf /lib/Unicode
parenta33a1c99e77db13418959b16b072f82ae531372b (diff)
downloadperl-62b3b855a6b9268ee171e2c384362d719ea21537.tar.gz
Unicode::UCD: Add prop_invmap()
Diffstat (limited to 'lib/Unicode')
-rw-r--r--lib/Unicode/UCD.pm976
-rw-r--r--lib/Unicode/UCD.t827
2 files changed, 1800 insertions, 3 deletions
diff --git a/lib/Unicode/UCD.pm b/lib/Unicode/UCD.pm
index ef46c29e21..09ea439919 100644
--- a/lib/Unicode/UCD.pm
+++ b/lib/Unicode/UCD.pm
@@ -26,6 +26,7 @@ our @EXPORT_OK = qw(charinfo
prop_aliases
prop_value_aliases
prop_invlist
+ prop_invmap
MAX_CP
);
@@ -75,6 +76,10 @@ Unicode::UCD - Unicode character database
use Unicode::UCD 'prop_invlist';
my @puncts = prop_invlist("gc=punctuation");
+ use Unicode::UCD 'prop_invmap';
+ my ($list_ref, $map_ref, $format, $missing)
+ = prop_invmap("General Category");
+
use Unicode::UCD 'compexcl';
my $compexcl = compexcl($codepoint);
@@ -657,6 +662,9 @@ as the keys, and the code point ranges (see L</charblock()>) as the values.
The names are in the old-style (see L</Old-style versus new-style block
names>).
+L<prop_invmap("block")|/prop_invmap()> can be used to get this same data in a
+different type of data structure.
+
See also L</Blocks versus Scripts>.
=cut
@@ -676,6 +684,9 @@ charscripts() returns a reference to a hash with the known script
names as the keys, and the code point ranges (see L</charscript()>) as
the values.
+L<prop_invmap("script")|/prop_invmap()> can be used to get this same data in a
+different type of data structure.
+
See also L</Blocks versus Scripts>.
=cut
@@ -2033,6 +2044,971 @@ sub prop_invlist ($) {
return @invlist;
}
+sub _search_invlist {
+ # Find the range in the inversion list which contains a code point; that
+ # is, find i such that l[i] <= code_point < l[i+1]
+
+ # If this is ever made public, could use to speed up .t specials. Would
+ # need to use code point argument, as in other functions in this pm
+
+ my $list_ref = shift;
+ my $code_point = shift;
+ # Verify non-neg numeric XXX
+
+ my $max_element = @$list_ref - 1;
+ return if ! $max_element < 0; # Undef if list is empty.
+
+ # Short cut something at the far-end of the table. This also allows us to
+ # refer to element [$i+1] without fear of being out-of-bounds in the loop
+ # below.
+ return $max_element if $code_point >= $list_ref->[$max_element];
+
+ use integer; # want integer division
+
+ my $i = $max_element / 2;
+
+ my $lower = 0;
+ my $upper = $max_element;
+ while (1) {
+
+ if ($code_point >= $list_ref->[$i]) {
+
+ # Here we have met the lower constraint. We can quit if we
+ # also meet the upper one.
+ last if $code_point < $list_ref->[$i+1];
+
+ $lower = $i; # Still too low.
+
+ }
+ else {
+
+ # Here, $code_point < $list_ref[$i], so look lower down.
+ $upper = $i;
+ }
+
+ # Split search domain in half to try again.
+ my $temp = ($upper + $lower) / 2;
+
+ # No point in continuing unless $i changes for next time
+ # in the loop.
+ return $i if $temp == $i;
+ $i = $temp;
+ } # End of while loop
+
+ # Here we have found the offset
+ return $i;
+}
+
+=pod
+
+=head2 B<prop_invmap()>
+
+ use Unicode::UCD 'prop_invmap';
+ my ($list_ref, $map_ref, $format, $missing)
+ = prop_invmap("General Category");
+
+C<prop_invmap> is used to get the complete mapping definition for a property,
+in the form of an inversion map. An inversion map consists of two parallel
+arrays. One is an ordered list of code points that mark range beginnings, and
+the other gives the value (or mapping) that all code points in the
+corresponding range have.
+
+C<prop_invmap> is called with the name of the desired property. The name is
+loosely matched, meaning that differences in case, white-space, hyphens, and
+underscores are not meaningful (except for the trailing underscore in the
+old-form grandfathered-in property C<"L_">, which is better written as C<"LC">,
+or even better, C<"Gc=LC">).
+
+Many Unicode properties have more than one name (or alias). C<prop_invmap>
+understands all of these, including Perl extensions to them. Ambiguities are
+resolved as described above for L</prop_aliases()>. The Perl internal
+property "Perl_Decimal_Digit, described below, is also accepted. C<undef> is
+returned if the property name is unknown.
+
+It is a fatal error to call this function except in list context.
+
+In addition to the the two arrays that form the inversion map, C<prop_invmap>
+returns two other values; one is a scalar that gives some details as to the
+format of the entries of the map array; the other is used for specialized
+purposes, described at the end of this section.
+
+This means that C<prop_invmap> returns a 4 element list. For example,
+
+ my ($blocks_ranges_ref, $blocks_maps_ref, $format, $default)
+ = prop_invmap("Block");
+
+In this call, the two arrays will be populated as shown below (for Unicode
+6.0):
+
+ Index @blocks_ranges @blocks_maps
+ 0 0x0000 Basic Latin
+ 1 0x0080 Latin-1 Supplement
+ 2 0x0100 Latin Extended-A
+ 3 0x0180 Latin Extended-B
+ 4 0x0250 IPA Extensions
+ 5 0x02B0 Spacing Modifier Letters
+ 6 0x0300 Combining Diacritical Marks
+ 7 0x0370 Greek and Coptic
+ 8 0x0400 Cyrillic
+ ...
+ 233 0x2B820 No_Block
+ 234 0x2F800 CJK Compatibility Ideographs Supplement
+ 235 0x2FA20 No_Block
+ 236 0xE0000 Tags
+ 237 0xE0080 No_Block
+ 238 0xE0100 Variation Selectors Supplement
+ 239 0xE01F0 No_Block
+ 240 0xF0000 Supplementary Private Use Area-A
+ 241 0x100000 Supplementary Private Use Area-B
+ 242 0x110000 No_Block
+
+The first line (with Index [0]) means that the value for code point 0 is "Basic
+Latin". The entry "0x0080" in the @blocks_ranges column in the second line
+means that the value from the first line, "Basic Latin", extends to all code
+points in the range from 0 up to but not including 0x0080, that is, through
+255. In other words, the code points from 0 to 255 are all in the "Basic
+Latin" block. Similarly, all code points in the range from 0x0080 up to (but
+not including) 0x0100 are in the block named "Latin-1 Supplement", etc.
+(Notice that the return is the old-style block names; see L</Old-style versus
+new-style block names>).
+
+The final line (with Index [242]) means that the value for all code points above
+the legal Unicode maximum code point have the value "No_Block", which is the
+term Unicode uses for a non-existing block.
+
+The arrays completely specify the mappings for all possible code points.
+The final element in an inversion map returned by this function will always be
+for the range that consists of all the code points that aren't legal Unicode,
+but that are expressible on the platform. (That is, it starts with code point
+0x110000, the first code point above the legal Unicode maximum, and extends to
+infinity.) The value for that range will be the same that any typical
+unassigned code point has for the specified property. (Certain unassigned
+code points are not "typical"; for example the non-character code points, or
+those in blocks that are to be written right-to-left. The above-Unicode
+range's value is not based on these atypical code points.) It could be argued
+that, instead of treating these as unassigned Unicode code points, the value
+for this range should be C<undef>. If you wish, you can change the returned
+arrays accordingly.
+
+The maps are almost always simple scalars that should be interpreted as-is.
+These values are those given in the Unicode-supplied data files, which may be
+inconsistent as to capitalization and as to which synonym for a property-value
+is given. The results may be normalized by using the L</prop_value_aliases()>
+function.
+
+There are exceptions to the simple scalar maps. Some properties have some
+elements in their map list that are themselves lists of scalars; and some
+special strings are returned that are not to be interpreted as-is. Element
+[2] (placed into C<$format> in the example above) of the returned four element
+list tells you if the map has any of these special elements, as follows:
+
+=over
+
+=item C<s>
+
+means all the elements of the map array are simple scalars, with no special
+elements. Almost all properties are like this, like the C<block> example
+above.
+
+=item C<sl>
+
+means that some of the map array elements have the form given by C<s>, and
+the rest are lists of scalars. For example, here is a portion of the output
+of calling C<prop_invmap>() with the "Script Extensions" property:
+
+ @scripts_ranges @scripts_maps
+ ...
+ 0x0953 Deva
+ 0x0964 [ Beng Deva Guru Orya ]
+ 0x0966 Deva
+ 0x0970 Common
+
+Here, the code points 0x964 and 0x965 are used in the Bengali,
+Devanagari, Gurmukhi, and Oriya scripts.
+
+=item C<r>
+
+means that all the elements of the map array are either rational numbers or
+the string C<"NaN">, meaning "Not a Number". A rational number is either an
+integer, or two integers separated by a solidus (C<"/">). The second integer
+represents the denominator of the division implied by the solidus, and is
+guaranteed not to be 0. If you want to convert them to scalar numbers, you
+can use something like this:
+
+ my ($invlist_ref, $invmap_ref, $format) = prop_invmap($property);
+ if ($format && $format eq "r") {
+ map { $_ = eval $_ } @$invmap_ref;
+ }
+
+Here's some entries from the output of the property "Nv", which has format
+C<"r">.
+
+ @numerics_ranges @numerics_maps Note
+ 0x00 "NaN"
+ 0x30 0 DIGIT 0
+ 0x31 1
+ 0x32 2
+ ...
+ 0x37 7
+ 0x38 8
+ 0x39 9 DIGIT 9
+ 0x3A "NaN"
+ 0xB2 2 SUPERSCRIPT 2
+ 0xB3 3 SUPERSCRIPT 2
+ 0xB4 "NaN"
+ 0xB9 1 SUPERSCRIPT 1
+ 0xBA "NaN"
+ 0xBC 1/4 VULGAR FRACTION 1/4
+ 0xBD 1/2 VULGAR FRACTION 1/2
+ 0xBE 3/4 VULGAR FRACTION 3/4
+ 0xBF "NaN"
+ 0x660 0 ARABIC-INDIC DIGIT ZERO
+
+=item C<c>
+
+is like C<s> in that all the map array elements are scalars, but some of them
+are the special string S<C<"E<lt>code pointE<gt>">>, meaning that the map of
+each code point in the corresponding range in the inversion list is the code
+point itself. For example, in:
+
+ my ($uppers_ranges_ref, $uppers_maps_ref, $format)
+ = prop_invmap("Simple_Uppercase_Mapping");
+
+the returned arrays look like this:
+
+ @$uppers_ranges_ref @$uppers_maps_ref Note
+ 0 "<code point>"
+ 97 65 'a' maps to 'A'
+ 98 66 'b' => 'B'
+ 99 67 'c' => 'C'
+ ...
+ 120 88 'x' => 'X'
+ 121 89 'y' => 'Y'
+ 122 90 'z' => 'Z'
+ 123 "<code point>"
+ 181 924 MICRO SIGN => Greek Cap MU
+ 182 "<code point>"
+ ...
+
+The first line means that the uppercase of code point 0 is 0;
+the uppercase of code point 1 is 1; ... of code point 96 is 96. Without the
+C<"E<lt>code_pointE<gt>"> notation, every code point would have to have an
+entry. This would mean that the arrays would each have more than a million
+entries to list just the legal Unicode code points!
+
+=item C<cl>
+
+means that some of the map array elements have the form given by C<c>, and
+the rest are ordered lists of code points.
+For example, in:
+
+ my ($uppers_ranges_ref, $uppers_maps_ref, $format)
+ = prop_invmap("Uppercase_Mapping");
+
+the returned arrays look like this:
+
+ @$uppers_ranges_ref @$uppers_maps_ref
+ 0 "<code point>"
+ 97 65
+ ...
+ 122 90
+ 123 "<code point>"
+ 181 924
+ 182 "<code point>"
+ ...
+ 0x0149 [ 0x02BC 0x004E ]
+ 0x014A "<code point>"
+ 0x014B 0x014A
+ ...
+
+This is the full Uppercase_Mapping property (as opposed to the
+Simple_Uppercase_Mapping given in the example for format C<"c">). The only
+difference between the two in the ranges shown is that the code point at
+0x0149 (LATIN SMALL LETTER N PRECEDED BY APOSTROPHE) maps to a string of two
+characters, 0x02BC (MODIFIER LETTER APOSTROPHE) followed by 0x004E (LATIN
+CAPITAL LETTER N).
+
+=item C<cle>
+
+means that some of the map array elements have the forms given by C<cl>, and
+the rest are the empty string. The property C<NFKC_Casefold> has this form.
+An example slice is:
+
+ @$ranges_ref @$maps_ref Note
+ ...
+ 0x00AA 0x0061 FEMININE ORDINAL INDICATOR => 'a'
+ 0x00AB <code point>
+ 0x00AD SOFT HYPHEN => ""
+ 0x00AE <code point>
+ 0x00AF [ 0x0020, 0x0304 ] MACRON => SPACE . COMBINING MACRON
+ 0x00B0 <code point>
+ ...
+
+=item C<n>
+
+means the Name property. All the elements of the map array are simple
+scalars, but some of them contain special strings that require more work to
+get the actual name.
+
+Entries such as:
+
+ CJK UNIFIED IDEOGRAPH-<code point>
+
+mean that the name for the code point is "CJK UNIFIED IDEOGRAPH-"
+with the code point (expressed in hexadecimal) appended to it, like "CJK
+UNIFIED IDEOGRAPH-3403" (similarly for C<CJK COMPATIBILITY IDEOGRAPH-E<lt>code
+pointE<gt>>).
+
+Also, entries like
+
+ <hangul syllable>
+
+means that the name is algorithmically calculated. This is easily done by
+the function L<charnames/charnames::viacode(code)>.
+
+Note that for control characters (C<Gc=cc>), Unicode's data files have the
+string "C<E<lt>controlE<gt>>", but the real name of each of these characters is the empty
+string. This function returns that real name, the empty string.
+
+=item C<d>
+
+means the Decomposition_Mapping property. This property is like C<cl>
+properties, except it has an additional entry type:
+
+ <hangul syllable>
+
+for those code points whose decomposition is algorithmically calculated. (The
+C<n> format has this same entry.) These can be generated via the function
+L<Unicode::Normalize::NFD()|Unicode::Normalize>.
+
+
+Note that the mapping is the one that is specified in the Unicode data files,
+and to get the final decomposition, it may need to be applied recursively.
+
+=back
+
+A binary search can be used to quickly find a code point in the inversion
+list, and hence its corresponding mapping.
+
+The final element (index [3], assigned to C<$default> in the "block" example) in
+the four element list returned by this function may be useful for applications
+that wish to convert the returned inversion map data structure into some
+other, such as a hash. It gives the mapping that most code points map to
+under the property. If you establish the convention that any code point not
+explicitly listed in your data structure maps to this value, you can
+potentially make your data structure much smaller. As you construct your data
+structure from the one returned by this function, simply ignore those ranges
+that map to this value, generally called the "default" value. For example, to
+convert to the data structure searchable by L</charinrange()>, you can follow
+this recipe:
+
+ my ($list_ref, $map_ref, $format, $missing) = prop_invmap($property);
+ my @range_list;
+ for my $i (0 .. @$list_ref - 2) {
+ next if $map_ref->[$i] eq $missing;
+ push @range_list, [ $list_ref->[$i],
+ $list_ref->[$i+1],
+ $map_ref->[$i]
+ ];
+ }
+
+ print charinrange(\@range_list, $code_point), "\n";
+
+
+With this, C<charinrange()> will return C<undef> if its input code point maps
+to C<$missing>. You can avoid this by omitting the C<next> statement, and adding
+a line after the loop to handle the final element of the inversion map.
+
+One internal Perl property is accessible by this function.
+"Perl_Decimal_Digit" returns an inversion map in which all the Unicode decimal
+digits map to their numeric values, and everything else to the empty string,
+like so:
+
+ @digits @values
+ 0x0000 ""
+ 0x0030 0
+ 0x0031 1
+ 0x0032 2
+ 0x0033 3
+ 0x0034 4
+ 0x0035 5
+ 0x0036 6
+ 0x0037 7
+ 0x0038 8
+ 0x0039 9
+ 0x003A ""
+ 0x0660 0
+ 0x0661 1
+ ...
+
+Note that the inversion maps returned for the C<Case_Folding> and
+C<Simple_Case_Folding> properties do not include the Turkic-locale mappings.
+Use L</casefold()> for these.
+
+The C<Name_Alias> property is potentially undergoing signficant revision by
+Unicode at the time of this writing. The format of the values returned for it
+may change substantially in future Unicode versions.
+
+C<prop_invmap> does not know about any user-defined properties, and will
+return C<undef> if called with one of those.
+
+=cut
+
+# User-defined properties could be handled with some changes to utf8_heavy.pl;
+# if done, consideration should be given to the fact that the user subroutine
+# could return different results with each call, which could lead to some
+# security issues.
+
+# One could store things in memory so they don't have to be recalculated, but
+# it is unlikely this will be called often, and some properties would take up
+# significant memory.
+
+# These are created by mktables for this routine and stored in unicore/UCD.pl
+# where their structures are described.
+our @algorithmic_named_code_points;
+our $HANGUL_BEGIN;
+our $HANGUL_COUNT;
+
+sub prop_invmap ($) {
+
+ croak __PACKAGE__, "::prop_invmap: must be called in list context" unless wantarray;
+
+ my $prop = $_[0];
+ return unless defined $prop;
+
+ # Fail internal properties
+ return if $prop =~ /^_/;
+
+ # The values returned by this function.
+ my (@invlist, @invmap, $format, $missing);
+
+ # The swash has two components we look at, the base list, and a hash,
+ # named 'SPECIALS', containing any additional members whose mappings don't
+ # fit into the the base list scheme of things. These generally 'override'
+ # any value in the base list for the same code point.
+ my $overrides;
+
+ require "utf8_heavy.pl";
+ require "unicore/UCD.pl";
+
+RETRY:
+
+ # Try to get the map swash for the property. They have 'To' prepended to
+ # the property name, and 32 means we will accept 32 bit return values.
+ my $swash = utf8::SWASHNEW(__PACKAGE__, "To$prop", undef, 32, 0);
+
+ # If didn't find it, could be because needs a proxy. And if was the
+ # 'Block' or 'Name' property, use a proxy even if did find it. Finding it
+ # would be the result of the installation changing mktables to output the
+ # Block or Name tables. The Block table gives block names in the
+ # new-style, and this routine is supposed to return old-style block names.
+ # The Name table is valid, but we need to execute the special code below
+ # to add in the algorithmic-defined name entries.
+ if (ref $swash eq ""
+ || $swash->{'TYPE'} eq 'ToBlk'
+ || $swash->{'TYPE'} eq 'ToNa')
+ {
+
+ # Get the short name of the input property, in standard form
+ my ($second_try) = prop_aliases($prop);
+ return unless $second_try;
+ $second_try = utf8::_loose_name(lc $second_try);
+
+ if ($second_try eq "in") {
+
+ # This property is identical to age for inversion map purposes
+ $prop = "age";
+ goto RETRY;
+ }
+ elsif ($second_try eq 'scf') {
+
+ # This property uses just the LIST part of cf which includes the
+ # simple folds that are otherwise overridden by the SPECIALS. So
+ # all we need do is to not look at the SPECIALS; set $overrides to
+ # indicate that
+ $overrides = -1;
+ $prop = "cf";
+ goto RETRY;
+ }
+ elsif ($second_try =~ / ^ s[ltu]c $ /x) {
+
+ # Because some applications may be reading the full mapping
+ # equivalent files directly, they haven't been changed to include
+ # the simple mappings as well, as was done with the cf file (which
+ # doesn't have those backward compatibility issues) in 5.14.
+ # Instead, separate internal-only files were created that
+ # contain just the simple mappings that get overridden by the
+ # SPECIALS. Thus, these simple case mappings use the LIST part of
+ # their full mapping equivalents; plus the ones that are in those
+ # additional files. These special files are used by other
+ # functions in this module, so use the same hashes that those
+ # functions use.
+ my $file;
+ if ($second_try eq "suc") {
+ $file = '_suc.pl';
+ $overrides = \%SIMPLE_UPPER;
+ }
+ elsif ($second_try eq "slc") {
+ $file = '_slc.pl';
+ $overrides = \%SIMPLE_LOWER;
+ }
+ else {
+ $file = '_stc.pl';
+ $overrides = \%SIMPLE_TITLE;
+ }
+
+ # The files are already handled by the _read_table() function.
+ # Don't read them in if already done.
+ %$overrides =_read_table("unicore/To/$file", 'use_hash')
+ unless %$overrides;
+
+ # Convert to the full mapping name, and go handle that; e.g.,
+ # suc => uc.
+ $prop = $second_try =~ s/^s//r;
+ goto RETRY;
+ }
+ elsif ($second_try eq "blk") {
+
+ # We use the old block names. Just create a fake swash from its
+ # data.
+ _charblocks();
+ my %blocks;
+ $blocks{'LIST'} = "";
+ $blocks{'TYPE'} = "ToBlk";
+ $utf8::SwashInfo{ToBlk}{'missing'} = "No_Block";
+ $utf8::SwashInfo{ToBlk}{'format'} = "s";
+
+ foreach my $block (@BLOCKS) {
+ $blocks{'LIST'} .= sprintf "%x\t%x\t%s\n",
+ $block->[0],
+ $block->[1],
+ $block->[2];
+ }
+ $swash = \%blocks;
+ }
+ elsif ($second_try eq "na") {
+
+ # Use the combo file that has all the Name-type properties in it,
+ # extracting just the ones that are for the actual 'Name'
+ # property. And create a fake swash from it.
+ my %names;
+ $names{'LIST'} = "";
+ my $original = do "unicore/Name.pl";
+ my $previous_hex_code_point = "";
+ my $algorithm_names = \@algorithmic_named_code_points;
+
+ # We hold off on adding the next entry to the list until we know,
+ # that the next line isn't for the same code point. We only
+ # output the final line. That one is the original Name property
+ # value. The others are the Name_Alias corrections, which are
+ # listed first in the file.
+ my $staging = "";
+
+ my $i = 0;
+ foreach my $line (split "\n", $original) {
+ my ($hex_code_point, $name) = split "\t", $line;
+
+ # Weeds out all comments, blank lines, and named sequences
+ next if $hex_code_point =~ /\P{ASCII_HEX_DIGIT}/;
+
+ my $code_point = hex $hex_code_point;
+
+ # The name of all controls is the default: the empty string.
+ # The set of controls is immutable, so these hard-coded
+ # constants work.
+ next if $code_point <= 0x9F
+ && ($code_point <= 0x1F || $code_point >= 0x7F);
+
+ # Output the last iteration's result, but only output the
+ # final name if a code point has more than one.
+ $names{'LIST'} .= $staging
+ if $hex_code_point ne $previous_hex_code_point;
+
+ # If we are beyond where one of the special lines needs to
+ # be inserted ...
+ if ($i < @$algorithm_names
+ && $code_point > $algorithm_names->[$i]->{'low'})
+ {
+
+ # ... then insert it, ahead of what we were about to
+ # output
+ $staging = sprintf "%x\t%x\t%s\n",
+ $algorithm_names->[$i]->{'low'},
+ $algorithm_names->[$i]->{'high'},
+ $algorithm_names->[$i]->{'name'};
+
+ # And pretend that what we last saw was the final code
+ # point of the inserted range.
+ $previous_hex_code_point = sprintf "%04X",
+ $algorithm_names->[$i]->{'high'};
+
+ # Done with this range.
+ $i++;
+
+ # Except we actually need to output the inserted line.
+ redo;
+ }
+
+ # Normal name.
+ $staging = sprintf "%x\t\t%s\n", $code_point, $name;
+ $previous_hex_code_point = $hex_code_point;
+ }
+
+ # Add the name from the final iteration
+ $names{'LIST'} .= $staging;
+
+ $names{'TYPE'} = "ToNa";
+ $utf8::SwashInfo{ToNa}{'missing'} = "";
+ $utf8::SwashInfo{ToNa}{'format'} = "n";
+ $swash = \%names;
+ }
+ elsif ($second_try =~ / ^ ( d [mt] ) $ /x) {
+
+ # The file is a combination of dt and dm properties. Create a
+ # fake swash from the portion that we want.
+ my $original = do "unicore/Decomposition.pl";
+ my %decomps;
+
+ if ($second_try eq 'dt') {
+ $decomps{'TYPE'} = "ToDt";
+ $utf8::SwashInfo{'ToDt'}{'missing'} = "None";
+ $utf8::SwashInfo{'ToDt'}{'format'} = "s";
+ }
+ else {
+ $decomps{'TYPE'} = "ToDm";
+ $utf8::SwashInfo{'ToDm'}{'missing'} = "<code point>";
+
+ # Use a special internal-to-this_routine format, 'dm', to
+ # distinguish from 'd', meaning decimal.
+ $utf8::SwashInfo{'ToDm'}{'format'} = "dm";
+ }
+
+ $decomps{'LIST'} = "";
+
+ # This property has one special range not in the file: for the
+ # hangul syllables
+ my $done_hangul = 0; # Have we done the hangul range.
+ foreach my $line (split "\n", $original) {
+ my ($hex_lower, $hex_upper, $type_and_map) = split "\t", $line;
+ my $code_point = hex $hex_lower;
+ my $value;
+
+ # The type, enclosed in <...>, precedes the mapping separated
+ # by blanks
+ if ($type_and_map =~ / ^ < ( .* ) > \s+ (.*) $ /x) {
+ $value = ($second_try eq 'dt') ? $1 : $2
+ }
+ else { # If there is no type specified, it's canonical
+ $value = ($second_try eq 'dt')
+ ? "Canonical" :
+ $type_and_map;
+ }
+
+ # Insert the hangul range at the appropriate spot.
+ if (! $done_hangul && $code_point > $HANGUL_BEGIN) {
+ $done_hangul = 1;
+ $decomps{'LIST'} .=
+ sprintf "%x\t%x\t%s\n",
+ $HANGUL_BEGIN,
+ $HANGUL_BEGIN + $HANGUL_COUNT - 1,
+ ($second_try eq 'dt')
+ ? "Canonical"
+ : "<hangul syllable>";
+ }
+
+ # And append this to our constructed LIST.
+ $decomps{'LIST'} .= "$hex_lower\t$hex_upper\t$value\n";
+ }
+ $swash = \%decomps;
+ }
+ else { # Don't know this property. Fail.
+ return;
+ }
+ }
+
+ if ($swash->{'EXTRAS'}) {
+ carp __PACKAGE__, "::prop_invmap: swash returned for $prop unexpectedly has EXTRAS magic";
+ return;
+ }
+
+ # Here, have a valid swash return. Examine it.
+ my $returned_prop = $swash->{TYPE};
+
+ # All properties but binary ones should have 'missing' and 'format'
+ # entries
+ $missing = $utf8::SwashInfo{$returned_prop}{'missing'};
+ $missing = 'N' unless defined $missing;
+
+ $format = $utf8::SwashInfo{$returned_prop}{'format'};
+ $format = 'b' unless defined $format;
+
+ # The LIST input lines look like:
+ # ...
+ # 0374\t\tCommon
+ # 0375\t0377\tGreek # [3]
+ # 037A\t037D\tGreek # [4]
+ # 037E\t\tCommon
+ # 0384\t\tGreek
+ # ...
+ #
+ # Convert them to like
+ # 0374 => Common
+ # 0375 => Greek
+ # 0378 => $missing
+ # 037A => Greek
+ # 037E => Common
+ # 037F => $missing
+ # 0384 => Greek
+ #
+ # For binary properties, the final non-comment column is absent, and
+ # assumed to be 'Y'.
+
+ foreach my $range (split "\n", $swash->{'LIST'}) {
+ $range =~ s/ \s* (?: \# .* )? $ //xg; # rmv trailing space, comments
+
+ # Find the beginning and end of the range on the line
+ my ($hex_begin, $hex_end, $map) = split "\t", $range;
+ my $begin = hex $hex_begin;
+ my $end = (defined $hex_end && $hex_end ne "")
+ ? hex $hex_end
+ : $begin;
+
+ # If the property doesn't have a range that begins at 0, add one that
+ # maps to the default value (for missing ranges).
+ if (! @invlist) {
+ if ($begin != 0) {
+ push @invlist, 0;
+ push @invmap, $missing;
+ }
+ }
+ elsif ($invlist[-1] == $begin) {
+
+ # If the input isn't in the most compact form, so that there are
+ # two adjacent ranges that map to the same thing, they should be
+ # combined. This happens in our constructed dt mapping, as
+ # Element [-2] is the map for the latest range so far processed.
+ # Just set the beginning point of the map to $missing (in
+ # invlist[-1]) to 1 beyond where this range ends. For example, in
+ # 12\t13\tXYZ
+ # 14\t17\tXYZ
+ # we have set it up so that it looks like
+ # 12 => XYZ
+ # 14 => $missing
+ #
+ # We now see that it should be
+ # 12 => XYZ
+ # 18 => $missing
+ if (@invlist > 1 && $invmap[-2] eq $map) {
+ $invlist[-1] = $end + 1;
+ next;
+ }
+
+ # Here, the range started in the previous iteration that maps to
+ # $missing starts at the same code point as this range. That
+ # means there is no gap to fill that that range was intended for,
+ # so we just pop it off the parallel arrays.
+ pop @invlist;
+ pop @invmap;
+ }
+
+ # Add the range beginning, and the range's map.
+ push @invlist, $begin;
+ if ($format eq 'dm') {
+
+ # The decomposition maps are either a line like <hangul syllable>
+ # which are to be taken as is; or a sequence of code points in hex
+ # and separated by blanks. Convert them to decimal, and if there
+ # is more than one, use an anonymous array as the map.
+ if ($map =~ /^ < /x) {
+ push @invmap, $map;
+ }
+ else {
+ my @map = map { hex } split " ", $map;
+ if (@map == 1) {
+ push @invmap, $map[0];
+ }
+ else {
+ push @invmap, \@map;
+ }
+ }
+ }
+ else {
+
+ # Otherwise, convert hex formatted list entries to decimal; add a
+ # 'Y' map for the missing value in binary properties, or
+ # otherwise, use the input map unchanged.
+ $map = ($format eq 'x')
+ ? hex $map
+ : $format eq 'b'
+ ? 'Y'
+ : $map;
+ push @invmap, $map;
+ }
+
+ # We just started a range. It ends with $end. The gap between it and
+ # the next element in the list must be filled with a range that maps
+ # to the default value. If there is no gap, the next iteration will
+ # pop this, unless there is no next iteration, and we have filled all
+ # of the Unicode code space, so check for that and skip.
+ if ($end < $MAX_UNICODE_CODEPOINT) {
+ push @invlist, $end + 1;
+ push @invmap, $missing;
+ }
+ }
+
+ # If the property is empty, make all code points use the value for missing
+ # ones.
+ if (! @invlist) {
+ push @invlist, 0;
+ push @invmap, $missing;
+ }
+
+ # And add in standard element that all non-Unicode code points map to
+ # $missing
+ push @invlist, $MAX_UNICODE_CODEPOINT + 1;
+ push @invmap, $missing;
+
+ # The second component of the map are those values that require
+ # non-standard specification, stored in SPECIALS. These override any
+ # duplicate code points in LIST. If we are using a proxy, we may have
+ # already set $overrides based on the proxy.
+ $overrides = $swash->{'SPECIALS'} unless defined $overrides;
+ if ($overrides) {
+
+ # A negative $overrides implies that the SPECIALS should be ignored,
+ # and a simple 'c' list is the value.
+ if ($overrides < 0) {
+ $format = 'c';
+ }
+ else {
+
+ # Currently, all overrides are for properties that normally map to
+ # single code points, but now some will map to lists of code
+ # points (but there is an exception case handled below).
+ $format = 'cl';
+
+ # Look through the overrides.
+ foreach my $cp_maybe_utf8 (keys %$overrides) {
+ my $cp;
+ my @map;
+
+ # If the overrides came from SPECIALS, the code point keys are
+ # packed UTF-8.
+ if ($overrides == $swash->{'SPECIALS'}) {
+ $cp = unpack("C0U", $cp_maybe_utf8);
+ @map = unpack "U0U*", $swash->{'SPECIALS'}{$cp_maybe_utf8};
+
+ # The empty string will show up unpacked as an empty
+ # array.
+ $format = 'cle' if @map == 0;
+ }
+ else {
+
+ # But if we generated the overrides, we didn't bother to
+ # pack them, and we, so far, do this only for properties
+ # that are 'c' ones.
+ $cp = $cp_maybe_utf8;
+ @map = hex $overrides->{$cp};
+ $format = 'c';
+ }
+
+ # Find the range that the override applies to.
+ my $i = _search_invlist(\@invlist, $cp);
+ if ($cp < $invlist[$i] || $cp >= $invlist[$i + 1]) {
+ croak __PACKAGE__, "wrong_range, cp=$cp; i=$i, current=$invlist[$i]; next=$invlist[$i + 1]"
+ }
+
+ # And what that range currently maps to
+ my $cur_map = $invmap[$i];
+
+ # If there is a gap between the next range and the code point
+ # we are overriding, we have to add elements to both arrays to
+ # fill that gap, using the map that applies to it, which is
+ # $cur_map, since it is part of the current range.
+ if ($invlist[$i + 1] > $cp + 1) {
+ #use feature 'say';
+ #say "Before splice:";
+ #say 'i-2=[', $i-2, ']', sprintf("%04X maps to %s", $invlist[$i-2], $invmap[$i-2]) if $i >= 2;
+ #say 'i-1=[', $i-1, ']', sprintf("%04X maps to %s", $invlist[$i-1], $invmap[$i-1]) if $i >= 1;
+ #say 'i =[', $i, ']', sprintf("%04X maps to %s", $invlist[$i], $invmap[$i]);
+ #say 'i+1=[', $i+1, ']', sprintf("%04X maps to %s", $invlist[$i+1], $invmap[$i+1]) if $i < @invlist + 1;
+ #say 'i+2=[', $i+2, ']', sprintf("%04X maps to %s", $invlist[$i+2], $invmap[$i+2]) if $i < @invlist + 2;
+
+ splice @invlist, $i + 1, 0, $cp + 1;
+ splice @invmap, $i + 1, 0, $cur_map;
+
+ #say "After splice:";
+ #say 'i-2=[', $i-2, ']', sprintf("%04X maps to %s", $invlist[$i-2], $invmap[$i-2]) if $i >= 2;
+ #say 'i-1=[', $i-1, ']', sprintf("%04X maps to %s", $invlist[$i-1], $invmap[$i-1]) if $i >= 1;
+ #say 'i =[', $i, ']', sprintf("%04X maps to %s", $invlist[$i], $invmap[$i]);
+ #say 'i+1=[', $i+1, ']', sprintf("%04X maps to %s", $invlist[$i+1], $invmap[$i+1]) if $i < @invlist + 1;
+ #say 'i+2=[', $i+2, ']', sprintf("%04X maps to %s", $invlist[$i+2], $invmap[$i+2]) if $i < @invlist + 2;
+ }
+
+ # If the remaining portion of the range is multiple code
+ # points (ending with the one we are replacing, guaranteed by
+ # the earlier splice). We must split it into two
+ if ($invlist[$i] < $cp) {
+ $i++; # Compensate for the new element
+
+ #use feature 'say';
+ #say "Before splice:";
+ #say 'i-2=[', $i-2, ']', sprintf("%04X maps to %s", $invlist[$i-2], $invmap[$i-2]) if $i >= 2;
+ #say 'i-1=[', $i-1, ']', sprintf("%04X maps to %s", $invlist[$i-1], $invmap[$i-1]) if $i >= 1;
+ #say 'i =[', $i, ']', sprintf("%04X maps to %s", $invlist[$i], $invmap[$i]);
+ #say 'i+1=[', $i+1, ']', sprintf("%04X maps to %s", $invlist[$i+1], $invmap[$i+1]) if $i < @invlist + 1;
+ #say 'i+2=[', $i+2, ']', sprintf("%04X maps to %s", $invlist[$i+2], $invmap[$i+2]) if $i < @invlist + 2;
+
+ splice @invlist, $i, 0, $cp;
+ splice @invmap, $i, 0, 'dummy';
+
+ #say "After splice:";
+ #say 'i-2=[', $i-2, ']', sprintf("%04X maps to %s", $invlist[$i-2], $invmap[$i-2]) if $i >= 2;
+ #say 'i-1=[', $i-1, ']', sprintf("%04X maps to %s", $invlist[$i-1], $invmap[$i-1]) if $i >= 1;
+ #say 'i =[', $i, ']', sprintf("%04X maps to %s", $invlist[$i], $invmap[$i]);
+ #say 'i+1=[', $i+1, ']', sprintf("%04X maps to %s", $invlist[$i+1], $invmap[$i+1]) if $i < @invlist + 1;
+ #say 'i+2=[', $i+2, ']', sprintf("%04X maps to %s", $invlist[$i+2], $invmap[$i+2]) if $i < @invlist + 2;
+ }
+
+ # Here, the range we are overriding contains a single code
+ # point. The result could be the empty string, a single
+ # value, or a list. If the last case, we use an anonymous
+ # array.
+ $invmap[$i] = (scalar @map == 0)
+ ? ""
+ : (scalar @map > 1)
+ ? \@map
+ : $map[0];
+ }
+ }
+ }
+ elsif ($format eq 'x') {
+
+ # All hex-valued properties are really to code points
+ $format = 'c';
+ }
+ elsif ($format eq 'dm') {
+ $format = 'd';
+ }
+ elsif ($format eq 'sw') { # blank-separated elements to form a list.
+ map { $_ = [ split " ", $_ ] if $_ =~ / / } @invmap;
+ $format = 'sl';
+ }
+ elsif ($returned_prop eq 'ToNameAlias') {
+
+ # This property currently doesn't have any lists, but theoretically
+ # could
+ $format = 'sl';
+ }
+ elsif ($format ne 'n' && $format ne 'r') {
+
+ # All others are simple scalars
+ $format = 's';
+ }
+
+ return (\@invlist, \@invmap, $format, $missing);
+}
+
=head2 Unicode::UCD::UnicodeVersion
This returns the version of the Unicode Character Database, in other words, the
diff --git a/lib/Unicode/UCD.t b/lib/Unicode/UCD.t
index bdb9812396..3903d45e16 100644
--- a/lib/Unicode/UCD.t
+++ b/lib/Unicode/UCD.t
@@ -851,7 +851,60 @@ undef %pva_tested;
no warnings 'once'; # We use some values once from 'required' modules.
-use Unicode::UCD qw(prop_invlist MAX_CP);
+use Unicode::UCD qw(prop_invlist prop_invmap MAX_CP);
+
+# There were some problems with caching interfering with prop_invlist() vs
+# prop_invmap() on binary properties, and also between the 3 properties where
+# Perl used the same 'To' name as another property (see utf8_heavy.pl).
+# So, before testing all of prop_invlist(),
+# 1) call prop_invmap() to try both orders of these name issues. This uses
+# up two of the 3 properties; the third will be left so that invlist()
+# on it gets called before invmap()
+# 2) call prop_invmap() on a generic binary property, ahead of invlist().
+# This should test that the caching works in both directions.
+
+# These properties are not stable between Unicode versions, but the first few
+# elements are; just look at the first element to see if are getting the
+# distinction right. The general inversion map testing below will test the
+# whole thing.
+my $prop = "uc";
+my ($invlist_ref, $invmap_ref, $format, $missing) = prop_invmap($prop);
+is($format, 'cl', "prop_invmap() format of '$prop' is 'cl'");
+is($missing, '<code point>', "prop_invmap() missing of '$prop' is '<code point>'");
+is($invlist_ref->[1], 0x61, "prop_invmap('$prop') list[1] is 0x61");
+is($invmap_ref->[1], 0x41, "prop_invmap('$prop') map[1] is 0x41");
+
+$prop = "upper";
+($invlist_ref, $invmap_ref, $format, $missing) = prop_invmap($prop);
+is($format, 's', "prop_invmap() format of '$prop' is 'cl'");
+is($missing, 'N', "prop_invmap() missing of '$prop' is '<code point>'");
+is($invlist_ref->[1], 0x41, "prop_invmap('$prop') list[1] is 0x41");
+is($invmap_ref->[1], 'Y', "prop_invmap('$prop') map[1] is 'Y'");
+
+$prop = "lower";
+($invlist_ref, $invmap_ref, $format, $missing) = prop_invmap($prop);
+is($format, 's', "prop_invmap() format of '$prop' is 'cl'");
+is($missing, 'N', "prop_invmap() missing of '$prop' is '<code point>'");
+is($invlist_ref->[1], 0x61, "prop_invmap('$prop') list[1] is 0x61");
+is($invmap_ref->[1], 'Y', "prop_invmap('$prop') map[1] is 'Y'");
+
+$prop = "lc";
+($invlist_ref, $invmap_ref, $format, $missing) = prop_invmap($prop);
+is($format, 'cl', "prop_invmap() format of '$prop' is 'cl'");
+is($missing, '<code point>', "prop_invmap() missing of '$prop' is '<code point>'");
+is($invlist_ref->[1], 0x41, "prop_invmap('$prop') list[1] is 0x41");
+is($invmap_ref->[1], 0x61, "prop_invmap('$prop') map[1] is 0x61");
+
+# This property is stable and small, so can test all of it
+$prop = "ASCII_Hex_Digit";
+($invlist_ref, $invmap_ref, $format, $missing) = prop_invmap($prop);
+is($format, 's', "prop_invmap() format of '$prop' is 's'");
+is($missing, 'N', "prop_invmap() missing of '$prop' is 'N'");
+is_deeply($invlist_ref, [ 0x0000, 0x0030, 0x003A, 0x0041,
+ 0x0047, 0x0061, 0x0067, 0x110000 ],
+ "prop_invmap('$prop') code point list is correct");
+is_deeply($invmap_ref, [ 'N', 'Y', 'N', 'Y', 'N', 'Y', 'N', 'N' ] ,
+ "prop_invmap('$prop') map list is correct");
is(prop_invlist("Unknown property"), undef, "prop_invlist(<Unknown property>) returns undef");
is(prop_invlist(undef), undef, "prop_invlist(undef) returns undef");
@@ -864,11 +917,11 @@ use Storable qw(dclone);
is(prop_invlist("InKana"), undef, "prop_invlist(<user-defined property returns undef>)");
-# The way both the tests for invlist work is that they take the
+# The way both the tests for invlist and invmap work is that they take the
# lists returned by the functions and construct from them what the original
# file should look like, which are then compared with the file. If they are
# identical, the test passes. What this tests isn't that the results are
-# correct, but that invlist hasn't introduced errors beyond what
+# correct, but that invlist and invmap haven't introduced errors beyond what
# are there in the files. As a small hedge against that, test some
# prop_invlist() tables fully with the known correct result. We choose
# ASCII_Hex_Digit again, as it is stable.
@@ -1085,4 +1138,772 @@ foreach my $set_of_tables (\%utf8::stricter_to_file_of, \%utf8::loose_to_file_of
}
}
+# Now test prop_invmap().
+
+@list = prop_invmap("Unknown property");
+is (@list, 0, "prop_invmap(<Unknown property>) returns an empty list");
+@list = prop_invmap(undef);
+is (@list, 0, "prop_invmap(undef) returns an empty list");
+ok (! eval "prop_invmap('gc')" && $@ ne "",
+ "prop_invmap('gc') dies in scalar context");
+@list = prop_invmap("_X_Begin");
+is (@list, 0, "prop_invmap(<internal property>) returns an empty list");
+@list = prop_invmap("InKana");
+is(@list, 0, "prop_invmap(<user-defined property returns undef>)");
+@list = prop_invmap("Perl_Decomposition_Mapping"), undef,
+is(@list, 0, "prop_invmap('Perl_Decomposition_Mapping') returns <undef> since internal-Perl-only");
+@list = prop_invmap("Perl_Charnames"), undef,
+is(@list, 0, "prop_invmap('Perl_Charnames') returns <undef> since internal-Perl-only");
+@list = prop_invmap("Is_Is_Any");
+is(@list, 0, "prop_invmap('Is_Is_Any') returns <undef> since two is's");
+
+# The set of properties to test on has already been compiled into %props by
+# the prop_aliases() tests.
+
+my %tested_invmaps;
+
+# Like prop_invlist(), prop_invmap() is tested by comparing the results
+# returned by the function with the tables that mktables generates. Some of
+# these tables are directly stored as files on disk, in either the unicore or
+# unicore/To directories, and most should be listed in the mktables generated
+# hash %utf8::loose_property_to_file_of, with a few additional ones that this
+# handles specially. For these, the files are read in directly, massaged, and
+# compared with what invmap() returns. The SPECIALS hash in some of these
+# files overrides values in the main part of the file.
+#
+# The other properties are tested indirectly by generating all the possible
+# inversion lists for the property, and seeing if those match the inversion
+# lists returned by prop_invlist(), which has already been tested.
+
+PROPERTY:
+foreach my $prop (keys %props) {
+ my $loose_prop = utf8::_loose_name(lc $prop);
+ my $suppressed = grep { $_ eq $loose_prop }
+ @Unicode::UCD::suppressed_properties;
+
+ # Find the short and full names that this property goes by
+ my ($name, $full_name) = prop_aliases($prop);
+ if (! $name) {
+ if (! $suppressed) {
+ fail("prop_invmap('$prop')");
+ diag("is unknown to prop_aliases(), and we need it in order to test prop_invmap");
+ }
+ next PROPERTY;
+ }
+
+ # Normalize the short name, as it is stored in the hashes under the
+ # normalized version.
+ $name = utf8::_loose_name(lc $name);
+
+ # Add in the characters that are supposed to be ignored to test loose
+ # matching, which the tested function applies to all properties
+ my $mod_prop = "$extra_chars$prop";
+
+ my ($invlist_ref, $invmap_ref, $format, $missing) = prop_invmap($mod_prop);
+ my $return_ref = [ $invlist_ref, $invmap_ref, $format, $missing ];
+
+ # If have already tested this property under a different name, merely
+ # compare the return from now with the saved one from before.
+ if (exists $tested_invmaps{$name}) {
+ is_deeply($return_ref, $tested_invmaps{$name}, "prop_invmap('$mod_prop') gave same results as its synonym, '$name'");
+ next PROPERTY;
+ }
+ $tested_invmaps{$name} = dclone $return_ref;
+
+ # If prop_invmap() returned nothing, is ok iff is a property whose file is
+ # not generated.
+ if ($suppressed) {
+ if (defined $format) {
+ fail("prop_invmap('$mod_prop')");
+ diag("did not return undef for suppressed property $prop");
+ }
+ next PROPERTY;
+ }
+ elsif (!defined $format) {
+ fail("prop_invmap('$mod_prop')");
+ diag("'$prop' is unknown to prop_invmap()");
+ next PROPERTY;
+ }
+
+ # The two parallel arrays must have the same number of elements.
+ if (@$invlist_ref != @$invmap_ref) {
+ fail("prop_invmap('$mod_prop')");
+ diag("invlist has "
+ . scalar @$invlist_ref
+ . " while invmap has "
+ . scalar @$invmap_ref
+ . " elements");
+ next PROPERTY;
+ }
+
+ # The last element must be for the above-Unicode code points, and must be
+ # for the default value.
+ if ($invlist_ref->[-1] != 0x110000) {
+ fail("prop_invmap('$mod_prop')");
+ diag("The last inversion list element is not 0x110000");
+ next PROPERTY;
+ }
+ if ($invmap_ref->[-1] ne $missing) {
+ fail("prop_invmap('$mod_prop')");
+ diag("The last inversion list element is '$invmap_ref->[-1]', and should be '$missing'");
+ next PROPERTY;
+ }
+
+ if ($name eq 'bmg') { # This one has an atypical $missing
+ if ($missing ne "") {
+ fail("prop_invmap('$mod_prop')");
+ diag("The missings should be \"\"; got '$missing'");
+ next PROPERTY;
+ }
+ }
+ elsif ($format =~ /^ [cd] /x) {
+ if ($missing ne "<code point>") {
+ fail("prop_invmap('$mod_prop')");
+ diag("The missings should be '<code point>'; got '$missing'");
+ next PROPERTY;
+ }
+ }
+ elsif ($missing =~ /[<>]/) {
+ fail("prop_invmap('$mod_prop')");
+ diag("The missings should NOT be something with <...>'");
+ next PROPERTY;
+
+ # I don't want to hard code in what all the missings should be, so
+ # those don't get fully tested.
+ }
+
+ # Certain properties don't have their own files, but must be constructed
+ # using proxies.
+ my $proxy_prop = $name;
+ if ($full_name eq 'Present_In') {
+ $proxy_prop = "age"; # The maps for these two props are identical
+ }
+ elsif ($full_name eq 'Simple_Case_Folding'
+ || $full_name =~ /Simple_ (.) .*? case_Mapping /x)
+ {
+ if ($full_name eq 'Simple_Case_Folding') {
+ $proxy_prop = 'cf';
+ }
+ else {
+ # We captured the U, L, or T, leading to uc, lc, or tc.
+ $proxy_prop = lc $1 . "c";
+ }
+ if ($format ne "c") {
+ fail("prop_invmap('$mod_prop')");
+ diag("The format should be 'c'; got '$format'");
+ next PROPERTY;
+ }
+ }
+
+ my $base_file;
+ my $official;
+
+ # Handle the properties that have full disk files for them (except the
+ # Name property which is structurally enough different that it is handled
+ # separately below.)
+ if ($name ne 'na'
+ && ($name eq 'blk'
+ || defined
+ ($base_file = $utf8::loose_property_to_file_of{$proxy_prop})
+ || exists $utf8::loose_to_file_of{$proxy_prop}
+ || $name eq "dm"))
+ {
+ # In the above, blk is done unconditionally, as we need to test that
+ # the old-style block names are returned, even if mktables has
+ # generated a file for the new-style; the test for dm comes afterward,
+ # so that if a file has been generated for it explicitly, we use that
+ # file (which is valid, unlike blk) instead of the combo
+ # Decomposition.pl files.
+ my $file;
+ my $is_binary = 0;
+ if ($name eq 'blk') {
+
+ # The blk property is special. The original file with old block
+ # names is retained, and the default is to not write out a
+ # new-name file. What we do is get the old names into a data
+ # structure, and from that create what the new file would look
+ # like. $base_file is needed to be defined, just to avoid a
+ # message below.
+ $base_file = "This is a dummy name";
+ my $blocks_ref = charblocks();
+ $official = "";
+ for my $range (sort { $a->[0][0] <=> $b->[0][0] }
+ values %$blocks_ref)
+ {
+ # Translate the charblocks() data structure to what the file
+ # would like.
+ $official .= sprintf"%04X\t%04X\t%s\n",
+ $range->[0][0],
+ $range->[0][1],
+ $range->[0][2];
+ }
+ }
+ else {
+
+ # Above leaves $base_file undefined only if it came from the hash
+ # below. This should happen only when it is a binary property
+ # (and are accessing via a single-form name, like 'In_Latin1'),
+ # and so it is stored in a different directory than the To ones.
+ # XXX Currently, the only cases where it is complemented are the
+ # ones that have no code points. And it works out for these that
+ # 1) complementing them, and then 2) adding or subtracting the
+ # initial 0 and final 110000 cancel each other out. But further
+ # work would be needed in the unlikely event that an inverted
+ # property comes along without these characteristics
+ if (!defined $base_file) {
+ $base_file = $utf8::loose_to_file_of{$proxy_prop};
+ $is_binary = ($base_file =~ s/^!//) ? -1 : 1;
+ $base_file = "lib/$base_file";
+ }
+
+ # Read in the file
+ $base_file = "Decomposition" if $format eq 'd';
+ $file = "unicore/$base_file.pl";
+ $official = do $file;
+
+ # Get rid of any trailing space and comments in the file.
+ $official =~ s/\s*(#.*)?$//mg;
+
+ # Decomposition.pl also has the <compatible> types in it, which
+ # should be removed.
+ $official =~ s/<.*?> //mg if $format eq 'd';
+ }
+ chomp $official;
+
+ # If there are any special elements, get a reference to them.
+ my $specials_ref = $utf8::file_to_swash_name{$base_file};
+ if ($specials_ref) {
+ $specials_ref = $utf8::SwashInfo{$specials_ref}{'specials_name'};
+ if ($specials_ref) {
+
+ # Convert from the name to the actual reference.
+ no strict 'refs';
+ $specials_ref = \%{$specials_ref};
+ }
+ }
+
+ # Certain of the proxy properties have to be adjusted to match the
+ # real ones.
+ if (($proxy_prop ne $name && $full_name =~ 'Mapping')
+ || $full_name eq 'Case_Folding')
+ {
+
+ # Here we have either
+ # 1) Case_Folding; or
+ # 2) a proxy that is a full mapping, which means that what the
+ # real property is is the equivalent simple mapping.
+ # In both cases, the file will have a standard list containing
+ # simple mappings (to a single code point), and a specials hash
+ # which contains all the mappings that are to multiple code
+ # points. First, extract a list containing all the file's simple
+ # mappings.
+ my @list;
+ for (split "\n", $official) {
+ my ($start, $end, $value) = / ^ (.+?) \t (.*?) \t (.+?)
+ \s* ( \# .* )?
+ $ /x;
+ $end = $start if $end eq "";
+ if ($end ne $start) {
+ fail("prop_invmap('$mod_prop')");
+ diag("This test is expecting only single code point ranges in $file.pl");
+ next PROPERTY;
+ }
+ push @list, [ hex $start, $value ];
+ }
+
+ # For Case_Folding, the file contains all the simple mappings,
+ # including the ones that are overridden by the specials. These
+ # need to be removed as the list is for just the full ones. For
+ # the other files, the proxy is missing the simple mappings that
+ # are overridden by the specials, so we need to add them.
+
+ # For the missing simples, we get the correct values by calling
+ # charinfo(). Set up which element of the hash returned by
+ # charinfo to look at
+ my $charinfo_element;
+ if ($full_name =~ / ^ Simple_ (Lower | Upper | Title) case_Mapping/x)
+ {
+ $charinfo_element = lc $1; # e.g. Upper is referred to by the
+ # key 'upper' in the charinfo()
+ # returned hash
+ }
+
+ # Go through any special mappings one by one. They are packed.
+ my $i = 0;
+ foreach my $utf8_cp (sort keys %$specials_ref) {
+ my $cp = unpack("C0U", $utf8_cp);
+
+ # Get what the simple value for this should be; either nothing
+ # for Case_Folding, or what charinfo returns for the others.
+ my $simple = ($full_name eq "Case_Folding")
+ ? ""
+ : charinfo($cp)->{$charinfo_element};
+
+ # And create an entry to add to the list, if appropriate
+ my $replacement;
+ $replacement = [ $cp, $simple ] if $simple ne "";
+
+ # Find the spot in the @list of simple mappings that this
+ # special applies to; uses a linear search.
+ while ($i < @list -1 ) {
+ last if $cp <= $list[$i][0];
+ $i++;
+ }
+
+ #note $i-1 . ": " . join " => ", @{$list[$i-1]};
+ #note $i-0 . ": " . join " => ", @{$list[$i-0]};
+ #note $i+1 . ": " . join " => ", @{$list[$i+1]};
+
+ if (! defined $replacement) {
+
+ # Here, are to remove any existing entry for this code
+ # point.
+ next if $cp != $list[$i][0];
+ splice @list, $i, 1;
+ }
+ elsif ($cp == $list[$i][0]) {
+
+ # Here, are to add something, but there is an existing
+ # entry, so this just replaces it.
+ $list[$i] = $replacement;
+ }
+ else {
+
+ # Here, are to add something, and there isn't an existing
+ # entry.
+ splice @list, $i, 0, $replacement;
+ }
+
+ #note __LINE__ . ": $cp";
+ #note $i-1 . ": " . join " => ", @{$list[$i-1]};
+ #note $i-0 . ": " . join " => ", @{$list[$i-0]};
+ #note $i+1 . ": " . join " => ", @{$list[$i+1]};
+ }
+
+ # Here, have gone through all the specials, modifying @list as
+ # needed. Turn it back into what the file should look like.
+ $official = join "\n", map { sprintf "%04X\t\t%s", @$_ } @list;
+
+ # And, no longer need the specials for the simple mappings, as are
+ # all incorporated into $official
+ undef $specials_ref if $full_name ne 'Case_Folding';
+ }
+ elsif ($full_name eq 'Simple_Case_Folding') {
+
+ # This property has everything in the regular array, and the
+ # specials are superfluous.
+ undef $specials_ref if $full_name ne 'Case_Folding';
+ }
+
+ # Here, in $official, we have what the file looks like, or should like
+ # if we've had to fix it up. Now take the invmap() output and reverse
+ # engineer from that what the file should look like. Each iteration
+ # appends the next line to the running string.
+ my $tested_map = "";
+
+ # Create a copy of the file's specials hash. (It has been undef'd if
+ # we know it isn't relevant to this property, so if it exists, it's an
+ # error or is relevant). As we go along, we delete from that copy.
+ # If a delete fails, or something is left over after we are done,
+ # it's an error
+ my %specials = %$specials_ref if $specials_ref;
+
+ # The extra -1 is because the final element has been tested above to
+ # be for anything above Unicode. The file doesn't go that high.
+ for my $i (0 .. @$invlist_ref - 1 - 1) {
+
+ # If the map element is a reference, have to stringify it (but
+ # don't do so if the format doesn't allow references, so that an
+ # improper format will generate an error.
+ if (ref $invmap_ref->[$i]
+ && ($format eq 'd' || $format =~ /^ . l /x))
+ {
+ # The stringification depends on the format. At the time of
+ # this writing, all 'sl' formats are space separated.
+ if ($format eq 'sl') {
+ $invmap_ref->[$i] = join " ", @{$invmap_ref->[$i]};
+ }
+ elsif ($format =~ / ^ cl e? $/x) {
+
+ # For a cl property, the stringified result should be in
+ # the specials hash. The key is the packed code point,
+ # and the value is the packed map.
+ my $value;
+ if (! defined ($value = delete $specials{pack("C0U", $invlist_ref->[$i]) })) {
+ fail("prop_invmap('$mod_prop')");
+ diag(sprintf "There was no specials element for %04X", $invlist_ref->[$i]);
+ next PROPERTY;
+ }
+ my $packed = pack "U*", @{$invmap_ref->[$i]};
+ if ($value ne $packed) {
+ fail("prop_invmap('$mod_prop')");
+ diag(sprintf "For %04X, expected the mapping to be '$packed', but got '$value'");
+ next PROPERTY;
+ }
+
+ # As this doesn't get tested when we later compare with
+ # the actual file, it could be out of order and we
+ # wouldn't know it.
+ if (($i > 0 && $invlist_ref->[$i] <= $invlist_ref->[$i-1])
+ || $invlist_ref->[$i] >= $invlist_ref->[$i+1])
+ {
+ fail("prop_invmap('$mod_prop')");
+ diag(sprintf "Range beginning at %04X is out-of-order.", $invlist_ref->[$i]);
+ next PROPERTY;
+ }
+ next;
+ }
+ elsif ($format eq 'd') {
+
+ # The decomposition mapping file has the code points as
+ # a string of space-separated hex constants.
+ $invmap_ref->[$i] = join " ", map { sprintf "%04X", $_ } @{$invmap_ref->[$i]};
+ }
+ else {
+ fail("prop_invmap('$mod_prop')");
+ diag("Can't handle format '$format'");
+ next PROPERTY;
+ }
+ }
+ elsif ($format eq 'cle' && $invmap_ref->[$i] eq "") {
+
+ # cle properties have maps to the empty string that also
+ # should be in the specials hash, with the key the packed code
+ # point, and the map just empty.
+ my $value;
+ if (! defined ($value = delete $specials{pack("C0U", $invlist_ref->[$i]) })) {
+ fail("prop_invmap('$mod_prop')");
+ diag(sprintf "There was no specials element for %04X", $invlist_ref->[$i]);
+ next PROPERTY;
+ }
+ if ($value ne "") {
+ fail("prop_invmap('$mod_prop')");
+ diag(sprintf "For %04X, expected the mapping to be \"\", but got '$value'", $invlist_ref->[$i]);
+ next PROPERTY;
+ }
+
+ # As this doesn't get tested when we later compare with
+ # the actual file, it could be out of order and we
+ # wouldn't know it.
+ if (($i > 0 && $invlist_ref->[$i] <= $invlist_ref->[$i-1])
+ || $invlist_ref->[$i] >= $invlist_ref->[$i+1])
+ {
+ fail("prop_invmap('$mod_prop')");
+ diag(sprintf "Range beginning at %04X is out-of-order.", $invlist_ref->[$i]);
+ next PROPERTY;
+ }
+ next;
+ }
+ elsif ($is_binary) { # These binary files don't have an explicit Y
+ $invmap_ref->[$i] =~ s/Y//;
+ }
+
+ # The file doesn't include entries that map to $missing, so don't
+ # include it in the built-up string. But make sure that it is in
+ # the correct order in the input.
+ if ($invmap_ref->[$i] eq $missing) {
+ if (($i > 0 && $invlist_ref->[$i] <= $invlist_ref->[$i-1])
+ || $invlist_ref->[$i] >= $invlist_ref->[$i+1])
+ {
+ fail("prop_invmap('$mod_prop')");
+ diag(sprintf "Range beginning at %04X is out-of-order.", $invlist_ref->[$i]);
+ next PROPERTY;
+ }
+ next;
+ }
+
+ # 'c'-type and 'd' properties have the mapping expressed in hex in
+ # the file
+ if ($format =~ /^ [cd] /x) {
+
+ # The d property has one entry which isn't in the file.
+ # Ignore it, but make sure it is in order.
+ if ($format eq 'd'
+ && $invmap_ref->[$i] eq '<hangul syllable>'
+ && $invlist_ref->[$i] == 0xAC00)
+ {
+ if (($i > 0 && $invlist_ref->[$i] <= $invlist_ref->[$i-1])
+ || $invlist_ref->[$i] >= $invlist_ref->[$i+1])
+ {
+ fail("prop_invmap('$mod_prop')");
+ diag(sprintf "Range beginning at %04X is out-of-order.", $invlist_ref->[$i]);
+ next PROPERTY;
+ }
+ next;
+ }
+ $invmap_ref->[$i] = sprintf("%04X", $invmap_ref->[$i])
+ if $invmap_ref->[$i] =~ / ^ [A-Fa-f0-9]+ $/x;
+ }
+
+ # Finally have figured out what the map column in the file should
+ # be. Append the line to the running string.
+ my $start = $invlist_ref->[$i];
+ my $end = $invlist_ref->[$i+1] - 1;
+ $end = ($start == $end) ? "" : sprintf("%04X", $end);
+ if ($invmap_ref->[$i] ne "") {
+ $tested_map .= sprintf "%04X\t%s\t%s\n", $start, $end, $invmap_ref->[$i];
+ }
+ elsif ($end ne "") {
+ $tested_map .= sprintf "%04X\t%s\n", $start, $end;
+ }
+ else {
+ $tested_map .= sprintf "%04X\n", $start;
+ }
+ } # End of looping over all elements.
+
+ # Here are done with generating what the file should look like
+
+ chomp $tested_map;
+
+ # And compare.
+ if ($tested_map ne $official) {
+ fail_with_diff($mod_prop, $official, $tested_map, "prop_invmap");
+ next PROPERTY;
+ }
+
+ # There shouldn't be any specials unaccounted for.
+ if (keys %specials) {
+ fail("prop_invmap('$mod_prop')");
+ diag("Unexpected specials: " . join ", ", keys %specials);
+ next PROPERTY;
+ }
+ }
+ elsif ($format eq 'n') {
+
+ # Handle the Name property similar to the above. But the file is
+ # sufficiently different that it is more convenient to make a special
+ # case for it.
+
+ if ($missing ne "") {
+ fail("prop_invmap('$mod_prop')");
+ diag("The missings should be \"\"; got \"missing\"");
+ next PROPERTY;
+ }
+
+ $official = do "unicore/Name.pl";
+
+ # Get rid of the named sequences portion of the file. These don't
+ # have a tab before the first blank on a line.
+ $official =~ s/ ^ [^\t]+ \ .*? \n //xmg;
+
+ # And get rid of the controls. These are named in the file, but
+ # shouldn't be in the property.
+ $official =~ s/ 00000 \t .* 0001F .*? \n//xs;
+ $official =~ s/ 0007F \t .* 0009F .*? \n//xs;
+
+ # This is slow; it gets rid of the aliases. We look for lines that
+ # are for the same code point as the previous line. The previous line
+ # will be a name_alias; and the current line will be the name. Get
+ # rid of the name_alias line. This won't work if there are multiple
+ # aliases for a given name.
+ my @temp_names = split "\n", $official;
+ my $previous_cp = "";
+ for (my $i = 0; $i < @temp_names - 1; $i++) {
+ $temp_names[$i] =~ /^ (.*)? \t /x;
+ my $current_cp = $1;
+ if ($current_cp eq $previous_cp) {
+ splice @temp_names, $i - 1, 1;
+ redo;
+ }
+ else {
+ $previous_cp = $current_cp;
+ }
+ }
+ $official = join "\n", @temp_names;
+ undef @temp_names;
+ chomp $official;
+
+ # Here have adjusted the file. We also have to adjust the returned
+ # inversion map by checking and deleting all the lines in it that
+ # won't be in the file. These are the lines that have generated
+ # things, like <hangul syllable>.
+ my $tested_map = ""; # Current running string
+ my @code_point_in_names =
+ @Unicode::UCD::code_points_ending_in_code_point;
+
+ for my $i (0 .. @$invlist_ref - 1 - 1) {
+ my $start = $invlist_ref->[$i];
+ my $end = $invlist_ref->[$i+1] - 1;
+ if ($invmap_ref->[$i] eq $missing) {
+ if (($i > 0 && $invlist_ref->[$i] <= $invlist_ref->[$i-1])
+ || $invlist_ref->[$i] >= $invlist_ref->[$i+1])
+ {
+ fail("prop_invmap('$mod_prop')");
+ diag(sprintf "Range beginning at %04X is out-of-order.", $invlist_ref->[$i]);
+ next PROPERTY;
+ }
+ next;
+ }
+ if ($invmap_ref->[$i] =~ / (.*) ( < .*? > )/x) {
+ my $name = $1;
+ my $type = $2;
+ if (($i > 0 && $invlist_ref->[$i] <= $invlist_ref->[$i-1])
+ || $invlist_ref->[$i] >= $invlist_ref->[$i+1])
+ {
+ fail("prop_invmap('$mod_prop')");
+ diag(sprintf "Range beginning at %04X is out-of-order.", $invlist_ref->[$i]);
+ next PROPERTY;
+ }
+ if ($type eq "<hangul syllable>") {
+ if ($name ne "") {
+ fail("prop_invmap('$mod_prop')");
+ diag("Unexpected text in $invmap_ref->[$i]");
+ next PROPERTY;
+ }
+ if ($start != 0xAC00) {
+ fail("prop_invmap('$mod_prop')");
+ diag(sprintf("<hangul syllables> should begin at 0xAC00, got %04X", $start));
+ next PROPERTY;
+ }
+ if ($end != $start + 11172 - 1) {
+ fail("prop_invmap('$mod_prop')");
+ diag(sprintf("<hangul syllables> should end at %04X, got %04X", $start + 11172 -1, $end));
+ next PROPERTY;
+ }
+ }
+ elsif ($type ne "<code point>") {
+ fail("prop_invmap('$mod_prop')");
+ diag("Unexpected text '$type' in $invmap_ref->[$i]");
+ next PROPERTY;
+ }
+ else {
+
+ # Look through the array of names that end in code points,
+ # and look for this start and end. If not found is an
+ # error. If found, delete it, and at the end, make sure
+ # have deleted everything.
+ for my $i (0 .. @code_point_in_names - 1) {
+ my $hash = $code_point_in_names[$i];
+ if ($hash->{'low'} == $start
+ && $hash->{'high'} == $end
+ && "$hash->{'name'}-" eq $name)
+ {
+ splice @code_point_in_names, $i, 1;
+ last;
+ }
+ else {
+ fail("prop_invmap('$mod_prop')");
+ diag("Unexpected code-point-in-name line '$invmap_ref->[$i]'");
+ next PROPERTY;
+ }
+ }
+ }
+
+ next;
+ }
+
+ # Have adjusted the map, as needed. Append to running string.
+ $end = ($start == $end) ? "" : sprintf("%05X", $end);
+ $tested_map .= sprintf "%05X\t%s\n", $start, $invmap_ref->[$i];
+ }
+
+ # Finished creating the string from the inversion map. Can compare
+ # with what the file is.
+ chomp $tested_map;
+ if ($tested_map ne $official) {
+ fail_with_diff($mod_prop, $official, $tested_map, "prop_invmap");
+ next PROPERTY;
+ }
+ if (@code_point_in_names) {
+ fail("prop_invmap('$mod_prop')");
+ use Data::Dumper;
+ diag("Missing code-point-in-name line(s)" . Dumper \@code_point_in_names);
+ next PROPERTY;
+ }
+ }
+ elsif ($format eq 's' || $format eq 'r') {
+
+ # Here the map is not more or less directly from a file stored on
+ # disk. We try a different tack. These should all be properties that
+ # have just a few possible values (most of them are binary). We go
+ # through the map list, sorting each range into buckets, one for each
+ # map value. Thus for binary properties there will be a bucket for Y
+ # and one for N. The buckets are inversion lists. We compare each
+ # constructed inversion list with what we would get for it using
+ # prop_invlist(), which has already been tested. If they all match,
+ # the whole map must have matched.
+ my %maps;
+ my $previous_map;
+
+ # (The extra -1 is to not look at the final element in the loop, which
+ # we know is the one that starts just beyond Unicode and goes to
+ # infinity.)
+ for my $i (0 .. @$invlist_ref - 1 - 1) {
+ my $range_start = $invlist_ref->[$i];
+
+ # Because we are sorting into buckets, things could be
+ # out-of-order here, and still be in the correct order in the
+ # bucket, and hence wouldn't show up as an error; so have to
+ # check.
+ if (($i > 0 && $range_start <= $invlist_ref->[$i-1])
+ || $range_start >= $invlist_ref->[$i+1])
+ {
+ fail("prop_invmap('$mod_prop')");
+ diag(sprintf "Range beginning at %04X is out-of-order.", $invlist_ref->[$i]);
+ next PROPERTY;
+ }
+
+ # This new range closes out the range started in the previous
+ # iteration.
+ push @{$maps{$previous_map}}, $range_start if defined $previous_map;
+
+ # And starts a range which will be closed in the next iteration.
+ $previous_map = $invmap_ref->[$i];
+ push @{$maps{$previous_map}}, $range_start;
+ }
+
+ # The range we just started hasn't been closed, and we didn't look at
+ # the final element of the loop. If that range is for the default
+ # value, it shouldn't be closed, as it is to extend to infinity. But
+ # otherwise, it should end at the final Unicode code point, and the
+ # list that maps to the default value should have another element that
+ # does go to infinity for every above Unicode code point.
+
+ if (@$invlist_ref > 1) {
+ my $penultimate_map = $invmap_ref->[-2];
+ if ($penultimate_map ne $missing) {
+
+ # The -1th element contains the first non-Unicode code point.
+ push @{$maps{$penultimate_map}}, $invlist_ref->[-1];
+ push @{$maps{$missing}}, $invlist_ref->[-1];
+ }
+ }
+
+ # Here, we have the buckets (inversion lists) all constructed. Go
+ # through each and verify that matches what prop_invlist() returns.
+ # We could use is_deeply() for the comparison, but would get multiple
+ # messages for each $prop.
+ foreach my $map (keys %maps) {
+ my @off_invlist = prop_invlist("$prop = $map");
+ my $min = (@off_invlist >= @{$maps{$map}})
+ ? @off_invlist
+ : @{$maps{$map}};
+ for my $i (0 .. $min- 1) {
+ if ($i > @off_invlist - 1) {
+ fail("prop_invmap('$mod_prop')");
+ diag("There is no element [$i] for $prop=$map from prop_invlist(), while [$i] in the implicit one constructed from prop_invmap() is '$maps{$map}[$i]'");
+ next PROPERTY;
+ }
+ elsif ($i > @{$maps{$map}} - 1) {
+ fail("prop_invmap('$mod_prop')");
+ diag("There is no element [$i] from the implicit $prop=$map constructed from prop_invmap(), while [$i] in the one from prop_invlist() is '$off_invlist[$i]'");
+ next PROPERTY;
+ }
+ elsif ($maps{$map}[$i] ne $off_invlist[$i]) {
+ fail("prop_invmap('$mod_prop')");
+ diag("Element [$i] of the implicit $prop=$map constructed from prop_invmap() is '$maps{$map}[$i]', and the one from prop_invlist() is '$off_invlist[$i]'");
+ next PROPERTY;
+ }
+ }
+ }
+ }
+ else { # Don't know this property nor format.
+
+ fail("prop_invmap('$mod_prop')");
+ diag("Unknown format '$format'");
+ }
+
+ pass("prop_invmap('$mod_prop')");
+}
+
done_testing();