summaryrefslogtreecommitdiff
path: root/lib/Unicode/UCD.pm
diff options
context:
space:
mode:
Diffstat (limited to 'lib/Unicode/UCD.pm')
-rw-r--r--lib/Unicode/UCD.pm976
1 files changed, 976 insertions, 0 deletions
diff --git a/lib/Unicode/UCD.pm b/lib/Unicode/UCD.pm
index ef46c29e21..09ea439919 100644
--- a/lib/Unicode/UCD.pm
+++ b/lib/Unicode/UCD.pm
@@ -26,6 +26,7 @@ our @EXPORT_OK = qw(charinfo
prop_aliases
prop_value_aliases
prop_invlist
+ prop_invmap
MAX_CP
);
@@ -75,6 +76,10 @@ Unicode::UCD - Unicode character database
use Unicode::UCD 'prop_invlist';
my @puncts = prop_invlist("gc=punctuation");
+ use Unicode::UCD 'prop_invmap';
+ my ($list_ref, $map_ref, $format, $missing)
+ = prop_invmap("General Category");
+
use Unicode::UCD 'compexcl';
my $compexcl = compexcl($codepoint);
@@ -657,6 +662,9 @@ as the keys, and the code point ranges (see L</charblock()>) as the values.
The names are in the old-style (see L</Old-style versus new-style block
names>).
+L<prop_invmap("block")|/prop_invmap()> can be used to get this same data in a
+different type of data structure.
+
See also L</Blocks versus Scripts>.
=cut
@@ -676,6 +684,9 @@ charscripts() returns a reference to a hash with the known script
names as the keys, and the code point ranges (see L</charscript()>) as
the values.
+L<prop_invmap("script")|/prop_invmap()> can be used to get this same data in a
+different type of data structure.
+
See also L</Blocks versus Scripts>.
=cut
@@ -2033,6 +2044,971 @@ sub prop_invlist ($) {
return @invlist;
}
+sub _search_invlist {
+ # Find the range in the inversion list which contains a code point; that
+ # is, find i such that l[i] <= code_point < l[i+1]
+
+ # If this is ever made public, could use to speed up .t specials. Would
+ # need to use code point argument, as in other functions in this pm
+
+ my $list_ref = shift;
+ my $code_point = shift;
+ # Verify non-neg numeric XXX
+
+ my $max_element = @$list_ref - 1;
+ return if ! $max_element < 0; # Undef if list is empty.
+
+ # Short cut something at the far-end of the table. This also allows us to
+ # refer to element [$i+1] without fear of being out-of-bounds in the loop
+ # below.
+ return $max_element if $code_point >= $list_ref->[$max_element];
+
+ use integer; # want integer division
+
+ my $i = $max_element / 2;
+
+ my $lower = 0;
+ my $upper = $max_element;
+ while (1) {
+
+ if ($code_point >= $list_ref->[$i]) {
+
+ # Here we have met the lower constraint. We can quit if we
+ # also meet the upper one.
+ last if $code_point < $list_ref->[$i+1];
+
+ $lower = $i; # Still too low.
+
+ }
+ else {
+
+ # Here, $code_point < $list_ref[$i], so look lower down.
+ $upper = $i;
+ }
+
+ # Split search domain in half to try again.
+ my $temp = ($upper + $lower) / 2;
+
+ # No point in continuing unless $i changes for next time
+ # in the loop.
+ return $i if $temp == $i;
+ $i = $temp;
+ } # End of while loop
+
+ # Here we have found the offset
+ return $i;
+}
+
+=pod
+
+=head2 B<prop_invmap()>
+
+ use Unicode::UCD 'prop_invmap';
+ my ($list_ref, $map_ref, $format, $missing)
+ = prop_invmap("General Category");
+
+C<prop_invmap> is used to get the complete mapping definition for a property,
+in the form of an inversion map. An inversion map consists of two parallel
+arrays. One is an ordered list of code points that mark range beginnings, and
+the other gives the value (or mapping) that all code points in the
+corresponding range have.
+
+C<prop_invmap> is called with the name of the desired property. The name is
+loosely matched, meaning that differences in case, white-space, hyphens, and
+underscores are not meaningful (except for the trailing underscore in the
+old-form grandfathered-in property C<"L_">, which is better written as C<"LC">,
+or even better, C<"Gc=LC">).
+
+Many Unicode properties have more than one name (or alias). C<prop_invmap>
+understands all of these, including Perl extensions to them. Ambiguities are
+resolved as described above for L</prop_aliases()>. The Perl internal
+property "Perl_Decimal_Digit, described below, is also accepted. C<undef> is
+returned if the property name is unknown.
+
+It is a fatal error to call this function except in list context.
+
+In addition to the the two arrays that form the inversion map, C<prop_invmap>
+returns two other values; one is a scalar that gives some details as to the
+format of the entries of the map array; the other is used for specialized
+purposes, described at the end of this section.
+
+This means that C<prop_invmap> returns a 4 element list. For example,
+
+ my ($blocks_ranges_ref, $blocks_maps_ref, $format, $default)
+ = prop_invmap("Block");
+
+In this call, the two arrays will be populated as shown below (for Unicode
+6.0):
+
+ Index @blocks_ranges @blocks_maps
+ 0 0x0000 Basic Latin
+ 1 0x0080 Latin-1 Supplement
+ 2 0x0100 Latin Extended-A
+ 3 0x0180 Latin Extended-B
+ 4 0x0250 IPA Extensions
+ 5 0x02B0 Spacing Modifier Letters
+ 6 0x0300 Combining Diacritical Marks
+ 7 0x0370 Greek and Coptic
+ 8 0x0400 Cyrillic
+ ...
+ 233 0x2B820 No_Block
+ 234 0x2F800 CJK Compatibility Ideographs Supplement
+ 235 0x2FA20 No_Block
+ 236 0xE0000 Tags
+ 237 0xE0080 No_Block
+ 238 0xE0100 Variation Selectors Supplement
+ 239 0xE01F0 No_Block
+ 240 0xF0000 Supplementary Private Use Area-A
+ 241 0x100000 Supplementary Private Use Area-B
+ 242 0x110000 No_Block
+
+The first line (with Index [0]) means that the value for code point 0 is "Basic
+Latin". The entry "0x0080" in the @blocks_ranges column in the second line
+means that the value from the first line, "Basic Latin", extends to all code
+points in the range from 0 up to but not including 0x0080, that is, through
+255. In other words, the code points from 0 to 255 are all in the "Basic
+Latin" block. Similarly, all code points in the range from 0x0080 up to (but
+not including) 0x0100 are in the block named "Latin-1 Supplement", etc.
+(Notice that the return is the old-style block names; see L</Old-style versus
+new-style block names>).
+
+The final line (with Index [242]) means that the value for all code points above
+the legal Unicode maximum code point have the value "No_Block", which is the
+term Unicode uses for a non-existing block.
+
+The arrays completely specify the mappings for all possible code points.
+The final element in an inversion map returned by this function will always be
+for the range that consists of all the code points that aren't legal Unicode,
+but that are expressible on the platform. (That is, it starts with code point
+0x110000, the first code point above the legal Unicode maximum, and extends to
+infinity.) The value for that range will be the same that any typical
+unassigned code point has for the specified property. (Certain unassigned
+code points are not "typical"; for example the non-character code points, or
+those in blocks that are to be written right-to-left. The above-Unicode
+range's value is not based on these atypical code points.) It could be argued
+that, instead of treating these as unassigned Unicode code points, the value
+for this range should be C<undef>. If you wish, you can change the returned
+arrays accordingly.
+
+The maps are almost always simple scalars that should be interpreted as-is.
+These values are those given in the Unicode-supplied data files, which may be
+inconsistent as to capitalization and as to which synonym for a property-value
+is given. The results may be normalized by using the L</prop_value_aliases()>
+function.
+
+There are exceptions to the simple scalar maps. Some properties have some
+elements in their map list that are themselves lists of scalars; and some
+special strings are returned that are not to be interpreted as-is. Element
+[2] (placed into C<$format> in the example above) of the returned four element
+list tells you if the map has any of these special elements, as follows:
+
+=over
+
+=item C<s>
+
+means all the elements of the map array are simple scalars, with no special
+elements. Almost all properties are like this, like the C<block> example
+above.
+
+=item C<sl>
+
+means that some of the map array elements have the form given by C<s>, and
+the rest are lists of scalars. For example, here is a portion of the output
+of calling C<prop_invmap>() with the "Script Extensions" property:
+
+ @scripts_ranges @scripts_maps
+ ...
+ 0x0953 Deva
+ 0x0964 [ Beng Deva Guru Orya ]
+ 0x0966 Deva
+ 0x0970 Common
+
+Here, the code points 0x964 and 0x965 are used in the Bengali,
+Devanagari, Gurmukhi, and Oriya scripts.
+
+=item C<r>
+
+means that all the elements of the map array are either rational numbers or
+the string C<"NaN">, meaning "Not a Number". A rational number is either an
+integer, or two integers separated by a solidus (C<"/">). The second integer
+represents the denominator of the division implied by the solidus, and is
+guaranteed not to be 0. If you want to convert them to scalar numbers, you
+can use something like this:
+
+ my ($invlist_ref, $invmap_ref, $format) = prop_invmap($property);
+ if ($format && $format eq "r") {
+ map { $_ = eval $_ } @$invmap_ref;
+ }
+
+Here's some entries from the output of the property "Nv", which has format
+C<"r">.
+
+ @numerics_ranges @numerics_maps Note
+ 0x00 "NaN"
+ 0x30 0 DIGIT 0
+ 0x31 1
+ 0x32 2
+ ...
+ 0x37 7
+ 0x38 8
+ 0x39 9 DIGIT 9
+ 0x3A "NaN"
+ 0xB2 2 SUPERSCRIPT 2
+ 0xB3 3 SUPERSCRIPT 2
+ 0xB4 "NaN"
+ 0xB9 1 SUPERSCRIPT 1
+ 0xBA "NaN"
+ 0xBC 1/4 VULGAR FRACTION 1/4
+ 0xBD 1/2 VULGAR FRACTION 1/2
+ 0xBE 3/4 VULGAR FRACTION 3/4
+ 0xBF "NaN"
+ 0x660 0 ARABIC-INDIC DIGIT ZERO
+
+=item C<c>
+
+is like C<s> in that all the map array elements are scalars, but some of them
+are the special string S<C<"E<lt>code pointE<gt>">>, meaning that the map of
+each code point in the corresponding range in the inversion list is the code
+point itself. For example, in:
+
+ my ($uppers_ranges_ref, $uppers_maps_ref, $format)
+ = prop_invmap("Simple_Uppercase_Mapping");
+
+the returned arrays look like this:
+
+ @$uppers_ranges_ref @$uppers_maps_ref Note
+ 0 "<code point>"
+ 97 65 'a' maps to 'A'
+ 98 66 'b' => 'B'
+ 99 67 'c' => 'C'
+ ...
+ 120 88 'x' => 'X'
+ 121 89 'y' => 'Y'
+ 122 90 'z' => 'Z'
+ 123 "<code point>"
+ 181 924 MICRO SIGN => Greek Cap MU
+ 182 "<code point>"
+ ...
+
+The first line means that the uppercase of code point 0 is 0;
+the uppercase of code point 1 is 1; ... of code point 96 is 96. Without the
+C<"E<lt>code_pointE<gt>"> notation, every code point would have to have an
+entry. This would mean that the arrays would each have more than a million
+entries to list just the legal Unicode code points!
+
+=item C<cl>
+
+means that some of the map array elements have the form given by C<c>, and
+the rest are ordered lists of code points.
+For example, in:
+
+ my ($uppers_ranges_ref, $uppers_maps_ref, $format)
+ = prop_invmap("Uppercase_Mapping");
+
+the returned arrays look like this:
+
+ @$uppers_ranges_ref @$uppers_maps_ref
+ 0 "<code point>"
+ 97 65
+ ...
+ 122 90
+ 123 "<code point>"
+ 181 924
+ 182 "<code point>"
+ ...
+ 0x0149 [ 0x02BC 0x004E ]
+ 0x014A "<code point>"
+ 0x014B 0x014A
+ ...
+
+This is the full Uppercase_Mapping property (as opposed to the
+Simple_Uppercase_Mapping given in the example for format C<"c">). The only
+difference between the two in the ranges shown is that the code point at
+0x0149 (LATIN SMALL LETTER N PRECEDED BY APOSTROPHE) maps to a string of two
+characters, 0x02BC (MODIFIER LETTER APOSTROPHE) followed by 0x004E (LATIN
+CAPITAL LETTER N).
+
+=item C<cle>
+
+means that some of the map array elements have the forms given by C<cl>, and
+the rest are the empty string. The property C<NFKC_Casefold> has this form.
+An example slice is:
+
+ @$ranges_ref @$maps_ref Note
+ ...
+ 0x00AA 0x0061 FEMININE ORDINAL INDICATOR => 'a'
+ 0x00AB <code point>
+ 0x00AD SOFT HYPHEN => ""
+ 0x00AE <code point>
+ 0x00AF [ 0x0020, 0x0304 ] MACRON => SPACE . COMBINING MACRON
+ 0x00B0 <code point>
+ ...
+
+=item C<n>
+
+means the Name property. All the elements of the map array are simple
+scalars, but some of them contain special strings that require more work to
+get the actual name.
+
+Entries such as:
+
+ CJK UNIFIED IDEOGRAPH-<code point>
+
+mean that the name for the code point is "CJK UNIFIED IDEOGRAPH-"
+with the code point (expressed in hexadecimal) appended to it, like "CJK
+UNIFIED IDEOGRAPH-3403" (similarly for C<CJK COMPATIBILITY IDEOGRAPH-E<lt>code
+pointE<gt>>).
+
+Also, entries like
+
+ <hangul syllable>
+
+means that the name is algorithmically calculated. This is easily done by
+the function L<charnames/charnames::viacode(code)>.
+
+Note that for control characters (C<Gc=cc>), Unicode's data files have the
+string "C<E<lt>controlE<gt>>", but the real name of each of these characters is the empty
+string. This function returns that real name, the empty string.
+
+=item C<d>
+
+means the Decomposition_Mapping property. This property is like C<cl>
+properties, except it has an additional entry type:
+
+ <hangul syllable>
+
+for those code points whose decomposition is algorithmically calculated. (The
+C<n> format has this same entry.) These can be generated via the function
+L<Unicode::Normalize::NFD()|Unicode::Normalize>.
+
+
+Note that the mapping is the one that is specified in the Unicode data files,
+and to get the final decomposition, it may need to be applied recursively.
+
+=back
+
+A binary search can be used to quickly find a code point in the inversion
+list, and hence its corresponding mapping.
+
+The final element (index [3], assigned to C<$default> in the "block" example) in
+the four element list returned by this function may be useful for applications
+that wish to convert the returned inversion map data structure into some
+other, such as a hash. It gives the mapping that most code points map to
+under the property. If you establish the convention that any code point not
+explicitly listed in your data structure maps to this value, you can
+potentially make your data structure much smaller. As you construct your data
+structure from the one returned by this function, simply ignore those ranges
+that map to this value, generally called the "default" value. For example, to
+convert to the data structure searchable by L</charinrange()>, you can follow
+this recipe:
+
+ my ($list_ref, $map_ref, $format, $missing) = prop_invmap($property);
+ my @range_list;
+ for my $i (0 .. @$list_ref - 2) {
+ next if $map_ref->[$i] eq $missing;
+ push @range_list, [ $list_ref->[$i],
+ $list_ref->[$i+1],
+ $map_ref->[$i]
+ ];
+ }
+
+ print charinrange(\@range_list, $code_point), "\n";
+
+
+With this, C<charinrange()> will return C<undef> if its input code point maps
+to C<$missing>. You can avoid this by omitting the C<next> statement, and adding
+a line after the loop to handle the final element of the inversion map.
+
+One internal Perl property is accessible by this function.
+"Perl_Decimal_Digit" returns an inversion map in which all the Unicode decimal
+digits map to their numeric values, and everything else to the empty string,
+like so:
+
+ @digits @values
+ 0x0000 ""
+ 0x0030 0
+ 0x0031 1
+ 0x0032 2
+ 0x0033 3
+ 0x0034 4
+ 0x0035 5
+ 0x0036 6
+ 0x0037 7
+ 0x0038 8
+ 0x0039 9
+ 0x003A ""
+ 0x0660 0
+ 0x0661 1
+ ...
+
+Note that the inversion maps returned for the C<Case_Folding> and
+C<Simple_Case_Folding> properties do not include the Turkic-locale mappings.
+Use L</casefold()> for these.
+
+The C<Name_Alias> property is potentially undergoing signficant revision by
+Unicode at the time of this writing. The format of the values returned for it
+may change substantially in future Unicode versions.
+
+C<prop_invmap> does not know about any user-defined properties, and will
+return C<undef> if called with one of those.
+
+=cut
+
+# User-defined properties could be handled with some changes to utf8_heavy.pl;
+# if done, consideration should be given to the fact that the user subroutine
+# could return different results with each call, which could lead to some
+# security issues.
+
+# One could store things in memory so they don't have to be recalculated, but
+# it is unlikely this will be called often, and some properties would take up
+# significant memory.
+
+# These are created by mktables for this routine and stored in unicore/UCD.pl
+# where their structures are described.
+our @algorithmic_named_code_points;
+our $HANGUL_BEGIN;
+our $HANGUL_COUNT;
+
+sub prop_invmap ($) {
+
+ croak __PACKAGE__, "::prop_invmap: must be called in list context" unless wantarray;
+
+ my $prop = $_[0];
+ return unless defined $prop;
+
+ # Fail internal properties
+ return if $prop =~ /^_/;
+
+ # The values returned by this function.
+ my (@invlist, @invmap, $format, $missing);
+
+ # The swash has two components we look at, the base list, and a hash,
+ # named 'SPECIALS', containing any additional members whose mappings don't
+ # fit into the the base list scheme of things. These generally 'override'
+ # any value in the base list for the same code point.
+ my $overrides;
+
+ require "utf8_heavy.pl";
+ require "unicore/UCD.pl";
+
+RETRY:
+
+ # Try to get the map swash for the property. They have 'To' prepended to
+ # the property name, and 32 means we will accept 32 bit return values.
+ my $swash = utf8::SWASHNEW(__PACKAGE__, "To$prop", undef, 32, 0);
+
+ # If didn't find it, could be because needs a proxy. And if was the
+ # 'Block' or 'Name' property, use a proxy even if did find it. Finding it
+ # would be the result of the installation changing mktables to output the
+ # Block or Name tables. The Block table gives block names in the
+ # new-style, and this routine is supposed to return old-style block names.
+ # The Name table is valid, but we need to execute the special code below
+ # to add in the algorithmic-defined name entries.
+ if (ref $swash eq ""
+ || $swash->{'TYPE'} eq 'ToBlk'
+ || $swash->{'TYPE'} eq 'ToNa')
+ {
+
+ # Get the short name of the input property, in standard form
+ my ($second_try) = prop_aliases($prop);
+ return unless $second_try;
+ $second_try = utf8::_loose_name(lc $second_try);
+
+ if ($second_try eq "in") {
+
+ # This property is identical to age for inversion map purposes
+ $prop = "age";
+ goto RETRY;
+ }
+ elsif ($second_try eq 'scf') {
+
+ # This property uses just the LIST part of cf which includes the
+ # simple folds that are otherwise overridden by the SPECIALS. So
+ # all we need do is to not look at the SPECIALS; set $overrides to
+ # indicate that
+ $overrides = -1;
+ $prop = "cf";
+ goto RETRY;
+ }
+ elsif ($second_try =~ / ^ s[ltu]c $ /x) {
+
+ # Because some applications may be reading the full mapping
+ # equivalent files directly, they haven't been changed to include
+ # the simple mappings as well, as was done with the cf file (which
+ # doesn't have those backward compatibility issues) in 5.14.
+ # Instead, separate internal-only files were created that
+ # contain just the simple mappings that get overridden by the
+ # SPECIALS. Thus, these simple case mappings use the LIST part of
+ # their full mapping equivalents; plus the ones that are in those
+ # additional files. These special files are used by other
+ # functions in this module, so use the same hashes that those
+ # functions use.
+ my $file;
+ if ($second_try eq "suc") {
+ $file = '_suc.pl';
+ $overrides = \%SIMPLE_UPPER;
+ }
+ elsif ($second_try eq "slc") {
+ $file = '_slc.pl';
+ $overrides = \%SIMPLE_LOWER;
+ }
+ else {
+ $file = '_stc.pl';
+ $overrides = \%SIMPLE_TITLE;
+ }
+
+ # The files are already handled by the _read_table() function.
+ # Don't read them in if already done.
+ %$overrides =_read_table("unicore/To/$file", 'use_hash')
+ unless %$overrides;
+
+ # Convert to the full mapping name, and go handle that; e.g.,
+ # suc => uc.
+ $prop = $second_try =~ s/^s//r;
+ goto RETRY;
+ }
+ elsif ($second_try eq "blk") {
+
+ # We use the old block names. Just create a fake swash from its
+ # data.
+ _charblocks();
+ my %blocks;
+ $blocks{'LIST'} = "";
+ $blocks{'TYPE'} = "ToBlk";
+ $utf8::SwashInfo{ToBlk}{'missing'} = "No_Block";
+ $utf8::SwashInfo{ToBlk}{'format'} = "s";
+
+ foreach my $block (@BLOCKS) {
+ $blocks{'LIST'} .= sprintf "%x\t%x\t%s\n",
+ $block->[0],
+ $block->[1],
+ $block->[2];
+ }
+ $swash = \%blocks;
+ }
+ elsif ($second_try eq "na") {
+
+ # Use the combo file that has all the Name-type properties in it,
+ # extracting just the ones that are for the actual 'Name'
+ # property. And create a fake swash from it.
+ my %names;
+ $names{'LIST'} = "";
+ my $original = do "unicore/Name.pl";
+ my $previous_hex_code_point = "";
+ my $algorithm_names = \@algorithmic_named_code_points;
+
+ # We hold off on adding the next entry to the list until we know,
+ # that the next line isn't for the same code point. We only
+ # output the final line. That one is the original Name property
+ # value. The others are the Name_Alias corrections, which are
+ # listed first in the file.
+ my $staging = "";
+
+ my $i = 0;
+ foreach my $line (split "\n", $original) {
+ my ($hex_code_point, $name) = split "\t", $line;
+
+ # Weeds out all comments, blank lines, and named sequences
+ next if $hex_code_point =~ /\P{ASCII_HEX_DIGIT}/;
+
+ my $code_point = hex $hex_code_point;
+
+ # The name of all controls is the default: the empty string.
+ # The set of controls is immutable, so these hard-coded
+ # constants work.
+ next if $code_point <= 0x9F
+ && ($code_point <= 0x1F || $code_point >= 0x7F);
+
+ # Output the last iteration's result, but only output the
+ # final name if a code point has more than one.
+ $names{'LIST'} .= $staging
+ if $hex_code_point ne $previous_hex_code_point;
+
+ # If we are beyond where one of the special lines needs to
+ # be inserted ...
+ if ($i < @$algorithm_names
+ && $code_point > $algorithm_names->[$i]->{'low'})
+ {
+
+ # ... then insert it, ahead of what we were about to
+ # output
+ $staging = sprintf "%x\t%x\t%s\n",
+ $algorithm_names->[$i]->{'low'},
+ $algorithm_names->[$i]->{'high'},
+ $algorithm_names->[$i]->{'name'};
+
+ # And pretend that what we last saw was the final code
+ # point of the inserted range.
+ $previous_hex_code_point = sprintf "%04X",
+ $algorithm_names->[$i]->{'high'};
+
+ # Done with this range.
+ $i++;
+
+ # Except we actually need to output the inserted line.
+ redo;
+ }
+
+ # Normal name.
+ $staging = sprintf "%x\t\t%s\n", $code_point, $name;
+ $previous_hex_code_point = $hex_code_point;
+ }
+
+ # Add the name from the final iteration
+ $names{'LIST'} .= $staging;
+
+ $names{'TYPE'} = "ToNa";
+ $utf8::SwashInfo{ToNa}{'missing'} = "";
+ $utf8::SwashInfo{ToNa}{'format'} = "n";
+ $swash = \%names;
+ }
+ elsif ($second_try =~ / ^ ( d [mt] ) $ /x) {
+
+ # The file is a combination of dt and dm properties. Create a
+ # fake swash from the portion that we want.
+ my $original = do "unicore/Decomposition.pl";
+ my %decomps;
+
+ if ($second_try eq 'dt') {
+ $decomps{'TYPE'} = "ToDt";
+ $utf8::SwashInfo{'ToDt'}{'missing'} = "None";
+ $utf8::SwashInfo{'ToDt'}{'format'} = "s";
+ }
+ else {
+ $decomps{'TYPE'} = "ToDm";
+ $utf8::SwashInfo{'ToDm'}{'missing'} = "<code point>";
+
+ # Use a special internal-to-this_routine format, 'dm', to
+ # distinguish from 'd', meaning decimal.
+ $utf8::SwashInfo{'ToDm'}{'format'} = "dm";
+ }
+
+ $decomps{'LIST'} = "";
+
+ # This property has one special range not in the file: for the
+ # hangul syllables
+ my $done_hangul = 0; # Have we done the hangul range.
+ foreach my $line (split "\n", $original) {
+ my ($hex_lower, $hex_upper, $type_and_map) = split "\t", $line;
+ my $code_point = hex $hex_lower;
+ my $value;
+
+ # The type, enclosed in <...>, precedes the mapping separated
+ # by blanks
+ if ($type_and_map =~ / ^ < ( .* ) > \s+ (.*) $ /x) {
+ $value = ($second_try eq 'dt') ? $1 : $2
+ }
+ else { # If there is no type specified, it's canonical
+ $value = ($second_try eq 'dt')
+ ? "Canonical" :
+ $type_and_map;
+ }
+
+ # Insert the hangul range at the appropriate spot.
+ if (! $done_hangul && $code_point > $HANGUL_BEGIN) {
+ $done_hangul = 1;
+ $decomps{'LIST'} .=
+ sprintf "%x\t%x\t%s\n",
+ $HANGUL_BEGIN,
+ $HANGUL_BEGIN + $HANGUL_COUNT - 1,
+ ($second_try eq 'dt')
+ ? "Canonical"
+ : "<hangul syllable>";
+ }
+
+ # And append this to our constructed LIST.
+ $decomps{'LIST'} .= "$hex_lower\t$hex_upper\t$value\n";
+ }
+ $swash = \%decomps;
+ }
+ else { # Don't know this property. Fail.
+ return;
+ }
+ }
+
+ if ($swash->{'EXTRAS'}) {
+ carp __PACKAGE__, "::prop_invmap: swash returned for $prop unexpectedly has EXTRAS magic";
+ return;
+ }
+
+ # Here, have a valid swash return. Examine it.
+ my $returned_prop = $swash->{TYPE};
+
+ # All properties but binary ones should have 'missing' and 'format'
+ # entries
+ $missing = $utf8::SwashInfo{$returned_prop}{'missing'};
+ $missing = 'N' unless defined $missing;
+
+ $format = $utf8::SwashInfo{$returned_prop}{'format'};
+ $format = 'b' unless defined $format;
+
+ # The LIST input lines look like:
+ # ...
+ # 0374\t\tCommon
+ # 0375\t0377\tGreek # [3]
+ # 037A\t037D\tGreek # [4]
+ # 037E\t\tCommon
+ # 0384\t\tGreek
+ # ...
+ #
+ # Convert them to like
+ # 0374 => Common
+ # 0375 => Greek
+ # 0378 => $missing
+ # 037A => Greek
+ # 037E => Common
+ # 037F => $missing
+ # 0384 => Greek
+ #
+ # For binary properties, the final non-comment column is absent, and
+ # assumed to be 'Y'.
+
+ foreach my $range (split "\n", $swash->{'LIST'}) {
+ $range =~ s/ \s* (?: \# .* )? $ //xg; # rmv trailing space, comments
+
+ # Find the beginning and end of the range on the line
+ my ($hex_begin, $hex_end, $map) = split "\t", $range;
+ my $begin = hex $hex_begin;
+ my $end = (defined $hex_end && $hex_end ne "")
+ ? hex $hex_end
+ : $begin;
+
+ # If the property doesn't have a range that begins at 0, add one that
+ # maps to the default value (for missing ranges).
+ if (! @invlist) {
+ if ($begin != 0) {
+ push @invlist, 0;
+ push @invmap, $missing;
+ }
+ }
+ elsif ($invlist[-1] == $begin) {
+
+ # If the input isn't in the most compact form, so that there are
+ # two adjacent ranges that map to the same thing, they should be
+ # combined. This happens in our constructed dt mapping, as
+ # Element [-2] is the map for the latest range so far processed.
+ # Just set the beginning point of the map to $missing (in
+ # invlist[-1]) to 1 beyond where this range ends. For example, in
+ # 12\t13\tXYZ
+ # 14\t17\tXYZ
+ # we have set it up so that it looks like
+ # 12 => XYZ
+ # 14 => $missing
+ #
+ # We now see that it should be
+ # 12 => XYZ
+ # 18 => $missing
+ if (@invlist > 1 && $invmap[-2] eq $map) {
+ $invlist[-1] = $end + 1;
+ next;
+ }
+
+ # Here, the range started in the previous iteration that maps to
+ # $missing starts at the same code point as this range. That
+ # means there is no gap to fill that that range was intended for,
+ # so we just pop it off the parallel arrays.
+ pop @invlist;
+ pop @invmap;
+ }
+
+ # Add the range beginning, and the range's map.
+ push @invlist, $begin;
+ if ($format eq 'dm') {
+
+ # The decomposition maps are either a line like <hangul syllable>
+ # which are to be taken as is; or a sequence of code points in hex
+ # and separated by blanks. Convert them to decimal, and if there
+ # is more than one, use an anonymous array as the map.
+ if ($map =~ /^ < /x) {
+ push @invmap, $map;
+ }
+ else {
+ my @map = map { hex } split " ", $map;
+ if (@map == 1) {
+ push @invmap, $map[0];
+ }
+ else {
+ push @invmap, \@map;
+ }
+ }
+ }
+ else {
+
+ # Otherwise, convert hex formatted list entries to decimal; add a
+ # 'Y' map for the missing value in binary properties, or
+ # otherwise, use the input map unchanged.
+ $map = ($format eq 'x')
+ ? hex $map
+ : $format eq 'b'
+ ? 'Y'
+ : $map;
+ push @invmap, $map;
+ }
+
+ # We just started a range. It ends with $end. The gap between it and
+ # the next element in the list must be filled with a range that maps
+ # to the default value. If there is no gap, the next iteration will
+ # pop this, unless there is no next iteration, and we have filled all
+ # of the Unicode code space, so check for that and skip.
+ if ($end < $MAX_UNICODE_CODEPOINT) {
+ push @invlist, $end + 1;
+ push @invmap, $missing;
+ }
+ }
+
+ # If the property is empty, make all code points use the value for missing
+ # ones.
+ if (! @invlist) {
+ push @invlist, 0;
+ push @invmap, $missing;
+ }
+
+ # And add in standard element that all non-Unicode code points map to
+ # $missing
+ push @invlist, $MAX_UNICODE_CODEPOINT + 1;
+ push @invmap, $missing;
+
+ # The second component of the map are those values that require
+ # non-standard specification, stored in SPECIALS. These override any
+ # duplicate code points in LIST. If we are using a proxy, we may have
+ # already set $overrides based on the proxy.
+ $overrides = $swash->{'SPECIALS'} unless defined $overrides;
+ if ($overrides) {
+
+ # A negative $overrides implies that the SPECIALS should be ignored,
+ # and a simple 'c' list is the value.
+ if ($overrides < 0) {
+ $format = 'c';
+ }
+ else {
+
+ # Currently, all overrides are for properties that normally map to
+ # single code points, but now some will map to lists of code
+ # points (but there is an exception case handled below).
+ $format = 'cl';
+
+ # Look through the overrides.
+ foreach my $cp_maybe_utf8 (keys %$overrides) {
+ my $cp;
+ my @map;
+
+ # If the overrides came from SPECIALS, the code point keys are
+ # packed UTF-8.
+ if ($overrides == $swash->{'SPECIALS'}) {
+ $cp = unpack("C0U", $cp_maybe_utf8);
+ @map = unpack "U0U*", $swash->{'SPECIALS'}{$cp_maybe_utf8};
+
+ # The empty string will show up unpacked as an empty
+ # array.
+ $format = 'cle' if @map == 0;
+ }
+ else {
+
+ # But if we generated the overrides, we didn't bother to
+ # pack them, and we, so far, do this only for properties
+ # that are 'c' ones.
+ $cp = $cp_maybe_utf8;
+ @map = hex $overrides->{$cp};
+ $format = 'c';
+ }
+
+ # Find the range that the override applies to.
+ my $i = _search_invlist(\@invlist, $cp);
+ if ($cp < $invlist[$i] || $cp >= $invlist[$i + 1]) {
+ croak __PACKAGE__, "wrong_range, cp=$cp; i=$i, current=$invlist[$i]; next=$invlist[$i + 1]"
+ }
+
+ # And what that range currently maps to
+ my $cur_map = $invmap[$i];
+
+ # If there is a gap between the next range and the code point
+ # we are overriding, we have to add elements to both arrays to
+ # fill that gap, using the map that applies to it, which is
+ # $cur_map, since it is part of the current range.
+ if ($invlist[$i + 1] > $cp + 1) {
+ #use feature 'say';
+ #say "Before splice:";
+ #say 'i-2=[', $i-2, ']', sprintf("%04X maps to %s", $invlist[$i-2], $invmap[$i-2]) if $i >= 2;
+ #say 'i-1=[', $i-1, ']', sprintf("%04X maps to %s", $invlist[$i-1], $invmap[$i-1]) if $i >= 1;
+ #say 'i =[', $i, ']', sprintf("%04X maps to %s", $invlist[$i], $invmap[$i]);
+ #say 'i+1=[', $i+1, ']', sprintf("%04X maps to %s", $invlist[$i+1], $invmap[$i+1]) if $i < @invlist + 1;
+ #say 'i+2=[', $i+2, ']', sprintf("%04X maps to %s", $invlist[$i+2], $invmap[$i+2]) if $i < @invlist + 2;
+
+ splice @invlist, $i + 1, 0, $cp + 1;
+ splice @invmap, $i + 1, 0, $cur_map;
+
+ #say "After splice:";
+ #say 'i-2=[', $i-2, ']', sprintf("%04X maps to %s", $invlist[$i-2], $invmap[$i-2]) if $i >= 2;
+ #say 'i-1=[', $i-1, ']', sprintf("%04X maps to %s", $invlist[$i-1], $invmap[$i-1]) if $i >= 1;
+ #say 'i =[', $i, ']', sprintf("%04X maps to %s", $invlist[$i], $invmap[$i]);
+ #say 'i+1=[', $i+1, ']', sprintf("%04X maps to %s", $invlist[$i+1], $invmap[$i+1]) if $i < @invlist + 1;
+ #say 'i+2=[', $i+2, ']', sprintf("%04X maps to %s", $invlist[$i+2], $invmap[$i+2]) if $i < @invlist + 2;
+ }
+
+ # If the remaining portion of the range is multiple code
+ # points (ending with the one we are replacing, guaranteed by
+ # the earlier splice). We must split it into two
+ if ($invlist[$i] < $cp) {
+ $i++; # Compensate for the new element
+
+ #use feature 'say';
+ #say "Before splice:";
+ #say 'i-2=[', $i-2, ']', sprintf("%04X maps to %s", $invlist[$i-2], $invmap[$i-2]) if $i >= 2;
+ #say 'i-1=[', $i-1, ']', sprintf("%04X maps to %s", $invlist[$i-1], $invmap[$i-1]) if $i >= 1;
+ #say 'i =[', $i, ']', sprintf("%04X maps to %s", $invlist[$i], $invmap[$i]);
+ #say 'i+1=[', $i+1, ']', sprintf("%04X maps to %s", $invlist[$i+1], $invmap[$i+1]) if $i < @invlist + 1;
+ #say 'i+2=[', $i+2, ']', sprintf("%04X maps to %s", $invlist[$i+2], $invmap[$i+2]) if $i < @invlist + 2;
+
+ splice @invlist, $i, 0, $cp;
+ splice @invmap, $i, 0, 'dummy';
+
+ #say "After splice:";
+ #say 'i-2=[', $i-2, ']', sprintf("%04X maps to %s", $invlist[$i-2], $invmap[$i-2]) if $i >= 2;
+ #say 'i-1=[', $i-1, ']', sprintf("%04X maps to %s", $invlist[$i-1], $invmap[$i-1]) if $i >= 1;
+ #say 'i =[', $i, ']', sprintf("%04X maps to %s", $invlist[$i], $invmap[$i]);
+ #say 'i+1=[', $i+1, ']', sprintf("%04X maps to %s", $invlist[$i+1], $invmap[$i+1]) if $i < @invlist + 1;
+ #say 'i+2=[', $i+2, ']', sprintf("%04X maps to %s", $invlist[$i+2], $invmap[$i+2]) if $i < @invlist + 2;
+ }
+
+ # Here, the range we are overriding contains a single code
+ # point. The result could be the empty string, a single
+ # value, or a list. If the last case, we use an anonymous
+ # array.
+ $invmap[$i] = (scalar @map == 0)
+ ? ""
+ : (scalar @map > 1)
+ ? \@map
+ : $map[0];
+ }
+ }
+ }
+ elsif ($format eq 'x') {
+
+ # All hex-valued properties are really to code points
+ $format = 'c';
+ }
+ elsif ($format eq 'dm') {
+ $format = 'd';
+ }
+ elsif ($format eq 'sw') { # blank-separated elements to form a list.
+ map { $_ = [ split " ", $_ ] if $_ =~ / / } @invmap;
+ $format = 'sl';
+ }
+ elsif ($returned_prop eq 'ToNameAlias') {
+
+ # This property currently doesn't have any lists, but theoretically
+ # could
+ $format = 'sl';
+ }
+ elsif ($format ne 'n' && $format ne 'r') {
+
+ # All others are simple scalars
+ $format = 's';
+ }
+
+ return (\@invlist, \@invmap, $format, $missing);
+}
+
=head2 Unicode::UCD::UnicodeVersion
This returns the version of the Unicode Character Database, in other words, the