diff options
author | Karl Williamson <public@khwilliamson.com> | 2011-10-31 14:07:25 -0600 |
---|---|---|
committer | Karl Williamson <public@khwilliamson.com> | 2011-11-08 08:09:32 -0700 |
commit | 7ef25837157389e073e4677bb3ca144185b4b37c (patch) | |
tree | b96b0f03b348aab39c7a7b4ae391bea6ea46fbf6 /lib | |
parent | 55a402526b74c327b2b14c95efb46ed2675921c9 (diff) | |
download | perl-7ef25837157389e073e4677bb3ca144185b4b37c.tar.gz |
Unicode::UCD: add prop_aliases(), prop_value_aliases()
Diffstat (limited to 'lib')
-rw-r--r-- | lib/Unicode/UCD.pm | 401 | ||||
-rw-r--r-- | lib/Unicode/UCD.t | 341 |
2 files changed, 741 insertions, 1 deletions
diff --git a/lib/Unicode/UCD.pm b/lib/Unicode/UCD.pm index 81ef5411fa..20873336de 100644 --- a/lib/Unicode/UCD.pm +++ b/lib/Unicode/UCD.pm @@ -6,7 +6,7 @@ no warnings 'surrogate'; # surrogates can be inputs to this use charnames (); use Unicode::Normalize qw(getCombinClass NFD); -our $VERSION = '0.36'; +our $VERSION = '0.37'; use Storable qw(dclone); @@ -23,6 +23,8 @@ our @EXPORT_OK = qw(charinfo casefold casespec namedseq num + prop_aliases + prop_value_aliases ); use Carp; @@ -62,6 +64,12 @@ Unicode::UCD - Unicode character database my $categories = general_categories(); my $types = bidi_types(); + use Unicode::UCD 'prop_aliases'; + my @space_names = prop_aliases("space"); + + use Unicode::UCD 'prop_value_aliases'; + my @gc_punct_names = prop_value_aliases("Gc", "Punct"); + use Unicode::UCD 'compexcl'; my $compexcl = compexcl($codepoint); @@ -154,6 +162,9 @@ C<E<lt>controlE<gt>>. The short name of the general category of I<code>. This will match one of the keys in the hash returned by L</general_categories()>. +The L</prop_value_aliases()> function can be used to get all the synonyms +of the category name. + =item B<combining> the combining class number for I<code> used in the Canonical Ordering Algorithm. @@ -161,11 +172,17 @@ For Unicode 5.1, this is described in Section 3.11 C<Canonical Ordering Behavior available at L<http://www.unicode.org/versions/Unicode5.1.0/> +The L</prop_value_aliases()> function can be used to get all the synonyms +of the combining class number. + =item B<bidi> bidirectional type of I<code>. This will match one of the keys in the hash returned by L</bidi_types()>. +The L</prop_value_aliases()> function can be used to get all the synonyms +of the bidi type name. + =item B<decomposition> is empty if I<code> has no decomposition; or is one or more codes @@ -732,6 +749,9 @@ from the long names to the short names. The general category is the one returned from L</charinfo()> under the C<category> key. +The L</prop_value_aliases()> function can be used to get all the synonyms of +the category name. + =cut my %BIDI_TYPES = @@ -774,6 +794,9 @@ the Unicode TR9 is recommended reading: L<http://www.unicode.org/reports/tr9/> (as of Unicode 5.0.0) +The L</prop_value_aliases()> function can be used to get all the synonyms of +the bidi type name. + =cut sub bidi_types { @@ -1382,6 +1405,382 @@ sub num { return $value; } +=pod + +=head2 B<prop_aliases()> + + use Unicode::UCD 'prop_aliases'; + + my ($short_name, $full_name, @other_names) = prop_aliases("space"); + my $same_full_name = prop_aliases("Space"); # Scalar context + my ($same_short_name) = prop_aliases("Space"); # gets 0th element + print "The full name is $full_name\n"; + print "The short name is $short_name\n"; + print "The other aliases are: ", join(", ", @other_names), "\n"; + + prints: + The full name is White_Space + The short name is WSpace + The other aliases are: Space + +Most Unicode properties have several synonymous names. Typically, there is at +least a short name, convenient to type, and a long name that more fully +describes the property, and hence is more easily understood. + +If you know one name for a Unicode property, you can use C<prop_aliases> to find +either the long name (when called in scalar context), or a list of all of the +names, somewhat ordered so that the short name is in the 0th element, the long +name in the next element, and any other synonyms are in the remaining +elements, in no particular order. + +The long name is returned in a form nicely capitalized, suitable for printing. + +The input parameter name is loosely matched, which means that white space, +hyphens, and underscores are ignored (except for the trailing underscore in +the old_form grandfathered-in C<"L_">, which is better written as C<"LC">, and +both of which mean C<General_Category=Cased Letter>). + +If the name is unknown, C<undef> is returned (or an empty list in list +context). Note that Perl typically recognizes property names in regular +expressions with an optional C<"Is_>" (with or without the underscore) +prefixed to them, such as C<\p{isgc=punct}>. This function does not recognize +those in the input, returning C<undef>. Nor are they included in the output +as possible synonyms. + +C<prop_aliases> does know about the Perl extensions to Unicode properties, +such as C<Any> and C<XPosixAlpha>, and the single form equivalents to Unicode +properties such as C<XDigit>, C<Greek>, C<In_Greek>, and C<Is_Greek>. The +final example demonstrates that the C<"Is_"> prefix is recognized for these +extensions; it is needed to resolve ambiguities. For example, +C<prop_aliases('lc')> returns the list C<(lc, Lowercase_Mapping)>, but +C<prop_aliases('islc')> returns C<(Is_LC, Cased_Letter)>. This is +because C<islc> is a Perl extension which is short for +C<General_Category=Cased Letter>. The lists returned for the Perl extensions +will not include the C<"Is_"> prefix (whether or not the input had it) unless +needed to resolve ambiguities, as shown in the C<"islc"> example, where the +returned list had one element containing C<"Is_">, and the other without. + +It is also possible for the reverse to happen: C<prop_aliases('isc')> returns +the list C<(isc, ISO_Comment)>; whereas C<prop_aliases('c')> returns +C<(C, Other)> (the latter being a Perl extension meaning +C<General_Category=Other>. L<perluniprops> lists the available forms, +including which ones are discouraged from use. + +Those discouraged forms are accepted as input to C<prop_aliases>, but are not +returned in the lists. C<prop_aliases('isL&')> and C<prop_aliases('isL_')>, +which are old synonyms for C<"Is_LC"> and should not be used in new code, are +examples of this. These both return C<(Is_LC, Cased_Letter)>. Thus this +function allows you to take a discourarged form, and find its acceptable +alternatives. The same goes with single-form Block property equivalences. +Only the forms that begin with C<"In_"> are not discouraged; if you pass +C<prop_aliases> a discouraged form, you will get back the equivalent ones that +begin with C<"In_">. It will otherwise look like a new-style block name (see. +L</Old-style versus new-style block names>). + +C<prop_aliases> does not know about any user-defined properties, and will +return C<undef> if called with one of those. Likewise for Perl internal +properties, with the exception of "Perl_Decimal_Digit" which it does know +about (and which is documented below in L</prop_invmap()>). + +=cut + +# It may be that there are use cases where the discouraged forms should be +# returned. If that comes up, an optional boolean second parameter to the +# function could be created, for example. + +# These are created by mktables for this routine and stored in unicore/UCD.pl +# where their structures are described. +our %string_property_loose_to_name; +our %ambiguous_names; +our %loose_perlprop_to_name; +our %prop_aliases; + +sub prop_aliases ($) { + my $prop = $_[0]; + return unless defined $prop; + + require "unicore/UCD.pl"; + require "unicore/Heavy.pl"; + require "utf8_heavy.pl"; + + # The property name may be loosely or strictly matched; we don't know yet. + # But both types use lower-case. + $prop = lc $prop; + + # It is loosely matched if its lower case isn't known to be strict. + my $list_ref; + if (! exists $utf8::stricter_to_file_of{$prop}) { + my $loose = utf8::_loose_name($prop); + + # There is a hash that converts from any loose name to its standard + # form, mapping all synonyms for a name to one name that can be used + # as a key into another hash. The whole concept is for memory + # savings, as the second hash doesn't have to have all the + # combinations. Actually, there are two hashes that do the + # converstion. One is used in utf8_heavy.pl (stored in Heavy.pl) for + # looking up properties matchable in regexes. This function needs to + # access string properties, which aren't available in regexes, so a + # second conversion hash is made for them (stored in UCD.pl). Look in + # the string one now, as the rest can have an optional 'is' prefix, + # which these don't. + if (exists $string_property_loose_to_name{$loose}) { + + # Convert to its standard loose name. + $prop = $string_property_loose_to_name{$loose}; + } + else { + my $retrying = 0; # bool. ? Has an initial 'is' been stripped + RETRY: + if (exists $utf8::loose_property_name_of{$loose} + && (! $retrying + || ! exists $ambiguous_names{$loose})) + { + # Found an entry giving the standard form. We don't get here + # (in the test above) when we've stripped off an + # 'is' and the result is an ambiguous name. That is because + # these are official Unicode properties (though Perl can have + # an optional 'is' prefix meaning the official property), and + # all ambiguous cases involve a Perl single-form extension + # for the gc, script, or block properties, and the stripped + # 'is' means that they mean one of those, and not one of + # these + $prop = $utf8::loose_property_name_of{$loose}; + } + elsif (exists $loose_perlprop_to_name{$loose}) { + + # This hash is specifically for this function to list Perl + # extensions that aren't in the earlier hashes. If there is + # only one element, the short and long names are identical. + # Otherwise the form is already in the same form as + # %prop_aliases, which is handled at the end of the function. + $list_ref = $loose_perlprop_to_name{$loose}; + if (@$list_ref == 1) { + my @list = ($list_ref->[0], $list_ref->[0]); + $list_ref = \@list; + } + } + elsif (! exists $utf8::loose_to_file_of{$loose}) { + + # loose_to_file_of is a complete list of loose names. If not + # there, the input is unknown. + return; + } + else { + + # Here we found the name but not its aliases, so it has to + # exist. This means it must be one of the Perl single-form + # extensions. First see if it is for a property-value + # combination in one of the following properties. + my @list; + foreach my $property ("gc", "script") { + @list = prop_value_aliases($property, $loose); + last if @list; + } + if (@list) { + + # Here, it is one of those property-value combination + # single-form synonyms. There are ambiguities with some + # of these. Check against the list for these, and adjust + # if necessary. + for my $i (0 .. @list -1) { + if (exists $ambiguous_names + {utf8::_loose_name(lc $list[$i])}) + { + # The ambiguity is resolved by toggling whether or + # not it has an 'is' prefix + $list[$i] =~ s/^Is_// or $list[$i] =~ s/^/Is_/; + } + } + return @list; + } + + # Here, it wasn't one of the gc or script single-form + # extensions. It could be a block property single-form + # extension. An 'in' prefix definitely means that, and should + # be looked up without the prefix. + my $began_with_in = $loose =~ s/^in//; + @list = prop_value_aliases("block", $loose); + if (@list) { + map { $_ =~ s/^/In_/ } @list; + return @list; + } + + # Here still haven't found it. The last opportunity for it + # being valid is only if it began with 'is'. We retry without + # the 'is', setting a flag to that effect so that we don't + # accept things that begin with 'isis...' + if (! $retrying && ! $began_with_in && $loose =~ s/^is//) { + $retrying = 1; + goto RETRY; + } + + # Here, didn't find it. Since it was in %loose_to_file_of, we + # should have been able to find it. + carp __PACKAGE__, "::prop_aliases: Unexpectedly could not find '$prop'. Send bug report to perlbug\@perl.org"; + return; + } + } + } + + if (! $list_ref) { + # Here, we have set $prop to a standard form name of the input. Look + # it up in the structure created by mktables for this purpose, which + # contains both strict and loosely matched properties. Avoid + # autovivifying. + $list_ref = $prop_aliases{$prop} if exists $prop_aliases{$prop}; + return unless $list_ref; + } + + # The full name is in element 1. + return $list_ref->[1] unless wantarray; + + return @{dclone $list_ref}; +} + +=pod + +=head2 B<prop_value_aliases()> + + use Unicode::UCD 'prop_value_aliases'; + + my ($short_name, $full_name, @other_names) + = prop_value_aliases("Gc", "Punct"); + my $same_full_name = prop_value_aliases("Gc", "P"); # Scalar cntxt + my ($same_short_name) = prop_value_aliases("Gc", "P"); # gets 0th + # element + print "The full name is $full_name\n"; + print "The short name is $short_name\n"; + print "The other aliases are: ", join(", ", @other_names), "\n"; + + prints: + The full name is Punctuation + The short name is P + The other aliases are: Punct + +Some Unicode properties have a restricted set of legal values. For example, +all binary properties are restricted to just C<true> or C<false>; and there +are only a few dozen possible General Categories. + +For such properties, there are usually several synonyms for each possible +value. For example, in binary properties, I<truth> can be represented by any of +the strings "Y", "Yes", "T", or "True"; and the General Category +"Punctuation" by that string, or "Punct", or simply "P". + +Like property names, there is typically at least a short name for each such +property-value, and a long name. If you know any name of the property-value, +you can use C<prop_value_aliases>() to get the long name (when called in +scalar context), or a list of all the names, with the short name in the 0th +element, the long name in the next element, and any other synonyms in the +remaining elements, in no particular order, except that any all-numeric +synonyms will be last. + +The long name is returned in a form nicely capitalized, suitable for printing. + +Case, white space, hyphens, and underscores are ignored in the input parameters +(except for the trailing underscore in the old-form grandfathered-in general +category property value C<"L_">, which is better written as C<"LC">). + +If either name is unknown, C<undef> is returned. Note that Perl typically +recognizes property names in regular expressions with an optional C<"Is_>" +(with or without the underscore) prefixed to them, such as C<\p{isgc=punct}>. +This function does not recognize those in the property parameter, returning +C<undef>. + +If called with a property that doesn't have synonyms for its values, it +returns the input value, possibly normalized with capitalization and +underscores. + +For the block property, new-style block names are returned (see +L</Old-style versus new-style block names>). + +To find the synonyms for single-forms, such as C<\p{Any}>, use +L</prop_aliases()> instead. + +C<prop_value_aliases> does not know about any user-defined properties, and +will return C<undef> if called with one of those. + +=cut + +# These are created by mktables for this routine and stored in unicore/UCD.pl +# where their structures are described. +our %loose_to_standard_value; +our %prop_value_aliases; + +sub prop_value_aliases ($$) { + my ($prop, $value) = @_; + return unless defined $prop && defined $value; + + require "unicore/UCD.pl"; + require "utf8_heavy.pl"; + + # Find the property name synonym that's used as the key in other hashes, + # which is element 0 in the returned list. + ($prop) = prop_aliases($prop); + return if ! $prop; + $prop = utf8::_loose_name(lc $prop); + + # Here is a legal property, but the hash below (created by mktables for + # this purpose) only knows about the properties that have a very finite + # number of potential values, that is not ones whose value could be + # anything, like most (if not all) string properties. These don't have + # synonyms anyway. Simply return the input. For example, there is no + # synonym for ('Uppercase_Mapping', A'). + return $value if ! exists $prop_value_aliases{$prop}; + + # The value name may be loosely or strictly matched; we don't know yet. + # But both types use lower-case. + $value = lc $value; + + # If the name isn't found under loose matching, it certainly won't be + # found under strict + my $loose_value = utf8::_loose_name($value); + return unless exists $loose_to_standard_value{"$prop=$loose_value"}; + + # Similarly if the combination under loose matching doesn't exist, it + # won't exist under strict. + my $standard_value = $loose_to_standard_value{"$prop=$loose_value"}; + return unless exists $prop_value_aliases{$prop}{$standard_value}; + + # Here we did find a combination under loose matching rules. But it could + # be that is a strict property match that shouldn't have matched. + # %prop_value_aliases is set up so that the strict matches will appear as + # if they were in loose form. Thus, if the non-loose version is legal, + # we're ok, can skip the further check. + if (! exists $utf8::stricter_to_file_of{"$prop=$value"} + + # We're also ok and skip the further check if value loosely matches. + # mktables has verified that no strict name under loose rules maps to + # an existing loose name. This code relies on the very limited + # circumstances that strict names can be here. Strict name matching + # happens under two conditions: + # 1) when the name begins with an underscore. But this function + # doesn't accept those, and %prop_value_aliases doesn't have + # them. + # 2) When the values are numeric, in which case we need to look + # further, but their squeezed-out loose values will be in + # %stricter_to_file_of + && exists $utf8::stricter_to_file_of{"$prop=$loose_value"}) + { + # The only thing that's legal loosely under strict is that can have an + # underscore between digit pairs XXX + while ($value =~ s/(\d)_(\d)/$1$2/g) {} + return unless exists $utf8::stricter_to_file_of{"$prop=$value"}; + } + + # Here, we know that the combination exists. Return it. + my $list_ref = $prop_value_aliases{$prop}{$standard_value}; + if (@$list_ref > 1) { + # The full name is in element 1. + return $list_ref->[1] unless wantarray; + + return @{dclone $list_ref}; + } + + return $list_ref->[0] unless wantarray; + + # Only 1 element means that it repeats + return ( $list_ref->[0], $list_ref->[0] ); +} =head2 Unicode::UCD::UnicodeVersion diff --git a/lib/Unicode/UCD.t b/lib/Unicode/UCD.t index 98bab65ff3..df601d1fa8 100644 --- a/lib/Unicode/UCD.t +++ b/lib/Unicode/UCD.t @@ -508,4 +508,345 @@ is(num("\N{ETHIOPIC NUMBER TEN THOUSAND}"), 10000, 'Verify num("\N{ETHIOPIC NUMB is(num("\N{NORTH INDIC FRACTION ONE HALF}"), .5, 'Verify num("\N{NORTH INDIC FRACTION ONE HALF}") == .5'); is(num("\N{U+12448}"), 9, 'Verify num("\N{U+12448}") == 9'); +# Create a user-defined property +sub InKana {<<'END'} +3040 309F +30A0 30FF +END + +use Unicode::UCD qw(prop_aliases); + +is(prop_aliases(undef), undef, "prop_aliases(undef) returns <undef>"); +is(prop_aliases("unknown property"), undef, + "prop_aliases(<unknown property>) returns <undef>"); +is(prop_aliases("InKana"), undef, + "prop_aliases(<user-defined property>) returns <undef>"); +is(prop_aliases("Perl_Decomposition_Mapping"), undef, "prop_aliases('Perl_Decomposition_Mapping') returns <undef> since internal-Perl-only"); +is(prop_aliases("Perl_Charnames"), undef, + "prop_aliases('Perl_Charnames') returns <undef> since internal-Perl-only"); +is(prop_aliases("isgc"), undef, + "prop_aliases('isgc') returns <undef> since is not covered Perl extension"); +is(prop_aliases("Is_Is_Any"), undef, + "prop_aliases('Is_Is_Any') returns <undef> since two is's"); + +require 'utf8_heavy.pl'; +require "unicore/Heavy.pl"; + +# Keys are lists of properties. Values are defined if have been tested. +my %props; + +# To test for loose matching, add in the characters that are ignored there. +my $extra_chars = "-_ "; + +# The one internal property we accept +$props{'Perl_Decimal_Digit'} = 1; +my @list = prop_aliases("perldecimaldigit"); +is_deeply(\@list, + [ "Perl_Decimal_Digit", + "Perl_Decimal_Digit" + ], "prop_aliases('perldecimaldigit') returns Perl_Decimal_Digit as both short and full names"); + +# Get the official Unicode property name synonyms and test them. +open my $props, "<", "../lib/unicore/PropertyAliases.txt" + or die "Can't open Unicode PropertyAliases.txt"; +while (<$props>) { + s/\s*#.*//; # Remove comments + next if /^\s* $/x; # Ignore empty and comment lines + + chomp; + my $count = 0; # 0th field in line is short name; 1th is long name + my $short_name; + my $full_name; + my @names_via_short; + foreach my $alias (split /\s*;\s*/) { # Fields are separated by + # semi-colons + # Add in the characters that are supposed to be ignored, to test loose + # matching, which the tested function does on all inputs. + my $mod_name = "$extra_chars$alias"; + + my $loose = utf8::_loose_name(lc $alias); + + # Indicate we have tested this. + $props{$loose} = 1; + + my @all_names = prop_aliases($mod_name); + if (grep { $_ eq $loose } @Unicode::UCD::suppressed_properties) { + is(@all_names, 0, "prop_aliases('$mod_name') returns undef since $alias is not installed"); + next; + } + elsif (! @all_names) { + fail("prop_aliases('$mod_name')"); + diag("'$alias' is unknown to prop_aliases()"); + next; + } + + if ($count == 0) { # Is short name + + @names_via_short = prop_aliases($mod_name); + + # If the 0th test fails, no sense in continuing with the others + last unless is($names_via_short[0], $alias, + "prop_aliases: '$alias' is the short name for '$mod_name'"); + $short_name = $alias; + } + elsif ($count == 1) { # Is full name + + # Some properties have the same short and full name; no sense + # repeating the test if the same. + if ($alias ne $short_name) { + my @names_via_full = prop_aliases($mod_name); + is_deeply(\@names_via_full, \@names_via_short, "prop_aliases() returns the same list for both '$short_name' and '$mod_name'"); + } + + # Tests scalar context + is(prop_aliases($short_name), $alias, + "prop_aliases: '$alias' is the long name for '$short_name'"); + } + else { # Is another alias + is_deeply(\@all_names, \@names_via_short, "prop_aliases() returns the same list for both '$short_name' and '$mod_name'"); + ok((grep { $_ =~ /^$alias$/i } @all_names), + "prop_aliases: '$alias' is listed as an alias for '$mod_name'"); + } + + $count++; + } +} + +# Now test anything we can find that wasn't covered by the tests of the +# official properties. We have no way of knowing if mktables omitted a Perl +# extension or not, but we do the best we can from its generated lists + +foreach my $alias (keys %utf8::loose_to_file_of) { + next if $alias =~ /=/; + my $lc_name = lc $alias; + my $loose = utf8::_loose_name($lc_name); + next if exists $props{$loose}; # Skip if already tested + $props{$loose} = 1; + my $mod_name = "$extra_chars$alias"; # Tests loose matching + my @aliases = prop_aliases($mod_name); + my $found_it = grep { utf8::_loose_name(lc $_) eq $lc_name } @aliases; + if ($found_it) { + pass("prop_aliases: '$lc_name' is listed as an alias for '$mod_name'"); + } + elsif ($lc_name =~ /l[_&]$/) { + + # These two names are special in that they don't appear in the + # returned list because they are discouraged from use. Verify + # that they return the same list as a non-discouraged version. + my @LC = prop_aliases('Is_LC'); + is_deeply(\@aliases, \@LC, "prop_aliases: '$lc_name' returns the same list as 'Is_LC'"); + } + else { + my $stripped = $lc_name =~ s/^is//; + + # Could be that the input includes a prefix 'is', which is rarely + # returned as an alias, so having successfully stripped it off above, + # try again. + if ($stripped) { + $found_it = grep { utf8::_loose_name(lc $_) eq $lc_name } @aliases; + } + + # If that didn't work, it could be that it's a block, which is always + # returned with a leading 'In_' to avoid ambiguity. Try comparing + # with that stripped off. + if (! $found_it) { + $found_it = grep { utf8::_loose_name(s/^In_(.*)/\L$1/r) eq $lc_name } + @aliases; + # Could check that is a real block, but tests for invmap will + # likely pickup any errors, since this will be tested there. + $lc_name = "in$lc_name" if $found_it; # Change for message below + } + my $message = "prop_aliases: '$lc_name' is listed as an alias for '$mod_name'"; + ($found_it) ? pass($message) : fail($message); + } +} + +my $done_equals = 0; +foreach my $alias (keys %utf8::stricter_to_file_of) { + if ($alias =~ /=/) { # Only test one case where there is an equals + next if $done_equals; + $done_equals = 1; + } + my $lc_name = lc $alias; + my @list = prop_aliases($alias); + if ($alias =~ /^_/) { + is(@list, 0, "prop_aliases: '$lc_name' returns an empty list since it is internal_only"); + } + elsif ($alias =~ /=/) { + is(@list, 0, "prop_aliases: '$lc_name' returns an empty list since is illegal property name"); + } + else { + ok((grep { lc $_ eq $lc_name } @list), + "prop_aliases: '$lc_name' is listed as an alias for '$alias'"); + } +} + +use Unicode::UCD qw(prop_value_aliases); + +is(prop_value_aliases("unknown property", "unknown value"), undef, + "prop_value_aliases(<unknown property>, <unknown value>) returns <undef>"); +is(prop_value_aliases(undef, undef), undef, + "prop_value_aliases(undef, undef) returns <undef>"); +is((prop_value_aliases("na", "A")), "A", "test that prop_value_aliases returns its input for properties that don't have synonyms"); +is(prop_value_aliases("isgc", "C"), undef, "prop_value_aliases('isgc', 'C') returns <undef> since is not covered Perl extension"); +is(prop_value_aliases("gc", "isC"), undef, "prop_value_aliases('gc', 'isC') returns <undef> since is not covered Perl extension"); + +# We have no way of knowing if mktables omitted a Perl extension that it +# shouldn't have, but we can check if it omitted an official Unicode property +# name synonym. And for those, we can check if the short and full names are +# correct. + +my %pva_tested; # List of things already tested. +open my $propvalues, "<", "../lib/unicore/PropValueAliases.txt" + or die "Can't open Unicode PropValueAliases.txt"; +while (<$propvalues>) { + s/\s*#.*//; # Remove comments + next if /^\s* $/x; # Ignore empty and comment lines + chomp; + + my @fields = split /\s*;\s*/; # Fields are separated by semi-colons + my $prop = shift @fields; # 0th field is the property, + my $count = 0; # 0th field in line (after shifting off the property) is + # short name; 1th is long name + my $short_name; + my @names_via_short; # Saves the values between iterations + + # The property on the lhs of the = is always loosely matched. Add in + # characters that are ignored under loose matching to test that + my $mod_prop = "$extra_chars$prop"; + + if ($fields[0] eq 'n/a') { # See comments in input file, essentially + # means full name and short name are identical + $fields[0] = $fields[1]; + } + elsif ($fields[0] ne $fields[1] + && utf8::_loose_name(lc $fields[0]) + eq utf8::_loose_name(lc $fields[1]) + && $fields[1] !~ /[[:upper:]]/) + { + # Also, there is a bug in the file in which "n/a" is omitted, and + # the two fields are identical except for case, and the full name + # is all lower case. Copy the "short" name unto the full one to + # give it some upper case. + + $fields[1] = $fields[0]; + } + + # The ccc property in the file is special; has an extra numeric field + # (0th), which should go at the end, since we use the next two fields as + # the short and full names, respectively. See comments in input file. + splice (@fields, 0, 0, splice(@fields, 1, 2)) if $prop eq 'ccc'; + + my $loose_prop = utf8::_loose_name(lc $prop); + my $suppressed = grep { $_ eq $loose_prop } + @Unicode::UCD::suppressed_properties; + foreach my $value (@fields) { + if ($suppressed) { + is(prop_value_aliases($prop, $value), undef, "prop_value_aliases('$prop', '$value') returns undef for suppressed property $prop"); + next; + } + elsif (grep { $_ eq ("$loose_prop=" . utf8::_loose_name(lc $value)) } @Unicode::UCD::suppressed_properties) { + is(prop_value_aliases($prop, $value), undef, "prop_value_aliases('$prop', '$value') returns undef for suppressed property $prop=$value"); + next; + } + + # Add in test for loose matching. + my $mod_value = "$extra_chars$value"; + + # If the value is a number, optionally negative, including a floating + # point or rational numer, it should be only strictly matched, so the + # loose matching should fail. + if ($value =~ / ^ -? \d+ (?: [\/.] \d+ )? $ /x) { + is(prop_value_aliases($mod_prop, $mod_value), undef, "prop_value_aliases('$mod_prop', '$mod_value') returns undef because '$mod_value' should be strictly matched"); + + # And reset so below tests just the strict matching. + $mod_value = $value; + } + + if ($count == 0) { + + @names_via_short = prop_value_aliases($mod_prop, $mod_value); + + # If the 0th test fails, no sense in continuing with the others + last unless is($names_via_short[0], $value, "prop_value_aliases: In '$prop', '$value' is the short name for '$mod_value'"); + $short_name = $value; + } + elsif ($count == 1) { + + # Some properties have the same short and full name; no sense + # repeating the test if the same. + if ($value ne $short_name) { + my @names_via_full = + prop_value_aliases($mod_prop, $mod_value); + is_deeply(\@names_via_full, \@names_via_short, "In '$prop', prop_value_aliases() returns the same list for both '$short_name' and '$mod_value'"); + } + + # Tests scalar context + is(prop_value_aliases($prop, $short_name), $value, "'$value' is the long name for prop_value_aliases('$prop', '$short_name')"); + } + else { + my @all_names = prop_value_aliases($mod_prop, $mod_value); + is_deeply(\@all_names, \@names_via_short, "In '$prop', prop_value_aliases() returns the same list for both '$short_name' and '$mod_value'"); + ok((grep { utf8::_loose_name(lc $_) eq utf8::_loose_name(lc $value) } prop_value_aliases($prop, $short_name)), "'$value' is listed as an alias for prop_value_aliases('$prop', '$short_name')"); + } + + $pva_tested{utf8::_loose_name(lc $prop) . "=" . utf8::_loose_name(lc $value)} = 1; + $count++; + } +} + +# And test as best we can, the non-official pva's that mktables generates. +foreach my $hash (\%utf8::loose_to_file_of, \%utf8::stricter_to_file_of) { + foreach my $test (keys %$hash) { + next if exists $pva_tested{$test}; # Skip if already tested + + my ($prop, $value) = split "=", $test; + next unless defined $value; # prop_value_aliases() requires an input + # 'value' + my $mod_value; + if ($hash == \%utf8::loose_to_file_of) { + + # Add extra characters to test loose-match rhs value + $mod_value = "$extra_chars$value"; + } + else { # Here value is strictly matched. + + # Extra elements are added by mktables to this hash so that + # something like "age=6.0" has a synonym of "age=6". It's not + # clear to me (khw) if we should be encouraging those synonyms, so + # don't test for them. + next if $value !~ /\D/ && exists $hash->{"$prop=$value.0"}; + + # Verify that loose matching fails when only strict is called for. + next unless is(prop_value_aliases($prop, "$extra_chars$value"), undef, + "prop_value_aliases('$prop', '$extra_chars$value') returns undef since '$value' should be strictly matched"), + + # Strict matching does allow for underscores between digits. Test + # for that. + $mod_value = $value; + while ($mod_value =~ s/(\d)(\d)/$1_$2/g) {} + } + + # The lhs property is always loosely matched, so add in extra + # characters to test that. + my $mod_prop = "$extra_chars$prop"; + + if ($prop eq 'gc' && $value =~ /l[_&]$/) { + # These two names are special in that they don't appear in the + # returned list because they are discouraged from use. Verify + # that they return the same list as a non-discouraged version. + my @LC = prop_value_aliases('gc', 'lc'); + my @l_ = prop_value_aliases($mod_prop, $mod_value); + is_deeply(\@l_, \@LC, "prop_value_aliases('$mod_prop', '$mod_value) returns the same list as prop_value_aliases('gc', 'lc')"); + } + else { + ok((grep { utf8::_loose_name(lc $_) eq utf8::_loose_name(lc $value) } + prop_value_aliases($mod_prop, $mod_value)), + "'$value' is listed as an alias for prop_value_aliases('$mod_prop', '$mod_value')"); + } + } +} + +undef %pva_tested; + done_testing(); |