diff options
author | Karl Williamson <khw@cpan.org> | 2015-01-27 15:08:08 -0700 |
---|---|---|
committer | Karl Williamson <khw@cpan.org> | 2015-02-10 15:53:48 -0700 |
commit | 9024667a44828cc925a3d939578415da8ffeec83 (patch) | |
tree | 3dd7ecaf964d8122836d12f12fe312e952233e4b /lib/Unicode | |
parent | 17673bf503a4d987b87f8af6ade614c4090b87cb (diff) | |
download | perl-9024667a44828cc925a3d939578415da8ffeec83.tar.gz |
Unicode::UCD: Add prop_values() function
This new function returns the input property's possible values.
Diffstat (limited to 'lib/Unicode')
-rw-r--r-- | lib/Unicode/UCD.pm | 125 | ||||
-rw-r--r-- | lib/Unicode/UCD.t | 30 |
2 files changed, 133 insertions, 22 deletions
diff --git a/lib/Unicode/UCD.pm b/lib/Unicode/UCD.pm index f3d5541e92..252bebcd48 100644 --- a/lib/Unicode/UCD.pm +++ b/lib/Unicode/UCD.pm @@ -5,7 +5,7 @@ use warnings; no warnings 'surrogate'; # surrogates can be inputs to this use charnames (); -our $VERSION = '0.60'; +our $VERSION = '0.61'; require Exporter; @@ -22,6 +22,7 @@ our @EXPORT_OK = qw(charinfo num prop_aliases prop_value_aliases + prop_values prop_invlist prop_invmap search_invlist @@ -76,6 +77,9 @@ Unicode::UCD - Unicode character database use Unicode::UCD 'prop_value_aliases'; my @gc_punct_names = prop_value_aliases("Gc", "Punct"); + use Unicode::UCD 'prop_values'; + my @all_EA_short_names = prop_values("East_Asian_Width"); + use Unicode::UCD 'prop_invlist'; my @puncts = prop_invlist("gc=punctuation"); @@ -798,6 +802,9 @@ names>). L<prop_invmap("block")|/prop_invmap()> can be used to get this same data in a different type of data structure. +L<prop_values("Block")|/prop_values()> can be used to get all +the known new-style block names as a list, without the code point ranges. + See also L</Blocks versus Scripts>. =cut @@ -820,6 +827,9 @@ the values. L<prop_invmap("script")|/prop_invmap()> can be used to get this same data in a different type of data structure. +L<C<prop_values("Script")>|/prop_values()> can be used to get all +the known script names as a list, without the code point ranges. + See also L</Blocks versus Scripts>. =cut @@ -903,8 +913,9 @@ from the long names to the short names. The general category is the one returned from L</charinfo()> under the C<category> key. -The L</prop_value_aliases()> function can be used to get all the synonyms of -the category name. +The L</prop_values()> and L</prop_value_aliases()> functions can be used as an +alternative to this function; the first returning a simple list of the short +category names; and the second gets all the synonyms of a given category name. 
=cut @@ -948,8 +959,10 @@ the Unicode TR9 is recommended reading: L<http://www.unicode.org/reports/tr9/> (as of Unicode 5.0.0) -The L</prop_value_aliases()> function can be used to get all the synonyms of -the bidi type name. +The L</prop_values()> and L</prop_value_aliases()> functions can be used as an +alternative to this function; the first returning a simple list of the short +bidi type names; and the second gets all the synonyms of a given bidi type +name. =cut @@ -1960,6 +1973,79 @@ sub prop_aliases ($) { =pod +=head2 B<prop_values()> + + use Unicode::UCD 'prop_values'; + + print "AHex values are: ", join(", ", prop_values("AHex")), + "\n"; + prints: + AHex values are: N, Y + +Some Unicode properties have a restricted set of legal values. For example, +all binary properties are restricted to just C<true> or C<false>; and there +are only a few dozen possible General Categories. Use C<prop_values> +to find out if a given property is one such, and if so, to get a list of the +values: + + print join ", ", prop_values("NFC_Quick_Check"); + prints: + M, N, Y + +If the property doesn't have such a restricted set, C<undef> is returned. + +There are usually several synonyms for each possible value. Use +L</prop_value_aliases()> to access those. + +Case, white space, hyphens, and underscores are ignored in the input property +name (except for the trailing underscore in the old-form grandfathered-in +general category property value C<"L_">, which is better written as C<"LC">). + +If the property name is unknown, C<undef> is returned. Note that Perl typically +recognizes property names in regular expressions with an optional C<"Is_"> +(with or without the underscore) prefixed to them, such as C<\p{isgc=punct}>. +This function does not recognize those in the property parameter, returning +C<undef>. + +For the block property, new-style block names are returned (see +L</Old-style versus new-style block names>). 
+ +C<prop_values> does not know about any user-defined properties, and +will return C<undef> if called with one of those. + +=cut + +# These are created by mktables for this module and stored in unicore/UCD.pl +# where their structures are described. +our %loose_to_standard_value; +our %prop_value_aliases; + +sub prop_values ($) { + my $prop = shift; + return undef unless defined $prop; + + require "unicore/UCD.pl"; + require "utf8_heavy.pl"; + + # Find the property name synonym that's used as the key in other hashes, + # which is element 0 in the returned list. + ($prop) = prop_aliases($prop); + return undef if ! $prop; + $prop = utf8::_loose_name(lc $prop); + + # Here is a legal property. + return undef unless exists $prop_value_aliases{$prop}; + my @return; + foreach my $value_key (sort { lc $a cmp lc $b } + keys %{$prop_value_aliases{$prop}}) + { + push @return, $prop_value_aliases{$prop}{$value_key}[0]; + } + return @return; +} + +=pod + =head2 B<prop_value_aliases()> use Unicode::UCD 'prop_value_aliases'; @@ -1973,7 +2059,7 @@ sub prop_aliases ($) { print "The short name is $short_name\n"; print "The other aliases are: ", join(", ", @other_names), "\n"; - prints: + prints: The full name is Punctuation The short name is P The other aliases are: Punct @@ -1982,18 +2068,20 @@ Some Unicode properties have a restricted set of legal values. For example, all binary properties are restricted to just C<true> or C<false>; and there are only a few dozen possible General Categories. -For such properties, there are usually several synonyms for each possible -value. For example, in binary properties, I<truth> can be represented by any of -the strings "Y", "Yes", "T", or "True"; and the General Category -"Punctuation" by that string, or "Punct", or simply "P". +You can use L</prop_values()> to find out if a given property is one which has +a restricted set of values, and if so, what those values are. But usually +each value actually has several synonyms. 
For example, in binary properties, +I<truth> can be represented by any of the strings "Y", "Yes", "T", or "True"; +and the General Category "Punctuation" by that string, or "Punct", or simply +"P". Like property names, there is typically at least a short name for each such -property-value, and a long name. If you know any name of the property-value, -you can use C<prop_value_aliases>() to get the long name (when called in -scalar context), or a list of all the names, with the short name in the 0th -element, the long name in the next element, and any other synonyms in the -remaining elements, in no particular order, except that any all-numeric -synonyms will be last. +property-value, and a long name. If you know any name of the property-value +(which you can get by L</prop_values()>), you can use C<prop_value_aliases>() +to get the long name (when called in scalar context), or a list of all the +names, with the short name in the 0th element, the long name in the next +element, and any other synonyms in the remaining elements, in no particular +order, except that any all-numeric synonyms will be last. The long name is returned in a form nicely capitalized, suitable for printing. @@ -2022,11 +2110,6 @@ will return C<undef> if called with one of those. =cut -# These are created by mktables for this routine and stored in unicore/UCD.pl -# where their structures are described. 
-our %loose_to_standard_value; -our %prop_value_aliases; - sub prop_value_aliases ($$) { my ($prop, $value) = @_; return unless defined $prop && defined $value; diff --git a/lib/Unicode/UCD.t b/lib/Unicode/UCD.t index 37c8bd2cb8..7a1084996f 100644 --- a/lib/Unicode/UCD.t +++ b/lib/Unicode/UCD.t @@ -698,7 +698,7 @@ foreach my $alias (keys %utf8::stricter_to_file_of) { } } -use Unicode::UCD qw(prop_value_aliases); +use Unicode::UCD qw(prop_values prop_value_aliases); is(prop_value_aliases("unknown property", "unknown value"), undef, "prop_value_aliases(<unknown property>, <unknown value>) returns <undef>"); @@ -720,6 +720,12 @@ skip "PropValueAliases.txt is not in this Unicode version", 1 if $v_unicode_vers open my $propvalues, "<", "../lib/unicore/PropValueAliases.txt" or die "Can't open Unicode PropValueAliases.txt"; local $/ = "\n"; + +# Each examined line in the file is for a single value for a property. We +# accumulate all the values for each property using these two variables. +my $prev_prop = ""; +my @this_prop_values; + while (<$propvalues>) { s/\s*#.*//; # Remove comments next if /^\s* $/x; # Ignore empty and comment lines @@ -731,6 +737,27 @@ while (<$propvalues>) { my @fields = split /\s*;\s*/; # Fields are separated by semi-colons my $prop = shift @fields; # 0th field is the property, + + # When changing properties, we examine the accumulated values for the old + # one to see if our function that returns them matches. 
+ if ($prev_prop ne $prop) { + if ($prev_prop ne "") { # Skip for the first time through + my @ucd_function_values = prop_values($prev_prop); + @ucd_function_values = () unless @ucd_function_values; + + # This perl extension doesn't appear in the official file + push @this_prop_values, "Non_Canon" if $prev_prop eq 'dt'; + + my @file_values = undef; + @file_values = sort { lc($a =~ s/_//gr) cmp lc($b =~ s/_//gr) } + @this_prop_values if @this_prop_values; + is_deeply(\@ucd_function_values, \@file_values, + "prop_values('$prev_prop') returns correct list of values"); + } + $prev_prop = $prop; + undef @this_prop_values; + } + my $count = 0; # 0th field in line (after shifting off the property) is # short name; 1th is long name my $short_name; @@ -765,6 +792,7 @@ while (<$propvalues>) { my $loose_prop = &utf8::_loose_name(lc $prop); my $suppressed = grep { $_ eq $loose_prop } @Unicode::UCD::suppressed_properties; + push @this_prop_values, $fields[0] unless $suppressed; foreach my $value (@fields) { if ($suppressed) { is(prop_value_aliases($prop, $value), undef, "prop_value_aliases('$prop', '$value') returns undef for suppressed property $prop"); |