summaryrefslogtreecommitdiff
path: root/lib/Unicode
diff options
context:
space:
mode:
authorKarl Williamson <khw@cpan.org>2015-01-27 15:08:08 -0700
committerKarl Williamson <khw@cpan.org>2015-02-10 15:53:48 -0700
commit9024667a44828cc925a3d939578415da8ffeec83 (patch)
tree3dd7ecaf964d8122836d12f12fe312e952233e4b /lib/Unicode
parent17673bf503a4d987b87f8af6ade614c4090b87cb (diff)
downloadperl-9024667a44828cc925a3d939578415da8ffeec83.tar.gz
Unicode::UCD: Add prop_values() function
This new function returns the input property's possible values.
Diffstat (limited to 'lib/Unicode')
-rw-r--r--lib/Unicode/UCD.pm125
-rw-r--r--lib/Unicode/UCD.t30
2 files changed, 133 insertions, 22 deletions
diff --git a/lib/Unicode/UCD.pm b/lib/Unicode/UCD.pm
index f3d5541e92..252bebcd48 100644
--- a/lib/Unicode/UCD.pm
+++ b/lib/Unicode/UCD.pm
@@ -5,7 +5,7 @@ use warnings;
no warnings 'surrogate'; # surrogates can be inputs to this
use charnames ();
-our $VERSION = '0.60';
+our $VERSION = '0.61';
require Exporter;
@@ -22,6 +22,7 @@ our @EXPORT_OK = qw(charinfo
num
prop_aliases
prop_value_aliases
+ prop_values
prop_invlist
prop_invmap
search_invlist
@@ -76,6 +77,9 @@ Unicode::UCD - Unicode character database
use Unicode::UCD 'prop_value_aliases';
my @gc_punct_names = prop_value_aliases("Gc", "Punct");
+ use Unicode::UCD 'prop_values';
+ my @all_EA_short_names = prop_values("East_Asian_Width");
+
use Unicode::UCD 'prop_invlist';
my @puncts = prop_invlist("gc=punctuation");
@@ -798,6 +802,9 @@ names>).
L<prop_invmap("block")|/prop_invmap()> can be used to get this same data in a
different type of data structure.
+L<prop_values("Block")|/prop_values()> can be used to get all
+the known new-style block names as a list, without the code point ranges.
+
See also L</Blocks versus Scripts>.
=cut
@@ -820,6 +827,9 @@ the values.
L<prop_invmap("script")|/prop_invmap()> can be used to get this same data in a
different type of data structure.
+L<C<prop_values("Script")>|/prop_values()> can be used to get all
+the known script names as a list, without the code point ranges.
+
See also L</Blocks versus Scripts>.
=cut
@@ -903,8 +913,9 @@ from the long names to the short names. The general category is the
one returned from
L</charinfo()> under the C<category> key.
-The L</prop_value_aliases()> function can be used to get all the synonyms of
-the category name.
+The L</prop_values()> and L</prop_value_aliases()> functions can be used as an
+alternative to this function; the first returns a simple list of the short
+category names; and the second gets all the synonyms of a given category name.
=cut
@@ -948,8 +959,10 @@ the Unicode TR9 is recommended reading:
L<http://www.unicode.org/reports/tr9/>
(as of Unicode 5.0.0)
-The L</prop_value_aliases()> function can be used to get all the synonyms of
-the bidi type name.
+The L</prop_values()> and L</prop_value_aliases()> functions can be used as an
+alternative to this function; the first returns a simple list of the short
+bidi type names; and the second gets all the synonyms of a given bidi type
+name.
=cut
@@ -1960,6 +1973,79 @@ sub prop_aliases ($) {
=pod
+=head2 B<prop_values()>
+
+ use Unicode::UCD 'prop_values';
+
+ print "AHex values are: ", join(", ", prop_values("AHex")),
+ "\n";
+ prints:
+ AHex values are: N, Y
+
+Some Unicode properties have a restricted set of legal values. For example,
+all binary properties are restricted to just C<true> or C<false>; and there
+are only a few dozen possible General Categories. Use C<prop_values>
+to find out if a given property is one such, and if so, to get a list of the
+values:
+
+ print join ", ", prop_values("NFC_Quick_Check");
+ prints:
+ M, N, Y
+
+If the property doesn't have such a restricted set, C<undef> is returned.
+
+There are usually several synonyms for each possible value. Use
+L</prop_value_aliases()> to access those.
+
+Case, white space, hyphens, and underscores are ignored in the input property
+name (except for the trailing underscore in the old-form grandfathered-in
+general category property value C<"L_">, which is better written as C<"LC">).
+
+If the property name is unknown, C<undef> is returned. Note that Perl typically
+recognizes property names in regular expressions with an optional C<"Is_">
+(with or without the underscore) prefixed to them, such as C<\p{isgc=punct}>.
+This function does not recognize those in the property parameter, returning
+C<undef>.
+
+For the block property, new-style block names are returned (see
+L</Old-style versus new-style block names>).
+
+C<prop_values> does not know about any user-defined properties, and
+will return C<undef> if called with one of those.
+
+=cut
+
+# These are created by mktables for this module and stored in unicore/UCD.pl
+# where their structures are described.
+our %loose_to_standard_value;
+our %prop_value_aliases;
+
+sub prop_values ($) {  # Takes one property name; returns a list of that property's values
+ my $prop = shift;
+ return undef unless defined $prop;  # NB: in list context 'return undef' yields the one-element list (undef)
+
+ require "unicore/UCD.pl";  # The mktables-generated hashes declared above are stored here
+ require "utf8_heavy.pl";  # presumably supplies utf8::_loose_name() -- confirm
+
+ # Find the property name synonym that's used as the key in other hashes,
+ # which is element 0 in the returned list.
+ ($prop) = prop_aliases($prop);
+ return undef if ! $prop;  # prop_aliases() gave nothing usable: unknown property name
+ $prop = utf8::_loose_name(lc $prop);  # Loose-match form; presumably how %prop_value_aliases is keyed
+
+ # Here is a legal property.
+ return undef unless exists $prop_value_aliases{$prop};  # No entry: property has no restricted set of values
+ my @return;
+ foreach my $value_key (sort { lc $a cmp lc $b }  # Visit the value keys in case-insensitive order
+ keys %{$prop_value_aliases{$prop}})
+ {
+ push @return, $prop_value_aliases{$prop}{$value_key}[0];  # Element 0 of each value's entry (presumably its short name)
+ }
+ return @return;
+}
+
+=pod
+
=head2 B<prop_value_aliases()>
use Unicode::UCD 'prop_value_aliases';
@@ -1973,7 +2059,7 @@ sub prop_aliases ($) {
print "The short name is $short_name\n";
print "The other aliases are: ", join(", ", @other_names), "\n";
- prints:
+ prints:
The full name is Punctuation
The short name is P
The other aliases are: Punct
@@ -1982,18 +2068,20 @@ Some Unicode properties have a restricted set of legal values. For example,
all binary properties are restricted to just C<true> or C<false>; and there
are only a few dozen possible General Categories.
-For such properties, there are usually several synonyms for each possible
-value. For example, in binary properties, I<truth> can be represented by any of
-the strings "Y", "Yes", "T", or "True"; and the General Category
-"Punctuation" by that string, or "Punct", or simply "P".
+You can use L</prop_values()> to find out if a given property is one which has
+a restricted set of values, and if so, what those values are. But usually
+each value actually has several synonyms. For example, in binary properties,
+I<truth> can be represented by any of the strings "Y", "Yes", "T", or "True";
+and the General Category "Punctuation" by that string, or "Punct", or simply
+"P".
Like property names, there is typically at least a short name for each such
-property-value, and a long name. If you know any name of the property-value,
-you can use C<prop_value_aliases>() to get the long name (when called in
-scalar context), or a list of all the names, with the short name in the 0th
-element, the long name in the next element, and any other synonyms in the
-remaining elements, in no particular order, except that any all-numeric
-synonyms will be last.
+property-value, and a long name. If you know any name of the property-value
+(which you can get by L</prop_values()>), you can use C<prop_value_aliases>()
+to get the long name (when called in scalar context), or a list of all the
+names, with the short name in the 0th element, the long name in the next
+element, and any other synonyms in the remaining elements, in no particular
+order, except that any all-numeric synonyms will be last.
The long name is returned in a form nicely capitalized, suitable for printing.
@@ -2022,11 +2110,6 @@ will return C<undef> if called with one of those.
=cut
-# These are created by mktables for this routine and stored in unicore/UCD.pl
-# where their structures are described.
-our %loose_to_standard_value;
-our %prop_value_aliases;
-
sub prop_value_aliases ($$) {
my ($prop, $value) = @_;
return unless defined $prop && defined $value;
diff --git a/lib/Unicode/UCD.t b/lib/Unicode/UCD.t
index 37c8bd2cb8..7a1084996f 100644
--- a/lib/Unicode/UCD.t
+++ b/lib/Unicode/UCD.t
@@ -698,7 +698,7 @@ foreach my $alias (keys %utf8::stricter_to_file_of) {
}
}
-use Unicode::UCD qw(prop_value_aliases);
+use Unicode::UCD qw(prop_values prop_value_aliases);
is(prop_value_aliases("unknown property", "unknown value"), undef,
"prop_value_aliases(<unknown property>, <unknown value>) returns <undef>");
@@ -720,6 +720,12 @@ skip "PropValueAliases.txt is not in this Unicode version", 1 if $v_unicode_vers
open my $propvalues, "<", "../lib/unicore/PropValueAliases.txt"
or die "Can't open Unicode PropValueAliases.txt";
local $/ = "\n";
+
+# Each examined line in the file is for a single value for a property. We
+# accumulate all the values for each property using these two variables.
+my $prev_prop = "";
+my @this_prop_values;
+
while (<$propvalues>) {
s/\s*#.*//; # Remove comments
next if /^\s* $/x; # Ignore empty and comment lines
@@ -731,6 +737,27 @@ while (<$propvalues>) {
my @fields = split /\s*;\s*/; # Fields are separated by semi-colons
my $prop = shift @fields; # 0th field is the property,
+
+ # When changing properties, we examine the accumulated values for the old
+ # one to see if our function that returns them matches.
+ if ($prev_prop ne $prop) {
+ if ($prev_prop ne "") { # Skip for the first time through
+ my @ucd_function_values = prop_values($prev_prop);
+ @ucd_function_values = () unless @ucd_function_values;
+
+ # This perl extension doesn't appear in the official file
+ push @this_prop_values, "Non_Canon" if $prev_prop eq 'dt';
+
+ my @file_values = undef;
+ @file_values = sort { lc($a =~ s/_//gr) cmp lc($b =~ s/_//gr) }
+ @this_prop_values if @this_prop_values;
+ is_deeply(\@ucd_function_values, \@file_values,
+ "prop_values('$prev_prop') returns correct list of values");
+ }
+ $prev_prop = $prop;
+ undef @this_prop_values;
+ }
+
my $count = 0; # 0th field in line (after shifting off the property) is
# short name; 1th is long name
my $short_name;
@@ -765,6 +792,7 @@ while (<$propvalues>) {
my $loose_prop = &utf8::_loose_name(lc $prop);
my $suppressed = grep { $_ eq $loose_prop }
@Unicode::UCD::suppressed_properties;
+ push @this_prop_values, $fields[0] unless $suppressed;
foreach my $value (@fields) {
if ($suppressed) {
is(prop_value_aliases($prop, $value), undef, "prop_value_aliases('$prop', '$value') returns undef for suppressed property $prop");