diff options
author | Karl Williamson <public@khwilliamson.com> | 2013-05-21 20:34:47 -0600 |
---|---|---|
committer | Karl Williamson <public@khwilliamson.com> | 2013-05-22 10:11:52 -0600 |
commit | 1fdd5e539a93d9e9573e769f06c0f3d3c3d7e3ac (patch) | |
tree | e55e5acf01ac9c3c355bb9929f673f1b116ee2f1 /lib/Unicode | |
parent | ad5f730f426dcac62c9dc19bda79e1586ec5f135 (diff) | |
download | perl-1fdd5e539a93d9e9573e769f06c0f3d3c3d7e3ac.tar.gz |
Make Unicode::UCD::search_invlist() available
This commit documents this function, removing the initial underscore
from its name. (And it hardens input checking.)
Diffstat (limited to 'lib/Unicode')
-rw-r--r-- | lib/Unicode/UCD.pm | 65 | ||||
-rw-r--r-- | lib/Unicode/UCD.t | 9 |
2 files changed, 63 insertions, 11 deletions
diff --git a/lib/Unicode/UCD.pm b/lib/Unicode/UCD.pm index f28548334b..eeaf2c3002 100644 --- a/lib/Unicode/UCD.pm +++ b/lib/Unicode/UCD.pm @@ -24,6 +24,7 @@ our @EXPORT_OK = qw(charinfo prop_value_aliases prop_invlist prop_invmap + search_invlist MAX_CP ); @@ -80,6 +81,9 @@ Unicode::UCD - Unicode character database my ($list_ref, $map_ref, $format, $missing) = prop_invmap("General Category"); + use Unicode::UCD 'search_invlist'; + my $index = search_invlist(\@invlist, $code_point); + use Unicode::UCD 'compexcl'; my $compexcl = compexcl($codepoint); @@ -2124,6 +2128,9 @@ code points that have the property-value: C<prop_invlist> does not know about any user-defined nor Perl internal-only properties, and will return C<undef> if called with one of those. +The L</search_invlist()> function is provided for finding a code point within +an inversion list. + =cut # User-defined properties could be handled with some changes to utf8_heavy.pl; @@ -2675,8 +2682,9 @@ which is an integer. That is, it must match the regular expression: Further, the first element in a range never needs adjustment, as the adjustment would be just adding 0. -A binary search can be used to quickly find a code point in the inversion -list, and hence its corresponding mapping. +A binary search such as that provided by L</search_invlist()>, can be used to +quickly find a code point in the inversion list, and hence its corresponding +mapping. The final, fourth element (index [3], assigned to C<$default> in the "block" example) in the four element list returned by this function is used with the @@ -3374,7 +3382,7 @@ RETRY: } # Find the range that the override applies to. - my $i = _search_invlist(\@invlist, $cp); + my $i = search_invlist(\@invlist, $cp); if ($cp < $invlist[$i] || $cp >= $invlist[$i + 1]) { croak __PACKAGE__, "::prop_invmap: wrong_range, cp=$cp; i=$i, current=$invlist[$i]; next=$invlist[$i + 1]" } @@ -3483,17 +3491,52 @@ RETRY: return (\@invlist, \@invmap, $format, $missing); } -sub _search_invlist { - # Find the range in the inversion list which contains a code point; that - # is, find i such that l[i] <= code_point < l[i+1]. Returns undef if no - # such i. +sub search_invlist { + +=pod + +=head2 B<search_invlist()> + + use Unicode::UCD qw(prop_invmap prop_invlist); + use Unicode::UCD 'search_invlist'; + + my @invlist = prop_invlist($property_name); + print $code_point, ((search_invlist(\@invlist, $code_point) // -1) % 2) + ? " isn't" + : " is", + " in $property_name\n"; + + my ($blocks_ranges_ref, $blocks_map_ref) = prop_invmap("Block"); + my $index = search_invlist($blocks_ranges_ref, $code_point); + print "$code_point is in block ", $blocks_map_ref->[$index], "\n"; + +C<search_invlist> is used to search an inversion list returned by +C<prop_invlist> or C<prop_invmap> for a particular L</code point argument>. +C<undef> is returned if the code point is not found in the inversion list +(this happens only when it is not a legal L<code point argument>, or is less +than the list's first element). A warning is raised in the first instance. + +Otherwise, it returns the index into the list of the range that contains the +code point.; that is, find C<i> such that + + list[i]<= code_point < list[i+1]. + +As explained in L</prop_invlist()>, whether a code point is in the list or not +depends on if the index is even (in) or odd (not in). And as explained in +L</prop_invmap()>, the index is used with the returned parallel array to find +the mapping. + +=cut - # If this is ever made public, could use to speed up .t specials. Would - # need to use code point argument, as in other functions in this pm my $list_ref = shift; - my $code_point = shift; - # Verify non-neg numeric XXX + my $input_code_point = shift; + my $code_point = _getcode($input_code_point); + + if (! defined $code_point) { + carp __PACKAGE__, "::search_invlist: unknown code '$input_code_point'"; + return; + } my $max_element = @$list_ref - 1; diff --git a/lib/Unicode/UCD.t b/lib/Unicode/UCD.t index e070defbea..4b0c2271ec 100644 --- a/lib/Unicode/UCD.t +++ b/lib/Unicode/UCD.t @@ -2026,5 +2026,14 @@ foreach my $prop (keys %props) { pass("prop_invmap('$mod_prop')"); } +# A few tests of search_invlist +use Unicode::UCD qw(search_invlist); + +my ($scripts_ranges_ref, $scripts_map_ref) = prop_invmap("Script"); +my $index = search_invlist($scripts_ranges_ref, 0x390); +is($scripts_map_ref->[$index], "Greek", "U+0390 is Greek"); +my @alpha_invlist = prop_invlist("Alpha"); +is(search_invlist(\@alpha_invlist, ord("\t")), undef, "search_invlist returns undef for code points before first one on the list"); + ok($/ eq $input_record_separator, "The record separator didn't get overridden"); done_testing(); |