diff options
author | Karl Williamson <khw@cpan.org> | 2020-02-05 13:32:26 -0700 |
---|---|---|
committer | Karl Williamson <khw@cpan.org> | 2020-02-12 16:25:53 -0700 |
commit | 673c254b34746289019db8836016c81eb38e5bf0 (patch) | |
tree | a137acc65278cc7a8f7c3c03024c21e10940c0b6 /lib/_charnames.pm | |
parent | ff5ebe043d728d8813248fe7b3a58935b1116e6a (diff) | |
download | perl-673c254b34746289019db8836016c81eb38e5bf0.tar.gz |
Add qr/\p{Name=...}/
This accomplishes the same thing as \N{...}, but only for regex
patterns, using loose matching and only the official Unicode names.
This commit includes a comparison of the two approaches, added to
perlunicode. But the real reason to do this is as a way station to
being able to specify wild card lookup on the name property, coming in a
later commit.
I chose to not include user-defined aliases nor :short character names
at this time. I thought that there might be unforeseen consequences of
using them. It's better to later relax a requirement than to try to
restrict it.
Diffstat (limited to 'lib/_charnames.pm')
-rw-r--r-- | lib/_charnames.pm | 38 |
1 files changed, 27 insertions, 11 deletions
diff --git a/lib/_charnames.pm b/lib/_charnames.pm index c6169d16f8..600317b623 100644 --- a/lib/_charnames.pm +++ b/lib/_charnames.pm @@ -6,7 +6,7 @@ package _charnames; use strict; use warnings; -our $VERSION = '1.45'; +our $VERSION = '1.46'; use unicore::Name; # mktables-generated algorithmically-defined names use bytes (); # for $bytes::hint_bits @@ -263,8 +263,9 @@ my %dummy_H = ( ); -sub lookup_name ($$$) { - my ($name, $wants_ord, $runtime) = @_; +sub lookup_name ($$$;$) { + my ($name, $wants_ord, $runtime, $regex_loose) = @_; + $regex_loose //= 0; # Lookup the name or sequence $name in the tables. If $wants_ord is false, # returns the string equivalent of $name; if true, returns the ordinal value @@ -281,7 +282,7 @@ sub lookup_name ($$$) { my $result; # The string result my $save_input; - if ($runtime) { + if ($runtime && ! $regex_loose) { my $hints_ref = (caller($runtime))[10]; @@ -307,16 +308,16 @@ sub lookup_name ($$$) { $^H{charnames_short} = $hints_ref->{charnames_short}; } - my $loose = $^H{charnames_loose}; + my $loose = $regex_loose || $^H{charnames_loose}; my $lookup_name; # Input name suitably modified for grepping for in the # table # User alias should be checked first or else can't override ours, and if we # were to add any, could conflict with theirs. - if (exists $^H{charnames_ord_aliases}{$name}) { + if (! $regex_loose && exists $^H{charnames_ord_aliases}{$name}) { $result = $^H{charnames_ord_aliases}{$name}; } - elsif (exists $^H{charnames_name_aliases}{$name}) { + elsif (! $regex_loose && exists $^H{charnames_name_aliases}{$name}) { $name = $^H{charnames_name_aliases}{$name}; $save_input = $lookup_name = $name; # Cache the result for any error # message @@ -422,7 +423,7 @@ sub lookup_name ($$$) { # the other way around slows down finding these immensely. # Algorithmically determinables are not placed in the cache because # that uses up memory, and finding these again is fast. - if (($loose || $^H{charnames_full}) + if ( ($loose || $^H{charnames_full}) && (defined (my $ord = charnames::name_to_code_point_special($lookup_name, $loose)))) { $result = chr $ord; @@ -464,6 +465,10 @@ sub lookup_name ($$$) { @off = ($-[0] + 1, $+[0]); # The 1 is for the tab $cache_ref = ($loose) ? \%loose_names_cache : \%full_names_cache; } + elsif ($regex_loose) { + # Currently don't allow :short when this is set + return; + } else { # Here, didn't look for, or didn't find the name. @@ -572,9 +577,11 @@ sub lookup_name ($$$) { # Here, wants string output. If utf8 is acceptable, just return what # we've got; otherwise attempt to convert it to non-utf8 and return that. - my $in_bytes = ($runtime) - ? (caller $runtime)[8] & $bytes::hint_bits - : $^H & $bytes::hint_bits; + my $in_bytes = ! $regex_loose # \p{name=} doesn't currently care if + # in bytes or not + && (($runtime) + ? (caller $runtime)[8] & $bytes::hint_bits + : $^H & $bytes::hint_bits); return $result if (! $in_bytes || utf8::downgrade($result, 1)) # The 1 arg # means don't die on failure } @@ -617,6 +624,15 @@ sub charnames { return lookup_name($_[0], 0, 0); } +sub _loose_regcomp_lookup { + # For use only by regcomp.c to compile \p{name=...} + # khw thinks it best to not do :short matching, and only official names. + # But that is only a guess, and if demand warrants, could be changed + return lookup_name($_[0], 0, 1, + 1 # Always use :loose matching + ); +} + sub import { shift; ## ignore class name |