Add qr/\p{Name=...}/

This accomplishes the same thing as \N{...}, but only for regex patterns, using loose matching and only the official Unicode names. This commit includes a comparison of the two approaches, added to perlunicode. But the real reason to do this is as a way station to being able to specify wild card lookup on the name property, coming in a later commit. I chose to not include user-defined aliases nor :short character names at this time. I thought that there might be unforeseen consequences of using them. It's better to later relax a requirement than to try to restrict it.
author: Karl Williamson <khw@cpan.org> 2020-02-05 13:32:26 -0700
committer: Karl Williamson <khw@cpan.org> 2020-02-12 16:25:53 -0700
commit: 673c254b34746289019db8836016c81eb38e5bf0 (patch)
tree: a137acc65278cc7a8f7c3c03024c21e10940c0b6 /lib/_charnames.pm
parent: ff5ebe043d728d8813248fe7b3a58935b1116e6a (diff)
download: perl-673c254b34746289019db8836016c81eb38e5bf0.tar.gz
1 files changed, 27 insertions, 11 deletions
diff --git a/lib/_charnames.pm b/lib/_charnames.pm
index c6169d16f8..600317b623 100644
--- a/lib/_charnames.pm
+++ b/lib/_charnames.pm
@@ -6,7 +6,7 @@
 package _charnames;
 use strict;
 use warnings;
-our $VERSION = '1.45';
+our $VERSION = '1.46';
 use unicore::Name;    # mktables-generated algorithmically-defined names
 
 use bytes ();          # for $bytes::hint_bits
@@ -263,8 +263,9 @@ my %dummy_H = (
               );
 
 
-sub lookup_name ($$$) {
-  my ($name, $wants_ord, $runtime) = @_;
+sub lookup_name ($$$;$) {
+  my ($name, $wants_ord, $runtime, $regex_loose) = @_;
+  $regex_loose //= 0;
 
   # Lookup the name or sequence $name in the tables.  If $wants_ord is false,
   # returns the string equivalent of $name; if true, returns the ordinal value
@@ -281,7 +282,7 @@ sub lookup_name ($$$) {
   my $result;       # The string result
   my $save_input;
 
-  if ($runtime) {
+  if ($runtime && ! $regex_loose) {
 
     my $hints_ref = (caller($runtime))[10];
 
@@ -307,16 +308,16 @@ sub lookup_name ($$$) {
     $^H{charnames_short} = $hints_ref->{charnames_short};
   }
 
-  my $loose = $^H{charnames_loose};
+  my $loose = $regex_loose || $^H{charnames_loose};
   my $lookup_name;  # Input name suitably modified for grepping for in the
                     # table
 
   # User alias should be checked first or else can't override ours, and if we
   # were to add any, could conflict with theirs.
-  if (exists $^H{charnames_ord_aliases}{$name}) {
+  if (! $regex_loose && exists $^H{charnames_ord_aliases}{$name}) {
     $result = $^H{charnames_ord_aliases}{$name};
   }
-  elsif (exists $^H{charnames_name_aliases}{$name}) {
+  elsif (! $regex_loose && exists $^H{charnames_name_aliases}{$name}) {
     $name = $^H{charnames_name_aliases}{$name};
     $save_input = $lookup_name = $name;  # Cache the result for any error
                                          # message
@@ -422,7 +423,7 @@ sub lookup_name ($$$) {
       # the other way around slows down finding these immensely.
       # Algorithmically determinables are not placed in the cache because
       # that uses up memory, and finding these again is fast.
-      if (($loose || $^H{charnames_full})
+      if (   ($loose || $^H{charnames_full})
           && (defined (my $ord = charnames::name_to_code_point_special($lookup_name, $loose))))
       {
         $result = chr $ord;
@@ -464,6 +465,10 @@ sub lookup_name ($$$) {
           @off = ($-[0] + 1, $+[0]);    # The 1 is for the tab
           $cache_ref = ($loose) ? \%loose_names_cache : \%full_names_cache;
         }
+        elsif ($regex_loose) {
+          # Currently don't allow :short when this is set
+          return;
+        }
         else {
 
           # Here, didn't look for, or didn't find the name.
@@ -572,9 +577,11 @@ sub lookup_name ($$$) {
 
     # Here, wants string output.  If utf8 is acceptable, just return what
     # we've got; otherwise attempt to convert it to non-utf8 and return that.
-    my $in_bytes = ($runtime)
-                   ? (caller $runtime)[8] & $bytes::hint_bits
-                   : $^H & $bytes::hint_bits;
+    my $in_bytes =     ! $regex_loose   # \p{name=} doesn't currently care if
+                                        # in bytes or not
+                   && (($runtime)
+                       ? (caller $runtime)[8] & $bytes::hint_bits
+                       : $^H & $bytes::hint_bits);
     return $result if (! $in_bytes || utf8::downgrade($result, 1)) # The 1 arg
                                                   # means don't die on failure
   }
@@ -617,6 +624,15 @@ sub charnames {
   return lookup_name($_[0], 0, 0);
 }
 
+sub _loose_regcomp_lookup {
+  # For use only by regcomp.c to compile \p{name=...}
+  # khw thinks it best to not do :short matching, and only official names.
+  # But that is only a guess, and if demand warrants, could be changed
+  return lookup_name($_[0], 0, 1,
+                     1  # Always use :loose matching
+                    );
+}
+
 sub import
 {
   shift; ## ignore class name
author	Karl Williamson <khw@cpan.org>	2020-02-05 13:32:26 -0700
committer	Karl Williamson <khw@cpan.org>	2020-02-12 16:25:53 -0700
commit	673c254b34746289019db8836016c81eb38e5bf0 (patch)
tree	a137acc65278cc7a8f7c3c03024c21e10940c0b6 /lib/_charnames.pm
parent	ff5ebe043d728d8813248fe7b3a58935b1116e6a (diff)
download	perl-673c254b34746289019db8836016c81eb38e5bf0.tar.gz