regcharclass.pl: Get code point folding to a seq

Previously regcharclass.pl could tell if an input string was a multi-character fold of some Unicode code point. This commit adds the ability to return what that code point is. This capability will be used in a later commit.
author: Karl Williamson <khw@cpan.org> 2020-10-18 10:20:38 -0600
committer: Karl Williamson <khw@cpan.org> 2020-12-19 20:45:42 -0700
commit: a50454ce0f79538d3cedde9015ef68d4d6dfb89f (patch)
tree: f6328fbadf94fe0c6e4c813f89d3be6d6cdddeff /regen/regcharclass.pl
parent: aceeeadc611887db471a0774dfaf0c05c75e5725 (diff)
download: perl-a50454ce0f79538d3cedde9015ef68d4d6dfb89f.tar.gz
1 files changed, 30 insertions, 17 deletions
diff --git a/regen/regcharclass.pl b/regen/regcharclass.pl
index 8d97a79f7a..5fd8e255e3 100755
--- a/regen/regcharclass.pl
+++ b/regen/regcharclass.pl
@@ -366,6 +366,7 @@ my %n2a;    # Inversion of a2n, for each character set
 sub new {
     my $class= shift;
     my %opt= @_;
+    my %hash_return;
     for ( qw(op txt) ) {
         die "in " . __PACKAGE__ . " constructor '$_;' is a mandatory field"
           if !exists $opt{$_};
@@ -437,19 +438,31 @@ sub new {
             die "eval '$1' failed: $@" if $@;
             push @{$opt{txt}}, @results;
             next;
+        } elsif ($str =~ / ^ % \s* ( .* ) /x) { # user-furnished sub() call
+            %hash_return = eval "$1";
+            die "eval '$1' failed: $@" if $@;
+            push @{$opt{txt}}, keys %hash_return;
+            die "Only one multi character expansion currently allowed per rule"
+                                                        if  $self->{multi_maps};
+            next;
         } else {
             die "Unparsable line: $txt\n";
         }
         my ( $cp, $cp_high, $low, $latin1, $utf8 )
                                         = __uni_latin1($charset, $a2n, $str );
+        my $from;
+        if (defined $hash_return{"\"$str\""}) {
+            $from = $hash_return{"\"$str\""};
+            $from = $a2n->[$from] if $from < 256;
+        }
         my $UTF8= $low   || $utf8;
         my $LATIN1= $low || $latin1;
         my $high = (scalar grep { $_ < 256 } @$cp) ? 0 : $utf8;
         #die Dumper($txt,$cp,$low,$latin1,$utf8)
         #    if $txt=~/NEL/ or $utf8 and @$utf8>3;
 
-        @{ $self->{strs}{$str} }{qw( str txt low utf8 latin1 high cp cp_high UTF8 LATIN1 )}=
-          ( $str, $txt, $low, $utf8, $latin1, $high, $cp, $cp_high, $UTF8, $LATIN1 );
+        @{ $self->{strs}{$str} }{qw( str txt low utf8 latin1 high cp cp_high UTF8 LATIN1 from )}=
+          ( $str, $txt, $low, $utf8, $latin1, $high, $cp, $cp_high, $UTF8, $LATIN1, $from );
         my $rec= $self->{strs}{$str};
         foreach my $key ( qw(low utf8 latin1 high cp cp_high UTF8 LATIN1) ) {
             $self->{size}{$key}{ 0 + @{ $self->{strs}{$str}{$key} } }++
@@ -522,9 +535,6 @@ sub pop_count ($) {
 sub _optree {
     my ( $self, $trie, $test_type, $ret_type, $else, $depth )= @_;
     return unless defined $trie;
-    if ( $self->{has_multi} and $ret_type =~ /cp|both/ ) {
-        die "Can't do 'cp' optree from multi-codepoint strings";
-    }
     $ret_type ||= 'len';
     $else= 0  unless defined $else;
     $depth= 0 unless defined $depth;
@@ -535,13 +545,16 @@ sub _optree {
     if (exists $trie->{''} ) {
         # we can now update the "else" value, anything failing to match
         # after this point should return the value from this.
+        my $prefix = $self->{strs}{ $trie->{''} };
         if ( $ret_type eq 'cp' ) {
-            $else= $self->{strs}{ $trie->{''} }{cp}[0];
+            $else= $prefix->{from};
+            $else= $self->{strs}{ $trie->{''} }{cp}[0] unless defined $else;
             $else= $self->val_fmt($else) if $else > 9;
         } elsif ( $ret_type eq 'len' ) {
             $else= $depth;
         } elsif ( $ret_type eq 'both') {
-            $else= $self->{strs}{ $trie->{''} }{cp}[0];
+            $else= $prefix->{from};
+            $else= $self->{strs}{ $trie->{''} }{cp}[0] unless defined $else;
             $else= $self->val_fmt($else) if $else > 9;
             $else= "len=$depth, $else";
         }
@@ -1648,36 +1661,36 @@ QUOTEMETA: Meta-characters that \Q should quote
 \p{_Perl_Quotemeta}
 
 MULTI_CHAR_FOLD: multi-char strings that are folded to by a single character
-=> UTF8 :safe
-&regcharclass_multi_char_folds::multi_char_folds('u', 'a')
+=> UTF8 UTF8-cp :safe
+%regcharclass_multi_char_folds::multi_char_folds('u', 'a')
 
 MULTI_CHAR_FOLD: multi-char strings that are folded to by a single character
-=> LATIN1 : safe
-&regcharclass_multi_char_folds::multi_char_folds('l', 'a')
+=> LATIN1 LATIN1-cp : safe
+%regcharclass_multi_char_folds::multi_char_folds('l', 'a')
 
 THREE_CHAR_FOLD: A three-character multi-char fold
 => UTF8 :safe
-&regcharclass_multi_char_folds::multi_char_folds('u', '3')
+%regcharclass_multi_char_folds::multi_char_folds('u', '3')
 
 THREE_CHAR_FOLD: A three-character multi-char fold
 => LATIN1 :safe
-&regcharclass_multi_char_folds::multi_char_folds('l', '3')
+%regcharclass_multi_char_folds::multi_char_folds('l', '3')
 
 THREE_CHAR_FOLD_HEAD: The first two of three-character multi-char folds
 => UTF8 :safe
-&regcharclass_multi_char_folds::multi_char_folds('u', 'h')
+%regcharclass_multi_char_folds::multi_char_folds('u', 'h')
 
 THREE_CHAR_FOLD_HEAD: The first two of three-character multi-char folds
 => LATIN1 :safe
-&regcharclass_multi_char_folds::multi_char_folds('l', 'h')
+%regcharclass_multi_char_folds::multi_char_folds('l', 'h')
 #
 #THREE_CHAR_FOLD_NON_FINAL: The first or middle character of multi-char folds
 #=> UTF8 :safe
-#&regcharclass_multi_char_folds::multi_char_folds('u', 'fm')
+#%regcharclass_multi_char_folds::multi_char_folds('u', 'fm')
 #
 #THREE_CHAR_FOLD_NON_FINAL: The first or middle character of multi-char folds
 #=> LATIN1 :safe
-#&regcharclass_multi_char_folds::multi_char_folds('l', 'fm')
+#%regcharclass_multi_char_folds::multi_char_folds('l', 'fm')
 
 FOLDS_TO_MULTI: characters that fold to multi-char strings
 => UTF8 :fast
author	Karl Williamson <khw@cpan.org>	2020-10-18 10:20:38 -0600
committer	Karl Williamson <khw@cpan.org>	2020-12-19 20:45:42 -0700
commit	a50454ce0f79538d3cedde9015ef68d4d6dfb89f (patch)
tree	f6328fbadf94fe0c6e4c813f89d3be6d6cdddeff /regen/regcharclass.pl
parent	aceeeadc611887db471a0774dfaf0c05c75e5725 (diff)
download	perl-a50454ce0f79538d3cedde9015ef68d4d6dfb89f.tar.gz