summaryrefslogtreecommitdiff
path: root/regen/regcharclass.pl
diff options
context:
space:
mode:
authorKarl Williamson <khw@cpan.org>2020-10-18 10:20:38 -0600
committerKarl Williamson <khw@cpan.org>2020-12-19 20:45:42 -0700
commita50454ce0f79538d3cedde9015ef68d4d6dfb89f (patch)
treef6328fbadf94fe0c6e4c813f89d3be6d6cdddeff /regen/regcharclass.pl
parentaceeeadc611887db471a0774dfaf0c05c75e5725 (diff)
downloadperl-a50454ce0f79538d3cedde9015ef68d4d6dfb89f.tar.gz
regcharclass.pl: Get code point folding to a seq
Previously regcharclass.pl could tell if an input string was a multi-character fold of some Unicode code point. This commit adds the ability to return what that code point is. This capability will be used in a later commit.
Diffstat (limited to 'regen/regcharclass.pl')
-rwxr-xr-xregen/regcharclass.pl47
1 files changed, 30 insertions, 17 deletions
diff --git a/regen/regcharclass.pl b/regen/regcharclass.pl
index 8d97a79f7a..5fd8e255e3 100755
--- a/regen/regcharclass.pl
+++ b/regen/regcharclass.pl
@@ -366,6 +366,7 @@ my %n2a; # Inversion of a2n, for each character set
sub new {
my $class= shift;
my %opt= @_;
+ my %hash_return;
for ( qw(op txt) ) {
die "in " . __PACKAGE__ . " constructor '$_;' is a mandatory field"
if !exists $opt{$_};
@@ -437,19 +438,31 @@ sub new {
die "eval '$1' failed: $@" if $@;
push @{$opt{txt}}, @results;
next;
+ } elsif ($str =~ / ^ % \s* ( .* ) /x) { # user-furnished sub() call
+ %hash_return = eval "$1";
+ die "eval '$1' failed: $@" if $@;
+ push @{$opt{txt}}, keys %hash_return;
+ die "Only one multi character expansion currently allowed per rule"
+ if $self->{multi_maps};
+ next;
} else {
die "Unparsable line: $txt\n";
}
my ( $cp, $cp_high, $low, $latin1, $utf8 )
= __uni_latin1($charset, $a2n, $str );
+ my $from;
+ if (defined $hash_return{"\"$str\""}) {
+ $from = $hash_return{"\"$str\""};
+ $from = $a2n->[$from] if $from < 256;
+ }
my $UTF8= $low || $utf8;
my $LATIN1= $low || $latin1;
my $high = (scalar grep { $_ < 256 } @$cp) ? 0 : $utf8;
#die Dumper($txt,$cp,$low,$latin1,$utf8)
# if $txt=~/NEL/ or $utf8 and @$utf8>3;
- @{ $self->{strs}{$str} }{qw( str txt low utf8 latin1 high cp cp_high UTF8 LATIN1 )}=
- ( $str, $txt, $low, $utf8, $latin1, $high, $cp, $cp_high, $UTF8, $LATIN1 );
+ @{ $self->{strs}{$str} }{qw( str txt low utf8 latin1 high cp cp_high UTF8 LATIN1 from )}=
+ ( $str, $txt, $low, $utf8, $latin1, $high, $cp, $cp_high, $UTF8, $LATIN1, $from );
my $rec= $self->{strs}{$str};
foreach my $key ( qw(low utf8 latin1 high cp cp_high UTF8 LATIN1) ) {
$self->{size}{$key}{ 0 + @{ $self->{strs}{$str}{$key} } }++
@@ -522,9 +535,6 @@ sub pop_count ($) {
sub _optree {
my ( $self, $trie, $test_type, $ret_type, $else, $depth )= @_;
return unless defined $trie;
- if ( $self->{has_multi} and $ret_type =~ /cp|both/ ) {
- die "Can't do 'cp' optree from multi-codepoint strings";
- }
$ret_type ||= 'len';
$else= 0 unless defined $else;
$depth= 0 unless defined $depth;
@@ -535,13 +545,16 @@ sub _optree {
if (exists $trie->{''} ) {
# we can now update the "else" value, anything failing to match
# after this point should return the value from this.
+ my $prefix = $self->{strs}{ $trie->{''} };
if ( $ret_type eq 'cp' ) {
- $else= $self->{strs}{ $trie->{''} }{cp}[0];
+ $else= $prefix->{from};
+ $else= $self->{strs}{ $trie->{''} }{cp}[0] unless defined $else;
$else= $self->val_fmt($else) if $else > 9;
} elsif ( $ret_type eq 'len' ) {
$else= $depth;
} elsif ( $ret_type eq 'both') {
- $else= $self->{strs}{ $trie->{''} }{cp}[0];
+ $else= $prefix->{from};
+ $else= $self->{strs}{ $trie->{''} }{cp}[0] unless defined $else;
$else= $self->val_fmt($else) if $else > 9;
$else= "len=$depth, $else";
}
@@ -1648,36 +1661,36 @@ QUOTEMETA: Meta-characters that \Q should quote
\p{_Perl_Quotemeta}
MULTI_CHAR_FOLD: multi-char strings that are folded to by a single character
-=> UTF8 :safe
-&regcharclass_multi_char_folds::multi_char_folds('u', 'a')
+=> UTF8 UTF8-cp :safe
+%regcharclass_multi_char_folds::multi_char_folds('u', 'a')
MULTI_CHAR_FOLD: multi-char strings that are folded to by a single character
-=> LATIN1 : safe
-&regcharclass_multi_char_folds::multi_char_folds('l', 'a')
+=> LATIN1 LATIN1-cp : safe
+%regcharclass_multi_char_folds::multi_char_folds('l', 'a')
THREE_CHAR_FOLD: A three-character multi-char fold
=> UTF8 :safe
-&regcharclass_multi_char_folds::multi_char_folds('u', '3')
+%regcharclass_multi_char_folds::multi_char_folds('u', '3')
THREE_CHAR_FOLD: A three-character multi-char fold
=> LATIN1 :safe
-&regcharclass_multi_char_folds::multi_char_folds('l', '3')
+%regcharclass_multi_char_folds::multi_char_folds('l', '3')
THREE_CHAR_FOLD_HEAD: The first two of three-character multi-char folds
=> UTF8 :safe
-&regcharclass_multi_char_folds::multi_char_folds('u', 'h')
+%regcharclass_multi_char_folds::multi_char_folds('u', 'h')
THREE_CHAR_FOLD_HEAD: The first two of three-character multi-char folds
=> LATIN1 :safe
-&regcharclass_multi_char_folds::multi_char_folds('l', 'h')
+%regcharclass_multi_char_folds::multi_char_folds('l', 'h')
#
#THREE_CHAR_FOLD_NON_FINAL: The first or middle character of multi-char folds
#=> UTF8 :safe
-#&regcharclass_multi_char_folds::multi_char_folds('u', 'fm')
+#%regcharclass_multi_char_folds::multi_char_folds('u', 'fm')
#
#THREE_CHAR_FOLD_NON_FINAL: The first or middle character of multi-char folds
#=> LATIN1 :safe
-#&regcharclass_multi_char_folds::multi_char_folds('l', 'fm')
+#%regcharclass_multi_char_folds::multi_char_folds('l', 'fm')
FOLDS_TO_MULTI: characters that fold to multi-char strings
=> UTF8 :fast