diff options
author | Karl Williamson <khw@cpan.org> | 2020-10-18 10:20:38 -0600 |
---|---|---|
committer | Karl Williamson <khw@cpan.org> | 2020-12-19 20:45:42 -0700 |
commit | a50454ce0f79538d3cedde9015ef68d4d6dfb89f (patch) | |
tree | f6328fbadf94fe0c6e4c813f89d3be6d6cdddeff /regen/regcharclass.pl | |
parent | aceeeadc611887db471a0774dfaf0c05c75e5725 (diff) | |
download | perl-a50454ce0f79538d3cedde9015ef68d4d6dfb89f.tar.gz |
regcharclass.pl: Get code point folding to a seq
Previously regcharclass.pl could tell if an input string was a
multi-character fold of some Unicode code point. This commit adds the
ability to return what that code point is. This capability will be used
in a later commit.
Diffstat (limited to 'regen/regcharclass.pl')
-rwxr-xr-x | regen/regcharclass.pl | 47 |
1 files changed, 30 insertions, 17 deletions
diff --git a/regen/regcharclass.pl b/regen/regcharclass.pl index 8d97a79f7a..5fd8e255e3 100755 --- a/regen/regcharclass.pl +++ b/regen/regcharclass.pl @@ -366,6 +366,7 @@ my %n2a; # Inversion of a2n, for each character set sub new { my $class= shift; my %opt= @_; + my %hash_return; for ( qw(op txt) ) { die "in " . __PACKAGE__ . " constructor '$_;' is a mandatory field" if !exists $opt{$_}; @@ -437,19 +438,31 @@ sub new { die "eval '$1' failed: $@" if $@; push @{$opt{txt}}, @results; next; + } elsif ($str =~ / ^ % \s* ( .* ) /x) { # user-furnished sub() call + %hash_return = eval "$1"; + die "eval '$1' failed: $@" if $@; + push @{$opt{txt}}, keys %hash_return; + die "Only one multi character expansion currently allowed per rule" + if $self->{multi_maps}; + next; } else { die "Unparsable line: $txt\n"; } my ( $cp, $cp_high, $low, $latin1, $utf8 ) = __uni_latin1($charset, $a2n, $str ); + my $from; + if (defined $hash_return{"\"$str\""}) { + $from = $hash_return{"\"$str\""}; + $from = $a2n->[$from] if $from < 256; + } my $UTF8= $low || $utf8; my $LATIN1= $low || $latin1; my $high = (scalar grep { $_ < 256 } @$cp) ? 0 : $utf8; #die Dumper($txt,$cp,$low,$latin1,$utf8) # if $txt=~/NEL/ or $utf8 and @$utf8>3; - @{ $self->{strs}{$str} }{qw( str txt low utf8 latin1 high cp cp_high UTF8 LATIN1 )}= - ( $str, $txt, $low, $utf8, $latin1, $high, $cp, $cp_high, $UTF8, $LATIN1 ); + @{ $self->{strs}{$str} }{qw( str txt low utf8 latin1 high cp cp_high UTF8 LATIN1 from )}= + ( $str, $txt, $low, $utf8, $latin1, $high, $cp, $cp_high, $UTF8, $LATIN1, $from ); my $rec= $self->{strs}{$str}; foreach my $key ( qw(low utf8 latin1 high cp cp_high UTF8 LATIN1) ) { $self->{size}{$key}{ 0 + @{ $self->{strs}{$str}{$key} } }++ @@ -522,9 +535,6 @@ sub pop_count ($) { sub _optree { my ( $self, $trie, $test_type, $ret_type, $else, $depth )= @_; return unless defined $trie; - if ( $self->{has_multi} and $ret_type =~ /cp|both/ ) { - die "Can't do 'cp' optree from multi-codepoint strings"; - } $ret_type ||= 'len'; $else= 0 unless defined $else; $depth= 0 unless defined $depth; @@ -535,13 +545,16 @@ sub _optree { if (exists $trie->{''} ) { # we can now update the "else" value, anything failing to match # after this point should return the value from this. + my $prefix = $self->{strs}{ $trie->{''} }; if ( $ret_type eq 'cp' ) { - $else= $self->{strs}{ $trie->{''} }{cp}[0]; + $else= $prefix->{from}; + $else= $self->{strs}{ $trie->{''} }{cp}[0] unless defined $else; $else= $self->val_fmt($else) if $else > 9; } elsif ( $ret_type eq 'len' ) { $else= $depth; } elsif ( $ret_type eq 'both') { - $else= $self->{strs}{ $trie->{''} }{cp}[0]; + $else= $prefix->{from}; + $else= $self->{strs}{ $trie->{''} }{cp}[0] unless defined $else; $else= $self->val_fmt($else) if $else > 9; $else= "len=$depth, $else"; } @@ -1648,36 +1661,36 @@ QUOTEMETA: Meta-characters that \Q should quote \p{_Perl_Quotemeta} MULTI_CHAR_FOLD: multi-char strings that are folded to by a single character -=> UTF8 :safe -®charclass_multi_char_folds::multi_char_folds('u', 'a') +=> UTF8 UTF8-cp :safe +%regcharclass_multi_char_folds::multi_char_folds('u', 'a') MULTI_CHAR_FOLD: multi-char strings that are folded to by a single character -=> LATIN1 : safe -®charclass_multi_char_folds::multi_char_folds('l', 'a') +=> LATIN1 LATIN1-cp : safe +%regcharclass_multi_char_folds::multi_char_folds('l', 'a') THREE_CHAR_FOLD: A three-character multi-char fold => UTF8 :safe -®charclass_multi_char_folds::multi_char_folds('u', '3') +%regcharclass_multi_char_folds::multi_char_folds('u', '3') THREE_CHAR_FOLD: A three-character multi-char fold => LATIN1 :safe -®charclass_multi_char_folds::multi_char_folds('l', '3') +%regcharclass_multi_char_folds::multi_char_folds('l', '3') THREE_CHAR_FOLD_HEAD: The first two of three-character multi-char folds => UTF8 :safe -®charclass_multi_char_folds::multi_char_folds('u', 'h') +%regcharclass_multi_char_folds::multi_char_folds('u', 'h') THREE_CHAR_FOLD_HEAD: The first two of three-character multi-char folds => LATIN1 :safe -®charclass_multi_char_folds::multi_char_folds('l', 'h') +%regcharclass_multi_char_folds::multi_char_folds('l', 'h') # #THREE_CHAR_FOLD_NON_FINAL: The first or middle character of multi-char folds #=> UTF8 :safe -#®charclass_multi_char_folds::multi_char_folds('u', 'fm') +#%regcharclass_multi_char_folds::multi_char_folds('u', 'fm') # #THREE_CHAR_FOLD_NON_FINAL: The first or middle character of multi-char folds #=> LATIN1 :safe -#®charclass_multi_char_folds::multi_char_folds('l', 'fm') +#%regcharclass_multi_char_folds::multi_char_folds('l', 'fm') FOLDS_TO_MULTI: characters that fold to multi-char strings => UTF8 :fast |