diff options
author | Karl Williamson <public@khwilliamson.com> | 2012-09-03 16:54:56 -0600 |
---|---|---|
committer | Karl Williamson <public@khwilliamson.com> | 2012-09-13 21:14:03 -0600 |
commit | b1af8fefbdf1c044271e0b9d8898e2d808ab7879 (patch) | |
tree | c4351ce0694ec4911516ebf3e2f7a420dcd81f57 /regen | |
parent | cc08b31c82fa64f8ce3a10d9c4b51581c07b2783 (diff) | |
download | perl-b1af8fefbdf1c044271e0b9d8898e2d808ab7879.tar.gz |
regen/regcharclass.pl: Add new output macro type
The new type 'high' is used only on above-Latin1 code points. It is
designed for code that already knows the tested code point is not in
the Latin1 range, and so avoids unnecessary tests.
Diffstat (limited to 'regen')
-rwxr-xr-x | regen/regcharclass.pl | 15 |
1 files changed, 10 insertions, 5 deletions
diff --git a/regen/regcharclass.pl b/regen/regcharclass.pl index c29af67cc3..1d4a9217ec 100755 --- a/regen/regcharclass.pl +++ b/regen/regcharclass.pl @@ -272,7 +272,7 @@ sub __cond_join { # # Each string is then stored in the 'strs' subhash as a hash record # made up of the results of __uni_latin1, using the keynames -# 'low','latin1','utf8', as well as the synthesized 'LATIN1' and +# 'low','latin1','utf8', as well as the synthesized 'LATIN1', 'high', and # 'UTF8' which hold a merge of 'low' and their lowercase equivelents. # # Size data is tracked per type in the 'size' subhash. @@ -343,13 +343,14 @@ sub new { my ( $cp, $low, $latin1, $utf8 )= __uni_latin1( $str ); my $UTF8= $low || $utf8; my $LATIN1= $low || $latin1; + my $high = (scalar grep { $_ < 256 } @$cp) ? 0 : $utf8; #die Dumper($txt,$cp,$low,$latin1,$utf8) # if $txt=~/NEL/ or $utf8 and @$utf8>3; - @{ $self->{strs}{$str} }{qw( str txt low utf8 latin1 cp UTF8 LATIN1 )}= - ( $str, $txt, $low, $utf8, $latin1, $cp, $UTF8, $LATIN1 ); + @{ $self->{strs}{$str} }{qw( str txt low utf8 latin1 high cp UTF8 LATIN1 )}= + ( $str, $txt, $low, $utf8, $latin1, $high, $cp, $UTF8, $LATIN1 ); my $rec= $self->{strs}{$str}; - foreach my $key ( qw(low utf8 latin1 cp UTF8 LATIN1) ) { + foreach my $key ( qw(low utf8 latin1 high cp UTF8 LATIN1) ) { $self->{size}{$key}{ 0 + @{ $self->{strs}{$str}{$key} } }++ if $self->{strs}{$str}{$key}; } @@ -653,7 +654,7 @@ sub render { # make a macro of a given type. # calls into make_trie and (generic_|length_)optree as needed # Opts are: -# type : 'cp','generic','low','latin1','utf8','LATIN1','UTF8' +# type : 'cp','generic','high','low','latin1','utf8','LATIN1','UTF8' # ret_type : 'cp' or 'len' # safe : add length guards to macro # @@ -810,6 +811,10 @@ if ( !caller ) { # latin1 generate a macro whose name is 'is_BASE_latin1' and defines a # class that includes only upper-Latin1-range chars. It is not # designed to take a UTF-8 input parameter. 
+# high generate a macro whose name is 'is_BASE_high' and defines a +# class that includes all relevant code points that are above +# the Latin1 range. This is for very specialized uses only. +# It is designed to take only an input UTF-8 parameter. # utf8 generate a macro whose name is 'is_BASE_utf8' and defines a # class that includes all relevant characters that aren't ASCII. # It is designed to take only an input UTF-8 parameter. |