summary refs log tree commit diff
path: root/regen
diff options
context:
space:
mode:
author Karl Williamson <public@khwilliamson.com> 2012-09-03 16:54:56 -0600
committer Karl Williamson <public@khwilliamson.com> 2012-09-13 21:14:03 -0600
commit b1af8fefbdf1c044271e0b9d8898e2d808ab7879 (patch)
tree c4351ce0694ec4911516ebf3e2f7a420dcd81f57 /regen
parent cc08b31c82fa64f8ce3a10d9c4b51581c07b2783 (diff)
download perl-b1af8fefbdf1c044271e0b9d8898e2d808ab7879.tar.gz
regen/regcharclass.pl: Add new output macro type
The new type 'high' is used only on above-Latin1 code points. It is designed for code that already knows the tested code point is not in the Latin1 range, and so avoids unnecessary tests.
Diffstat (limited to 'regen')
-rwxr-xr-x regen/regcharclass.pl 15
1 file changed, 10 insertions, 5 deletions
diff --git a/regen/regcharclass.pl b/regen/regcharclass.pl
index c29af67cc3..1d4a9217ec 100755
--- a/regen/regcharclass.pl
+++ b/regen/regcharclass.pl
@@ -272,7 +272,7 @@ sub __cond_join {
#
# Each string is then stored in the 'strs' subhash as a hash record
# made up of the results of __uni_latin1, using the keynames
-# 'low','latin1','utf8', as well as the synthesized 'LATIN1' and
+# 'low','latin1','utf8', as well as the synthesized 'LATIN1', 'high', and
# 'UTF8' which hold a merge of 'low' and their lowercase equivelents.
#
# Size data is tracked per type in the 'size' subhash.
@@ -343,13 +343,14 @@ sub new {
my ( $cp, $low, $latin1, $utf8 )= __uni_latin1( $str );
my $UTF8= $low || $utf8;
my $LATIN1= $low || $latin1;
+ my $high = (scalar grep { $_ < 256 } @$cp) ? 0 : $utf8;
#die Dumper($txt,$cp,$low,$latin1,$utf8)
# if $txt=~/NEL/ or $utf8 and @$utf8>3;
- @{ $self->{strs}{$str} }{qw( str txt low utf8 latin1 cp UTF8 LATIN1 )}=
- ( $str, $txt, $low, $utf8, $latin1, $cp, $UTF8, $LATIN1 );
+ @{ $self->{strs}{$str} }{qw( str txt low utf8 latin1 high cp UTF8 LATIN1 )}=
+ ( $str, $txt, $low, $utf8, $latin1, $high, $cp, $UTF8, $LATIN1 );
my $rec= $self->{strs}{$str};
- foreach my $key ( qw(low utf8 latin1 cp UTF8 LATIN1) ) {
+ foreach my $key ( qw(low utf8 latin1 high cp UTF8 LATIN1) ) {
$self->{size}{$key}{ 0 + @{ $self->{strs}{$str}{$key} } }++
if $self->{strs}{$str}{$key};
}
@@ -653,7 +654,7 @@ sub render {
# make a macro of a given type.
# calls into make_trie and (generic_|length_)optree as needed
# Opts are:
-# type : 'cp','generic','low','latin1','utf8','LATIN1','UTF8'
+# type : 'cp','generic','high','low','latin1','utf8','LATIN1','UTF8'
# ret_type : 'cp' or 'len'
# safe : add length guards to macro
#
@@ -810,6 +811,10 @@ if ( !caller ) {
# latin1 generate a macro whose name is 'is_BASE_latin1' and defines a
# class that includes only upper-Latin1-range chars. It is not
# designed to take a UTF-8 input parameter.
+# high generate a macro whose name is 'is_BASE_high' and defines a
+# class that includes all relevant code points that are above
+# the Latin1 range. This is for very specialized uses only.
+# It is designed to take only an input UTF-8 parameter.
# utf8 generate a macro whose name is 'is_BASE_utf8' and defines a
# class that includes all relevant characters that aren't ASCII.
# It is designed to take only an input UTF-8 parameter.