summaryrefslogtreecommitdiff
path: root/regen
diff options
context:
space:
mode:
Diffstat (limited to 'regen')
-rwxr-xr-xregen/regcharclass.pl653
-rw-r--r--regen/unicode_constants.pl146
-rw-r--r--regen/utf8_strings.pl108
3 files changed, 693 insertions, 214 deletions
diff --git a/regen/regcharclass.pl b/regen/regcharclass.pl
index faf1572b7c..7d126428ef 100755
--- a/regen/regcharclass.pl
+++ b/regen/regcharclass.pl
@@ -4,11 +4,12 @@ use 5.008;
use warnings;
use warnings FATAL => 'all';
use Text::Wrap qw(wrap);
-use Encode;
use Data::Dumper;
$Data::Dumper::Useqq= 1;
our $hex_fmt= "0x%02X";
+sub ASCII_PLATFORM { (ord('A') == 65) }
+
require 'regen/regen_lib.pl';
=head1 NAME
@@ -23,36 +24,47 @@ CharClass::Matcher -- Generate C macros that match character classes efficiently
Dynamically generates macros for detecting special charclasses
in latin-1, utf8, and codepoint forms. Macros can be set to return
-the length (in bytes) of the matched codepoint, or the codepoint itself.
+the length (in bytes) of the matched codepoint, and/or the codepoint itself.
-To regenerate regcharclass.h, run this script from perl-root. No arguments
+To regenerate F<regcharclass.h>, run this script from perl-root. No arguments
are necessary.
-Using WHATEVER as an example the following macros will be produced:
+Using WHATEVER as an example the following macros can be produced, depending
+on the input parameters (how to get each is described by internal comments at
+the C<__DATA__> line):
=over 4
-=item is_WHATEVER(s,is_utf8)
+=item C<is_WHATEVER(s,is_utf8)>
-=item is_WHATEVER_safe(s,e,is_utf8)
+=item C<is_WHATEVER_safe(s,e,is_utf8)>
-Do a lookup as appropriate based on the is_utf8 flag. When possible
-comparisons involving octect<128 are done before checking the is_utf8
+Do a lookup as appropriate based on the C<is_utf8> flag. When possible
+comparisons involving octect<128 are done before checking the C<is_utf8>
flag, hopefully saving time.
-=item is_WHATEVER_utf8(s)
+The version without the C<_safe> suffix should be used only when the input is
+known to be well-formed.
+
+=item C<is_WHATEVER_utf8(s)>
-=item is_WHATEVER_utf8_safe(s,e)
+=item C<is_WHATEVER_utf8_safe(s,e)>
Do a lookup assuming the string is encoded in (normalized) UTF8.
-=item is_WHATEVER_latin1(s)
+The version without the C<_safe> suffix should be used only when the input is
+known to be well-formed.
+
+=item C<is_WHATEVER_latin1(s)>
-=item is_WHATEVER_latin1_safe(s,e)
+=item C<is_WHATEVER_latin1_safe(s,e)>
Do a lookup assuming the string is encoded in latin-1 (aka plan octets).
-=item is_WHATEVER_cp(cp)
+The version without the C<_safe> suffix should be used only when it is known
+that C<s> contains at least one character.
+
+=item C<is_WHATEVER_cp(cp)>
Check to see if the string matches a given codepoint (hypothetically a
U32). The condition is constructed as as to "break out" as early as
@@ -65,11 +77,34 @@ IOW:
Thus if the character is X+1 only two comparisons will be done. Making
matching lookups slower, but non-matching faster.
-=back
+=item C<what_len_WHATEVER_FOO(arg1, ..., len)>
+
+A variant form of each of the macro types described above can be generated, in
+which the code point is returned by the macro, and an extra parameter (in the
+final position) is added, which is a pointer for the macro to set the byte
+length of the returned code point.
-Additionally it is possible to generate C<what_> variants that return
-the codepoint read instead of the number of octets read, this can be
-done by suffixing '-cp' to the type description.
+These forms all have a C<what_len> prefix instead of the C<is_>, for example
+C<what_len_WHATEVER_safe(s,e,is_utf8,len)> and
+C<what_len_WHATEVER_utf8(s,len)>.
+
+These forms should not be used I<except> on small sets of mostly widely
+separated code points; otherwise the code generated is inefficient. For these
+cases, it is best to use the C<is_> forms, and then find the code point with
+C<utf8_to_uvchr_buf>(). This program can fail with a "deep recursion"
+message on the worst of the inappropriate sets. Examine the generated macro
+to see if it is acceptable.
+
+=item C<what_WHATEVER_FOO(arg1, ...)>
+
+A variant form of each of the C<is_> macro types described above can be generated, in
+which the code point and not the length is returned by the macro. These have
+the same caveat as L</what_len_WHATEVER_FOO(arg1, ..., len)>, plus they should
+not be used where the set contains a NULL, as 0 is returned for two different
+cases: a) the set doesn't include the input code point; b) the set does
+include it, and it is a NULL.
+
+=back
=head2 CODE FORMAT
@@ -78,7 +113,7 @@ perltidy -st -bt=1 -bbt=0 -pt=0 -sbt=1 -ce -nwls== "%f"
=head1 AUTHOR
-Author: Yves Orton (demerphq) 2007
+Author: Yves Orton (demerphq) 2007. Maintained by Perl5 Porters.
=head1 BUGS
@@ -107,14 +142,16 @@ License or the Artistic License, as specified in the README file.
# represent the string in some given encoding with specific conditions.
#
# $cp - list of codepoints that make up the string.
-# $n - list of octets that make up the string if all codepoints < 128
+# $n - list of octets that make up the string if all codepoints are invariant
+# regardless of if the string is in UTF-8 or not.
# $l - list of octets that make up the string in latin1 encoding if all
-# codepoints < 256, and at least one codepoint is >127.
-# $u - list of octets that make up the string in utf8 if any codepoint >127
+# codepoints < 256, and at least one codepoint is UTF-8 variant.
+# $u - list of octets that make up the string in utf8 if any codepoint is
+# UTF-8 variant
#
# High CP | Defined
#-----------+----------
-# 0 - 127 : $n
+# 0 - 127 : $n (127/128 are the values for ASCII platforms)
# 128 - 255 : $l, $u
# 256 - ... : $u
#
@@ -123,22 +160,33 @@ sub __uni_latin1 {
my $str= shift;
my $max= 0;
my @cp;
+ my $only_has_invariants = 1;
for my $ch ( split //, $str ) {
my $cp= ord $ch;
push @cp, $cp;
$max= $cp if $max < $cp;
+ if (! ASCII_PLATFORM && $only_has_invariants) {
+ if ($cp > 255) {
+ $only_has_invariants = 0;
+ }
+ else {
+ my $temp = chr($cp);
+ utf8::upgrade($temp);
+ my @utf8 = unpack "U0C*", $temp;
+ $only_has_invariants = (@utf8 == 1 && $utf8[0] == $cp);
+ }
+ }
}
my ( $n, $l, $u );
- if ( $max < 128 ) {
+ $only_has_invariants = $max < 128 if ASCII_PLATFORM;
+ if ($only_has_invariants) {
$n= [@cp];
} else {
$l= [@cp] if $max && $max < 256;
- my $copy= $str; # must copy string, FB_CROAK makes encode destructive
- $u= eval { Encode::encode( "utf8", $copy, Encode::FB_CROAK ) };
- # $u is utf8 but with the utf8 flag OFF
- # therefore "C*" gets us the values of the bytes involved.
- $u= [ unpack "C*", $u ] if defined $u;
+ $u= $str;
+ utf8::upgrade($u);
+ $u= [ unpack "U0C*", $u ] if defined $u;
}
return ( \@cp, $n, $l, $u );
}
@@ -224,7 +272,7 @@ sub __cond_join {
#
# Each string is then stored in the 'strs' subhash as a hash record
# made up of the results of __uni_latin1, using the keynames
-# 'low','latin1','utf8', as well as the synthesized 'LATIN1' and
+# 'low','latin1','utf8', as well as the synthesized 'LATIN1', 'high', and
# 'UTF8' which hold a merge of 'low' and their lowercase equivelents.
#
# Size data is tracked per type in the 'size' subhash.
@@ -247,23 +295,62 @@ sub new {
my $str= $txt;
if ( $str =~ /^[""]/ ) {
$str= eval $str;
- } elsif ( $str =~ /^0x/ ) {
+ } elsif ($str =~ / - /x ) { # A range: Replace this element on the
+ # list with its expansion
+ my ($lower, $upper) = $str =~ / 0x (.+?) \s* - \s* 0x (.+) /x;
+ die "Format must be like '0xDEAD - 0xBEAF'; instead was '$str'" if ! defined $lower || ! defined $upper;
+ foreach my $cp (hex $lower .. hex $upper) {
+ push @{$opt{txt}}, sprintf "0x%X", $cp;
+ }
+ next;
+ } elsif ($str =~ s/ ^ N (?= 0x ) //x ) {
+ # Otherwise undocumented, a leading N means is already in the
+ # native character set; don't convert.
$str= chr eval $str;
- } elsif ( /\S/ ) {
- die "Unparsable line: $txt\n";
- } else {
+ } elsif ( $str =~ /^0x/ ) {
+ $str= eval $str;
+
+ # Convert from Unicode/ASCII to native, if necessary
+ $str = utf8::unicode_to_native($str) if ! ASCII_PLATFORM
+ && $str <= 0xFF;
+ $str = chr $str;
+ } elsif ( $str =~ / \s* \\p \{ ( .*? ) \} /x) {
+ my $property = $1;
+ use Unicode::UCD qw(prop_invlist);
+
+ my @invlist = prop_invlist($property, '_perl_core_internal_ok');
+ if (! @invlist) {
+
+ # An empty return could mean an unknown property, or merely
+ # that it is empty. Call in scalar context to differentiate
+ my $count = prop_invlist($property, '_perl_core_internal_ok');
+ die "$property not found" unless defined $count;
+ }
+
+ # Replace this element on the list with the property's expansion
+ for (my $i = 0; $i < @invlist; $i += 2) {
+ foreach my $cp ($invlist[$i] .. $invlist[$i+1] - 1) {
+
+ # prop_invlist() returns native values; add leading 'N'
+ # to indicate that.
+ push @{$opt{txt}}, sprintf "N0x%X", $cp;
+ }
+ }
next;
+ } else {
+ die "Unparsable line: $txt\n";
}
my ( $cp, $low, $latin1, $utf8 )= __uni_latin1( $str );
my $UTF8= $low || $utf8;
my $LATIN1= $low || $latin1;
+ my $high = (scalar grep { $_ < 256 } @$cp) ? 0 : $utf8;
#die Dumper($txt,$cp,$low,$latin1,$utf8)
# if $txt=~/NEL/ or $utf8 and @$utf8>3;
- @{ $self->{strs}{$str} }{qw( str txt low utf8 latin1 cp UTF8 LATIN1 )}=
- ( $str, $txt, $low, $utf8, $latin1, $cp, $UTF8, $LATIN1 );
+ @{ $self->{strs}{$str} }{qw( str txt low utf8 latin1 high cp UTF8 LATIN1 )}=
+ ( $str, $txt, $low, $utf8, $latin1, $high, $cp, $UTF8, $LATIN1 );
my $rec= $self->{strs}{$str};
- foreach my $key ( qw(low utf8 latin1 cp UTF8 LATIN1) ) {
+ foreach my $key ( qw(low utf8 latin1 high cp UTF8 LATIN1) ) {
$self->{size}{$key}{ 0 + @{ $self->{strs}{$str}{$key} } }++
if $self->{strs}{$str}{$key};
}
@@ -308,6 +395,22 @@ sub make_trie {
return 0 + keys( %trie ) ? \%trie : undef;
}
+sub pop_count ($) {
+ my $word = shift;
+
+ # This returns a list of the positions of the bits in the input word that
+ # are 1.
+
+ my @positions;
+ my $position = 0;
+ while ($word) {
+ push @positions, $position if $word & 1;
+ $position++;
+ $word >>= 1;
+ }
+ return @positions;
+}
+
# my $optree= _optree()
#
# recursively convert a trie to an optree where every node represents
@@ -326,7 +429,7 @@ sub _optree {
$depth= 0 unless defined $depth;
my @conds= sort { $a <=> $b } grep { length $_ } keys %$trie;
- if ( $trie->{''} ) {
+ if (exists $trie->{''} ) {
if ( $ret_type eq 'cp' ) {
$else= $self->{strs}{ $trie->{''} }{cp}[0];
$else= sprintf "$self->{val_fmt}", $else if $else > 9;
@@ -454,19 +557,121 @@ sub length_optree {
return $else;
}
+sub calculate_mask(@) {
+ my @list = @_;
+ my $list_count = @list;
+
+ # Look at the input list of byte values. This routine sees if the set
+ # consisting of those bytes is exactly determinable by using a
+ # mask/compare operation. If not, it returns an empty list; if so, it
+ # returns a list consisting of (mask, compare). For example, consider a
+ # set consisting of the numbers 0xF0, 0xF1, 0xF2, and 0xF3. If we want to
+ # know if a number 'c' is in the set, we could write:
+ # 0xF0 <= c && c <= 0xF4
+ # But the following mask/compare also works, and has just one test:
+ # c & 0xFC == 0xF0
+ # The reason it works is that the set consists of exactly those numbers
+ # whose first 4 bits are 1, and the next two are 0. (The value of the
+ # other 2 bits is immaterial in determining if a number is in the set or
+ # not.) The mask masks out those 2 irrelevant bits, and the comparison
+ # makes sure that the result matches all bytes that which match those 6
+ # material bits exactly. In other words, the set of numbers contains
+ # exactly those whose bottom two bit positions are either 0 or 1. The
+ # same principle applies to bit positions that are not necessarily
+ # adjacent. And it can be applied to bytes that differ in 1 through all 8
+ # bit positions. In order to be a candidate for this optimization, the
+ # number of numbers in the test must be a power of 2. Based on this
+ # count, we know the number of bit positions that must differ.
+ my $bit_diff_count = 0;
+ my $compare = $list[0];
+ if ($list_count == 2) {
+ $bit_diff_count = 1;
+ }
+ elsif ($list_count == 4) {
+ $bit_diff_count = 2;
+ }
+ elsif ($list_count == 8) {
+ $bit_diff_count = 3;
+ }
+ elsif ($list_count == 16) {
+ $bit_diff_count = 4;
+ }
+ elsif ($list_count == 32) {
+ $bit_diff_count = 5;
+ }
+ elsif ($list_count == 64) {
+ $bit_diff_count = 6;
+ }
+ elsif ($list_count == 128) {
+ $bit_diff_count = 7;
+ }
+ elsif ($list_count == 256) {
+ return (0, 0);
+ }
+
+ # If the count wasn't a power of 2, we can't apply this optimization
+ return if ! $bit_diff_count;
+
+ my %bit_map;
+
+ # For each byte in the list, find the bit positions in it whose value
+ # differs from the first byte in the set.
+ for (my $i = 1; $i < @list; $i++) {
+ my @positions = pop_count($list[0] ^ $list[$i]);
+
+ # If the number of differing bits is greater than those permitted by
+ # the set size, this optimization doesn't apply.
+ return if @positions > $bit_diff_count;
+
+ # Save the bit positions that differ.
+ foreach my $bit (@positions) {
+ $bit_map{$bit} = 1;
+ }
+
+ # If the total so far is greater than those permitted by the set size,
+ # this optimization doesn't apply.
+ return if keys %bit_map > $bit_diff_count;
+
+
+ # The value to compare against is the AND of all the members of the
+ # set. The bit positions that are the same in all will be correct in
+ # the AND, and the bit positions that differ will be 0.
+ $compare &= $list[$i];
+ }
+
+ # To get to here, we have gone through all bytes in the set,
+ # and determined that they all differ from each other in at most
+ # the number of bits allowed for the set's quantity. And since we have
+ # tested all 2**N possibilities, we know that the set includes no fewer
+ # elements than we need,, so the optimization applies.
+ die "panic: internal logic error" if keys %bit_map != $bit_diff_count;
+
+ # The mask is the bit positions where things differ, complemented.
+ my $mask = 0;
+ foreach my $position (keys %bit_map) {
+ $mask |= 1 << $position;
+ }
+ $mask = ~$mask & 0xFF;
+
+ return ($mask, $compare);
+}
+
# _cond_as_str
# turn a list of conditions into a text expression
# - merges ranges of conditions, and joins the result with ||
sub _cond_as_str {
- my ( $self, $op, $combine )= @_;
+ my ( $self, $op, $combine, $opts_ref )= @_;
my $cond= $op->{vals};
my $test= $op->{test};
+ my $is_cp_ret = $opts_ref->{ret_type} eq "cp";
return "( $test )" if !defined $cond;
- # rangify the list
+ # rangify the list.
my @ranges;
my $Update= sub {
- if ( @ranges ) {
+ # We skip this if there are optimizations that
+ # we can apply (below) to the individual ranges
+ if ( ($is_cp_ret || $combine) && @ranges && ref $ranges[-1]) {
if ( $ranges[-1][0] == $ranges[-1][1] ) {
$ranges[-1]= $ranges[-1][0];
} elsif ( $ranges[-1][0] + 1 == $ranges[-1][1] ) {
@@ -475,25 +680,129 @@ sub _cond_as_str {
}
}
};
- for my $cond ( @$cond ) {
- if ( !@ranges || $cond != $ranges[-1][1] + 1 ) {
+ for my $condition ( @$cond ) {
+ if ( !@ranges || $condition != $ranges[-1][1] + 1 ) {
$Update->();
- push @ranges, [ $cond, $cond ];
+ push @ranges, [ $condition, $condition ];
} else {
$ranges[-1][1]++;
}
}
$Update->();
+
return $self->_combine( $test, @ranges )
if $combine;
- @ranges= map {
- ref $_
- ? sprintf(
- "( $self->{val_fmt} <= $test && $test <= $self->{val_fmt} )",
- @$_ )
- : sprintf( "$self->{val_fmt} == $test", $_ );
- } @ranges;
+
+ if ($is_cp_ret) {
+ @ranges= map {
+ ref $_
+ ? sprintf(
+ "( $self->{val_fmt} <= $test && $test <= $self->{val_fmt} )",
+ @$_ )
+ : sprintf( "$self->{val_fmt} == $test", $_ );
+ } @ranges;
+ }
+ else {
+ # If the input set has certain characteristics, we can optimize tests
+ # for it. This doesn't apply if returning the code point, as we want
+ # each element of the set individually. The code above is for this
+ # simpler case.
+
+ return 1 if @$cond == 256; # If all bytes match, is trivially true
+
+ if (@ranges > 1) {
+ # See if the entire set shares optimizable characterstics, and if
+ # so, return the optimization. We delay checking for this on sets
+ # with just a single range, as there may be better optimizations
+ # available in that case.
+ my ($mask, $base) = calculate_mask(@$cond);
+ if (defined $mask && defined $base) {
+ return sprintf "( ( $test & $self->{val_fmt} ) == $self->{val_fmt} )", $mask, $base;
+ }
+ }
+
+ # Here, there was no entire-class optimization. Look at each range.
+ for (my $i = 0; $i < @ranges; $i++) {
+ if (! ref $ranges[$i]) { # Trivial case: no range
+ $ranges[$i] = sprintf "$self->{val_fmt} == $test", $ranges[$i];
+ }
+ elsif ($ranges[$i]->[0] == $ranges[$i]->[1]) {
+ $ranges[$i] = # Trivial case: single element range
+ sprintf "$self->{val_fmt} == $test", $ranges[$i]->[0];
+ }
+ else {
+ my $output = "";
+
+ # Well-formed UTF-8 continuation bytes on ascii platforms must
+ # be in the range 0x80 .. 0xBF. If we know that the input is
+ # well-formed (indicated by not trying to be 'safe'), we can
+ # omit tests that verify that the input is within either of
+ # these bounds. (No legal UTF-8 character can begin with
+ # anything in this range, so we don't have to worry about this
+ # being a continuation byte or not.)
+ if (ASCII_PLATFORM
+ && ! $opts_ref->{safe}
+ && $opts_ref->{type} =~ / ^ (?: utf8 | high ) $ /xi)
+ {
+ my $lower_limit_is_80 = ($ranges[$i]->[0] == 0x80);
+ my $upper_limit_is_BF = ($ranges[$i]->[1] == 0xBF);
+
+ # If the range is the entire legal range, it matches any
+ # legal byte, so we can omit both tests. (This should
+ # happen only if the number of ranges is 1.)
+ if ($lower_limit_is_80 && $upper_limit_is_BF) {
+ return 1;
+ }
+ elsif ($lower_limit_is_80) { # Just use the upper limit test
+ $output = sprintf("( $test <= $self->{val_fmt} )",
+ $ranges[$i]->[1]);
+ }
+ elsif ($upper_limit_is_BF) { # Just use the lower limit test
+ $output = sprintf("( $test >= $self->{val_fmt} )",
+ $ranges[$i]->[0]);
+ }
+ }
+
+ # If we didn't change to omit a test above, see if the number
+ # of elements is a power of 2 (only a single bit in the
+ # representation of its count will be set) and if so, it may
+ # be that a mask/compare optimization is possible.
+ if ($output eq ""
+ && pop_count($ranges[$i]->[1] - $ranges[$i]->[0] + 1) == 1)
+ {
+ my @list;
+ push @list, $_ for ($ranges[$i]->[0] .. $ranges[$i]->[1]);
+ my ($mask, $base) = calculate_mask(@list);
+ if (defined $mask && defined $base) {
+ $output = sprintf "( $test & $self->{val_fmt} ) == $self->{val_fmt}", $mask, $base;
+ }
+ }
+
+ if ($output ne "") { # Prefer any optimization
+ $ranges[$i] = $output;
+ }
+ elsif ($ranges[$i]->[0] + 1 == $ranges[$i]->[1]) {
+ # No optimization happened. We need a test that the code
+ # point is within both bounds. But, if the bounds are
+ # adjacent code points, it is cleaner to say
+ # 'first == test || second == test'
+ # than it is to say
+ # 'first <= test && test <= second'
+ $ranges[$i] = "( "
+ . join( " || ", ( map
+ { sprintf "$self->{val_fmt} == $test", $_ }
+ @{$ranges[$i]} ) )
+ . " )";
+ }
+ else { # Full bounds checking
+ $ranges[$i] = sprintf("( $self->{val_fmt} <= $test && $test <= $self->{val_fmt} )", $ranges[$i]->[0], $ranges[$i]->[1]);
+ }
+ }
+ }
+ }
+
return "( " . join( " || ", @ranges ) . " )";
+
}
# _combine
@@ -524,13 +833,18 @@ sub _combine {
# _render()
# recursively convert an optree to text with reasonably neat formatting
sub _render {
- my ( $self, $op, $combine, $brace )= @_;
+ my ( $self, $op, $combine, $brace, $opts_ref )= @_;
+ return 0 if ! defined $op; # The set is empty
if ( !ref $op ) {
return $op;
}
- my $cond= $self->_cond_as_str( $op, $combine );
- my $yes= $self->_render( $op->{yes}, $combine, 1 );
- my $no= $self->_render( $op->{no}, $combine, 0 );
+ my $cond= $self->_cond_as_str( $op, $combine, $opts_ref );
+ #no warnings 'recursion'; # This would allow really really inefficient
+ # code to be generated. See pod
+ my $yes= $self->_render( $op->{yes}, $combine, 1, $opts_ref );
+ return $yes if $cond eq '1';
+
+ my $no= $self->_render( $op->{no}, $combine, 0, $opts_ref );
return "( $cond )" if $yes eq '1' and $no eq '0';
my ( $lb, $rb )= $brace ? ( "( ", " )" ) : ( "", "" );
return "$lb$cond ? $yes : $no$rb"
@@ -555,8 +869,8 @@ sub _render {
# longer lists such as that resulting from type 'cp' output.
# Currently only used for type 'cp' macros.
sub render {
- my ( $self, $op, $combine )= @_;
- my $str= "( " . $self->_render( $op, $combine ) . " )";
+ my ( $self, $op, $combine, $opts_ref )= @_;
+ my $str= "( " . $self->_render( $op, $combine, 0, $opts_ref ) . " )";
return __clean( $str );
}
@@ -564,7 +878,7 @@ sub render {
# make a macro of a given type.
# calls into make_trie and (generic_|length_)optree as needed
# Opts are:
-# type : 'cp','generic','low','latin1','utf8','LATIN1','UTF8'
+# type : 'cp','generic','high','low','latin1','utf8','LATIN1','UTF8'
# ret_type : 'cp' or 'len'
# safe : add length guards to macro
#
@@ -595,7 +909,7 @@ sub make_macro {
$method= 'optree';
}
my $optree= $self->$method( %opts, type => $type, ret_type => $ret_type );
- my $text= $self->render( $optree, $type eq 'cp' );
+ my $text= $self->render( $optree, $type eq 'cp', \%opts );
my @args= $type eq 'cp' ? 'cp' : 's';
push @args, "e" if $opts{safe};
push @args, "is_utf8" if $type eq 'generic';
@@ -626,21 +940,38 @@ if ( !caller ) {
print $out_fh read_only_top( lang => 'C', by => $0,
file => 'regcharclass.h', style => '*',
copyright => [2007, 2011] );
+ print $out_fh "\n#ifndef H_REGCHARCLASS /* Guard against nested #includes */\n#define H_REGCHARCLASS 1\n\n";
- my ( $op, $title, @txt, @types, @mods );
+ my ( $op, $title, @txt, @types, %mods );
my $doit= sub {
return unless $op;
+
+ # Skip if to compile on a different platform.
+ return if delete $mods{only_ascii_platform} && ! ASCII_PLATFORM;
+ return if delete $mods{only_ebcdic_platform} && ord 'A' != 193;
+
print $out_fh "/*\n\t$op: $title\n\n";
print $out_fh join "\n", ( map { "\t$_" } @txt ), "*/", "";
my $obj= __PACKAGE__->new( op => $op, title => $title, txt => \@txt );
- #die Dumper(\@types,\@mods);
+ #die Dumper(\@types,\%mods);
+
+ my @mods;
+ push @mods, 'safe' if delete $mods{safe};
+ unshift @mods, 'fast' if delete $mods{fast} || ! @mods; # Default to 'fast'
+ # do this one
+ # first, as
+ # traditional
+ if (%mods) {
+ die "Unknown modifiers: ", join ", ", map { "'$_'" } keys %mods;
+ }
foreach my $type_spec ( @types ) {
my ( $type, $ret )= split /-/, $type_spec;
$ret ||= 'len';
foreach my $mod ( @mods ) {
next if $mod eq 'safe' and $type eq 'cp';
+ delete $mods{$mod};
my $macro= $obj->make_macro(
type => $type,
ret_type => $ret,
@@ -652,22 +983,26 @@ if ( !caller ) {
};
while ( <DATA> ) {
- s/^\s*#//;
+ s/^ \s* (?: \# .* ) ? $ //x; # squeeze out comment and blanks
next unless /\S/;
chomp;
if ( /^([A-Z]+)/ ) {
- $doit->();
+ $doit->(); # This starts a new definition; do the previous one
( $op, $title )= split /\s*:\s*/, $_, 2;
@txt= ();
} elsif ( s/^=>// ) {
my ( $type, $modifier )= split /:/, $_;
@types= split ' ', $type;
- @mods= split ' ', $modifier;
+ undef %mods;
+ map { $mods{$_} = 1 } split ' ', $modifier;
} else {
push @txt, "$_";
}
}
$doit->();
+
+ print $out_fh "\n#endif /* H_REGCHARCLASS */\n";
+
if($path eq '-') {
print $out_fh "/* ex: set ro: */\n";
} else {
@@ -675,16 +1010,95 @@ if ( !caller ) {
}
}
+# The form of the input is a series of definitions to make macros for.
+# The first line gives the base name of the macro, followed by a colon, and
+# then text to be used in comments associated with the macro that are its
+# title or description. In all cases the first (perhaps only) parameter to
+# the macro is a pointer to the first byte of the code point it is to test to
+# see if it is in the class determined by the macro. In the case of non-UTF8,
+# the code point consists only of a single byte.
#
-# Valid types: generic, LATIN1, UTF8, low, latin1, utf8
-# default return value is octects read.
-# append -cp to make it codepoint matched.
-# modifiers come after the colon, valid possibilities
-# being 'fast' and 'safe'.
+# The second line must begin with a '=>' and be followed by the types of
+# macro(s) to be generated; these are specified below. A colon follows the
+# types, followed by the modifiers, also specified below. At least one
+# modifier is required.
#
+# The subsequent lines give what code points go into the class defined by the
+# macro. Multiple characters may be specified via a string like "\x0D\x0A",
+# enclosed in quotes. Otherwise the lines consist of single Unicode code
+# point, prefaced by 0x; or a single range of Unicode code points separated by
+# a minus (and optional space); or a single Unicode property specified in the
+# standard Perl form "\p{...}".
#
-# This is no longer used, but retained in case it is needed some day. Put the
-# lines below under __DATA__
+# A blank line or one whose first non-blank character is '#' is a comment.
+# The definition of the macro is terminated by a line unlike those described.
+#
+# Valid types:
+# low generate a macro whose name is 'is_BASE_low' and defines a
+# class that includes only ASCII-range chars. (BASE is the
+# input macro base name.)
+# latin1 generate a macro whose name is 'is_BASE_latin1' and defines a
+# class that includes only upper-Latin1-range chars. It is not
+# designed to take a UTF-8 input parameter.
+# high generate a macro whose name is 'is_BASE_high' and defines a
+# class that includes all relevant code points that are above
+# the Latin1 range. This is for very specialized uses only.
+# It is designed to take only an input UTF-8 parameter.
+# utf8 generate a macro whose name is 'is_BASE_utf8' and defines a
+# class that includes all relevant characters that aren't ASCII.
+# It is designed to take only an input UTF-8 parameter.
+# LATIN1 generate a macro whose name is 'is_BASE_latin1' and defines a
+# class that includes both ASCII and upper-Latin1-range chars.
+# It is not designed to take a UTF-8 input parameter.
+# UTF8 generate a macro whose name is 'is_BASE_utf8' and defines a
+# class that can include any code point, adding the 'low' ones
+# to what 'utf8' works on. It is designed to take only an input
+# UTF-8 parameter.
+# generic generate a macro whose name is 'is_BASE". It has a 2nd,
+# boolean, parameter which indicates if the first one points to
+# a UTF-8 string or not. Thus it works in all circumstances.
+# cp generate a macro whose name is 'is_BASE_cp' and defines a
+# class that returns true if the UV parameter is a member of the
+# class; false if not.
+# A macro of the given type is generated for each type listed in the input.
+# The default return value is the number of octets read to generate the match.
+# Append "-cp" to the type to have it instead return the matched codepoint.
+# The macro name is changed to 'what_BASE...'. See pod for
+# caveats
+# Appending '-both" instead adds an extra parameter to the end of the argument
+# list, which is a pointer as to where to store the number of
+# bytes matched, while also returning the code point. The macro
+# name is changed to 'what_len_BASE...'. See pod for caveats
+#
+# Valid modifiers:
+# safe The input string is not necessarily valid UTF-8. In
+# particular an extra parameter (always the 2nd) to the macro is
+# required, which points to one beyond the end of the string.
+# The macro will make sure not to read off the end of the
+# string. In the case of non-UTF8, it makes sure that the
+# string has at least one byte in it. The macro name has
+# '_safe' appended to it.
+# fast The input string is valid UTF-8. No bounds checking is done,
+# and the macro can make assumptions that lead to faster
+# execution.
+# only_ascii_platform Skip this definition if this program is being run on
+# a non-ASCII platform.
+# only_ebcdic_platform Skip this definition if this program is being run on
+# a non-EBCDIC platform.
+# No modifier need be specified; fast is assumed for this case. If both
+# 'fast', and 'safe' are specified, two macros will be created for each
+# 'type'.
+#
+# If run on a non-ASCII platform will automatically convert the Unicode input
+# to native. The documentation above is slightly wrong in this case. 'low'
+# actually refers to code points whose UTF-8 representation is the same as the
+# non-UTF-8 version (invariants); and 'latin1' refers to all the rest of the
+# code points less than 256.
+
+1; # in the unlikely case we are being used as a module
+
+__DATA__
+# This is no longer used, but retained in case it is needed some day.
# TRICKYFOLD: Problematic fold case letters. When adding to this list, also should add them to regcomp.c and fold_grind.t
# => generic cp generic-cp generic-both :fast safe
# 0x00DF # LATIN SMALL LETTER SHARP S
@@ -694,48 +1108,75 @@ if ( !caller ) {
# 0x1FD3 # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA; maps same as 0390
# 0x1FE3 # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA; maps same as 03B0
-1; # in the unlikely case we are being used as a module
-
-__DATA__
LNBREAK: Line Break: \R
=> generic UTF8 LATIN1 :fast safe
"\x0D\x0A" # CRLF - Network (Windows) line ending
-0x0A # LF | LINE FEED
-0x0B # VT | VERTICAL TAB
-0x0C # FF | FORM FEED
-0x0D # CR | CARRIAGE RETURN
-0x85 # NEL | NEXT LINE
-0x2028 # LINE SEPARATOR
-0x2029 # PARAGRAPH SEPARATOR
+\p{VertSpace}
HORIZWS: Horizontal Whitespace: \h \H
=> generic UTF8 LATIN1 cp :fast safe
-0x09 # HT
-0x20 # SPACE
-0xa0 # NBSP
-0x1680 # OGHAM SPACE MARK
-0x180e # MONGOLIAN VOWEL SEPARATOR
-0x2000 # EN QUAD
-0x2001 # EM QUAD
-0x2002 # EN SPACE
-0x2003 # EM SPACE
-0x2004 # THREE-PER-EM SPACE
-0x2005 # FOUR-PER-EM SPACE
-0x2006 # SIX-PER-EM SPACE
-0x2007 # FIGURE SPACE
-0x2008 # PUNCTUATION SPACE
-0x2009 # THIN SPACE
-0x200A # HAIR SPACE
-0x202f # NARROW NO-BREAK SPACE
-0x205f # MEDIUM MATHEMATICAL SPACE
-0x3000 # IDEOGRAPHIC SPACE
+\p{HorizSpace}
VERTWS: Vertical Whitespace: \v \V
=> generic UTF8 LATIN1 cp :fast safe
-0x0A # LF
-0x0B # VT
-0x0C # FF
-0x0D # CR
-0x85 # NEL
-0x2028 # LINE SEPARATOR
-0x2029 # PARAGRAPH SEPARATOR
+\p{VertSpace}
+
+REPLACEMENT: Unicode REPLACEMENT CHARACTER
+=> UTF8 :safe
+0xFFFD
+
+NONCHAR: Non character code points
+=> UTF8 :fast
+\p{Nchar}
+
+SURROGATE: Surrogate characters
+=> UTF8 :fast
+\p{Gc=Cs}
+
+GCB_L: Grapheme_Cluster_Break=L
+=> UTF8 :fast
+\p{_X_GCB_L}
+
+GCB_LV_LVT_V: Grapheme_Cluster_Break=(LV or LVT or V)
+=> UTF8 :fast
+\p{_X_LV_LVT_V}
+
+GCB_Prepend: Grapheme_Cluster_Break=Prepend
+=> UTF8 :fast
+\p{_X_GCB_Prepend}
+
+GCB_RI: Grapheme_Cluster_Break=RI
+=> UTF8 :fast
+\p{_X_RI}
+
+GCB_SPECIAL_BEGIN: Grapheme_Cluster_Break=special_begins
+=> UTF8 :fast
+\p{_X_Special_Begin}
+
+GCB_T: Grapheme_Cluster_Break=T
+=> UTF8 :fast
+\p{_X_GCB_T}
+
+GCB_V: Grapheme_Cluster_Break=V
+=> UTF8 :fast
+\p{_X_GCB_V}
+
+# This program was run with this enabled, and the results copied to utf8.h;
+# then this was commented out because it takes so long to figure out these 2
+# million code points. The results would not change unless utf8.h decides it
+# wants a maximum other than 4 bytes, or this program creates better
+# optimizations
+#UTF8_CHAR: Matches utf8 from 1 to 4 bytes
+#=> UTF8 :safe only_ascii_platform
+#0x0 - 0x1FFFFF
+
+# This hasn't been commented out, because we haven't an EBCDIC platform to run
+# it on, and the 3 types of EBCDIC allegedly supported by Perl would have
+# different results
+UTF8_CHAR: Matches utf8 from 1 to 5 bytes
+=> UTF8 :safe only_ebcdic_platform
+0x0 - 0x3FFFFF:
+
+QUOTEMETA: Meta-characters that \Q should quote
+=> high :fast
+\p{_Perl_Quotemeta}
diff --git a/regen/unicode_constants.pl b/regen/unicode_constants.pl
new file mode 100644
index 0000000000..e3d588a599
--- /dev/null
+++ b/regen/unicode_constants.pl
@@ -0,0 +1,146 @@
+use v5.16.0;
+use strict;
+use warnings;
+require 'regen/regen_lib.pl';
+use charnames qw(:loose);
+
+my $out_fh = open_new('unicode_constants.h', '>',
+ {style => '*', by => $0,
+ from => "Unicode data"});
+
+print $out_fh <<END;
+
+#ifndef H_UNICODE_CONSTANTS /* Guard against nested #includes */
+#define H_UNICODE_CONSTANTS 1
+
+/* This file contains #defines for various Unicode code points. The values
+ * the macros expand to are the native Unicode code point, or all or portions
+ * of the UTF-8 encoding for the code point. In the former case, the macro
+ * name has the suffix "_NATIVE"; otherwise, the suffix "_UTF8".
+ *
+ * The macros that have the suffix "_UTF8" may have further suffixes, as
+ * follows:
+ * "_FIRST_BYTE" if the value is just the first byte of the UTF-8
+ * representation; the value will be a numeric constant.
+ * "_TAIL" if instead it represents all but the first byte. This, and
+ * with no additional suffix are both string constants */
+
+END
+
+# The data are at the end of this file. A blank line is output as-is.
+# Otherwise, each line represents one #define, and begins with either a
+# Unicode character name with the blanks in it squeezed out or replaced by
+# underscores; or it may be a hexadecimal Unicode code point. In the latter
+# case, the name will be looked-up to use as the name of the macro. In either
+# case, the macro name will have suffixes as listed above, and all blanks will
+# be replaced by underscores.
+#
+# Each line may optionally have one of the following flags on it, separated by
+# white space from the initial token.
+# string indicates that the output is to be of the string form
+# described in the comments above that are placed in the file.
+# first indicates that the output is to be of the FIRST_BYTE form.
+# tail indicates that the output is of the _TAIL form.
+# native indicates that the output is the code point, converted to the
+# platform's native character set if applicable
+#
+# If the code point has no official name, the desired name may be appended
+# after the flag, which will be ignored if there is an official name.
+#
+# This program is used to make it convenient to create compile time constants
+# of UTF-8, and to generate proper EBCDIC as well as ASCII without manually
+# having to figure things out.
+
+while ( <DATA> ) {
+ if ($_ !~ /\S/) {
+ print $out_fh "\n";
+ next;
+ }
+
+ chomp;
+ unless ($_ =~ m/ ^ ( [^\ ]* ) # Name or code point token
+ (?: [\ ]+ ( [^ ]* ) )? # optional flag
+ (?: [\ ]+ ( .* ) )? # name if unnamed; flag is required
+ /x)
+ {
+ die "Unexpected syntax at line $.: $_\n";
+ }
+
+ my $name_or_cp = $1;
+ my $flag = $2;
+ my $desired_name = $3;
+
+ my $name;
+ my $cp;
+
+ if ($name_or_cp =~ /[^[:xdigit:]]/) {
+
+ # Anything that isn't a hex value must be a name.
+ $name = $name_or_cp;
+ $cp = charnames::vianame($name =~ s/_/ /gr);
+ die "Unknown name '$name' at line $.: $_\n" unless defined $name;
+ }
+ else {
+ $cp = $name_or_cp;
+ $name = charnames::viacode("0$cp") // ""; # viacode requires a leading
+ # zero to be sure that the
+ # argument is hex
+ die "Unknown code point '$cp' at line $.: $_\n" unless defined $cp;
+ }
+
+ $name = $desired_name if $name eq "";
+ $name =~ s/ /_/g; # The macro name can have no blanks in it
+
+ my $str = join "", map { sprintf "\\x%02X", $_ }
+ unpack("U0C*", pack("U", hex $cp));
+
+ my $suffix = '_UTF8';
+ if (! defined $flag || $flag eq 'string') {
+ $str = "\"$str\""; # Will be a string constant
+ } elsif ($flag eq 'tail') {
+ $str =~ s/\\x..//; # Remove the first byte
+ $suffix .= '_TAIL';
+ $str = "\"$str\""; # Will be a string constant
+ }
+ elsif ($flag eq 'first') {
+ $str =~ s/ \\x ( .. ) .* /$1/x; # Get the two nibbles of the 1st byte
+ $suffix .= '_FIRST_BYTE';
+ $str = "0x$str"; # Is a numeric constant
+ }
+ elsif ($flag eq 'native') {
+ die "Are you sure you want to run this on an above-Latin1 code point?" if hex $cp > 0xff;
+ $suffix = '_NATIVE';
+ $str = utf8::unicode_to_native(hex $cp);
+ $str = "0x$cp"; # Is a numeric constant
+ }
+ else {
+ die "Unknown flag at line $.: $_\n";
+ }
+ print $out_fh "#define ${name}$suffix $str /* U+$cp */\n";
+}
+
+print $out_fh "\n#endif /* H_UNICODE_CONSTANTS */\n";
+
+read_only_bottom_close_and_rename($out_fh);
+
+__DATA__
+0300 string
+0301 string
+0308 string
+
+03B9 first
+03B9 tail
+
+03C5 first
+03C5 tail
+
+2010 string
+D800 first FIRST_SURROGATE
+
+007F native
+00DF native
+00E5 native
+00C5 native
+00FF native
+00B5 native
+0085 native
diff --git a/regen/utf8_strings.pl b/regen/utf8_strings.pl
deleted file mode 100644
index d6d4c76208..0000000000
--- a/regen/utf8_strings.pl
+++ /dev/null
@@ -1,108 +0,0 @@
-use v5.16.0;
-use strict;
-use warnings;
-require 'regen/regen_lib.pl';
-use charnames qw(:loose);
-
-my $out_fh = open_new('utf8_strings.h', '>',
- {style => '*', by => $0,
- from => "Unicode data"});
-
-print $out_fh <<END;
-/* This file contains #defines for various Unicode code points. The values
- * for the macros are all or portions of the UTF-8 encoding for the code
- * point. Note that the names all have the suffix "_UTF8".
- *
- * The suffix "_FIRST_BYTE" may be appended to the name if the value is just
- * the first byte of the UTF-8 representation; the value will be a numeric
- * constant.
- *
- * The suffix "_TAIL" is appened if instead it represents all but the first
- * byte. This, and with no suffix are both string constants */
-
-END
-
-# The data are at the end of this file. Each line represents one #define.
-# Each line begins with either a Unicode character name with the blanks in it
-# squeezed out or replaced by underscores; or it may be a hexadecimal code
-# point. In the latter case, the name will be looked-up to use as the name
-# of the macro. In either case, the macro name will have suffixes as
-# listed above, and all blanks will be replaced by underscores.
-#
-# Each line may optionally have one of the following flags on it, separated by
-# white space from the initial token.
-# first indicates that the output is to be of the FIRST_BYTE form
-# described in the comments above that are placed in the file.
-# tail indicates that the output is of the _TAIL form.
-#
-# This program is used to make it convenient to create compile time constants
-# of UTF-8, and to generate proper EBCDIC as well as ASCII without manually
-# having to figure things out.
-
-while ( <DATA> ) {
- chomp;
- unless ($_ =~ m/ ^ ( [^\ ]* ) # Name or code point token
- (?: [\ ]+ ( .* ) )? # optional flag
- /x)
- {
- die "Unexpected syntax at line $.: $_\n";
- }
-
- my $name_or_cp = $1;
- my $flag = $2;
-
- my $name;
- my $cp;
-
- if ($name_or_cp =~ /[^[:xdigit:]]/) {
-
- # Anything that isn't a hex value must be a name.
- $name = $name_or_cp;
- $cp = charnames::vianame($name =~ s/_/ /gr);
- die "Unknown name '$name' at line $.: $_\n" unless defined $name;
- }
- else {
- $cp = $name_or_cp;
- $name = charnames::viacode("0$cp"); # viacode requires a leading zero
- # to be sure that the argument is hex
- die "Unknown code point '$cp' at line $.: $_\n" unless defined $cp;
- }
-
- $name =~ s/ /_/g; # The macro name can have no blanks in it
-
- my $str = join "", map { sprintf "\\x%02X", $_ }
- unpack("U0C*", pack("U", hex $cp));
-
- my $suffix = '_UTF8';
- if (! defined $flag) {
- $str = "\"$str\""; # Will be a string constant
- } elsif ($flag eq 'tail') {
- $str =~ s/\\x..//; # Remove the first byte
- $suffix .= '_TAIL';
- $str = "\"$str\""; # Will be a string constant
- }
- elsif ($flag eq 'first') {
- $str =~ s/ \\x ( .. ) .* /$1/x; # Get the two nibbles of the 1st byte
- $suffix .= '_FIRST_BYTE';
- $str = "0x$str"; # Is a numeric constant
- }
- else {
- die "Unknown flag at line $.: $_\n";
- }
- print $out_fh "#define ${name}$suffix $str /* U+$cp */\n";
-}
-
-read_only_bottom_close_and_rename($out_fh);
-
-__DATA__
-0300
-0301
-0308
-03B9 tail
-03C5 tail
-03B9 first
-03C5 first
-1100
-1160
-11A8
-2010