diff options
Diffstat (limited to 'regen')
-rwxr-xr-x | regen/regcharclass.pl | 653 | ||||
-rw-r--r-- | regen/unicode_constants.pl | 146 | ||||
-rw-r--r-- | regen/utf8_strings.pl | 108 |
3 files changed, 693 insertions, 214 deletions
diff --git a/regen/regcharclass.pl b/regen/regcharclass.pl index faf1572b7c..7d126428ef 100755 --- a/regen/regcharclass.pl +++ b/regen/regcharclass.pl @@ -4,11 +4,12 @@ use 5.008; use warnings; use warnings FATAL => 'all'; use Text::Wrap qw(wrap); -use Encode; use Data::Dumper; $Data::Dumper::Useqq= 1; our $hex_fmt= "0x%02X"; +sub ASCII_PLATFORM { (ord('A') == 65) } + require 'regen/regen_lib.pl'; =head1 NAME @@ -23,36 +24,47 @@ CharClass::Matcher -- Generate C macros that match character classes efficiently Dynamically generates macros for detecting special charclasses in latin-1, utf8, and codepoint forms. Macros can be set to return -the length (in bytes) of the matched codepoint, or the codepoint itself. +the length (in bytes) of the matched codepoint, and/or the codepoint itself. -To regenerate regcharclass.h, run this script from perl-root. No arguments +To regenerate F<regcharclass.h>, run this script from perl-root. No arguments are necessary. -Using WHATEVER as an example the following macros will be produced: +Using WHATEVER as an example the following macros can be produced, depending +on the input parameters (how to get each is described by internal comments at +the C<__DATA__> line): =over 4 -=item is_WHATEVER(s,is_utf8) +=item C<is_WHATEVER(s,is_utf8)> -=item is_WHATEVER_safe(s,e,is_utf8) +=item C<is_WHATEVER_safe(s,e,is_utf8)> -Do a lookup as appropriate based on the is_utf8 flag. When possible -comparisons involving octect<128 are done before checking the is_utf8 +Do a lookup as appropriate based on the C<is_utf8> flag. When possible +comparisons involving octect<128 are done before checking the C<is_utf8> flag, hopefully saving time. -=item is_WHATEVER_utf8(s) +The version without the C<_safe> suffix should be used only when the input is +known to be well-formed. + +=item C<is_WHATEVER_utf8(s)> -=item is_WHATEVER_utf8_safe(s,e) +=item C<is_WHATEVER_utf8_safe(s,e)> Do a lookup assuming the string is encoded in (normalized) UTF8. -=item is_WHATEVER_latin1(s) +The version without the C<_safe> suffix should be used only when the input is +known to be well-formed. + +=item C<is_WHATEVER_latin1(s)> -=item is_WHATEVER_latin1_safe(s,e) +=item C<is_WHATEVER_latin1_safe(s,e)> Do a lookup assuming the string is encoded in latin-1 (aka plan octets). -=item is_WHATEVER_cp(cp) +The version without the C<_safe> suffix should be used only when it is known +that C<s> contains at least one character. + +=item C<is_WHATEVER_cp(cp)> Check to see if the string matches a given codepoint (hypothetically a U32). The condition is constructed as as to "break out" as early as @@ -65,11 +77,34 @@ IOW: Thus if the character is X+1 only two comparisons will be done. Making matching lookups slower, but non-matching faster. -=back +=item C<what_len_WHATEVER_FOO(arg1, ..., len)> + +A variant form of each of the macro types described above can be generated, in +which the code point is returned by the macro, and an extra parameter (in the +final position) is added, which is a pointer for the macro to set the byte +length of the returned code point. -Additionally it is possible to generate C<what_> variants that return -the codepoint read instead of the number of octets read, this can be -done by suffixing '-cp' to the type description. +These forms all have a C<what_len> prefix instead of the C<is_>, for example +C<what_len_WHATEVER_safe(s,e,is_utf8,len)> and +C<what_len_WHATEVER_utf8(s,len)>. + +These forms should not be used I<except> on small sets of mostly widely +separated code points; otherwise the code generated is inefficient. For these +cases, it is best to use the C<is_> forms, and then find the code point with +C<utf8_to_uvchr_buf>(). This program can fail with a "deep recursion" +message on the worst of the inappropriate sets. Examine the generated macro +to see if it is acceptable. + +=item C<what_WHATEVER_FOO(arg1, ...)> + +A variant form of each of the C<is_> macro types described above can be generated, in +which the code point and not the length is returned by the macro. These have +the same caveat as L</what_len_WHATEVER_FOO(arg1, ..., len)>, plus they should +not be used where the set contains a NULL, as 0 is returned for two different +cases: a) the set doesn't include the input code point; b) the set does +include it, and it is a NULL. + +=back =head2 CODE FORMAT @@ -78,7 +113,7 @@ perltidy -st -bt=1 -bbt=0 -pt=0 -sbt=1 -ce -nwls== "%f" =head1 AUTHOR -Author: Yves Orton (demerphq) 2007 +Author: Yves Orton (demerphq) 2007. Maintained by Perl5 Porters. =head1 BUGS @@ -107,14 +142,16 @@ License or the Artistic License, as specified in the README file. # represent the string in some given encoding with specific conditions. # # $cp - list of codepoints that make up the string. -# $n - list of octets that make up the string if all codepoints < 128 +# $n - list of octets that make up the string if all codepoints are invariant +# regardless of if the string is in UTF-8 or not. # $l - list of octets that make up the string in latin1 encoding if all -# codepoints < 256, and at least one codepoint is >127. -# $u - list of octets that make up the string in utf8 if any codepoint >127 +# codepoints < 256, and at least one codepoint is UTF-8 variant. +# $u - list of octets that make up the string in utf8 if any codepoint is +# UTF-8 variant # # High CP | Defined #-----------+---------- -# 0 - 127 : $n +# 0 - 127 : $n (127/128 are the values for ASCII platforms) # 128 - 255 : $l, $u # 256 - ... : $u # @@ -123,22 +160,33 @@ sub __uni_latin1 { my $str= shift; my $max= 0; my @cp; + my $only_has_invariants = 1; for my $ch ( split //, $str ) { my $cp= ord $ch; push @cp, $cp; $max= $cp if $max < $cp; + if (! ASCII_PLATFORM && $only_has_invariants) { + if ($cp > 255) { + $only_has_invariants = 0; + } + else { + my $temp = chr($cp); + utf8::upgrade($temp); + my @utf8 = unpack "U0C*", $temp; + $only_has_invariants = (@utf8 == 1 && $utf8[0] == $cp); + } + } } my ( $n, $l, $u ); - if ( $max < 128 ) { + $only_has_invariants = $max < 128 if ASCII_PLATFORM; + if ($only_has_invariants) { $n= [@cp]; } else { $l= [@cp] if $max && $max < 256; - my $copy= $str; # must copy string, FB_CROAK makes encode destructive - $u= eval { Encode::encode( "utf8", $copy, Encode::FB_CROAK ) }; - # $u is utf8 but with the utf8 flag OFF - # therefore "C*" gets us the values of the bytes involved. - $u= [ unpack "C*", $u ] if defined $u; + $u= $str; + utf8::upgrade($u); + $u= [ unpack "U0C*", $u ] if defined $u; } return ( \@cp, $n, $l, $u ); } @@ -224,7 +272,7 @@ sub __cond_join { # # Each string is then stored in the 'strs' subhash as a hash record # made up of the results of __uni_latin1, using the keynames -# 'low','latin1','utf8', as well as the synthesized 'LATIN1' and +# 'low','latin1','utf8', as well as the synthesized 'LATIN1', 'high', and # 'UTF8' which hold a merge of 'low' and their lowercase equivelents. # # Size data is tracked per type in the 'size' subhash. @@ -247,23 +295,62 @@ sub new { my $str= $txt; if ( $str =~ /^[""]/ ) { $str= eval $str; - } elsif ( $str =~ /^0x/ ) { + } elsif ($str =~ / - /x ) { # A range: Replace this element on the + # list with its expansion + my ($lower, $upper) = $str =~ / 0x (.+?) \s* - \s* 0x (.+) /x; + die "Format must be like '0xDEAD - 0xBEAF'; instead was '$str'" if ! defined $lower || ! defined $upper; + foreach my $cp (hex $lower .. hex $upper) { + push @{$opt{txt}}, sprintf "0x%X", $cp; + } + next; + } elsif ($str =~ s/ ^ N (?= 0x ) //x ) { + # Otherwise undocumented, a leading N means is already in the + # native character set; don't convert. $str= chr eval $str; - } elsif ( /\S/ ) { - die "Unparsable line: $txt\n"; - } else { + } elsif ( $str =~ /^0x/ ) { + $str= eval $str; + + # Convert from Unicode/ASCII to native, if necessary + $str = utf8::unicode_to_native($str) if ! ASCII_PLATFORM + && $str <= 0xFF; + $str = chr $str; + } elsif ( $str =~ / \s* \\p \{ ( .*? ) \} /x) { + my $property = $1; + use Unicode::UCD qw(prop_invlist); + + my @invlist = prop_invlist($property, '_perl_core_internal_ok'); + if (! @invlist) { + + # An empty return could mean an unknown property, or merely + # that it is empty. Call in scalar context to differentiate + my $count = prop_invlist($property, '_perl_core_internal_ok'); + die "$property not found" unless defined $count; + } + + # Replace this element on the list with the property's expansion + for (my $i = 0; $i < @invlist; $i += 2) { + foreach my $cp ($invlist[$i] .. $invlist[$i+1] - 1) { + + # prop_invlist() returns native values; add leading 'N' + # to indicate that. + push @{$opt{txt}}, sprintf "N0x%X", $cp; + } + } next; + } else { + die "Unparsable line: $txt\n"; } my ( $cp, $low, $latin1, $utf8 )= __uni_latin1( $str ); my $UTF8= $low || $utf8; my $LATIN1= $low || $latin1; + my $high = (scalar grep { $_ < 256 } @$cp) ? 0 : $utf8; #die Dumper($txt,$cp,$low,$latin1,$utf8) # if $txt=~/NEL/ or $utf8 and @$utf8>3; - @{ $self->{strs}{$str} }{qw( str txt low utf8 latin1 cp UTF8 LATIN1 )}= - ( $str, $txt, $low, $utf8, $latin1, $cp, $UTF8, $LATIN1 ); + @{ $self->{strs}{$str} }{qw( str txt low utf8 latin1 high cp UTF8 LATIN1 )}= + ( $str, $txt, $low, $utf8, $latin1, $high, $cp, $UTF8, $LATIN1 ); my $rec= $self->{strs}{$str}; - foreach my $key ( qw(low utf8 latin1 cp UTF8 LATIN1) ) { + foreach my $key ( qw(low utf8 latin1 high cp UTF8 LATIN1) ) { $self->{size}{$key}{ 0 + @{ $self->{strs}{$str}{$key} } }++ if $self->{strs}{$str}{$key}; } @@ -308,6 +395,22 @@ sub make_trie { return 0 + keys( %trie ) ? \%trie : undef; } +sub pop_count ($) { + my $word = shift; + + # This returns a list of the positions of the bits in the input word that + # are 1. + + my @positions; + my $position = 0; + while ($word) { + push @positions, $position if $word & 1; + $position++; + $word >>= 1; + } + return @positions; +} + # my $optree= _optree() # # recursively convert a trie to an optree where every node represents @@ -326,7 +429,7 @@ sub _optree { $depth= 0 unless defined $depth; my @conds= sort { $a <=> $b } grep { length $_ } keys %$trie; - if ( $trie->{''} ) { + if (exists $trie->{''} ) { if ( $ret_type eq 'cp' ) { $else= $self->{strs}{ $trie->{''} }{cp}[0]; $else= sprintf "$self->{val_fmt}", $else if $else > 9; @@ -454,19 +557,121 @@ sub length_optree { return $else; } +sub calculate_mask(@) { + my @list = @_; + my $list_count = @list; + + # Look at the input list of byte values. This routine sees if the set + # consisting of those bytes is exactly determinable by using a + # mask/compare operation. If not, it returns an empty list; if so, it + # returns a list consisting of (mask, compare). For example, consider a + # set consisting of the numbers 0xF0, 0xF1, 0xF2, and 0xF3. If we want to + # know if a number 'c' is in the set, we could write: + # 0xF0 <= c && c <= 0xF4 + # But the following mask/compare also works, and has just one test: + # c & 0xFC == 0xF0 + # The reason it works is that the set consists of exactly those numbers + # whose first 4 bits are 1, and the next two are 0. (The value of the + # other 2 bits is immaterial in determining if a number is in the set or + # not.) The mask masks out those 2 irrelevant bits, and the comparison + # makes sure that the result matches all bytes that which match those 6 + # material bits exactly. In other words, the set of numbers contains + # exactly those whose bottom two bit positions are either 0 or 1. The + # same principle applies to bit positions that are not necessarily + # adjacent. And it can be applied to bytes that differ in 1 through all 8 + # bit positions. In order to be a candidate for this optimization, the + # number of numbers in the test must be a power of 2. Based on this + # count, we know the number of bit positions that must differ. + my $bit_diff_count = 0; + my $compare = $list[0]; + if ($list_count == 2) { + $bit_diff_count = 1; + } + elsif ($list_count == 4) { + $bit_diff_count = 2; + } + elsif ($list_count == 8) { + $bit_diff_count = 3; + } + elsif ($list_count == 16) { + $bit_diff_count = 4; + } + elsif ($list_count == 32) { + $bit_diff_count = 5; + } + elsif ($list_count == 64) { + $bit_diff_count = 6; + } + elsif ($list_count == 128) { + $bit_diff_count = 7; + } + elsif ($list_count == 256) { + return (0, 0); + } + + # If the count wasn't a power of 2, we can't apply this optimization + return if ! $bit_diff_count; + + my %bit_map; + + # For each byte in the list, find the bit positions in it whose value + # differs from the first byte in the set. + for (my $i = 1; $i < @list; $i++) { + my @positions = pop_count($list[0] ^ $list[$i]); + + # If the number of differing bits is greater than those permitted by + # the set size, this optimization doesn't apply. + return if @positions > $bit_diff_count; + + # Save the bit positions that differ. + foreach my $bit (@positions) { + $bit_map{$bit} = 1; + } + + # If the total so far is greater than those permitted by the set size, + # this optimization doesn't apply. + return if keys %bit_map > $bit_diff_count; + + + # The value to compare against is the AND of all the members of the + # set. The bit positions that are the same in all will be correct in + # the AND, and the bit positions that differ will be 0. + $compare &= $list[$i]; + } + + # To get to here, we have gone through all bytes in the set, + # and determined that they all differ from each other in at most + # the number of bits allowed for the set's quantity. And since we have + # tested all 2**N possibilities, we know that the set includes no fewer + # elements than we need,, so the optimization applies. + die "panic: internal logic error" if keys %bit_map != $bit_diff_count; + + # The mask is the bit positions where things differ, complemented. + my $mask = 0; + foreach my $position (keys %bit_map) { + $mask |= 1 << $position; + } + $mask = ~$mask & 0xFF; + + return ($mask, $compare); +} + # _cond_as_str # turn a list of conditions into a text expression # - merges ranges of conditions, and joins the result with || sub _cond_as_str { - my ( $self, $op, $combine )= @_; + my ( $self, $op, $combine, $opts_ref )= @_; my $cond= $op->{vals}; my $test= $op->{test}; + my $is_cp_ret = $opts_ref->{ret_type} eq "cp"; return "( $test )" if !defined $cond; - # rangify the list + # rangify the list. my @ranges; my $Update= sub { - if ( @ranges ) { + # We skip this if there are optimizations that + # we can apply (below) to the individual ranges + if ( ($is_cp_ret || $combine) && @ranges && ref $ranges[-1]) { if ( $ranges[-1][0] == $ranges[-1][1] ) { $ranges[-1]= $ranges[-1][0]; } elsif ( $ranges[-1][0] + 1 == $ranges[-1][1] ) { @@ -475,25 +680,129 @@ sub _cond_as_str { } } }; - for my $cond ( @$cond ) { - if ( !@ranges || $cond != $ranges[-1][1] + 1 ) { + for my $condition ( @$cond ) { + if ( !@ranges || $condition != $ranges[-1][1] + 1 ) { $Update->(); - push @ranges, [ $cond, $cond ]; + push @ranges, [ $condition, $condition ]; } else { $ranges[-1][1]++; } } $Update->(); + return $self->_combine( $test, @ranges ) if $combine; - @ranges= map { - ref $_ - ? sprintf( - "( $self->{val_fmt} <= $test && $test <= $self->{val_fmt} )", - @$_ ) - : sprintf( "$self->{val_fmt} == $test", $_ ); - } @ranges; + + if ($is_cp_ret) { + @ranges= map { + ref $_ + ? sprintf( + "( $self->{val_fmt} <= $test && $test <= $self->{val_fmt} )", + @$_ ) + : sprintf( "$self->{val_fmt} == $test", $_ ); + } @ranges; + } + else { + # If the input set has certain characteristics, we can optimize tests + # for it. This doesn't apply if returning the code point, as we want + # each element of the set individually. The code above is for this + # simpler case. + + return 1 if @$cond == 256; # If all bytes match, is trivially true + + if (@ranges > 1) { + # See if the entire set shares optimizable characterstics, and if + # so, return the optimization. We delay checking for this on sets + # with just a single range, as there may be better optimizations + # available in that case. + my ($mask, $base) = calculate_mask(@$cond); + if (defined $mask && defined $base) { + return sprintf "( ( $test & $self->{val_fmt} ) == $self->{val_fmt} )", $mask, $base; + } + } + + # Here, there was no entire-class optimization. Look at each range. + for (my $i = 0; $i < @ranges; $i++) { + if (! ref $ranges[$i]) { # Trivial case: no range + $ranges[$i] = sprintf "$self->{val_fmt} == $test", $ranges[$i]; + } + elsif ($ranges[$i]->[0] == $ranges[$i]->[1]) { + $ranges[$i] = # Trivial case: single element range + sprintf "$self->{val_fmt} == $test", $ranges[$i]->[0]; + } + else { + my $output = ""; + + # Well-formed UTF-8 continuation bytes on ascii platforms must + # be in the range 0x80 .. 0xBF. If we know that the input is + # well-formed (indicated by not trying to be 'safe'), we can + # omit tests that verify that the input is within either of + # these bounds. (No legal UTF-8 character can begin with + # anything in this range, so we don't have to worry about this + # being a continuation byte or not.) + if (ASCII_PLATFORM + && ! $opts_ref->{safe} + && $opts_ref->{type} =~ / ^ (?: utf8 | high ) $ /xi) + { + my $lower_limit_is_80 = ($ranges[$i]->[0] == 0x80); + my $upper_limit_is_BF = ($ranges[$i]->[1] == 0xBF); + + # If the range is the entire legal range, it matches any + # legal byte, so we can omit both tests. (This should + # happen only if the number of ranges is 1.) + if ($lower_limit_is_80 && $upper_limit_is_BF) { + return 1; + } + elsif ($lower_limit_is_80) { # Just use the upper limit test + $output = sprintf("( $test <= $self->{val_fmt} )", + $ranges[$i]->[1]); + } + elsif ($upper_limit_is_BF) { # Just use the lower limit test + $output = sprintf("( $test >= $self->{val_fmt} )", + $ranges[$i]->[0]); + } + } + + # If we didn't change to omit a test above, see if the number + # of elements is a power of 2 (only a single bit in the + # representation of its count will be set) and if so, it may + # be that a mask/compare optimization is possible. + if ($output eq "" + && pop_count($ranges[$i]->[1] - $ranges[$i]->[0] + 1) == 1) + { + my @list; + push @list, $_ for ($ranges[$i]->[0] .. $ranges[$i]->[1]); + my ($mask, $base) = calculate_mask(@list); + if (defined $mask && defined $base) { + $output = sprintf "( $test & $self->{val_fmt} ) == $self->{val_fmt}", $mask, $base; + } + } + + if ($output ne "") { # Prefer any optimization + $ranges[$i] = $output; + } + elsif ($ranges[$i]->[0] + 1 == $ranges[$i]->[1]) { + # No optimization happened. We need a test that the code + # point is within both bounds. But, if the bounds are + # adjacent code points, it is cleaner to say + # 'first == test || second == test' + # than it is to say + # 'first <= test && test <= second' + $ranges[$i] = "( " + . join( " || ", ( map + { sprintf "$self->{val_fmt} == $test", $_ } + @{$ranges[$i]} ) ) + . " )"; + } + else { # Full bounds checking + $ranges[$i] = sprintf("( $self->{val_fmt} <= $test && $test <= $self->{val_fmt} )", $ranges[$i]->[0], $ranges[$i]->[1]); + } + } + } + } + return "( " . join( " || ", @ranges ) . " )"; + } # _combine @@ -524,13 +833,18 @@ sub _combine { # _render() # recursively convert an optree to text with reasonably neat formatting sub _render { - my ( $self, $op, $combine, $brace )= @_; + my ( $self, $op, $combine, $brace, $opts_ref )= @_; + return 0 if ! defined $op; # The set is empty if ( !ref $op ) { return $op; } - my $cond= $self->_cond_as_str( $op, $combine ); - my $yes= $self->_render( $op->{yes}, $combine, 1 ); - my $no= $self->_render( $op->{no}, $combine, 0 ); + my $cond= $self->_cond_as_str( $op, $combine, $opts_ref ); + #no warnings 'recursion'; # This would allow really really inefficient + # code to be generated. See pod + my $yes= $self->_render( $op->{yes}, $combine, 1, $opts_ref ); + return $yes if $cond eq '1'; + + my $no= $self->_render( $op->{no}, $combine, 0, $opts_ref ); return "( $cond )" if $yes eq '1' and $no eq '0'; my ( $lb, $rb )= $brace ? ( "( ", " )" ) : ( "", "" ); return "$lb$cond ? $yes : $no$rb" @@ -555,8 +869,8 @@ sub _render { # longer lists such as that resulting from type 'cp' output. # Currently only used for type 'cp' macros. sub render { - my ( $self, $op, $combine )= @_; - my $str= "( " . $self->_render( $op, $combine ) . " )"; + my ( $self, $op, $combine, $opts_ref )= @_; + my $str= "( " . $self->_render( $op, $combine, 0, $opts_ref ) . " )"; return __clean( $str ); } @@ -564,7 +878,7 @@ sub render { # make a macro of a given type. # calls into make_trie and (generic_|length_)optree as needed # Opts are: -# type : 'cp','generic','low','latin1','utf8','LATIN1','UTF8' +# type : 'cp','generic','high','low','latin1','utf8','LATIN1','UTF8' # ret_type : 'cp' or 'len' # safe : add length guards to macro # @@ -595,7 +909,7 @@ sub make_macro { $method= 'optree'; } my $optree= $self->$method( %opts, type => $type, ret_type => $ret_type ); - my $text= $self->render( $optree, $type eq 'cp' ); + my $text= $self->render( $optree, $type eq 'cp', \%opts ); my @args= $type eq 'cp' ? 'cp' : 's'; push @args, "e" if $opts{safe}; push @args, "is_utf8" if $type eq 'generic'; @@ -626,21 +940,38 @@ if ( !caller ) { print $out_fh read_only_top( lang => 'C', by => $0, file => 'regcharclass.h', style => '*', copyright => [2007, 2011] ); + print $out_fh "\n#ifndef H_REGCHARCLASS /* Guard against nested #includes */\n#define H_REGCHARCLASS 1\n\n"; - my ( $op, $title, @txt, @types, @mods ); + my ( $op, $title, @txt, @types, %mods ); my $doit= sub { return unless $op; + + # Skip if to compile on a different platform. + return if delete $mods{only_ascii_platform} && ! ASCII_PLATFORM; + return if delete $mods{only_ebcdic_platform} && ord 'A' != 193; + print $out_fh "/*\n\t$op: $title\n\n"; print $out_fh join "\n", ( map { "\t$_" } @txt ), "*/", ""; my $obj= __PACKAGE__->new( op => $op, title => $title, txt => \@txt ); - #die Dumper(\@types,\@mods); + #die Dumper(\@types,\%mods); + + my @mods; + push @mods, 'safe' if delete $mods{safe}; + unshift @mods, 'fast' if delete $mods{fast} || ! @mods; # Default to 'fast' + # do this one + # first, as + # traditional + if (%mods) { + die "Unknown modifiers: ", join ", ", map { "'$_'" } keys %mods; + } foreach my $type_spec ( @types ) { my ( $type, $ret )= split /-/, $type_spec; $ret ||= 'len'; foreach my $mod ( @mods ) { next if $mod eq 'safe' and $type eq 'cp'; + delete $mods{$mod}; my $macro= $obj->make_macro( type => $type, ret_type => $ret, @@ -652,22 +983,26 @@ if ( !caller ) { }; while ( <DATA> ) { - s/^\s*#//; + s/^ \s* (?: \# .* ) ? $ //x; # squeeze out comment and blanks next unless /\S/; chomp; if ( /^([A-Z]+)/ ) { - $doit->(); + $doit->(); # This starts a new definition; do the previous one ( $op, $title )= split /\s*:\s*/, $_, 2; @txt= (); } elsif ( s/^=>// ) { my ( $type, $modifier )= split /:/, $_; @types= split ' ', $type; - @mods= split ' ', $modifier; + undef %mods; + map { $mods{$_} = 1 } split ' ', $modifier; } else { push @txt, "$_"; } } $doit->(); + + print $out_fh "\n#endif /* H_REGCHARCLASS */\n"; + if($path eq '-') { print $out_fh "/* ex: set ro: */\n"; } else { @@ -675,16 +1010,95 @@ if ( !caller ) { } } +# The form of the input is a series of definitions to make macros for. +# The first line gives the base name of the macro, followed by a colon, and +# then text to be used in comments associated with the macro that are its +# title or description. In all cases the first (perhaps only) parameter to +# the macro is a pointer to the first byte of the code point it is to test to +# see if it is in the class determined by the macro. In the case of non-UTF8, +# the code point consists only of a single byte. # -# Valid types: generic, LATIN1, UTF8, low, latin1, utf8 -# default return value is octects read. -# append -cp to make it codepoint matched. -# modifiers come after the colon, valid possibilities -# being 'fast' and 'safe'. +# The second line must begin with a '=>' and be followed by the types of +# macro(s) to be generated; these are specified below. A colon follows the +# types, followed by the modifiers, also specified below. At least one +# modifier is required. # +# The subsequent lines give what code points go into the class defined by the +# macro. Multiple characters may be specified via a string like "\x0D\x0A", +# enclosed in quotes. Otherwise the lines consist of single Unicode code +# point, prefaced by 0x; or a single range of Unicode code points separated by +# a minus (and optional space); or a single Unicode property specified in the +# standard Perl form "\p{...}". # -# This is no longer used, but retained in case it is needed some day. Put the -# lines below under __DATA__ +# A blank line or one whose first non-blank character is '#' is a comment. +# The definition of the macro is terminated by a line unlike those described. +# +# Valid types: +# low generate a macro whose name is 'is_BASE_low' and defines a +# class that includes only ASCII-range chars. (BASE is the +# input macro base name.) +# latin1 generate a macro whose name is 'is_BASE_latin1' and defines a +# class that includes only upper-Latin1-range chars. It is not +# designed to take a UTF-8 input parameter. +# high generate a macro whose name is 'is_BASE_high' and defines a +# class that includes all relevant code points that are above +# the Latin1 range. This is for very specialized uses only. +# It is designed to take only an input UTF-8 parameter. +# utf8 generate a macro whose name is 'is_BASE_utf8' and defines a +# class that includes all relevant characters that aren't ASCII. +# It is designed to take only an input UTF-8 parameter. +# LATIN1 generate a macro whose name is 'is_BASE_latin1' and defines a +# class that includes both ASCII and upper-Latin1-range chars. +# It is not designed to take a UTF-8 input parameter. +# UTF8 generate a macro whose name is 'is_BASE_utf8' and defines a +# class that can include any code point, adding the 'low' ones +# to what 'utf8' works on. It is designed to take only an input +# UTF-8 parameter. +# generic generate a macro whose name is 'is_BASE". It has a 2nd, +# boolean, parameter which indicates if the first one points to +# a UTF-8 string or not. Thus it works in all circumstances. +# cp generate a macro whose name is 'is_BASE_cp' and defines a +# class that returns true if the UV parameter is a member of the +# class; false if not. +# A macro of the given type is generated for each type listed in the input. +# The default return value is the number of octets read to generate the match. +# Append "-cp" to the type to have it instead return the matched codepoint. +# The macro name is changed to 'what_BASE...'. See pod for +# caveats +# Appending '-both" instead adds an extra parameter to the end of the argument +# list, which is a pointer as to where to store the number of +# bytes matched, while also returning the code point. The macro +# name is changed to 'what_len_BASE...'. See pod for caveats +# +# Valid modifiers: +# safe The input string is not necessarily valid UTF-8. In +# particular an extra parameter (always the 2nd) to the macro is +# required, which points to one beyond the end of the string. +# The macro will make sure not to read off the end of the +# string. In the case of non-UTF8, it makes sure that the +# string has at least one byte in it. The macro name has +# '_safe' appended to it. +# fast The input string is valid UTF-8. No bounds checking is done, +# and the macro can make assumptions that lead to faster +# execution. +# only_ascii_platform Skip this definition if this program is being run on +# a non-ASCII platform. +# only_ebcdic_platform Skip this definition if this program is being run on +# a non-EBCDIC platform. +# No modifier need be specified; fast is assumed for this case. If both +# 'fast', and 'safe' are specified, two macros will be created for each +# 'type'. +# +# If run on a non-ASCII platform will automatically convert the Unicode input +# to native. The documentation above is slightly wrong in this case. 'low' +# actually refers to code points whose UTF-8 representation is the same as the +# non-UTF-8 version (invariants); and 'latin1' refers to all the rest of the +# code points less than 256. + +1; # in the unlikely case we are being used as a module + +__DATA__ +# This is no longer used, but retained in case it is needed some day. # TRICKYFOLD: Problematic fold case letters. When adding to this list, also should add them to regcomp.c and fold_grind.t # => generic cp generic-cp generic-both :fast safe # 0x00DF # LATIN SMALL LETTER SHARP S @@ -694,48 +1108,75 @@ if ( !caller ) { # 0x1FD3 # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA; maps same as 0390 # 0x1FE3 # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA; maps same as 03B0 -1; # in the unlikely case we are being used as a module - -__DATA__ LNBREAK: Line Break: \R => generic UTF8 LATIN1 :fast safe "\x0D\x0A" # CRLF - Network (Windows) line ending -0x0A # LF | LINE FEED -0x0B # VT | VERTICAL TAB -0x0C # FF | FORM FEED -0x0D # CR | CARRIAGE RETURN -0x85 # NEL | NEXT LINE -0x2028 # LINE SEPARATOR -0x2029 # PARAGRAPH SEPARATOR +\p{VertSpace} HORIZWS: Horizontal Whitespace: \h \H => generic UTF8 LATIN1 cp :fast safe -0x09 # HT -0x20 # SPACE -0xa0 # NBSP -0x1680 # OGHAM SPACE MARK -0x180e # MONGOLIAN VOWEL SEPARATOR -0x2000 # EN QUAD -0x2001 # EM QUAD -0x2002 # EN SPACE -0x2003 # EM SPACE -0x2004 # THREE-PER-EM SPACE -0x2005 # FOUR-PER-EM SPACE -0x2006 # SIX-PER-EM SPACE -0x2007 # FIGURE SPACE -0x2008 # PUNCTUATION SPACE -0x2009 # THIN SPACE -0x200A # HAIR SPACE -0x202f # NARROW NO-BREAK SPACE -0x205f # MEDIUM MATHEMATICAL SPACE -0x3000 # IDEOGRAPHIC SPACE +\p{HorizSpace} VERTWS: Vertical Whitespace: \v \V => generic UTF8 LATIN1 cp :fast safe -0x0A # LF -0x0B # VT -0x0C # FF -0x0D # CR -0x85 # NEL -0x2028 # LINE SEPARATOR -0x2029 # PARAGRAPH SEPARATOR +\p{VertSpace} + +REPLACEMENT: Unicode REPLACEMENT CHARACTER +=> UTF8 :safe +0xFFFD + +NONCHAR: Non character code points +=> UTF8 :fast +\p{Nchar} + +SURROGATE: Surrogate characters +=> UTF8 :fast +\p{Gc=Cs} + +GCB_L: Grapheme_Cluster_Break=L +=> UTF8 :fast +\p{_X_GCB_L} + +GCB_LV_LVT_V: Grapheme_Cluster_Break=(LV or LVT or V) +=> UTF8 :fast +\p{_X_LV_LVT_V} + +GCB_Prepend: Grapheme_Cluster_Break=Prepend +=> UTF8 :fast +\p{_X_GCB_Prepend} + +GCB_RI: Grapheme_Cluster_Break=RI +=> UTF8 :fast +\p{_X_RI} + +GCB_SPECIAL_BEGIN: Grapheme_Cluster_Break=special_begins +=> UTF8 :fast +\p{_X_Special_Begin} + +GCB_T: Grapheme_Cluster_Break=T +=> UTF8 :fast +\p{_X_GCB_T} + +GCB_V: Grapheme_Cluster_Break=V +=> UTF8 :fast +\p{_X_GCB_V} + +# This program was run with this enabled, and the results copied to utf8.h; +# then this was commented out because it takes so long to figure out these 2 +# million code points. The results would not change unless utf8.h decides it +# wants a maximum other than 4 bytes, or this program creates better +# optimizations +#UTF8_CHAR: Matches utf8 from 1 to 4 bytes +#=> UTF8 :safe only_ascii_platform +#0x0 - 0x1FFFFF + +# This hasn't been commented out, because we haven't an EBCDIC platform to run +# it on, and the 3 types of EBCDIC allegedly supported by Perl would have +# different results +UTF8_CHAR: Matches utf8 from 1 to 5 bytes +=> UTF8 :safe only_ebcdic_platform +0x0 - 0x3FFFFF: + +QUOTEMETA: Meta-characters that \Q should quote +=> high :fast +\p{_Perl_Quotemeta} diff --git a/regen/unicode_constants.pl b/regen/unicode_constants.pl new file mode 100644 index 0000000000..e3d588a599 --- /dev/null +++ b/regen/unicode_constants.pl @@ -0,0 +1,146 @@ +use v5.16.0; +use strict; +use warnings; +require 'regen/regen_lib.pl'; +use charnames qw(:loose); + +my $out_fh = open_new('unicode_constants.h', '>', + {style => '*', by => $0, + from => "Unicode data"}); + +print $out_fh <<END; + +#ifndef H_UNICODE_CONSTANTS /* Guard against nested #includes */ +#define H_UNICODE_CONSTANTS 1 + +/* This file contains #defines for various Unicode code points. The values + * the macros expand to are the native Unicode code point, or all or portions + * of the UTF-8 encoding for the code point. In the former case, the macro + * name has the suffix "_NATIVE"; otherwise, the suffix "_UTF8". + * + * The macros that have the suffix "_UTF8" may have further suffixes, as + * follows: + * "_FIRST_BYTE" if the value is just the first byte of the UTF-8 + * representation; the value will be a numeric constant. + * "_TAIL" if instead it represents all but the first byte. This, and + * with no additional suffix are both string constants */ + +END + +# The data are at the end of this file. A blank line is output as-is. +# Otherwise, each line represents one #define, and begins with either a +# Unicode character name with the blanks in it squeezed out or replaced by +# underscores; or it may be a hexadecimal Unicode code point. In the latter +# case, the name will be looked-up to use as the name of the macro. In either +# case, the macro name will have suffixes as listed above, and all blanks will +# be replaced by underscores. +# +# Each line may optionally have one of the following flags on it, separated by +# white space from the initial token. +# string indicates that the output is to be of the string form +# described in the comments above that are placed in the file. +# first indicates that the output is to be of the FIRST_BYTE form. +# tail indicates that the output is of the _TAIL form. +# native indicates that the output is the code point, converted to the +# platform's native character set if applicable +# +# If the code point has no official name, the desired name may be appended +# after the flag, which will be ignored if there is an official name. +# +# This program is used to make it convenient to create compile time constants +# of UTF-8, and to generate proper EBCDIC as well as ASCII without manually +# having to figure things out. + +while ( <DATA> ) { + if ($_ !~ /\S/) { + print $out_fh "\n"; + next; + } + + chomp; + unless ($_ =~ m/ ^ ( [^\ ]* ) # Name or code point token + (?: [\ ]+ ( [^ ]* ) )? # optional flag + (?: [\ ]+ ( .* ) )? # name if unnamed; flag is required + /x) + { + die "Unexpected syntax at line $.: $_\n"; + } + + my $name_or_cp = $1; + my $flag = $2; + my $desired_name = $3; + + my $name; + my $cp; + + if ($name_or_cp =~ /[^[:xdigit:]]/) { + + # Anything that isn't a hex value must be a name. + $name = $name_or_cp; + $cp = charnames::vianame($name =~ s/_/ /gr); + die "Unknown name '$name' at line $.: $_\n" unless defined $name; + } + else { + $cp = $name_or_cp; + $name = charnames::viacode("0$cp") // ""; # viacode requires a leading + # zero to be sure that the + # argument is hex + die "Unknown code point '$cp' at line $.: $_\n" unless defined $cp; + } + + $name = $desired_name if $name eq ""; + $name =~ s/ /_/g; # The macro name can have no blanks in it + + my $str = join "", map { sprintf "\\x%02X", $_ } + unpack("U0C*", pack("U", hex $cp)); + + my $suffix = '_UTF8'; + if (! defined $flag || $flag eq 'string') { + $str = "\"$str\""; # Will be a string constant + } elsif ($flag eq 'tail') { + $str =~ s/\\x..//; # Remove the first byte + $suffix .= '_TAIL'; + $str = "\"$str\""; # Will be a string constant + } + elsif ($flag eq 'first') { + $str =~ s/ \\x ( .. ) .* /$1/x; # Get the two nibbles of the 1st byte + $suffix .= '_FIRST_BYTE'; + $str = "0x$str"; # Is a numeric constant + } + elsif ($flag eq 'native') { + die "Are you sure you want to run this on an above-Latin1 code point?" if hex $cp > 0xff; + $suffix = '_NATIVE'; + $str = utf8::unicode_to_native(hex $cp); + $str = "0x$cp"; # Is a numeric constant + } + else { + die "Unknown flag at line $.: $_\n"; + } + print $out_fh "#define ${name}$suffix $str /* U+$cp */\n"; +} + +print $out_fh "\n#endif /* H_UNICODE_CONSTANTS */\n"; + +read_only_bottom_close_and_rename($out_fh); + +__DATA__ +0300 string +0301 string +0308 string + +03B9 first +03B9 tail + +03C5 first +03C5 tail + +2010 string +D800 first FIRST_SURROGATE + +007F native +00DF native +00E5 native +00C5 native +00FF native +00B5 native +0085 native diff --git a/regen/utf8_strings.pl b/regen/utf8_strings.pl deleted file mode 100644 index d6d4c76208..0000000000 --- a/regen/utf8_strings.pl +++ /dev/null @@ -1,108 +0,0 @@ -use v5.16.0; -use strict; -use warnings; -require 'regen/regen_lib.pl'; -use charnames qw(:loose); - -my $out_fh = open_new('utf8_strings.h', '>', - {style => '*', by => $0, - from => "Unicode data"}); - -print $out_fh <<END; -/* This file contains #defines for various Unicode code points. The values - * for the macros are all or portions of the UTF-8 encoding for the code - * point. Note that the names all have the suffix "_UTF8". - * - * The suffix "_FIRST_BYTE" may be appended to the name if the value is just - * the first byte of the UTF-8 representation; the value will be a numeric - * constant. - * - * The suffix "_TAIL" is appened if instead it represents all but the first - * byte. This, and with no suffix are both string constants */ - -END - -# The data are at the end of this file. Each line represents one #define. -# Each line begins with either a Unicode character name with the blanks in it -# squeezed out or replaced by underscores; or it may be a hexadecimal code -# point. In the latter case, the name will be looked-up to use as the name -# of the macro. In either case, the macro name will have suffixes as -# listed above, and all blanks will be replaced by underscores. -# -# Each line may optionally have one of the following flags on it, separated by -# white space from the initial token. -# first indicates that the output is to be of the FIRST_BYTE form -# described in the comments above that are placed in the file. -# tail indicates that the output is of the _TAIL form. -# -# This program is used to make it convenient to create compile time constants -# of UTF-8, and to generate proper EBCDIC as well as ASCII without manually -# having to figure things out. - -while ( <DATA> ) { - chomp; - unless ($_ =~ m/ ^ ( [^\ ]* ) # Name or code point token - (?: [\ ]+ ( .* ) )? # optional flag - /x) - { - die "Unexpected syntax at line $.: $_\n"; - } - - my $name_or_cp = $1; - my $flag = $2; - - my $name; - my $cp; - - if ($name_or_cp =~ /[^[:xdigit:]]/) { - - # Anything that isn't a hex value must be a name. - $name = $name_or_cp; - $cp = charnames::vianame($name =~ s/_/ /gr); - die "Unknown name '$name' at line $.: $_\n" unless defined $name; - } - else { - $cp = $name_or_cp; - $name = charnames::viacode("0$cp"); # viacode requires a leading zero - # to be sure that the argument is hex - die "Unknown code point '$cp' at line $.: $_\n" unless defined $cp; - } - - $name =~ s/ /_/g; # The macro name can have no blanks in it - - my $str = join "", map { sprintf "\\x%02X", $_ } - unpack("U0C*", pack("U", hex $cp)); - - my $suffix = '_UTF8'; - if (! defined $flag) { - $str = "\"$str\""; # Will be a string constant - } elsif ($flag eq 'tail') { - $str =~ s/\\x..//; # Remove the first byte - $suffix .= '_TAIL'; - $str = "\"$str\""; # Will be a string constant - } - elsif ($flag eq 'first') { - $str =~ s/ \\x ( .. ) .* /$1/x; # Get the two nibbles of the 1st byte - $suffix .= '_FIRST_BYTE'; - $str = "0x$str"; # Is a numeric constant - } - else { - die "Unknown flag at line $.: $_\n"; - } - print $out_fh "#define ${name}$suffix $str /* U+$cp */\n"; -} - -read_only_bottom_close_and_rename($out_fh); - -__DATA__ -0300 -0301 -0308 -03B9 tail -03C5 tail -03B9 first -03C5 first -1100 -1160 -11A8 -2010 |