diff options
author | Rafael Garcia-Suarez <rgarciasuarez@gmail.com> | 2007-04-23 12:25:10 +0000 |
---|---|---|
committer | Rafael Garcia-Suarez <rgarciasuarez@gmail.com> | 2007-04-23 12:25:10 +0000 |
commit | 12b72891435c5403bcc37a2760e6b00923c804ba (patch) | |
tree | f0ba2b87143762dfc0df5f65bd9be99fc7834e37 | |
parent | ef773544769fa28b527d925f5e04db46de7530bf (diff) | |
download | perl-12b72891435c5403bcc37a2760e6b00923c804ba.tar.gz |
Add Yves Orton's script to regenerate regcharclass.h
p4raw-id: //depot/perl@31030
-rw-r--r-- | MANIFEST | 11 | ||||
-rw-r--r-- | Porting/regcharclass.pl | 526 | ||||
-rw-r--r-- | regcharclass.h | 170 |
3 files changed, 649 insertions, 58 deletions
@@ -1565,7 +1565,7 @@ lib/CPAN/bin/cpan easily interact with CPAN from the command line lib/CPAN/Debug.pm helper package for CPAN.pm lib/CPAN/FirstTime.pm Utility for creating CPAN config files lib/CPAN/HandleConfig.pm helper package for CPAN.pm -lib/CPAN/Kwalify/distroprefs.dd helper file for validating config files +lib/CPAN/Kwalify/distroprefs.dd helper file for validating config files lib/CPAN/Kwalify/distroprefs.yml helper file for validating config files lib/CPAN/Kwalify.pm helper package for CPAN.pm lib/CPAN/Nox.pm Runs CPAN while avoiding compiled extensions @@ -1674,7 +1674,7 @@ lib/DBM_Filter/t/int32.t test DBM_Filter::int32 lib/DBM_Filter/t/null.t test DBM_Filter::null lib/DBM_Filter/t/utf8.t test DBM_Filter::utf8 lib/DBM_Filter/utf8.pm DBM Filter for UTF-8 Encoding -lib/dbm_filter_util.pl Utility functions used by DBM Filter tests +lib/dbm_filter_util.pl Utility functions used by DBM Filter tests lib/DB.pm Debugger API (draft) lib/DB.t See if DB works lib/Devel/InnerPackage.pm Find inner packages @@ -2333,14 +2333,14 @@ lib/Pod/Simple.pm Pod made simple lib/Pod/Simple.pod Pod for Pod::Simple lib/Pod/Simple/Progress.pm Pod::Simple::Progress lib/Pod/Simple/PullParserEndToken.pm end-tokens from Pod::Simple::PullParser -lib/Pod/Simple/PullParser.pm a pull-parser interface to parsing Pod +lib/Pod/Simple/PullParser.pm a pull-parser interface to parsing Pod lib/Pod/Simple/PullParserStartToken.pm start-tokens from Pod::Simple::PullParser lib/Pod/Simple/PullParserTextToken.pm text-tokens from Pod::Simple::PullParser lib/Pod/Simple/PullParserToken.pm tokens from Pod::Simple::PullParser lib/Pod/Simple/README Pod::Simple README file lib/Pod/Simple/RTF.pm format Pod as RTF lib/Pod/Simple/Search.pm find POD documents in directory trees -lib/Pod/Simple/SimpleTree.pm parse Pod into a simple parse tree +lib/Pod/Simple/SimpleTree.pm parse Pod into a simple parse tree lib/Pod/Simple/Subclassing.pod write a formatter as a Pod::Simple subclass lib/Pod/Simple/t/00about.t Pod::Simple test file lib/Pod/Simple/t/20_skip.t Pod::Simple test file @@ -3219,6 +3219,7 @@ Porting/patching.pod How to report changes made to Perl Porting/patchls Flexible patch file listing utility Porting/pumpkin.pod Guidelines and hints for Perl maintainers Porting/repository.pod How to use the Perl repository +Porting/regcharclass.pl Generate regcharclass.h from inline data Porting/sort_perldiag.pl Keep our diagnostics orderly Porting/testall.atom Cumulative profile with Third Degree Porting/thirdclean Cleanup Third Degree reports @@ -3284,7 +3285,7 @@ regcomp.c Regular expression compiler regcomp.h Private declarations for above regcomp.pl Builder of regnodes.h regcomp.sym Data for regnodes.h -regcharclass.h Match various character classes efficiently +regcharclass.h Generated by Porting/regcharclass.pl regen_lib.pl Common file routines for generator scripts regen_perly.pl generate perly.{act,h,tab} from perly.y regen.pl Run all scripts that (re)generate files diff --git a/Porting/regcharclass.pl b/Porting/regcharclass.pl new file mode 100644 index 0000000000..5202fc3f9e --- /dev/null +++ b/Porting/regcharclass.pl @@ -0,0 +1,526 @@ +package UTF8::Matcher; +use strict; +use warnings; +use Text::Wrap qw(wrap); +use Encode; +use Data::Dumper; + +# Author: Yves Orton (demerphq) 2007. + +=pod + +Dynamically generates macros for detecting special charclasses +in both latin-1, utf8, and codepoint forms. + +To regenerate regcharclass.h, run this script from perl-root. No arguments +are necessary. + +Each charclass handler is constructed as follows: +Each string the charclass must match is rendered as unicode (codepoints>255), +and if possible as latin1 (codepoints>127), and if possible as "neutral" +(all codepoints<128). + +The rendered strings are then inserted into digit-tries by type and length. +With shorter strings being added to tries that are allowed to contain longer +strings, but not vice versa. Thus the "longest" trie contains all strings +for that charclass. + +The following types of trie are generated: + + n - Neutral only. All strings in this type have codepoints<128 + l - Latin1 only. All strings in this type have a codepoint>127 in them + u - UTF8 only. All strings in this type have a codepoint>255 in them + L - Latin1. All strings in 'n' and 'l' + U - UTF8. All string in 'n' and 'u' + c - Codepoint. All strings in U but in codepoint and not utf8 form. + +The ternary() routine is responsible for converting the trie data into a +ternary conditional that matches the required set of strings. The generated +macro normally takes at least the argument 's' which is expected to be a +pointer of type C<char *> or C<U8 *>. The condition generated will be +optimised to match the string as efficiently as possible, with range lookups +being used where possible, and in some situations relying on "true" to be 1. + +ternary() takes two optional arguments, $type which is one of the above +characters and $ext which is used to add an extra extension to the macro name. + +If $type is omitted or false then the generated macro will take an additional +argument, 'is_utf8'. + +If $ext has the string 'safe' in it then the generated macro will take an extra +argument 'e' for the end of the string, and all lookups will be length checked +to prevent lookups past e. If 'safe' is not used then the lookup is assumed to +be guaranteed safe, and no 'e' argument is provided and no length checks are +made during execution. + +The 'c' type is different as compared to the rest. Instead of producing +a condition that does octet comparisons of a string array, the 'c' type +produces a macro that takes a single codepoint as an argument (instead of a +char* or U8*) and does the lookup based on only that char, thus it cannot be +used to match multi-codepoint sequences like "\r\n" in the LNBREAK charclass. +This is primarily used for populating charclass bitmaps for codepoints 0..255 +but will also match codepoints in the unicode range if necessary. + +Using LNBREAK as an example the following macros will be produced: + +=over 4 + +=item is_LNBREAK(s,is_utf8) + +=item is_LNBREAK_safe(s,e,is_utf8) + +Do a lookup as apporpriate based on the is_utf8 flag. When possible +comparisons involving octect<128 are done before checking the is_utf8 +flag, hopefully saving time. + +=item is_LNBREAK_utf8(s) + +=item is_LNBREAK_utf8_safe(s,e) + +Do a lookup assuming the string is encoded in (normalized) UTF8. + +=item is_LNBREAK_latin1(s) + +=item is_LNBREAK_latin1_safe(s,e) + +Do a lookup assuming the string is encoded in latin-1 (aka plan octets). + +=item is_LNBREAK_cp(cp) + +Check to see if the string matches a given codepoint (hypotethically a +U32). The condition is constructed as as to "break out" as early as +possible if the codepoint is out of range of the condition. + +IOW: + + (cp==X || (cp>X && (cp==Y || (cp>Y && ...)))) + +Thus if the character is X+1 only two comparisons will be done. Making +matching lookups slower, but non-matching faster. + +=back + +=cut + +# store a list of numbers into a hash based trie. +sub _trie_store { + my $root= shift; + foreach my $b ( @_ ) { + $root->{$b} ||= {}; + $root= $root->{$b}; + } + $root->{''}++; +} + +# Convert a string into its neutral, latin1, utf8 forms, where +# the form is undefined unless the string can be completely represented +# in that form. The string is then decomposed into the octects representing +# it. A list is returned for each. Additional a list of codepoints making +# up the string. +# returns (\@n,\@u,\@l,\@cp) +# +sub _uni_latin1 { + my $str= shift; + my $u= eval { Encode::encode( "utf8", "$str", Encode::FB_CROAK ) }; + my $l= eval { Encode::encode( "iso-8859-1", "$str", Encode::FB_CROAK ) }; + my $n= $l; + undef $n if defined( $n ) && $str =~ /[^\x00-\x7F]/; + return ((map { $_ ? [ unpack "U0C*", $_ ] : $_ } ( $n, $u, $l )), + [map { ord $_ } split //,$str]); +} + +# store an array ref of char data into the appropriate +# type bins, tracking sizes as we go. +sub _store { + my ( $self, $r, @k )= @_; + for my $z ( @k ) { + $self->{size}{$z}{ 0 + @$r }++; + push @{ $self->{data}{$z} }, $r; + } +} + +# construct a new charclass constructor object. +# $title ends up in the code a as a comment. +# $opcode is the name of the operation the charclass implements. +# the rest of the arguments are strings that the charclass +# can match. +sub new { + my $class= shift; + my $title= shift; + my $opcode= shift; + my $self= bless { op => $opcode, title => $title }, $class; + my %seen; + # convert the strings to the numeric equivelents and store + # them for later insertion while tracking their sizes. + foreach my $seq ( @_ ) { + next if $seen{$seq}++; + push @{$self->{seq}},$seq; + my ( $n, $u, $l,$cp )= _uni_latin1( $seq ); + if ( $n ) { + _store( $self, $n, qw(n U L) ); + } else { + if ( $l ) { + _store( $self, $l, qw(l L) ); + } + _store( $self, $u, qw(u U) ); + } + _store($self,$cp,'c'); + } + # + # now construct the tries. For each type of data we insert + # the data into all the tries of length $size and smaller. + # + + my %allsize; + foreach my $k ( keys %{ $self->{data} } ) { + my @size= sort { $b <=> $a } keys %{ $self->{size}{$k} }; + $self->{size}{$k}=\@size; + undef @allsize{@size}; + foreach my $d ( @{ $self->{data}{$k} } ) { + foreach my $sz ( @size ) { + last if $sz < @$d; + $self->{trie}{$k}{$sz} ||= {}; + _trie_store( $self->{trie}{$k}{$sz}, @$d ); + } + } + #delete $self->{data}{$k}; + } + my @size= sort { $b <=> $a } keys %allsize; + $self->{size}{''}= \@size; + return $self; +} + +# +# _cond([$v1,$v2,$v2...],$ofs) +# +# converts an array of codepoints into a conditional expression +# consequtive codepoints are merged into a range test +# returns a string containing the conditional expression in the form +# '( li[x]==v || li[x]==y )' When possible we also use range lookups. + +sub _cond { + my ( $c, $ofs,$fmt )= @_; + $fmt||='((U8*)s)[%d]'; + # cheapo rangification routine. + # Convert the first element into a singleton represented + # as [$x,$x] and then merge the rest in as we go. + my @v= sort { $a <=> $b } @$c; + my @r= ( [ ( shift @v ) x 2 ] ); + for my $n ( @v ) { + if ( $n == $r[-1][1] + 1 ) { + $r[-1][1]++; + } else { + push @r, [ $n, $n ]; + } + } + @r = map { $_->[0]==$_->[1]-1 ? ([$_->[0],$_->[0]],[$_->[1],$_->[1]]) : $_} @r; + # sort the ranges by size and order. + @r= sort { $a->[0] <=> $b->[0] } @r; + my $alu= sprintf $fmt,$ofs; # C array look up + + if ($fmt=~/%d/) { + # map the ranges into conditions + @r= map { + # singleton + $_->[0] == $_->[1] ? "$alu==$_->[0]" : + # range + "($_->[0]<=$alu && $alu<=$_->[1])" + } @r; + # return the joined results. + return '( ' . join( " || ", @r ) . ' )'; + } else { + return combine($alu,@r); + } +} + +# +# Do the condition in such a way that we break out early if the value +# we are looking at is in between two elements in the list. +# Currently used only for codepoint macros (depth 1) +# +sub combine { + my $alu=shift; + local $_ = shift; + my $txt= $_->[0] == $_->[1] + ? "$alu==$_->[0]" + : "($_->[0]<=$alu && $alu<=$_->[1])"; + return $txt unless @_; + return "( $txt || ( $alu > $_->[1] && \n".combine($alu,@_)." ) )"; +} + +# recursively convert a trie to an optree represented by +# [condition,yes,no] where yes and no can be a ref to another optree +# or a scalar representing code. +# called by make_optree + +sub _trie_to_optree { + my ( $node, $ofs, $else, $fmt )= @_; + return $else unless $node; + $ofs ||= 0; + if ( $node->{''} ) { + $else= $ofs; + } else { + $else ||= 0; + } + my @k= sort { $b->[1] cmp $a->[1] || $a->[0] <=> $b->[0] } + map { [ $_, Dumper( $node->{$_} ), $node->{$_} ] } + grep length, keys %$node; + + return $ofs if !@k; + + my ( $root, $expr ); + while ( @k ) { + my @cond= ( $k[0][0] ); + my $d= $k[0][1]; + my $r= $k[0][2]; + shift @k; + while ( @k && $k[0][1] eq $d ) { + push @cond, $k[0][0]; + shift @k; + } + my $op= + [ _cond( \@cond, $ofs, $fmt ), _trie_to_optree( $r, $ofs + 1, $else, $fmt ) ]; + if ( !$root ) { + $root= $expr= $op; + } else { + push @$expr, $op; + $expr= $op; + } + } + push @$expr, $else; + return $root; +} + +# construct the optree for a type. +# handles the special logic of type ''. +sub make_optree { + my ( $self, $type, $size, $fmt )= @_; + my $else= 0; + $size||=$self->{size}{$type}[0]; + $size=1 if $type eq 'c'; + if ( !$type ) { + my ( $u, $l ); + for ( my $sz= $size ; !$u && $sz > 0 ; $sz-- ) { + $u= _trie_to_optree( $self->{trie}{u}{$sz}, 0, 0, $fmt ); + } + for ( my $sz= $size ; !$l && $sz > 0 ; $sz-- ) { + $l= _trie_to_optree( $self->{trie}{l}{$sz}, 0, 0, $fmt ); + } + if ( $u ) { + $else= [ '(is_utf8)', $u, $l || 0 ]; + } elsif ( $l ) { + $else= [ '(!is_utf8)', $l, 0 ]; + } + $type= 'n'; + $size-- while !$self->{trie}{n}{$size}; + } + return _trie_to_optree( $self->{trie}{$type}{$size}, 0, $else, $fmt ); +} + +# construct the optree for a type with length checks to prevent buffer +# overruns. Only one length check is performed per lookup trading code +# size for speed. +sub length_optree { + my ( $self, $type,$fmt )= @_; + $type ||= ''; + return $self->{len_op}{$type} if $self->{len_op}{$type}; + my @size = @{$self->{size}{$type}}; + + my ( $root, $expr ); + foreach my $size ( @size ) { + my $op= [ + "( (e) - (s) > " . ( $size - 1 ) . " )", + $self->make_optree( $type, $size ), + ]; + if ( !$root ) { + $root= $expr= $op; + } else { + push @$expr, $op; + $expr= $op; + } + } + push @$expr, 0; + return $self->{len_op}{$type}= $root ? $root : $expr->[0]; +} + +# +# recursively walk an optree and covert it to a huge nested ternary expression. +# +sub _optree_to_ternary { + my ( $node )= @_; + return $node + if !ref $node; + my $depth = 0; + if ( $node->[0] =~ /\[(\d+)\]/ ) { + $depth= $1 + 1; + } + return sprintf "\n%s( %s ? %s : %s )", " " x $depth, $node->[0], + _optree_to_ternary( $node->[1] ), _optree_to_ternary( $node->[2] ); +} + +# add \\ to the end of strings in a reasonable neat way. +sub _macro($) { + my $str= shift; + my @lines= split /[^\S\n]*\n/, $str; + return join( "\\\n", map { sprintf "%-76s", $_ } @lines ) . "\n\n"; +} + +# default type extensions. 'uln' dont have one because normally +# they are used only as part of type '' which doesnt get an extension +my %ext= ( + U => '_utf8', + L => '_latin1', + c => '_cp', + +); + +# produce the ternary, handling arguments and putting on the macro headers +# and boiler plate +sub ternary { + my ( $self, $type, $ext )= @_; + $type ||= ''; + $ext = ($ext{$type} || '') . ($ext||""); + my ($root,$fmt,$arg); + if ($type eq 'c') { + $arg= $fmt= 'cp'; + } else { + $arg= 's'; + } + if ( $type eq 'c' || $ext !~ /safe/) { + $root= $self->make_optree( $type, 0, $fmt ); + } else { + $root= $self->length_optree( $type, $fmt ); + } + + our $parens; + $parens= qr/ \( (?: (?> [^()]+? ) | (??{$parens}) )+? \) /x; + my $expr= qr/ + \( \s* + ($parens) + \s* \? \s* + \( \s* + ($parens) + \s* \? \s* + (\d+|$parens) + \s* : \s* + (\d+|$parens) + \s* \) + \s* : \s* + \4 + \s* \) + /x; + my $code= _optree_to_ternary( $root ); + for ( $code ) { + s/^\s*//; + 1 while s/\(\s*($parens)\s*\?\s*1\s*:\s*0\s*\)/$1/g + || s<$expr><(($1 && $2) ? $3 : $4)>g + || s<\(\s*($parens)\s*\)><$1>g; + } + my @args=($arg); + push @args,'e' if $ext=~/safe/; + push @args,'is_utf8' if !$type; + my $args=join ",",@args; + return "/*** GENERATED CODE ***/\n" + . _macro "#define is_$self->{op}$ext($args)\n$code"; +} + +my $path=shift @ARGV; +if (!$path) { + $path= "regcharclass.h"; + if (!-e $path) { $path="../$path" } + if (!-e $path) { die "Can't find regcharclass.h to update!\n" }; +} + +rename $path,"$path.bak"; +open my $out_fh,">",$path + or die "Can't write to '$path':$!"; +binmode $out_fh; # want unix line endings even when run on win32. +my ($zero)=$0=~/([^\\\/]+)$/; +print $out_fh <<"HEADER"; +/*********************** WARNING WARNING WARNING ************************ + +Do not modify this code directly: This file was autogenerated by + + Porting/$zero + +from data contained within the script. Change the script instead. + +Generated at: @{[ scalar gmtime ]} GMT + +************************ WARNING WARNING WARNING ************************/ + +HEADER + +my ($op,$title,@strs,@txt); +my $doit= sub { + return unless $op; + my $o= __PACKAGE__->new($title,$op,@strs); + print $out_fh "/*\n\t$o->{op}: $o->{title}\n\n"; + print $out_fh join "\n",@txt,"*/",""; + for ('', 'U', 'L') { + print $out_fh $o->ternary( $_ ); + print $out_fh $o->ternary( $_,'_safe' ); + } + print $out_fh $o->ternary( 'c' ); +}; +while (<DATA>) { + next unless /\S/; + chomp; + if (/^([A-Z]+)/) { + $doit->(); + ($op,$title)=split /\s*:\s*/,$_,2; + @txt=@strs=(); + } else { + push @txt, "\t$_"; + s/#.*$//; + if (/^0x/) { + push @strs,map { chr $_ } eval $_; + } elsif (/^[""'']/) { + push @strs,eval $_; + } + } +} +$doit->(); +print "$path has been updated\n"; + +__DATA__ +LNBREAK: Line Break: \R +"\x0D\x0A" # CRLF - Network (Windows) line ending +0x0A # LF | LINE FEED +0x0B # VT | VERTICAL TAB +0x0C # FF | FORM FEED +0x0D # CR | CARRIAGE RETURN +0x85 # NEL | NEXT LINE +0x2028 # LINE SEPARATOR +0x2029 # PARAGRAPH SEPARATOR + +HORIZWS: Horizontal Whitespace: \h \H +0x09 # HT +0x20 # SPACE +0xa0 # NBSP +0x1680 # OGHAM SPACE MARK +0x180e # MONGOLIAN VOWEL SEPARATOR +0x2000 # EN QUAD +0x2001 # EM QUAD +0x2002 # EN SPACE +0x2003 # EM SPACE +0x2004 # THREE-PER-EM SPACE +0x2005 # FOUR-PER-EM SPACE +0x2006 # SIX-PER-EM SPACE +0x2007 # FIGURE SPACE +0x2008 # PUNCTUATION SPACE +0x2009 # THIN SPACE +0x200A # HAIR SPACE +0x202f # NARROW NO-BREAK SPACE +0x205f # MEDIUM MATHEMATICAL SPACE +0x3000 # IDEOGRAPHIC SPACE + +VERTWS: Vertical Whitespace: \v \V +0x0A # LF +0x0B # VT +0x0C # FF +0x0D # CR +0x85 # NEL +0x2028 # LINE SEPARATOR +0x2029 # PARAGRAPH SEPARATOR + diff --git a/regcharclass.h b/regcharclass.h index 3fc92d7bce..08b9eaeca3 100644 --- a/regcharclass.h +++ b/regcharclass.h @@ -1,13 +1,29 @@ -/****** WARNING WARNING WARNING ********/ -/* */ -/* Autogenerated code, do not modify! */ -/* */ -/****** WARNING WARNING WARNING ********/ - -/* LNBREAK Line Break: \j \J - Codepoints: 0x0A, 0x0B, 0x0C, 0x0D, 0x0D.0x0A, 0x85, 0x2028, 0x2029 - */ -#define is_LNBREAK(s,is_utf8) /*** Line Break: \j \J ***/ \ +/*********************** WARNING WARNING WARNING ************************ + +Do not modify this code directly: This file was autogenerated by + + Porting/regcharclass.pl + +from data contained within the script. Change the script instead. + +Generated at: Mon Apr 23 11:33:52 2007 GMT + +************************ WARNING WARNING WARNING ************************/ + +/* + LNBREAK: Line Break: \R + + "\x0D\x0A" # CRLF - Network (Windows) line ending + 0x0A # LF | LINE FEED + 0x0B # VT | VERTICAL TAB + 0x0C # FF | FORM FEED + 0x0D # CR | CARRIAGE RETURN + 0x85 # NEL | NEXT LINE + 0x2028 # LINE SEPARATOR + 0x2029 # PARAGRAPH SEPARATOR +*/ +/*** GENERATED CODE ***/ +#define is_LNBREAK(s,is_utf8) \ ( ( ((U8*)s)[0]==13 ) ? \ ( ( ((U8*)s)[1]==10 ) ? 2 : 1 ) : \ ( (10<=((U8*)s)[0] && ((U8*)s)[0]<=12) ? 1 : \ @@ -17,7 +33,8 @@ (((( ((U8*)s)[0]==226 ) && ( ((U8*)s)[1]==128 )) && ( ((U8*)s)[2]==168 || ((U8*)s)[2]==169 )) ? 3 : 0) ) :\ ( ((U8*)s)[0]==133 ) ) ) ) -#define is_LNBREAK_safe(s,e,is_utf8) /*** Line Break: \j \J ***/ \ +/*** GENERATED CODE ***/ +#define is_LNBREAK_safe(s,e,is_utf8) \ ( ( (e) - (s) > 2 ) ? \ ( ( ((U8*)s)[0]==13 ) ? \ ( ( ((U8*)s)[1]==10 ) ? 2 : 1 ) : \ @@ -39,7 +56,8 @@ ( (!is_utf8) ? \ ( ((U8*)s)[0]==133 ) : 0 ) ) : 0 ) ) ) -#define is_LNBREAK_utf8(s) /*** Line Break: \j \J ***/ \ +/*** GENERATED CODE ***/ +#define is_LNBREAK_utf8(s) \ ( ( ((U8*)s)[0]==194 ) ? \ ( ( ((U8*)s)[1]==133 ) ? 2 : 0 ) : \ ( ( ((U8*)s)[0]==226 ) ? \ @@ -48,7 +66,8 @@ ( ( ((U8*)s)[1]==10 ) ? 2 : 1 ) : \ (10<=((U8*)s)[0] && ((U8*)s)[0]<=12) ) ) ) -#define is_LNBREAK_utf8_safe(s,e) /*** Line Break: \j \J ***/ \ +/*** GENERATED CODE ***/ +#define is_LNBREAK_utf8_safe(s,e) \ ( ( (e) - (s) > 2 ) ? \ ( ( ((U8*)s)[0]==194 ) ? \ ( ( ((U8*)s)[1]==133 ) ? 2 : 0 ) : \ @@ -66,12 +85,14 @@ ( ( (e) - (s) > 0 ) ? \ (10<=((U8*)s)[0] && ((U8*)s)[0]<=13) : 0 ) ) ) -#define is_LNBREAK_latin1(s) /*** Line Break: \j \J ***/ \ +/*** GENERATED CODE ***/ +#define is_LNBREAK_latin1(s) \ ( ( ((U8*)s)[0]==13 ) ? \ ( ( ((U8*)s)[1]==10 ) ? 2 : 1 ) : \ ( (10<=((U8*)s)[0] && ((U8*)s)[0]<=12) || ((U8*)s)[0]==133 ) ) -#define is_LNBREAK_latin1_safe(s,e) /*** Line Break: \j \J ***/ \ +/*** GENERATED CODE ***/ +#define is_LNBREAK_latin1_safe(s,e) \ ( ( (e) - (s) > 1 ) ? \ ( ( ((U8*)s)[0]==13 ) ? \ ( ( ((U8*)s)[1]==10 ) ? 2 : 1 ) : \ @@ -79,21 +100,38 @@ ( ( (e) - (s) > 0 ) ? \ ( (10<=((U8*)s)[0] && ((U8*)s)[0]<=13) || ((U8*)s)[0]==133 ) : 0 ) ) -#define is_LNBREAK_cp(cp) /*** Line Break: \j \J ***/ \ -( (10<=cp && cp<=13) || ( cp > 13 && ( cp==133 || ( cp > 133 && ( cp==8232 || ( cp > 8232 && cp==8233 ) ) ) ) ) ) - -/****** WARNING WARNING WARNING ********/ -/* */ -/* Autogenerated code, do not modify! */ -/* */ -/****** WARNING WARNING WARNING ********/ - -/* HORIZWS Horizontal Whitespace: \h \H - Codepoints: 0x09, 0x20, 0xA0, 0x1680, 0x180E, 0x2000, 0x2001, 0x2002, - 0x2003, 0x2004, 0x2005, 0x2006, 0x2007, 0x2008, 0x2009, 0x200A, 0x202F, - 0x205F, 0x3000 - */ -#define is_HORIZWS(s,is_utf8) /*** Horizontal Whitespace: \h \H ***/ \ +/*** GENERATED CODE ***/ +#define is_LNBREAK_cp(cp) \ +( (10<=cp && cp<=13) || ( cp > 13 && \ +( cp==133 || ( cp > 133 && \ +( cp==8232 || ( cp > 8232 && \ +cp==8233 ) ) ) ) ) ) + +/* + HORIZWS: Horizontal Whitespace: \h \H + + 0x09 # HT + 0x20 # SPACE + 0xa0 # NBSP + 0x1680 # OGHAM SPACE MARK + 0x180e # MONGOLIAN VOWEL SEPARATOR + 0x2000 # EN QUAD + 0x2001 # EM QUAD + 0x2002 # EN SPACE + 0x2003 # EM SPACE + 0x2004 # THREE-PER-EM SPACE + 0x2005 # FOUR-PER-EM SPACE + 0x2006 # SIX-PER-EM SPACE + 0x2007 # FIGURE SPACE + 0x2008 # PUNCTUATION SPACE + 0x2009 # THIN SPACE + 0x200A # HAIR SPACE + 0x202f # NARROW NO-BREAK SPACE + 0x205f # MEDIUM MATHEMATICAL SPACE + 0x3000 # IDEOGRAPHIC SPACE +*/ +/*** GENERATED CODE ***/ +#define is_HORIZWS(s,is_utf8) \ ( ( ((U8*)s)[0]==9 || ((U8*)s)[0]==32 ) ? 1 : \ ( (is_utf8) ? \ ( ( ((U8*)s)[0]==194 ) ? \ @@ -109,7 +147,8 @@ (((( ((U8*)s)[0]==227 ) && ( ((U8*)s)[1]==128 )) && ( ((U8*)s)[2]==128 )) ? 3 : 0) ) ) ) :\ ( ((U8*)s)[0]==160 ) ) ) -#define is_HORIZWS_safe(s,e,is_utf8) /*** Horizontal Whitespace: \h \H ***/ \ +/*** GENERATED CODE ***/ +#define is_HORIZWS_safe(s,e,is_utf8) \ ( ( (e) - (s) > 2 ) ? \ ( ( ((U8*)s)[0]==9 || ((U8*)s)[0]==32 ) ? 1 : \ ( (is_utf8) ? \ @@ -135,7 +174,8 @@ ( (!is_utf8) ? \ ( ((U8*)s)[0]==160 ) : 0 ) ) : 0 ) ) ) -#define is_HORIZWS_utf8(s) /*** Horizontal Whitespace: \h \H ***/ \ +/*** GENERATED CODE ***/ +#define is_HORIZWS_utf8(s) \ ( ( ((U8*)s)[0]==194 ) ? \ ( ( ((U8*)s)[1]==160 ) ? 2 : 0 ) : \ ( ( ((U8*)s)[0]==225 ) ? \ @@ -150,7 +190,8 @@ ((( ((U8*)s)[1]==128 ) && ( ((U8*)s)[2]==128 )) ? 3 : 0) : \ ( ((U8*)s)[0]==9 || ((U8*)s)[0]==32 ) ) ) ) ) -#define is_HORIZWS_utf8_safe(s,e) /*** Horizontal Whitespace: \h \H ***/ \ +/*** GENERATED CODE ***/ +#define is_HORIZWS_utf8_safe(s,e) \ ( ( (e) - (s) > 2 ) ? \ ( ( ((U8*)s)[0]==194 ) ? \ ( ( ((U8*)s)[1]==160 ) ? 2 : 0 ) : \ @@ -172,26 +213,40 @@ ( ( (e) - (s) > 0 ) ? \ ( ((U8*)s)[0]==9 || ((U8*)s)[0]==32 ) : 0 ) ) ) -#define is_HORIZWS_latin1(s) /*** Horizontal Whitespace: \h \H ***/ \ +/*** GENERATED CODE ***/ +#define is_HORIZWS_latin1(s) \ ( ((U8*)s)[0]==9 || ((U8*)s)[0]==32 || ((U8*)s)[0]==160 ) -#define is_HORIZWS_latin1_safe(s,e) /*** Horizontal Whitespace: \h \H ***/ \ +/*** GENERATED CODE ***/ +#define is_HORIZWS_latin1_safe(s,e) \ ( ( (e) - (s) > 0 ) ? \ ( ((U8*)s)[0]==9 || ((U8*)s)[0]==32 || ((U8*)s)[0]==160 ) : 0 ) -#define is_HORIZWS_cp(cp) /*** Horizontal Whitespace: \h \H ***/ \ -( cp==9 || ( cp > 9 && ( cp==32 || ( cp > 32 && ( cp==160 || ( cp > 160 && ( cp==5760 || ( cp > 5760 && ( cp==6158 || ( cp > 6158 && ( (8192<=cp && cp<=8202) || ( cp > 8202 && ( cp==8239 || ( cp > 8239 && ( cp==8287 || ( cp > 8287 && cp==12288 ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) +/*** GENERATED CODE ***/ +#define is_HORIZWS_cp(cp) \ +( cp==9 || ( cp > 9 && \ +( cp==32 || ( cp > 32 && \ +( cp==160 || ( cp > 160 && \ +( cp==5760 || ( cp > 5760 && \ +( cp==6158 || ( cp > 6158 && \ +( (8192<=cp && cp<=8202) || ( cp > 8202 && \ +( cp==8239 || ( cp > 8239 && \ +( cp==8287 || ( cp > 8287 && \ +cp==12288 ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) -/****** WARNING WARNING WARNING ********/ -/* */ -/* Autogenerated code, do not modify! */ -/* */ -/****** WARNING WARNING WARNING ********/ +/* + VERTWS: Vertical Whitespace: \v \V -/* VERTWS Vertical Whitespace: \v \V - Codepoints: 0x0A, 0x0B, 0x0C, 0x0D, 0x85, 0x2028, 0x2029 - */ -#define is_VERTWS(s,is_utf8) /*** Vertical Whitespace: \v \V ***/ \ + 0x0A # LF + 0x0B # VT + 0x0C # FF + 0x0D # CR + 0x85 # NEL + 0x2028 # LINE SEPARATOR + 0x2029 # PARAGRAPH SEPARATOR +*/ +/*** GENERATED CODE ***/ +#define is_VERTWS(s,is_utf8) \ ( (10<=((U8*)s)[0] && ((U8*)s)[0]<=13) ? 1 : \ ( (is_utf8) ? \ ( ( ((U8*)s)[0]==194 ) ? \ @@ -199,7 +254,8 @@ (((( ((U8*)s)[0]==226 ) && ( ((U8*)s)[1]==128 )) && ( ((U8*)s)[2]==168 || ((U8*)s)[2]==169 )) ? 3 : 0) ) :\ ( ((U8*)s)[0]==133 ) ) ) -#define is_VERTWS_safe(s,e,is_utf8) /*** Vertical Whitespace: \v \V ***/ \ +/*** GENERATED CODE ***/ +#define is_VERTWS_safe(s,e,is_utf8) \ ( ( (e) - (s) > 2 ) ? \ ( (10<=((U8*)s)[0] && ((U8*)s)[0]<=13) ? 1 : \ ( (is_utf8) ? \ @@ -217,14 +273,16 @@ ( (!is_utf8) ? \ ( ((U8*)s)[0]==133 ) : 0 ) ) : 0 ) ) ) -#define is_VERTWS_utf8(s) /*** Vertical Whitespace: \v \V ***/ \ +/*** GENERATED CODE ***/ +#define is_VERTWS_utf8(s) \ ( ( ((U8*)s)[0]==194 ) ? \ ( ( ((U8*)s)[1]==133 ) ? 2 : 0 ) : \ ( ( ((U8*)s)[0]==226 ) ? \ ((( ((U8*)s)[1]==128 ) && ( ((U8*)s)[2]==168 || ((U8*)s)[2]==169 )) ? 3 : 0) :\ (10<=((U8*)s)[0] && ((U8*)s)[0]<=13) ) ) -#define is_VERTWS_utf8_safe(s,e) /*** Vertical Whitespace: \v \V ***/ \ +/*** GENERATED CODE ***/ +#define is_VERTWS_utf8_safe(s,e) \ ( ( (e) - (s) > 2 ) ? \ ( ( ((U8*)s)[0]==194 ) ? \ ( ( ((U8*)s)[1]==133 ) ? 2 : 0 ) : \ @@ -238,13 +296,19 @@ ( ( (e) - (s) > 0 ) ? \ (10<=((U8*)s)[0] && ((U8*)s)[0]<=13) : 0 ) ) ) -#define is_VERTWS_latin1(s) /*** Vertical Whitespace: \v \V ***/ \ +/*** GENERATED CODE ***/ +#define is_VERTWS_latin1(s) \ ( (10<=((U8*)s)[0] && ((U8*)s)[0]<=13) || ((U8*)s)[0]==133 ) -#define is_VERTWS_latin1_safe(s,e) /*** Vertical Whitespace: \v \V ***/ \ +/*** GENERATED CODE ***/ +#define is_VERTWS_latin1_safe(s,e) \ ( ( (e) - (s) > 0 ) ? \ ( (10<=((U8*)s)[0] && ((U8*)s)[0]<=13) || ((U8*)s)[0]==133 ) : 0 ) -#define is_VERTWS_cp(cp) /*** Vertical Whitespace: \v \V ***/ \ -( (10<=cp && cp<=13) || ( cp > 13 && ( cp==133 || ( cp > 133 && ( cp==8232 || ( cp > 8232 && cp==8233 ) ) ) ) ) ) +/*** GENERATED CODE ***/ +#define is_VERTWS_cp(cp) \ +( (10<=cp && cp<=13) || ( cp > 13 && \ +( cp==133 || ( cp > 133 && \ +( cp==8232 || ( cp > 8232 && \ +cp==8233 ) ) ) ) ) ) |