#!/usr/bin/perl -w # # # Regenerate (overwriting only if changed): # # pod/perldebguts.pod # regnodes.h # # from information stored in # # regcomp.sym # op_reg_common.h # regexp.h # # pod/perldebguts.pod is not completely regenerated. Only the table of # regexp nodes is replaced; other parts remain unchanged. # # Accepts the standard regen_lib -q and -v args. # # This script is normally invoked from regen.pl. # # F defines the opcodes and states used in the regex # engine, it also includes documentation on the opcodes. This script # parses those definitions out and turns them into typedefs, defines, # and data structures, and maybe even code which the regex engine can # use to operate. # # F and op_reg_common.h contain defines C and # C that are used in flags in our code. These defines are # parsed out and data structures are created to allow the debug mode of # the regex engine to show things such as which flags were set during # compilation. In some cases we transform the C code in the header files # into perl code which we execute to C the contents. For instance # in a situation like this: # # #define RXf_X 0x1 /* the X mode */ # #define RXf_Y 0x2 /* the Y mode */ # #define RXf_Z (X|Y) /* the Z mode */ # # this script might end up eval()ing something like C<0x1> and then # C<0x2> and then C<(0x1|0x2)> the results of which it then might use in # constructing a data structure, or pod in perldebguts, or a comment in # C. It also would separate out the "X", "Y", and "Z" and # use them, and would also use the data in the line comment if present. # # If you compile a regex under perl -Mre=Debug,ALL you can see much # of the content that this file generates and parses out of its input # files. BEGIN { # Get function prototypes require './regen/regen_lib.pl'; require './regen/HeaderParser.pm'; } use strict; # NOTE I don't think anyone actually knows what all of these properties mean, # and I suspect some of them are outright unused. This is a first attempt to # clean up the generation so maybe one day we can move to something more self # documenting. (One might argue that an array of hashes of properties would # be easier to use.) # # Why we use the term regnode and nodes, and not say, opcodes, I am not sure. # General thoughts: # 1. We use a single continuum to represent both opcodes and states, # and in regexec.c we switch on the combined set. # 2. Opcodes have more information associated to them, states are simpler, # basically just an identifier/number that can be used to switch within # the state machine. # 3. Some opcode are order dependent. # 4. Output files often use "tricks" to reduce diff effects. Some of what # we do below is more clumsy looking than it could be because of this. # Op/state properties: # # Property In Descr # ---------------------------------------------------------------------------- # name Both Name of op/state # id Both integer value for this opcode/state # optype Both Either 'op' or 'state' # line_num Both line_num number of the input file for this item. # type Op Type of node (aka regnode_kind) # code Op Meta about the node, used to detect variable length nodes # suffix Op which regnode struct this uses, so if this is '1', it # uses 'struct regnode_1' # flags Op S for simple; V for varies # longj Op Boolean as to if this node is a longjump # comment Both Comment about node, if any. Placed in perlredebguts # as its description # pod_comment Both Special comments for pod output (preceding lines in def) # Such lines begin with '#*' # Global State my @all; # all opcodes/state my %all; # hash of all opcode/state names my @ops; # array of just opcodes my @states; # array of just states my $longest_name_length= 0; # track lengths of names for nicer reports my (%type_alias); # map the type (??) # register a newly constructed node into our state tables. # ensures that we have no name collisions (on name anyway), # and issues the "id" for the node. sub register_node { my ($node)= @_; if ( $all{ $node->{name} } ) { die "Duplicate item '$node->{name}' in regcomp.sym line $node->{line_num} " . "previously defined on line $all{ $node->{name} }{line_num}\n"; } elsif (!$node->{optype}) { die "must have an optype in node ", Dumper($node); } elsif ($node->{optype} eq "op") { push @ops, $node; } elsif ($node->{optype} eq "state") { push @states, $node; } else { die "Uknown optype '$node->{optype}' in ", Dumper($node); } $node->{id}= 0 + @all; push @all, $node; $all{ $node->{name} }= $node; if ($node->{longj} && $node->{longj} != 1) { die "longj field must be in [01] if present in ", Dumper($node); } } # Parse and add an opcode definition to the global state. # What an opcode definition looks like is given in regcomp.sym. # # Not every opcode definition has all of the components. We should maybe make # this nicer/easier to read in the future. Also note that the above is tab # sensitive. # Special comments for an entry precede it, and begin with '#*' and are placed # in the generated pod file just before the entry. sub parse_opcode_def { my ( $text, $line_num, $pod_comment )= @_; my $node= { line_num => $line_num, pod_comment => $pod_comment, optype => "op", }; # first split the line into three, the initial NAME, a middle part # that we call "desc" which contains various (not well documented) things, # and a comment section. @{$node}{qw(name desc comment)}= /^(\S+)\s+([^\t]+?)\s*;\s*(.*)/ or die "Failed to match $_"; # the content of the "desc" field from the first step is extracted here: @{$node}{qw(type code suffix flags longj)}= split /[,\s]\s*/, $node->{desc}; defined $node->{$_} or $node->{$_} = "" for qw(type code suffix flags longj); register_node($node); # has to be before the type_alias code below if ( !$all{ $node->{type} } and !$type_alias{ $node->{type} } ) { #warn "Regop type '$node->{type}' from regcomp.sym line $line_num" # ." is not an existing regop, and will be aliased to $node->{name}\n" # if -t STDERR; $type_alias{ $node->{type} }= $node->{name}; } $longest_name_length= length $node->{name} if length $node->{name} > $longest_name_length; } # parse out a state definition and add the resulting data # into the global state. may create multiple new states from # a single definition (this is part of the point). # Format for states: # REGOP \t typelist [ \t typelist] # typelist= namelist # = namelist:FAIL # = name:count # Eg: # WHILEM A_pre,A_min,A_max,B_min,B_max:FAIL # BRANCH next:FAIL # CURLYM A,B:FAIL # # The CURLYM definition would create the states: # CURLYM_A, CURLYM_A_fail, CURLYM_B, CURLYM_B_fail sub parse_state_def { my ( $text, $line_num, $pod_comment )= @_; my ( $type, @lists )= split /\s+/, $text; die "No list? $type" if !@lists; foreach my $list (@lists) { my ( $names, $special )= split /:/, $list, 2; $special ||= ""; foreach my $name ( split /,/, $names ) { my $real= $name eq 'resume' ? "resume_$type" : "${type}_$name"; my @suffix; if ( !$special ) { @suffix= (""); } elsif ( $special =~ /\d/ ) { @suffix= ( 1 .. $special ); } elsif ( $special eq 'FAIL' ) { @suffix= ( "", "_fail" ); } else { die "unknown :type ':$special'"; } foreach my $suffix (@suffix) { my $node= { name => "$real$suffix", optype => "state", type => $type || "", comment => "state for $type", line_num => $line_num, }; register_node($node); } } } } sub process_flags { my ( $flag, $varname, $comment )= @_; $comment= '' unless defined $comment; my @selected; my $bitmap= ''; for my $node (@ops) { my $set= $node->{flags} && $node->{flags} eq $flag ? 1 : 0; # Whilst I could do this with vec, I'd prefer to do longhand the arithmetic # ops in the C code. my $current= do { no warnings; ord substr $bitmap, ( $node->{id} >> 3 ); }; substr( $bitmap, ( $node->{id} >> 3 ), 1 )= chr( $current | ( $set << ( $node->{id} & 7 ) ) ); push @selected, $node->{name} if $set; } my $out_string= join ', ', @selected, 0; $out_string =~ s/(.{1,70},) /$1\n /g; my $out_mask= join ', ', map { sprintf "0x%02X", ord $_ } split '', $bitmap; return $comment . <<"EOP"; #define REGNODE_\U$varname\E(node) (PL_${varname}_bitmask[(node) >> 3] & (1 << ((node) & 7))) #ifndef DOINIT EXTCONST U8 PL_${varname}\[] __attribute__deprecated__; #else EXTCONST U8 PL_${varname}\[] __attribute__deprecated__ = { $out_string }; #endif /* DOINIT */ #ifndef DOINIT EXTCONST U8 PL_${varname}_bitmask[]; #else EXTCONST U8 PL_${varname}_bitmask[] = { $out_mask }; #endif /* DOINIT */ EOP } sub print_process_EXACTish { my ($out)= @_; # Creates some bitmaps for EXACTish nodes. my @folded; my @req8; my $base; for my $node (@ops) { next unless $node->{type} eq 'EXACT'; my $name = $node->{name}; $base = $node->{id} if $name eq 'EXACT'; my $index = $node->{id} - $base; # This depends entirely on naming conventions in regcomp.sym $folded[$index] = $name =~ /^EXACTF/ || 0; $req8[$index] = $name =~ /8/ || 0; } die "Can't cope with > 32 EXACTish nodes" if @folded > 32; my $exactf = sprintf "%X", oct("0b" . join "", reverse @folded); my $req8 = sprintf "%X", oct("0b" . join "", reverse @req8); print $out <) { # Special pod comments if (/^#\* ?/) { $pod_comment .= "# $'"; } # Truly blank lines possibly surrounding pod comments elsif (/^\s*$/) { $pod_comment .= "\n" } next if /\A\s*#/ || /\A\s*\z/; s/\s*\z//; if (/^-+\s*$/) { $seen_sep= 1; next; } if ($seen_sep) { parse_state_def( $_, $., $pod_comment ); } else { parse_opcode_def( $_, $., $pod_comment ); } $pod_comment= ""; } close $in_fh; die "Too many regexp/state opcodes! Maximum is 256, but there are ", 0 + @all, " in file!" if @all > 256; } # use fixed width to keep the diffs between regcomp.pl recompiles # as small as possible. my ( $base_name_width, $rwidth, $twidth )= ( 22, 12, 9 ); sub print_state_defs { my ($out)= @_; printf $out < $#ops, -$base_name_width, REGMATCH_STATE_MAX => $#all; my %rev_type_alias= reverse %type_alias; my $base_format = "#define %*s\t%d\t/* %#04x %s */\n"; my @withs; my $in_states = 0; my $max_name_width = 0; for my $ref (\@ops, \@states) { for my $node ($ref->@*) { my $len = length $node->{name}; $max_name_width = $len if $max_name_width < $len; } } die "Do a white-space only commit to increase \$base_name_width to" . " $max_name_width; then re-run" if $base_name_width < $max_name_width; print $out <{name}, $node->{id}, $node->{comment}); if ( defined( my $alias= $rev_type_alias{ $node->{name} } ) ) { print_state_def_line($out, $alias, $node->{id}, $node->{comment}); } } print $out "\t/* ------------ States ------------- */\n"; for my $node (@states) { print_state_def_line($out, $node->{name}, $node->{id}, $node->{comment}); } } sub print_state_def_line { my ($fh, $name, $id, $comment) = @_; # The sub-names are like '_tb' or '_tb_p8' = max 6 chars wide my $name_col_width = $base_name_width + 6; my $base_id_width = 3; # Max is '255' or 3 cols my $mid_id_width = 3; # Max is '511' or 3 cols my $full_id_width = 3; # Max is '1023' but not close to using the 4th my $line = "#define " . $name; $line .= " " x ($name_col_width - length($name)); $line .= sprintf "%*s", $base_id_width, $id; $line .= " " x $mid_id_width; $line .= " " x ($full_id_width + 2); $line .= "/* "; my $hanging = length $line; # Indent any subsequent line to this pos $line .= sprintf "0x%02x", $id; my $columns = 78; # From the documentation: 'In fact, every resulting line will have length # of no more than "$columns - 1"' $line = wrap($columns + 1, "", " " x $hanging, "$line $comment"); chomp $line; # wrap always adds a trailing \n $line =~ s/ \s+ $ //x; # trim, just in case. # The comment may have wrapped. Find the final \n and measure the length # to the end. If it is short enough, just append the ' */' to the line. # If it is too close to the end of the space available, add an extra line # that consists solely of blanks and the ' */' my $len = length($line); my $rindex = rindex($line, "\n"); if (length($line) - rindex($line, "\n") - 1 <= $columns - 3) { $line .= " */\n"; } else { $line .= "\n" . " " x ($hanging - 3) . "*/\n"; } print $fh $line; # And add the 2 subsidiary #defines used when switching on # with_t_UTF8nes() my $with_id_t = $id * 2; for my $with (qw(tb t8)) { my $with_name = "${name}_$with"; print $fh "#define ", $with_name; print $fh " " x ($name_col_width - length($with_name) + $base_id_width); printf $fh "%*s", $mid_id_width, $with_id_t; print $fh " " x $full_id_width; printf $fh " /*"; print $fh " " x (4 + 2); # 4 is width of 0xHH that the base entry uses printf $fh "0x%03x */\n", $with_id_t; $with_id_t++; } # Finally add the 4 subsidiary #defines used when switching on # with_tp_UTF8nes() my $with_id_tp = $id * 4; for my $with (qw(tb_pb tb_p8 t8_pb t8_p8)) { my $with_name = "${name}_$with"; print $fh "#define ", $with_name; print $fh " " x ($name_col_width - length($with_name) + $base_id_width + $mid_id_width); printf $fh "%*s", $full_id_width, $with_id_tp; printf $fh " /*"; print $fh " " x (4 + 2); # 4 is width of 0xHH that the base entry uses printf $fh "0x%03x */\n", $with_id_tp; $with_id_tp++; } print $fh "\n"; # Blank line separates groups for clarity } sub print_typedefs { my ($out)= @_; print $out <{suffix} and $len < length($node->{suffix})) { $len= length $node->{suffix}; } } $len += length "struct regnode_"; $len = (int($len/5)+2)*5; my $prefix= "tregnode"; foreach my $node (sort { $a->{name} cmp $b->{name} } @ops) { my $struct_name= "struct regnode"; if (my $suffix= $node->{suffix}) { $struct_name .= "_$suffix"; } $node->{typedef}= $prefix . "_" . $node->{name}; printf $out "typedef %*s %s;\n", -$len, $struct_name, $node->{typedef}; } print $out <{suffix}; $node->{arg_len}= $size; } { my $varies= 0; $varies= 1 if $node->{code} and $node->{code}=~"str"; $node->{arg_len_varies}= $varies; } $node->{off_by_arg}= $node->{longj} || 0; print $out " {\n"; print $out " /* #$node_idx $node->{optype} $node->{name} */\n"; foreach my $f_idx (0..$#fields) { my $field= $fields[$f_idx]; printf $out " .%s = %s", $field, $node->{$field} // 0; printf $out $f_idx == $#fields ? "\n" : ",\n"; } print $out " }"; print $out $node_idx==$#all ? "\n" : ",\n"; } print $out <{name}",), $node->{id} - $ofs; if ( $node->{id} == $#ops and @ops != @all ) { print $out "\t/* ------------ States ------------- */\n"; $ofs= $#ops; $sym= 'REGNODE_MAX +'; } } print $out <new(); foreach my $file ( "op_reg_common.h", "regexp.h" ) { $hp->read_file($file); foreach my $line_info (@{$hp->lines}) { next unless $line_info->{type} eq "content" and $line_info->{sub_type} eq "#define"; my $line= $line_info->{line}; $line=~s/\s*\\\n\s*/ /g; # optional leading '_'. Return symbol in $1, and strip it from # comment of line. Currently doesn't handle comments running onto # next line if ($line=~s/^ \# \s* define \s+ ( _? RXf_ \w+ ) \s+ //xi) { chomp($line); my $define= $1; my $orig= $_; $line=~s{ /\* .*? \*/ }{ }x; # Replace comments by a blank # Replace any prior defined symbols by their values foreach my $key ( keys %definitions ) { $line=~s/\b$key\b/$definitions{$key}/g; } # Remove the U suffix from unsigned int literals $line=~s/\b([0-9]+)U\b/$1/g; my $newval= eval $line; # Get numeric definition $definitions{$define}= $newval; next unless $line =~ /<new(); my $last_val = 0; foreach my $file ("regcomp.h") { $hp->read_file($file); my @bit_tuples; foreach my $line_info (@{$hp->lines}) { next unless $line_info->{type} eq "content" and $line_info->{sub_type} eq "#define"; my $line= $line_info->{line}; $line=~s/\s*\\\n\s*/ /g; # optional leading '_'. Return symbol in $1, and strip it from # comment of line if ( $line =~ m/^ \# \s* define \s+ ( PREGf_ ( \w+ ) ) \s+ 0x([0-9a-f]+)(?:\s*\/\*(.*)\*\/)?/xi ){ chomp $line; my $define= $1; my $abbr= $2; my $hex= $3; my $comment= $4; my $val= hex($hex); my $bin= sprintf "%b", $val; if ($bin=~/1.*?1/) { die "Not expecting multiple bits in PREGf" } my $bit= length($bin) - 1 ; $comment= $comment ? " - $comment" : ""; if ($bit_tuples[$bit]) { die "Duplicate PREGf bit '$bit': $define $val ($hex)"; } $bit_tuples[$bit]= [ $bit, $val, $abbr, $define, $comment ]; } } foreach my $i (0..$#bit_tuples) { my $bit_tuple= $bit_tuples[$i]; if (!$bit_tuple) { $bit_tuple= [ $i, 1<<$i, "", "", "*UNUSED*" ]; } my ($bit, $val, $abbr, $define, $comment)= @$bit_tuple; printf $out qq(\t%-30s/* (1<<%2d) - 0x%08x - %s%s */\n), qq("$abbr",), $bit, $val, $define, $comment; } $REG_INTFLAGS_NAME_SIZE=0+@bit_tuples; } print $out <' ); my $node; my $code; my $name_fmt= '<' x ( $longest_name_length - 1 ); my $descr_fmt= '<' x ( 58 - $longest_name_length ); eval <{pod_comment} ^$name_fmt ^<<<<<<<<< ^$descr_fmt~~ \$node->{name}, \$code, defined \$node->{comment} ? \$node->{comment} : '' . 1; EOD my $old_fh= select($guts); $~= "GuTS"; open my $oldguts, '<', 'pod/perldebguts.pod' or die "$0 cannot open pod/perldebguts.pod for reading: $!"; while (<$oldguts>) { print; last if /=for regcomp.pl begin/; } print <<'END_OF_DESCR'; # TYPE arg-description [regnode-struct-suffix] [longjump-len] DESCRIPTION END_OF_DESCR for my $n (@ops) { $node= $n; $code= "$node->{code} " . ( $node->{suffix} || "" ); $code .= " $node->{longj}" if $node->{longj}; if ( $node->{pod_comment} ||= "" ) { # Trim multiple blanks $node->{pod_comment} =~ s/^\n\n+/\n/; $node->{pod_comment} =~ s/\n\n+$/\n\n/; } write; } print "\n"; while (<$oldguts>) { last if /=for regcomp.pl end/; } do { print } while <$oldguts>; #win32 can't unlink an open FH close $oldguts or die "Error closing pod/perldebguts.pod: $!"; select $old_fh; close_and_rename($guts); } my $confine_to_core = 'defined(PERL_CORE) || defined(PERL_EXT_RE_BUILD)'; read_definition("regcomp.sym"); if ($ENV{DUMP}) { require Data::Dumper; print Data::Dumper::Dumper(\@all); exit(1); } my $out= open_new( 'regnodes.h', '>', { by => 'regen/regcomp.pl', from => [ 'regcomp.sym', 'op_reg_common.h', 'regexp.h' ], }, ); print $out "#if $confine_to_core\n\n"; print_typedefs($out); print_state_defs($out); print_regnode_name($out); print_regnode_info($out); print_reg_extflags_name($out); print_reg_intflags_name($out); print_process_flags($out); print_process_EXACTish($out); print $out "\n#endif /* $confine_to_core */\n"; read_only_bottom_close_and_rename($out); do_perldebguts();