author    Yves Orton <demerphq@gmail.com>    2022-12-03 00:12:07 +0100
committer Yves Orton <demerphq@gmail.com>    2022-12-24 01:17:22 +0100
commit    d655b7ce0d9f884dc0965633e816e31c75a953a1 (patch)
tree      eb9c110e92bf8b3949451cd147603fcb0bf3273d /regen
parent    915f2e86e7f36f03ca41bf068e9679a75bc301ae (diff)
download  perl-d655b7ce0d9f884dc0965633e816e31c75a953a1.tar.gz
regen/HeaderParser.pm - A module to parse our header files
Consistent proper header file parsing with an OO interface. There are various traps and zaps parsing header files, such as line continuations, and multiline comments acting as a line continuation. There are also issues that the naive may overlook, such as indented preprocessor directives. There are also some specialized tasks which we perform to construct header files from other header files, such as grouping content under similar guard clauses together, normalizing guard clauses, and the like.

HeaderParser provides an API to handle these issues. Code which needs to read or write header files can use HeaderParser to group and normalize the content it is reading or writing. This also frees the code generators from needing to worry about indentation or similar artifacts; HeaderParser normalizes it all away.

This patch includes migrating everything to use the new infra, but it does not include some of the changes that would come with that new infra, so it would not pass tests on THIS commit. Running `make regen` should "fix" this; it is deliberate, to make rebasing the branch easier.

One of the notable changes in this commit is that embed.fnc is now under the control of `make regen` (even though it is an input to `make regen`), and any changes to it will be automatically tidied by running it, even if those changes also trigger other changes. HeaderParser sits underneath it all, so there is no chicken-and-egg problem that would require running `make regen` twice: the output of processing the modified embed.fnc would be identical to the output of processing the original.
Diffstat (limited to 'regen')
-rw-r--r--  regen/HeaderParser.pm  1553
-rw-r--r--  regen/tidy_embed.pl      61
2 files changed, 1614 insertions, 0 deletions
diff --git a/regen/HeaderParser.pm b/regen/HeaderParser.pm
new file mode 100644
index 0000000000..8bc2c72548
--- /dev/null
+++ b/regen/HeaderParser.pm
@@ -0,0 +1,1553 @@
+package HeaderParser;
+use strict;
+use warnings;
+
+# these are required below in BEGIN statements; we can't have a
+# hard dependency on them as they might not be available when
+# we run as part of autodoc.pl
+#
+# use Data::Dumper;
+# use Storable qw(dclone);
+#
+use Carp qw(confess);
+use Text::Tabs qw(expand unexpand);
+use Text::Wrap qw(wrap);
+
+# The style of this file is determined by:
+#
+# perltidy -w -ple -bbb -bbc -bbs -nolq -l=80 -noll -nola -nwls='=' \
+# -isbc -nolc -otr -kis -ci=4 -se -sot -sct -nsbl -pt=2 -fs \
+# -fsb='#start-no-tidy' -fse='#end-no-tidy' -cpb -bfvt=2
+
+my (
+ %unop, # unary operators and their precedence
+ %binop, # binary operators and their precedence
+ %is_right_assoc, # operators which are right associative
+ %precedence, # precedence of all operators.
+ %associative, # associative operators
+ %commutative, # commutative operators
+ %cmpop, # comparison operators
+ $unop_pat, # pattern to match unary operators
+ $binop_pat, # pattern to match binary operators
+ %op_names, # map of op to description, used in error messages
+ $tokenize_pat # a pattern which can tokenize an expression
+);
+
+BEGIN {
+ # this is initialization for the operator precedence expression parser
+ # we use for handling preprocessor conditions.
+ %op_names= (
+ '==' => 'equality',
+ '!=' => 'inequality',
+ '<<' => 'bit-shift-left',
+ '>>' => 'bit-shift-right',
+ '+' => 'addition',
+ '-' => 'subtraction',
+ '*' => 'multiplication',
+ '/' => 'division',
+ '%' => 'modulo',
+ '||' => 'logical-or', # Lowest precedence
+ '&&' => 'logical-and',
+ '|' => 'binary-or',
+ '^' => 'binary-xor',
+ '&' => 'binary-and',
+ '<' => 'less-than', # split on spaces, all with equal precedence
+ '>' => 'greater-than',
+ '<=' => 'less-than-or-equal',
+ '>=' => 'greater-than-or-equal',
+ );
+ my @cmpop= (
+ '== !=', # listed in lowest to highest precedence
+ '< > <= >=', # split on spaces, all with equal precedence
+ );
+ my @binop= (
+ '||', # Lowest precedence
+ '&&',
+ '|',
+ '^',
+ '&',
+ @cmpop, # include the numerical comparison operators.
+ '<< >>',
+ '+ -',
+ '* / %', # highest precedence operators.
+ );
+
+ my @unop= qw( ! ~ + - );
+ %unop= map { $_ => 1 } @unop;
+ %cmpop= map { $_ => 1 } map { split /\s+/, $_ } @cmpop;
+ %binop= map { $_ => 1 } map { split /\s+/, $_ } @binop;
+
+ my $make_pat= sub {
+ my $pat= join "|", sort { length($b) <=> length($a) || $a cmp $b }
+ map quotemeta($_), @_;
+ return qr/$pat/;
+ };
+ $unop_pat= $make_pat->(@unop);
+ foreach my $ix (0 .. $#binop) {
+ my $sym= $binop[$ix];
+ $precedence{$_}= (1 + $ix) * 10 for split /\s+/, $sym;
+ }
+ $is_right_assoc{"?"}= 1;
+ $is_right_assoc{":"}= 1;
+ $precedence{"?"}= 1;
+ $precedence{":"}= 0;
+
+ $associative{$_}++
+ for qw( || && + *); # we leave '==' out so we don't reorder terms
+ $commutative{$_}++ for qw( || && + *);
+
+ $binop_pat= $make_pat->(keys %precedence);
+ $tokenize_pat= qr/
+ ^(?:
+ (?<comment> \/\*.*?\*\/ )
+ | (?<ws> \s+ )
+ | (?<term>
+ (?<literal>
+ (?<define> defined\(\w+\) )
+ | (?<func> \w+\s*\(\s*\w+(?:\s*,\s*\w+)*\s*\) )
+ | (?<const> (?:0x[a-fA-F0-9]+|\d+[LU]*|'.') )
+ | (?<sym> \w+ )
+ )
+ | (?<op> $binop_pat | $unop_pat )
+ | (?<paren> [\(\)] )
+ )
+ )
+ /xs;
+}
+
+# dump the arguments with Data::Dumper. Wraps loading Dumper,
+# as we are executed by miniperl where Dumper isn't available.
+sub dd {
+ my $self= shift;
+ local $self->{orig_content};
+ my $ret= "(dump not available)";
+ eval {
+ require Data::Dumper;
+ $ret= Data::Dumper->new(\@_)->Indent(1)->Sortkeys(1)->Useqq(1)->Dump();
+ };
+ return $ret;
+}
+
+my $has_storable;
+
+# same story here, in miniperl we use slow perl code,
+# in real perl we can use Storable and speed things up.
+BEGIN { eval "use Storable; \$has_storable=1;" }
+
+# recursively copy an AoAoA...
+sub copy_aoa {
+ my ($aoa)= @_;
+ if ($has_storable) {
+ return Storable::dclone($aoa);
+ }
+ else {
+ return _copy_aoa($aoa);
+ }
+}
+
+sub _copy_aoa {
+ my ($thing)= @_;
+ if (ref $thing) {
+ return [ map { ref($_) ? _copy_aoa($_) : $_ } @$thing ];
+ }
+ else {
+ return $thing;
+ }
+}
+
+# return the number of characters that should go in between a '#' and
+# the name of a C preprocessor directive. Returns 0 spaces for level
+# 0, and 2 * ($level - 1) + 1 spaces for the rest (1, 3, 5, etc).
+# This might sound weird, but consider these are tab *stops* and the
+# '#' is included in the total, which means indents of 2, 4, 6 etc.
+sub indent_chars {
+ my ($self, $level)= @_;
+ my $ind= "";
+ $ind .= " " if $level;
+ $ind .= " " x ($level - 1) if $level > 1;
+ return $ind;
+}
+
+# we use OO to store state, etc.
+sub new {
+ my ($class, %args)= @_;
+ return bless \%args, $class;
+}
+
+# this parses the expression into an array of tokens
+# this is somewhat crude, we could do this incrementally
+# if we wanted and avoid the overhead. but it makes it
+# easier to debug the tokenizer.
+sub _tokenize_expr {
+ my ($self, $expr)= @_;
+ delete $self->{tokens};
+ delete $self->{parse_tree};
+ $self->{original_expr}= $expr;
+
+ my @tokens;
+ while ($expr =~ s/$tokenize_pat//xs) {
+ push @tokens, {%+} if defined $+{'term'};
+ }
+ $self->{tokens}= \@tokens;
+ warn $self->dd($self) if $self->{debug};
+ if (length $expr) {
+ confess "Failed to tokenize_expr: $expr\n";
+ }
+ return \@tokens;
+}
+
+# sort terms in an expression in a way that puts things
+# in a sensible order. Anything starting with PERL_IN_
+# should be on the left in alphabetical order. Digits
+# should be on the right (eg 0), and ties are resolved
+# by stripping non-alphanumerics, thus removing underbars,
+# parens, spaces, logical operators, etc, and then by
+# lc comparison of the result.
+sub _sort_terms {
+ my $self= shift;
+ my (@terms)= map {
+ [
+ $_, # 0: raw
+ lc($_) =~ s/[^a-zA-Z0-9]//gr, # 1: "_" stripped and caseless
+ $_ =~ m/PERL_IN_/ ? 1 : 0, # 2: PERL_IN_ labeled define
+ $_ =~ m/^\d/ ? 1 : 0, # 3: digit
+ $_ =~ m/DEBUGGING/ ? 1 : 0 # 4: DEBUGGING?
+ ]
+ } @_;
+ my %seen;
+ #start-no-tidy
+ @terms= map { $seen{ $_->[0] }++ ? () : $_->[0] }
+ sort {
+ $b->[2] <=> $a->[2] || # PERL_IN before others
+ $a->[3] <=> $b->[3] || # digits after others
+ $a->[4] <=> $b->[4] || # DEBUGGING after all else
+ $a->[1] cmp $b->[1] || # stripped caseless cmp
+ lc($a->[0]) cmp lc($b->[0]) || # caseless cmp
+ $a->[0] cmp $b->[0] || # exact cmp
+ 0
+ } @terms;
+ #end-no-tidy
+ return @terms;
+}
+
+# normalize a condition expression by parsing it and then stringifying
+# the parse tree.
+sub tidy_cond {
+ my ($self, $expr)= @_;
+ my $ret= $self->{_tidy_cond_cache}{$expr} //= do {
+ $self->parse_expr($expr) if defined $expr;
+ my $text= $self->_pt_as_str();
+ $text;
+ };
+ $self->{last_expr}= $ret;
+ return $ret;
+}
+
+# convert a parse tree structure to a string recursively.
+#
+# Parse trees are currently made up of arrays, with the count
+# of items in the object determining the type of op it represents.
+# 1 argument: literal value of some sort.
+# 2 arguments: unary operator: 0 slot is the operator, 1 is a parse tree
+# : ternary: 0 slot holds '?', 1 is an array holding three
+# parse trees: cond, true, false
+# 3 arguments or more: binary operator. 0 slot is the op. 1..n are parse trees
+# : note, this is n-ary (multi-operand) for commutative operators like
+# : "+", "*", "&&" and "||", so an expr
+# : like "A && B && !C" would be represented as:
+# : [ "&&", ["A"], ["B"], [ "!",["C"] ] ]
+#
+sub _pt_as_str {
+ my ($self, $node, $parent_op, $depth)= @_;
+
+ $node ||= $self->{parse_tree}
+ or confess "No parse tree?";
+ $depth ||= 0;
+ if (@$node == 1) {
+
+ # it's a literal
+ return $node->[0];
+ }
+ elsif (@$node == 2) {
+
+ # is this a ternary or an unop?
+ if ($node->[0] eq '?') {
+
+ # ternary, the three "parts" are tucked away in
+ # an array in the payload slot
+ my $expr=
+ $self->_pt_as_str($node->[1][0], "?", $depth + 1) . " ? "
+ . $self->_pt_as_str($node->[1][1], "?", $depth + 1) . " : "
+ . $self->_pt_as_str($node->[1][2], "?", $depth + 1);
+
+ # stick parens on if this is a subexpression
+ $expr= "( " . $expr . " )" if $depth;
+ return $expr;
+ }
+ else {
+ if ( $node->[0] eq "!"
+ and @{ $node->[1] } == 2
+ and $node->[1][0] eq "!")
+ {
+ # normalize away !! in expressions.
+ return $self->_pt_as_str($node->[1][1], $parent_op, $depth);
+ }
+
+ # unop - the payload is an optree
+ return $node->[0]
+ . $self->_pt_as_str($node->[1], $node->[0], $depth + 1);
+ }
+ }
+
+ # if we get here we are dealing with a binary operator
+ # the nodes are not necessarily binary, as we "collect"
+ # the terms into a list, thus: A && B && C && D -> ['&&',A,B,C,D]
+ my ($op, @terms)= @$node;
+
+ # convert the terms to strings
+ @terms= map { $self->_pt_as_str($_, $op, $depth + 1) } @terms;
+
+ # sort them to normalize the subexpression
+ my $expr=
+ join " $op ", $associative{$op}
+ ? $self->_sort_terms(@terms)
+ : @terms;
+
+ # stick parens on if this is a subexpression
+ $expr= "( " . $expr . " )" if $depth and !$cmpop{$op};
+
+ # and we are done.
+ return $expr;
+}
+
+# Returns the precedence of an operator; returns 0 if there is no token
+# or the next token is not an op, and confesses if it encounters an op it
+# does not know.
+sub _precedence {
+ my $self= shift;
+ my $token= shift // return 0;
+
+ my $op= (ref $token ? $token->{op} : $token) // return 0;
+
+ return $precedence{$op} // confess "Unknown op '$op'";
+}
+
+# entry point into parsing the tokens; checks that we actually parsed everything
+# and didn't leave anything in the token stream (possible from a malformed expression).
+# Performs some minor textual cleanups using regexes, but then does a proper parse
+# of the expression.
+sub parse_expr {
+ my ($self, $expr)= @_;
+ if (defined $expr) {
+ $expr =~ s/\s*\\\n\s*/ /g;
+ $expr =~ s/defined\s+(\w+)/defined($1)/g;
+ $self->_tokenize_expr($expr);
+ }
+ my $ret= $self->_parse_expr();
+ if (@{ $self->{tokens} }) {
+
+ # if all was well with parsing we should not get here.
+ confess "Unparsed tokens: ", $self->dd($self->{tokens});
+ }
+ $self->{parse_tree}= $ret;
+ return $ret;
+}
+
+# this is just a wrapper around _parse_expr_assoc() which handles
+# parsing an arbitrary expression.
+sub _parse_expr {
+ my ($self)= @_;
+ return $self->_parse_expr_assoc($self->_parse_expr_primary(), 1);
+}
+
+# This handles extracting from the token stream
+# - simple literals
+# - unops (assumed to be right associative)
+# - parens (which reset the precedence acceptable to the parser)
+#
+sub _parse_expr_primary {
+ my ($self)= @_;
+ my $tokens= $self->{tokens}
+ or confess "No tokens in _parse_expr_primary?";
+ my $first= $tokens->[0]
+ or confess "No primary?";
+ if ($first->{paren} and $first->{paren} eq "(") {
+ shift @$tokens;
+ my $expr= $self->_parse_expr();
+ $first= $tokens->[0];
+ if (!$first->{paren} or $first->{paren} ne ")") {
+ confess "Expecting close paren", $self->dd($tokens);
+ }
+ shift @$tokens;
+ return $expr;
+ }
+ elsif ($first->{op} and $unop{ $first->{op} }) {
+ my $op_token= shift @$tokens;
+ return [ $op_token->{op}, $self->_parse_expr_primary() ];
+ }
+ elsif (defined $first->{literal}) {
+ shift @$tokens;
+ return [ $first->{literal} ];
+ }
+ else {
+ die sprintf
+ "Unexpected token '%s', expecting literal, unary, or expression.\n",
+ $first->{term};
+ }
+}
+
+# This is the heart of the expression parser. It uses
+# a pair of nested loops to avoid excessive recursion during parsing,
+# which should be a bit faster than other strategies. It only should
+# recurse when the precedence level changes.
+sub _parse_expr_assoc {
+ my ($self, $lhs, $min_precedence)= @_;
+ my $tokens= $self->{tokens}
+ or confess "No tokens in _parse_expr_assoc";
+ my $la= $tokens->[0]; # lookahead
+ my $la_pr= $self->_precedence($la); # lookahead precedence
+ while ($la && $la_pr >= $min_precedence) {
+ my $op_token= shift @$tokens;
+ my $op_pr= $la_pr; # op precedence
+ if ($op_token->{op} eq "?") {
+ my $mid= $self->_parse_expr();
+ if (@$tokens and $tokens->[0]{op} and $tokens->[0]{op} eq ":") {
+ shift @$tokens;
+ my $tail= $self->_parse_expr();
+ return [ '?', [ $lhs, $mid, $tail ] ];
+ }
+ confess "Panic: expecting ':'", $self->dd($tokens);
+ }
+ my $rhs;
+ eval { $rhs= $self->_parse_expr_primary(); }
+ or die "Error in $op_names{$op_token->{op}} expression: $@";
+ $la= $tokens->[0];
+ $la_pr= $self->_precedence($la);
+ while (
+ $la_pr > $op_pr || # any and larger
+ ( $is_right_assoc{ $op_token->{op} }
+ and $la_pr == $op_pr) # right and equal
+ ) {
+ my $new_precedence= $op_pr + ($la_pr > $op_pr ? 1 : 0);
+ $rhs= $self->_parse_expr_assoc($rhs, $new_precedence);
+ $la= $tokens->[0];
+ $la_pr= $self->_precedence($la);
+ }
+ if ( @$lhs >= 3
+ && $lhs->[0] eq $op_token->{op}
+ && $commutative{ $op_token->{op} })
+ {
+ push @$lhs, $rhs;
+ }
+ else {
+ $lhs= [ $op_token->{op}, $lhs, $rhs ];
+ }
+ }
+ return $lhs;
+}
+
+#entry point for normalizing an if/elif statement;
+#returns the line and condition in normalized form.
+sub normalize_if_elif {
+ my ($self, $line, $line_info)= @_;
+ if (my $dat= $self->{cache_normalize_if_elif}{$line}) {
+ return $dat->{line}, $dat->{cond};
+ }
+ my ($cond);
+ eval {
+ ($line, $cond)= $self->_normalize_if_elif($line);
+ 1;
+ } or die sprintf "Error at line %d\nLine %d: %s\n%s",
+ ($line_info->start_line_num()) x 2, $line, $@;
+ $self->{cache_normalize_if_elif}{$line}= { line => $line, cond => $cond };
+ return ($line, $cond);
+}
+
+#guts of the normalize_if_elif() - cleans up the line, extracts
+#the condition, and then tidies it with tidy_cond().
+sub _normalize_if_elif {
+ my ($self, $line)= @_;
+ my $nl= "";
+ $nl= $1 if $line =~ s/(\n+)\z//;
+ $line =~ s/\s+\z//;
+ my @comment;
+ push @comment, $1 while $line =~ s!\s*(/\*.*?\*/)\z!!;
+ $line =~ s/defined\s*\(\s*(\w+)\s*\)/defined($1)/g;
+ $line =~ s/!\s+defined/!defined/g;
+
+ if ($line =~ /^#((?:el)?if)(n?)def\s+(\w+)/) {
+ my $if= $1;
+ my $not= $2 ? "!" : "";
+ $line= "#$if ${not}defined($3)";
+ }
+ $line =~ s/#((?:el)?if)\s+//
+ or confess "Bad cond: $line";
+ my $if= $1;
+ $line =~ s/!\s+/!/g;
+
+ my $old_cond= $line;
+ my $cond= $self->tidy_cond($old_cond);
+
+ warn "cond - $old_cond\ncond + $cond\n"
+ if $old_cond ne $cond and $self->{debug};
+
+ $line= "#$if $cond";
+ $line .= " " . join " ", reverse @comment if @comment;
+
+ $line .= $nl;
+ return ($line, $cond);
+}
+
+# parses a text buffer as though it was a file on disk
+# calls parse_fh()
+sub parse_text {
+ my ($self, $text)= @_;
+ open my $fh, "<", \$text
+ or die "Failed to open buffer for read: $!";
+ return $self->parse_fh($fh);
+}
+
+# takes a readable filehandle and parses whatever content is
+# returned by reading it. Returns an array of HeaderLine objects.
+# This is the main routine for parsing a header file.
+sub parse_fh {
+ my ($self, $fh)= @_;
+ my @lines;
+ my @cond;
+ my @cond_line;
+ my $last_cond;
+ my $cb= $self->{pre_process_content};
+ $self->{orig_content}= "";
+ my $line_num= 1;
+
+ while (defined(my $line= readline($fh))) {
+ my $start_line_num= $line_num++;
+ $self->{orig_content} .= $line;
+ while ($line =~ /\\\n\z/ or $line =~ m</\*(?:(?!\*/).)*\s*\z>s) {
+ defined(my $read_line= readline($fh))
+ or last;
+ $self->{orig_content} .= $read_line;
+ $line_num++;
+ $line .= $read_line;
+ }
+ while ($line =~ m!/\*(.*?)(\*/|\z)!gs) {
+ my ($inner, $tail)= ($1, $2);
+ if ($tail ne "*/") {
+ confess
+ "Unterminated comment starting at line $start_line_num\n";
+ }
+ elsif ($inner =~ m!/\*!) {
+ confess
+ "Nested/broken comment starting at line $start_line_num\n";
+ }
+ }
+
+ my $raw= $line;
+ my $type= "content";
+ my $sub_type= "text";
+ my $level= @cond;
+ my $do_pop= 0;
+ my $flat= $line;
+ $flat =~ s/\s*\\\n\s*/ /g;
+ $flat =~ s!/\*.*?\*/! !gs;
+ $flat =~ s/\s+/ /g;
+ $flat =~ s/\s+\z//;
+ $flat =~ s/^\s*#\s*/#/g;
+
+ my $line_info=
+ HeaderLine->new(start_line_num => $start_line_num, raw => $raw);
+ my $do_cond_line;
+ if ($flat =~ /^#/) {
+ if ($flat =~ m/^(#(?:el)?if)(n?)def\s+(\w+)/) {
+ my $if= $1;
+ my $not= $2 ? "!" : "";
+ my $sym= $3;
+ $flat =~
+ s/^(#(?:el)?if)(n?)def\s+(\w+)/$if ${not}defined($sym)/;
+ }
+ my $cond; # used in various expressions below
+ if ($flat =~ /^#endif/) {
+ if (!@cond) {
+ confess "Not expecting $flat";
+ }
+ $do_pop= 1;
+ $level--;
+ $type= "cond";
+ $sub_type= "#endif";
+ }
+ elsif ($flat =~ /^#if\b/) {
+ ($flat, $cond)= $self->normalize_if_elif($flat, $line_info);
+ push @cond, [$cond];
+ push @cond_line, $line_info;
+ $type= "cond";
+ $sub_type= "#if";
+ }
+ elsif ($flat =~ /^#elif\b/) {
+ if (!@cond) {
+ confess "No if for $flat";
+ }
+ $level--;
+ ($flat, $cond)= $self->normalize_if_elif($flat, $line_info);
+ $cond[-1][-1]= $self->tidy_cond("!($cond[-1][-1])");
+ $cond_line[-1]= $line_info;
+ push @{ $cond[-1] }, $cond;
+ $type= "cond";
+ $sub_type= "#elif";
+ }
+ elsif ($flat =~ /^#else\b/) {
+ if (!@cond) {
+ confess "No if for $flat";
+ }
+ $level--;
+ $cond[-1][-1]= $self->tidy_cond("!($cond[-1][-1])");
+ $cond_line[-1]= $line_info;
+ $type= "cond";
+ $sub_type= "#else";
+ }
+ elsif ($flat =~ /#undef/) {
+ $type= "content";
+ $sub_type= "#undef";
+ }
+ elsif ($flat =~ /#pragma\b/) {
+ $type= "content";
+ $sub_type= "#pragma";
+ }
+ elsif ($flat =~ /#include\b/) {
+ $type= "content";
+ $sub_type= "#include";
+ }
+ elsif ($flat =~ /#define\b/) {
+ $type= "content";
+ $sub_type= "#define";
+ }
+ elsif ($flat =~ /#error\b/) {
+ $type= "content";
+ $sub_type= "#error";
+ }
+ else {
+ confess "Do not know what to do with $line";
+ }
+ if ($type eq "cond") {
+
+ # normalize conditional lines
+ $line= $flat;
+ $last_cond= $line_info;
+ }
+ }
+ $line =~ s/\n?\z/\n/;
+
+ %$line_info= (
+ cond => copy_aoa(\@cond),
+ type => $type,
+ sub_type => $sub_type,
+ raw => $raw,
+ flat => $flat,
+ line => $line,
+ level => $level,
+ start_line_num => $start_line_num,
+ n_lines => $line_num - $start_line_num,
+ );
+
+ push @lines, $line_info;
+ if ($do_pop) {
+ pop @cond;
+ pop @cond_line;
+ }
+ if ($type eq "content" and $cb) {
+ $cb->($self, $lines[-1]);
+ }
+ }
+ if (@cond_line) {
+ my $msg= "Unterminated conditional block starting line "
+ . $cond_line[-1]->start_line_num();
+ $msg .=
+ " with last conditional operation at line "
+ . $last_cond->start_line_num()
+ if $cond_line[-1] != $last_cond;
+ confess $msg;
+ }
+ $self->{lines}= \@lines;
+ return \@lines;
+}
+
+# returns the last lines we parsed.
+sub lines { $_[0]->{lines} }
+
+# assuming a line looks like an embed.fnc entry, parse it,
+# normalize it, and create an EmbedLine object from it.
+sub tidy_embed_fnc_entry {
+ my ($self, $line_data)= @_;
+ my $line= $line_data->{line};
+ return $line if $line =~ /^\s*:/;
+ return $line unless $line_data->{type} eq "content";
+ return $line unless $line =~ /\|/;
+
+ $line =~ s/\s*\\\n/ /g;
+ $line =~ s/\s+\z//;
+ ($line)= expand($line);
+ my ($flags, $ret, $name, @args)= split /\s*\|\s*/, $line;
+ for ($ret, @args) {
+ s/(\w)\*/$1 */g;
+ s/\*\s+(\w)/*$1/g;
+ s/\*const/* const/g;
+ }
+ my $head= sprintf "%-8s|%-7s", $flags, $ret;
+ $head .= sprintf "|%*s", -(31 - length($head)), $name;
+ if (@args and length($head) > 32) {
+ $head .= "\\\n";
+ $head .= " " x 32;
+ }
+ foreach my $ix (0 .. $#args) {
+ my $arg= $args[$ix];
+ $head .= "|$arg";
+ $head .= "\\\n" . (" " x 32) if $ix < $#args;
+ }
+ $line= $head . "\n";
+
+ if ($line =~ /\\\n/) {
+ my @lines= split /\s*\\\n/, $line;
+ my $len= length($lines[0]);
+ $len < length($_) and $len= length($_) for @lines;
+ $len= int(($len + 7) / 8) * 8;
+ $len= 72 if $len < 72;
+ $line= join("\\\n",
+ (map { sprintf "%*s", -$len, $_ } @lines[ 0 .. $#lines - 1 ]),
+ $lines[-1]);
+ }
+ ($line)= unexpand($line);
+
+ $line_data->{embed}= EmbedLine->new(
+ flags => $flags,
+ return_type => $ret,
+ name => $name,
+ args => \@args,
+ );
+ $line =~ s/\s+\z/\n/;
+ $line_data->{line}= $line;
+ return $line;
+}
+
+# line up the text in a multiline string by a given $fragment
+# of text, inserting whitespace in front or behind the $fragment
+# to get the text to line up. Returns the text. This is wrapped
+# by line_up() and is used to wrap long conditions and comments
+# in the generated code.
+sub _line_up_frag {
+ my ($self, $str, $fragment)= @_;
+ die "has tabs?!" if $str =~ /\t/;
+ my @lines= split /\n/, $str;
+ my $changed= 1;
+ while ($changed) {
+ $changed= 0;
+ foreach my $ix (0 .. $#lines - 1) {
+ my $f_index= 0;
+ my $n_index= 0;
+ while (1) {
+ $f_index= index($lines[$ix], $fragment, $f_index);
+ $n_index= index($lines[ $ix + 1 ], $fragment, $n_index);
+ if ($f_index == -1 or $n_index == -1) {
+ last;
+ }
+ if ($f_index < $n_index) {
+ my $f_idx= $f_index;
+ $f_idx-- while substr($lines[$ix], $f_idx, 1) ne " ";
+ substr($lines[$ix], $f_idx, 0, " " x ($n_index - $f_index));
+ $changed++;
+ last;
+ }
+ elsif ($n_index < $f_index) {
+ my $n_idx= $n_index;
+ $n_idx-- while substr($lines[ $ix + 1 ], $n_idx, 1) ne " ";
+ substr($lines[ $ix + 1 ],
+ $n_idx, 0, " " x ($f_index - $n_index));
+ $changed++;
+ last;
+ }
+ $f_index++;
+ $n_index++;
+ }
+ }
+ }
+ my $ret= join "", map { "$_\n" } @lines;
+ return $ret;
+}
+
+sub line_up {
+ my ($self, $line, @fragments)= @_;
+
+ foreach my $fragment (@fragments) {
+ $line= $self->_line_up_frag($line, $fragment);
+ last if $line ne $_[1];
+ }
+ return $line;
+}
+
+# Takes an array of HeaderLine objects produced by parse_fh()
+# or by group_content(), and turns it into a string.
+sub lines_as_str {
+ my ($self, $lines, $post_process_content)= @_;
+ $lines ||= $self->{lines};
+ my $ret;
+ $post_process_content ||= $self->{post_process_content};
+ my $filter= $self->{filter_content};
+ my $last_line= "";
+ foreach my $line_data (@$lines) {
+ my $line= $line_data->{line};
+ if ($line_data->{type} ne "content" or $line_data->{sub_type} ne "text")
+ {
+ my $level= $line_data->{level};
+ my $ind= $self->indent_chars($level);
+ $line =~ s/^#(\s*)/#$ind/;
+ }
+ if ($line_data->{type} eq "cond") {
+ if ($line_data->{sub_type} =~ /#(?:else|endif)/) {
+ my $joined= join " && ",
+ map { "($_)" } @{ $line_data->{cond}[-1] };
+ my $cond_txt= $self->tidy_cond($joined);
+ $cond_txt= "if $cond_txt" if $line_data->{sub_type} eq "#else";
+ $line =~ s!\s*\z! /* $cond_txt */\n!;
+ }
+ elsif ($line_data->{sub_type} eq "#elif") {
+ my $last_frame= $line_data->{cond}[-1];
+ my $joined= join " && ",
+ map { "($_)" } @$last_frame[ 0 .. ($#$last_frame - 1) ];
+ my $cond_txt= $self->tidy_cond($joined);
+ $line =~ s!\s*\z! /* && $cond_txt */\n!;
+ }
+ }
+ $line =~ s/\s+\z/\n/;
+ if ($last_line eq "\n" and $line eq "\n") {
+ next;
+ }
+ $last_line= $line;
+ if ($line_data->{type} eq "cond") {
+ $line =~ m!(^\s*#\s*\w+\s+)([^/].*?\s*)?(/\*.*)?\n\z!
+ or die "Failed to split cond line: $line";
+ my ($type, $cond, $comment)= ($1, $2, $3);
+ $comment //= "";
+ $cond //= "";
+ my $new_line;
+ if (!length($cond) and $comment) {
+ $comment =~ s!^(/\*\s+)!!
+ and $type .= $1;
+ }
+ my $l= length($type);
+ $line= _my_wrap($type, " " x $l, $cond . $comment);
+ $line =~ s/\n(?!\z)/ \\\n/g;
+ $line =~ s!\\\n\s+((?:\)\s*)*)\*/!$1*/!;
+ $line= $self->line_up($line, "||", "define");
+ }
+ $line_data->{line}= $line;
+ if ($post_process_content and $line_data->{type} eq "content") {
+ $post_process_content->($self, $line_data);
+ }
+ if ($filter and $line_data->{type} eq "content") {
+ $filter->($self, $line_data) or next;
+ }
+ $ret .= $line_data->{line};
+ }
+ return $ret;
+}
+
+# Text::Wrap::wrap has an odd api, so hide it behind a wrapper
+# sub which sets things up properly.
+sub _my_wrap {
+ my ($head, $rest, $line)= @_;
+ local $Text::Wrap::unexpand= 0;
+ local $Text::Wrap::huge= "overflow";
+ local $Text::Wrap::columns= 78;
+ $line= wrap $head, $rest, $line;
+ return $line;
+}
+
+# recursively extract the && expressions from a parse tree,
+# returning the result as strings.
+# if $node is not a '&&' op then it returns $node as a string,
+# otherwise it returns the string form of the arguments to the
+# '&&' op, recursively flattening any '&&' nodes that it might
+# contain.
+sub _and_clauses {
+ my ($self, $node)= @_;
+
+ my @ret;
+ if (@$node < 3 or $node->[0] ne "&&") {
+ return $self->_pt_as_str($node);
+ }
+ foreach my $idx (1 .. $#$node) {
+ push @ret, $self->_and_clauses($node->[$idx]);
+ }
+ return @ret;
+}
+
+# recursively walk a parse tree, and return the literal
+# terms it contains, ignoring any operators in the optree.
+sub _terms {
+ my ($self, $node)= @_;
+ if (@$node == 1) {
+ return $self->_pt_as_str($node);
+ }
+ my @ret;
+ if (@$node == 2) {
+ if ($node->[0] eq "?") {
+ push @ret, map { $self->_terms($_) } @{ $node->[1] };
+ }
+ else {
+ push @ret, $self->_terms($node->[1]);
+ }
+ }
+ else {
+ foreach my $i (1 .. $#$node) {
+ push @ret, $self->_terms($node->[$i]);
+ }
+ }
+ return @ret;
+}
+
+# takes a HeaderLine "cond" AoA and flattens it into
+# a single expression, and then extracts all the and-clauses
+# it contains. Thus [['defined(A)'],['defined(B)']] and
+# [['defined(A) && defined(B)']] both end up as ['defined(A)','defined(B)'].
+sub _flatten_cond {
+ my ($self, $cond_ary)= @_;
+
+ my $expr= join " && ", map {
+ map { "($_)" }
+ @$_
+ } @$cond_ary;
+ return [] unless $expr;
+ my $tree= $self->parse_expr($expr);
+ my %seen;
+ my @and_clause= grep { !$seen{$_}++ } $self->_and_clauses($tree);
+ return \@and_clause;
+}
+
+# Find the best path into a tree of conditions, such that
+# we reuse the maximum number of existing branches. Returns
+# two arrays: the first contains the parts of $cond_array that
+# make up the best path, in the best path order, and the second
+# holds the remaining items in the initial order they were provided.
+# Thus if we have previously stored only the path "A", "B", "C"
+# into the tree, and want to find the best path for
+# ["E","D","C","B","A"] we should return: ["A","B","C"], ["E","D"].
+#
+# This is used to reduce the number of conditions in the grouped content,
+# and is especially helpful when dealing with DEBUGGING related
+# functionality. It is coupled with careful control over the order
+# that we add paths and conditions to the tree.
+sub _best_path {
+ my ($self, $tree_node, $cond_array, @path)= @_;
+ my $best= \@path;
+ my $rest= $cond_array;
+ foreach my $cond (@$cond_array) {
+ if ($tree_node->{$cond}) {
+ my ($new_best, $new_rest)=
+ $self->_best_path($tree_node->{$cond},
+ [ grep $_ ne $cond, @$cond_array ],
+ @path, $cond);
+ if (@$new_best > @$best) {
+ ($best, $rest)= ($new_best, $new_rest);
+ }
+ }
+ }
+ if (@$best == @path) {
+ foreach my $cond (@$cond_array) {
+ my $not_cond= $self->tidy_cond("!($cond)");
+ if ($tree_node->{$not_cond}) {
+ $best= [ @path, $cond ];
+ $rest= [ grep $_ ne $cond, @$cond_array ];
+ last;
+ }
+ }
+ }
+ return ($best, $rest);
+}
+
+# This builds a group content tree from a set of lines. Each content line in
+# the original file is added to the tree based on the conditions that apply to
+# it.
+#
+# The tree is made up of nested HoH's with keys in the HoH being normalized
+# clauses from the {cond} data in the HeaderLine objects.
+#
+# Care is taken to minimize the number of pathways and to reorder clauses to
+# reuse existing pathways and minimize the total number of conditions in the
+# file.
+#
+# The '' key of a hash contains an array of the lines that are part of the
+# condition that lead to that key. Thus lines with no conditions are in
+# @{$tree{''}}, lines with the condition "defined(A) && defined(B)" would be
+# in $tree{"defined(A)"}{"defined(B)"}{""}.
+#
+# The result of this sub is normally passed into _recurse_group_content_tree()
+# which converts it back into a set of HeaderLine objects.
+#
+sub _build_group_content_tree {
+ my ($self, $lines)= @_;
+ $lines ||= $self->{lines};
+ my $filter= $self->{filter_content};
+ my %seen_normal;
+ foreach my $line_data (@$lines) {
+ next if $line_data->{type} ne "content";
+ next if $filter and !$filter->($self, $line_data);
+ my $cond_frames= $line_data->{cond};
+ my $cond_frame= $self->_flatten_cond($cond_frames);
+ my $flat_merged= join " && ", map "($_)", @$cond_frame;
+ my $normalized;
+ if (@$cond_frame) {
+ $normalized= $self->tidy_cond($flat_merged);
+ }
+ else {
+ $normalized= $flat_merged; # empty string
+ }
+ push @{ $seen_normal{$normalized} }, $line_data;
+ }
+ my @debugging;
+ my @non_debugging;
+ foreach my $key (keys %seen_normal) {
+ if ($key =~ /DEBUGGING/) {
+ push @debugging, $key;
+ }
+ else {
+ push @non_debugging, $key;
+ }
+ }
+ @non_debugging=
+ sort { length($a) <=> length($b) || $a cmp $b } @non_debugging;
+ @debugging= sort { length($b) <=> length($a) || $a cmp $b } @debugging;
+ my %tree;
+ foreach my $normal_expr (@non_debugging, @debugging) {
+ my $all_line_data= $seen_normal{$normal_expr};
+
+ my $cond_frame=
+ (length $normal_expr)
+ ? $self->_flatten_cond([ [$normal_expr] ])
+ : [];
+ @$cond_frame= $self->_sort_terms(@$cond_frame);
+ my $node= \%tree;
+ my ($best, $rest)= $self->_best_path($node, $cond_frame);
+ die sprintf "Woah: %d %d %d", 0 + @$best, 0 + @$rest, 0 + @$cond_frame
+ unless @$best + @$rest == @$cond_frame;
+
+ foreach my $cond (@$best, @$rest) {
+ $node= $node->{$cond} ||= {};
+ }
+ push @{ $node->{''} }, @$all_line_data;
+ }
+
+ warn $self->dd(\%tree) if $self->{debug};
+ $self->{tree}= \%tree;
+ return \%tree;
+}
+
+# convert a tree of conditions constructed by _build_group_content_tree()
+# into a set of HeaderLine objects that represents it. Performs the
+# appropriate steps required to reconstitute an if/elif/elif/else sequence
+# by calling _handle_else().
+sub _recurse_group_content_tree {
+ my ($self, $node, @path)= @_;
+ my $depth= 0 + @path;
+ my $ind= $self->indent_chars($depth);
+ my @ret;
+ if ($node->{''}) {
+ if (my $cb= $self->{post_process_grouped_content}) {
+ $cb->($self, $node->{''}, \@path);
+ }
+ if (my $cb= $self->{post_process_content}) {
+ $cb->($self, $_, \@path) for @{ $node->{''} };
+ }
+ push @ret,
+ map { HeaderLine->new(%$_, cond => [@path], level => $depth) }
+ @{ $node->{''} };
+ }
+
+ my %skip;
+ foreach my $expr (
+ map { $_->[0] }
+ sort { $a->[1] cmp $b->[1] || $a->[0] cmp $b->[0] }
+ map { [ $_, lc($_) =~ s/[^A-Za-z0-9]+//gr ] } keys %$node
+ ) {
+ next unless length $expr; # ignore payload
+ my $not= $self->tidy_cond("!($expr)");
+ if ($skip{$expr} or ($not !~ /^!/ and $node->{$not})) {
+ next;
+ }
+ my $kid= $node->{$expr};
+ while (!$node->{$not} and keys(%$kid) == 1 and !$kid->{''}) {
+ my ($kid_key)= keys(%$kid);
+ $expr= $self->tidy_cond("($expr) && ($kid_key)");
+ $kid= $kid->{$kid_key};
+ my $new_not= $self->tidy_cond("!($expr)");
+ if ($node->{$new_not}) {
+ $not= $new_not;
+ $skip{$not}++;
+ }
+ }
+ my $raw= "#${ind}if $expr\n";
+ my $hl= HeaderLine->new(
+ type => "cond",
+ sub_type => "#if",
+ raw => $raw,
+ line => $raw,
+ level => $depth,
+ cond => [ @path, [$expr] ]);
+ my $ar= $self->_recurse_group_content_tree($kid, @path, [$expr]);
+ push @ret, $hl, @$ar;
+ if ($node->{$not}) {
+ $skip{$not}++;
+ my $ar=
+ $self->_handle_else($not, $node->{$not}, $ind, $depth, @path,
+ [$not]);
+ push @ret, @$ar;
+ }
+
+ # and finally the #endif
+
+ $raw= "#${ind}endif\n";
+
+ # we need to extract the condition information from the last line in @ret,
+ # as we don't know which condition we are ending here. It could be an elsif
+ # from deep in the parse tree for instance.
+ # So we need to extract the last frame from the cond structure in the last
+ # line-info in @ret.
+ # BUT if this last line is itself an #endif, then we need to take the second
+ # to last line instead, as the endif would have "popped" that frame off the
+ # condition stack.
+ my $last_ret= $ret[-1];
+ my $idx=
+ ($last_ret->{type} eq "cond" && $last_ret->{sub_type} eq "#endif")
+ ? -2
+ : -1;
+ my $end_line= HeaderLine->new(
+ type => "cond",
+ sub_type => "#endif",
+ raw => $raw,
+ line => $raw,
+ level => $depth,
+ cond => [ @path, $last_ret->{cond}[$idx] ]);
+ push @ret, $end_line;
+ }
+ return \@ret;
+}
+
+# this handles the specific case of an else clause, detecting
+# when an elif can be constructed, and may recursively call itself
+# to deal with if/elif/elif/else chains. Calls back into
+# _recurse_group_content_tree().
+sub _handle_else {
+ my ($self, $not, $kid, $ind, $depth, @path)= @_;
+
+ # extract the first 3 keys - from this we can detect
+ # which of the three scenarios we have to handle.
+ my ($k1, $k2, $k3)=
+ sort { length($a) <=> length($b) || $a cmp $b } keys %$kid;
+ my $not_k1;
+ if (length($k1) and defined($k2) and !defined($k3)) {
+
+ # if we do not have a payload (length($k1)) and we have exactly
+ # two keys (defined($k2) and !defined($k3)) we need to compute
+ # the inverse of $k1, which we will use later.
+ $not_k1= $self->tidy_cond("!($k1)");
+ }
+ my @ret;
+ if (length($k1) and !defined($k2)) {
+
+ # only one child, no payload -> elsif $k1
+ my $sub_expr;
+ do {
+ $sub_expr=
+ !$sub_expr
+ ? $k1
+ : $self->tidy_cond("($sub_expr) && ($k1)");
+ $kid= $kid->{$k1};
+ ($k1, $k2)=
+ sort { length($a) <=> length($b) || $a cmp $b } keys %$kid;
+ } while length($k1) and !defined $k2;
+
+ my $raw= "#${ind}elif $sub_expr\n";
+ push @{ $path[-1] }, $sub_expr;
+ my $hl= HeaderLine->new(
+ type => "cond",
+ sub_type => "#elif",
+ raw => $raw,
+ line => $raw,
+ level => $depth,
+ cond => [ map { [@$_] } @path ]);
+ my $ar= $self->_recurse_group_content_tree($kid, @path);
+ push @ret, $hl, @$ar;
+ }
+ elsif (defined($not_k1) and $not_k1 eq $k2) {
+
+ # two children which are complementary, no payload -> elif $k1 else..
+ my $raw= "#${ind}elif $k1\n";
+
+ push @{ $path[-1] }, $k1;
+ my $hl= HeaderLine->new(
+ type => "cond",
+ sub_type => "#elif",
+ raw => $raw,
+ line => $raw,
+ level => $depth,
+ cond => [ map { [@$_] } @path ]);
+ my $ar= $self->_recurse_group_content_tree($kid->{$k1}, @path);
+ $path[-1][-1]= $k2;
+ my $rest= $self->_handle_else($k2, $kid->{$k2}, $ind, $depth, @path);
+ push @ret, $hl, @$ar, @$rest;
+ }
+ else {
+ # payload, 3+ children, or 2 which are not complementary -> else
+ my $raw= "#${ind}else\n";
+ my $hl= HeaderLine->new(
+ type => "cond",
+ sub_type => "#else",
+ raw => $raw,
+ line => $raw,
+ level => $depth,
+ cond => [ map { [@$_] } @path ]);
+ my $ar= $self->_recurse_group_content_tree($kid, @path);
+ push @ret, $hl, @$ar;
+ }
+ return \@ret;
+}
+
+# group the content in lines by the conditions that apply to them.
+# returns a set of lines representing the new structure
+sub group_content {
+ my ($self, $lines, $filter)= @_;
+ $lines ||= $self->{lines};
+ local $self->{filter_content}= $filter || $self->{filter_content};
+ my $tree= $self->_build_group_content_tree($lines);
+ return $self->_recurse_group_content_tree($tree);
+}
+
+#read a file by name - opens the file and passes the fh into parse_fh().
+sub read_file {
+ my ($self, $file_name, $callback)= @_;
+ $self= $self->new() unless ref $self;
+ $self->{last_file_name}= $file_name;
+ open my $fh, "<", $file_name
+ or confess "Failed to open '$file_name' for read: $!";
+ my $lines= $self->parse_fh($fh);
+ if ($callback) {
+ foreach my $line (@$lines) {
+ $callback->($self, $line);
+ }
+ }
+ return $self;
+}
+
+# These are utility methods for the HeaderLine objects.
+sub HeaderLine::new {
+ my ($class, %self)= @_;
+ return bless \%self, $class;
+}
+sub HeaderLine::cond { $_[0]->{cond} } # AoA
+sub HeaderLine::type { $_[0]->{type} }
+sub HeaderLine::type_is { return $_[0]->type eq $_[1] ? 1 : 0 }
+sub HeaderLine::sub_type { $_[0]->{sub_type} }
+sub HeaderLine::sub_type_is { return $_[0]->sub_type eq $_[1] ? 1 : 0 }
+sub HeaderLine::raw { $_[0]->{raw} }
+sub HeaderLine::flat { $_[0]->{flat} }
+sub HeaderLine::line { $_[0]->{line} }
+sub HeaderLine::level { $_[0]->{level} }
+sub HeaderLine::is_content { return $_[0]->type_is("content") }
+sub HeaderLine::is_cond { return $_[0]->type_is("cond") }
+sub HeaderLine::is_define { return $_[0]->sub_type_is("#define") }
+sub HeaderLine::line_num { $_[0]->{start_line_num} }
+sub HeaderLine::n_lines { $_[0]->{n_lines} }
+sub HeaderLine::embed { $_[0]->{embed} }
+*HeaderLine::start_line_num= *HeaderLine::line_num;
+
+# these are methods for EmbedLine objects
+*EmbedLine::new= *HeaderLine::new;
+sub EmbedLine::flags { $_[0]->{flags} }
+sub EmbedLine::return_type { $_[0]->{return_type} }
+sub EmbedLine::name { $_[0]->{name} }
+sub EmbedLine::args { $_[0]->{args} } # array ref
+
+1;
+
+__END__
+
+=head1 NAME
+
+HeaderParser - A minimal header file parser that can be hooked by other porting
+scripts.
+
+=head1 SYNOPSIS
+
+ my $o= HeaderParser->new();
+ my $lines= $o->parse_fh($fh);
+
+=head1 DESCRIPTION
+
+HeaderParser is a tool to parse C preprocessor header files. The tool
+understands the syntax of preprocessor conditions, and is capable of creating
+a parse tree of the expressions involved, and normalizing them as well.
+
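+For example, condition normalization can be exercised directly through the
+C<tidy_cond()> method; this is just an illustrative sketch, and the expression
+being normalized here is made up:
+
+ my $o= HeaderParser->new();
+ # "defined SYM" is rewritten as "defined(SYM)" and terms are ordered
+ print $o->tidy_cond("defined ABC && !defined(XYZ)"), "\n";
+ # prints something like: defined(ABC) && !defined(XYZ)
+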
+C preprocessor files are a bit tricky to parse properly, especially with a
+"line by line" model. There are a few issues that must be dealt with:
+
+=over 4
+
+=item Line Continuations
+
+Any line ending in "\\\n" (that is, backslash newline) is considered to be part
+of a longer string which continues on the next line. Processors should
+typically replace the "\\\n" with a space when converting to a "real" line.
+
+=item Comments Acting As A Line Continuation
+
+The rules for header files stipulate that C style comments are stripped
+before other content is processed, which means that comments can serve as a
+form of line continuation:
+
+ #if defined(foo) /*
+ */ && defined(bar)
+
+is the same as
+
+ #if defined(foo) && defined(bar)
+
+This type of comment usage is often overlooked by people writing header file
+parsers for the first time.
+
+=item Indented preprocessor directives
+
+It is easy to forget that there may be multiple spaces between the "#"
+character and the directive. It is also easy to forget that there may be
+spaces in *front* of the "#" character. Both of these cases are often overlooked.
+
+=back
+
+The main idea of this module is to provide a single framework for correctly
+parsing the content of our header files in a consistent manner. A secondary
+purpose is to make various tasks we want to do easier, such as normalizing
+content or preprocessor expressions, or just extracting the real "content" of
+the file properly.
+
+=head2 parse_fh
+
+This function parses a filehandle into a set of lines. Each line is represented
+by a hash-based object which contains the following fields:
+
+ bless {
+ cond => [['defined(a)'],['defined(b)']],
+ type => "content",
+ sub_type => undef,
+ raw => $raw_content_of_line,
+ line => $normalized_content_of_line,
+ level => $level,
+ }, "HeaderLine"
+
+A "line" in this context is a logical line, and because of line continuations
+and comments may contain more than one physical line, and thus more than
+one newline, but will always include at least one and will always end with one
+(unless there is no newline at the end of the file). Thus
+
+ before /*
+ this is a comment
+ */ after \
+ and continues
+
+will be treated as a single logical line even though the content is
+spread over four lines.
+
+=over 4
+
+=item cond
+
+An array of arrays containing the normalized expressions of any C preprocessor
+conditional blocks which include the line. Currently each line has its own copy
+of the conditions that apply to it, but that may change, so don't alter
+this data. The inner arrays may contain more than one element. If so then the
+line is part of an "#else" or "#elif" and the clauses should be considered to
+be a conjunction when asking "when is this line included?"; however, when
+considered as part of an if/elif/else chain, each added clause represents the
+most recent condition. In the following you can see how:
+
+ before /* cond => [ ] */
+ #if A /* cond => [ ['A'] ] */
+ do-a /* cond => [ ['A'] ] */
+ #elif B /* cond => [ ['!A', 'B'] ] */
+ do-b /* cond => [ ['!A', 'B'] ] */
+ #else /* cond => [ ['!A', '!B'] ] */
+ do-c /* cond => [ ['!A', '!B'] ] */
+ # if D /* cond => [ ['!A', '!B'], ['D'] ] */
+ do-d /* cond => [ ['!A', '!B'], ['D'] ] */
+ # endif /* cond => [ ['!A', '!B'], ['D'] ] */
+ #endif /* cond => [ ['!A', '!B'] ] */
+ after /* cond => [ ] */
+
+So in the above we can see how the three clauses of the if produce
+a single "frame" in the cond array, but that frame "grows" and changes
+as additional else clauses are added. When an entirely new if block
+is started (D) it gets its own block. Each endif includes the clause
+it terminates.
+
+=item type
+
+This value indicates the type of the line. This may be one of the following:
+'content' or 'cond'. Each type is further qualified by a sub_type.
+
+=item sub_type
+
+This value gives more detail on the type of the line. The possible
+combinations are:
+
+ Type    | Sub Type
+ --------+----------
+ content | text
+         | #include
+         | #define
+         | #undef
+         | #pragma
+         | #error
+ cond    | #if
+         | #elif
+         | #else
+         | #endif
+
+Note that there are no '#ifdef' or '#elifndef' or similar expressions. All
+expressions of that form are normalized into the '#if defined' form to
+simplify processing.
+
+=item raw
+
+This was the raw original text before HeaderParser performed any modifications
+to it.
+
+=item line
+
+This is the normalized and modified text after HeaderParser or any callbacks
+have processed it.
+
+=item level
+
+This is the "indent level" of a line and corresponds to the number of blocks
+that the line is within, not including any blocks that might be created by
+the line itself.
+
+ before /* level => 0 */
+ #if A /* level => 0 */
+ do-a /* level => 1 */
+ #elif B /* level => 0 */
+ do-b /* level => 1 */
+ #else /* level => 0 */
+ do-c /* level => 1 */
+ # if D /* level => 1 */
+ do-d /* level => 2 */
+ # endif /* level => 1 */
+ #endif /* level => 0 */
+ after /* level => 0 */
+
+=back
+
+parse_fh() will throw an exception if it encounters a malformed expression
+or input it cannot handle.
+
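+A small usage sketch (the input text here is invented) showing how the
+returned objects can be inspected via their accessor methods:
+
+ my $o= HeaderParser->new();
+ my $lines= $o->parse_text("#ifdef FOO\n#define BAR 1\n#endif\n");
+ for my $hl (@$lines) {
+     printf "%-8s %-8s %s", $hl->type, $hl->sub_type, $hl->line;
+ }
+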
+=head2 lines_as_str
+
+This function will return a string representation of the lines it is provided.
+
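+A minimal round-trip sketch (the file name is a placeholder): parse a file
+and print it back out in normalized form.
+
+ my $o= HeaderParser->new();
+ print $o->read_file("some_header.h")->lines_as_str();
+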
+=head2 group_content
+
+This function will group the text in the file by the conditions which contain
+it. This is only useful for files where the content is essentially a list and
+where changing the order that lines are output in will not break the resulting
+file.
+
+Each content line will be grouped into a structure of nested if/else blocks
+(elif will produce a new nested block) such that the content under the control
+of a given set of normalized condition clauses is grouped together in the order
+it occurred in the file, and each combined conditional clause is output
+only once.
+
+This means a file like this:
+
+ #if A
+ A
+ #elif K
+ AK
+ #else
+ ZA
+ #endif
+ #if B && Q
+ B
+ #endif
+ #if Q && B
+ BC
+ #endif
+ #if A
+ AD
+ #endif
+ #if !A
+ ZZ
+ #endif
+
+Will end up looking roughly like this:
+
+ #if A
+ A
+ AD
+ #else
+ ZZ
+ # if K
+ AK
+ # else
+ ZA
+ # endif
+ #endif
+ #if B && Q
+ B
+ BC
+ #endif
+
+Content at a given block level always goes before conditional clauses
+at the same nesting level.
+
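+In code, producing a regrouped file looks something like the following
+sketch (the file name is a placeholder):
+
+ my $o= HeaderParser->new();
+ $o->read_file("some_list_of_defines.h");
+ my $grouped= $o->group_content();  # HeaderLine objects, regrouped
+ print $o->lines_as_str($grouped);  # render them back into text
+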
+=head2 HOOKS
+
+There are several hooks available: C<pre_process_content>,
+C<post_process_content>, and C<post_process_grouped_content>. All of these
+hooks will be called with the HeaderParser object as the first argument.
+The "process_content" callbacks will be called with a line hash as the second
+argument, and C<post_process_grouped_content> will be called with an
+array of line hashes for the content in that group, so that the array may be
+modified or sorted. Callbacks called from inside of C<group_content()>
+(that is, C<post_process_content> and C<post_process_grouped_content>) will be
+called with an additional argument containing an array specifying the actual
+conditional "path" to the content (which may differ somewhat from the data in
+a line's "cond" property).
+
+These hooks may do what they like, but generally they will modify the
+"line" property of the line hash to change the final output returned
+by C<lines_as_str()> or C<group_content()>.
+
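+For instance, C<regen/tidy_embed.pl> (added alongside this module) installs
+a C<pre_process_content> hook roughly along these lines:
+
+ my $parser= HeaderParser->new(
+     pre_process_content => sub {
+         my ($self, $line_data)= @_;
+         # normalize embed.fnc style entries as they are parsed
+         $self->tidy_embed_fnc_entry($line_data);
+     },
+ );
+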
+=head2 FORMATTING AND INDENTING
+
+HeaderParser tries hard to produce neat and readable output with a consistent
+style and form. For example:
+
+ #if defined(FOO)
+ # define HAS_FOO
+ # if defined(BAR)
+ # define HAS_FOO_AND_BAR
+ # else /* !defined(BAR) */
+ # define HAS_FOO_NO_BAR
+ # endif /* !defined(BAR) */
+ #endif /* defined(FOO) */
+
+HeaderParser uses two space tab stops for indenting C pre-processor
+directives. It puts the spaces between the "#" and the directive. The "#" is
+considered "part" of the indent, even though the space comes after it. This
+means the first indent level "looks" like one space, and following indents
+look like 2. This should match what a sensible editor would do with two space
+tab stops. The C<indent_chars()> method can be used to convert an indent level
+into a string that contains the appropriate number of spaces to go in between
+the "#" and the directive.
+
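+A short sketch of how C<indent_chars()> maps levels to that spacing:
+
+ my $o= HeaderParser->new();
+ # levels 0..3 put 0, 1, 3 and 5 spaces between '#' and the directive
+ printf "#%sdefine X\n", $o->indent_chars($_) for 0 .. 3;
+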
+When emitting "#endif", "#elif" and "#else" directives comments will be
+emitted also to show the conditions that apply. These comments may be wrapped
+to cover multiple lines. Some effort is made to get these comments to line up
+visually, but it uses heuristics which may not always produce the best result.
+
+=cut
diff --git a/regen/tidy_embed.pl b/regen/tidy_embed.pl
new file mode 100644
index 0000000000..6991b1db04
--- /dev/null
+++ b/regen/tidy_embed.pl
@@ -0,0 +1,61 @@
+use lib "regen";
+use HeaderParser;
+use strict;
+use warnings;
+
+my $parser= HeaderParser->new(
+ pre_process_content => sub {
+ my ($self,$line_data)= @_;
+ $self->tidy_embed_fnc_entry($line_data);
+ my $embed= $line_data->{embed}
+ or return;
+ },
+ _post_process_grouped_content => sub {
+ my ($self,$group_ary)= @_;
+ @{$group_ary}=
+ sort {
+ $a->{embed}{name} cmp $b->{embed}{name}
+ } @{$group_ary};
+ },
+ );
+my $tap;
+if (@ARGV and $ARGV[0] eq "--tap") {
+ $tap = shift @ARGV;
+}
+my $file= "embed.fnc";
+if (@ARGV) {
+ $file= shift @ARGV;
+}
+my $new= "$file.new";
+my $bak= "$file.bak";
+$parser->read_file($file);
+my $lines= $parser->lines;
+my @tail;
+while ($lines->[-1]{type} eq "content" and
+ ($lines->[-1]{line} eq "\n" or $lines->[-1]{line}=~/^\s*:/)
+) {
+ unshift @tail, pop @$lines;
+}
+
+my $grouped_content_ary= $parser->group_content();
+push @$grouped_content_ary, @tail;
+my $grouped_content_txt= $parser->lines_as_str($grouped_content_ary);
+if ($grouped_content_txt ne $parser->{orig_content}) {
+ if ($tap) {
+ print "not ok - $0 $file\n";
+ } elsif (-t) {
+ print "Updating $file\n";
+ }
+ open my $fh,">",$new
+ or die "Failed to open '$new' for write: $!";
+ print $fh $grouped_content_txt
+ or die "Failed to print to '$new': $!";
+ close $fh
+ or die "Failed to close '$new': $!";
+ rename $file, $bak
+ or die "Couldn't move '$file' to '$bak': $!";
+ rename $new, $file
+ or die "Couldn't move embed.fnc.new to embed.fnc: $!";
+} elsif ($tap) {
+ print "ok - $0 $file\n";
+}