summaryrefslogtreecommitdiff
path: root/util/util_precompile.py
diff options
context:
space:
mode:
Diffstat (limited to 'util/util_precompile.py')
-rwxr-xr-xutil/util_precompile.py564
1 files changed, 564 insertions, 0 deletions
diff --git a/util/util_precompile.py b/util/util_precompile.py
new file mode 100755
index 0000000000..c58b811913
--- /dev/null
+++ b/util/util_precompile.py
@@ -0,0 +1,564 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+# Copyright 2020 The Chromium OS Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+"""Process preprocessor output to move strings out of the image.
+
+This script is customized to be used in Chrome OS EC codebase, where functions
+cprintf(),cprints(), and cputs() are used to generate console output.
+
+Using this script requires a change in build process: instead of producing .o
+files from .c files, the process includes two extra steps: generate .E files
+instead of .o files, then process all .E files by this script, modifying some
+of them, producing a set of .Ep files, and then generate .o files from the .Ep
+files.
+
+A typical console output invocation in the EC codebase looks as follows:
+
+ cprintf(<chan>, <format string>, var args);
+
+This script is capable of processing invocations of up to 8 arguments (in fact
+in the Cr50 codebase at the time of introduction maximum number is 5). All
+generated .E files are given to this script as inputs. For each input file the
+script scans input lines and passes as is into the output lines which are not
+console output generating code.
+
+The format strings are extracted from console output generating lines and put
+into a dictionary, each format string being the key, and the value an integer,
+the ever growing string index.
+
+After format strings are extracted and placed in the dictionary input lines
+are replaced with
+
+cmsg<X>(<chan>, <string index>, <param type map>, (uintptr_t)(param1), \
+ (uintptr_t)(param2)...)
+
+- X in cmsgX matches the number of parameters,
+- <chan> is passed as is,
+- <string index> is the integer the format string maps to in the dictionary
+- <param type map> is a 32 bit integer, consisting of up to 8 fields, each
+ field describing the type of the parameter as it was specified in
+ the format string.
+
+Format specifications are interpreted as follows:
+
+%[Xcdux] - integer, 4 bytes
+%ll[Xcdux] - long long, 8 bytes
+%s - string
+%pP - pointer, 4 bytes
+%ph - hex buf, 4 byte pointer to hex buf structure
+%pT - pointer to an 8 byte value (timestamp)
+
+each of the above result in a different param type 4 bit nibble included in
+the <param type map>. A case when __func__ is used as a string parameter
+results in a unique parameter type, the function name is included in the
+dictionary, and just the index is passed as a cmsgX() parameter.
+
+When the original format string includes a 64 bit argument, the generated code
+creates a local 64 bit variable, assigns the argument to it and passes the
+address of the local variable to the cmsgX invocation. This allows all cmsgX
+parameters to be of uintptr_t size.
+
+In cases when a string argument is __func__ this script saves the function
+name in the dictionary and converts the parameter to the index, which is
+interpreted by the terminal to display the function name. A heuristic is used
+to determine the current function name: function starts with a line starting
+in the first column, containing an opening paren and not containing a
+semicolon. The word preceding the opening bracket is extracted and considered
+the current function name. Function scope ends with a } in the first column on
+its own line. This is not a very robust mechanism, but it seems to be adequate
+for this purpose.
+
+After all input files are processed, the format strings dictionary is
+converted into a list of strings such that each string is in the list at the
+location determined by the <string index>. This provides a means of mapping
+string indices into original format strings.
+
+The list is serialized, compressed and saved in a file as a flat blob. This
+file is used by the terminal program which interprets messages generated by
+the cmsgX() invocations and recreates the original string output on the
+terminal.
+
+There is a problem this script presents to a GNU make file: it requires
+multiple inputs to generate multiple outputs, but inputs are processed in a
+batch, only one script invocation is required.
+
+GNU Make does not seem to provide for such a concept, so this script is
+solving this problem as follows:
+
+When starting processing, obtain a file lock, and then verify that at least
+one of the input files is newer than the previously generated strings blob, if it
+exists. If this is the case - process all input files, generate the blob and
+release the lock. If the blob exists and is newer - just release the lock and
+exit.
+
+As a result, when make is running this script is invoked as many times as
+there are .E files, but only one invocation results in processing, the rest
+just check that the newer blob is already there and exit.
+"""
+
+import argparse
+import fcntl
+import os
+import pickle
+import re
+import sys
+import zlib
+
# --- Patterns used to recognize source lines -------------------------------

# Start of a console print call: white space followed by 'cprintf(',
# 'cprints(' or 'cputs('. The only group captures 'rintf', 'rints' or 'uts'.
PRINTF_RX = r'[ \t]+cp(rint[fs]|uts)\('
PTINTLINE = re.compile(PRINTF_RX)
# Print call start together with the channel argument and the comma after it.
# Group 1 is '<white space>cprintf(<chan>', group 2 comes from PRINTF_RX.
HEADER = re.compile(r'(%s[a-zA-Z_0-9]+),[ ]*' % PRINTF_RX)
# Double quote followed by ',' or ')' and not preceded by a backslash.
EOS = re.compile(r'[^\\\\]"[,\)]')
# ')' and ';' separated by white space at the end of the line.
TRAILING_SPACE = re.compile(r'\)[ \t]+;[ \t]*$')
# Preprocessor line marker, e.g. '# 12 "file.c"'.
HASHLINE = re.compile(r'^# [0-9]+ ')
# Line starting in the first column with an identifier, containing '(' and
# no ';' after it.
FUNCTION = re.compile(r'^[a-zA-Z_][a-zA-Z0-9_ \*]+\([^;]+$')

# --- Patterns applied to the text following a '%' in a format string -------

INT_PARAM = re.compile(r'^[0-9.\-]*([l]{0,2}|z)[Xcdux]')
STR_PARAM = re.compile(r'^[0-9.\-]*s')
PTR_PARAM = re.compile(r'^p[hPT]')
# A \xNN escape sequence inside a C string literal.
HEX_ESCAPE = re.compile(r'\\x[0-9a-fA-F]{2}')

# Indexed by string, value - the string enum.
FMT_DICT = {}

# Parameter types, each one fits into a 4 bit field of the parameter map.
(PARAM_INT,
 PARAM_LONG,
 PARAM_STRING,
 PARAM_PTR,
 PARAM_HEX_BUF,
 PARAM_FUNC_NAME,
 PARAM_TIMESTAMP) = range(1, 8)
+
def tokenize(params):
    """Split C argument string into arguments (tokens).

    Arguments within C string are comma separated, potentially include quoted
    strings and character literals, which in turn could include escaped
    quotes, commas and brackets. There could also be parens, possibly nested.

    Only commas found outside quoted literals and parens are considered token
    separators. Inside a literal everything up to the matching unescaped
    quote is copied verbatim: brackets do not affect the nesting level, the
    other kind of quote does not open a new literal, and a backslash always
    escapes exactly the next character (so a literal ending in an escaped
    backslash is closed properly).

    Args:
      params: A string, arguments given to say a C printf() invocation.

    Returns:
      A list of strings, parameters retrieved from 'params' with white space
      stripped. An empty 'params' results in a list of one empty string.
    """
    tokens = []
    token = []
    depth = 0  # Nesting level of (), [] and {} outside literals.
    quote = None  # Quote character of the literal being scanned, if any.
    escaped = False  # True if the previous literal character was a backslash.

    for char in params:
        if quote:
            token.append(char)
            if escaped:
                escaped = False
            elif char == '\\':
                escaped = True
            elif char == quote:
                quote = None
            continue
        if char == ',' and depth == 0:
            tokens.append(''.join(token).strip())
            token = []
            continue
        token.append(char)
        if char in '\'"':
            quote = char
        elif char in '{([':
            depth += 1
        elif char in '}])':
            depth -= 1

    tokens.append(''.join(token).strip())
    return tokens
+
# Single character C escape sequences and the characters they stand for.
_SIMPLE_ESCAPES = {
    "'": "'",
    '"': '"',
    '\\': '\\',
    'a': '\a',
    'b': '\b',
    'f': '\f',
    'n': '\n',
    'r': '\r',
    't': '\t',
    'v': '\v',
}
# A backslash followed either by 'x' and two hex digits (group 1) or by one
# of the single character escapes (group 2).
_ESCAPE_RX = re.compile(r'''\\(?:x([0-9a-fA-F]{2})|(['"\\abfnrtv]))''')


def drop_escapes(fmt):
    """Return passed in string with escaped characters replaced by literals.

    The string is scanned once, left to right, so the text produced by one
    substitution is never interpreted as a part of another escape sequence:
    an escaped backslash followed by 'n' stays a backslash and an 'n', and
    \\x5c yields a single backslash.

    Args:
      fmt: A string, contents of a C string literal without the enclosing
          double quotes.

    Returns:
      A string with \\', \\", \\\\, \\a, \\b, \\f, \\n, \\r, \\t, \\v and
      \\xNN sequences replaced by the characters they represent. Other
      sequences are left intact.
    """
    def _unescape(match):
        hex_digits, simple = match.groups()
        if hex_digits is not None:
            return chr(int(hex_digits, 16))
        return _SIMPLE_ESCAPES[simple]

    return _ESCAPE_RX.sub(_unescape, fmt)
+
def generate_cmsg_line(fmt, params, fmt_blocks,
                       channel, current_function):
    """Given artifacts of a C vararg line generate a cmsgX invocation line.

    See details in the file docstring.

    Args:
      fmt: A string, the format string to be removed from the C source code.
      params: A list of strings, the arguments of the fmt string from the
          C source code. A list of one empty string means no arguments.
      fmt_blocks: A list of strings, the result of splitting the fmt
          string at '%' characters. Could be obtained locally, but is
          available from the caller.
      channel: A string, name of the console channel the message is sent on.
      current_function: A string, name of the C function this line is in, or
          None if the name is not known.

    Returns:
      The cmsgX() string to replace the current source code string.
      Also adds the format string into the dictionary, if it is not yet
      there.
    """
    # Since all parameters passed to cmsgX() are cast to (uintptr_t), uint64_t
    # values could not be passed directly. If an int 64 parameter is
    # encountered, a local variable is created to store the value, and the
    # address of the variable is passed as the parameter to cmsgX().
    #
    # The doubles list below keeps track of these occurrences.
    doubles = []
    param_map = 0
    args = []

    fmt = drop_escapes(fmt)

    # Either find the string among the previously encountered ones, or add it
    # to the dictionary under the next free index.
    str_index = FMT_DICT.setdefault(fmt, len(FMT_DICT))

    # Now build a cmsgX invocation.
    cmsg_index = 0 if params == [''] else len(params)
    generated_code = 'cmsg%d(%s, %d' % (cmsg_index, channel, str_index)
    if cmsg_index == 0:
        return generated_code + ');\n'

    ptr_types = {
        'P': PARAM_PTR,
        'T': PARAM_TIMESTAMP,
        'h': PARAM_HEX_BUF,
    }
    for index, (block, param) in enumerate(zip(fmt_blocks, params)):
        # Each parameter is described by its own 4 bit field in the map.
        position = 4 * index
        match = INT_PARAM.match(block)
        if match:
            if match.group(1) == 'll':
                param_map |= PARAM_LONG << position
                var_name = 'll%d' % len(doubles)
                doubles.append('%s = %s' % (var_name, param))
                args.append('(uintptr_t)&%s' % var_name)
            else:
                param_map |= PARAM_INT << position
                args.append('(uintptr_t)(%s)' % param)
        elif STR_PARAM.match(block):
            if param == '__func__' and current_function:
                param_map |= PARAM_FUNC_NAME << position
                func_index = FMT_DICT.setdefault(current_function,
                                                 len(FMT_DICT))
                args.append('(uintptr_t)%d' % func_index)
            else:
                # Regular string. This is also the fallback for __func__
                # when the function name is not known: storing None in the
                # dictionary would break the blob generation later.
                param_map |= PARAM_STRING << position
                args.append('(uintptr_t)(%s)' % param)
        elif PTR_PARAM.match(block):
            param_map |= ptr_types[block[1]] << position
            args.append('(uintptr_t)(%s)' % param)
        else:
            # The parameter is not included in the generated call, make sure
            # this does not go unnoticed.
            sys.stderr.write(
                'Unsupported conversion "%%%s", parameter "%s" dropped\n' %
                (block, param))

    generated_code += ', %d' % param_map
    generated_code += ''.join(', ' + arg for arg in args)
    generated_code += ');'
    if doubles:
        extra = ['{', ]
        for double in doubles:
            extra.append('\tlong long %s;\n' % double)
        generated_code = '\n'.join(extra) + generated_code + '\n}'

    return generated_code + '\n'
+
def process_ccprintf(line, current_function):
    """Generate cmsgX line based on source cprint[fs]/cputs line.

    Split the input line into elements, and invoke the function to convert the
    elements into the matching cmsgX() invocation.

    Args:
      line: A string, the source line to convert.
      current_function: A string, the name of the C function 'line' comes
          from.

    Returns:
      The generated cmsgX() line, or the unmodified input line if it could
      not be converted (in most such cases a message is written to stderr).
    """
    # Let's split the string into components.
    header = HEADER.search(line)
    if not header:
        return line  # Must be not a valid ccprintf() invocation.

    # If the line has some text before the function name, say 'return
    # ccprintf...' save the text in preamble followed by a space. If not -
    # preamble ends up being a single space.
    start_ofs = header.start()
    preamble = line[:start_ofs] + ' '

    # Not strictly necessary, but makes the output look neater, remove spaces
    # after closing paren til newline.
    line = TRAILING_SPACE.sub(');', line)

    # Retrieve the channel name, guaranteed first argument.
    channel = header.group(1).split('(')[1]
    # 'rintf', 'rints' or 'uts', tells which function is being invoked.
    function = header.group(2)

    # Drop the 'cprintf(<channel>, ' header.
    trailer = HEADER.sub('', line[start_ofs:])

    # Find the end of the quoted format string.
    quoted = EOS.search(trailer)
    if not quoted:
        if function != 'uts':
            sys.stderr.write('Unprocessed string: "%s"\n' % line)
            return line
        # This is a cputs() invocation with indirect string. Let's fix it by
        # converting (param) into ("%s", param).
        fmt = '%s'
        raw_params = trailer.rstrip(';')[:-1]  # Strip ')[;]'.
        params = [raw_params, ]
    else:
        # Extract the fmt string up to and including the closing quote and
        # the character following it.
        fmt = trailer[:quoted.end()]
        # Eliminate concatenations of adjacent literals. The literals could
        # be separated by any amount of white space, as when they come from
        # different source lines. A quote preceded by a backslash is a part
        # of the text, not the end of a literal.
        fmt = re.sub(r'(?<!\\)"\s+"', '', fmt)
        # Drop the starting double quote, and the trailing double quote with
        # the character following it.
        fmt = fmt[1:-2]
        if function == 'rints':
            fmt = '[^T' + fmt  # terminal will add "]\n"

        # Now get the list of parameters coming after the fmt string.
        raw_params = trailer[quoted.end():].rstrip(';')[:-1]
        params = tokenize(raw_params)

    # Eliminate the %% cases and do not include the first element, it is
    # irrelevant.
    fmt_blocks = fmt.replace('%%', '').split('%')[1:]

    if len(fmt_blocks) != len(params):
        # A format without conversions and an empty parameter list is the
        # only acceptable mismatch.
        if fmt_blocks or params[0] != '':
            sys.stderr.write('Unprocessed string: "%s"\n' % line)
            sys.stderr.write('%s\n' % raw_params)
            return line

    if len(params) > 8:
        sys.stderr.write('Too many parameters: "%s"\n' % line)
        return line

    return preamble + generate_cmsg_line(fmt, params, fmt_blocks,
                                         channel, current_function)
+
class LineProcessor(object):
    """Process multiline source code strings.

    The preprocessor output often generates C source code lines split in
    multiple preprocessor output lines, in case there are macros in the
    command line arguments, etc.

    Before the line could be examined to be a printf() like invocation, and if
    so converted into a cmsgX() line, the multiline preprocessor output needs
    to be converted into a single code line.

    This class allows to keep track of multiple preprocessor output lines.

    Attributes:
      partial_line: A string, concatenated preprocessor output lines
          representing a single printf() like invocation statement.
      current_function: A string, name of the function current lines belong
          to, or None when outside of a function or the name is not known.
    """

    # An __attribute__((...)) specification, which could precede the function
    # name and has to be ignored when looking for it.
    _ATTRIBUTE = re.compile(r'__attribute__\s*\(\(.*?\)\)')
    # An identifier followed by an opening paren.
    _NAME_AND_PAREN = re.compile(r'([A-Za-z_][A-Za-z0-9_]*)\s*\(')

    def __init__(self):
        self.partial_line = ''
        self.current_function = None

    @staticmethod
    def _function_name(line):
        """Retrieve function name from the first line of its definition.

        The name is the first identifier followed by an opening paren, with
        __attribute__((...)) specifications not taken into account. This
        way the '*' of a pointer return type does not become a part of the
        name, and parens found in the list of arguments (say, function
        pointer arguments) do not affect the result.

        Args:
          line: A string, the first line of a function definition.

        Returns:
          A string, the function name, or None if it was not found.
        """
        stripped = LineProcessor._ATTRIBUTE.sub(' ', line)
        match = LineProcessor._NAME_AND_PAREN.search(stripped)
        return match.group(1) if match else None

    def process_preprocessor_line(self, line):
        """Process a preprocessor output line.

        Examine the preprocessor output line to see if it falls into one of
        three categories:

        - A first function declaration line. Save the function name for future
          reference.

        - A closing '}' of a function - just drop the previously saved
          function name.

        - A line containing source code print statement
          (cprintf/cprints/cputs). If the complete line is present - call the
          parser to convert it into a cmsgX() invocation. If the
          preprocessor line is an incomplete C source line - keep processing
          preprocessor lines until full source line is received and then call
          the parser.

        Args:
          line: A string, a preprocessor output line.

        Returns:
          The input line if further processing is not required, or None, if
          input line is not yet a full source code line, or the cmsgX()
          invocation line, the result of converting a print statement line.
        """
        if FUNCTION.match(line):
            # This line looks like a first function definition line -
            # retrieve the function name.
            self.current_function = self._function_name(line)
        elif line.startswith('}'):
            # If this is a '}' in the first column - we are not in a function
            # scope any more.
            self.current_function = None

        if self.partial_line:
            if HASHLINE.search(line):
                # Drop preprocessor directives from within multiline C
                # strings.
                return None
            self.partial_line += line.rstrip()
            # Note that any ';' is considered the end of the statement here,
            # even though accumulation is started by a missing ');'.
            if ';' not in line:
                # Still not a complete line, nothing to process yet.
                return None
            # Got a complete C source line, process it.
            line = self.partial_line.rstrip()
            self.partial_line = ''
            return process_ccprintf(line, self.current_function)

        if not PTINTLINE.search(line):
            # If not a print statement - no need to worry, just pass it to the
            # output as is.
            return line
        if line.startswith('int'):
            # This is either the function prototype or the first line of the
            # function definition, no need to convert either.
            return line
        if ');' not in line:
            # This is a line with a print statement, but it is incomplete,
            # start accumulating preprocessor lines.
            self.partial_line = line.rstrip()
            return None
        # Process a full print statement line.
        return process_ccprintf(line.rstrip(), self.current_function)
+
def preobj_process(name, ext):
    """Convert print statements of a C preprocessor output file.

    The input file is read line by line, the lines are passed through a
    LineProcessor instance and whatever non empty text comes back is saved in
    the generated file. The generated file has the same name as the input
    file, with the extension replaced.

    Args:
      name: A string, name of the preprocessor output file to process.
      ext: A string, the extension to use for the generated file.
    """
    base, _ = os.path.splitext(name)
    target = base + '.' + ext
    converter = LineProcessor()
    with open(name, 'r') as source, open(target, 'w') as sink:
        for raw_line in source:
            converted = converter.process_preprocessor_line(raw_line)
            if not converted:
                # Either a part of a multiline statement or a dropped line.
                continue
            sink.write(converted)
+
def parse_args(argv):
    """Prepare parser and parse command line arguments.

    Args:
      argv: A list of strings, the command line including the program name.

    Returns:
      A tuple of the namespace with values of the known options (output, ext
      and lockfile) and the list of all other command line elements, the
      program name being the first one.
    """
    prog = os.path.basename(argv[0])
    options = (
        ('-o', '--output', {}),
        ('-e', '--ext', {'default': 'Ep'}),
        ('-l', '--lockfile', {'default': '/tmp/%s.lockfile' % prog}),
    )
    parser = argparse.ArgumentParser(prog=prog)
    for short_name, long_name, settings in options:
        parser.add_argument(short_name, long_name, **settings)
    return parser.parse_known_args(argv)
+
def generate_blob():
    """Convert format strings dictionary into a blob.

    The strings of the dictionary are placed in a list, each one at the
    position given by its string index. The list is then joined using the
    zero character as the separator, serialized and compressed. The sizes
    before and after compression are reported on the console.

    Returns:
      A byte array, the compressed list of format lines.
    """
    slots = [''] * len(FMT_DICT)
    for text in FMT_DICT:
        slots[FMT_DICT[text]] = text

    payload = pickle.dumps('\0'.join(slots))
    packed = zlib.compress(payload, 9)
    print('dump size %d, compressed size %d' % (len(payload), len(packed)))
    return packed
+
def main(argv):
    """Main function.

    To facilitate the use of this script in GNU Makefile, support multiple
    concurrent invocations such that only one invocation does the processing.

    First lock the lock file, then check if the blob to be generated is older
    than any of the input files. If not - this means a different instance of
    this script has already completed processing, just exit.

    If the output file name is not given in the command line, the input files
    are always processed and the blob is not saved.

    Args:
      argv: A list of strings, the command line including the program name.
    """
    flags, files = parse_args(argv)
    # The first element is the program name, the rest are the input files.
    e_files = files[1:]

    with open(flags.lockfile, 'w') as lfd:
        fcntl.flock(lfd, fcntl.LOCK_EX)
        try:
            # The output name is optional, there is nothing to compare the
            # input files with if it was not given.
            if flags.output and os.path.exists(flags.output):
                outp_stamp = os.stat(flags.output).st_mtime_ns
                if all(os.stat(e_file).st_mtime_ns < outp_stamp
                       for e_file in e_files):
                    # Output file is newer than all inputs.
                    return

            for e_file in e_files:
                preobj_process(e_file, flags.ext)

            zipped = generate_blob()
            if flags.output:
                with open(flags.output, 'wb') as outf:
                    outf.write(zipped)
        finally:
            fcntl.flock(lfd, fcntl.LOCK_UN)


if __name__ == '__main__':
    main(sys.argv)