1 files changed, 564 insertions, 0 deletions
diff --git a/util/util_precompile.py b/util/util_precompile.py
new file mode 100755
index 0000000000..c58b811913
--- /dev/null
+++ b/util/util_precompile.py
@@ -0,0 +1,564 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+# Copyright 2020 The Chromium OS Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+"""Process preprocessor output to move strings out of the image.
+
+This script is customized to be used in Chrome OS EC codebase, where functions
+cprintf(), cprints() and cputs() are used to generate console output.
+
+Using this script requires a change in build process: instead of producing .o
+files from .c files, the process includes two extra steps: generate .E files
+instead of .o files, then process all .E files by this script, modifying some
+of them, producing a set of .Ep files, and then generate .o files from the .Ep
+files.
+
+A typical console output invocation in the EC codebase looks as follows:
+
+ cprintf(<chan>, <format string>, var args);
+
+This script is capable of processing invocations of up to 8 arguments (in fact
+in the Cr50 codebase at the time of introduction maximum number is 5). All
+generated .E files are given to this script as inputs. For each input file the
+script scans input lines and passes as is into the output lines which are not
+console output generating code.
+
+The format strings are extracted from console output generating lines and put
+into a dictionary, each string being the key, and the value an integer, the
+ever growing string index.
+
+After format strings are extracted and placed in the dictionary input lines
+are replaced with
+
+cmsg<X>(<chan>, <string index>, <param type map>, (uintptr_t)(param1), \
+      (uintptr_t)(param2)...)
+
+- X in cmsgX matches the number of parameters,
+- <chan> is passed as is,
+- <string index> is an integer to which the format string maps in the dictionary
+- <param type map> is a 32 bit integer, consisting of up to 8 fields, each
+           field describing the type of the parameter as it was specified in
+           the format string.
+
+Format specifications are interpreted as follows:
+
+%[Xcdux] - integer, 4 bytes
+%ll. - long, 8 bytes
+%s - string
+%pP - pointer, 4 bytes
+%ph - hex buf, 4 byte pointer to hex buf structure
+%pT - timestamp, pointer to an 8 bytes value
+
+each of the above result in a different param type 4 bit nibble included in
+the <param type map>. A case when __func__ is used as a string parameter
+results in a unique parameter type, the function name is included in the
+dictionary, and just the index is passed as a cmsgX() parameter.
+
+When the original format string includes a 64 bit argument, the generated code
+creates a local 64 bit variable, assigns the argument to the local variable
+and passes the address of the local variable to the cmsgX invocation. This
+allows all cmsgX parameters to be of uintptr_t size.
+
+In cases when a string argument is __func__ this script saves the function
+name in the dictionary and converts the parameter to the index, which is
+interpreted by the terminal to display the function name. A heuristic is used
+to determine the current function name: function starts with a line starting
+in the first column, containing an opening paren and not containing a
+semicolon. The word preceding the opening bracket is extracted and considered
+the current function name. Function scope ends with a } in the first column on
+its own line. This is not a very robust mechanism, but it seems to be adequate
+for this purpose.
+
+After all input files are processed, the format strings dictionary is
+converted into a list of strings such that each string is in the list at the
+location determined by the <string index>. This provides a means of mapping
+string indices into original format strings.
+
+The list is serialized, compressed and saved in a file as a flat blob. This
+file is used by the terminal program which interprets messages generated by
+the cmsgX() invocations and recreates the original string output on the
+terminal.
+
+There is a problem this script presents to a GNU make file: it requires
+multiple inputs to generate multiple outputs, but inputs are processed in a
+batch, only one script invocation is required.
+
+GNU Make does not seem to provide for such a concept, so this script is
+solving this problem as follows:
+
+When starting processing, obtain a file lock, and then verify that at least
+one of input files is newer than the previously generated strings blob, if it
+exists. If this is the case - process all input files, generate the blob and
+release the lock. If the blob exists and is newer - just release the lock and
+exit.
+
+As a result, when make is running this script is invoked as many times as
+there are .E files, but only one invocation results in processing, the rest
+just check that the newer blob is already there and exit.
+"""
+
+import argparse
+import fcntl
+import os
+import pickle
+import re
+import sys
+import zlib
+
+PRINTF_RX = r'[ \t]+cp(rint[fs]|uts)\('  # White space, cprintf|cprints|cputs, (.
+PTINTLINE = re.compile(PRINTF_RX)  # Used to spot print statement lines.
+HASHLINE = re.compile(r'^# [0-9]+ ')  # Line starting with '# <number> '.
+TRAILING_SPACE = re.compile(r'\)[ \t]+;[ \t]*$')  # ')' and ';' set apart.
+HEADER = re.compile(r'(%s[a-zA-Z_0-9]+),[ ]*' % PRINTF_RX)  # Up to 1st comma.
+EOS = re.compile(r'[^\\\\]"[,\)]')  # '"' not after '\', before ',' or ')'.
+FUNCTION = re.compile(r'^[a-zA-Z_][a-zA-Z0-9_ \*]+\([^;]+$')  # '(' and no ';'.
+INT_PARAM = re.compile(r'^[0-9.\-]*([l]{0,2}|z)[Xcdux]')  # Group 1: '', l, ll, z.
+STR_PARAM = re.compile(r'^[0-9.\-]*s')  # 's' after optional width/precision.
+PTR_PARAM = re.compile(r'^p[hPT]')  # 'ph', 'pP' or 'pT'.
+HEX_ESCAPE = re.compile(r'\\x[0-9a-fA-F]{2}')  # '\x' and two hex digits.
+# Indexed by format string or function name, value - the string index.
+FMT_DICT = {}
+
+# Parameter types, the 4 bit values placed into the cmsgX() parameter map.
+PARAM_INT = 1
+PARAM_LONG = 2
+PARAM_STRING = 3
+PARAM_PTR = 4
+PARAM_HEX_BUF = 5
+PARAM_FUNC_NAME = 6
+PARAM_TIMESTAMP = 7
+
+def tokenize(params):
+    """Split C argument string into arguments (tokens).
+
+    Arguments within C string are comma separated, potentially include quoted
+    strings and character literals, which in turn could include escaped
+    quotes, commas and brackets. There could also be brackets of any kind,
+    possibly nested.
+
+    Only commas found outside quoted literals and brackets are considered
+    token separators. Brackets and quote characters of the other kind found
+    inside a literal are plain text and do not affect the scanner state.
+
+    Args:
+        params: A string, arguments given to say a C printf() invocation.
+
+    Returns:
+        A list of strings, parameters retrieved from 'params' with white space
+        stripped. An input without separators yields a single element list.
+    """
+
+    tokens = []
+    token = ''
+    depth = 0  # Nesting level of (), [] and {} outside literals.
+    quote = ''  # Quote character of the literal being scanned, if any.
+    escaped = False  # True right after a backslash inside a literal.
+    for char in params:
+        if quote:
+            # Inside a literal only a backslash and the matching closing
+            # quote are special, an escaped character never ends the literal.
+            if escaped:
+                escaped = False
+            elif char == '\\':
+                escaped = True
+            elif char == quote:
+                quote = ''
+        elif char in '\'"':
+            quote = char
+        elif char in '{([':
+            depth += 1
+        elif char in '}])':
+            depth -= 1
+        elif char == ',' and depth == 0:
+            # A top level comma, the token is complete, the comma is dropped.
+            tokens.append(token.strip())
+            token = ''
+            continue
+        token += char
+    # Whatever follows the last separator is the last token.
+    tokens.append(token.strip())
+    return tokens
+
+def drop_escapes(fmt):
+    """Return passed in string with escaped characters replaced by literals.
+
+    The string is scanned once, left to right, so a character produced by one
+    replacement is never taken for a part of another escape sequence (say an
+    escaped backslash followed by 'n' stays a backslash and an 'n').
+
+    Args:
+        fmt: A string, the body of a C string literal.
+
+    Returns:
+        The string with the common single character escape sequences and the
+        two digit hex escape sequences replaced by the characters they denote.
+    """
+    literals = {"'": "'", '"': '"', '\\': '\\', 'b': '\b', 'a': '\a',
+                'f': '\f', 'n': '\n', 'r': '\r', 't': '\t', 'v': '\v'}
+    def unescape(match):
+        'Return the character denoted by the matched escape sequence.'
+        body = match.group(1)
+        if body[0] == 'x':
+            return chr(int(body[1:], 16))
+        return literals[body]
+
+    return re.sub(r'\\(x[0-9a-fA-F]{2}|[\'"\\bafnrtv])', unescape, fmt)
+
+def generate_cmsg_line(fmt, params, fmt_blocks,
+                       channel, current_function):
+    """Build the cmsgX() invocation replacing a console print statement.
+
+    The format string has its escape sequences dropped and is then mapped to
+    its index in the FMT_DICT dictionary, a new index is allocated if the
+    string has not been seen before. See details in the file docstring.
+
+    Each argument adds a 4 bit type descriptor to the parameters map, the
+    type is determined by the matching format specification: the INT_PARAM,
+    STR_PARAM and PTR_PARAM regular expressions are tried in this order. An
+    argument with a specification matching none of them is left out of the
+    generated invocation.
+
+    Args:
+        fmt: A string, the format string to be removed from the C source code.
+        params: A list of strings, the arguments of the fmt string from the
+               C source code, [''] if there are no arguments.
+        fmt_blocks: A list of strings, the result of splitting the fmt
+              string at '%' characters, one element per format
+              specification.
+        channel: A string, name of the console channel the message is sent on.
+        current_function: A string, name of the C function this line is in,
+              added to FMT_DICT when a string argument is __func__.
+
+    Returns:
+        A string, the generated C code terminated by a newline: the cmsgX()
+        invocation, enclosed in a {} block with local variable definitions
+        if any of the arguments is a 64 bit integer.
+    """
+    # Parameter types of the supported '%p<letter>' specifications, indexed
+    # by the letter.
+    pointer_types = {
+        'P': PARAM_PTR,
+        'T': PARAM_TIMESTAMP,
+        'h': PARAM_HEX_BUF
+    }
+
+    def string_index(text):
+        'Return the index of text in FMT_DICT, add text if it is not there.'
+        return FMT_DICT.setdefault(text, len(FMT_DICT))
+
+    fmt_index = string_index(drop_escapes(fmt))
+
+    # X in cmsgX is the number of arguments following the format string.
+    arg_count = 0 if params == [''] else len(params)
+    code = 'cmsg%d(%s, %d' % (arg_count, channel, fmt_index)
+    if not arg_count:
+        return code + ');\n'
+
+    # The parameters map is known only after all arguments are examined, put
+    # a placeholder in its place for now.
+    code += ', PARAM^MAP'
+
+    # All cmsgX() arguments are cast to (uintptr_t), so a 64 bit value can
+    # not be passed directly. Each 64 bit argument is assigned to a local
+    # variable defined in front of the invocation, and the address of the
+    # variable is passed instead. This list keeps the 'name = value'
+    # initializers of these variables.
+    long_vars = []
+    type_map = 0
+    for slot, (spec, arg) in enumerate(zip(fmt_blocks, params)):
+        # The type of each argument takes 4 bits of the map, the first
+        # argument occupies the lowest bits.
+        shift = 4 * slot
+        int_spec = INT_PARAM.match(spec)
+        if int_spec and int_spec.group(1) == 'll':
+            # A 64 bit integer, passed by address.
+            name = 'll%d' % len(long_vars)
+            long_vars.append('%s = %s' % (name, arg))
+            type_map |= PARAM_LONG << shift
+            code += ', (uintptr_t)&%s' % name
+        elif int_spec:
+            # Any other integer, passed by value.
+            type_map |= PARAM_INT << shift
+            code += ', (uintptr_t)(%s)' % arg
+        elif STR_PARAM.match(spec) and arg == '__func__':
+            # The function name goes into the dictionary, only its index is
+            # passed to cmsgX().
+            type_map |= PARAM_FUNC_NAME << shift
+            code += ', (uintptr_t)%d' % string_index(current_function)
+        elif STR_PARAM.match(spec):
+            # Any other string argument is passed as is.
+            type_map |= PARAM_STRING << shift
+            code += ', (uintptr_t)(%s)' % arg
+        elif PTR_PARAM.match(spec):
+            # The character following 'p' selects the parameter type.
+            type_map |= pointer_types[spec[1]] << shift
+            code += ', (uintptr_t)(%s)' % arg
+
+    # Now that all arguments are examined, fill in the parameters map.
+    code = code.replace('PARAM^MAP', '%d' % type_map) + ');'
+    if long_vars:
+        # Open a block, define the 64 bit variables, then close the block
+        # after the invocation.
+        definitions = ['\tlong long %s;\n' % init for init in long_vars]
+        code = '\n'.join(['{'] + definitions) + code + '\n}'
+
+    return code + '\n'
+
+def process_ccprintf(line, current_function):
+    """Generate cmsgX line based on source cprint[fs]/cputs line.
+
+    Split the input line into elements, and invoke the function to convert the
+    elements into the matching cmsgX() invocation.
+
+    Args:
+        line: A string, the source line to convert, no trailing white space.
+        current_function: A string, name of the C function 'line' comes from.
+
+    Returns:
+        The generated cmsgX() line, or the input line with a newline appended
+        (the callers pass stripped lines) if it could not be converted.
+    """
+    # Let's split the string into components.
+    header = HEADER.search(line)
+    if not header:
+        return line + '\n' # Must be not a valid ccprintf() invocation.
+
+    # If the line has some text before the function name, say 'return
+    # ccprintf...' save it in preamble, otherwise use a single space.
+    start_ofs = header.span()[0]
+    if start_ofs != 0:
+        preamble = line[:start_ofs] + ' '
+    else:
+        preamble = ' '
+
+    # Not strictly necessary, but makes the output look neater, remove spaces
+    # between the closing paren and the semicolon and after it.
+    line = TRAILING_SPACE.sub(');', line)
+
+    # Retrieve the channel name, guaranteed first argument.
+    channel = header.group(1).split('(')[1]
+
+    # Drop the 'cprintf(<channel>, ' header.
+    trailer = HEADER.sub('', line[start_ofs:])
+
+    # Find the end of the quoted format string.
+    quoted = EOS.search(trailer)
+    if not quoted:
+        if header.group(2) != 'uts':
+            sys.stderr.write('Unprocessed string: "%s"\n' % line)
+            return line + '\n'
+        # This is a cputs() invocation with indirect string. Let's fix it by
+        # converting (param) into ("%s", param).
+        fmt = '%s'
+        params = [trailer.rstrip(';')[:-1],] # Strip ')[;]'.
+    else:
+        # Extract the fmt string, eliminate possible concatenations and drop
+        # starting and trailing double quotes.
+        fmt = trailer[:quoted.span()[0]] + quoted.group(0)
+        fmt = fmt.replace('" "', '')
+        fmt = fmt[1:-2]
+        if header.group(2) == 'rints':
+            fmt = '[^T' + fmt # terminal will add "]\n"
+
+        # Now get the list of parameters coming after the fmt string.
+        params = tokenize(trailer[quoted.span()[1]:].rstrip(';')[:-1])
+
+    # Eliminate the %% cases and do not include the first element, it is
+    # irrelevant.
+    fmt_blocks = fmt.replace('%%', '').split('%')[1:]
+
+    if len(fmt_blocks) != len(params):
+        if fmt_blocks or params[0] != '':
+            sys.stderr.write('Unprocessed string: "%s"\n' % line)
+            print(trailer[quoted.span()[1]:].rstrip(';')[:-1])
+            return line + '\n'
+
+    if len(params) > 8:
+        sys.stderr.write('Too many parameters: "%s"\n' % line)
+        return line + '\n'
+
+    return preamble + generate_cmsg_line(fmt, params, fmt_blocks,
+                              channel, current_function)
+
+class LineProcessor(object):
+    """Convert preprocessor output lines into complete source statements.
+
+    A single C statement often spans several preprocessor output lines, for
+    instance when there are macros among the arguments of a function call.
+
+    A print statement can be examined and converted into a cmsgX() invocation
+    only once all its pieces are put together. Objects of this class collect
+    the pieces as they come, and also keep track of the C function the lines
+    being processed belong to.
+
+    Attributes:
+        partial_line: A string, concatenated preprocessor output lines of a
+           print statement which is not yet complete, empty if no statement
+           is being collected.
+        current_function: A string, name of the function current lines belong
+           to, None outside of function scope.
+    """
+
+    def __init__(self):
+        self.current_function = None
+        self.partial_line = ''
+
+    def _track_function(self, line):
+        """Update current_function if line starts or ends a function."""
+        if FUNCTION.match(line):
+            # Looks like the first line of a function definition. The name is
+            # taken from the last word of the line which includes a '('.
+            for word in line.split():
+                name, paren, _ = word.partition('(')
+                if paren:
+                    self.current_function = name
+        elif line.startswith('}'):
+            # A '}' in the first column - not in a function scope any more.
+            self.current_function = None
+
+    def _accumulate(self, line):
+        """Add line to the print statement being collected.
+
+        Args:
+            line: A string, a preprocessor output line.
+
+        Returns:
+            The result of converting the statement if line contains a ';',
+            which is considered the end of the statement, None otherwise.
+        """
+        if HASHLINE.search(line):
+            # Drop preprocessor directives from within multiline C strings.
+            return None
+        self.partial_line += line.rstrip()
+        if ';' in line:
+            statement, self.partial_line = self.partial_line.rstrip(), ''
+            return process_ccprintf(statement, self.current_function)
+        return None
+
+    def process_preprocessor_line(self, line):
+        """Process a preprocessor output line.
+
+        The line is first checked for being the start or the end of a C
+        function, to keep the current function name up to date. Then:
+
+        - if a print statement (cprintf/cprints/cputs) is being collected,
+          the line is added to it, and the statement is converted once it
+          is complete;
+
+        - a line which does not contain a print statement, or starts with
+          'int' (a print function prototype or definition) is left alone;
+
+        - a line with a print statement is converted right away if it
+          includes ');', otherwise it starts a new collected statement.
+
+        Args:
+            line: A string, a preprocessor output line.
+
+        Returns:
+            The input line if further processing is not required, or None, if
+            input line is not yet a full source code line, or the result of
+            converting a print statement with process_ccprintf().
+        """
+        self._track_function(line)
+        if self.partial_line:
+            return self._accumulate(line)
+        if not PTINTLINE.search(line) or line.startswith('int'):
+            return line
+        stripped = line.rstrip()
+        if ');' in line:
+            return process_ccprintf(stripped, self.current_function)
+        self.partial_line = stripped
+        return None
+
+def preobj_process(name, ext):
+    """Convert print statements of a C preprocessor output file.
+
+    The lines of the input file are run through a LineProcessor, and whatever
+    it returns is saved in a new file, placed next to the input file and
+    named the same, but with the extension replaced.
+
+    Args:
+        name: A string, name of the preprocessor output file to process.
+        ext: A string, the extension to use for the generated file.
+    """
+    target = os.path.splitext(name)[0] + '.' + ext
+    convert = LineProcessor().process_preprocessor_line
+    with open(name, 'r') as source, open(target, 'w') as sink:
+        for raw_line in source:
+            converted = convert(raw_line)
+            if converted:
+                sink.write(converted)
+
+def parse_args(argv):
+    """Parse the command line, return (known flags, unrecognized arguments)."""
+    script = os.path.basename(argv[0])
+    parser = argparse.ArgumentParser(prog=script)
+    for names, default in ((('-o', '--output'), None),
+                           (('-e', '--ext'), 'Ep'),
+                           (('-l', '--lockfile'), '/tmp/%s.lockfile' % script)):
+        parser.add_argument(*names, default=default)
+    return parser.parse_known_args(argv)
+
+def generate_blob():
+    """Pack the strings collected in FMT_DICT into a compressed blob.
+
+    The strings are lined up in the order of their string indices and joined
+    into a single string, a NUL character serving as a separator. The result
+    is pickled and then compressed with zlib at the highest level.
+
+    The sizes of the blob before and after compression are reported on the
+    standard output.
+
+    Returns:
+        A bytes object, the compressed blob.
+    """
+    # String indices are allocated sequentially starting from zero, sorting
+    # the strings by index puts each of them at the position of its index.
+    ordered = sorted(FMT_DICT, key=FMT_DICT.get)
+    pickled = pickle.dumps('\0'.join(ordered))
+    blob = zlib.compress(pickled, 9)
+    print('dump size %d, compressed size %d' % (len(pickled), len(blob)))
+    return blob
+
+def main(argv):
+    """Main function.
+
+    To facilitate the use of this script in GNU Makefile, support multiple
+    concurrent invocations such that only one invocation does the processing.
+
+    First lock the lock file, then check if the blob to be generated is older
+    than any of the input files. If not - this means a different instance of
+    this script has already completed processing, just exit. If no blob file
+    name is given with --output, the input files are always processed and
+    the blob is not saved.
+
+    Args:
+        argv: A list of strings, the command line, the script name first.
+    """
+    flags, files = parse_args(argv)
+    inputs = files[1:]  # files[0] is the script name.
+
+    with open(flags.lockfile, 'w') as lfd:
+        fcntl.flock(lfd, fcntl.LOCK_EX)
+        try:
+            # os.path.exists() does not accept None, check the name first.
+            if flags.output and os.path.exists(flags.output):
+                outp_stamp = os.stat(flags.output).st_mtime_ns
+                if all(os.stat(e_file).st_mtime_ns < outp_stamp
+                       for e_file in inputs):
+                    # Output file is newer than all inputs.
+                    return
+
+            for e_file in inputs:
+                preobj_process(e_file, flags.ext)
+
+            zipped = generate_blob()
+            if flags.output:
+                with open(flags.output, 'wb') as outf:
+                    outf.write(zipped)
+        finally:
+            fcntl.flock(lfd, fcntl.LOCK_UN)
+
+if __name__ == '__main__':  # Do the work only when run as a script.
+    main(sys.argv)