diff options
Diffstat (limited to 'util/util_precompile.py')
-rwxr-xr-x | util/util_precompile.py | 564 |
1 files changed, 564 insertions, 0 deletions
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Copyright 2020 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Process preprocessor output to move strings out of the image.

This script is customized to be used in Chrome OS EC codebase, where functions
cprintf(), cprints(), and cputs() are used to generate console output.

Using this script requires a change in build process: instead of producing .o
files from .c files, the process includes two extra steps: generate .E files
instead of .o files, then process all .E files by this script, modifying some
of them, producing a set of .Ep files, and then generate .o files from the .Ep
files.

A typical console output invocation in the EC codebase looks as follows:

  cprintf(<chan>, <format string>, var args);

This script is capable of processing invocations of up to 8 arguments (in fact
in the Cr50 codebase at the time of introduction maximum number is 5). All
generated .E files are given to this script as inputs. For each input file the
script scans input lines and passes as is into the output the lines which are
not console output generating code.

The format strings are extracted from console output generating lines and put
into a dictionary, each format string being the key, and the value an integer,
the ever growing string index.

After format strings are extracted and placed in the dictionary input lines
are replaced with

  cmsg<X>(<chan>, <string index>, <param type map>,
          (uintptr_t)(param1), (uintptr_t)(param2)...)

- X in cmsgX matches the number of parameters,
- <chan> is passed as is,
- <string index> is the integer the format string maps to in the dictionary
- <param type map> is a 32 bit integer, consisting of up to 8 fields, each
       field describing the type of the parameter as it was specified in
       the format string.

Format specifications are interpreted as follows:

%[Xcdux]   - integer, 4 bytes
%ll[Xcdux] - long long, 8 bytes
%s         - string
%pP        - pointer, 4 bytes
%ph        - hex buf, 4 byte pointer to hex buf structure
%pT        - timestamp, pointer to an 8 byte value

each of the above results in a different param type 4 bit nibble included in
the <param type map>. A case when __func__ is used as a string parameter
results in a unique parameter type, the function name is included in the
dictionary, and just the index is passed as a cmsgX() parameter.

When the original format string includes a 64 bit argument, the generated code
creates a local 64 bit variable, assigns the argument to the local variable
and passes the address of the local variable to the cmsgX invocation. This
allows all cmsgX parameters to be of uintptr_t size.

In cases when a string argument is __func__ this script saves the function
name in the dictionary and converts the parameter to the index, which is
interpreted by the terminal to display the function name. A heuristic is used
to determine the current function name: function starts with a line starting
in the first column, containing an opening paren and not containing a
semicolon. The word preceding the opening bracket is extracted and considered
the current function name. Function scope ends with a } in the first column on
its own line. This is not a very robust mechanism, but it seems to be adequate
for this purpose.

After all input files are processed, the format strings dictionary is
converted into a list of strings such that each string is in the list at the
location determined by the <string index>. This provides a means of mapping
string indices into original format strings.

The list is serialized, compressed and saved in a file as a flat blob. This
file is used by the terminal program which interprets messages generated by
the cmsgX() invocations and recreates the original string output on the
terminal.

There is a problem this script presents to a GNU make file: it requires
multiple inputs to generate multiple outputs, but inputs are processed in a
batch, only one script invocation is required.

GNU Make does not seem to provide for such a concept, so this script is
solving this problem as follows:

When starting processing, obtain a file lock, and then verify that at least
one of input files is newer than the previously generated strings blob, if it
exists. If this is the case - process all input files, generate the blob and
release the lock. If the blob exists and is newer - just release the lock and
exit.

As a result, when make is running this script is invoked as many times as
there are .E files, but only one invocation results in processing, the rest
just check that the newer blob is already there and exit.
"""

import argparse
import fcntl
import os
import pickle
import re
import sys
import zlib

PRINTF_RX = r'[ \t]+cp(rint[fs]|uts)\('
PTINTLINE = re.compile(PRINTF_RX)
HASHLINE = re.compile(r'^# [0-9]+ ')
TRAILING_SPACE = re.compile(r'\)[ \t]+;[ \t]*$')
HEADER = re.compile(r'(%s[a-zA-Z_0-9]+),[ ]*' % PRINTF_RX)
EOS = re.compile(r'[^\\\\]"[,\)]')
FUNCTION = re.compile(r'^[a-zA-Z_][a-zA-Z0-9_ \*]+\([^;]+$')
INT_PARAM = re.compile(r'^[0-9.\-]*([l]{0,2}|z)[Xcdux]')
STR_PARAM = re.compile(r'^[0-9.\-]*s')
PTR_PARAM = re.compile(r'^p[hPT]')
HEX_ESCAPE = re.compile(r'\\x[0-9a-fA-F]{2}')
# Indexed by string, value - the string enum.
FMT_DICT = {}

# Parameter types
PARAM_INT = 1
PARAM_LONG = 2
PARAM_STRING = 3
PARAM_PTR = 4
PARAM_HEX_BUF = 5
PARAM_FUNC_NAME = 6
PARAM_TIMESTAMP = 7

# Second character of a '%p<X>' conversion mapped to its parameter type.
_PTR_TYPES = {
    'P': PARAM_PTR,
    'T': PARAM_TIMESTAMP,
    'h': PARAM_HEX_BUF,
}

# C single character escape sequences and the characters they stand for.
_SIMPLE_ESCAPES = {
    "'": "'",
    '"': '"',
    '\\': '\\',
    'a': '\a',
    'b': '\b',
    'f': '\f',
    'n': '\n',
    'r': '\r',
    't': '\t',
    'v': '\v',
}

# Matches one complete escape sequence: either \xCC or a single character
# escape. Matching whole sequences in one left to right pass guarantees that
# the output of one replacement is never re-interpreted as another escape.
_ESCAPE_RX = re.compile(r'\\(x[0-9a-fA-F]{2}|[\'"\\abfnrtv])')


def tokenize(params):
    """Split C argument string into arguments (tokens).

    Arguments within C string are comma separated, potentially include quoted
    strings, which in turn could include escaped quotes and commas. There
    could also be parens, possibly nested.

    Only commas found outside quoted strings and parens are considered token
    separators. While inside a quoted string or character literal everything
    up to the matching unescaped closing quote is copied verbatim, so quotes
    of the other kind and brackets inside a literal do not affect the state.

    Args:
        params: A string, arguments given to say a C printf() invocation.

    Returns:
        A list of strings, parameters retrieved from 'params' with white
        space stripped.
    """
    if ',' not in params:
        return [params.strip()]

    tokens = []
    token = []
    parens_level = 0
    quote = ''       # The quote character of the literal we are in, if any.
    escaped = False  # True if the previous character was an active backslash.
    for char in params:
        if quote:
            token.append(char)
            if escaped:
                escaped = False
            elif char == '\\':
                escaped = True
            elif char == quote:
                quote = ''
            continue
        if char == ',' and parens_level == 0:
            tokens.append(''.join(token).strip())
            token = []
            continue
        token.append(char)
        if char in '\'"':
            quote = char
        elif char in '{([':
            parens_level += 1
        elif char in '}])':
            parens_level -= 1
    tokens.append(''.join(token).strip())
    return tokens


def _unescape(match):
    """Return the literal character for one matched C escape sequence."""
    seq = match.group(1)
    if seq.startswith('x'):
        return chr(int(seq[1:], 16))
    return _SIMPLE_ESCAPES[seq]


def drop_escapes(fmt):
    """Return passed in string with escaped characters replaced by literals.

    Single character escapes and two digit hex escapes (\\xCC) are decoded
    in a single pass. Unrecognized or incomplete sequences are left alone.

    Args:
        fmt: A string, the text of a C string literal without the
            surrounding double quotes.

    Returns:
        A string, 'fmt' with the supported escape sequences decoded.
    """
    return _ESCAPE_RX.sub(_unescape, fmt)


def generate_cmsg_line(fmt, params, fmt_blocks,
                       channel, current_function):
    """Given artifacts of a C vararg line generate a cmsgX invocation line.

    See details in the file docstring.

    Args:
        fmt: A string, the format string to be removed from the C source code.
        params: A list of strings, the arguments of the fmt string from the
                C source code.
        fmt_blocks: A list of strings, the result of splitting the fmt
                string at '%' characters. Could be obtained locally, but is
                available from the caller.
        channel: A string, name of the console channel the message is sent on.
        current_function: A string, name of the C function this line is in,
                or None if it is not known.

    Returns:
        The cmsgX() string to replace the current source code string.
        Also adds the format string into the dictionary, if it is not yet
        there.
    """
    # Since all parameters passed to cmsgX() are cast to (uintptr_t), uint64_t
    # values could not be passed directly. If an int 64 parameter is
    # encountered, a local variable is created to store the value, and the
    # address of the variable is passed as the parameter to cmsgX().
    #
    # The doubles list below keeps track of these occurrences.
    doubles = []
    param_map = 0

    fmt = drop_escapes(fmt)

    # Either find the string among the previously encountered ones, or add it
    # to the dictionary.
    str_index = FMT_DICT.setdefault(fmt, len(FMT_DICT))

    # Now build a cmsgX invocation.
    cmsg_index = 0 if params == [''] else len(params)
    if cmsg_index == 0:
        return 'cmsg%d(%s, %d);\n' % (cmsg_index, channel, str_index)

    args = []
    # Each parameter descriptor occupies its own 4 bit nibble of the map.
    for number, (block, param) in enumerate(zip(fmt_blocks, params)):
        position = number * 4
        match = INT_PARAM.match(block)
        if match:
            if match.group(1) == 'll':
                param_map += PARAM_LONG << position
                var_name = 'll%d' % len(doubles)
                doubles.append('%s = %s' % (var_name, param))
                args.append(', (uintptr_t)&%s' % var_name)
            else:
                param_map += PARAM_INT << position
                args.append(', (uintptr_t)(%s)' % param)
            continue
        if STR_PARAM.match(block):
            # Without a known function name fall through and pass __func__
            # as an ordinary string: storing None in the dictionary would
            # break blob generation.
            if param == '__func__' and current_function:
                param_map += PARAM_FUNC_NAME << position
                func_index = FMT_DICT.setdefault(current_function,
                                                 len(FMT_DICT))
                args.append(', (uintptr_t)%d' % func_index)
                continue
            param_map += PARAM_STRING << position
            args.append(', (uintptr_t)(%s)' % param)
            continue
        if PTR_PARAM.match(block):
            param_map += _PTR_TYPES[block[1]] << position
            args.append(', (uintptr_t)(%s)' % param)
            continue
        # The parameter is left out of the generated call, exactly as before,
        # but make the reason for the resulting C error visible.
        sys.stderr.write('Unsupported conversion "%%%s" in "%s"\n' %
                         (block, fmt))

    generated_code = 'cmsg%d(%s, %d, %d%s);' % (
        cmsg_index, channel, str_index, param_map, ''.join(args))
    if doubles:
        extra = ['{']
        for double in doubles:
            extra.append('\tlong long %s;\n' % double)
        generated_code = '\n'.join(extra) + generated_code + '\n}'

    return generated_code + '\n'


def process_ccprintf(line, current_function):
    """Generate cmsgX line based on source cprint[fs] line.

    Split the input line into elements, and invoke the function to convert the
    elements into the matching cmsgX() invocation.

    Args:
        line: A string, the source line to convert.
        current_function: A string, the name of the C function 'line' comes
            from, or None.

    Returns:
        The generated cmsgX() line, or the input line (possibly with the
        white space before the final semicolon removed) if it could not be
        converted.
    """
    # Let's split the string into components.
    header = HEADER.search(line)
    if not header:
        return line  # Must be not a valid ccprintf() invocation.

    # If the line has some text before the function name, say 'return
    # ccprintf...' save the text in preamble. If not - just set preamble to a
    # single space.
    start_ofs = header.span()[0]
    if start_ofs != 0:
        preamble = line[:start_ofs] + ' '
    else:
        preamble = ' '

    # Not strictly necessary, but makes the output look neater, remove spaces
    # after closing paren til newline.
    line = TRAILING_SPACE.sub(');', line)

    # Retrieve the channel name, guaranteed first argument.
    channel = header.group(1).split('(')[1]

    # Drop the 'cprintf(<channel>, ' header.
    trailer = HEADER.sub('', line[start_ofs:])

    # Find the end of the quoted format string.
    quoted = EOS.search(trailer)
    if not quoted:
        if header.group(2) != 'uts':
            sys.stderr.write('Unprocessed string: "%s"\n' % line)
            return line
        # This is a cputs() invocation with indirect string. Let's fix it by
        # converting (param) into ("%s", param).
        fmt = '%s'
        raw_params = trailer.rstrip(';')[:-1]  # Strip ')[;]'.
        params = [raw_params]
    else:
        # Extract the fmt string, eliminate possible concatenations and drop
        # starting and trailing double quotes.
        fmt = trailer[:quoted.span()[0]] + quoted.group(0)
        fmt = fmt.replace('" "', '')
        fmt = fmt[1:-2]
        if header.group(2) == 'rints':
            fmt = '[^T' + fmt  # terminal will add "]\n"

        # Now get the list of parameters coming after the fmt string.
        raw_params = trailer[quoted.span()[1]:].rstrip(';')[:-1]
        params = tokenize(raw_params)

    # Eliminate the %% cases and do not include the first element, it is
    # irrelevant.
    fmt_blocks = fmt.replace('%%', '').split('%')[1:]

    if len(fmt_blocks) != len(params):
        # A format string without conversions and with no arguments yields
        # no blocks and a single empty parameter, which is fine.
        if fmt_blocks or params[0] != '':
            sys.stderr.write('Unprocessed string: "%s"\n' % line)
            print(raw_params)
            return line

    if len(params) > 8:
        sys.stderr.write('Too many parameters: "%s"\n' % line)
        return line

    return preamble + generate_cmsg_line(fmt, params, fmt_blocks,
                                         channel, current_function)


class LineProcessor(object):
    """Process multiline source code strings.

    The preprocessor output often generates C source code lines split in
    multiple preprocessor output lines, in case there are macros in the
    command line arguments, etc.

    Before the line could be examined to be a printf() like invocation, and if
    so converted into a cmsgX() line, the multiline preprocessor output needs
    to be converted into a single code line.

    This class allows to keep track of multiple preprocessor output lines.

    Attributes:
        partial_line: A string, concatenated preprocessor output lines
            representing a single printf() like invocation statement.
        current_function: A string, name of the function current lines belong
            to.
    """

    def __init__(self):
        self.partial_line = ''
        self.current_function = None

    def _convert(self, line):
        """Convert a complete print statement, keeping it newline terminated.

        The statement is handed over with trailing white space stripped, so
        when it comes back unconverted the newline has to be restored or the
        statement would be glued to the next output line.
        """
        converted = process_ccprintf(line, self.current_function)
        if not converted.endswith('\n'):
            converted += '\n'
        return converted

    def process_preprocessor_line(self, line):
        """Process a preprocessor output line.

        Examine the preprocessor output line to see if it falls into one of
        three categories:

        - A first function declaration line. Save the function name for future
          reference.

        - A closing '}' of a function - just drop the previously saved
          function name.

        - A line containing source code print statement
          (cprintf/cprints/cputs). If the complete line is present - call the
          parser to convert it into a cmsgX() invocation. If the preprocessor
          line is an incomplete C source line - keep processing preprocessor
          lines until full source line is received and then call the parser.

        Args:
            line: A string, a preprocessor output line.

        Returns:
            The input line if further processing is not required, or None, if
            input line is not yet a full source code line, or the cmsgX()
            invocation line, the result of converting a print statement line.
        """
        if FUNCTION.match(line):
            # If this line looks like a first function definition line -
            # retrieve the function name. The last word containing an opening
            # paren wins.
            for candidate in line.split():
                if '(' in candidate:
                    self.current_function = candidate.split('(')[0]
        elif line.startswith('}'):
            # If this is a '}' in the first column - we are not in a function
            # scope any more.
            self.current_function = None

        if self.partial_line:
            if HASHLINE.search(line):
                # Drop preprocessor directives from within multiline C
                # strings.
                return None
            self.partial_line += line.rstrip()
            if ';' not in line:
                # Still not a complete line, nothing to process yet.
                return None
            # Got a complete C source line, process it.
            line = self.partial_line.rstrip()
            self.partial_line = ''
            return self._convert(line)

        if not PTINTLINE.search(line):
            # If not a print statement - no need to worry, just pass it to the
            # output as is.
            return line
        if line.startswith('int'):
            # This is either the function prototype or the first line of the
            # function definition, no need to convert either.
            return line
        if ');' not in line:
            # This is a line with a print statement, but it is incomplete,
            # start accumulating preprocessor lines.
            self.partial_line = line.rstrip()
            return None
        # Process a full print statement line.
        return self._convert(line.rstrip())


def preobj_process(name, ext):
    """Process a C preprocessor output file.

    Given a preprocessor output file, generate a new file, with all print
    statements replaced with cmsgX() invocations.

    Args:
        name: A string, name of the preprocessor output file to process.
        ext: A string, the extension to use for the generated file.
    """
    line_processor = LineProcessor()
    output = os.path.splitext(name)[0] + '.' + ext
    with open(name, 'r') as in_file, open(output, 'w') as outf:
        for line in in_file:
            processed = line_processor.process_preprocessor_line(line)
            if processed:
                outf.write(processed)


def parse_args(argv):
    """Prepare parser and parse command line arguments.

    Args:
        argv: A list of strings, the complete command line including the
            program name in argv[0].

    Returns:
        A tuple (flags, files): the parsed options namespace and the list of
        arguments which were not recognized as options. Since the complete
        'argv' is parsed, the first element of 'files' is the program name.
    """
    prog = os.path.basename(argv[0])
    parser = argparse.ArgumentParser(prog=prog)
    parser.add_argument('-o', '--output')
    parser.add_argument('-e', '--ext', default='Ep')
    parser.add_argument('-l', '--lockfile', default='/tmp/%s.lockfile' % prog)
    return parser.parse_known_args(argv)


def generate_blob():
    """Convert format strings dictionary into a blob.

    First convert the format string dictionary into a list of strings, placed
    in the list in locations matching their string index.

    Then serialize and compress the list.

    Returns:
        A byte array, the compressed list of format lines.
    """
    strings = [''] * len(FMT_DICT)
    for key, value in FMT_DICT.items():
        strings[value] = key

    dump = pickle.dumps('\0'.join(strings))
    zipped = zlib.compress(dump, 9)
    print('dump size %d, compressed size %d' % (len(dump), len(zipped)))
    return zipped


def main(argv):
    """Main function.

    To facilitate the use of this script in GNU Makefile, support multiple
    concurrent invocations such that only one invocation does the processing.

    First lock the lock file, then check if the blob to be generated is older
    than any of the input files. If not - this means a different instance of
    this script has already completed processing, just exit.

    Args:
        argv: A list of strings, the complete command line.
    """
    flags, files = parse_args(argv)
    e_files = files[1:]  # files[0] is the program name.

    with open(flags.lockfile, 'w') as lfd:
        fcntl.flock(lfd, fcntl.LOCK_EX)
        try:
            # The output is optional, there is nothing to compare time stamps
            # against when it was not requested.
            if flags.output and os.path.exists(flags.output):
                outp_stamp = os.stat(flags.output).st_mtime_ns
                if all(os.stat(e_file).st_mtime_ns < outp_stamp
                       for e_file in e_files):
                    # Output file is newer than all inputs.
                    return

            for e_file in e_files:
                preobj_process(e_file, flags.ext)

            zipped = generate_blob()
            if flags.output:
                with open(flags.output, 'wb') as outf:
                    outf.write(zipped)
        finally:
            fcntl.flock(lfd, fcntl.LOCK_UN)


if __name__ == '__main__':
    main(sys.argv)