#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Copyright 2020 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Process preprocessor output to move strings out of the image.

This script is customized to be used in Chrome OS EC codebase, where functions
cprintf(), cprints(), and cputs() are used to generate console output.

Using this script requires a change in build process: instead of producing .o
files from .c files, the process includes two extra steps: generate .E files
instead of .o files, then process all .E files by this script, modifying some
of them, producing a set of .Ep files, and then generate .o files from the .Ep
files.

A typical console output invocation in the EC codebase looks as follows:

  cprintf(<channel>, <format string>, var args);

This script is capable of processing invocations of up to 8 arguments (in fact
in the Cr50 codebase at the time of introduction maximum number is 5).

All generated .E files are given to this script as inputs. For each input file
the script scans input lines and passes as is into the output lines which are
not console output generating code.

The format strings are extracted from console output generating lines and put
into a dictionary, each line being the key, and the value an integer, the ever
growing line number.

After format strings are extracted and placed in the dictionary input lines
are replaced with

  cmsgX(<channel>, <str_index>, <param_map>, (uintptr_t)(param1),
        (uintptr_t)(param2)...)

- X in cmsgX matches the number of parameters,
- <channel> is passed as is,
- <str_index> is an integer to which the line number maps in the dictionary
- <param_map> is a 32 bit integer, consisting of up to 8 fields, each field
  describing the type of the parameter as it was specified in the format
  string.

Format specifications are interpreted as follows:

  %[Xcdux]    - integer, 4 bytes
  %ll[Xcdux]  - long, 8 bytes
  %s          - string
  %pP         - pointer, 4 bytes
  %ph         - hex buf, 4 byte pointer to hex buf structure
  %pT         - pointer to an 8 bytes value

each of the above result in a different param type 4 bit nibble included in
the <param_map>. A case when __func__ is used as a string parameter results in
a unique parameter type, the function name is included in the dictionary, and
just the index is passed as a cmsgX() parameter.

When the original format string includes a 64 bit argument, the generated code
creates a local 64 bit variable, assigns the argument to the local variable
and passes the address of the local variable to the cmsgX invocation. This
allows all cmsgX parameters to be of uintptr_t size.

In cases when a string argument is __func__ this script saves the function
name in the dictionary and converts the parameter to the index, which is
interpreted by the terminal to display the function name.

A heuristic is used to determine the current function name: function starts
with a line starting in the first column, containing an opening paren and not
containing a semicolon. The word preceding the opening bracket is extracted
and considered the current function name. Function scope ends with a } in the
first column on its own line. This is not a very robust mechanism, but it
seems to be adequate for this purpose.

After all input files are processed, the format strings dictionary is
converted into a list of strings such that each string is in the list at the
location determined by the <str_index>. This provides a means of mapping
string indices into original format strings.

The list is serialized, compressed and saved in a file as a flat blob. This
file is used by the terminal program which interprets messages generated by
the cmsgX() invocations and recreates the original string output on the
terminal.

There is a problem this script presents to a GNU make file: it requires
multiple inputs to generate multiple outputs, but inputs are processed in a
batch, only one script invocation is required. GNU Make does not seem to
provide for such a concept, so this script is solving this problem as follows:

When starting processing, obtain a file lock, and then verify that at least
one of input files is newer than the previously generated strings blob, if it
exists. If this is the case - process all input files, generate the blob and
release the lock. If the blob exists and is newer - just release the lock and
exit.

As a result, when make is running this script is invoked as many times as
there are .E files, but only one invocation results in processing, the rest
just check that the newer blob is already there and exit.
"""

import argparse
import fcntl
import os
import pickle
import re
import sys
import zlib

PRINTF_RX = r'[ \t]+cp(rint[fs]|uts)\('
PTINTLINE = re.compile(PRINTF_RX)
HASHLINE = re.compile(r'^# [0-9]+ ')
TRAILING_SPACE = re.compile(r'\)[ \t]+;[ \t]*$')
HEADER = re.compile(r'(%s[a-zA-Z_0-9]+),[ ]*' % PRINTF_RX)
EOS = re.compile(r'[^\\\\]"[,\)]')
FUNCTION = re.compile(r'^[a-zA-Z_][a-zA-Z0-9_ \*]+\([^;]+$')
INT_PARAM = re.compile(r'^[0-9.\-]*([l]{0,2}|z)[Xcdux]')
STR_PARAM = re.compile(r'^[0-9.\-]*s')
PTR_PARAM = re.compile(r'^p[hPT]')
HEX_ESCAPE = re.compile(r'\\x[0-9a-fA-F]{2}')

# Indexed by string, value - the string enum.
FMT_DICT = {}

# Parameter types
PARAM_INT = 1
PARAM_LONG = 2
PARAM_STRING = 3
PARAM_PTR = 4
PARAM_HEX_BUF = 5
PARAM_FUNC_NAME = 6
PARAM_TIMESTAMP = 7

# One backslash escape: either a two digit hex escape or a backslash followed
# by any single character.
_ESCAPE_SEQ = re.compile(r'\\(x[0-9a-fA-F]{2}|.)', re.DOTALL)

# Single character escapes which are replaced by literals, keyed by the
# character following the backslash.
_SIMPLE_ESCAPES = {
    "'": "'",
    '"': '"',
    '\\': '\\',
    'b': '\b',
    'a': '\a',
    'f': '\f',
    'n': '\n',
    'r': '\r',
    't': '\t',
    'v': '\v',
}

# Maps the letter following '%p' into the parameter type.
_PTR_TYPES = {
    'P': PARAM_PTR,
    'T': PARAM_TIMESTAMP,
    'h': PARAM_HEX_BUF,
}


def tokenize(params):
    """Split C argument string into arguments (tokens).

    Arguments within C string are comma separated, potentially include quoted
    strings, which in turn could include escaped quotes and commas. There
    could also be parens, possibly nested. Only commas found outside quoted
    strings and parens should be considered token separators.

    Brackets, commas and the "other" kind of quote found inside a string or
    character literal are plain text and do not affect the splitting. Inside
    a literal a backslash escapes the character following it, so an escaped
    backslash right before the closing quote is handled properly.

    Args:
      params: A string, arguments given to say a C printf() invocation.

    Returns:
      A list of strings, parameters retrieved from 'params' with white space
      stripped.
    """
    if ',' not in params:
        return [params.strip(),]

    tokens = []
    token = []
    depth = 0       # Nesting level of (), [] and {} outside literals.
    quote = None    # The quote character of the literal being scanned.
    escaped = False  # True if previous character in a literal was an
                     # unescaped backslash.

    for char in params:
        if quote:
            # Inside a literal everything is text, just look for its end.
            token.append(char)
            if escaped:
                escaped = False
            elif char == '\\':
                escaped = True
            elif char == quote:
                quote = None
            continue

        if char == ',' and depth == 0:
            tokens.append(''.join(token).strip())
            token = []
            continue

        token.append(char)
        if char in '\'"':
            quote = char
        elif char in '{([':
            depth += 1
        elif char in '}])':
            depth -= 1

    tokens.append(''.join(token).strip())
    return tokens


def _unescape(match):
    """Return the literal matching one escape sequence found by _ESCAPE_SEQ.

    Unknown escape sequences are returned unchanged.
    """
    body = match.group(1)
    if len(body) == 3:
        # This is the xHH case, the only one longer than a single character.
        return chr(int(body[1:], 16))
    return _SIMPLE_ESCAPES.get(body, match.group(0))


def drop_escapes(fmt):
    """Return passed in string with escaped characters replaced by literals.

    Common single character escapes and hex values expressed as \\xCC are
    replaced, other escape sequences are left as is.

    The string is processed in a single left to right pass, this guarantees
    that results of a replacement are never interpreted as a part of another
    escape sequence (say, an escaped backslash followed by 'n' is a backslash
    and an 'n', not a new line).

    Args:
      fmt: A string, the C format string as it appears in the source code.

    Returns:
      The string with escape sequences replaced.
    """
    # A function is used as the replacement so that the generated character
    # (which could be a backslash) is never treated as a regex template.
    return _ESCAPE_SEQ.sub(_unescape, fmt)


def generate_cmsg_line(fmt, params, fmt_blocks, channel, current_function):
    """Given artifacts of a C line vararg line generate a cmsgX invocation line.

    See details in the file docstring.

    Args:
      fmt: A string, the format string to be removed from the C source code.
      params: A list of strings, the arguments of the fmt string from the C
              source code.
      fmt_blocks: A list of strings, the result of splitting the fmt string at
              '%' characters. Could be obtained locally, but is available from
              the caller.
      channel: A string, name of the console channel the message is sent on.
      current_function: A string, name of the C function this line is in.

    Returns:
      The cmsgX() string to replace the current source code string. Also adds
      the format string into the dictionary, if it is not yet there.
    """
    fmt = drop_escapes(fmt)

    # Either find the string among the previously encountered ones, or add it
    # to the dictionary.
    str_index = FMT_DICT.setdefault(fmt, len(FMT_DICT))

    # Now build a cmsgX invocation.
    cmsg_index = 0 if params == [''] else len(params)
    header = 'cmsg%d(%s, %d' % (cmsg_index, channel, str_index)
    if cmsg_index == 0:
        return header + ');\n'

    # Since all parameters passed to cmsgX() are cast to (uintptr_t), uint64_t
    # values could not be passed directly. If an int 64 parameter is
    # encountered, a local variable is created to store the value, and the
    # address of the variable is passed as the parameter to cmsgX().
    #
    # The doubles list below keeps track of these occurrences.
    doubles = []
    param_map = 0
    args = []

    # Each parameter is described by a 4 bit field in the parameter map,
    # the first parameter in the least significant bits.
    for count, (block, param) in enumerate(zip(fmt_blocks, params)):
        position = 4 * count

        match = INT_PARAM.match(block)
        if match:
            if match.group(1) == 'll':
                param_map += PARAM_LONG << position
                var_name = 'll%d' % len(doubles)
                doubles.append('%s = %s' % (var_name, param))
                args.append('(uintptr_t)&%s' % var_name)
            else:
                param_map += PARAM_INT << position
                args.append('(uintptr_t)(%s)' % param)
        elif STR_PARAM.match(block):
            if param == '__func__':
                param_map += PARAM_FUNC_NAME << position
                func_index = FMT_DICT.setdefault(current_function,
                                                 len(FMT_DICT))
                args.append('(uintptr_t)%d' % func_index)
            else:
                param_map += PARAM_STRING << position
                args.append('(uintptr_t)(%s)' % param)
        elif PTR_PARAM.match(block):
            param_map += _PTR_TYPES[block[1]] << position
            args.append('(uintptr_t)(%s)' % param)
        # A block of an unrecognized format is skipped: no argument is
        # generated for it and its field in the parameter map stays zero.

    generated_code = ', '.join([header, '%d' % param_map] + args) + ');'

    if doubles:
        extra = ['{',]
        for double in doubles:
            extra.append('\tlong long %s;\n' % double)
        generated_code = '\n'.join(extra) + generated_code + '\n}'

    return generated_code + '\n'


def process_ccprintf(line, current_function):
    """Generate cmsgX line based on source cprint[fs] line.

    Split the input line into elements, and invoke the function to convert the
    elements into the matching cmsgX() invocation.

    Args:
      line: A string, the source line to convert.
      current_function: A string, the name of the C function 'line' comes
              from.

    Returns:
      The generated cmsgX() line, or the unmodified input line in case it
      could not be converted.
    """
    # Let's split the string into components.
    header = HEADER.search(line)
    if not header:
        return line  # Must be not a valid ccprintf() invocation.

    # If the line has some text before the function name, say 'return
    # ccprintf...' save the text in preamble. If not - just set preamble to a
    # single space.
    start_ofs = header.span()[0]
    if start_ofs != 0:
        preamble = line[:start_ofs] + ' '
    else:
        preamble = ' '

    # Not strictly necessary, but makes the output look neater, remove spaces
    # after closing paren til newline.
    line = TRAILING_SPACE.sub(');', line)

    # Retrieve the channel name, guaranteed first argument.
    channel = header.group(1).split('(')[1]

    # Drop the 'cprintf(<channel>, ' header.
    trailer = HEADER.sub('', line[start_ofs:])

    # Find the end of the quoted format string.
    quoted = EOS.search(trailer)

    if not quoted:
        if header.group(2) != 'uts':
            sys.stderr.write('Unprocessed string: "%s"\n' % line)
            return line

        # This is a cputs() invocation with indirect string. Let's fix it by
        # converting (param) into ("%s", param).
        fmt = '%s'
        raw_params = trailer.rstrip(';')[:-1]  # Strip ')[;]'.
        params = [raw_params,]
    else:
        # Extract the fmt string, eliminate possible concatenations and drop
        # starting and trailing double quotes.
        fmt = trailer[:quoted.span()[0]] + quoted.group(0)
        fmt = fmt.replace('" "', '')
        fmt = fmt[1:-2]
        if header.group(2) == 'rints':
            fmt = '[^T' + fmt  # terminal will add "]\n"

        # Now get the list of parameters coming after the fmt string.
        raw_params = trailer[quoted.span()[1]:].rstrip(';')[:-1]
        params = tokenize(raw_params)

    # Eliminate the %% cases and do not include the first element, it is
    # irrelevant.
    fmt_blocks = fmt.replace('%%', '').split('%')[1:]

    if len(fmt_blocks) != len(params):
        # The only acceptable mismatch is no format specifications and a
        # single empty parameter, i.e. a print statement with no arguments.
        if fmt_blocks or params[0] != '':
            sys.stderr.write('Unprocessed string: "%s"\n' % line)
            print(raw_params)
            return line

    if len(params) > 8:
        sys.stderr.write('Too many parameters: "%s"\n' % line)
        return line

    return preamble + generate_cmsg_line(fmt, params, fmt_blocks,
                                         channel, current_function)


class LineProcessor:
    """Process multiline source code strings.

    The preprocessor output often generates C source code lines split in
    multiple preprocessor output lines, in case there are macros in the
    command line arguments, etc.

    Before the line could be examined to be a printf() like invocation, and if
    so converted into a cmsgX() line, the multiline preprocessor output needs
    to be converted into a single code line. This class allows to keep track
    of multiple preprocessor output lines.

    Attributes:
      partial_line: A string, concatenated preprocessor output lines
              representing a single printf() like invocation statement.
      current_function: A string, name of the function current lines belong
              to.
    """

    def __init__(self):
        self.partial_line = ''
        self.current_function = None

    def process_preprocessor_line(self, line):
        """Process a preprocessor output line.

        Examine the preprocessor output line to see if it falls into one of
        three categories:

        - A first function declaration line. Save the function name for future
          reference.

        - A closing '}' of a function - just drop the previously saved
          function name.

        - A line containing source code print statement
          (cprintf/cprints/cputs). If the complete line is present - call the
          parser to convert it into a cmsgX() invocation. If the preprocessor
          line is an incomplete C source line - keep processing preprocessor
          lines until full source line is received and then call the parser.

        Args:
          line: A string, a preprocessor output line.

        Returns:
          The input line if further processing is not required, or
          None, if input line is not yet a full source code line, or
          the cmsgX() invocation line, the result of converting a print
          statement line.
        """
        if FUNCTION.match(line):
            # If this line looks like a first function definition line -
            # retrieve the function name. In case several words include an
            # opening paren, the last one of them wins.
            for candidate in line.split():
                if '(' in candidate:
                    self.current_function = candidate.split('(')[0]
        elif line.startswith('}'):
            # If this is a '}' in the first column - we are not in a function
            # scope any more.
            self.current_function = None

        if self.partial_line:
            if HASHLINE.search(line):
                # Drop preprocessor directives from within multiline C
                # strings.
                return None

            self.partial_line += line.rstrip()
            if ';' not in line:
                # Still not a complete line, nothing to process yet.
                return None

            # Got a complete C source line, process it.
            line = self.partial_line.rstrip()
            self.partial_line = ''
            return process_ccprintf(line, self.current_function)

        if not PTINTLINE.search(line):
            # If not a print statement - no need to worry, just pass it to
            # the output as is.
            return line

        if line.startswith('int'):
            # This is either the function prototype or the first line of the
            # function definition, no need to convert either.
            return line

        if ');' not in line:
            # This is a line with a print statement, but it is incomplete,
            # start accumulating preprocessor lines.
            self.partial_line = line.rstrip()
            return None

        # Process a full print statement line.
        return process_ccprintf(line.rstrip(), self.current_function)


def preobj_process(name, ext):
    """Process a C preprocessor output file.

    Given a preprocessor output file, generate a new file, with all print
    statements replaced with cmsgX() invocations.

    Args:
      name: A string, name of the preprocessor output file to process.
      ext: A string, the extension to use for the generated file.
    """
    line_processor = LineProcessor()
    output = os.path.splitext(name)[0] + '.' + ext
    with open(name, 'r') as in_file, open(output, 'w') as outf:
        for line in in_file:
            processed = line_processor.process_preprocessor_line(line)
            # None means the line is being accumulated or was dropped.
            if processed:
                outf.write(processed)


def parse_args(argv):
    """Prepare parser and parse command line arguments.

    Args:
      argv: A list of strings, the complete command line, including the
            program name in argv[0].

    Returns:
      A tuple of the parsed flags namespace and the list of unrecognized
      arguments. Since the whole of argv is parsed, the first element of the
      list is the program name, input file names follow it.
    """
    prog = os.path.basename(argv[0])
    parser = argparse.ArgumentParser(prog=prog)
    parser.add_argument('-o', '--output')
    parser.add_argument('-e', '--ext', default='Ep')
    parser.add_argument('-l', '--lockfile', default='/tmp/%s.lockfile' % prog)
    return parser.parse_known_args(argv)


def generate_blob():
    """Convert format strings dictionary into a blob.

    First convert the format string dictionary into a list of strings, placed
    in the list in locations matching their string index.

    Then serialize and compress the list.

    Returns:
      A byte array, the compressed list of format lines.
    """
    strings = [''] * len(FMT_DICT)
    for key, value in FMT_DICT.items():
        strings[value] = key

    dump = pickle.dumps('\0'.join(strings))
    zipped = zlib.compress(dump, 9)
    print('dump size %d, compressed size %d' % (len(dump), len(zipped)))
    return zipped


def seed_blob(outfile):
    """Read string data from a previously saved blob.

    This function is invoked only if the blob file exists. Strings found in
    the blob are added to the format strings dictionary, preserving their
    indices. A file which can not be decoded is reported and ignored.

    Args:
      outfile: A string, name of the blob file.

    Raises:
      AssertionError: the file name ends with 'Ep', i.e. it looks like one of
              the generated source files, not like a blob.
    """
    if outfile.endswith('Ep'):
        print('invoked with', ' '.join(sys.argv))
        # Raised explicitly, an assert statement is dropped when running
        # with -O.
        raise AssertionError('%s does not look like a blob file' % outfile)

    with open(outfile, 'rb') as blob:
        zipped = blob.read()

    try:
        # NOTE: unpickling is not safe on untrusted data, the blob is
        # expected to be generated by a previous run of this very script.
        dump = pickle.loads(zlib.decompress(zipped))
    except (zlib.error, pickle.UnpicklingError, EOFError, AttributeError,
            ImportError, IndexError):
        dump = None

    if not isinstance(dump, str):
        print('%s does not seem to be a proper blob, ignored' % outfile)
        return

    for index, string in enumerate(dump.split('\0')):
        FMT_DICT[string] = index


def main(argv):
    """Main function.

    To facilitate the use of this script in GNU Makefile, support multiple
    concurrent invocations such that only one invocation does the processing.

    First lock the lock file, then check if the blob to be generated is older
    than any of the input files. If not - this means a different instance of
    this script has already completed processing, just exit.

    Args:
      argv: A list of strings, the command line including the program name.
    """
    flags, files = parse_args(argv)
    e_files = files[1:]  # files[0] is the program name.

    with open(flags.lockfile, 'w') as lfd:
        fcntl.flock(lfd, fcntl.LOCK_EX)
        try:
            # The output file is optional, check for its presence only if it
            # was given in the command line.
            if flags.output and os.path.exists(flags.output):
                outp_stamp = os.stat(flags.output).st_mtime_ns
                for e_file in e_files:
                    if os.stat(e_file).st_mtime_ns >= outp_stamp:
                        break
                else:
                    # Output file is newer than all inputs.
                    return
                seed_blob(flags.output)

            for e_file in e_files:
                preobj_process(e_file, flags.ext)

            zipped = generate_blob()
            if flags.output:
                with open(flags.output, 'wb') as outf:
                    outf.write(zipped)
        finally:
            fcntl.flock(lfd, fcntl.LOCK_UN)


if __name__ == '__main__':
    main(sys.argv)