diff options
Diffstat (limited to 'tools/scan-build-py/libscanbuild/intercept.py')
-rw-r--r-- | tools/scan-build-py/libscanbuild/intercept.py | 359 |
1 files changed, 359 insertions, 0 deletions
# -*- coding: utf-8 -*-
#                     The LLVM Compiler Infrastructure
#
# This file is distributed under the University of Illinois Open Source
# License. See LICENSE.TXT for details.
""" This module is responsible to capture the compiler invocation of any
build process. The result of that should be a compilation database.

This implementation is using the LD_PRELOAD or DYLD_INSERT_LIBRARIES
mechanisms provided by the dynamic linker. The related library is implemented
in C language and can be found under 'libear' directory.

The 'libear' library is capturing all child process creation and logging the
relevant information about it into separate files in a specified directory.
The parameter of this process is the output directory name, where the report
files shall be placed. This parameter is passed as an environment variable.

The module also implements compiler wrappers to intercept the compiler calls.

The module implements the build command execution and the post-processing of
the output files, which are condensed into a compilation database. """

import sys
import os
import os.path
import re
import itertools
import json
import glob
import argparse
import logging
import subprocess
from libear import build_libear, TemporaryDirectory
from libscanbuild import duplicate_check, tempdir, initialize_logging
from libscanbuild import command_entry_point
from libscanbuild.command import Action, classify_parameters
from libscanbuild.shell import encode, decode

__all__ = ['capture', 'intercept_build_main', 'intercept_build_wrapper']

# Separators used inside the exec report files: GS terminates one report,
# RS separates the fields of a report, US terminates each command argument.
GS = chr(0x1d)
RS = chr(0x1e)
US = chr(0x1f)

COMPILER_WRAPPER_CC = 'intercept-cc'
COMPILER_WRAPPER_CXX = 'intercept-c++'

# Executable name patterns recognised as compilers. Compiled once at import
# time instead of on every 'is_compiler_call' invocation.
_COMPILER_PATTERNS = (
    re.compile(r'^([^/]*/)*intercept-c(c|\+\+)$'),
    re.compile(r'^([^/]*/)*c(c|\+\+)$'),
    re.compile(r'^([^/]*/)*([^-]*-)*[mg](cc|\+\+)(-\d+(\.\d+){0,2})?$'),
    re.compile(r'^([^/]*/)*([^-]*-)*clang(\+\+)?(-\d+(\.\d+){0,2})?$'),
    re.compile(r'^([^/]*/)*llvm-g(cc|\+\+)$'),
)


@command_entry_point
def intercept_build_main(bin_dir):
    """ Entry point for 'intercept-build' command.

    Parses the command line, sets up logging and runs the interception.
    Prints the help and returns 0 when no build command was given,
    otherwise returns the value of 'capture'. """

    parser = create_parser()
    args = parser.parse_args()

    initialize_logging(args.verbose)
    logging.debug('Parsed arguments: %s', args)

    if not args.build:
        parser.print_help()
        return 0

    return capture(args, bin_dir)


def capture(args, bin_dir):
    """ The entry point of build command interception.

    Runs 'args.build' in the environment prepared by 'setup_environment',
    reads the exec reports written into a temporary directory and dumps
    the result as JSON into 'args.cdb'.

    Returns the exit code of the build command. """

    def post_processing(commands):
        """ To make a compilation database, it needs to filter out commands
        which are not compiler calls. Needs to find the source file name
        from the arguments. And do shell escaping on the command.

        To support incremental builds, it is desired to read elements from
        an existing compilation database from a previous run. These elements
        shall be merged with the new elements. """

        # create entries from the current run
        current = itertools.chain.from_iterable(
            # creates a sequence of entry generators from an exec,
            # but filter out non compiler calls before.
            (format_entry(x) for x in commands if is_compiler_call(x)))
        # read entries from previous run
        if 'append' in args and args.append and os.path.exists(args.cdb):
            with open(args.cdb) as handle:
                previous = iter(json.load(handle))
        else:
            previous = iter([])
        # filter out duplicate entries from both
        duplicate = duplicate_check(entry_hash)
        return (entry for entry in itertools.chain(previous, current)
                if os.path.exists(entry['file']) and not duplicate(entry))

    with TemporaryDirectory(prefix='intercept-', dir=tempdir()) as tmp_dir:
        # run the build command
        environment = setup_environment(args, tmp_dir, bin_dir)
        logging.debug('run build in environment: %s', environment)
        exit_code = subprocess.call(args.build, env=environment)
        logging.info('build finished with exit code: %d', exit_code)
        # read the intercepted exec calls
        commands = itertools.chain.from_iterable(
            parse_exec_trace(os.path.join(tmp_dir, filename))
            for filename in sorted(glob.iglob(os.path.join(tmp_dir, '*.cmd'))))
        # do post processing only if that was requested
        if 'raw_entries' not in args or not args.raw_entries:
            entries = post_processing(commands)
        else:
            entries = commands
        # Consume the lazy generators before the output file is opened:
        # opening with 'w+' truncates it, so a failure while producing the
        # entries would otherwise leave an empty compilation database.
        entries = list(entries)
        # dump the compilation database
        with open(args.cdb, 'w+') as handle:
            json.dump(entries, handle, sort_keys=True, indent=4)
        return exit_code


def setup_environment(args, destination, bin_dir):
    """ Sets up the environment for the build command.

    It builds the environment variables the build command shall run with
    and returns them as a dictionary (a modified copy of 'os.environ').
    The exec calls will be logged by the 'libear' preloaded library or by
    the 'wrapper' programs into the 'destination' directory. """

    c_compiler = args.cc if 'cc' in args else 'cc'
    cxx_compiler = args.cxx if 'cxx' in args else 'c++'

    # No library is built when wrappers were requested explicitly or when
    # preloading is known not to work on this platform.
    libear_path = None if args.override_compiler or is_preload_disabled(
        sys.platform) else build_libear(c_compiler, destination)

    environment = dict(os.environ)
    environment.update({'INTERCEPT_BUILD_TARGET_DIR': destination})

    if not libear_path:
        logging.debug('intercept gonna use compiler wrappers')
        environment.update({
            'CC': os.path.join(bin_dir, COMPILER_WRAPPER_CC),
            'CXX': os.path.join(bin_dir, COMPILER_WRAPPER_CXX),
            'INTERCEPT_BUILD_CC': c_compiler,
            'INTERCEPT_BUILD_CXX': cxx_compiler,
            'INTERCEPT_BUILD_VERBOSE': 'DEBUG' if args.verbose > 2 else 'INFO'
        })
    elif sys.platform == 'darwin':
        logging.debug('intercept gonna preload libear on OSX')
        environment.update({
            'DYLD_INSERT_LIBRARIES': libear_path,
            'DYLD_FORCE_FLAT_NAMESPACE': '1'
        })
    else:
        logging.debug('intercept gonna preload libear on UNIX')
        environment.update({'LD_PRELOAD': libear_path})

    return environment


def intercept_build_wrapper(cplusplus):
    """ Entry point for `intercept-cc` and `intercept-c++` compiler wrappers.

    It does generate execution report into target directory. And execute
    the wrapped compilation with the real compiler. The parameters for
    report and execution are from environment variables.

    Those parameters which for 'libear' library can't have meaningful
    values are faked.

    Returns the exit code of the real compiler. """

    # initialize wrapper logging
    logging.basicConfig(format='intercept: %(levelname)s: %(message)s',
                        level=os.getenv('INTERCEPT_BUILD_VERBOSE', 'INFO'))
    # write report
    try:
        target_dir = os.getenv('INTERCEPT_BUILD_TARGET_DIR')
        if not target_dir:
            raise UserWarning('exec report target directory not found')
        pid = str(os.getpid())
        target_file = os.path.join(target_dir, pid + '.cmd')
        logging.debug('writing exec report to: %s', target_file)
        with open(target_file, 'ab') as handler:
            working_dir = os.getcwd()
            command = US.join(sys.argv) + US
            # the parent pid and the function name are the faked fields
            content = RS.join([pid, pid, 'wrapper', working_dir, command]) + GS
            handler.write(content.encode('utf-8'))
    except IOError:
        logging.exception('writing exec report failed')
    except UserWarning as warning:
        logging.warning(warning)
    # execute with real compiler
    compiler = os.getenv('INTERCEPT_BUILD_CXX', 'c++') if cplusplus \
        else os.getenv('INTERCEPT_BUILD_CC', 'cc')
    compilation = [compiler] + sys.argv[1:]
    logging.debug('execute compiler: %s', compilation)
    return subprocess.call(compilation)


def parse_exec_trace(filename):
    """ Parse the file generated by the 'libear' preloaded library.

    Given filename points to a file which contains the basic report
    generated by the interception library or wrapper command. A single
    report file _might_ contain multiple process creation info.

    Yields one dictionary per report with the keys 'pid', 'ppid',
    'function', 'directory' and 'command' (the argument list). """

    logging.debug('parse exec trace file: %s', filename)
    with open(filename, 'r') as handler:
        content = handler.read()
        for group in filter(bool, content.split(GS)):
            records = group.split(RS)
            yield {
                'pid': records[0],
                'ppid': records[1],
                'function': records[2],
                'directory': records[3],
                # every argument is terminated by US, so the last element
                # of the split is always an empty string: drop it.
                'command': records[4].split(US)[:-1]
            }


def format_entry(entry):
    """ Generate the desired fields for compilation database entries.

    Yields one dictionary (with 'directory', 'command' and 'file' keys)
    per source file of the given exec entry. Nothing is generated when the
    classified action is beyond compilation. """

    def abspath(cwd, name):
        """ Create normalized absolute path from input filename. """
        fullname = name if os.path.isabs(name) else os.path.join(cwd, name)
        return os.path.normpath(fullname)

    logging.debug('format this command: %s', entry['command'])
    atoms = classify_parameters(entry['command'])
    if atoms['action'] <= Action.Compile:
        compiler = 'c++' if atoms['c++'] else 'cc'
        # Work on a copy: extending 'atoms["compile_options"]' in place
        # made every source file after the first one receive the '-o',
        # '-x' and '-arch' flags repeatedly.
        flags = list(atoms['compile_options'])
        flags += ['-o', atoms['output']] if atoms['output'] else []
        flags += ['-x', atoms['language']] if 'language' in atoms else []
        flags += [elem
                  for arch in atoms.get('archs_seen', [])
                  for elem in ['-arch', arch]]
        for source in atoms['files']:
            command = [compiler, '-c'] + flags + [source]
            logging.debug('formated as: %s', command)
            yield {
                'directory': entry['directory'],
                'command': encode(command),
                'file': abspath(entry['directory'], source)
            }


def is_compiler_call(entry):
    """ A predicate to decide the entry is a compiler call or not.

    The decision is made on the first word of the command only. An entry
    with an empty command is not a compiler call. """

    command = entry['command']
    if not command:
        return False
    executable = command[0]
    return any(pattern.match(executable) for pattern in _COMPILER_PATTERNS)


def is_preload_disabled(platform):
    """ Library-based interposition will fail silently if SIP is enabled,
    so this should be detected. You can detect whether SIP is enabled on
    Darwin by checking whether (1) there is a binary called 'csrutil' in
    the path and, if so, (2) whether the output of executing 'csrutil status'
    contains 'System Integrity Protection status: enabled'.

    Same problem on linux when SELinux is enabled. The status query program
    'sestatus' and the output when it's enabled 'SELinux status: enabled'.

    Returns False on any other platform and also when the status query
    program can't be executed. """

    if platform == 'darwin':
        pattern = re.compile(r'System Integrity Protection status:\s+enabled')
        command = ['csrutil', 'status']
    elif platform in {'linux', 'linux2'}:
        pattern = re.compile(r'SELinux status:\s+enabled')
        command = ['sestatus']
    else:
        return False

    try:
        lines = subprocess.check_output(command).decode('utf-8')
        return any((pattern.match(line) for line in lines.splitlines()))
    except Exception:
        # Missing or failing status program means the protection is not
        # detected. 'Exception' (instead of a bare except) lets
        # KeyboardInterrupt and SystemExit propagate.
        return False


def entry_hash(entry):
    """ Implement unique hash method for compilation database entries. """

    # For faster lookup in set filename is reverted
    filename = entry['file'][::-1]
    # For faster lookup in set directory is reverted
    directory = entry['directory'][::-1]
    # On OS X the 'cc' and 'c++' compilers are wrappers for
    # 'clang' therefore both call would be logged. To avoid
    # this the hash does not contain the first word of the
    # command.
    command = ' '.join(decode(entry['command'])[1:])

    return '<>'.join([filename, directory, command])


def create_parser():
    """ Command line argument parser factory method. """

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument(
        '--verbose', '-v',
        action='count',
        default=0,
        help="""Enable verbose output from '%(prog)s'. A second and third
                flag increases verbosity.""")
    parser.add_argument(
        '--cdb',
        metavar='<file>',
        default="compile_commands.json",
        help="""The JSON compilation database.""")
    group = parser.add_mutually_exclusive_group()
    group.add_argument(
        '--append',
        action='store_true',
        help="""Append new entries to existing compilation database.""")
    group.add_argument(
        '--disable-filter', '-n',
        dest='raw_entries',
        action='store_true',
        help="""Intercepted child process creation calls (exec calls) are all
                logged to the output. The output is not a compilation database.
                This flag is for debug purposes.""")

    advanced = parser.add_argument_group('advanced options')
    advanced.add_argument(
        '--override-compiler',
        action='store_true',
        help="""Always resort to the compiler wrapper even when better
                intercept methods are available.""")
    advanced.add_argument(
        '--use-cc',
        metavar='<path>',
        dest='cc',
        default='cc',
        help="""When '%(prog)s' analyzes a project by interposing a compiler
                wrapper, which executes a real compiler for compilation and
                do other tasks (record the compiler invocation). Because of
                this interposing, '%(prog)s' does not know what compiler your
                project normally uses. Instead, it simply overrides the CC
                environment variable, and guesses your default compiler.

                If you need '%(prog)s' to use a specific compiler for
                *compilation* then you can use this option to specify a path
                to that compiler.""")
    advanced.add_argument(
        '--use-c++',
        metavar='<path>',
        dest='cxx',
        default='c++',
        help="""This is the same as "--use-cc" but for C++ code.""")

    parser.add_argument(
        dest='build',
        nargs=argparse.REMAINDER,
        help="""Command to run.""")

    return parser