def to_boolean(s):
    """Convert a boolean-like value to a ``bool``.

    Args:
        s: Either a ``bool`` (returned unchanged) or a string. Matching is
            case-insensitive: ``1``/``on``/``true``/``yes`` give ``True`` and
            ``0``/``off``/``false``/``no`` give ``False``.

    Returns:
        The corresponding ``bool``.

    Raises:
        ValueError: If the string is not one of the recognised spellings.
    """
    if isinstance(s, bool):
        return s
    lowered = s.lower()
    if lowered in ('1', "on", "true", "yes"):
        return True
    if lowered in ('0', "off", "false", "no"):
        return False
    raise ValueError(f'Invalid value {s}, must be a boolean-like string')


def bool_var_converter(val, var):
    """Convert a command-line variable to ``True``, ``False`` or ``"auto"``.

    Args:
        val: The raw value of the variable (a ``bool`` or a string).
        var: The name of the variable; only used in the error message.

    Returns:
        The result of ``to_boolean(val)``, or the string ``"auto"`` when
        ``val`` spells "auto" in any letter case.

    Raises:
        ValueError: If ``val`` is neither boolean-like nor "auto".
    """
    try:
        return to_boolean(val)
    except ValueError as exc:
        if val.lower() != "auto":
            # Report the offending value itself; the message previously
            # referenced a name that is not defined in this function, which
            # turned the intended ValueError into a NameError.
            raise ValueError(
                f'Invalid {var} value {val}, must be a boolean-like string or "auto"') from exc
        return "auto"
As such it is simpler and more efficient to manually load the precise @@ -1059,8 +1082,18 @@ env_vars.Add('STRIP', help='Path to the strip utility (non-darwin platforms probably use OBJCOPY for this)', ) -env_vars.Add('TAPI', - help="Configures the path to the 'tapi' (an Xcode) utility") +env_vars.Add( + 'ENABLE_OOM_RETRY', + help= + 'Set the boolean (auto, on/off true/false 1/0) to enable retrying a compile or link commands from "out of memory" failures.', + converter=functools.partial(bool_var_converter, var='ENABLE_OOM_RETRY'), + default="False", +) + +env_vars.Add( + 'TAPI', + help="Configures the path to the 'tapi' (an Xcode) utility", +) env_vars.Add('TARGET_ARCH', help='Sets the architecture to build for', @@ -1297,15 +1330,6 @@ def conf_error(env, msg, *args): env.AddMethod(fatal_error, 'FatalError') env.AddMethod(conf_error, 'ConfError') -def to_boolean(s): - if isinstance(s, bool): - return s - elif s.lower() in ('1', "on", "true", "yes"): - return True - elif s.lower() in ('0', "off", "false", "no"): - return False - raise ValueError(f'Invalid value {s}, must be a boolean-like string') - # Normalize the VERBOSE Option, and make its value available as a # function. 
if env['VERBOSE'] == "auto": @@ -1512,6 +1536,30 @@ def is_toolchain(self, *args): env.AddMethod(get_toolchain_name, 'ToolchainName') env.AddMethod(is_toolchain, 'ToolchainIs') +if env.get('ENABLE_OOM_RETRY'): + if get_option('ninja') != 'disabled': + print('ENABLE_OOM_RETRY not compatible with ninja, disabling ENABLE_OOM_RETRY.') + else: + env['OOM_RETRY_ATTEMPTS'] = 10 + env['OOM_RETRY_MAX_DELAY_SECONDS'] = 120 + + if env.ToolchainIs('clang', 'gcc'): + env['OOM_RETRY_MESSAGES'] = [ + ': out of memory', + 'virtual memory exhausted: Cannot allocate memory', + ': fatal error: Killed signal terminated program cc1', + ] + elif env.ToolchainIs('msvc'): + env['OOM_RETRY_MESSAGES'] = [ + 'LNK1102: out of memory', + 'C1060: compiler is out of heap space', + 'LNK1171: unable to load mspdbcore.dll', + "LNK1201: error writing to program database ''", + ] + env['OOM_RETRY_RETURNCODES'] = [1102] + + env.Tool('oom_auto_retry') + if env['TARGET_ARCH']: if not detectSystem.CheckForProcessor(env['TARGET_ARCH']): env.ConfError("Could not detect processor specified in TARGET_ARCH variable") diff --git a/evergreen/scons_compile.sh b/evergreen/scons_compile.sh index 34a2c937bd7..575d73fecdb 100755 --- a/evergreen/scons_compile.sh +++ b/evergreen/scons_compile.sh @@ -49,6 +49,8 @@ else extra_args="$extra_args --release" fi +extra_args="$extra_args ENABLE_OOM_RETRY=1" + if [ "${generating_for_ninja}" = "true" ] && [ "Windows_NT" = "$OS" ]; then vcvars="$(vswhere -latest -property installationPath | tr '\\' '/' | dos2unix.exe)/VC/Auxiliary/Build/" export PATH="$(echo "$(cd "$vcvars" && cmd /C "vcvarsall.bat amd64 && C:/cygwin/bin/bash -c 'echo \$PATH'")" | tail -n +6)":$PATH diff --git a/site_scons/site_tools/oom_auto_retry.py b/site_scons/site_tools/oom_auto_retry.py new file mode 100644 index 00000000000..1eacc3fb401 --- /dev/null +++ b/site_scons/site_tools/oom_auto_retry.py @@ -0,0 +1,109 @@ +# Copyright 2023 MongoDB Inc. 
+# +# Permission is hereby granted, free of charge, to any person obtaining +# a copy of this software and associated documentation files (the +# "Software"), to deal in the Software without restriction, including +# without limitation the rights to use, copy, modify, merge, publish, +# distribute, sublicense, and/or sell copies of the Software, and to +# permit persons to whom the Software is furnished to do so, subject to +# the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY +# KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +# WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION +# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
#
"""SCons tool that retries compile and link commands which ran out of memory.

``generate`` wraps ``SCons.Action.CommandAction.execute`` so that commands for
object, library and program targets are spawned through
``command_spawn_func``. That function captures the command output, checks it
against the configured out-of-memory messages and return codes, and re-runs
the command after a randomized delay.
"""

import functools
import os
import random
import subprocess
import sys
import time

from typing import Callable, Dict, List

# Names of the builders whose commands are eligible for an out-of-memory retry.
_OOM_RETRY_BUILDERS = frozenset([
    'Object',
    'SharedObject',
    'StaticObject',
    'Program',
    'StaticLibrary',
    'SharedLibrary',
])


def command_spawn_func(sh: str, escape: Callable[[str], str], cmd: str, args: List, env: Dict,
                       target: List, source: List):
    """Run a build command, retrying it when it fails from running out of memory.

    The first five parameters follow the SCons ``SPAWN`` calling convention;
    ``target`` and ``source`` are bound by ``generate`` with
    ``functools.partial``.

    The retry policy is read from the build environment of ``target[0]``:
    ``OOM_RETRY_MESSAGES`` (substrings searched for in the captured output),
    ``OOM_RETRY_RETURNCODES``, ``OOM_RETRY_ATTEMPTS`` (default 10) and
    ``OOM_RETRY_MAX_DELAY_SECONDS`` (default 120).

    Args:
        sh: Shell used to run the command on non-Windows platforms.
        escape: Unused; part of the spawn interface.
        cmd: Unused; part of the spawn interface.
        args: Command words, joined with spaces to form the command line.
        env: Environment mapping for the child process.
        target: Target nodes of the action; ``target[0]`` provides the policy.
        source: Unused; bound for symmetry with ``target``.

    Returns:
        The return code of the last run of the command. The command always
        runs at least once, even if ``OOM_RETRY_ATTEMPTS`` is negative.
    """
    build_env = target[0].get_build_env()
    oom_messages = build_env.get('OOM_RETRY_MESSAGES', [])
    oom_returncodes = {int(returncode) for returncode in build_env.get('OOM_RETRY_RETURNCODES', [])}
    max_retries = build_env.get('OOM_RETRY_ATTEMPTS', 10)
    oom_max_retry_delay = build_env.get('OOM_RETRY_MAX_DELAY_SECONDS', 120)

    command_line = ' '.join(args)
    if sys.platform[:3] == 'win':
        # have to use shell=True for windows because of https://github.com/python/cpython/issues/53908
        popen_args, use_shell = command_line, True
    else:
        popen_args, use_shell = [sh, '-c', command_line], False

    retries = 0
    while True:
        # Monotonic clock: the elapsed time must not be affected by wall-clock jumps.
        start_time = time.monotonic()
        # errors="replace" keeps undecodable bytes in the tool output from
        # raising UnicodeDecodeError out of the spawn function.
        proc = subprocess.run(popen_args, env=env, close_fds=True, shell=use_shell,
                              stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True,
                              errors="replace", check=False)
        output = proc.stdout or ''

        if proc.returncode == 0:
            if output:
                print(output)
            return proc.returncode

        print(f"{os.path.basename(__file__)} captured error:")
        print(output)

        out_of_memory = (any(oom_message in output for oom_message in oom_messages)
                         or proc.returncode in oom_returncodes)
        if not out_of_memory:
            return proc.returncode

        retries += 1
        print(f"Ran out of memory while trying to build {target[0]}")
        if retries > max_retries:
            # No more OOM retries left.
            return proc.returncode

        # Wait at least as long as the failed attempt took, plus random jitter.
        retry_delay = int((time.monotonic() - start_time) + oom_max_retry_delay * random.random())
        print(f"trying again in {retry_delay} seconds with retry attempt {retries}")
        time.sleep(retry_delay)


def _wants_oom_retry(target, env):
    """Return True if the action building ``target`` should get OOM retries."""
    # Command actions can be executed without any target, so guard the index.
    if not target:
        return False
    node = target[0]
    if 'conftest' in str(node) or not node.has_builder():
        return False
    return node.get_builder().get_name(env) in _OOM_RETRY_BUILDERS


def generate(env):
    """Patch ``SCons.Action.CommandAction.execute`` to retry on out-of-memory.

    Calling this more than once is harmless: an already patched ``execute``
    is left as is.
    """
    # Imported here, by its full name, so that the ``SCons.Action`` submodule
    # is guaranteed to be loaded and the helpers above stay importable
    # without SCons.
    import SCons.Action

    original_command_execute = SCons.Action.CommandAction.execute
    if getattr(original_command_execute, '_oom_retry_wrapped', False):
        return

    def oom_retry_execute(command_action_instance, target, source, env, executor=None):
        if not _wants_oom_retry(target, env):
            return original_command_execute(command_action_instance, target, source, env,
                                            executor)

        # NOTE(review): env['SPAWN'] is swapped in place for the duration of
        # the call; if the same env object is shared by actions running in
        # parallel this is presumably racy -- confirm.
        original_spawn = env['SPAWN']
        env['SPAWN'] = functools.partial(command_spawn_func, target=target, source=source)
        try:
            return original_command_execute(command_action_instance, target, source, env,
                                            executor)
        finally:
            # Restore the spawn function even if the wrapped execute raises.
            env['SPAWN'] = original_spawn

    oom_retry_execute._oom_retry_wrapped = True
    SCons.Action.CommandAction.execute = oom_retry_execute


def exists(env):
    """Report that this tool is always available."""
    return True