summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Daniel Moody <daniel.moody@mongodb.com> 2023-03-14 21:05:42 +0000
committer Evergreen Agent <no-reply@evergreen.mongodb.com> 2023-03-21 04:32:23 +0000
commit 8b65a42f2385e590b2fef76dec4e30a573039951 (patch)
tree 2c90e98e461c74664d0cb573343ae0fe48cfe65d
parent 3d5ad7de3dba072ae794d3b16aa2730849eb2d87 (diff)
download mongo-8b65a42f2385e590b2fef76dec4e30a573039951.tar.gz
SERVER-74287 add oom retry tool
(cherry picked from commit 3949b2a325ac9f421a4b29074bc037c27293c5da) (cherry picked from commit f13c679a03aad1ea09da6dcd131fbee959d9039b) (cherry picked from commit 7f3f01822da6ceb877d6944a2a5de8df0d403c0b)
-rw-r--r-- SConstruct 64
-rw-r--r-- etc/evergreen.yml 2
-rw-r--r-- site_scons/site_tools/oom_auto_retry.py 109
3 files changed, 166 insertions, 9 deletions
diff --git a/SConstruct b/SConstruct
index 9799d03f9ba..eb0eb9d88d9 100644
--- a/SConstruct
+++ b/SConstruct
@@ -3,6 +3,7 @@ import atexit
import copy
import datetime
import errno
+import functools
import json
import os
import re
@@ -599,6 +600,17 @@ except ValueError as e:
print(("Error decoding version.json: {0}".format(e)))
Exit(1)
+
+def to_boolean(s):
+    """Convert a boolean-like value into a real ``bool``.
+
+    Actual booleans are returned unchanged. Any other value is lower-cased and
+    matched against the accepted spellings: '1', 'on', 'true' and 'yes' give
+    True; '0', 'off', 'false' and 'no' give False.
+
+    Raises:
+        ValueError: if the value matches none of the accepted spellings.
+    """
+    if isinstance(s, bool):
+        return s
+
+    normalized = s.lower()
+    if normalized in ('1', "on", "true", "yes"):
+        return True
+    if normalized in ('0', "off", "false", "no"):
+        return False
+
+    raise ValueError(f'Invalid value {s}, must be a boolean-like string')
+
+
# Setup the command-line variables
def variable_shlex_converter(val):
# If the argument is something other than a string, propogate
@@ -630,6 +642,17 @@ def variable_arch_converter(val):
# Return whatever val is passed in - hopefully it's legit
return val
+
+def bool_var_converter(val, var):
+    """Convert a command-line variable to True, False, or the string "auto".
+
+    Args:
+        val: the raw value; a bool, a boolean-like string accepted by
+            ``to_boolean``, or "auto" in any letter case.
+        var: name of the variable being converted; used only in the error
+            message.
+
+    Returns:
+        The boolean produced by ``to_boolean``, or the literal string "auto".
+
+    Raises:
+        ValueError: if ``val`` is neither boolean-like nor "auto".
+    """
+    try:
+        return to_boolean(val)
+    except ValueError as exc:
+        if val.lower() != "auto":
+            # Report the rejected value itself; `val` is the only name holding
+            # it in this scope.
+            raise ValueError(
+                f'Invalid {var} value {val}, must be a boolean-like string or "auto"') from exc
+        return "auto"
+
+
# The Scons 'default' tool enables a lot of tools that we don't actually need to enable.
# On platforms like Solaris, it actually does the wrong thing by enabling the sunstudio
# toolchain first. As such it is simpler and more efficient to manually load the precise
@@ -1006,6 +1029,14 @@ env_vars.Add('STRIP',
help='Path to the strip utility (non-darwin platforms probably use OBJCOPY for this)',
)
+# Opt-in switch for the OOM retry tool. The converter accepts boolean-like
+# strings as well as "auto"; the default string converts to False.
+env_vars.Add(
+    'ENABLE_OOM_RETRY',
+    default="False",
+    converter=functools.partial(bool_var_converter, var='ENABLE_OOM_RETRY'),
+    help=
+    'Set the boolean (auto, on/off true/false 1/0) to enable retrying a compile or link commands from "out of memory" failures.',
+)
+
env_vars.Add('TARGET_ARCH',
help='Sets the architecture to build for',
converter=variable_arch_converter,
@@ -1217,15 +1248,6 @@ def conf_error(env, msg, *args):
env.AddMethod(fatal_error, 'FatalError')
env.AddMethod(conf_error, 'ConfError')
-def to_boolean(s):
- if isinstance(s, bool):
- return s
- elif s.lower() in ('1', "on", "true", "yes"):
- return True
- elif s.lower() in ('0', "off", "false", "no"):
- return False
- raise ValueError(f'Invalid value {s}, must be a boolean-like string')
-
# Normalize the VERBOSE Option, and make its value available as a
# function.
if env['VERBOSE'] == "auto":
@@ -1445,6 +1467,30 @@ def is_toolchain(self, *args):
env.AddMethod(get_toolchain_name, 'ToolchainName')
env.AddMethod(is_toolchain, 'ToolchainIs')
+# Configure and load the oom_auto_retry tool when ENABLE_OOM_RETRY is set.
+# When the ninja option is anything other than 'disabled', a notice is printed.
+# The retry settings stored on env are OOM_RETRY_ATTEMPTS (10),
+# OOM_RETRY_MAX_DELAY_SECONDS (120), the toolchain-specific OOM_RETRY_MESSAGES
+# strings and, alongside the msvc messages, OOM_RETRY_RETURNCODES.
+# NOTE(review): block nesting is not visible in this rendering; the settings and
+# the env.Tool() call presumably all belong to the else branch -- confirm.
+if env.get('ENABLE_OOM_RETRY'):
+ if get_option('ninja') != 'disabled':
+ print('ENABLE_OOM_RETRY not compatible with ninja, disabling ENABLE_OOM_RETRY.')
+ else:
+ env['OOM_RETRY_ATTEMPTS'] = 10
+ env['OOM_RETRY_MAX_DELAY_SECONDS'] = 120
+
+ if env.ToolchainIs('clang', 'gcc'):
+ env['OOM_RETRY_MESSAGES'] = [
+ ': out of memory',
+ 'virtual memory exhausted: Cannot allocate memory',
+ ': fatal error: Killed signal terminated program cc1',
+ ]
+ elif env.ToolchainIs('msvc'):
+ env['OOM_RETRY_MESSAGES'] = [
+ 'LNK1102: out of memory',
+ 'C1060: compiler is out of heap space',
+ 'LNK1171: unable to load mspdbcore.dll',
+ "LNK1201: error writing to program database ''",
+ ]
+ env['OOM_RETRY_RETURNCODES'] = [1102]
+
+ env.Tool('oom_auto_retry')
+
if env['TARGET_ARCH']:
if not detectConf.CheckForProcessor(env['TARGET_ARCH']):
env.ConfError("Could not detect processor specified in TARGET_ARCH variable")
diff --git a/etc/evergreen.yml b/etc/evergreen.yml
index 4370d1033a8..541348f7f4d 100644
--- a/etc/evergreen.yml
+++ b/etc/evergreen.yml
@@ -1805,6 +1805,8 @@ functions:
extra_args="$extra_args --release"
fi
+ extra_args="$extra_args ENABLE_OOM_RETRY=1"
+
if [ "Windows_NT" = "$OS" ]; then
vcvars="$(vswhere -latest -property installationPath | tr '\\' '/' | dos2unix.exe)/VC/Auxiliary/Build/"
export PATH="$(echo "$(cd "$vcvars" && cmd /C "vcvarsall.bat amd64 && C:/cygwin/bin/bash -c 'echo \$PATH'")" | tail -n +6)":$PATH
diff --git a/site_scons/site_tools/oom_auto_retry.py b/site_scons/site_tools/oom_auto_retry.py
new file mode 100644
index 00000000000..1eacc3fb401
--- /dev/null
+++ b/site_scons/site_tools/oom_auto_retry.py
@@ -0,0 +1,109 @@
+# Copyright 2023 MongoDB Inc.
+#
+# Permission is hereby granted, free of charge, to any person obtaining
+# a copy of this software and associated documentation files (the
+# "Software"), to deal in the Software without restriction, including
+# without limitation the rights to use, copy, modify, merge, publish,
+# distribute, sublicense, and/or sell copies of the Software, and to
+# permit persons to whom the Software is furnished to do so, subject to
+# the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
+# KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+# WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+#
+
+import SCons
+
+import functools
+import subprocess
+import sys
+import time
+import random
+import os
+
+from typing import Callable, List, Dict
+
+
+def command_spawn_func(sh: str, escape: Callable[[str], str], cmd: str, args: List, env: Dict,
+                       target: List, source: List):
+    """Run a build command, retrying it when it fails with an out-of-memory error.
+
+    The command line is ``args`` joined with spaces. It is run through the
+    platform shell on Windows and through ``sh -c`` elsewhere, with stdout and
+    stderr captured together and printed once the command has finished.
+
+    Retry settings come from the build environment of ``target[0]``:
+    ``OOM_RETRY_MESSAGES`` (substrings searched for in the captured output),
+    ``OOM_RETRY_RETURNCODES`` (exit codes treated as out of memory),
+    ``OOM_RETRY_ATTEMPTS`` (default 10) and ``OOM_RETRY_MAX_DELAY_SECONDS``
+    (default 120).
+
+    Args:
+        sh: shell executable used on non-Windows platforms.
+        escape: unused.
+        cmd: unused.
+        args: command words, joined with single spaces to form the command line.
+        env: environment passed to the child process.
+        target: target nodes; only ``target[0]`` is consulted.
+        source: unused.
+
+    Returns:
+        The exit code of the last attempt.
+    """
+    build_env = target[0].get_build_env()
+    oom_messages = build_env.get('OOM_RETRY_MESSAGES', [])
+    oom_returncodes = [int(returncode) for returncode in build_env.get('OOM_RETRY_RETURNCODES', [])]
+    max_retries = build_env.get('OOM_RETRY_ATTEMPTS', 10)
+    oom_max_retry_delay = build_env.get('OOM_RETRY_MAX_DELAY_SECONDS', 120)
+
+    command_line = ' '.join(args)
+    retries = 0
+
+    # Every path through the loop body either returns or sleeps and retries, so
+    # the command always runs at least once and an exit code is always returned.
+    while True:
+        start_time = time.time()
+        try:
+            if sys.platform[:3] == 'win':
+                # have to use shell=True for windows because of https://github.com/python/cpython/issues/53908
+                proc = subprocess.run(command_line, env=env, close_fds=True, shell=True,
+                                      stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True,
+                                      check=True)
+            else:
+                proc = subprocess.run([sh, '-c', command_line], env=env, close_fds=True,
+                                      stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True,
+                                      check=True)
+        except subprocess.CalledProcessError as exc:
+            print(f"{os.path.basename(__file__)} captured error:")
+            print(exc.stdout)
+            out_of_memory = (any(oom_message in exc.stdout for oom_message in oom_messages)
+                             or exc.returncode in oom_returncodes)
+            if out_of_memory:
+                retries += 1
+                # Wait as long as the failed attempt ran plus a random share of
+                # the maximum delay (presumably so that parallel jobs do not
+                # all restart at the same moment).
+                retry_delay = int((time.time() - start_time) +
+                                  oom_max_retry_delay * random.random())
+                print(f"Ran out of memory while trying to build {target[0]}")
+                if retries <= max_retries:
+                    print(f"trying again in {retry_delay} seconds with retry attempt {retries}")
+                    time.sleep(retry_delay)
+                    continue
+
+            # There was no OOM error or no more OOM retries left
+            return exc.returncode
+        else:
+            if proc.stdout:
+                print(proc.stdout)
+            return proc.returncode
+
+
+def generate(env):
+    """Patch SCons so compile and link commands are spawned through command_spawn_func.
+
+    ``SCons.Action.CommandAction.execute`` is replaced with a wrapper. When the
+    first target has a builder named Object, SharedObject, StaticObject,
+    Program, StaticLibrary or SharedLibrary and its path does not contain
+    'conftest', the wrapper points ``env['SPAWN']`` at ``command_spawn_func``
+    for the duration of the action. Every other action runs unchanged.
+
+    The replacement is made on the class itself, so it is not limited to the
+    ``env`` passed in here.
+    """
+    retry_builders = (
+        'Object', 'SharedObject', 'StaticObject', 'Program', 'StaticLibrary',
+        'SharedLibrary',
+    )
+
+    original_command_execute = SCons.Action.CommandAction.execute
+
+    def oom_retry_execute(command_action_instance, target, source, env, executor=None):
+        node = target[0]
+        wants_retry = ('conftest' not in str(node) and node.has_builder()
+                       and node.get_builder().get_name(env) in retry_builders)
+        if not wants_retry:
+            return original_command_execute(command_action_instance, target, source, env,
+                                            executor)
+
+        original_spawn = env['SPAWN']
+        env['SPAWN'] = functools.partial(command_spawn_func, target=target, source=source)
+        try:
+            return original_command_execute(command_action_instance, target, source, env,
+                                            executor)
+        finally:
+            # Restore even when the action raises, so a failure cannot leave the
+            # retry spawner installed with this action's target and source bound.
+            env['SPAWN'] = original_spawn
+
+    SCons.Action.CommandAction.execute = oom_retry_execute
+
+
+def exists(env):
+    """Report that this tool is always available; ``env`` is not consulted."""
+    return True