author     Mikhail Shchatko <mikhail.shchatko@mongodb.com>   2021-05-12 15:37:41 +0300
committer  Evergreen Agent <no-reply@evergreen.mongodb.com>  2021-05-14 10:36:51 +0000
commit     985e4fbf29d8b80101fa17cc00e28c6fa2eeba93 (patch)
tree       4a309346d2621a4564683e16998388206bf3dcc5
parent     51a860a7644d5c769b47d5b814feeb3dd43dcec5 (diff)
download   mongo-985e4fbf29d8b80101fa17cc00e28c6fa2eeba93.tar.gz
SERVER-56478 Run powercycle with extended host lifetime for easier debugging
-rwxr-xr-x  buildscripts/evergreen_gen_powercycle_tasks.py | 110
-rwxr-xr-x  buildscripts/powercycle_sentinel.py | 119
-rw-r--r--  buildscripts/resmokelib/powercycle/__init__.py | 3
-rw-r--r--  buildscripts/resmokelib/powercycle/lib/__init__.py | 12
-rw-r--r--  buildscripts/resmokelib/powercycle/lib/remote_operations.py | 5
-rwxr-xr-x  buildscripts/resmokelib/powercycle/powercycle.py | 15
-rw-r--r--  buildscripts/resmokelib/powercycle/setup/__init__.py | 18
-rw-r--r--  buildscripts/tests/test_powercycle_sentinel.py | 51
-rw-r--r--  etc/evergreen.yml | 102
-rwxr-xr-x  evergreen/compiled_binaries_get.sh | 46
-rwxr-xr-x  evergreen/move_multiversion_binaries.sh | 6
-rw-r--r--  evergreen/multiversion_setup.sh | 13
-rw-r--r--  evergreen/powercycle_run_test.sh | 5
-rwxr-xr-x  evergreen/powercycle_sentinel_run.sh | 10
-rwxr-xr-x  evergreen/powercycle_tasks_generate.sh | 10
15 files changed, 452 insertions, 73 deletions
diff --git a/buildscripts/evergreen_gen_powercycle_tasks.py b/buildscripts/evergreen_gen_powercycle_tasks.py
new file mode 100755
index 00000000000..69c38ba54f3
--- /dev/null
+++ b/buildscripts/evergreen_gen_powercycle_tasks.py
@@ -0,0 +1,110 @@
+#!/usr/bin/env python3
+"""Generate multiple powercycle tasks to run in evergreen."""
+from collections import namedtuple
+from typing import Any, List, Tuple, Set
+
+import click
+from shrub.v2 import BuildVariant, FunctionCall, ShrubProject, Task, TaskDependency
+from shrub.v2.command import BuiltInCommand
+
+from buildscripts.util.fileops import write_file
+from buildscripts.util.read_config import read_config_file
+from buildscripts.util.taskname import name_generated_task
+
+Config = namedtuple("config", [
+    "task_names",
+    "num_tasks",
+    "timeout_params",
+    "remote_credentials_vars",
+    "set_up_ec2_instance_vars",
+    "run_powercycle_vars",
+    "build_variant",
+    "distro",
+])
+
+
+def make_config(expansions_file: Any) -> Config:
+    """Group expansions into config."""
+    expansions = read_config_file(expansions_file)
+    task_names = expansions.get("task_names", "powercycle_smoke_skip_compile")
+    # Avoid duplicated task names
+    task_names = {task_name for task_name in task_names.split(" ")}
+    num_tasks = int(expansions.get("num_tasks", 10))
+    timeout_params = {
+        "exec_timeout_secs": int(expansions.get("exec_timeout_secs", 7200)),
+        "timeout_secs": int(expansions.get("timeout_secs", 1800)),
+    }
+    remote_credentials_vars = {
+        "private_key_file": "src/powercycle.pem",
+        "private_key_remote": "${__project_aws_ssh_key_value}",
+    }
+    set_up_ec2_instance_vars = {
+        "set_up_retry_count": int(expansions.get("set_up_retry_count", 2)),
+    }
+    run_powercycle_vars = {
+        "run_powercycle_args": expansions.get("run_powercycle_args"),
+    }
+    build_variant = expansions.get("build_variant")
+    distro = expansions.get("distro_id")
+
+    return Config(task_names, num_tasks, timeout_params, remote_credentials_vars,
+                  set_up_ec2_instance_vars, run_powercycle_vars, build_variant, distro)
+
+
+def get_setup_commands() -> Tuple[List[FunctionCall], Set[TaskDependency]]:
+    """Return setup commands."""
+    return [
+        FunctionCall("do setup"),
+    ], {TaskDependency("archive_dist_test_debug")}
+
+
+def get_skip_compile_setup_commands() -> Tuple[List[FunctionCall], set]:
+    """Return skip compile setup commands."""
+    return [
+        FunctionCall("set task expansion macros"),
+        FunctionCall("set up venv"),
+        FunctionCall("upload pip requirements"),
+        FunctionCall("f_expansions_write"),
+        FunctionCall("configure evergreen api credentials"),
+        FunctionCall("get compiled binaries"),
+    ], set()
+
+
+@click.command()
+@click.argument("expansions_file", type=str, default="expansions.yml")
+@click.argument("output_file", type=str, default="powercycle_tasks.json")
+def main(expansions_file: str = "expansions.yml",
+         output_file: str = "powercycle_tasks.json") -> None:
+    """Generate multiple powercycle tasks to run in evergreen."""
+
+    config = make_config(expansions_file)
+    build_variant = BuildVariant(config.build_variant)
+    for task_name in config.task_names:
+        if "skip_compile" in task_name:
+            commands, task_dependency = get_skip_compile_setup_commands()
+        else:
+            commands, task_dependency = get_setup_commands()
+
+        commands.extend([
+            FunctionCall("set up remote credentials", config.remote_credentials_vars),
+            BuiltInCommand("timeout.update", config.timeout_params),
+            FunctionCall("set up EC2 instance", config.set_up_ec2_instance_vars),
+            FunctionCall("run powercycle test", config.run_powercycle_vars),
+        ])
+
+        build_variant.display_task(
+            task_name, {
+                Task(
+                    name_generated_task(task_name, index, config.num_tasks, config.build_variant),
+                    commands, task_dependency)
+                for index in range(config.num_tasks)
+            }, distros=[config.distro])
+
+    shrub_project = ShrubProject.empty()
+    shrub_project.add_build_variant(build_variant)
+
+    write_file(output_file, shrub_project.json())
+
+
+if __name__ == '__main__':
+    main()
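The generator above groups Evergreen expansions into a Config tuple and then emits a shrub project with one display task per powercycle task name. A minimal sketch (not part of the commit) of exercising make_config locally, assuming the snippet is run from the repository root and that read_config_file parses a YAML expansions file; every value below is illustrative:

    import yaml

    from buildscripts.evergreen_gen_powercycle_tasks import make_config

    # Hypothetical expansions that Evergreen would normally provide.
    expansions = {
        "task_names": "powercycle_smoke_skip_compile powercycle_smoke",
        "num_tasks": 5,
        "build_variant": "enterprise-rhel-80-64-bit",  # illustrative variant name
        "distro_id": "rhel80-small",
    }
    with open("expansions.yml", "w") as fh:
        yaml.safe_dump(expansions, fh)

    config = make_config("expansions.yml")
    print(config.task_names)      # set of unique task names parsed from the space-delimited string
    print(config.num_tasks)       # 5
    print(config.timeout_params)  # {'exec_timeout_secs': 7200, 'timeout_secs': 1800} (defaults)

Unless overridden by expansions, the defaults mirror the module above: ten sub-tasks, a two-hour exec timeout, and a 30-minute no-output timeout.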
diff --git a/buildscripts/powercycle_sentinel.py b/buildscripts/powercycle_sentinel.py
new file mode 100755
index 00000000000..bb276e2b712
--- /dev/null
+++ b/buildscripts/powercycle_sentinel.py
@@ -0,0 +1,119 @@
+#!/usr/bin/env python3
+"""Powercycle tasks sentinel.
+
+Error out when any powercycle task on the same buildvariant runs for more than 2 hours.
+"""
+import logging
+import os
+import sys
+import time
+from datetime import datetime, timezone
+from typing import List
+
+import click
+import structlog
+from evergreen import RetryingEvergreenApi, EvergreenApi
+
+from buildscripts.util.read_config import read_config_file
+
+LOGGER = structlog.getLogger(__name__)
+
+EVERGREEN_HOST = "https://evergreen.mongodb.com"
+EVERGREEN_CONFIG_LOCATIONS = (
+    # Common for machines in Evergreen
+    os.path.join(os.getcwd(), ".evergreen.yml"),
+    # Common for local machines
+    os.path.expanduser(os.path.join("~", ".evergreen.yml")),
+)
+POWERCYCLE_TASK_EXEC_TIMEOUT_SECS = 2 * 60 * 60
+WATCH_INTERVAL_SECS = 5 * 60
+
+
+def get_evergreen_api() -> EvergreenApi:
+    """Return evergreen API."""
+    # Pick up the first config file found in common locations.
+    for file in EVERGREEN_CONFIG_LOCATIONS:
+        if os.path.isfile(file):
+            evg_api = RetryingEvergreenApi.get_api(config_file=file)
+            return evg_api
+
+    LOGGER.error("Evergreen config not found in locations.", locations=EVERGREEN_CONFIG_LOCATIONS)
+    sys.exit(1)
+
+
+def watch_tasks(task_ids: List[str], evg_api: EvergreenApi, watch_interval_secs: int) -> List[str]:
+    """Watch tasks if they run longer than exec timeout."""
+    watch_task_ids = task_ids[:]
+    long_running_task_ids = []
+
+    while watch_task_ids:
+        LOGGER.info("Looking if powercycle tasks are still running on the current buildvariant.")
+        powercycle_tasks = [evg_api.task_by_id(task_id) for task_id in watch_task_ids]
+        for task in powercycle_tasks:
+            if task.finish_time:
+                watch_task_ids.remove(task.task_id)
+            elif task.start_time and (datetime.now(timezone.utc) - task.start_time
+                                      ).total_seconds() > POWERCYCLE_TASK_EXEC_TIMEOUT_SECS:
+                long_running_task_ids.append(task.task_id)
+                watch_task_ids.remove(task.task_id)
+        if watch_task_ids:
+            time.sleep(watch_interval_secs)
+
+    return long_running_task_ids
+
+
+def get_links(task_ids: List[str]) -> str:
+    """Return evergreen task urls delimited by newline."""
+    return "\n".join([f"{EVERGREEN_HOST}/task/{task_id}" for task_id in task_ids])
+
+
+@click.command()
+@click.argument("expansions_file", type=str, default="expansions.yml")
+def main(expansions_file: str = "expansions.yml") -> None:
+    """Implementation."""
+
+    logging.basicConfig(
+        format="[%(levelname)s] %(message)s",
+        level=logging.INFO,
+        stream=sys.stdout,
+    )
+    structlog.configure(logger_factory=structlog.stdlib.LoggerFactory())
+
+    expansions = read_config_file(expansions_file)
+    build_id = expansions["build_id"]
+    current_task_id = expansions["task_id"]
+    gen_task_name = expansions["gen_task"]
+
+    evg_api = get_evergreen_api()
+
+    build_tasks = evg_api.tasks_by_build(build_id)
+    gen_task_id = [task.task_id for task in build_tasks if gen_task_name in task.task_id][0]
+    gen_task_url = f"{EVERGREEN_HOST}/task/{gen_task_id}"
+
+    while evg_api.task_by_id(gen_task_id).is_active():
+        LOGGER.info(
+            f"Waiting for '{gen_task_name}' task to generate powercycle tasks:\n{gen_task_url}")
+        time.sleep(WATCH_INTERVAL_SECS)
+
+    build_tasks = evg_api.tasks_by_build(build_id)
+    powercycle_task_ids = [
+        task.task_id for task in build_tasks
+        if not task.display_only and task.task_id != current_task_id and task.task_id != gen_task_id
+        and "powercycle" in task.task_id
+    ]
+    LOGGER.info(f"Watching powercycle tasks:\n{get_links(powercycle_task_ids)}")
+
+    long_running_task_ids = watch_tasks(powercycle_task_ids, evg_api, WATCH_INTERVAL_SECS)
+    if long_running_task_ids:
+        LOGGER.error(
+            f"Found powercycle tasks that are running for more than {POWERCYCLE_TASK_EXEC_TIMEOUT_SECS} "
+            f"seconds and most likely something is going wrong in those tasks:\n{get_links(long_running_task_ids)}"
+        )
+        LOGGER.error(
+            "Hopefully hosts from the tasks are still running at the time you are seeing this "
+            "and the Build team is able to check them to diagnose the issue.")
+        sys.exit(1)
+
+
+if __name__ == '__main__':
+    main()
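The sentinel polls the generated powercycle tasks every WATCH_INTERVAL_SECS and flags any task whose wall-clock runtime exceeds POWERCYCLE_TASK_EXEC_TIMEOUT_SECS. A minimal sketch (not part of the commit) of the core comparison used by watch_tasks, with a hypothetical start time:

    from datetime import datetime, timedelta, timezone

    POWERCYCLE_TASK_EXEC_TIMEOUT_SECS = 2 * 60 * 60  # same two-hour limit as the sentinel

    # Hypothetical task that started three hours ago and has no finish_time yet.
    start_time = datetime.now(timezone.utc) - timedelta(hours=3)
    elapsed_secs = (datetime.now(timezone.utc) - start_time).total_seconds()
    print(elapsed_secs > POWERCYCLE_TASK_EXEC_TIMEOUT_SECS)  # True: watch_tasks would report this task

Finished tasks simply drop out of the watch list, so the loop exits once every generated task has either completed or been flagged.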
diff --git a/buildscripts/resmokelib/powercycle/__init__.py b/buildscripts/resmokelib/powercycle/__init__.py
index 2c734073a5f..943bdfc2d79 100644
--- a/buildscripts/resmokelib/powercycle/__init__.py
+++ b/buildscripts/resmokelib/powercycle/__init__.py
@@ -156,6 +156,9 @@ MongoDB Powercycle Tests. To run a powercycle test locally, use the following st
f" config values will be used from '{powercycle_config.POWERCYCLE_TASKS_CONFIG}'."
f" [default: '%(default)s']", default="powercycle")
+ test_options.add_argument("--sshAccessRetryCount", dest="ssh_access_retry_count",
+ help=argparse.SUPPRESS, type=int, default=5)
+
# MongoDB options
mongodb_options.add_argument(
"--downloadUrl", dest="tarball_url",
diff --git a/buildscripts/resmokelib/powercycle/lib/__init__.py b/buildscripts/resmokelib/powercycle/lib/__init__.py
index 01913512cf0..4cc3492d42f 100644
--- a/buildscripts/resmokelib/powercycle/lib/__init__.py
+++ b/buildscripts/resmokelib/powercycle/lib/__init__.py
@@ -25,9 +25,7 @@ class PowercycleCommand(Subcommand):
def __init__(self):
"""Initialize PowercycleCommand."""
self.expansions = yaml.safe_load(open(powercycle_constants.EXPANSIONS_FILE))
- self.ssh_identity = self._get_ssh_identity()
- self.ssh_connection_options = \
- f"{self.ssh_identity} {powercycle_constants.DEFAULT_SSH_CONNECTION_OPTIONS}"
+ self.ssh_connection_options = f"-i powercycle.pem {powercycle_constants.DEFAULT_SSH_CONNECTION_OPTIONS}"
self.sudo = "" if self.is_windows() else "sudo"
# The username on the Windows image that powercycle uses is currently the default user.
self.user = "Administrator" if self.is_windows() else getpass.getuser()
@@ -52,14 +50,6 @@ class PowercycleCommand(Subcommand):
buff = buff_stdout.decode("utf-8", "replace")
return process.poll(), buff
- def _get_ssh_identity(self) -> str:
- workdir = self.expansions['workdir']
- if self.is_windows():
- workdir = workdir.replace("\\", "/")
- pem_file = '/'.join([workdir, 'src', 'powercycle.pem'])
-
- return f"-i {pem_file}"
-
def execute_cmd(cmd, use_file=False):
"""Execute command and returns return_code, output from command."""
diff --git a/buildscripts/resmokelib/powercycle/lib/remote_operations.py b/buildscripts/resmokelib/powercycle/lib/remote_operations.py
index 8ae7d9caec7..a011e786f75 100644
--- a/buildscripts/resmokelib/powercycle/lib/remote_operations.py
+++ b/buildscripts/resmokelib/powercycle/lib/remote_operations.py
@@ -51,7 +51,7 @@ class RemoteOperations(object): # pylint: disable=too-many-instance-attributes
def __init__( # pylint: disable=too-many-arguments
self, user_host, ssh_connection_options=None, ssh_options=None, scp_options=None,
- shell_binary="/bin/bash", use_shell=False, ignore_ret=False):
+ shell_binary="/bin/bash", use_shell=False, ignore_ret=False, access_retry_count=5):
"""Initialize RemoteOperations."""
self.user_host = user_host
@@ -62,6 +62,7 @@ class RemoteOperations(object): # pylint: disable=too-many-instance-attributes
self.ignore_ret = ignore_ret
self.shell_binary = shell_binary
self.use_shell = use_shell
+ self.access_retry_count = access_retry_count
# Check if we can remotely access the host.
self._access_code, self._access_buff = self._remote_access()
@@ -99,7 +100,7 @@ class RemoteOperations(object): # pylint: disable=too-many-instance-attributes
"""Check if a remote session is possible."""
cmd = "ssh {} {} {} date".format(self.ssh_connection_options, self.ssh_options,
self.user_host)
- return self._call_retries(cmd, 5)
+ return self._call_retries(cmd, self.access_retry_count)
def _perform_operation(self, cmd, retry, retry_count):
if retry:
diff --git a/buildscripts/resmokelib/powercycle/powercycle.py b/buildscripts/resmokelib/powercycle/powercycle.py
index 36f5c79579c..43dada73b1d 100755
--- a/buildscripts/resmokelib/powercycle/powercycle.py
+++ b/buildscripts/resmokelib/powercycle/powercycle.py
@@ -724,13 +724,13 @@ class LocalToRemoteOperations(object):
def __init__( # pylint: disable=too-many-arguments
self, user_host, ssh_connection_options=None, ssh_options=None,
- shell_binary="/bin/bash", use_shell=False):
+ shell_binary="/bin/bash", use_shell=False, access_retry_count=5):
"""Initialize LocalToRemoteOperations."""
self.remote_op = remote_operations.RemoteOperations(
user_host=user_host, ssh_connection_options=ssh_connection_options,
ssh_options=ssh_options, shell_binary=shell_binary, use_shell=use_shell,
- ignore_ret=True)
+ ignore_ret=True, access_retry_count=access_retry_count)
def shell(self, cmds, remote_dir=None):
"""Return tuple (ret, output) from performing remote shell operation."""
@@ -1332,7 +1332,7 @@ def main(parser_actions, options): # pylint: disable=too-many-branches,too-many
LOGGER.info("powercycle invocation: %s", " ".join(sys.argv))
- task_name = options.task_name
+ task_name = re.sub(r"(_[0-9]+)(_[\w-]+)?$", "", options.task_name)
task_config = powercycle_config.get_task_config(task_name, options.remote_operation)
LOGGER.info("powercycle task config: %s", task_config)
@@ -1450,9 +1450,9 @@ def main(parser_actions, options): # pylint: disable=too-many-branches,too-many
ssh_options = "" if _IS_WINDOWS else "-tt"
# Instantiate the local handler object.
- local_ops = LocalToRemoteOperations(user_host=ssh_user_host,
- ssh_connection_options=ssh_connection_options,
- ssh_options=ssh_options, use_shell=True)
+ local_ops = LocalToRemoteOperations(
+ user_host=ssh_user_host, ssh_connection_options=ssh_connection_options,
+ ssh_options=ssh_options, use_shell=True, access_retry_count=options.ssh_access_retry_count)
verify_remote_access(local_ops)
# Pass client_args to the remote script invocation.
@@ -1662,7 +1662,8 @@ def main(parser_actions, options): # pylint: disable=too-many-branches,too-many
# Reestablish remote access after crash.
local_ops = LocalToRemoteOperations(user_host=ssh_user_host,
ssh_connection_options=ssh_connection_options,
- ssh_options=ssh_options, use_shell=True)
+ ssh_options=ssh_options, use_shell=True,
+ access_retry_count=options.ssh_access_retry_count)
verify_remote_access(local_ops)
ret, output = call_remote_operation(local_ops, remote_python, script_name, client_args,
"--remoteOperation noop")
diff --git a/buildscripts/resmokelib/powercycle/setup/__init__.py b/buildscripts/resmokelib/powercycle/setup/__init__.py
index f0b9be02b95..85ed66a5b88 100644
--- a/buildscripts/resmokelib/powercycle/setup/__init__.py
+++ b/buildscripts/resmokelib/powercycle/setup/__init__.py
@@ -15,6 +15,9 @@ class SetUpEC2Instance(PowercycleCommand):
def execute(self) -> None: # pylint: disable=too-many-instance-attributes, too-many-locals, too-many-statements
""":return: None."""
+ default_retry_count = 2
+ retry_count = int(self.expansions.get("set_up_retry_count", default_retry_count))
+
# First operation -
# Create remote_dir.
group_cmd = f"id -Gn {self.user}"
@@ -31,7 +34,7 @@ class SetUpEC2Instance(PowercycleCommand):
cmds = f"{self.sudo} mkdir -p {remote_dir}; {self.sudo} chown -R {user_group} {remote_dir}; {set_permission_stmt} {remote_dir}; ls -ld {remote_dir}"
cmds = f"{cmds}; {self.sudo} mkdir -p {db_path}; {self.sudo} chown -R {user_group} {db_path}; {set_permission_stmt} {db_path}; ls -ld {db_path}"
- self.remote_op.operation(SSHOperation.SHELL, cmds, retry=True, retry_count=2)
+ self.remote_op.operation(SSHOperation.SHELL, cmds, retry=True, retry_count=retry_count)
# Second operation -
# Copy buildscripts and mongoDB executables to the remote host.
@@ -41,7 +44,8 @@ class SetUpEC2Instance(PowercycleCommand):
if os.path.isdir(shared_libs):
files.append(shared_libs)
- self.remote_op.operation(SSHOperation.COPY_TO, files, remote_dir, retry=True, retry_count=2)
+ self.remote_op.operation(SSHOperation.COPY_TO, files, remote_dir, retry=True,
+ retry_count=retry_count)
# Third operation -
# Set up virtualenv on remote.
@@ -57,7 +61,7 @@ class SetUpEC2Instance(PowercycleCommand):
cmds = f"{cmds}; . $activate"
cmds = f"{cmds}; pip3 install -r $remote_dir/etc/pip/powercycle-requirements.txt"
- self.remote_op.operation(SSHOperation.SHELL, cmds, retry=True, retry_count=2)
+ self.remote_op.operation(SSHOperation.SHELL, cmds, retry=True, retry_count=retry_count)
# Fourth operation -
# Enable core dumps on non-Windows remote hosts.
@@ -81,7 +85,7 @@ class SetUpEC2Instance(PowercycleCommand):
# https://unix.stackexchange.com/a/349558 in order to ensure the ssh client gets a
# response from the remote machine before it restarts.
cmds = f"{cmds}; nohup {self.sudo} reboot &>/dev/null & exit"
- self.remote_op.operation(SSHOperation.SHELL, cmds, retry=True, retry_count=2)
+ self.remote_op.operation(SSHOperation.SHELL, cmds, retry=True, retry_count=retry_count)
# Fifth operation -
# Print the ulimit & kernel.core_pattern
@@ -93,7 +97,7 @@ class SetUpEC2Instance(PowercycleCommand):
cmds = f"{cmds}; then /sbin/sysctl kernel.core_pattern"
cmds = f"{cmds}; fi"
- self.remote_op.operation(SSHOperation.SHELL, cmds, retry=True, retry_count=2)
+ self.remote_op.operation(SSHOperation.SHELL, cmds, retry=True, retry_count=retry_count)
# Sixth operation -
# Set up curator to collect system & process stats on remote.
@@ -120,7 +124,7 @@ class SetUpEC2Instance(PowercycleCommand):
cmds = f"{cmds}; crontab -l"
cmds = f"{cmds}; {{ {self.sudo} $HOME/curator stat system --file {monitor_system_file} > /dev/null 2>&1 & {self.sudo} $HOME/curator stat process-all --file {monitor_proc_file} > /dev/null 2>&1 & }} & disown"
- self.remote_op.operation(SSHOperation.SHELL, cmds, retry=True, retry_count=2)
+ self.remote_op.operation(SSHOperation.SHELL, cmds, retry=True, retry_count=retry_count)
# Seventh operation -
# Install NotMyFault, used to crash Windows.
@@ -132,4 +136,4 @@ class SetUpEC2Instance(PowercycleCommand):
cmds = f"curl -s -o {windows_crash_zip} {windows_crash_dl}"
cmds = f"{cmds}; unzip -q {windows_crash_zip} -d {windows_crash_dir}"
cmds = f"{cmds}; chmod +x {windows_crash_dir}/*.exe"
- self.remote_op.operation(SSHOperation.SHELL, cmds, retry=True, retry_count=2)
+ self.remote_op.operation(SSHOperation.SHELL, cmds, retry=True, retry_count=retry_count)
diff --git a/buildscripts/tests/test_powercycle_sentinel.py b/buildscripts/tests/test_powercycle_sentinel.py
new file mode 100644
index 00000000000..3f2774d9b54
--- /dev/null
+++ b/buildscripts/tests/test_powercycle_sentinel.py
@@ -0,0 +1,51 @@
+"""Unit tests for powercycle_sentinel.py."""
+# pylint: disable=missing-docstring
+import unittest
+from datetime import datetime, timezone, timedelta
+from unittest.mock import Mock
+
+from evergreen import EvergreenApi, Task
+
+from buildscripts.powercycle_sentinel import watch_tasks, POWERCYCLE_TASK_EXEC_TIMEOUT_SECS
+
+
+def make_task_mock(evg_api, task_id, start_time, finish_time):
+ return Task({
+ "task_id": task_id,
+ "start_time": start_time,
+ "finish_time": finish_time,
+ }, evg_api)
+
+
+class TestWatchTasks(unittest.TestCase):
+ """Test watch_tasks."""
+
+ def test_no_long_running_tasks(self):
+ evg_api = EvergreenApi()
+ task_ids = ["1", "2"]
+ now = datetime.now(timezone.utc).isoformat()
+ task_1 = make_task_mock(evg_api, task_ids[0], now, now)
+ task_2 = make_task_mock(evg_api, task_ids[1], now, now)
+ evg_api.task_by_id = Mock(
+ side_effect=(lambda task_id: {
+ "1": task_1,
+ "2": task_2,
+ }[task_id]))
+ long_running_task_ids = watch_tasks(task_ids, evg_api, 0)
+ self.assertEqual([], long_running_task_ids)
+
+ def test_found_long_running_tasks(self):
+ evg_api = EvergreenApi()
+ task_ids = ["1", "2"]
+ exec_timeout_seconds_ago = (datetime.now(timezone.utc) -
+ timedelta(hours=POWERCYCLE_TASK_EXEC_TIMEOUT_SECS)).isoformat()
+ now = datetime.now(timezone.utc).isoformat()
+ task_1 = make_task_mock(evg_api, task_ids[0], exec_timeout_seconds_ago, now)
+ task_2 = make_task_mock(evg_api, task_ids[1], exec_timeout_seconds_ago, None)
+ evg_api.task_by_id = Mock(
+ side_effect=(lambda task_id: {
+ "1": task_1,
+ "2": task_2,
+ }[task_id]))
+ long_running_task_ids = watch_tasks(task_ids, evg_api, 0)
+ self.assertEqual([task_2.task_id], long_running_task_ids)
diff --git a/etc/evergreen.yml b/etc/evergreen.yml
index a85622cbd5e..1bbee28e035 100644
--- a/etc/evergreen.yml
+++ b/etc/evergreen.yml
@@ -121,8 +121,6 @@ variables:
- &powercycle_remote_credentials
private_key_file: src/powercycle.pem
private_key_remote: ${__project_aws_ssh_key_value}
- aws_key_remote: ${powercycle_aws_key}
- aws_secret_remote: ${powercycle_aws_secret}
- &libfuzzertests
name: libfuzzertests!
@@ -891,12 +889,68 @@ functions:
- *f_expansions_write
- *do_multiversion_setup
- "move multiversion binaries": &move_multiversion_binaries
+ # Used by generator
+ "get compiled binaries":
command: subprocess.exec
params:
binary: bash
args:
- - "./src/evergreen/move_multiversion_binaries.sh"
+ - "./src/evergreen/compiled_binaries_get.sh"
+ env:
+ workdir: ${workdir}
+
+ "generate powercycle tasks":
+ - *set_task_expansion_macros
+ - *f_expansions_write
+ - *set_up_venv
+ - *upload_pip_requirements
+
+ - command: subprocess.exec
+ params:
+ binary: bash
+ args:
+ - "./src/evergreen/powercycle_tasks_generate.sh"
+ env:
+ workdir: ${workdir}
+
+ - command: archive.targz_pack
+ params:
+ target: powercycle_tasks_config.tgz
+ source_dir: "./"
+ include:
+ - "powercycle_tasks.json"
+
+ - command: s3.put
+ params:
+ aws_key: ${aws_key}
+ aws_secret: ${aws_secret}
+ local_file: powercycle_tasks_config.tgz
+ remote_file: ${project}/${build_variant}/${revision}/powercycle_tasks/${task_name}-${build_id}.tgz
+ bucket: mciuploads
+ permissions: public-read
+ content_type: application/gzip
+ display_name: Generated Task Config - Execution ${execution}
+
+ - command: generate.tasks
+ params:
+ files:
+ - powercycle_tasks.json
+
+ "run powercycle sentinel":
+ - *set_task_expansion_macros
+ - *f_expansions_write
+ - *set_up_venv
+ - *upload_pip_requirements
+ - *configure_evergreen_api_credentials
+
+ - command: subprocess.exec
+ type: system
+ params:
+ binary: bash
+ args:
+ - "./src/evergreen/powercycle_sentinel_run.sh"
+ env:
+ workdir: ${workdir}
"execute resmoke tests": &execute_resmoke_tests
command: subprocess.exec
@@ -1607,6 +1661,7 @@ functions:
params:
provider: ec2
distro: ${distro_id}
+ timeout_teardown_secs: 604800 # 7 days
security_group_ids:
- sg-097bff6dd0d1d31d0
@@ -6075,26 +6130,24 @@ tasks:
resmoke_args: --suites=json_schema --storageEngine=wiredTiger
resmoke_jobs_max: 1
-- name: powercycle_smoke_skip_compile
- exec_timeout_secs: 7200 # 2 hour timeout for the task overall
+- name: powercycle_smoke_skip_compile_gen
commands:
- - *f_expansions_write
- - func: "set task expansion macros"
- - *f_expansions_write
- - func: "set up venv"
- - func: "upload pip requirements"
- - *f_expansions_write
- - func: "configure evergreen api credentials"
- - func: "do multiversion setup"
+ - func: "generate powercycle tasks"
vars:
- install_master_bin: true
- - *move_multiversion_binaries
- - func: "set up remote credentials"
+ task_names: >-
+ powercycle_smoke_skip_compile
+ num_tasks: 20
+ exec_timeout_secs: 604800 # 7 days
+ timeout_secs: 604800 # 7 days
+ set_up_retry_count: 1000000
+ run_powercycle_args: --sshAccessRetryCount=1000000
+
+- name: powercycle_sentinel
+ exec_timeout_secs: 604800 # 7 days
+ commands:
+ - func: "run powercycle sentinel"
vars:
- <<: *powercycle_remote_credentials
- - func: "set up EC2 instance"
- - func: "run powercycle test"
- timeout_secs: 1800 # 30 minute timeout for no output
+ gen_task: powercycle_smoke_skip_compile_gen
- name: powercycle_smoke
exec_timeout_secs: 7200 # 2 hour timeout for the task overall
@@ -8225,14 +8278,13 @@ buildvariants:
run_on:
- rhel80-small
expansions:
- multiversion_platform: amazon2
+ multiversion_platform: rhel80
multiversion_edition: enterprise
stepback: false
tasks:
- name: lint_fuzzer_sanity_all
- - name: powercycle_smoke_skip_compile
- distros:
- - amazon2-test
+ - name: powercycle_sentinel
+ - name: powercycle_smoke_skip_compile_gen
- name: security-daily-cron
modules:
diff --git a/evergreen/compiled_binaries_get.sh b/evergreen/compiled_binaries_get.sh
new file mode 100755
index 00000000000..b007fd97709
--- /dev/null
+++ b/evergreen/compiled_binaries_get.sh
@@ -0,0 +1,46 @@
+DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"
+. "$DIR/prelude.sh"
+
+cd src
+
+set -o errexit
+set -o verbose
+
+activate_venv
+
+rm -rf /data/install dist-test/bin
+
+edition="${multiversion_edition}"
+platform="${multiversion_platform}"
+architecture="${multiversion_architecture}"
+
+if [ ! -z "${multiversion_edition_42_or_later}" ]; then
+ edition="${multiversion_edition_42_or_later}"
+fi
+if [ ! -z "${multiversion_platform_42_or_later}" ]; then
+ platform="${multiversion_platform_42_or_later}"
+fi
+if [ ! -z "${multiversion_architecture_42_or_later}" ]; then
+ architecture="${multiversion_architecture_42_or_later}"
+fi
+
+if [ ! -z "${multiversion_edition_44_or_later}" ]; then
+ edition="${multiversion_edition_44_or_later}"
+fi
+if [ ! -z "${multiversion_platform_44_or_later}" ]; then
+ platform="${multiversion_platform_44_or_later}"
+fi
+if [ ! -z "${multiversion_architecture_44_or_later}" ]; then
+ architecture="${multiversion_architecture_44_or_later}"
+fi
+
+# This is primarily for tests for infrastructure which don't always need the latest
+# binaries.
+$python buildscripts/resmoke.py setup-multiversion \
+ --installDir /data/install \
+ --linkDir dist-test/bin \
+ --edition $edition \
+ --platform $platform \
+ --architecture $architecture \
+ --githubOauthToken "${github_token}" \
+ --useLatest master
diff --git a/evergreen/move_multiversion_binaries.sh b/evergreen/move_multiversion_binaries.sh
deleted file mode 100755
index 6429150bee2..00000000000
--- a/evergreen/move_multiversion_binaries.sh
+++ /dev/null
@@ -1,6 +0,0 @@
-set -o verbose
-
-cd src
-# powercycle expects the binaries to be in dist-test/bin
-mkdir -p dist-test/bin
-mv /data/multiversion/* dist-test/bin/
diff --git a/evergreen/multiversion_setup.sh b/evergreen/multiversion_setup.sh
index cd7f1d942f1..d287d4e4b10 100644
--- a/evergreen/multiversion_setup.sh
+++ b/evergreen/multiversion_setup.sh
@@ -74,16 +74,3 @@ $python buildscripts/resmoke.py setup-multiversion \
--architecture $architecture \
--githubOauthToken "${github_token}" \
--useLatest 4.4 4.7 4.8 4.9
-
-# This is primarily for tests for infrastructure which don't always need the latest
-# binaries.
-if [ ! -z "${install_master_bin}" ]; then
- $python buildscripts/resmoke.py setup-multiversion \
- --installDir /data/install \
- --linkDir /data/multiversion \
- --edition $edition \
- --platform $platform \
- --architecture $architecture \
- --githubOauthToken "${github_token}" \
- --useLatest master
-fi
diff --git a/evergreen/powercycle_run_test.sh b/evergreen/powercycle_run_test.sh
index 6aa63c669e6..a0703faa161 100644
--- a/evergreen/powercycle_run_test.sh
+++ b/evergreen/powercycle_run_test.sh
@@ -18,5 +18,6 @@ trap 'echo $? > error_exit.txt; exit 0' EXIT
set +o errexit
eval $python -u buildscripts/resmoke.py powercycle run \
"--sshUserHost=$(printf "%s@%s" "$user" "${private_ip_address}") \
- --sshConnection=\"-i ${private_key_file}\" \
- --taskName=${task_name}"
+ --sshConnection=\"-i powercycle.pem\" \
+ --taskName=${task_name} \
+ ${run_powercycle_args}"
diff --git a/evergreen/powercycle_sentinel_run.sh b/evergreen/powercycle_sentinel_run.sh
new file mode 100755
index 00000000000..f36f94dda1d
--- /dev/null
+++ b/evergreen/powercycle_sentinel_run.sh
@@ -0,0 +1,10 @@
+DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"
+. "$DIR/prelude.sh"
+
+cd src
+
+set -o errexit
+set -o verbose
+
+activate_venv
+$python buildscripts/powercycle_sentinel.py ../expansions.yml
diff --git a/evergreen/powercycle_tasks_generate.sh b/evergreen/powercycle_tasks_generate.sh
new file mode 100755
index 00000000000..15b7555790e
--- /dev/null
+++ b/evergreen/powercycle_tasks_generate.sh
@@ -0,0 +1,10 @@
+DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"
+. "$DIR/prelude.sh"
+
+cd src
+
+set -o errexit
+set -o verbose
+
+activate_venv
+$python buildscripts/evergreen_gen_powercycle_tasks.py ../expansions.yml ../powercycle_tasks.json