diff options
-rwxr-xr-x | buildscripts/evergreen_task_timeout.py | 405 | ||||
-rw-r--r-- | buildscripts/task_generation/suite_split.py | 2 | ||||
-rw-r--r-- | buildscripts/task_generation/task_types/resmoke_tasks.py | 2 | ||||
-rw-r--r-- | buildscripts/tests/test_evergreen_task_timeout.py | 191 | ||||
-rw-r--r-- | buildscripts/tests/timeouts/__init__.py | 1 | ||||
-rw-r--r-- | buildscripts/tests/timeouts/test_timeout.py (renamed from buildscripts/tests/task_generation/test_timeout.py) | 2 | ||||
-rw-r--r-- | buildscripts/tests/timeouts/test_timeout_service.py | 258 | ||||
-rw-r--r-- | buildscripts/tests/util/test_taskname.py | 28 | ||||
-rw-r--r-- | buildscripts/timeouts/__init__.py | 1 | ||||
-rw-r--r-- | buildscripts/timeouts/timeout.py (renamed from buildscripts/task_generation/timeout.py) | 3 | ||||
-rw-r--r-- | buildscripts/timeouts/timeout_service.py | 189 | ||||
-rw-r--r-- | buildscripts/util/taskname.py | 23 | ||||
-rw-r--r-- | docs/evergreen-testing/index.md | 5 | ||||
-rw-r--r-- | docs/evergreen-testing/task_timeouts.md | 35 | ||||
-rw-r--r-- | etc/evergreen.yml | 6 | ||||
-rw-r--r-- | etc/evergreen_timeouts.yml | 93 | ||||
-rw-r--r-- | evergreen/functions/task_timeout_determine.sh | 16 |
17 files changed, 1145 insertions, 115 deletions
diff --git a/buildscripts/evergreen_task_timeout.py b/buildscripts/evergreen_task_timeout.py index d9e60a9fc87..f2177b35a16 100755 --- a/buildscripts/evergreen_task_timeout.py +++ b/buildscripts/evergreen_task_timeout.py @@ -3,14 +3,35 @@ import argparse import math +import os import sys -from datetime import timedelta -from typing import Optional +from datetime import datetime, timedelta +from pathlib import Path +from typing import Dict, List, Optional +import inject +import structlog import yaml - +from pydantic import BaseModel +from evergreen import EvergreenApi, RetryingEvergreenApi + +from buildscripts.ciconfig.evergreen import (EvergreenProjectConfig, parse_evergreen_file) +from buildscripts.timeouts.timeout_service import (TimeoutParams, TimeoutService, TimeoutSettings) +from buildscripts.util.cmdutils import enable_logging +from buildscripts.util.taskname import determine_task_base_name + +LOGGER = structlog.get_logger(__name__) +DEFAULT_TIMEOUT_OVERRIDES = "etc/evergreen_timeouts.yml" +DEFAULT_EVERGREEN_CONFIG = "etc/evergreen.yml" +DEFAULT_EVERGREEN_AUTH_CONFIG = "~/.evergreen.yml" COMMIT_QUEUE_ALIAS = "__commit_queue" UNITTEST_TASK = "run_unittests" +IGNORED_SUITES = { + "integration_tests_replset", "integration_tests_replset_ssl_auth", "integration_tests_sharded", + "integration_tests_standalone", "integration_tests_standalone_audit", "mongos_test", + "server_selection_json_test" +} +HISTORY_LOOKBACK = timedelta(weeks=2) COMMIT_QUEUE_TIMEOUT = timedelta(minutes=40) DEFAULT_REQUIRED_BUILD_TIMEOUT = timedelta(hours=1, minutes=20) @@ -19,114 +40,118 @@ DEFAULT_NON_REQUIRED_BUILD_TIMEOUT = timedelta(hours=2) # which is 5 mins 47 secs, excluding outliers below UNITTESTS_TIMEOUT = timedelta(minutes=12) -SPECIFIC_TASK_OVERRIDES = { - "linux-64-debug": {"auth": timedelta(minutes=60)}, - "enterprise-windows-all-feature-flags-suggested": { - "replica_sets_jscore_passthrough": timedelta(hours=3), - "replica_sets_update_v1_oplog_jscore_passthrough": 
timedelta(hours=2, minutes=30), - }, - "enterprise-windows-required": { - "replica_sets_jscore_passthrough": timedelta(hours=3), - "replica_sets_update_v1_oplog_jscore_passthrough": timedelta(hours=2, minutes=30), - }, - "enterprise-windows-inmem": {"replica_sets_jscore_passthrough": timedelta(hours=3), }, - "enterprise-windows": {"replica_sets_jscore_passthrough": timedelta(hours=3), }, - "windows-debug-suggested": { - "replica_sets_initsync_jscore_passthrough": timedelta(hours=2, minutes=30), - "replica_sets_jscore_passthrough": timedelta(hours=2, minutes=30), - "replica_sets_update_v1_oplog_jscore_passthrough": timedelta(hours=2, minutes=30), - }, - "windows": { - "replica_sets": timedelta(hours=3), - "replica_sets_jscore_passthrough": timedelta(hours=2, minutes=30), - }, - "ubuntu1804-debug-suggested": {"replica_sets_jscore_passthrough": timedelta(hours=3), }, - "enterprise-rhel-80-64-bit-coverage": { - "replica_sets_jscore_passthrough": timedelta(hours=2, minutes=30), - }, - "macos": {"replica_sets_jscore_passthrough": timedelta(hours=2, minutes=30), }, - "enterprise-macos": {"replica_sets_jscore_passthrough": timedelta(hours=2, minutes=30), }, - - # unittests outliers - # repeated execution runs a suite 10 times - "linux-64-repeated-execution": {UNITTEST_TASK: 10 * UNITTESTS_TIMEOUT}, - # some of the a/ub/t san variants need a little extra time - "enterprise-ubuntu2004-debug-tsan": {UNITTEST_TASK: 2 * UNITTESTS_TIMEOUT}, - "ubuntu1804-asan": {UNITTEST_TASK: 2 * UNITTESTS_TIMEOUT}, - "ubuntu1804-ubsan": {UNITTEST_TASK: 2 * UNITTESTS_TIMEOUT}, - "ubuntu1804-debug-asan": {UNITTEST_TASK: 2 * UNITTESTS_TIMEOUT}, - "ubuntu1804-debug-aubsan-lite": {UNITTEST_TASK: 2 * UNITTESTS_TIMEOUT}, - "ubuntu1804-debug-ubsan": {UNITTEST_TASK: 2 * UNITTESTS_TIMEOUT}, -} - -def _is_required_build_variant(build_variant: str) -> bool: +class TimeoutOverride(BaseModel): """ - Determine if the given build variants is a required build variant. 
+ Specification for overriding a task timeout. - :param build_variant: Name of build variant to check. - :return: True if the given build variant is required. + * task: Name of task to overide. + * exec_timeout: Value to override exec timeout with. + * idle_timeout: Value to override idle timeout with. """ - return build_variant.endswith("-required") + task: str + exec_timeout: Optional[int] = None + idle_timeout: Optional[int] = None + + def get_exec_timeout(self) -> Optional[timedelta]: + """Get a timedelta of the exec timeout to use.""" + if self.exec_timeout is not None: + return timedelta(minutes=self.exec_timeout) + return None + + def get_idle_timeout(self) -> Optional[timedelta]: + """Get a timedelta of the idle timeout to use.""" + if self.idle_timeout is not None: + return timedelta(minutes=self.idle_timeout) + return None + + +class TimeoutOverrides(BaseModel): + """Collection of timeout overrides to apply.""" + + overrides: Dict[str, List[TimeoutOverride]] + + @classmethod + def from_yaml_file(cls, file_path: Path) -> "TimeoutOverrides": + """Read the timeout overrides from the given file.""" + with open(file_path) as file_handler: + return cls(**yaml.safe_load(file_handler)) + + def _lookup_override(self, build_variant: str, task_name: str) -> Optional[TimeoutOverride]: + """ + Check if the given task on the given build variant has an override defined. + + Note: If multiple overrides are found, an exception will be raised. + + :param build_variant: Build Variant to check. + :param task_name: Task name to check. + :return: Timeout override if found. 
+ """ + overrides = [ + override for override in self.overrides.get(build_variant, []) + if override.task == task_name + ] + if overrides: + if len(overrides) > 1: + LOGGER.error("Found multiple overrides for the same task", + build_variant=build_variant, task=task_name, + overrides=[override.dict() for override in overrides]) + raise ValueError(f"Found multiple overrides for '{task_name}' on '{build_variant}'") + return overrides[0] + return None + + def lookup_exec_override(self, build_variant: str, task_name: str) -> Optional[timedelta]: + """ + Look up the exec timeout override of the given build variant/task. + + :param build_variant: Build Variant to check. + :param task_name: Task name to check. + :return: Exec timeout override if found. + """ + override = self._lookup_override(build_variant, task_name) + if override is not None: + return override.get_exec_timeout() + return None + + def lookup_idle_override(self, build_variant: str, task_name: str) -> Optional[timedelta]: + """ + Look up the idle timeout override of the given build variant/task. + + :param build_variant: Build Variant to check. + :param task_name: Task name to check. + :return: Idle timeout override if found. + """ + override = self._lookup_override(build_variant, task_name) + if override is not None: + return override.get_idle_timeout() + return None -def _has_override(variant: str, task_name: str) -> bool: - """ - Determine if the given task has a timeout override. - :param variant: Build Variant task is running on. - :param task_name: Task to check. - :return: True if override exists for task. +def _is_required_build_variant(build_variant: str) -> bool: """ - return variant in SPECIFIC_TASK_OVERRIDES and task_name in SPECIFIC_TASK_OVERRIDES[variant] - + Determine if the given build variants is a required build variant. 
-def determine_timeout(task_name: str, variant: str, idle_timeout: Optional[timedelta] = None, - exec_timeout: Optional[timedelta] = None, evg_alias: str = '') -> timedelta: - """ - Determine what exec timeout should be used. - - :param task_name: Name of task being run. - :param variant: Name of build variant being run. - :param idle_timeout: Idle timeout if specified. - :param exec_timeout: Override to use for exec_timeout or 0 if no override. - :param evg_alias: Evergreen alias running the task. - :return: Exec timeout to use for running task. + :param build_variant: Name of build variant to check. + :return: True if the given build variant is required. """ - determined_timeout = DEFAULT_NON_REQUIRED_BUILD_TIMEOUT - - if exec_timeout and exec_timeout.total_seconds() != 0: - determined_timeout = exec_timeout - - elif task_name == UNITTEST_TASK and not _has_override(variant, task_name): - determined_timeout = UNITTESTS_TIMEOUT - - elif evg_alias == COMMIT_QUEUE_ALIAS: - determined_timeout = COMMIT_QUEUE_TIMEOUT - - elif _has_override(variant, task_name): - determined_timeout = SPECIFIC_TASK_OVERRIDES[variant][task_name] - - elif _is_required_build_variant(variant): - determined_timeout = DEFAULT_REQUIRED_BUILD_TIMEOUT - - # The timeout needs to be at least as large as the idle timeout. - if idle_timeout and determined_timeout.total_seconds() < idle_timeout.total_seconds(): - return idle_timeout - - return determined_timeout + return build_variant.endswith("-required") -def output_timeout(task_timeout: timedelta, output_file: Optional[str]) -> None: +def output_timeout(exec_timeout: timedelta, idle_timeout: Optional[timedelta], + output_file: Optional[str]) -> None: """ Output timeout configuration to the specified location. - :param task_timeout: Timeout to output. + :param exec_timeout: Exec timeout to output. + :param idle_timeout: Idle timeout to output. :param output_file: Location of output file to write. 
""" output = { - "exec_timeout_secs": math.ceil(task_timeout.total_seconds()), + "exec_timeout_secs": math.ceil(exec_timeout.total_seconds()), } + if idle_timeout is not None: + output["timeout_secs"] = math.ceil(idle_timeout.total_seconds()) if output_file: with open(output_file, "w") as outfile: @@ -135,28 +160,216 @@ def output_timeout(task_timeout: timedelta, output_file: Optional[str]) -> None: yaml.dump(output, stream=sys.stdout, default_flow_style=False) +class TaskTimeoutOrchestrator: + """An orchestrator for determining task timeouts.""" + + @inject.autoparams() + def __init__(self, timeout_service: TimeoutService, timeout_overrides: TimeoutOverrides, + evg_project_config: EvergreenProjectConfig) -> None: + """ + Initialize the orchestrator. + + :param timeout_service: Service for calculating historic timeouts. + :param timeout_overrides: Timeout overrides for specific tasks. + """ + self.timeout_service = timeout_service + self.timeout_overrides = timeout_overrides + self.evg_project_config = evg_project_config + + def determine_exec_timeout( + self, task_name: str, variant: str, idle_timeout: Optional[timedelta] = None, + exec_timeout: Optional[timedelta] = None, evg_alias: str = "") -> timedelta: + """ + Determine what exec timeout should be used. + + :param task_name: Name of task being run. + :param variant: Name of build variant being run. + :param idle_timeout: Idle timeout if specified. + :param exec_timeout: Override to use for exec_timeout or 0 if no override. + :param evg_alias: Evergreen alias running the task. + :return: Exec timeout to use for running task. 
+ """ + determined_timeout = DEFAULT_NON_REQUIRED_BUILD_TIMEOUT + + override = self.timeout_overrides.lookup_exec_override(variant, task_name) + + if exec_timeout and exec_timeout.total_seconds() != 0: + LOGGER.info("Using timeout from cmd line", + exec_timeout_secs=exec_timeout.total_seconds()) + determined_timeout = exec_timeout + + elif task_name == UNITTEST_TASK and override is None: + LOGGER.info("Overriding unittest timeout", + exec_timeout_secs=UNITTESTS_TIMEOUT.total_seconds()) + determined_timeout = UNITTESTS_TIMEOUT + + elif evg_alias == COMMIT_QUEUE_ALIAS: + LOGGER.info("Overriding commit-queue timeout", + exec_timeout_secs=COMMIT_QUEUE_TIMEOUT.total_seconds()) + determined_timeout = COMMIT_QUEUE_TIMEOUT + + elif override is not None: + LOGGER.info("Overriding configured timeout", exec_timeout_secs=override.total_seconds()) + determined_timeout = override + + elif _is_required_build_variant(variant): + LOGGER.info("Overriding required-builder timeout", + exec_timeout_secs=DEFAULT_REQUIRED_BUILD_TIMEOUT.total_seconds()) + determined_timeout = DEFAULT_REQUIRED_BUILD_TIMEOUT + + # The timeout needs to be at least as large as the idle timeout. + if idle_timeout and determined_timeout.total_seconds() < idle_timeout.total_seconds(): + LOGGER.info("Making exec timeout as large as idle timeout", + exec_timeout_secs=idle_timeout.total_seconds()) + return idle_timeout + + return determined_timeout + + def determine_idle_timeout(self, task_name: str, variant: str, + idle_timeout: Optional[timedelta] = None) -> Optional[timedelta]: + """ + Determine what idle timeout should be used. + + :param task_name: Name of task being run. + :param variant: Name of build variant being run. + :param idle_timeout: Override to use for idle_timeout. + :return: Idle timeout to use for running task. 
+ """ + determined_timeout = None + override = self.timeout_overrides.lookup_idle_override(variant, task_name) + + if idle_timeout and idle_timeout.total_seconds() != 0: + LOGGER.info("Using timeout from cmd line", + idle_timeout_secs=idle_timeout.total_seconds()) + determined_timeout = idle_timeout + + elif override is not None: + LOGGER.info("Overriding configured timeout", idle_timeout_secs=override.total_seconds()) + determined_timeout = override + + return determined_timeout + + def determine_historic_timeout(self, task: str, variant: str, suite_name: str, + exec_timeout_factor: Optional[float]) -> Optional[timedelta]: + """ + Calculate the timeout based on historic test results. + + :param task: Name of task to query. + :param variant: Name of build variant to query. + :param suite_name: Name of test suite being run. + :param exec_timeout_factor: Scaling factor to use when determining timeout. + """ + if suite_name in IGNORED_SUITES: + return None + + timeout_params = TimeoutParams( + evg_project="mongodb-mongo-master", + build_variant=variant, + task_name=task, + suite_name=suite_name, + is_asan=self.is_build_variant_asan(variant), + ) + timeout_estimate = self.timeout_service.get_timeout_estimate(timeout_params) + if timeout_estimate and timeout_estimate.is_specified(): + exec_timeout = timeout_estimate.calculate_task_timeout( + repeat_factor=1, scaling_factor=exec_timeout_factor) + if exec_timeout is not None: + LOGGER.info("Using historic based timeout", exec_timeout_secs=exec_timeout) + return timedelta(seconds=exec_timeout) + return None + + def is_build_variant_asan(self, build_variant: str) -> bool: + """ + Determine if the given build variant is an ASAN build variant. + + :param build_variant: Name of build variant to check. + :return: True if build variant is an ASAN build variant. 
+ """ + bv = self.evg_project_config.get_variant(build_variant) + return bv.is_asan_build() + + def determine_timeouts(self, cli_idle_timeout: Optional[timedelta], + cli_exec_timeout: Optional[timedelta], outfile: Optional[str], task: str, + variant: str, evg_alias: str, suite_name: str, + exec_timeout_factor: Optional[float]) -> None: + """ + Determine the timeouts to use for the given task and write timeouts to expansion file. + + :param cli_idle_timeout: Idle timeout specified by the CLI. + :param cli_exec_timeout: Exec timeout specified by the CLI. + :param outfile: File to write timeout expansions to. + :param variant: Build variant task is being run on. + :param evg_alias: Evergreen alias that triggered task. + :param suite_name: Name of evergreen suite being run. + :param exec_timeout_factor: Scaling factor to use when determining timeout. + """ + idle_timeout = self.determine_idle_timeout(task, variant, cli_idle_timeout) + exec_timeout = self.determine_exec_timeout(task, variant, idle_timeout, cli_exec_timeout, + evg_alias) + + historic_timeout = self.determine_historic_timeout(task, variant, suite_name, + exec_timeout_factor) + if historic_timeout: + exec_timeout = historic_timeout + + output_timeout(exec_timeout, idle_timeout, outfile) + + def main(): """Determine the timeout value a task should use in evergreen.""" parser = argparse.ArgumentParser(description=main.__doc__) parser.add_argument("--task-name", dest="task", required=True, help="Task being executed.") + parser.add_argument("--suite-name", dest="suite_name", required=True, + help="Resmoke suite being run against.") parser.add_argument("--build-variant", dest="variant", required=True, help="Build variant task is being executed on.") parser.add_argument("--evg-alias", dest="evg_alias", required=True, help="Evergreen alias used to trigger build.") parser.add_argument("--timeout", dest="timeout", type=int, help="Timeout to use (in sec).") parser.add_argument("--exec-timeout", dest="exec_timeout", 
type=int, - help="Exec timeout ot use (in sec).") + help="Exec timeout to use (in sec).") + parser.add_argument("--exec-timeout-factor", dest="exec_timeout_factor", type=float, + help="Exec timeout factor to use (in sec).") parser.add_argument("--out-file", dest="outfile", help="File to write configuration to.") + parser.add_argument("--timeout-overrides", dest="timeout_overrides_file", + default=DEFAULT_TIMEOUT_OVERRIDES, + help="File containing timeout overrides to use.") + parser.add_argument("--evg-api-config", dest="evg_api_config", + default=DEFAULT_EVERGREEN_AUTH_CONFIG, help="Evergreen API config file.") + parser.add_argument("--evg-project-config", dest="evg_project_config", + default=DEFAULT_EVERGREEN_CONFIG, help="Evergreen project config file.") options = parser.parse_args() + end_date = datetime.now() + start_date = end_date - HISTORY_LOOKBACK + timeout_override = timedelta(seconds=options.timeout) if options.timeout else None exec_timeout_override = timedelta( seconds=options.exec_timeout) if options.exec_timeout else None - task_timeout = determine_timeout(options.task, options.variant, timeout_override, - exec_timeout_override, options.evg_alias) - output_timeout(task_timeout, options.outfile) + + task_name = determine_task_base_name(options.task, options.variant) + timeout_overrides = TimeoutOverrides.from_yaml_file( + os.path.expanduser(options.timeout_overrides_file)) + + enable_logging(verbose=False) + + def dependencies(binder: inject.Binder) -> None: + binder.bind( + EvergreenApi, + RetryingEvergreenApi.get_api(config_file=os.path.expanduser(options.evg_api_config))) + binder.bind(TimeoutSettings, TimeoutSettings(start_date=start_date, end_date=end_date)) + binder.bind(TimeoutOverrides, timeout_overrides) + binder.bind(EvergreenProjectConfig, + parse_evergreen_file(os.path.expanduser(options.evg_project_config))) + + inject.configure(dependencies) + + task_timeout_orchestrator = inject.instance(TaskTimeoutOrchestrator) + 
task_timeout_orchestrator.determine_timeouts( + timeout_override, exec_timeout_override, options.outfile, task_name, options.variant, + options.evg_alias, options.suite_name, options.exec_timeout_factor) if __name__ == "__main__": diff --git a/buildscripts/task_generation/suite_split.py b/buildscripts/task_generation/suite_split.py index e0b3cfbb449..5e1e9d32115 100644 --- a/buildscripts/task_generation/suite_split.py +++ b/buildscripts/task_generation/suite_split.py @@ -13,7 +13,7 @@ from evergreen import EvergreenApi from buildscripts.task_generation.resmoke_proxy import ResmokeProxyService from buildscripts.task_generation.suite_split_strategies import SplitStrategy, FallbackStrategy -from buildscripts.task_generation.timeout import TimeoutEstimate +from buildscripts.timeouts.timeout import TimeoutEstimate from buildscripts.util import taskname from buildscripts.util.teststats import HistoricTaskData, TestRuntime, normalize_test_name diff --git a/buildscripts/task_generation/task_types/resmoke_tasks.py b/buildscripts/task_generation/task_types/resmoke_tasks.py index 7ceab680265..5ddd4f9a541 100644 --- a/buildscripts/task_generation/task_types/resmoke_tasks.py +++ b/buildscripts/task_generation/task_types/resmoke_tasks.py @@ -14,7 +14,7 @@ from buildscripts.task_generation.task_types.gentask_options import GenTaskOptio from buildscripts.task_generation.task_types.models.resmoke_task_model import ResmokeTask from buildscripts.task_generation.task_types.multiversion_decorator import MultiversionGenTaskDecorator, \ MultiversionDecoratorParams -from buildscripts.task_generation.timeout import TimeoutEstimate +from buildscripts.timeouts.timeout import TimeoutEstimate LOGGER = structlog.getLogger(__name__) diff --git a/buildscripts/tests/test_evergreen_task_timeout.py b/buildscripts/tests/test_evergreen_task_timeout.py index 3728885f479..2043de695c7 100644 --- a/buildscripts/tests/test_evergreen_task_timeout.py +++ b/buildscripts/tests/test_evergreen_task_timeout.py @@ 
-1,46 +1,213 @@ """Unit tests for the evergreen_task_timeout script.""" -from datetime import timedelta import unittest +from datetime import timedelta +from unittest.mock import MagicMock import buildscripts.evergreen_task_timeout as under_test +from buildscripts.ciconfig.evergreen import EvergreenProjectConfig +from buildscripts.timeouts.timeout_service import TimeoutService + +# pylint: disable=missing-docstring,no-self-use,invalid-name,protected-access + + +class TestTimeoutOverride(unittest.TestCase): + def test_exec_timeout_should_be_settable(self): + timeout_override = under_test.TimeoutOverride(task="my task", exec_timeout=42) + + timeout = timeout_override.get_exec_timeout() + + self.assertIsNotNone(timeout) + self.assertEqual(42 * 60, timeout.total_seconds()) + + def test_exec_timeout_should_default_to_none(self): + timeout_override = under_test.TimeoutOverride(task="my task") + + timeout = timeout_override.get_exec_timeout() + + self.assertIsNone(timeout) + + def test_idle_timeout_should_be_settable(self): + timeout_override = under_test.TimeoutOverride(task="my task", idle_timeout=42) + + timeout = timeout_override.get_idle_timeout() + + self.assertIsNotNone(timeout) + self.assertEqual(42 * 60, timeout.total_seconds()) + + def test_idle_timeout_should_default_to_none(self): + timeout_override = under_test.TimeoutOverride(task="my task") + + timeout = timeout_override.get_idle_timeout() + + self.assertIsNone(timeout) + + +class TestTimeoutOverrides(unittest.TestCase): + def test_looking_up_a_non_existing_override_should_return_none(self): + timeout_overrides = under_test.TimeoutOverrides(overrides={}) -# pylint: disable=missing-docstring,no-self-use + self.assertIsNone(timeout_overrides.lookup_exec_override("bv", "task")) + self.assertIsNone(timeout_overrides.lookup_idle_override("bv", "task")) + def test_looking_up_a_duplicate_override_should_raise_error(self): + timeout_overrides = under_test.TimeoutOverrides( + overrides={ + "bv": [{ + "task": 
"task_name", + "exec_timeout": 42, + "idle_timeout": 10, + }, { + "task": "task_name", + "exec_timeout": 314, + "idle_timeout": 20, + }] + }) -class DetermineTimeoutTest(unittest.TestCase): + with self.assertRaises(ValueError): + self.assertIsNone(timeout_overrides.lookup_exec_override("bv", "task_name")) + + with self.assertRaises(ValueError): + self.assertIsNone(timeout_overrides.lookup_idle_override("bv", "task_name")) + + def test_looking_up_an_exec_override_should_work(self): + timeout_overrides = under_test.TimeoutOverrides( + overrides={ + "bv": [ + { + "task": "another_task", + "exec_timeout": 314, + "idle_timeout": 20, + }, + { + "task": "task_name", + "exec_timeout": 42, + }, + ] + }) + + self.assertEqual(42 * 60, + timeout_overrides.lookup_exec_override("bv", "task_name").total_seconds()) + + def test_looking_up_an_idle_override_should_work(self): + timeout_overrides = under_test.TimeoutOverrides( + overrides={ + "bv": [ + { + "task": "another_task", + "exec_timeout": 314, + "idle_timeout": 20, + }, + { + "task": "task_name", + "idle_timeout": 10, + }, + ] + }) + + self.assertEqual(10 * 60, + timeout_overrides.lookup_idle_override("bv", "task_name").total_seconds()) + + +class TestDetermineExecTimeout(unittest.TestCase): def test_timeout_used_if_specified(self): + mock_timeout_overrides = under_test.TimeoutOverrides(overrides={}) + orchestrator = under_test.TaskTimeoutOrchestrator( + timeout_service=MagicMock(spec_set=TimeoutService), + timeout_overrides=mock_timeout_overrides, + evg_project_config=MagicMock(spec_set=EvergreenProjectConfig)) timeout = timedelta(seconds=42) self.assertEqual( - under_test.determine_timeout("task_name", "variant", None, timeout), timeout) + orchestrator.determine_exec_timeout("task_name", "variant", None, timeout), timeout) def test_default_is_returned_with_no_timeout(self): + mock_timeout_overrides = under_test.TimeoutOverrides(overrides={}) + orchestrator = under_test.TaskTimeoutOrchestrator( + 
timeout_service=MagicMock(spec_set=TimeoutService), + timeout_overrides=mock_timeout_overrides, + evg_project_config=MagicMock(spec_set=EvergreenProjectConfig)) self.assertEqual( - under_test.determine_timeout("task_name", "variant"), + orchestrator.determine_exec_timeout("task_name", "variant"), under_test.DEFAULT_NON_REQUIRED_BUILD_TIMEOUT) def test_default_is_returned_with_timeout_at_zero(self): + mock_timeout_overrides = under_test.TimeoutOverrides(overrides={}) + orchestrator = under_test.TaskTimeoutOrchestrator( + timeout_service=MagicMock(spec_set=TimeoutService), + timeout_overrides=mock_timeout_overrides, + evg_project_config=MagicMock(spec_set=EvergreenProjectConfig)) self.assertEqual( - under_test.determine_timeout("task_name", "variant", timedelta(seconds=0)), + orchestrator.determine_exec_timeout("task_name", "variant", timedelta(seconds=0)), under_test.DEFAULT_NON_REQUIRED_BUILD_TIMEOUT) def test_default_required_returned_on_required_variants(self): + mock_timeout_overrides = under_test.TimeoutOverrides(overrides={}) + orchestrator = under_test.TaskTimeoutOrchestrator( + timeout_service=MagicMock(spec_set=TimeoutService), + timeout_overrides=mock_timeout_overrides, + evg_project_config=MagicMock(spec_set=EvergreenProjectConfig)) self.assertEqual( - under_test.determine_timeout("task_name", "variant-required"), + orchestrator.determine_exec_timeout("task_name", "variant-required"), under_test.DEFAULT_REQUIRED_BUILD_TIMEOUT) def test_task_specific_timeout(self): + mock_timeout_overrides = under_test.TimeoutOverrides( + overrides={"linux-64-debug": [{"task": "auth", "exec_timeout": 60}]}) + orchestrator = under_test.TaskTimeoutOrchestrator( + timeout_service=MagicMock(spec_set=TimeoutService), + timeout_overrides=mock_timeout_overrides, + evg_project_config=MagicMock(spec_set=EvergreenProjectConfig)) self.assertEqual( - under_test.determine_timeout("auth", "linux-64-debug"), timedelta(minutes=60)) + orchestrator.determine_exec_timeout("auth", 
"linux-64-debug"), timedelta(minutes=60)) def test_commit_queue_items_use_commit_queue_timeout(self): - timeout = under_test.determine_timeout("auth", "variant", - evg_alias=under_test.COMMIT_QUEUE_ALIAS) + mock_timeout_overrides = under_test.TimeoutOverrides(overrides={}) + orchestrator = under_test.TaskTimeoutOrchestrator( + timeout_service=MagicMock(spec_set=TimeoutService), + timeout_overrides=mock_timeout_overrides, + evg_project_config=MagicMock(spec_set=EvergreenProjectConfig)) + timeout = orchestrator.determine_exec_timeout("auth", "variant", + evg_alias=under_test.COMMIT_QUEUE_ALIAS) self.assertEqual(timeout, under_test.COMMIT_QUEUE_TIMEOUT) def test_use_idle_timeout_if_greater_than_exec_timeout(self): + mock_timeout_overrides = under_test.TimeoutOverrides(overrides={}) + orchestrator = under_test.TaskTimeoutOrchestrator( + timeout_service=MagicMock(spec_set=TimeoutService), + timeout_overrides=mock_timeout_overrides, + evg_project_config=MagicMock(spec_set=EvergreenProjectConfig)) idle_timeout = timedelta(hours=2) exec_timeout = timedelta(minutes=10) - timeout = under_test.determine_timeout("task_name", "variant", idle_timeout=idle_timeout, - exec_timeout=exec_timeout) + timeout = orchestrator.determine_exec_timeout( + "task_name", "variant", idle_timeout=idle_timeout, exec_timeout=exec_timeout) self.assertEqual(timeout, idle_timeout) + + +class TestDetermineIdleTimeout(unittest.TestCase): + def test_timeout_used_if_specified(self): + mock_timeout_overrides = under_test.TimeoutOverrides(overrides={}) + orchestrator = under_test.TaskTimeoutOrchestrator( + timeout_service=MagicMock(spec_set=TimeoutService), + timeout_overrides=mock_timeout_overrides, + evg_project_config=MagicMock(spec_set=EvergreenProjectConfig)) + timeout = timedelta(seconds=42) + self.assertEqual( + orchestrator.determine_idle_timeout("task_name", "variant", timeout), timeout) + + def test_default_is_returned_with_no_timeout(self): + mock_timeout_overrides = 
under_test.TimeoutOverrides(overrides={}) + orchestrator = under_test.TaskTimeoutOrchestrator( + timeout_service=MagicMock(spec_set=TimeoutService), + timeout_overrides=mock_timeout_overrides, + evg_project_config=MagicMock(spec_set=EvergreenProjectConfig)) + self.assertIsNone(orchestrator.determine_idle_timeout("task_name", "variant")) + + def test_task_specific_timeout(self): + mock_timeout_overrides = under_test.TimeoutOverrides( + overrides={"linux-64-debug": [{"task": "auth", "idle_timeout": 60}]}) + orchestrator = under_test.TaskTimeoutOrchestrator( + timeout_service=MagicMock(spec_set=TimeoutService), + timeout_overrides=mock_timeout_overrides, + evg_project_config=MagicMock(spec_set=EvergreenProjectConfig)) + self.assertEqual( + orchestrator.determine_idle_timeout("auth", "linux-64-debug"), timedelta(minutes=60)) diff --git a/buildscripts/tests/timeouts/__init__.py b/buildscripts/tests/timeouts/__init__.py new file mode 100644 index 00000000000..4b7a2bb941b --- /dev/null +++ b/buildscripts/tests/timeouts/__init__.py @@ -0,0 +1 @@ +"""Empty.""" diff --git a/buildscripts/tests/task_generation/test_timeout.py b/buildscripts/tests/timeouts/test_timeout.py index 5d9fb48c6e6..1d450aed913 100644 --- a/buildscripts/tests/task_generation/test_timeout.py +++ b/buildscripts/tests/timeouts/test_timeout.py @@ -1,7 +1,7 @@ """Unit tests for timeout.py.""" import unittest -from buildscripts.task_generation import timeout as under_test +from buildscripts.timeouts import timeout as under_test # pylint: disable=missing-docstring,invalid-name,unused-argument,no-self-use,protected-access,no-value-for-parameter diff --git a/buildscripts/tests/timeouts/test_timeout_service.py b/buildscripts/tests/timeouts/test_timeout_service.py new file mode 100644 index 00000000000..bb0dd8a0c3e --- /dev/null +++ b/buildscripts/tests/timeouts/test_timeout_service.py @@ -0,0 +1,258 @@ +"""Unit tests for timeout_service.py.""" +import random +import unittest +from datetime import datetime, 
timedelta +from unittest.mock import MagicMock + +from requests.exceptions import HTTPError +from evergreen import EvergreenApi + +import buildscripts.timeouts.timeout_service as under_test +from buildscripts.task_generation.resmoke_proxy import ResmokeProxyService +from buildscripts.util.teststats import HistoricTaskData + +# pylint: disable=missing-docstring,no-self-use,invalid-name,protected-access + + +def build_mock_service(evg_api=None, resmoke_proxy=None): + end_date = datetime.now() + start_date = end_date - timedelta(weeks=2) + timeout_settings = under_test.TimeoutSettings( + end_date=end_date, + start_date=start_date, + ) + return under_test.TimeoutService( + evg_api=evg_api if evg_api else MagicMock(spec_set=EvergreenApi), + resmoke_proxy=resmoke_proxy if resmoke_proxy else MagicMock(spec_set=ResmokeProxyService), + timeout_settings=timeout_settings) + + +def tst_stat_mock(file, duration, pass_count): + return MagicMock(test_file=file, avg_duration_pass=duration, num_pass=pass_count) + + +class TestGetTimeoutEstimate(unittest.TestCase): + def test_no_stats_should_return_default_timeout(self): + mock_evg_api = MagicMock(spec_set=EvergreenApi) + mock_evg_api.test_stats_by_project.return_value = [] + timeout_service = build_mock_service(evg_api=mock_evg_api) + timeout_params = under_test.TimeoutParams( + evg_project="my project", + build_variant="bv", + task_name="my task", + suite_name="my suite", + is_asan=False, + ) + + timeout = timeout_service.get_timeout_estimate(timeout_params) + + self.assertFalse(timeout.is_specified()) + + def test_a_test_with_missing_history_should_cause_a_default_timeout(self): + mock_evg_api = MagicMock(spec_set=EvergreenApi) + test_stats = [tst_stat_mock(f"test_{i}.js", 60, 1) for i in range(30)] + mock_evg_api.test_stats_by_project.return_value = test_stats + mock_resmoke_proxy = MagicMock(spec_set=ResmokeProxyService) + mock_resmoke_proxy.list_tests.return_value = ["test_with_no_stats.js"] + timeout_service = 
build_mock_service(evg_api=mock_evg_api, resmoke_proxy=mock_resmoke_proxy) + timeout_params = under_test.TimeoutParams( + evg_project="my project", + build_variant="bv", + task_name="my task", + suite_name="my suite", + is_asan=False, + ) + + timeout = timeout_service.get_timeout_estimate(timeout_params) + + self.assertFalse(timeout.is_specified()) + + def test_a_test_with_zero_runtime_history_should_cause_a_default_timeout(self): + mock_evg_api = MagicMock(spec_set=EvergreenApi) + test_stats = [tst_stat_mock(f"test_{i}.js", 60, 1) for i in range(30)] + test_stats.append(tst_stat_mock("zero.js", 0.0, 1)) + mock_evg_api.test_stats_by_project.return_value = test_stats + mock_resmoke_proxy = MagicMock(spec_set=ResmokeProxyService) + mock_resmoke_proxy.list_tests.return_value = [ts.test_file for ts in test_stats] + timeout_service = build_mock_service(evg_api=mock_evg_api, resmoke_proxy=mock_resmoke_proxy) + timeout_params = under_test.TimeoutParams( + evg_project="my project", + build_variant="bv", + task_name="my task", + suite_name="my suite", + is_asan=False, + ) + + timeout = timeout_service.get_timeout_estimate(timeout_params) + + self.assertFalse(timeout.is_specified()) + + def test_all_tests_with_runtime_history_should_use_custom_timeout(self): + mock_evg_api = MagicMock(spec_set=EvergreenApi) + n_tests = 30 + test_runtime = 600 + test_stats = [tst_stat_mock(f"test_{i}.js", test_runtime, 1) for i in range(n_tests)] + mock_evg_api.test_stats_by_project.return_value = test_stats + mock_resmoke_proxy = MagicMock(spec_set=ResmokeProxyService) + mock_resmoke_proxy.list_tests.return_value = [ts.test_file for ts in test_stats] + timeout_service = build_mock_service(evg_api=mock_evg_api, resmoke_proxy=mock_resmoke_proxy) + timeout_params = under_test.TimeoutParams( + evg_project="my project", + build_variant="bv", + task_name="my task", + suite_name="my suite", + is_asan=False, + ) + + timeout = timeout_service.get_timeout_estimate(timeout_params) + + 
self.assertTrue(timeout.is_specified()) + self.assertEqual(1860, timeout.calculate_test_timeout(1)) + self.assertEqual(54180, timeout.calculate_task_timeout(1)) + + +class TestGetTaskHookOverhead(unittest.TestCase): + def test_no_stats_should_return_zero(self): + timeout_service = build_mock_service() + + overhead = timeout_service.get_task_hook_overhead("suite", is_asan=False, test_count=30, + historic_stats=None) + + self.assertEqual(0.0, overhead) + + def test_stats_with_no_clean_every_n_should_return_zero(self): + timeout_service = build_mock_service() + test_stats = HistoricTaskData.from_stats_list( + [tst_stat_mock(f"test_{i}.js", 60, 1) for i in range(30)]) + + overhead = timeout_service.get_task_hook_overhead("suite", is_asan=False, test_count=30, + historic_stats=test_stats) + + self.assertEqual(0.0, overhead) + + def test_stats_with_clean_every_n_should_return_overhead(self): + test_count = 30 + runtime = 25 + timeout_service = build_mock_service() + test_stat_list = [tst_stat_mock(f"test_{i}.js", 60, 1) for i in range(test_count)] + test_stat_list.extend([ + tst_stat_mock(f"test_{i}:{under_test.CLEAN_EVERY_N_HOOK}", runtime, 1) + for i in range(10) + ]) + random.shuffle(test_stat_list) + test_stats = HistoricTaskData.from_stats_list(test_stat_list) + + overhead = timeout_service.get_task_hook_overhead( + "suite", is_asan=True, test_count=test_count, historic_stats=test_stats) + + self.assertEqual(runtime * test_count, overhead) + + +class TestLookupHistoricStats(unittest.TestCase): + def test_no_stats_from_evergreen_should_return_none(self): + mock_evg_api = MagicMock(spec_set=EvergreenApi) + mock_evg_api.test_stats_by_project.return_value = [] + timeout_service = build_mock_service(evg_api=mock_evg_api) + timeout_params = under_test.TimeoutParams( + evg_project="my project", + build_variant="bv", + task_name="my task", + suite_name="my suite", + is_asan=False, + ) + + stats = timeout_service.lookup_historic_stats(timeout_params) + + 
self.assertIsNone(stats) + + def test_errors_from_evergreen_should_return_none(self): + mock_evg_api = MagicMock(spec_set=EvergreenApi) + mock_evg_api.test_stats_by_project.side_effect = HTTPError("failed to connect") + timeout_service = build_mock_service(evg_api=mock_evg_api) + timeout_params = under_test.TimeoutParams( + evg_project="my project", + build_variant="bv", + task_name="my task", + suite_name="my suite", + is_asan=False, + ) + + stats = timeout_service.lookup_historic_stats(timeout_params) + + self.assertIsNone(stats) + + def test_stats_from_evergreen_should_return_the_stats(self): + mock_evg_api = MagicMock(spec_set=EvergreenApi) + test_stats = [tst_stat_mock(f"test_{i}.js", 60, 1) for i in range(100)] + mock_evg_api.test_stats_by_project.return_value = test_stats + timeout_service = build_mock_service(evg_api=mock_evg_api) + timeout_params = under_test.TimeoutParams( + evg_project="my project", + build_variant="bv", + task_name="my task", + suite_name="my suite", + is_asan=False, + ) + + stats = timeout_service.lookup_historic_stats(timeout_params) + + self.assertIsNotNone(stats) + self.assertEqual(len(test_stats), len(stats.historic_test_results)) + + +class TestGetCleanEveryNCadence(unittest.TestCase): + def test_clean_every_n_cadence_on_asan(self): + timeout_service = build_mock_service() + + cadence = timeout_service._get_clean_every_n_cadence("suite", True) + + self.assertEqual(1, cadence) + + def test_clean_every_n_cadence_from_hook_config(self): + expected_n = 42 + mock_resmoke_proxy = MagicMock() + mock_resmoke_proxy.read_suite_config.return_value = { + "executor": { + "hooks": [{ + "class": "hook1", + }, { + "class": under_test.CLEAN_EVERY_N_HOOK, + "n": expected_n, + }] + } + } + timeout_service = build_mock_service(resmoke_proxy=mock_resmoke_proxy) + + cadence = timeout_service._get_clean_every_n_cadence("suite", False) + + self.assertEqual(expected_n, cadence) + + def test_clean_every_n_cadence_no_n_in_hook_config(self): + 
mock_resmoke_proxy = MagicMock() + mock_resmoke_proxy.read_suite_config.return_value = { + "executor": { + "hooks": [{ + "class": "hook1", + }, { + "class": under_test.CLEAN_EVERY_N_HOOK, + }] + } + } + timeout_service = build_mock_service(resmoke_proxy=mock_resmoke_proxy) + + cadence = timeout_service._get_clean_every_n_cadence("suite", False) + + self.assertEqual(1, cadence) + + def test_clean_every_n_cadence_no_hook_config(self): + mock_resmoke_proxy = MagicMock() + mock_resmoke_proxy.read_suite_config.return_value = { + "executor": {"hooks": [{ + "class": "hook1", + }, ]} + } + timeout_service = build_mock_service(resmoke_proxy=mock_resmoke_proxy) + + cadence = timeout_service._get_clean_every_n_cadence("suite", False) + + self.assertEqual(1, cadence) diff --git a/buildscripts/tests/util/test_taskname.py b/buildscripts/tests/util/test_taskname.py index 22ab279066a..7f3296ca1aa 100644 --- a/buildscripts/tests/util/test_taskname.py +++ b/buildscripts/tests/util/test_taskname.py @@ -4,7 +4,7 @@ import unittest import buildscripts.util.taskname as under_test -# pylint: disable=missing-docstring,protected-access +# pylint: disable=missing-docstring,protected-access,invalid-name class TestNameTask(unittest.TestCase): @@ -24,3 +24,29 @@ class TestRemoveGenSuffix(unittest.TestCase): input_task_name = "sharded_multi_stmt_txn_jscore_passthroug" self.assertEqual("sharded_multi_stmt_txn_jscore_passthroug", under_test.remove_gen_suffix(input_task_name)) + + +class TestDetermineTaskBaseName(unittest.TestCase): + def test_task_name_with_build_variant_should_strip_bv_and_sub_task_index(self): + bv = "enterprise-rhel-80-64-bit-dynamic-required" + task_name = f"auth_23_{bv}" + + base_task_name = under_test.determine_task_base_name(task_name, bv) + + self.assertEqual("auth", base_task_name) + + def test_task_name_without_build_variant_should_strip_sub_task_index(self): + bv = "enterprise-rhel-80-64-bit-dynamic-required" + task_name = "auth_314" + + base_task_name = 
under_test.determine_task_base_name(task_name, bv) + + self.assertEqual("auth", base_task_name) + + def test_task_name_without_build_variant_or_subtask_index_should_self(self): + bv = "enterprise-rhel-80-64-bit-dynamic-required" + task_name = "auth" + + base_task_name = under_test.determine_task_base_name(task_name, bv) + + self.assertEqual("auth", base_task_name) diff --git a/buildscripts/timeouts/__init__.py b/buildscripts/timeouts/__init__.py new file mode 100644 index 00000000000..4b7a2bb941b --- /dev/null +++ b/buildscripts/timeouts/__init__.py @@ -0,0 +1 @@ +"""Empty.""" diff --git a/buildscripts/task_generation/timeout.py b/buildscripts/timeouts/timeout.py index 261c2a8b82d..3e3440f9c5b 100644 --- a/buildscripts/task_generation/timeout.py +++ b/buildscripts/timeouts/timeout.py @@ -1,11 +1,10 @@ """Timeout information for generating tasks.""" import math from datetime import timedelta -from inspect import getframeinfo, currentframe +from inspect import currentframe, getframeinfo from typing import NamedTuple, Optional import structlog - from buildscripts.patch_builds.task_generation import TimeoutInfo LOGGER = structlog.getLogger(__name__) diff --git a/buildscripts/timeouts/timeout_service.py b/buildscripts/timeouts/timeout_service.py new file mode 100644 index 00000000000..8c0d5ad58cd --- /dev/null +++ b/buildscripts/timeouts/timeout_service.py @@ -0,0 +1,189 @@ +"""Service for determining task timeouts.""" +from datetime import datetime +from typing import Any, Dict, NamedTuple, Optional + +import inject +import structlog +from buildscripts.task_generation.resmoke_proxy import ResmokeProxyService +from buildscripts.timeouts.timeout import TimeoutEstimate +from buildscripts.util.teststats import HistoricTaskData +from evergreen import EvergreenApi + +LOGGER = structlog.get_logger(__name__) +CLEAN_EVERY_N_HOOK = "CleanEveryN" + + +class TimeoutParams(NamedTuple): + """ + Parameters about task being run. + + * evg_project: Evergreen project. 
+ * build_variant: Evergreen build variant. + * task_name: Evergreen task_name. + * suite_name: Test Suite being run. + * is_asan: Whether this run is part of an asan build. + """ + + evg_project: str + build_variant: str + task_name: str + suite_name: str + is_asan: bool + + +class TimeoutSettings(NamedTuple): + """Settings for determining timeouts.""" + + start_date: datetime + end_date: datetime + + +class TimeoutService: + """A service for determining task timeouts.""" + + @inject.autoparams() + def __init__(self, evg_api: EvergreenApi, resmoke_proxy: ResmokeProxyService, + timeout_settings: TimeoutSettings) -> None: + """ + Initialize the service. + + :param evg_api: Evergreen API client. + :param resmoke_proxy: Proxy to query resmoke. + :param timeout_settings: Settings for how timeouts are calculated. + """ + self.evg_api = evg_api + self.resmoke_proxy = resmoke_proxy + self.timeout_settings = timeout_settings + + def get_timeout_estimate(self, timeout_params: TimeoutParams) -> TimeoutEstimate: + """ + Calculate the timeout estimate for the given task based on historic test results. + + :param timeout_params: Details about the task to query. + :return: Timeouts to use based on historic test results. + """ + historic_stats = self.lookup_historic_stats(timeout_params) + if not historic_stats: + return TimeoutEstimate.no_timeouts() + + test_set = set(self.resmoke_proxy.list_tests(timeout_params.suite_name)) + test_runtimes = [ + stat for stat in historic_stats.get_tests_runtimes() if stat.test_name in test_set + ] + test_runtime_set = {test.test_name for test in test_runtimes} + for test in test_set: + if test not in test_runtime_set: + # If we don't have historic runtime information for all the tests, we cannot + # reliable determine a timeout, so fallback to a default timeout. 
+ LOGGER.warning( + "Could not find historic runtime information for test, using default timeout", + test=test) + return TimeoutEstimate.no_timeouts() + + total_runtime = 0.0 + max_runtime = 0.0 + + for runtime in test_runtimes: + if runtime.runtime > 0.0: + total_runtime += runtime.runtime + max_runtime = max(max_runtime, runtime.runtime) + else: + LOGGER.warning("Found a test with 0 runtime, using default timeouts", + test=runtime.test_name) + # We found a test with a runtime of 0, which indicates that it does not have a + # proper runtime history, so fall back to a default timeout. + return TimeoutEstimate.no_timeouts() + + hook_overhead = self.get_task_hook_overhead( + timeout_params.suite_name, timeout_params.is_asan, len(test_set), historic_stats) + total_runtime += hook_overhead + + return TimeoutEstimate(max_test_runtime=max_runtime, expected_task_runtime=total_runtime) + + def get_task_hook_overhead(self, suite_name: str, is_asan: bool, test_count: int, + historic_stats: Optional[HistoricTaskData]) -> float: + """ + Add how much overhead task-level hooks each suite should account for. + + Certain test hooks need to be accounted for on the task level instead of the test level + in order to calculate accurate timeouts. So we will add details about those hooks to + each suite here. + + :param suite_name: Name of suite being generated. + :param is_asan: Whether ASAN is being used. + :param test_count: Number of tests in sub-suite. + :param historic_stats: Historic runtime data of the suite. + """ + # The CleanEveryN hook is run every 'N' tests. The runtime of the + # hook will be associated with whichever test happens to be running, which could be + # different every run. So we need to take its runtime into account at the task level. 
+ if historic_stats is None: + return 0.0 + + clean_every_n_cadence = self._get_clean_every_n_cadence(suite_name, is_asan) + avg_clean_every_n_runtime = historic_stats.get_avg_hook_runtime(CLEAN_EVERY_N_HOOK) + LOGGER.debug("task hook overhead", cadence=clean_every_n_cadence, + runtime=avg_clean_every_n_runtime, is_asan=is_asan) + if avg_clean_every_n_runtime != 0: + n_expected_runs = test_count / clean_every_n_cadence + return n_expected_runs * avg_clean_every_n_runtime + return 0.0 + + def lookup_historic_stats(self, timeout_params: TimeoutParams) -> Optional[HistoricTaskData]: + """ + Lookup historic test results stats for the given task. + + :param timeout_params: Details about the task to lookup. + :return: Historic test results if they exist. + """ + try: + evg_stats = HistoricTaskData.from_evg( + self.evg_api, timeout_params.evg_project, self.timeout_settings.start_date, + self.timeout_settings.end_date, timeout_params.task_name, + timeout_params.build_variant) + if not evg_stats: + LOGGER.warning("No historic runtime information available") + return None + return evg_stats + except Exception: # pylint: disable=broad-except + # If we have any trouble getting the historic runtime information, log the issue, but + # fall back to default timeouts instead of failing. + LOGGER.warning("Error querying history runtime information from evergreen", + exc_info=True) + return None + + def _get_clean_every_n_cadence(self, suite_name: str, is_asan: bool) -> int: + """ + Get the N value for the CleanEveryN hook. + + :param suite_name: Name of suite being generated. + :param is_asan: Whether ASAN is being used. + :return: How frequently the CleanEveryN hook is run. + """ + # Default to 1, which is the worst case meaning CleanEveryN would run for every test. + clean_every_n_cadence = 1 + if is_asan: + # ASAN runs hard-code N to 1. See `resmokelib/testing/hooks/cleanup.py`. 
+ return clean_every_n_cadence + + clean_every_n_config = self._get_hook_config(suite_name, CLEAN_EVERY_N_HOOK) + if clean_every_n_config: + clean_every_n_cadence = clean_every_n_config.get("n", 1) + + return clean_every_n_cadence + + def _get_hook_config(self, suite_name: str, hook_name: str) -> Optional[Dict[str, Any]]: + """ + Get the configuration for the given hook. + + :param hook_name: Name of hook to query. + :return: Configuration for hook, if it exists. + """ + hooks_config = self.resmoke_proxy.read_suite_config(suite_name).get("executor", + {}).get("hooks") + if hooks_config: + for hook in hooks_config: + if hook.get("class") == hook_name: + return hook + + return None diff --git a/buildscripts/util/taskname.py b/buildscripts/util/taskname.py index 7dd3b12685b..784fc6d6555 100644 --- a/buildscripts/util/taskname.py +++ b/buildscripts/util/taskname.py @@ -1,6 +1,7 @@ """Functions for working with resmoke task names.""" import math +import re GEN_SUFFIX = "_gen" @@ -36,3 +37,25 @@ def remove_gen_suffix(task_name: str) -> str: if task_name.endswith(GEN_SUFFIX): return task_name[:-4] return task_name + + +def determine_task_base_name(task_name: str, build_variant: str) -> str: + """ + Determine the base name of a task. + + For generated tasks the base name will have the build variant and sub-task index + stripped off. For other tasks, it is the unmodified task_name. + + :param task_name: Name of task to get base name of. + :param build_variant: Build variant that may be included in task name. + :return: Base name of given task. 
+ """ + match = re.match(f"(.*)_([0-9]+|misc)_{build_variant}", task_name) + if match: + return match.group(1) + + match = re.match(r"(.*)_([0-9]+|misc)", task_name) + if match: + return match.group(1) + + return task_name diff --git a/docs/evergreen-testing/index.md b/docs/evergreen-testing/index.md new file mode 100644 index 00000000000..f57692ade9a --- /dev/null +++ b/docs/evergreen-testing/index.md @@ -0,0 +1,5 @@ +# Testing in Evergreen + +Documentation about how MongoDB is tested in Evergreen. + +* [Task Timeouts](task_timeouts.md) diff --git a/docs/evergreen-testing/task_timeouts.md b/docs/evergreen-testing/task_timeouts.md new file mode 100644 index 00000000000..e370aad22c9 --- /dev/null +++ b/docs/evergreen-testing/task_timeouts.md @@ -0,0 +1,35 @@ +# Evergreen Task Timeouts + +## Types of timeouts + +There are two types of timeouts that [evergreen supports](https://github.com/evergreen-ci/evergreen/wiki/Project-Commands#timeoutupdate): + +* **Exec timeout**: The _exec_ timeout is the overall timeout for a task. Once the total runtime for +a task hits this value, the timeout logic will be triggered. This value is specified by +**exec_timeout_secs** in the evergreen configuration. +* **Idle timeout**: The _idle_ timeout is the amount of time in which evergreen will wait for +output to be created before it considers the task hung and triggers timeout logic. This value +is specified by **timeout_secs** in the evergreen configuration. + +**Note**: In most cases, **exec_timeout** is the more useful of the timeouts. + +## Setting the timeout for a task + +There are a few ways in which the timeout can be determined for a task running in evergreen. + +* **Specified in 'etc/evergreen.yml'**: Timeout can be specified directly in the 'evergreen.yml' file, +both on tasks and build variants. 
This can be useful for setting default timeout values, but is limited +since different build variants frequently have different runtime characteristics and it is not possible +to set timeouts for a task running on a specific build variant. + +* **etc/evergreen_timeouts.yml**: The 'etc/evergreen_timeouts.yml' file is for overriding timeouts +for specific tasks on specific build variants. This provides a work-around for the limitations of +specifying the timeouts directly in the 'evergreen.yml'. In order to use this method, the task +must run the "determine task timeout" and "update task timeout expansions" functions at the beginning +of the task's evergreen definition. Most resmoke tasks already do this. + +* **buildscripts/evergreen_task_timeout.py**: This is the script that reads the 'etc/evergreen_timeouts.yml' +file and calculates the timeout to use. Additionally, it will check the historic test results of the +task being run and see if there is enough information to calculate timeouts based on that. It can +also be used for more advanced ways of determining timeouts (e.g. the script is used to set much +more aggressive timeouts on tasks that are run in the commit-queue). 
diff --git a/etc/evergreen.yml b/etc/evergreen.yml index 108467e15d2..aea721dc962 100644 --- a/etc/evergreen.yml +++ b/etc/evergreen.yml @@ -1218,6 +1218,9 @@ functions: - *update_resmoke_jobs_expansions - *f_expansions_write - *configure_evergreen_api_credentials + - *determine_task_timeout + - *update_task_timeout_expansions + - *f_expansions_write - command: subprocess.exec params: binary: bash @@ -1243,6 +1246,7 @@ functions: "run tests": - *f_expansions_write + - *configure_evergreen_api_credentials - *determine_task_timeout - *update_task_timeout_expansions - *f_expansions_write @@ -2246,7 +2250,9 @@ tasks: - "./build/**.gcno" - "./etc/*san.suppressions" - "./etc/backports_required_for_multiversion_tests.yml" + - "./etc/evergreen_timeouts.yml" - "./etc/expansions.default.yml" + - "./etc/evergreen.yml" - "./etc/pip/**" - "./etc/repo_config.yaml" - "./etc/scons/**" diff --git a/etc/evergreen_timeouts.yml b/etc/evergreen_timeouts.yml new file mode 100644 index 00000000000..b25ce231d27 --- /dev/null +++ b/etc/evergreen_timeouts.yml @@ -0,0 +1,93 @@ +# This file defines timeouts in evergreen that will override the default timeouts. +# +# Each key under `overrides` provides the build variant where the override will occur. The +# override should include the `task` that should have its timeout overridden and either the +# `exec_timeout` to override or the `idle_timeout` to override. +# +# The timeouts should be specified in minutes. + +# Note: In order to make it easier to find existing entries, please try to keep the build variants +# in alphabetical order. + +overrides: + enterprise-macos: + - task: replica_sets_jscore_passthrough + exec_timeout: 150 # 2.5 hours + + enterprise-rhel-80-64-bit-coverage: + - task: replica_sets_jscore_passthrough + exec_timeout: 150 # 2.5 hours. + + enterprise-ubuntu2004-debug-tsan: + - task: run_unittests + exec_timeout: 24 + + enterprise-windows: + - task: replica_sets_jscore_passthrough + exec_timeout: 180 # 3 hours. 
+ + enterprise-windows-all-feature-flags-suggested: + - task: replica_sets_jscore_passthrough + exec_timeout: 180 # 3 hours. + - task: replica_sets_update_v1_oplog_jscore_passthrough + exec_timeout: 150 # 2.5 hours. + + enterprise-windows-inmem: + - task: replica_sets_jscore_passthrough + exec_timeout: 180 # 3 hours. + + enterprise-windows-required: + - task: replica_sets_jscore_passthrough + exec_timeout: 180 # 3 hours. + - task: replica_sets_update_v1_oplog_jscore_passthrough + exec_timeout: 150 # 2.5 hours. + + linux-64-debug: + - task: auth + exec_timeout: 60 # 1 hour. + + linux-64-debug-repeated-execution: + - task: run_unittests + exec_timeout: 120 # 2 hours. + + macos: + - task: replica_sets_jscore_passthrough + exec_timeout: 150 # 2.5 hours + + ubuntu1804-asan: + - task: run_unittests + exec_timeout: 24 + + ubuntu1804-debug-asan: + - task: run_unittests + exec_timeout: 24 + + ubuntu1804-debug-aubsan-lite: + - task: run_unittests + exec_timeout: 24 + + ubuntu1804-debug-ubsan: + - task: run_unittests + exec_timeout: 24 + + ubuntu1804-debug-suggested: + - task: replica_sets_jscore_passthrough + exec_timeout: 180 # 3 hours. + + ubuntu1804-ubsan: + - task: run_unittests + exec_timeout: 24 + + windows: + - task: replica_sets + exec_timeout: 180 # 3 hours. + - task: replica_sets_jscore_passthrough + exec_timeout: 150 # 2.5 hours. + + windows-debug-suggested: + - task: replica_sets_initsync_jscore_passthrough + exec_timeout: 150 # 2.5 hours. + - task: replica_sets_jscore_passthrough + exec_timeout: 180 # 3 hours. + - task: replica_sets_update_v1_oplog_jscore_passthrough + exec_timeout: 150 # 2.5 hours. 
diff --git a/evergreen/functions/task_timeout_determine.sh b/evergreen/functions/task_timeout_determine.sh index 645aedbc302..f63416b2374 100644 --- a/evergreen/functions/task_timeout_determine.sh +++ b/evergreen/functions/task_timeout_determine.sh @@ -5,11 +5,25 @@ cd src set -o verbose set -o errexit + +# Set the suite name to be the task name by default; unless overridden with the `suite` expansion. +suite_name=${task_name} +if [[ -n ${suite} ]]; then + suite_name=${suite} +fi + +timeout_factor="" +if [[ -n "${exec_timeout_factor}" ]]; then + timeout_factor="--exec-timeout-factor ${exec_timeout_factor}" +fi + activate_venv -$python buildscripts/evergreen_task_timeout.py \ +PATH=$PATH:$HOME:/ $python buildscripts/evergreen_task_timeout.py $timeout_factor \ --task-name ${task_name} \ + --suite-name ${suite_name} \ --build-variant ${build_variant} \ --evg-alias '${alias}' \ --timeout ${timeout_secs} \ --exec-timeout ${exec_timeout_secs} \ + --evg-api-config ./.evergreen.yml \ --out-file task_timeout_expansions.yml |