summaryrefslogtreecommitdiff
path: root/buildscripts/evergreen_task_timeout.py
blob: f35a6c6c89793288c16b19bf0e39fe1f732d9c6d (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
#!/usr/bin/env python3
"""Determine the timeout value a task should use in evergreen."""
from __future__ import annotations

import argparse
import math
import os
import shlex
import sys
from datetime import timedelta
from pathlib import Path
from typing import Dict, List, Optional

import inject
import structlog
import yaml
from pydantic import BaseModel
from evergreen import EvergreenApi, RetryingEvergreenApi

from buildscripts.ciconfig.evergreen import (EvergreenProjectConfig, parse_evergreen_file)
from buildscripts.task_generation.resmoke_proxy import ResmokeProxyService
from buildscripts.timeouts.timeout_service import (TimeoutParams, TimeoutService)
from buildscripts.util.cmdutils import enable_logging
from buildscripts.util.taskname import determine_task_base_name

# Module-wide structlog logger, named after this module.
LOGGER = structlog.get_logger(__name__)
# Default file locations used when the matching command line option is not given.
DEFAULT_EVERGREEN_AUTH_CONFIG = "~/.evergreen.yml"
DEFAULT_EVERGREEN_CONFIG = "etc/evergreen.yml"
DEFAULT_TIMEOUT_OVERRIDES = "etc/evergreen_timeouts.yml"

# Evergreen alias value that selects the commit-queue timeout.
COMMIT_QUEUE_ALIAS = "__commit_queue"
# Task name that receives the fixed unittest timeout.
UNITTEST_TASK = "run_unittests"

# Suites for which no historic timeout is calculated.
IGNORED_SUITES = {
    "integration_tests_replset",
    "integration_tests_replset_ssl_auth",
    "integration_tests_sharded",
    "integration_tests_standalone",
    "integration_tests_standalone_audit",
    "mongos_test",
    "server_selection_json_test",
}

# NOTE(review): presumably how far back task history is considered; it is not
# referenced by the code visible in this module -- confirm before relying on it.
HISTORY_LOOKBACK = timedelta(days=14)

# Fixed exec timeouts applied by the selection rules in determine_exec_timeout.
COMMIT_QUEUE_TIMEOUT = timedelta(minutes=40)
DEFAULT_REQUIRED_BUILD_TIMEOUT = timedelta(minutes=80)  # 1 hour 20 minutes
DEFAULT_NON_REQUIRED_BUILD_TIMEOUT = timedelta(minutes=120)  # 2 hours
# 2x the longest "run tests" phase for unittests as of c9bf1dbc9cc46e497b2f12b2d6685ef7348b0726,
# which is 5 mins 47 secs, excluding outliers below
UNITTESTS_TIMEOUT = timedelta(minutes=12)


class TimeoutOverride(BaseModel):
    """
    Specification for overriding a task timeout.

    * task: Name of task to override.
    * exec_timeout: Value to override exec timeout with, in whole minutes.
    * idle_timeout: Value to override idle timeout with, in whole minutes.
    """

    task: str
    exec_timeout: Optional[int] = None
    idle_timeout: Optional[int] = None

    @classmethod
    def from_seconds(cls, task: str, exec_timeout_secs: Optional[float],
                     idle_timeout_secs: Optional[float]) -> TimeoutOverride:
        """
        Create an instance of an override from seconds.

        The timeout fields hold integer minutes, so each value is rounded up to the next
        whole minute. Rounding up (rather than down) keeps the resulting timeout from being
        shorter than the one requested.

        :param task: Name of task the override applies to.
        :param exec_timeout_secs: Exec timeout in seconds; None or 0 means no exec override.
        :param idle_timeout_secs: Idle timeout in seconds; None or 0 means no idle override.
        :return: Override holding the timeouts converted to minutes.
        """

        def to_whole_minutes(seconds: Optional[float]) -> Optional[int]:
            """Convert seconds to whole minutes, rounding up; falsy input yields None."""
            if not seconds:
                return None
            # A plain division would hand a fractional value to an integer field.
            return math.ceil(seconds / 60)

        return cls(
            task=task,
            exec_timeout=to_whole_minutes(exec_timeout_secs),
            idle_timeout=to_whole_minutes(idle_timeout_secs),
        )

    def get_exec_timeout(self) -> Optional[timedelta]:
        """
        Get a timedelta of the exec timeout to use.

        :return: Exec timeout as a timedelta, or None if no exec timeout is set.
        """
        if self.exec_timeout is not None:
            return timedelta(minutes=self.exec_timeout)
        return None

    def get_idle_timeout(self) -> Optional[timedelta]:
        """
        Get a timedelta of the idle timeout to use.

        :return: Idle timeout as a timedelta, or None if no idle timeout is set.
        """
        if self.idle_timeout is not None:
            return timedelta(minutes=self.idle_timeout)
        return None


class TimeoutOverrides(BaseModel):
    """Timeout overrides to apply, grouped by build variant name."""

    overrides: Dict[str, List[TimeoutOverride]]

    @classmethod
    def from_yaml_file(cls, file_path: Path) -> "TimeoutOverrides":
        """
        Build the collection of overrides from a YAML file.

        :param file_path: Location of the YAML file to read.
        :return: Overrides described by the file.
        """
        with open(file_path) as yaml_stream:
            contents = yaml.safe_load(yaml_stream)
        return cls(**contents)

    def _lookup_override(self, build_variant: str, task_name: str) -> Optional[TimeoutOverride]:
        """
        Find the override defined for a task on a build variant, if there is one.

        :param build_variant: Build Variant to check.
        :param task_name: Task name to check.
        :return: The single matching override, or None when nothing matches.
        :raises ValueError: If more than one override matches the task on the build variant.
        """
        matches = []
        for candidate in self.overrides.get(build_variant, []):
            if candidate.task == task_name:
                matches.append(candidate)

        if not matches:
            return None

        if len(matches) > 1:
            # Ambiguous configuration: report every conflicting entry before failing.
            LOGGER.error("Found multiple overrides for the same task",
                         build_variant=build_variant, task=task_name,
                         overrides=[match.dict() for match in matches])
            raise ValueError(f"Found multiple overrides for '{task_name}' on '{build_variant}'")

        return matches[0]

    def lookup_exec_override(self, build_variant: str, task_name: str) -> Optional[timedelta]:
        """
        Get the exec timeout override for a task on a build variant.

        :param build_variant: Build Variant to check.
        :param task_name: Task name to check.
        :return: Exec timeout override, or None when no override provides one.
        """
        found = self._lookup_override(build_variant, task_name)
        return None if found is None else found.get_exec_timeout()

    def lookup_idle_override(self, build_variant: str, task_name: str) -> Optional[timedelta]:
        """
        Get the idle timeout override for a task on a build variant.

        :param build_variant: Build Variant to check.
        :param task_name: Task name to check.
        :return: Idle timeout override, or None when no override provides one.
        """
        found = self._lookup_override(build_variant, task_name)
        return None if found is None else found.get_idle_timeout()


def _is_required_build_variant(build_variant: str) -> bool:
    """
    Determine if the given build variants is a required build variant.

    :param build_variant: Name of build variant to check.
    :return: True if the given build variant is required.
    """
    return build_variant.endswith("-required")


def output_timeout(exec_timeout: timedelta, idle_timeout: Optional[timedelta],
                   output_file: Optional[str]) -> None:
    """
    Write the timeout configuration as YAML.

    The configuration is always printed to stdout and, when an output file is given,
    written to that file as well. Values are whole seconds, rounded up.

    :param exec_timeout: Exec timeout to output.
    :param idle_timeout: Idle timeout to output; omitted from the output when None.
    :param output_file: Location of output file to write, if any.
    """
    settings = {"exec_timeout_secs": math.ceil(exec_timeout.total_seconds())}
    if idle_timeout is not None:
        settings.update(timeout_secs=math.ceil(idle_timeout.total_seconds()))

    if output_file:
        with open(output_file, "w") as destination:
            yaml.dump(settings, stream=destination, default_flow_style=False)

    yaml.dump(settings, stream=sys.stdout, default_flow_style=False)


class TaskTimeoutOrchestrator:
    """Combines CLI values, overrides, historic data and defaults into task timeouts."""

    @inject.autoparams()
    def __init__(self, timeout_service: TimeoutService, timeout_overrides: TimeoutOverrides,
                 evg_project_config: EvergreenProjectConfig) -> None:
        """
        Store the collaborators used to work out timeouts.

        :param timeout_service: Service for calculating historic timeouts.
        :param timeout_overrides: Timeout overrides for specific tasks.
        :param evg_project_config: Evergreen project configuration.
        """
        self.evg_project_config = evg_project_config
        self.timeout_overrides = timeout_overrides
        self.timeout_service = timeout_service

    def determine_exec_timeout(self, task_name: str, variant: str,
                               idle_timeout: Optional[timedelta] = None,
                               exec_timeout: Optional[timedelta] = None, evg_alias: str = "",
                               historic_timeout: Optional[timedelta] = None) -> timedelta:
        """
        Pick the exec timeout for a task.

        The first rule that applies wins: a non-zero command line value, a configured
        override, the unittest task, a required build variant whose timeout would exceed
        the required-build limit, and finally the commit-queue alias. When no rule applies
        the historic timeout is used, or the non-required default if there is none. The
        result is never smaller than the idle timeout.

        :param task_name: Name of task being run.
        :param variant: Name of build variant being run.
        :param idle_timeout: Idle timeout if specified.
        :param exec_timeout: Override to use for exec_timeout or 0 if no override.
        :param evg_alias: Evergreen alias running the task.
        :param historic_timeout: Timeout determined by looking at previous task executions.
        :return: Exec timeout to use for running task.
        """
        if historic_timeout is None:
            baseline = DEFAULT_NON_REQUIRED_BUILD_TIMEOUT
        else:
            baseline = historic_timeout

        # The lookup happens before the command line check, so a duplicated override
        # definition raises even when a command line value is supplied.
        configured = self.timeout_overrides.lookup_exec_override(variant, task_name)

        # Each selection is (log message, timeout to use); None means keep the baseline.
        if exec_timeout and exec_timeout.total_seconds() != 0:
            selection = ("Using timeout from cmd line", exec_timeout)
        elif configured is not None:
            selection = ("Overriding configured timeout", configured)
        elif task_name == UNITTEST_TASK:
            selection = ("Overriding unittest timeout", UNITTESTS_TIMEOUT)
        elif _is_required_build_variant(variant) and baseline > DEFAULT_REQUIRED_BUILD_TIMEOUT:
            selection = ("Overriding required-builder timeout", DEFAULT_REQUIRED_BUILD_TIMEOUT)
        elif evg_alias == COMMIT_QUEUE_ALIAS:
            selection = ("Overriding commit-queue timeout", COMMIT_QUEUE_TIMEOUT)
        else:
            selection = None

        chosen = baseline
        if selection is not None:
            message, chosen = selection
            LOGGER.info(message, exec_timeout_secs=chosen.total_seconds())

        # The timeout needs to be at least as large as the idle timeout.
        if not idle_timeout or chosen.total_seconds() >= idle_timeout.total_seconds():
            return chosen

        LOGGER.info("Making exec timeout as large as idle timeout",
                    exec_timeout_secs=idle_timeout.total_seconds())
        return idle_timeout

    def determine_idle_timeout(self, task_name: str, variant: str,
                               idle_timeout: Optional[timedelta] = None,
                               historic_timeout: Optional[timedelta] = None) -> Optional[timedelta]:
        """
        Pick the idle timeout for a task.

        A non-zero command line value takes precedence over a configured override, which
        takes precedence over the historic timeout.

        :param task_name: Name of task being run.
        :param variant: Name of build variant being run.
        :param idle_timeout: Override to use for idle_timeout.
        :param historic_timeout: Timeout determined by looking at previous task executions.
        :return: Idle timeout to use for running task, or None if nothing provides one.
        """
        # Looked up before the command line check so duplicated overrides always raise.
        configured = self.timeout_overrides.lookup_idle_override(variant, task_name)

        if idle_timeout and idle_timeout.total_seconds() != 0:
            LOGGER.info("Using timeout from cmd line",
                        idle_timeout_secs=idle_timeout.total_seconds())
            return idle_timeout

        if configured is not None:
            LOGGER.info("Overriding configured timeout",
                        idle_timeout_secs=configured.total_seconds())
            return configured

        return historic_timeout

    def determine_historic_timeout(self, task: str, variant: str, suite_name: str,
                                   exec_timeout_factor: Optional[float]) -> TimeoutOverride:
        """
        Work out timeouts from historic test results.

        :param task: Name of task to query.
        :param variant: Name of build variant to query.
        :param suite_name: Name of test suite being run.
        :param exec_timeout_factor: Scaling factor to use when determining timeout.
        :return: Override holding the historic timeouts; both timeouts are None when the
            suite is ignored or no usable estimate is available.
        """

        def no_historic_timeout() -> TimeoutOverride:
            """Build the result used when no historic timeout applies."""
            return TimeoutOverride(task=task, exec_timeout=None, idle_timeout=None)

        if suite_name in IGNORED_SUITES:
            return no_historic_timeout()

        estimate = self.timeout_service.get_timeout_estimate(
            TimeoutParams(
                evg_project="mongodb-mongo-master",
                build_variant=variant,
                task_name=task,
                suite_name=suite_name,
                is_asan=self.is_build_variant_asan(variant),
            ))
        if not estimate or not estimate.is_specified():
            return no_historic_timeout()

        exec_secs = estimate.calculate_task_timeout(repeat_factor=1,
                                                    scaling_factor=exec_timeout_factor)
        idle_secs = estimate.calculate_test_timeout(repeat_factor=1)
        if exec_secs is None and idle_secs is None:
            return no_historic_timeout()

        LOGGER.info("Getting historic based timeout", exec_timeout_secs=exec_secs,
                    idle_timeout_secs=idle_secs)
        return TimeoutOverride.from_seconds(task, exec_secs, idle_secs)

    def is_build_variant_asan(self, build_variant: str) -> bool:
        """
        Tell whether a build variant is an ASAN build variant.

        :param build_variant: Name of build variant to check.
        :return: True if build variant is an ASAN build variant.
        """
        return self.evg_project_config.get_variant(build_variant).is_asan_build()

    def determine_timeouts(self, cli_idle_timeout: Optional[timedelta],
                           cli_exec_timeout: Optional[timedelta], outfile: Optional[str], task: str,
                           variant: str, evg_alias: str, suite_name: str,
                           exec_timeout_factor: Optional[float]) -> None:
        """
        Work out the timeouts for the given task and write them out as expansions.

        :param cli_idle_timeout: Idle timeout specified by the CLI.
        :param cli_exec_timeout: Exec timeout specified by the CLI.
        :param outfile: File to write timeout expansions to.
        :param task: Name of the task being run.
        :param variant: Build variant task is being run on.
        :param evg_alias: Evergreen alias that triggered task.
        :param suite_name: Name of evergreen suite being run.
        :param exec_timeout_factor: Scaling factor to use when determining timeout.
        """
        historic = self.determine_historic_timeout(task, variant, suite_name, exec_timeout_factor)

        # The idle timeout is settled first because the exec timeout must not be below it.
        idle_timeout = self.determine_idle_timeout(
            task_name=task,
            variant=variant,
            idle_timeout=cli_idle_timeout,
            historic_timeout=historic.get_idle_timeout(),
        )
        exec_timeout = self.determine_exec_timeout(
            task_name=task,
            variant=variant,
            idle_timeout=idle_timeout,
            exec_timeout=cli_exec_timeout,
            evg_alias=evg_alias,
            historic_timeout=historic.get_exec_timeout(),
        )

        output_timeout(exec_timeout, idle_timeout, outfile)


def main() -> None:
    """Determine the timeout value a task should use in evergreen."""
    # NOTE: the docstring above is also used as the command line description, so changing
    # it changes the --help output.
    parser = argparse.ArgumentParser(description=main.__doc__)

    parser.add_argument("--install-dir", dest="install_dir", required=True,
                        help="Path to bin directory of testable installation")
    parser.add_argument("--task-name", dest="task", required=True, help="Task being executed.")
    parser.add_argument("--suite-name", dest="suite_name", required=True,
                        help="Resmoke suite being run against.")
    parser.add_argument("--build-variant", dest="variant", required=True,
                        help="Build variant task is being executed on.")
    parser.add_argument("--evg-alias", dest="evg_alias", required=True,
                        help="Evergreen alias used to trigger build.")
    parser.add_argument("--timeout", dest="timeout", type=int, help="Timeout to use (in sec).")
    parser.add_argument("--exec-timeout", dest="exec_timeout", type=int,
                        help="Exec timeout to use (in sec).")
    # The factor is a unitless multiplier, not a duration in seconds.
    parser.add_argument("--exec-timeout-factor", dest="exec_timeout_factor", type=float,
                        help="Scaling factor to use when determining the exec timeout.")
    parser.add_argument("--out-file", dest="outfile", help="File to write configuration to.")
    parser.add_argument("--timeout-overrides", dest="timeout_overrides_file",
                        default=DEFAULT_TIMEOUT_OVERRIDES,
                        help="File containing timeout overrides to use.")
    parser.add_argument("--evg-api-config", dest="evg_api_config",
                        default=DEFAULT_EVERGREEN_AUTH_CONFIG, help="Evergreen API config file.")
    parser.add_argument("--evg-project-config", dest="evg_project_config",
                        default=DEFAULT_EVERGREEN_CONFIG, help="Evergreen project config file.")

    options = parser.parse_args()

    # A missing or zero command line timeout means "no override from the command line".
    timeout_override = timedelta(seconds=options.timeout) if options.timeout else None
    exec_timeout_override = timedelta(
        seconds=options.exec_timeout) if options.exec_timeout else None

    task_name = determine_task_base_name(options.task, options.variant)
    timeout_overrides = TimeoutOverrides.from_yaml_file(
        os.path.expanduser(options.timeout_overrides_file))

    enable_logging(verbose=False)

    def dependencies(binder: inject.Binder) -> None:
        """Register the instances that are injected into the orchestrator and its services."""
        binder.bind(
            EvergreenApi,
            RetryingEvergreenApi.get_api(config_file=os.path.expanduser(options.evg_api_config)))
        binder.bind(TimeoutOverrides, timeout_overrides)
        binder.bind(EvergreenProjectConfig,
                    parse_evergreen_file(os.path.expanduser(options.evg_project_config)))
        binder.bind(
            ResmokeProxyService,
            ResmokeProxyService(run_options=f"--installDir={shlex.quote(options.install_dir)}"))

    inject.configure(dependencies)

    task_timeout_orchestrator = inject.instance(TaskTimeoutOrchestrator)
    task_timeout_orchestrator.determine_timeouts(
        timeout_override, exec_timeout_override, options.outfile, task_name, options.variant,
        options.evg_alias, options.suite_name, options.exec_timeout_factor)


# Run the command line entry point only when this file is executed as a script.
if __name__ == "__main__":
    main()