summaryrefslogtreecommitdiff
path: root/buildscripts/powercycle_sentinel.py
blob: bb276e2b7123c2f3a425d9645230e8c70df11c42 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
#!/usr/bin/env python3
"""Powercycle tasks sentinel.

Error out when any powercycle task on the same buildvariant runs for more than 2 hours.
"""
import logging
import os
import sys
import time
from datetime import datetime, timezone
from typing import List

import click
import structlog
from evergreen import RetryingEvergreenApi, EvergreenApi

from buildscripts.util.read_config import read_config_file

# Module-level structlog logger, named after this module.
LOGGER = structlog.getLogger(__name__)

# Base URL used to build links to Evergreen task pages.
EVERGREEN_HOST = "https://evergreen.mongodb.com"
# Candidate Evergreen config files; get_evergreen_api() uses the first one that exists.
EVERGREEN_CONFIG_LOCATIONS = (
    # Common for machines in Evergreen
    os.path.join(os.getcwd(), ".evergreen.yml"),
    # Common for local machines
    os.path.expanduser(os.path.join("~", ".evergreen.yml")),
)
# Run time (2 hours, in seconds) after which a powercycle task is reported as long running.
POWERCYCLE_TASK_EXEC_TIMEOUT_SECS = 2 * 60 * 60
# Delay (5 minutes, in seconds) between successive polls of Evergreen.
WATCH_INTERVAL_SECS = 5 * 60


def get_evergreen_api() -> EvergreenApi:
    """Build an Evergreen API client from the first config file that exists.

    The candidate paths in EVERGREEN_CONFIG_LOCATIONS are probed in order. When none
    of them is a regular file, an error is logged and the process exits with status 1.
    """
    config_file = next(filter(os.path.isfile, EVERGREEN_CONFIG_LOCATIONS), None)

    if config_file is None:
        LOGGER.error("Evergreen config not found in locations.",
                     locations=EVERGREEN_CONFIG_LOCATIONS)
        sys.exit(1)

    return RetryingEvergreenApi.get_api(config_file=config_file)


def watch_tasks(task_ids: List[str], evg_api: EvergreenApi, watch_interval_secs: int) -> List[str]:
    """Poll the given tasks until each one has finished or exceeded the exec timeout.

    :param task_ids: Ids of the tasks to watch; the caller's list is not modified.
    :param evg_api: Evergreen API client used to fetch the current state of each task.
    :param watch_interval_secs: Seconds to sleep between polling rounds.
    :return: Ids of the tasks that had no finish time and had been running for more
        than POWERCYCLE_TASK_EXEC_TIMEOUT_SECS seconds when they were checked.
    """
    # Work on a copy so that removals below never touch the caller's list.
    pending_ids = task_ids[:]
    overdue_ids = []

    while pending_ids:
        LOGGER.info("Looking if powercycle tasks are still running on the current buildvariant.")
        # Fetch every pending task first, then evaluate the snapshot.
        fetched_tasks = [evg_api.task_by_id(pending_id) for pending_id in pending_ids]

        for task in fetched_tasks:
            if task.finish_time:
                # Finished tasks no longer need watching.
                pending_ids.remove(task.task_id)
                continue

            if not task.start_time:
                # Not started yet: keep it for the next polling round.
                continue

            elapsed = datetime.now(timezone.utc) - task.start_time
            if elapsed.total_seconds() > POWERCYCLE_TASK_EXEC_TIMEOUT_SECS:
                overdue_ids.append(task.task_id)
                pending_ids.remove(task.task_id)

        # Only wait when there is still something left to poll.
        if pending_ids:
            time.sleep(watch_interval_secs)

    return overdue_ids


def get_links(task_ids: List[str]) -> str:
    """Build one Evergreen task URL per task id and join them with newlines.

    An empty list of ids yields an empty string.
    """
    task_urls = (f"{EVERGREEN_HOST}/task/{task_id}" for task_id in task_ids)
    return "\n".join(task_urls)


@click.command()
@click.argument("expansions_file", type=str, default="expansions.yml")
def main(expansions_file: str = "expansions.yml") -> None:
    """Watch the powercycle tasks of the current build and fail on long running ones.

    EXPANSIONS_FILE is the expansions file providing the `build_id`, `task_id` and
    `gen_task` values. The command waits for the generator task to become inactive,
    then polls the powercycle tasks of the build and exits with status 1 when any of
    them runs longer than the exec timeout, or when the generator task cannot be found.
    """
    # NOTE: click also uses the docstring above as the command's --help text.

    logging.basicConfig(
        format="[%(levelname)s] %(message)s",
        level=logging.INFO,
        stream=sys.stdout,
    )
    structlog.configure(logger_factory=structlog.stdlib.LoggerFactory())

    expansions = read_config_file(expansions_file)
    build_id = expansions["build_id"]
    current_task_id = expansions["task_id"]
    gen_task_name = expansions["gen_task"]

    evg_api = get_evergreen_api()

    build_tasks = evg_api.tasks_by_build(build_id)
    # Take the first task whose id contains the generator task name. Using a default
    # avoids an opaque IndexError traceback when the build has no such task.
    gen_task_id = next(
        (task.task_id for task in build_tasks if gen_task_name in task.task_id), None)
    if gen_task_id is None:
        LOGGER.error("Generator task not found in the build.", gen_task=gen_task_name,
                     build_id=build_id)
        sys.exit(1)
    gen_task_url = f"{EVERGREEN_HOST}/task/{gen_task_id}"

    # The powercycle tasks only exist once the generator task is done, so wait for it.
    while evg_api.task_by_id(gen_task_id).is_active():
        LOGGER.info(
            f"Waiting for '{gen_task_name}' task to generate powercycle tasks:\n{gen_task_url}")
        time.sleep(WATCH_INTERVAL_SECS)

    # Re-read the build: the task list fetched earlier predates the generated tasks.
    build_tasks = evg_api.tasks_by_build(build_id)
    # Watch the powercycle execution tasks, excluding this task and the generator task.
    powercycle_task_ids = [
        task.task_id for task in build_tasks
        if not task.display_only and task.task_id != current_task_id and task.task_id != gen_task_id
        and "powercycle" in task.task_id
    ]
    LOGGER.info(f"Watching powercycle tasks:\n{get_links(powercycle_task_ids)}")

    long_running_task_ids = watch_tasks(powercycle_task_ids, evg_api, WATCH_INTERVAL_SECS)
    if long_running_task_ids:
        LOGGER.error(
            f"Found powercycle tasks that are running for more than {POWERCYCLE_TASK_EXEC_TIMEOUT_SECS} "
            f"seconds and most likely something is going wrong in those tasks:\n{get_links(long_running_task_ids)}"
        )
        LOGGER.error(
            "Hopefully hosts from the tasks are still in run at the time you are seeing this "
            "and the Build team is able to check them to diagnose the issue.")
        sys.exit(1)


# Script entry point: run the click command only when this file is executed directly.
if __name__ == '__main__':
    main()