diff options
author | Felix Edel <felix.edel@bmw.de> | 2022-11-17 13:53:36 +0100 |
---|---|---|
committer | Felix Edel <felix.edel@bmw.de> | 2022-12-12 16:49:39 +0100 |
commit | 137557c559635c1d24e7ad6f4541179e4c948d30 (patch) | |
tree | 86fa182d8698482c85e7b271d421315daa4a3357 | |
parent | 532c30469ffc1a1b35d812455fa8271b392915cd (diff) | |
download | zuul-137557c559635c1d24e7ad6f4541179e4c948d30.tar.gz |
Abort job if cleanup playbook timed out
We've investigated an issue where a job was stuck on the executor
because it wasn't aborted properly. The job was cancelled by the
scheduler, but the cleanup playbook on the executor ran into a timeout.
This caused another abort via the WatchDog.
The problem is that the abort function doesn't do anything if the
cleanup playbook is running [1]. Most probably this covers the case
that we don't want to abort the cleanup playbook after a normal job
cancellation.
However, this doesn't differentiate if the abort was caused by the run
of the cleanup playbook itself, resulting in a build that's hanging
indefinitely on the executor.
To fix this, we now differentiate if the abort was caused by a stop()
call [2] or if it was caused by a timeout. In case of a timeout, we kill
the running process.
Add a test case to validate the changed behaviour. Without the fix, the
test case runs indefinitetly because the cleanup playbook won't be
aborted even after the test times out (during the test cleanup).
[1]: https://opendev.org/zuul/zuul/src/commit/4d555ca675d204b1d668a63fab2942a70f159143/zuul/executor/server.py#L2688
[2]: https://opendev.org/zuul/zuul/src/commit/4d555ca675d204b1d668a63fab2942a70f159143/zuul/executor/server.py#L1064
Change-Id: I979f55b52da3b7a237ac826dfa8f3007e8679932
4 files changed, 51 insertions, 19 deletions
diff --git a/tests/fixtures/config/cleanup-playbook/git/common-config/playbooks/cleanup-timeout.yaml b/tests/fixtures/config/cleanup-playbook/git/common-config/playbooks/cleanup-timeout.yaml new file mode 100644 index 000000000..fee618817 --- /dev/null +++ b/tests/fixtures/config/cleanup-playbook/git/common-config/playbooks/cleanup-timeout.yaml @@ -0,0 +1,4 @@ +- hosts: all + tasks: + - pause: + seconds: 300 diff --git a/tests/fixtures/config/cleanup-playbook/git/common-config/zuul.yaml b/tests/fixtures/config/cleanup-playbook/git/common-config/zuul.yaml index c6f6f188a..275b1920c 100644 --- a/tests/fixtures/config/cleanup-playbook/git/common-config/zuul.yaml +++ b/tests/fixtures/config/cleanup-playbook/git/common-config/zuul.yaml @@ -33,3 +33,8 @@ name: python27-failure cleanup-run: playbooks/cleanup.yaml run: playbooks/failure.yaml + +- job: + name: python27-cleanup-timeout + cleanup-run: playbooks/cleanup-timeout.yaml + run: playbooks/python27.yaml diff --git a/tests/unit/test_v3.py b/tests/unit/test_v3.py index 90450244b..81d95927a 100644 --- a/tests/unit/test_v3.py +++ b/tests/unit/test_v3.py @@ -22,7 +22,7 @@ import textwrap import gc import re from time import sleep -from unittest import skip, skipIf +from unittest import mock, skip, skipIf from zuul.lib import yamlutil import fixtures @@ -4357,6 +4357,29 @@ class TestCleanupPlaybooks(AnsibleZuulTestCase): self.assertTrue(os.path.exists(post_start)) self.assertFalse(os.path.exists(post_end)) + @mock.patch("zuul.executor.server.CLEANUP_TIMEOUT", 5) + def test_cleanup_playbook_timeout(self): + # Test that when the cleanup runs into a timeout, the job + # still completes. + self.executor_server.verbose = True + + # Change the zuul config to run the python27-cleanup-timeout job + in_repo_conf = textwrap.dedent( + """ + - project: + check: + jobs: + - python27-cleanup-timeout + """) + A = self.fake_gerrit.addFakeChange("org/project", "master", "A", + files={".zuul.yaml": in_repo_conf}) + self.fake_gerrit.addEvent(A.getPatchsetCreatedEvent(1)) + self.waitUntilSettled() + + self.assertHistory([ + dict(name="python27-cleanup-timeout", result="SUCCESS", + changes="1,1")]) + class TestPlaybookSemaphore(AnsibleZuulTestCase): tenant_config_file = 'config/playbook-semaphore/main.yaml' diff --git a/zuul/executor/server.py b/zuul/executor/server.py index b7fed2a8c..c3737b5cc 100644 --- a/zuul/executor/server.py +++ b/zuul/executor/server.py @@ -98,6 +98,9 @@ BLACKLISTED_VARS = dict( ansible_ssh_extra_args='-o PermitLocalCommand=no', ) +# TODO: make this configurable +CLEANUP_TIMEOUT = 300 + class VerboseCommand(commandsocket.Command): name = 'verbose' @@ -1862,9 +1865,6 @@ class AnsibleJob(object): # This means we can't run any cleanup playbooks. return - # TODO: make this configurable - cleanup_timeout = 300 - with open(self.jobdir.job_output_file, 'a') as job_output: job_output.write("{now} | Running Ansible cleanup...\n".format( now=datetime.datetime.now() @@ -1873,7 +1873,7 @@ class AnsibleJob(object): self.cleanup_started = True for index, playbook in enumerate(self.jobdir.cleanup_playbooks): self.runAnsiblePlaybook( - playbook, cleanup_timeout, self.ansible_version, + playbook, CLEANUP_TIMEOUT, self.ansible_version, success=success, phase='cleanup', index=index) def _logFinalPlaybookError(self): @@ -2672,23 +2672,23 @@ class AnsibleJob(object): def _ansibleTimeout(self, msg): self.log.warning(msg) - self.abortRunningProc() + self.abortRunningProc(timed_out=True) - def abortRunningProc(self): + def abortRunningProc(self, timed_out=False): with self.proc_lock: - if self.proc and not self.cleanup_started: - self.log.debug("Abort: sending kill signal to job " - "process group") - try: - pgid = os.getpgid(self.proc.pid) - os.killpg(pgid, signal.SIGKILL) - except Exception: - self.log.exception( - "Exception while killing ansible process:") - elif self.proc and self.cleanup_started: - self.log.debug("Abort: cleanup is in progress") - else: + if not self.proc: self.log.debug("Abort: no process is running") + return + elif self.cleanup_started and not timed_out: + self.log.debug("Abort: cleanup is in progress") + return + + self.log.debug("Abort: sending kill signal to job process group") + try: + pgid = os.getpgid(self.proc.pid) + os.killpg(pgid, signal.SIGKILL) + except Exception: + self.log.exception("Exception while killing ansible process:") def runAnsible(self, cmd, timeout, playbook, ansible_version, wrapped=True, cleanup=False): |