summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorFelix Edel <felix.edel@bmw.de>2022-11-17 13:53:36 +0100
committerFelix Edel <felix.edel@bmw.de>2022-12-12 16:49:39 +0100
commit137557c559635c1d24e7ad6f4541179e4c948d30 (patch)
tree86fa182d8698482c85e7b271d421315daa4a3357
parent532c30469ffc1a1b35d812455fa8271b392915cd (diff)
downloadzuul-137557c559635c1d24e7ad6f4541179e4c948d30.tar.gz
Abort job if cleanup playbook timed out
We've investigated an issue where a job was stuck on the executor because it wasn't aborted properly. The job was cancelled by the scheduler, but the cleanup playbook on the executor ran into a timeout. This caused another abort via the WatchDog. The problem is that the abort function doesn't do anything if the cleanup playbook is running [1]. Most probably this covers the case that we don't want to abort the cleanup playbook after a normal job cancellation. However, this doesn't differentiate if the abort was caused by the run of the cleanup playbook itself, resulting in a build that's hanging indefinitely on the executor. To fix this, we now differentiate if the abort was caused by a stop() call [2] or if it was caused by a timeout. In case of a timeout, we kill the running process. Add a test case to validate the changed behaviour. Without the fix, the test case runs indefinitetly because the cleanup playbook won't be aborted even after the test times out (during the test cleanup). [1]: https://opendev.org/zuul/zuul/src/commit/4d555ca675d204b1d668a63fab2942a70f159143/zuul/executor/server.py#L2688 [2]: https://opendev.org/zuul/zuul/src/commit/4d555ca675d204b1d668a63fab2942a70f159143/zuul/executor/server.py#L1064 Change-Id: I979f55b52da3b7a237ac826dfa8f3007e8679932
-rw-r--r--tests/fixtures/config/cleanup-playbook/git/common-config/playbooks/cleanup-timeout.yaml4
-rw-r--r--tests/fixtures/config/cleanup-playbook/git/common-config/zuul.yaml5
-rw-r--r--tests/unit/test_v3.py25
-rw-r--r--zuul/executor/server.py36
4 files changed, 51 insertions, 19 deletions
diff --git a/tests/fixtures/config/cleanup-playbook/git/common-config/playbooks/cleanup-timeout.yaml b/tests/fixtures/config/cleanup-playbook/git/common-config/playbooks/cleanup-timeout.yaml
new file mode 100644
index 000000000..fee618817
--- /dev/null
+++ b/tests/fixtures/config/cleanup-playbook/git/common-config/playbooks/cleanup-timeout.yaml
@@ -0,0 +1,4 @@
+- hosts: all
+ tasks:
+ - pause:
+ seconds: 300
diff --git a/tests/fixtures/config/cleanup-playbook/git/common-config/zuul.yaml b/tests/fixtures/config/cleanup-playbook/git/common-config/zuul.yaml
index c6f6f188a..275b1920c 100644
--- a/tests/fixtures/config/cleanup-playbook/git/common-config/zuul.yaml
+++ b/tests/fixtures/config/cleanup-playbook/git/common-config/zuul.yaml
@@ -33,3 +33,8 @@
name: python27-failure
cleanup-run: playbooks/cleanup.yaml
run: playbooks/failure.yaml
+
+- job:
+ name: python27-cleanup-timeout
+ cleanup-run: playbooks/cleanup-timeout.yaml
+ run: playbooks/python27.yaml
diff --git a/tests/unit/test_v3.py b/tests/unit/test_v3.py
index 90450244b..81d95927a 100644
--- a/tests/unit/test_v3.py
+++ b/tests/unit/test_v3.py
@@ -22,7 +22,7 @@ import textwrap
import gc
import re
from time import sleep
-from unittest import skip, skipIf
+from unittest import mock, skip, skipIf
from zuul.lib import yamlutil
import fixtures
@@ -4357,6 +4357,29 @@ class TestCleanupPlaybooks(AnsibleZuulTestCase):
self.assertTrue(os.path.exists(post_start))
self.assertFalse(os.path.exists(post_end))
+ @mock.patch("zuul.executor.server.CLEANUP_TIMEOUT", 5)
+ def test_cleanup_playbook_timeout(self):
+ # Test that when the cleanup runs into a timeout, the job
+ # still completes.
+ self.executor_server.verbose = True
+
+ # Change the zuul config to run the python27-cleanup-timeout job
+ in_repo_conf = textwrap.dedent(
+ """
+ - project:
+ check:
+ jobs:
+ - python27-cleanup-timeout
+ """)
+ A = self.fake_gerrit.addFakeChange("org/project", "master", "A",
+ files={".zuul.yaml": in_repo_conf})
+ self.fake_gerrit.addEvent(A.getPatchsetCreatedEvent(1))
+ self.waitUntilSettled()
+
+ self.assertHistory([
+ dict(name="python27-cleanup-timeout", result="SUCCESS",
+ changes="1,1")])
+
class TestPlaybookSemaphore(AnsibleZuulTestCase):
tenant_config_file = 'config/playbook-semaphore/main.yaml'
diff --git a/zuul/executor/server.py b/zuul/executor/server.py
index b7fed2a8c..c3737b5cc 100644
--- a/zuul/executor/server.py
+++ b/zuul/executor/server.py
@@ -98,6 +98,9 @@ BLACKLISTED_VARS = dict(
ansible_ssh_extra_args='-o PermitLocalCommand=no',
)
+# TODO: make this configurable
+CLEANUP_TIMEOUT = 300
+
class VerboseCommand(commandsocket.Command):
name = 'verbose'
@@ -1862,9 +1865,6 @@ class AnsibleJob(object):
# This means we can't run any cleanup playbooks.
return
- # TODO: make this configurable
- cleanup_timeout = 300
-
with open(self.jobdir.job_output_file, 'a') as job_output:
job_output.write("{now} | Running Ansible cleanup...\n".format(
now=datetime.datetime.now()
@@ -1873,7 +1873,7 @@ class AnsibleJob(object):
self.cleanup_started = True
for index, playbook in enumerate(self.jobdir.cleanup_playbooks):
self.runAnsiblePlaybook(
- playbook, cleanup_timeout, self.ansible_version,
+ playbook, CLEANUP_TIMEOUT, self.ansible_version,
success=success, phase='cleanup', index=index)
def _logFinalPlaybookError(self):
@@ -2672,23 +2672,23 @@ class AnsibleJob(object):
def _ansibleTimeout(self, msg):
self.log.warning(msg)
- self.abortRunningProc()
+ self.abortRunningProc(timed_out=True)
- def abortRunningProc(self):
+ def abortRunningProc(self, timed_out=False):
with self.proc_lock:
- if self.proc and not self.cleanup_started:
- self.log.debug("Abort: sending kill signal to job "
- "process group")
- try:
- pgid = os.getpgid(self.proc.pid)
- os.killpg(pgid, signal.SIGKILL)
- except Exception:
- self.log.exception(
- "Exception while killing ansible process:")
- elif self.proc and self.cleanup_started:
- self.log.debug("Abort: cleanup is in progress")
- else:
+ if not self.proc:
self.log.debug("Abort: no process is running")
+ return
+ elif self.cleanup_started and not timed_out:
+ self.log.debug("Abort: cleanup is in progress")
+ return
+
+ self.log.debug("Abort: sending kill signal to job process group")
+ try:
+ pgid = os.getpgid(self.proc.pid)
+ os.killpg(pgid, signal.SIGKILL)
+ except Exception:
+ self.log.exception("Exception while killing ansible process:")
def runAnsible(self, cmd, timeout, playbook, ansible_version,
wrapped=True, cleanup=False):