summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author bst-marge-bot <marge-bot@buildstream.build> 2019-11-13 16:39:05 +0000
committer bst-marge-bot <marge-bot@buildstream.build> 2019-11-13 16:39:05 +0000
commit c3eee615fb5fe957d17151ff655ebebc3d029681 (patch)
tree 9416341236dd216fbed71a4f08c1534aa35ed65f
parent baff3507d2ad5bad2e0a92e3b5ee0805e19504f7 (diff)
parent ecc1f607aa186192e707552f309d78708fd68159 (diff)
download buildstream-c3eee615fb5fe957d17151ff655ebebc3d029681.tar.gz
Merge branch 'bschubert/graceful-children-sigterm' into 'master'
Gracefully shut down children on termination. Closes #1185. See merge request BuildStream/buildstream!1692.
-rw-r--r-- src/buildstream/_scheduler/jobs/job.py 38
-rw-r--r-- src/buildstream/_scheduler/scheduler.py 21
2 files changed, 31 insertions, 28 deletions
diff --git a/src/buildstream/_scheduler/jobs/job.py b/src/buildstream/_scheduler/jobs/job.py
index 4e6199e16..3363d7b60 100644
--- a/src/buildstream/_scheduler/jobs/job.py
+++ b/src/buildstream/_scheduler/jobs/job.py
@@ -45,6 +45,8 @@ class _ReturnCode(FastEnum):
FAIL = 1
PERM_FAIL = 2
SKIPPED = 3
+ TERMINATED = 4
+ KILLED = -9
# JobStatus:
@@ -249,22 +251,6 @@ class Job():
def get_terminated(self):
return self._terminated
- # terminate_wait()
- #
- # Wait for terminated jobs to complete
- #
- # Args:
- # timeout (float): Seconds to wait
- #
- # Returns:
- # (bool): True if the process terminated cleanly, otherwise False
- #
- def terminate_wait(self, timeout):
-
- # Join the child process after sending SIGTERM
- self._process.join(timeout)
- return self._process.exitcode is not None
-
# kill()
#
# Forcefully kill the process, and any children it might have.
@@ -471,6 +457,20 @@ class Job():
status = JobStatus.SKIPPED
elif returncode in (_ReturnCode.FAIL, _ReturnCode.PERM_FAIL):
status = JobStatus.FAIL
+ elif returncode == _ReturnCode.TERMINATED:
+ if self._terminated:
+ self.message(MessageType.INFO, "Process was terminated")
+ else:
+ self.message(MessageType.ERROR, "Process was terminated unexpectedly")
+
+ status = JobStatus.FAIL
+ elif returncode == _ReturnCode.KILLED:
+ if self._terminated:
+ self.message(MessageType.INFO, "Process was killed")
+ else:
+ self.message(MessageType.ERROR, "Process was killed unexpectedly")
+
+ status = JobStatus.FAIL
else:
status = JobStatus.FAIL
@@ -730,6 +730,12 @@ class ChildJob():
with _signals.suspendable(stop_time, resume_time), \
self._messenger.recorded_messages(self._logfile, self._logdir) as filename:
+ # Gracefully handle SIGTERM.
+ def handle_sigterm(_signum, _sigframe):
+ self._child_shutdown(_ReturnCode.TERMINATED)
+
+ signal.signal(signal.SIGTERM, handle_sigterm)
+
self.message(MessageType.START, self.action_name, logfile=filename)
try:
diff --git a/src/buildstream/_scheduler/scheduler.py b/src/buildstream/_scheduler/scheduler.py
index 7ef5c5fe3..86e3af021 100644
--- a/src/buildstream/_scheduler/scheduler.py
+++ b/src/buildstream/_scheduler/scheduler.py
@@ -34,6 +34,9 @@ from .._message import Message, MessageType
from ..plugin import Plugin
+_MAX_TIMEOUT_TO_KILL_CHILDREN = 20 # in seconds
+
+
# A decent return code for Scheduler.run()
class SchedStatus(FastEnum):
SUCCESS = 0
@@ -526,21 +529,15 @@ class Scheduler():
self.loop.remove_signal_handler(signal.SIGTERM)
def _terminate_jobs_real(self):
- # 20 seconds is a long time, it can take a while and sometimes
- # we still fail, need to look deeper into this again.
- wait_start = datetime.datetime.now()
- wait_limit = 20.0
+ def kill_jobs():
+ for job_ in self._active_jobs:
+ job_.kill()
- # First tell all jobs to terminate
- for job in self._active_jobs:
- job.terminate()
+ # Schedule all jobs to be killed if they have not exited after timeout
+ self.loop.call_later(_MAX_TIMEOUT_TO_KILL_CHILDREN, kill_jobs)
- # Now wait for them to really terminate
for job in self._active_jobs:
- elapsed = datetime.datetime.now() - wait_start
- timeout = max(wait_limit - elapsed.total_seconds(), 0.0)
- if not job.terminate_wait(timeout):
- job.kill()
+ job.terminate()
# Regular timeout for driving status in the UI
def _tick(self):