summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author James E. Blair <jeblair@hp.com> 2015-04-02 16:37:15 -0700
committer James E. Blair <jeblair@hp.com> 2015-04-02 16:44:00 -0700
commit f15139b789d32aef361075487b1b7495a9807b05 (patch)
tree 2f80becb7ab163ef192c7f4eb278255a98bf5704
parent 96698e21ca554f16b5c6fe4ee6a5d9177fe05ab7 (diff)
download zuul-f15139b789d32aef361075487b1b7495a9807b05.tar.gz
Fix checking all builds are waiting in tests
The check for whether all outstanding builds are waiting (which means the system is stable and may be examined) did not account for builds which may have just started running but had not reported their start back to Zuul. This means that the test system could determine the system had settled while a job start packet was in-flight. This has been seen to cause the test_abandoned_gate test to fail due to the inconsistent state.

Since this race can occur in reality, also update the launcher itself to more gracefully handle this condition.

Change-Id: I734094514db294564f6526e42b801a7e1d22a021
-rwxr-xr-x tests/base.py 23
-rw-r--r-- zuul/launcher/gearman.py 11
2 files changed, 16 insertions, 18 deletions
diff --git a/tests/base.py b/tests/base.py
index 08b3cab41..3d0c39faf 100755
--- a/tests/base.py
+++ b/tests/base.py
@@ -1167,8 +1167,6 @@ class ZuulTestCase(BaseTestCase):
return True
def areAllBuildsWaiting(self):
- ret = True
-
builds = self.launcher.builds.values()
for build in builds:
client_job = None
@@ -1180,35 +1178,34 @@ class ZuulTestCase(BaseTestCase):
if not client_job:
self.log.debug("%s is not known to the gearman client" %
build)
- ret = False
- continue
+ return False
if not client_job.handle:
self.log.debug("%s has no handle" % client_job)
- ret = False
- continue
+ return False
server_job = self.gearman_server.jobs.get(client_job.handle)
if not server_job:
self.log.debug("%s is not known to the gearman server" %
client_job)
- ret = False
- continue
+ return False
if not hasattr(server_job, 'waiting'):
self.log.debug("%s is being enqueued" % server_job)
- ret = False
- continue
+ return False
if server_job.waiting:
continue
worker_job = self.worker.gearman_jobs.get(server_job.unique)
if worker_job:
+ if build.number is None:
+ self.log.debug("%s has not reported start" % worker_job)
+ return False
if worker_job.build.isWaiting():
continue
else:
self.log.debug("%s is running" % worker_job)
- ret = False
+ return False
else:
self.log.debug("%s is unassigned" % server_job)
- ret = False
- return ret
+ return False
+ return True
def waitUntilSettled(self):
self.log.debug("Waiting until settled...")
diff --git a/zuul/launcher/gearman.py b/zuul/launcher/gearman.py
index 915151e7b..653678a9a 100644
--- a/zuul/launcher/gearman.py
+++ b/zuul/launcher/gearman.py
@@ -404,14 +404,15 @@ class Gearman(object):
self.log.debug("Removed build %s from queue" % build)
return
+ time.sleep(1)
+
self.log.debug("Still unable to find build %s to cancel" % build)
if build.number:
self.log.debug("Build %s has just started" % build)
- else:
- self.log.error("Build %s has not started but was not"
- "found in queue; canceling anyway" % build)
- self.cancelRunningBuild(build)
- self.log.debug("Canceled possibly running build %s" % build)
+ self.log.debug("Canceled running build %s" % build)
+ self.cancelRunningBuild(build)
+ return
+ self.log.debug("Unable to cancel build %s" % build)
def onBuildCompleted(self, job, result=None):
if job.unique in self.meta_jobs: