diff options
author | James E. Blair <jeblair@hp.com> | 2015-04-02 16:37:15 -0700 |
---|---|---|
committer | James E. Blair <jeblair@hp.com> | 2015-04-02 16:44:00 -0700 |
commit | f15139b789d32aef361075487b1b7495a9807b05 (patch) | |
tree | 2f80becb7ab163ef192c7f4eb278255a98bf5704 | |
parent | 96698e21ca554f16b5c6fe4ee6a5d9177fe05ab7 (diff) | |
download | zuul-f15139b789d32aef361075487b1b7495a9807b05.tar.gz |
Fix checking all builds are waiting in tests

The check for whether all outstanding builds are waiting (which
means the system is stable and may be examined) did not account
for builds which may have just started running but had not reported
their start back to Zuul. This means that the test system could
determine the system had settled while a job start packet was
in-flight. This has been seen to cause the test_abandoned_gate
test to fail due to the inconsistent state.

Since this race can occur in reality, also update the launcher
itself to more gracefully handle this condition.

Change-Id: I734094514db294564f6526e42b801a7e1d22a021
-rwxr-xr-x | tests/base.py | 23 | ||||
-rw-r--r-- | zuul/launcher/gearman.py | 11 |
2 files changed, 16 insertions, 18 deletions
diff --git a/tests/base.py b/tests/base.py index 08b3cab41..3d0c39faf 100755 --- a/tests/base.py +++ b/tests/base.py @@ -1167,8 +1167,6 @@ class ZuulTestCase(BaseTestCase): return True def areAllBuildsWaiting(self): - ret = True - builds = self.launcher.builds.values() for build in builds: client_job = None @@ -1180,35 +1178,34 @@ class ZuulTestCase(BaseTestCase): if not client_job: self.log.debug("%s is not known to the gearman client" % build) - ret = False - continue + return False if not client_job.handle: self.log.debug("%s has no handle" % client_job) - ret = False - continue + return False server_job = self.gearman_server.jobs.get(client_job.handle) if not server_job: self.log.debug("%s is not known to the gearman server" % client_job) - ret = False - continue + return False if not hasattr(server_job, 'waiting'): self.log.debug("%s is being enqueued" % server_job) - ret = False - continue + return False if server_job.waiting: continue worker_job = self.worker.gearman_jobs.get(server_job.unique) if worker_job: + if build.number is None: + self.log.debug("%s has not reported start" % worker_job) + return False if worker_job.build.isWaiting(): continue else: self.log.debug("%s is running" % worker_job) - ret = False + return False else: self.log.debug("%s is unassigned" % server_job) - ret = False - return ret + return False + return True def waitUntilSettled(self): self.log.debug("Waiting until settled...") diff --git a/zuul/launcher/gearman.py b/zuul/launcher/gearman.py index 915151e7b..653678a9a 100644 --- a/zuul/launcher/gearman.py +++ b/zuul/launcher/gearman.py @@ -404,14 +404,15 @@ class Gearman(object): self.log.debug("Removed build %s from queue" % build) return + time.sleep(1) + self.log.debug("Still unable to find build %s to cancel" % build) if build.number: self.log.debug("Build %s has just started" % build) - else: - self.log.error("Build %s has not started but was not" - "found in queue; canceling anyway" % build) - self.cancelRunningBuild(build) 
- self.log.debug("Canceled possibly running build %s" % build) + self.log.debug("Canceled running build %s" % build) + self.cancelRunningBuild(build) + return + self.log.debug("Unable to cancel build %s" % build) def onBuildCompleted(self, job, result=None): if job.unique in self.meta_jobs: |