Fix checking all builds are waiting in tests

The check for whether all outstanding builds are waiting (which means the system is stable and may be examined) did not account for builds which may have just started running but had not reported their start back to Zuul. This means that the test system could determine the system had settled while a job start packet was in-flight. This has been seen to cause the test_abandoned_gate test to fail due to the inconsistent state.

Since this race can occur in reality, also update the launcher itself to more gracefully handle this condition.

Change-Id: I734094514db294564f6526e42b801a7e1d22a021
author: James E. Blair <jeblair@hp.com> 2015-04-02 16:37:15 -0700
committer: James E. Blair <jeblair@hp.com> 2015-04-02 16:44:00 -0700
commit: f15139b789d32aef361075487b1b7495a9807b05 (patch)
tree: 2f80becb7ab163ef192c7f4eb278255a98bf5704
parent: 96698e21ca554f16b5c6fe4ee6a5d9177fe05ab7 (diff)
download: zuul-f15139b789d32aef361075487b1b7495a9807b05.tar.gz
2 files changed, 16 insertions, 18 deletions
diff --git a/tests/base.py b/tests/base.py
index 08b3cab41..3d0c39faf 100755
--- a/tests/base.py
+++ b/tests/base.py
@@ -1167,8 +1167,6 @@ class ZuulTestCase(BaseTestCase):
         return True
 
     def areAllBuildsWaiting(self):
-        ret = True
-
         builds = self.launcher.builds.values()
         for build in builds:
             client_job = None
@@ -1180,35 +1178,34 @@ class ZuulTestCase(BaseTestCase):
             if not client_job:
                 self.log.debug("%s is not known to the gearman client" %
                                build)
-                ret = False
-                continue
+                return False
             if not client_job.handle:
                 self.log.debug("%s has no handle" % client_job)
-                ret = False
-                continue
+                return False
             server_job = self.gearman_server.jobs.get(client_job.handle)
             if not server_job:
                 self.log.debug("%s is not known to the gearman server" %
                                client_job)
-                ret = False
-                continue
+                return False
             if not hasattr(server_job, 'waiting'):
                 self.log.debug("%s is being enqueued" % server_job)
-                ret = False
-                continue
+                return False
             if server_job.waiting:
                 continue
             worker_job = self.worker.gearman_jobs.get(server_job.unique)
             if worker_job:
+                if build.number is None:
+                    self.log.debug("%s has not reported start" % worker_job)
+                    return False
                 if worker_job.build.isWaiting():
                     continue
                 else:
                     self.log.debug("%s is running" % worker_job)
-                    ret = False
+                    return False
             else:
                 self.log.debug("%s is unassigned" % server_job)
-                ret = False
-        return ret
+                return False
+        return True
 
     def waitUntilSettled(self):
         self.log.debug("Waiting until settled...")
diff --git a/zuul/launcher/gearman.py b/zuul/launcher/gearman.py
index 915151e7b..653678a9a 100644
--- a/zuul/launcher/gearman.py
+++ b/zuul/launcher/gearman.py
@@ -404,14 +404,15 @@ class Gearman(object):
             self.log.debug("Removed build %s from queue" % build)
             return
 
+        time.sleep(1)
+
         self.log.debug("Still unable to find build %s to cancel" % build)
         if build.number:
             self.log.debug("Build %s has just started" % build)
-        else:
-            self.log.error("Build %s has not started but was not"
-                           "found in queue; canceling anyway" % build)
-        self.cancelRunningBuild(build)
-        self.log.debug("Canceled possibly running build %s" % build)
+            self.log.debug("Canceled running build %s" % build)
+            self.cancelRunningBuild(build)
+            return
+        self.log.debug("Unable to cancel build %s" % build)
 
     def onBuildCompleted(self, job, result=None):
         if job.unique in self.meta_jobs:
author	James E. Blair <jeblair@hp.com>	2015-04-02 16:37:15 -0700
committer	James E. Blair <jeblair@hp.com>	2015-04-02 16:44:00 -0700
commit	f15139b789d32aef361075487b1b7495a9807b05 (patch)
tree	2f80becb7ab163ef192c7f4eb278255a98bf5704
parent	96698e21ca554f16b5c6fe4ee6a5d9177fe05ab7 (diff)
download	zuul-f15139b789d32aef361075487b1b7495a9807b05.tar.gz