author      James E. Blair <jeblair@redhat.com>    2017-10-31 16:40:36 -0700
committer   James E. Blair <jeblair@redhat.com>    2017-10-31 17:01:09 -0700
commit      872738fc33c51749af4d364a39b1c3bf4906ab48 (patch)
tree        f14c5b477ac687098094bf8a7ec84a484ee94dc8
parent      a1fa9aa1c624f302a06a91c3bcb2e608ccc93604 (diff)
download    zuul-872738fc33c51749af4d364a39b1c3bf4906ab48.tar.gz
On reconfiguration, re-enqueue items at the same position
Upon reconfiguration, we currently re-enqueue every item back
into its pipeline, in case a difference in the configuration would
change what should occur. In the simple case, this is fine, but
if a dependent pipeline has a branching dependency structure due
to already failing jobs, we would erroneously re-order the changes
back into a linear arrangement. The next pass through the pipeline
manager would move those items back to where they should be, but
in doing so, would reset their builds.
To correct this, after re-enqueueing a change, if the change that
was previously ahead of it in the pipeline was also successfully
re-enqueued, immediately move the change back behind it (or, if the
change ahead was None, meaning the change was its own head, move it
behind no change). This should have the effect of putting changes back
where they were in relation to other failing changes. If the change
ahead was not successfully re-enqueued, the current behavior of
simply putting it behind the nearest change in the queue is preserved.
If anything more complex happens, any errors will be corrected on the
next pass through the pipeline manager.
Change-Id: Ie3771d9bbbc1ca77425cf62751d8e5f70ba1f14c
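To make the ordering logic above concrete, here is a rough sketch in Python. ToyItem, ToyQueue, and reenqueue are hypothetical stand-ins invented for illustration, not Zuul's real QueueItem, ChangeQueue, or scheduler code; only the names enqueueItem, moveItem, item_ahead, items_behind, and items_to_remove mirror the diff below, and the bodies here are simplified assumptions.

# Hypothetical, simplified stand-ins for Zuul's queue objects; only the
# ordering behavior described in the commit message is modeled here.

class ToyItem(object):
    def __init__(self, name):
        self.name = name
        self.item_ahead = None
        self.items_behind = []


class ToyQueue(object):
    def __init__(self):
        self.queue = []

    def enqueueItem(self, item):
        # A plain enqueue attaches the item behind the nearest (last)
        # item in the queue, which is what linearizes a branched queue.
        item.item_ahead = self.queue[-1] if self.queue else None
        if item.item_ahead:
            item.item_ahead.items_behind.append(item)
        self.queue.append(item)

    def moveItem(self, item, item_ahead):
        # Detach the item from whatever it was enqueued behind and
        # re-attach it behind item_ahead (or behind nothing, making it
        # its own head, when item_ahead is None).
        if item.item_ahead is item_ahead:
            return
        if item.item_ahead:
            item.item_ahead.items_behind.remove(item)
        item.item_ahead = item_ahead
        if item_ahead:
            item_ahead.items_behind.append(item)


def reenqueue(old_items, items_to_remove):
    # Re-enqueue items into a fresh queue, restoring each item's old
    # position when the item that used to be ahead of it survived.
    new_queue = ToyQueue()
    for item in old_items:
        if item.item_ahead in items_to_remove:
            old_item_ahead, item_ahead_valid = None, False
        else:
            old_item_ahead, item_ahead_valid = item.item_ahead, True
        item.item_ahead = None
        item.items_behind = []
        new_queue.enqueueItem(item)
        if item_ahead_valid:
            new_queue.moveItem(item, old_item_ahead)
    return new_queue


# A was failing at the head, so B had become its own head with C behind
# it.  Restoring the old item_ahead keeps that shape instead of the
# linear A <- B <- C that a plain re-enqueue would produce.
a, b, c = ToyItem('A'), ToyItem('B'), ToyItem('C')
c.item_ahead = b
b.items_behind = [c]
reenqueue([a, b, c], items_to_remove=[])
assert b.item_ahead is None      # B is still its own head
assert c.item_ahead is b         # C stays behind B, not behind A

The key point is that each item's old item_ahead is captured before it is cleared, and the move is skipped when that item did not survive the reconfiguration, which falls back to the existing behavior of leaving the change behind the nearest item in the queue.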
-rw-r--r--   tests/fixtures/layouts/reconfigure-failed-head.yaml   56
-rwxr-xr-x   tests/unit/test_scheduler.py                          64
-rw-r--r--   zuul/manager/__init__.py                               8
-rw-r--r--   zuul/scheduler.py                                     11
4 files changed, 137 insertions(+), 2 deletions(-)
diff --git a/tests/fixtures/layouts/reconfigure-failed-head.yaml b/tests/fixtures/layouts/reconfigure-failed-head.yaml
new file mode 100644
index 000000000..3347c9b82
--- /dev/null
+++ b/tests/fixtures/layouts/reconfigure-failed-head.yaml
@@ -0,0 +1,56 @@
+- pipeline:
+    name: check
+    manager: independent
+    trigger:
+      gerrit:
+        - event: patchset-created
+    success:
+      gerrit:
+        Verified: 1
+    failure:
+      gerrit:
+        Verified: -1
+
+- pipeline:
+    name: gate
+    manager: dependent
+    success-message: Build succeeded (gate).
+    trigger:
+      gerrit:
+        - event: comment-added
+          approval:
+            - Approved: 1
+    success:
+      gerrit:
+        Verified: 2
+        submit: true
+    failure:
+      gerrit:
+        Verified: -2
+    start:
+      gerrit:
+        Verified: 0
+    precedence: high
+
+- job:
+    name: base
+    parent: null
+
+- job:
+    name: job1
+    run: playbooks/job1.yaml
+
+- job:
+    name: job2
+    run: playbooks/job2.yaml
+
+- project:
+    name: org/project
+    check:
+      jobs:
+        - job1
+        - job2
+    gate:
+      jobs:
+        - job1
+        - job2
diff --git a/tests/unit/test_scheduler.py b/tests/unit/test_scheduler.py
index b0df2b2df..53a20ffe2 100755
--- a/tests/unit/test_scheduler.py
+++ b/tests/unit/test_scheduler.py
@@ -3034,6 +3034,70 @@ class TestScheduler(ZuulTestCase):
         self.assertEqual(len(tenant.layout.pipelines['check'].queues), 0)
         self.assertIn('Build succeeded', A.messages[0])
 
+    @simple_layout("layouts/reconfigure-failed-head.yaml")
+    def test_live_reconfiguration_failed_change_at_head(self):
+        # Test that if we reconfigure with a failed change at head,
+        # that the change behind it isn't reparented onto it.
+
+        self.executor_server.hold_jobs_in_build = True
+        A = self.fake_gerrit.addFakeChange('org/project', 'master', 'A')
+        A.addApproval('Code-Review', 2)
+        B = self.fake_gerrit.addFakeChange('org/project', 'master', 'B')
+        B.addApproval('Code-Review', 2)
+
+        self.executor_server.failJob('job1', A)
+
+        self.fake_gerrit.addEvent(A.addApproval('Approved', 1))
+        self.fake_gerrit.addEvent(B.addApproval('Approved', 1))
+
+        self.waitUntilSettled()
+
+        self.assertBuilds([
+            dict(name='job1', changes='1,1'),
+            dict(name='job2', changes='1,1'),
+            dict(name='job1', changes='1,1 2,1'),
+            dict(name='job2', changes='1,1 2,1'),
+        ])
+
+        self.release(self.builds[0])
+        self.waitUntilSettled()
+
+        self.assertBuilds([
+            dict(name='job2', changes='1,1'),
+            dict(name='job1', changes='2,1'),
+            dict(name='job2', changes='2,1'),
+        ])
+
+        # Unordered history comparison because the aborts can finish
+        # in any order.
+        self.assertHistory([
+            dict(name='job1', result='FAILURE', changes='1,1'),
+            dict(name='job1', result='ABORTED', changes='1,1 2,1'),
+            dict(name='job2', result='ABORTED', changes='1,1 2,1'),
+        ], ordered=False)
+
+        self.sched.reconfigure(self.config)
+        self.waitUntilSettled()
+
+        self.executor_server.hold_jobs_in_build = False
+        self.executor_server.release()
+        self.waitUntilSettled()
+
+        self.assertBuilds([])
+
+        self.assertHistory([
+            dict(name='job1', result='FAILURE', changes='1,1'),
+            dict(name='job1', result='ABORTED', changes='1,1 2,1'),
+            dict(name='job2', result='ABORTED', changes='1,1 2,1'),
+            dict(name='job2', result='SUCCESS', changes='1,1'),
+            dict(name='job1', result='SUCCESS', changes='2,1'),
+            dict(name='job2', result='SUCCESS', changes='2,1'),
+        ], ordered=False)
+        self.assertEqual(A.data['status'], 'NEW')
+        self.assertEqual(B.data['status'], 'MERGED')
+        self.assertEqual(A.reported, 2)
+        self.assertEqual(B.reported, 2)
+
     def test_delayed_repo_init(self):
         self.init_repo("org/new-project")
         files = {'README': ''}
diff --git a/zuul/manager/__init__.py b/zuul/manager/__init__.py
index 9c30d7475..6c72c2d89 100644
--- a/zuul/manager/__init__.py
+++ b/zuul/manager/__init__.py
@@ -223,13 +223,19 @@ class PipelineManager(object):
             if item.change.equals(change):
                 self.removeItem(item)
 
-    def reEnqueueItem(self, item, last_head):
+    def reEnqueueItem(self, item, last_head, old_item_ahead, item_ahead_valid):
         with self.getChangeQueue(item.change, last_head.queue) as change_queue:
             if change_queue:
                 self.log.debug("Re-enqueing change %s in queue %s" %
                                (item.change, change_queue))
                 change_queue.enqueueItem(item)
+                # If the old item ahead was re-enqued, this value will
+                # be true, so we should attempt to move the item back
+                # to where it was in case an item ahead is already
+                # failing.
+                if item_ahead_valid:
+                    change_queue.moveItem(item, old_item_ahead)
 
                 # Get an updated copy of the layout and update the job
                 # graph if necessary.  This resumes the buildset merge
                 # state machine.  If we have an up-to-date layout, it
diff --git a/zuul/scheduler.py b/zuul/scheduler.py
index 73fa923a5..a725fcd68 100644
--- a/zuul/scheduler.py
+++ b/zuul/scheduler.py
@@ -614,13 +614,22 @@ class Scheduler(threading.Thread):
                         item.queue = None
                         item.change.project = self._reenqueueGetProject(
                             tenant, item)
+                        # If the old item ahead made it in, re-enqueue
+                        # this one behind it.
+                        if item.item_ahead in items_to_remove:
+                            old_item_ahead = None
+                            item_ahead_valid = False
+                        else:
+                            old_item_ahead = item.item_ahead
+                            item_ahead_valid = True
                         item.item_ahead = None
                         item.items_behind = []
                         reenqueued = False
                         if item.change.project:
                             try:
                                 reenqueued = new_pipeline.manager.reEnqueueItem(
-                                    item, last_head)
+                                    item, last_head, old_item_ahead,
+                                    item_ahead_valid=item_ahead_valid)
                             except Exception:
                                 self.log.exception(
                                     "Exception while re-enqueing item %s",