diff options
author | Steve Baker <sbaker@redhat.com> | 2015-11-10 13:09:10 +1300 |
---|---|---|
committer | Steve Baker <sbaker@redhat.com> | 2015-11-18 14:16:27 +1300 |
commit | 20f6676c8b8db371c7264d3c3c9510640a896a55 (patch) | |
tree | 6334f3f0f2ee0864320e5ba7555a386edd5c8b1e | |
parent | 9bf939178a8c263a7e236e14979dc52d8d526d4c (diff) | |
download | heat-20f6676c8b8db371c7264d3c3c9510640a896a55.tar.gz |
Reset stack status even when lock engine_id is None
When a stack is wedged in UPDATE_IN_PROGRESS due to another bug,
restarting heat-engine should have resulted in the stack being reset to
UPDATE_FAILED. However, the reset was skipped because the lock engine_id
was None.
This change removes the engine_id check since it is possible in practice
for an UPDATE_IN_PROGRESS stack to not be locked.
Since it is possible for the stack to change state in the unlocked
window, a check is added that the stack is still IN_PROGRESS before the
reset is performed.
Change-Id: I1739ccbdf75af35aac5be16b99200975df58b8e2
Closes-Bug: #1514615
(cherry picked from commit 44b2f0965dd329cf45a151705b0648819acff8c7)
-rw-r--r-- | heat/engine/service.py | 50 | ||||
-rw-r--r-- | heat/engine/stack_lock.py | 7 | ||||
-rw-r--r-- | heat/tests/test_engine_service.py | 53 |
3 files changed, 80 insertions, 30 deletions
diff --git a/heat/engine/service.py b/heat/engine/service.py index a2cc0e0c3..5d16bfd61 100644 --- a/heat/engine/service.py +++ b/heat/engine/service.py @@ -1597,30 +1597,50 @@ class EngineService(service.Service): def reset_stack_status(self): cnxt = context.get_admin_context() - filters = {'status': parser.Stack.IN_PROGRESS} + filters = { + 'status': parser.Stack.IN_PROGRESS, + 'convergence': False + } stacks = stack_object.Stack.get_all(cnxt, filters=filters, tenant_safe=False, show_nested=True) or [] for s in stacks: + stack_id = s.id stk = parser.Stack.load(cnxt, stack=s, use_stored_context=True) lock = stack_lock.StackLock(cnxt, stk, self.engine_id) - # If stacklock is released, means stack status may changed. engine_id = lock.get_engine_id() - if not engine_id: - continue - # Try to steal the lock and set status to failed. try: - lock.acquire(retry=False) + with lock.thread_lock(retry=False): + + # refetch stack and confirm it is still IN_PROGRESS + s = stack_object.Stack.get_by_id( + cnxt, + stack_id, + tenant_safe=False, + eager_load=True) + if s.status != parser.Stack.IN_PROGRESS: + lock.release() + continue + + stk = parser.Stack.load(cnxt, stack=s, + use_stored_context=True) + LOG.info(_LI('Engine %(engine)s went down when stack ' + '%(stack_id)s was in action %(action)s'), + {'engine': engine_id, 'action': stk.action, + 'stack_id': stk.id}) + + # Set stack and resources status to FAILED in sub thread + self.thread_group_mgr.start_with_acquired_lock( + stk, + lock, + self.set_stack_and_resource_to_failed, + stk + ) except exception.ActionInProgress: continue - LOG.info(_LI('Engine %(engine)s went down when stack %(stack_id)s' - ' was in action %(action)s'), - {'engine': engine_id, 'action': stk.action, - 'stack_id': stk.id}) - - # Set stack and resources status to FAILED in sub thread - self.thread_group_mgr.start_with_acquired_lock( - stk, lock, self.set_stack_and_resource_to_failed, stk - ) + except Exception: + LOG.exception(_LE('Error while 
resetting stack: %s') + % stack_id) + continue diff --git a/heat/engine/stack_lock.py b/heat/engine/stack_lock.py index c66758cda..627083b75 100644 --- a/heat/engine/stack_lock.py +++ b/heat/engine/stack_lock.py @@ -132,14 +132,17 @@ class StackLock(object): 'stack': stack_id}) @contextlib.contextmanager - def thread_lock(self, stack_id): + def thread_lock(self, stack_id, retry=True): """ Acquire a lock and release it only if there is an exception. The release method still needs to be scheduled to be run at the end of the thread using the Thread.link method. + + :param retry: When True, retry if lock was released while stealing. + :type retry: boolean """ try: - self.acquire() + self.acquire(retry) yield except exception.ActionInProgress: raise diff --git a/heat/tests/test_engine_service.py b/heat/tests/test_engine_service.py index aa9411384..8336cef4f 100644 --- a/heat/tests/test_engine_service.py +++ b/heat/tests/test_engine_service.py @@ -924,7 +924,7 @@ class StackServiceCreateUpdateDeleteTest(common.HeatTestCase): stack_lock.StackLock.try_acquire().AndReturn(self.man.engine_id) # this is to simulate lock release on DummyThreadGroup stop self.m.StubOutWithMock(stack_lock.StackLock, 'acquire') - stack_lock.StackLock.acquire().AndReturn(None) + stack_lock.StackLock.acquire(True).AndReturn(None) self.m.StubOutWithMock(self.man.thread_group_mgr, 'stop') self.man.thread_group_mgr.stop(stack.id).AndReturn(None) @@ -992,7 +992,7 @@ class StackServiceCreateUpdateDeleteTest(common.HeatTestCase): stack_identity=mox.IgnoreArg()).AndReturn(None) self.m.StubOutWithMock(stack_lock.StackLock, 'acquire') - stack_lock.StackLock.acquire().AndReturn(None) + stack_lock.StackLock.acquire(True).AndReturn(None) self.m.ReplayAll() self.assertIsNone(self.man.delete_stack(self.ctx, stack.identifier())) @@ -1019,7 +1019,7 @@ class StackServiceCreateUpdateDeleteTest(common.HeatTestCase): self.ctx, "other-engine-fake-uuid").AndReturn(False) self.m.StubOutWithMock(stack_lock.StackLock, 
'acquire') - stack_lock.StackLock.acquire().AndReturn(None) + stack_lock.StackLock.acquire(True).AndReturn(None) self.m.ReplayAll() self.assertIsNone(self.man.delete_stack(self.ctx, stack.identifier())) @@ -3492,6 +3492,7 @@ class StackServiceTest(common.HeatTestCase): @mock.patch('heat.engine.service.ThreadGroupManager', return_value=mock.Mock()) @mock.patch.object(stack_object.Stack, 'get_all') + @mock.patch.object(stack_object.Stack, 'get_by_id') @mock.patch('heat.engine.stack_lock.StackLock', return_value=mock.Mock()) @mock.patch.object(parser.Stack, 'load') @@ -3501,6 +3502,7 @@ class StackServiceTest(common.HeatTestCase): mock_admin_context, mock_stack_load, mock_stacklock, + mock_get_by_id, mock_get_all, mock_thread): mock_admin_context.return_value = self.ctx @@ -3509,7 +3511,19 @@ class StackServiceTest(common.HeatTestCase): db_stack.id = 'foo' db_stack.status = 'IN_PROGRESS' db_stack.status_reason = None - mock_get_all.return_value = [db_stack] + + unlocked_stack = mock.MagicMock() + unlocked_stack.id = 'bar' + unlocked_stack.status = 'IN_PROGRESS' + unlocked_stack.status_reason = None + + unlocked_stack_failed = mock.MagicMock() + unlocked_stack_failed.id = 'bar' + unlocked_stack_failed.status = 'FAILED' + unlocked_stack_failed.status_reason = 'because' + + mock_get_all.return_value = [db_stack, unlocked_stack] + mock_get_by_id.side_effect = [db_stack, unlocked_stack_failed] fake_stack = mock.MagicMock() fake_stack.action = 'CREATE' @@ -3518,26 +3532,39 @@ class StackServiceTest(common.HeatTestCase): mock_stack_load.return_value = fake_stack - fake_lock = mock.MagicMock() - fake_lock.get_engine_id.return_value = 'old-engine' - fake_lock.acquire.return_value = None - mock_stacklock.return_value = fake_lock + lock1 = mock.MagicMock() + lock1.get_engine_id.return_value = 'old-engine' + lock1.acquire.return_value = None + lock2 = mock.MagicMock() + lock2.acquire.return_value = None + mock_stacklock.side_effect = [lock1, lock2] self.eng.thread_group_mgr = 
mock_thread self.eng.reset_stack_status() mock_admin_context.assert_called_once_with() - filters = {'status': parser.Stack.IN_PROGRESS} + filters = { + 'status': parser.Stack.IN_PROGRESS, + 'convergence': False + } mock_get_all.assert_called_once_with(self.ctx, filters=filters, tenant_safe=False, show_nested=True) - mock_stack_load.assert_called_once_with(self.ctx, - stack=db_stack, - use_stored_context=True) + mock_get_by_id.assert_has_calls([ + mock.call(self.ctx, 'foo', tenant_safe=False, eager_load=True), + mock.call(self.ctx, 'bar', tenant_safe=False, eager_load=True), + ]) + mock_stack_load.assert_has_calls([ + mock.call(self.ctx, stack=db_stack, use_stored_context=True), + mock.call(self.ctx, stack=db_stack, use_stored_context=True), + mock.call(self.ctx, stack=unlocked_stack, + use_stored_context=True), + ]) + self.assertTrue(lock2.release.called) mock_thread.start_with_acquired_lock.assert_called_once_with( - fake_stack, fake_lock, + fake_stack, lock1, self.eng.set_stack_and_resource_to_failed, fake_stack ) |