diff options
author | Steve Baker <sbaker@redhat.com> | 2015-11-10 13:09:10 +1300 |
---|---|---|
committer | Steve Baker <sbaker@redhat.com> | 2015-11-18 14:16:27 +1300 |
commit | 20f6676c8b8db371c7264d3c3c9510640a896a55 (patch) | |
tree | 6334f3f0f2ee0864320e5ba7555a386edd5c8b1e | |
parent | 9bf939178a8c263a7e236e14979dc52d8d526d4c (diff) | |
download | heat-20f6676c8b8db371c7264d3c3c9510640a896a55.tar.gz |
Reset stack status even when lock engine_id is None
When a stack is wedged in UPDATE_IN_PROGRESS due to another bug,
restarting heat-engine should have resulted in the stack being reset to
UPDATE_FAILED. However, the reset was skipped because the lock engine_id
was None.
This change removes the engine_id check since it is possible in practice
for an UPDATE_IN_PROGRESS stack to not be locked.
Since it is possible for the stack to change state in the unlocked
window, a check is added that the stack is still IN_PROGRESS before the
reset is performed.
Change-Id: I1739ccbdf75af35aac5be16b99200975df58b8e2
Closes-Bug: #1514615
(cherry picked from commit 44b2f0965dd329cf45a151705b0648819acff8c7)
-rw-r--r-- | heat/engine/service.py | 50 | ||||
-rw-r--r-- | heat/engine/stack_lock.py | 7 | ||||
-rw-r--r-- | heat/tests/test_engine_service.py | 53 |
3 files changed, 80 insertions, 30 deletions
diff --git a/heat/engine/service.py b/heat/engine/service.py index a2cc0e0c3..5d16bfd61 100644 --- a/heat/engine/service.py +++ b/heat/engine/service.py @@ -1597,30 +1597,50 @@ class EngineService(service.Service): def reset_stack_status(self): cnxt = context.get_admin_context() - filters = {'status': parser.Stack.IN_PROGRESS} + filters = { + 'status': parser.Stack.IN_PROGRESS, + 'convergence': False + } stacks = stack_object.Stack.get_all(cnxt, filters=filters, tenant_safe=False, show_nested=True) or [] for s in stacks: + stack_id = s.id stk = parser.Stack.load(cnxt, stack=s, use_stored_context=True) lock = stack_lock.StackLock(cnxt, stk, self.engine_id) - # If stacklock is released, means stack status may changed. engine_id = lock.get_engine_id() - if not engine_id: - continue - # Try to steal the lock and set status to failed. try: - lock.acquire(retry=False) + with lock.thread_lock(retry=False): + + # refetch stack and confirm it is still IN_PROGRESS + s = stack_object.Stack.get_by_id( + cnxt, + stack_id, + tenant_safe=False, + eager_load=True) + if s.status != parser.Stack.IN_PROGRESS: + lock.release() + continue + + stk = parser.Stack.load(cnxt, stack=s, + use_stored_context=True) + LOG.info(_LI('Engine %(engine)s went down when stack ' + '%(stack_id)s was in action %(action)s'), + {'engine': engine_id, 'action': stk.action, + 'stack_id': stk.id}) + + # Set stack and resources status to FAILED in sub thread + self.thread_group_mgr.start_with_acquired_lock( + stk, + lock, + self.set_stack_and_resource_to_failed, + stk + ) except exception.ActionInProgress: continue - LOG.info(_LI('Engine %(engine)s went down when stack %(stack_id)s' - ' was in action %(action)s'), - {'engine': engine_id, 'action': stk.action, - 'stack_id': stk.id}) - - # Set stack and resources status to FAILED in sub thread - self.thread_group_mgr.start_with_acquired_lock( - stk, lock, self.set_stack_and_resource_to_failed, stk - ) + except Exception: + LOG.exception(_LE('Error while 
resetting stack: %s') + % stack_id) + continue diff --git a/heat/engine/stack_lock.py b/heat/engine/stack_lock.py index c66758cda..627083b75 100644 --- a/heat/engine/stack_lock.py +++ b/heat/engine/stack_lock.py @@ -132,14 +132,17 @@ class StackLock(object): 'stack': stack_id}) @contextlib.contextmanager - def thread_lock(self, stack_id): + def thread_lock(self, stack_id, retry=True): """ Acquire a lock and release it only if there is an exception. The release method still needs to be scheduled to be run at the end of the thread using the Thread.link method. + + :param retry: When True, retry if lock was released while stealing. + :type retry: boolean """ try: - self.acquire() + self.acquire(retry) yield except exception.ActionInProgress: raise diff --git a/heat/tests/test_engine_service.py b/heat/tests/test_engine_service.py index aa9411384..8336cef4f 100644 --- a/heat/tests/test_engine_service.py +++ b/heat/tests/test_engine_service.py @@ -924,7 +924,7 @@ class StackServiceCreateUpdateDeleteTest(common.HeatTestCase): stack_lock.StackLock.try_acquire().AndReturn(self.man.engine_id) # this is to simulate lock release on DummyThreadGroup stop self.m.StubOutWithMock(stack_lock.StackLock, 'acquire') - stack_lock.StackLock.acquire().AndReturn(None) + stack_lock.StackLock.acquire(True).AndReturn(None) self.m.StubOutWithMock(self.man.thread_group_mgr, 'stop') self.man.thread_group_mgr.stop(stack.id).AndReturn(None) @@ -992,7 +992,7 @@ class StackServiceCreateUpdateDeleteTest(common.HeatTestCase): stack_identity=mox.IgnoreArg()).AndReturn(None) self.m.StubOutWithMock(stack_lock.StackLock, 'acquire') - stack_lock.StackLock.acquire().AndReturn(None) + stack_lock.StackLock.acquire(True).AndReturn(None) self.m.ReplayAll() self.assertIsNone(self.man.delete_stack(self.ctx, stack.identifier())) @@ -1019,7 +1019,7 @@ class StackServiceCreateUpdateDeleteTest(common.HeatTestCase): self.ctx, "other-engine-fake-uuid").AndReturn(False) self.m.StubOutWithMock(stack_lock.StackLock, 
'acquire') - stack_lock.StackLock.acquire().AndReturn(None) + stack_lock.StackLock.acquire(True).AndReturn(None) self.m.ReplayAll() self.assertIsNone(self.man.delete_stack(self.ctx, stack.identifier())) @@ -3492,6 +3492,7 @@ class StackServiceTest(common.HeatTestCase): @mock.patch('heat.engine.service.ThreadGroupManager', return_value=mock.Mock()) @mock.patch.object(stack_object.Stack, 'get_all') + @mock.patch.object(stack_object.Stack, 'get_by_id') @mock.patch('heat.engine.stack_lock.StackLock', return_value=mock.Mock()) @mock.patch.object(parser.Stack, 'load') @@ -3501,6 +3502,7 @@ class StackServiceTest(common.HeatTestCase): mock_admin_context, mock_stack_load, mock_stacklock, + mock_get_by_id, mock_get_all, mock_thread): mock_admin_context.return_value = self.ctx @@ -3509,7 +3511,19 @@ class StackServiceTest(common.HeatTestCase): db_stack.id = 'foo' db_stack.status = 'IN_PROGRESS' db_stack.status_reason = None - mock_get_all.return_value = [db_stack] + + unlocked_stack = mock.MagicMock() + unlocked_stack.id = 'bar' + unlocked_stack.status = 'IN_PROGRESS' + unlocked_stack.status_reason = None + + unlocked_stack_failed = mock.MagicMock() + unlocked_stack_failed.id = 'bar' + unlocked_stack_failed.status = 'FAILED' + unlocked_stack_failed.status_reason = 'because' + + mock_get_all.return_value = [db_stack, unlocked_stack] + mock_get_by_id.side_effect = [db_stack, unlocked_stack_failed] fake_stack = mock.MagicMock() fake_stack.action = 'CREATE' @@ -3518,26 +3532,39 @@ class StackServiceTest(common.HeatTestCase): mock_stack_load.return_value = fake_stack - fake_lock = mock.MagicMock() - fake_lock.get_engine_id.return_value = 'old-engine' - fake_lock.acquire.return_value = None - mock_stacklock.return_value = fake_lock + lock1 = mock.MagicMock() + lock1.get_engine_id.return_value = 'old-engine' + lock1.acquire.return_value = None + lock2 = mock.MagicMock() + lock2.acquire.return_value = None + mock_stacklock.side_effect = [lock1, lock2] self.eng.thread_group_mgr = 
mock_thread self.eng.reset_stack_status() mock_admin_context.assert_called_once_with() - filters = {'status': parser.Stack.IN_PROGRESS} + filters = { + 'status': parser.Stack.IN_PROGRESS, + 'convergence': False + } mock_get_all.assert_called_once_with(self.ctx, filters=filters, tenant_safe=False, show_nested=True) - mock_stack_load.assert_called_once_with(self.ctx, - stack=db_stack, - use_stored_context=True) + mock_get_by_id.assert_has_calls([ + mock.call(self.ctx, 'foo', tenant_safe=False, eager_load=True), + mock.call(self.ctx, 'bar', tenant_safe=False, eager_load=True), + ]) + mock_stack_load.assert_has_calls([ + mock.call(self.ctx, stack=db_stack, use_stored_context=True), + mock.call(self.ctx, stack=db_stack, use_stored_context=True), + mock.call(self.ctx, stack=unlocked_stack, + use_stored_context=True), + ]) + self.assertTrue(lock2.release.called) mock_thread.start_with_acquired_lock.assert_called_once_with( - fake_stack, fake_lock, + fake_stack, lock1, self.eng.set_stack_and_resource_to_failed, fake_stack ) |