diff options
author | Ruby Loo <rloo@oath.com> | 2018-11-07 21:26:31 +0000 |
---|---|---|
committer | Ruby Loo <opensrloo@gmail.com> | 2018-11-26 16:21:26 +0000 |
commit | 162e5f3a8684db311db250b0db8c82fbb26b4412 (patch) | |
tree | cd8f9ac41fa9eecb70a976f782be00f3774fc39a | |
parent | a1c8f31cdfd4402e1ff939a90bad76bad6be7006 (diff) | |
download | ironic-162e5f3a8684db311db250b0db8c82fbb26b4412.tar.gz |
Don't fail when node is in CLEANFAIL state
When a timeout occurs when a node is in CLEANWAIT state, the conductor
puts it into the CLEANFAIL state. However, it tries to do that twice, and
our state machine doesn't support moving from a CLEANFAIL state to another
state via the 'fail' verb/event.
The code was changed so that it doesn't try to move it to CLEANFAIL twice,
and a check is put to prevent the node from being 'failed' frome a CLEANFAIL
state.
Change-Id: Ieeb77dd28a5d3053588c46fe2a700b5e6ceabbd7
Story: 2004299
Task: 27855
(cherry picked from commit 78ae60f11ee708442249c3b00f06d5e0fe73910d)
-rw-r--r-- | ironic/conductor/utils.py | 8 | ||||
-rw-r--r-- | ironic/tests/unit/conductor/test_utils.py | 30 | ||||
-rw-r--r-- | releasenotes/notes/cleanwait_timeout_fail-4323ba7d4d4da3e6.yaml | 8 |
3 files changed, 35 insertions, 11 deletions
diff --git a/ironic/conductor/utils.py b/ironic/conductor/utils.py index 29772e0dc..3c2e762e2 100644 --- a/ironic/conductor/utils.py +++ b/ironic/conductor/utils.py @@ -338,8 +338,10 @@ def cleanup_cleanwait_timeout(task): "check if the ramdisk responsible for the cleaning is " "running on the node. Failed on step %(step)s.") % {'step': task.node.clean_step}) - cleaning_error_handler(task, msg=last_error, - set_fail_state=True) + # NOTE(rloo): this is called from the periodic task for cleanwait timeouts, + # via the task manager's process_event(). The node has already been moved + # to CLEANFAIL, so the error handler doesn't need to set the fail state. + cleaning_error_handler(task, msg=last_error, set_fail_state=False) def cleaning_error_handler(task, msg, tear_down_cleaning=True, @@ -375,7 +377,7 @@ def cleaning_error_handler(task, msg, tear_down_cleaning=True, node.maintenance_reason = msg node.save() - if set_fail_state: + if set_fail_state and node.provision_state != states.CLEANFAIL: target_state = states.MANAGEABLE if manual_clean else None task.process_event('fail', target_state=target_state) diff --git a/ironic/tests/unit/conductor/test_utils.py b/ironic/tests/unit/conductor/test_utils.py index 630557489..7772bdd7e 100644 --- a/ironic/tests/unit/conductor/test_utils.py +++ b/ironic/tests/unit/conductor/test_utils.py @@ -1164,7 +1164,7 @@ class ErrorHandlersTestCase(tests_base.TestCase): msg="Timeout reached while cleaning the node. Please " "check if the ramdisk responsible for the cleaning is " "running on the node. Failed on step {}.", - set_fail_state=True) + set_fail_state=False) def test_cleanup_cleanwait_timeout(self): self.node.provision_state = states.CLEANFAIL @@ -1181,16 +1181,18 @@ class ErrorHandlersTestCase(tests_base.TestCase): conductor_utils.cleanup_cleanwait_timeout(self.task) self.assertEqual({}, self.node.clean_step) self.assertNotIn('clean_step_index', self.node.driver_internal_info) - self.task.process_event.assert_called_once_with('fail', - target_state=None) + self.assertFalse(self.task.process_event.called) self.assertTrue(self.node.maintenance) self.assertEqual(clean_error, self.node.maintenance_reason) - def test_cleaning_error_handler(self): - self.node.provision_state = states.CLEANING + def _test_cleaning_error_handler(self, prov_state=states.CLEANING): + self.node.provision_state = prov_state target = 'baz' self.node.target_provision_state = target - self.node.driver_internal_info = {} + self.node.clean_step = {'key': 'val'} + self.node.driver_internal_info = { + 'cleaning_reboot': True, + 'clean_step_index': 0} msg = 'error bar' conductor_utils.cleaning_error_handler(self.task, msg) self.node.save.assert_called_once_with() @@ -1201,8 +1203,20 @@ class ErrorHandlersTestCase(tests_base.TestCase): self.assertEqual(msg, self.node.maintenance_reason) driver = self.task.driver.deploy driver.tear_down_cleaning.assert_called_once_with(self.task) - self.task.process_event.assert_called_once_with('fail', - target_state=None) + if prov_state == states.CLEANFAIL: + self.assertFalse(self.task.process_event.called) + else: + self.task.process_event.assert_called_once_with('fail', + target_state=None) + + def test_cleaning_error_handler(self): + self._test_cleaning_error_handler() + + def test_cleaning_error_handler_cleanwait(self): + self._test_cleaning_error_handler(prov_state=states.CLEANWAIT) + + def test_cleaning_error_handler_cleanfail(self): + self._test_cleaning_error_handler(prov_state=states.CLEANFAIL) def test_cleaning_error_handler_manual(self): target = states.MANAGEABLE diff --git a/releasenotes/notes/cleanwait_timeout_fail-4323ba7d4d4da3e6.yaml b/releasenotes/notes/cleanwait_timeout_fail-4323ba7d4d4da3e6.yaml new file mode 100644 index 000000000..9587fe80d --- /dev/null +++ b/releasenotes/notes/cleanwait_timeout_fail-4323ba7d4d4da3e6.yaml @@ -0,0 +1,8 @@ +--- +fixes: + - | + Fixes an issue with a baremetal node that times out during cleaning. + The ironic-conductor was attempting to change the node's provision state + to 'clean failed' twice, resulting in the node's ``last_error`` being set + incorrectly. This no longer happens. For more information, see + `story 2004299 <https://storyboard.openstack.org/#!/story/2004299>`_. |