Don't fail when node is in CLEANFAIL state

When a timeout occurs when a node is in CLEANWAIT state, the conductor puts it into the CLEANFAIL state. However, it tries to do that twice, and our state machine doesn't support moving from a CLEANFAIL state to another state via the 'fail' verb/event. The code was changed so that it doesn't try to move it to CLEANFAIL twice, and a check is put to prevent the node from being 'failed' frome a CLEANFAIL state. Change-Id: Ieeb77dd28a5d3053588c46fe2a700b5e6ceabbd7 Story: 2004299 Task: 27855 (cherry picked from commit 78ae60f11ee708442249c3b00f06d5e0fe73910d)
author: Ruby Loo <rloo@oath.com> 2018-11-07 21:26:31 +0000
committer: Ruby Loo <opensrloo@gmail.com> 2018-11-26 16:21:26 +0000
commit: 162e5f3a8684db311db250b0db8c82fbb26b4412 (patch)
tree: cd8f9ac41fa9eecb70a976f782be00f3774fc39a
parent: a1c8f31cdfd4402e1ff939a90bad76bad6be7006 (diff)
download: ironic-162e5f3a8684db311db250b0db8c82fbb26b4412.tar.gz
3 files changed, 35 insertions, 11 deletions
diff --git a/ironic/conductor/utils.py b/ironic/conductor/utils.py
index 29772e0dc..3c2e762e2 100644
--- a/ironic/conductor/utils.py
+++ b/ironic/conductor/utils.py
@@ -338,8 +338,10 @@ def cleanup_cleanwait_timeout(task):
                     "check if the ramdisk responsible for the cleaning is "
                     "running on the node. Failed on step %(step)s.") %
                   {'step': task.node.clean_step})
-    cleaning_error_handler(task, msg=last_error,
-                           set_fail_state=True)
+    # NOTE(rloo): this is called from the periodic task for cleanwait timeouts,
+    # via the task manager's process_event(). The node has already been moved
+    # to CLEANFAIL, so the error handler doesn't need to set the fail state.
+    cleaning_error_handler(task, msg=last_error, set_fail_state=False)
 
 
 def cleaning_error_handler(task, msg, tear_down_cleaning=True,
@@ -375,7 +377,7 @@ def cleaning_error_handler(task, msg, tear_down_cleaning=True,
     node.maintenance_reason = msg
     node.save()
 
-    if set_fail_state:
+    if set_fail_state and node.provision_state != states.CLEANFAIL:
         target_state = states.MANAGEABLE if manual_clean else None
         task.process_event('fail', target_state=target_state)
 
diff --git a/ironic/tests/unit/conductor/test_utils.py b/ironic/tests/unit/conductor/test_utils.py
index 630557489..7772bdd7e 100644
--- a/ironic/tests/unit/conductor/test_utils.py
+++ b/ironic/tests/unit/conductor/test_utils.py
@@ -1164,7 +1164,7 @@ class ErrorHandlersTestCase(tests_base.TestCase):
             msg="Timeout reached while cleaning the node. Please "
                 "check if the ramdisk responsible for the cleaning is "
                 "running on the node. Failed on step {}.",
-            set_fail_state=True)
+            set_fail_state=False)
 
     def test_cleanup_cleanwait_timeout(self):
         self.node.provision_state = states.CLEANFAIL
@@ -1181,16 +1181,18 @@ class ErrorHandlersTestCase(tests_base.TestCase):
         conductor_utils.cleanup_cleanwait_timeout(self.task)
         self.assertEqual({}, self.node.clean_step)
         self.assertNotIn('clean_step_index', self.node.driver_internal_info)
-        self.task.process_event.assert_called_once_with('fail',
-                                                        target_state=None)
+        self.assertFalse(self.task.process_event.called)
         self.assertTrue(self.node.maintenance)
         self.assertEqual(clean_error, self.node.maintenance_reason)
 
-    def test_cleaning_error_handler(self):
-        self.node.provision_state = states.CLEANING
+    def _test_cleaning_error_handler(self, prov_state=states.CLEANING):
+        self.node.provision_state = prov_state
         target = 'baz'
         self.node.target_provision_state = target
-        self.node.driver_internal_info = {}
+        self.node.clean_step = {'key': 'val'}
+        self.node.driver_internal_info = {
+            'cleaning_reboot': True,
+            'clean_step_index': 0}
         msg = 'error bar'
         conductor_utils.cleaning_error_handler(self.task, msg)
         self.node.save.assert_called_once_with()
@@ -1201,8 +1203,20 @@ class ErrorHandlersTestCase(tests_base.TestCase):
         self.assertEqual(msg, self.node.maintenance_reason)
         driver = self.task.driver.deploy
         driver.tear_down_cleaning.assert_called_once_with(self.task)
-        self.task.process_event.assert_called_once_with('fail',
-                                                        target_state=None)
+        if prov_state == states.CLEANFAIL:
+            self.assertFalse(self.task.process_event.called)
+        else:
+            self.task.process_event.assert_called_once_with('fail',
+                                                            target_state=None)
+
+    def test_cleaning_error_handler(self):
+        self._test_cleaning_error_handler()
+
+    def test_cleaning_error_handler_cleanwait(self):
+        self._test_cleaning_error_handler(prov_state=states.CLEANWAIT)
+
+    def test_cleaning_error_handler_cleanfail(self):
+        self._test_cleaning_error_handler(prov_state=states.CLEANFAIL)
 
     def test_cleaning_error_handler_manual(self):
         target = states.MANAGEABLE
diff --git a/releasenotes/notes/cleanwait_timeout_fail-4323ba7d4d4da3e6.yaml b/releasenotes/notes/cleanwait_timeout_fail-4323ba7d4d4da3e6.yaml
new file mode 100644
index 000000000..9587fe80d
--- /dev/null
+++ b/releasenotes/notes/cleanwait_timeout_fail-4323ba7d4d4da3e6.yaml
@@ -0,0 +1,8 @@
+---
+fixes:
+  - |
+    Fixes an issue with a baremetal node that times out during cleaning.
+    The ironic-conductor was attempting to change the node's provision state
+    to 'clean failed' twice, resulting in the node's ``last_error`` being set
+    incorrectly. This no longer happens. For more information, see
+    `story 2004299 <https://storyboard.openstack.org/#!/story/2004299>`_.
author	Ruby Loo <rloo@oath.com>	2018-11-07 21:26:31 +0000
committer	Ruby Loo <opensrloo@gmail.com>	2018-11-26 16:21:26 +0000
commit	162e5f3a8684db311db250b0db8c82fbb26b4412 (patch)
tree	cd8f9ac41fa9eecb70a976f782be00f3774fc39a
parent	a1c8f31cdfd4402e1ff939a90bad76bad6be7006 (diff)
download	ironic-162e5f3a8684db311db250b0db8c82fbb26b4412.tar.gz