summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorChris Krelle <ckrelle@nvidia.com>2023-04-12 08:58:35 -0700
committerChris Krelle <ckrelle@nvidia.com>2023-04-24 16:20:54 -0700
commit510a612eed6dc6e893d6101574b5f84ca5757762 (patch)
tree70fa32f1e6c75ffecd16fa1ddd8e81559273a97b
parent8ef9db15704c0c2cb1342c7c1554bfa8d8a7a2e3 (diff)
downloadironic-510a612eed6dc6e893d6101574b5f84ca5757762.tar.gz
Add ability to power off nodes in clean failed
We have seen duplicate IP issues when leaving clean failed nodes powered on. This patch allows operators to power down nodes that enter the clean failed state. Change-Id: Iecb402227485fe0ba787a262121c9d6a048b0e13
-rw-r--r--ironic/conductor/utils.py5
-rw-r--r--ironic/conf/conductor.py8
-rw-r--r--ironic/tests/unit/conductor/test_cleaning.py30
-rw-r--r--releasenotes/notes/Cleanfail-power-off-13b5fdcc2727866a.yaml8
4 files changed, 51 insertions, 0 deletions
diff --git a/ironic/conductor/utils.py b/ironic/conductor/utils.py
index 2272c0df7..add9ee74d 100644
--- a/ironic/conductor/utils.py
+++ b/ironic/conductor/utils.py
@@ -499,6 +499,11 @@ def cleaning_error_handler(task, logmsg, errmsg=None, traceback=False,
# NOTE(dtantsur): avoid overwriting existing maintenance_reason
if not node.maintenance_reason and set_maintenance:
node.maintenance_reason = errmsg
+
+ if CONF.conductor.poweroff_in_cleanfail:
+ # NOTE(NobodyCam): Power off node in clean fail
+ node_power_action(task, states.POWER_OFF)
+
node.save()
if set_fail_state and node.provision_state != states.CLEANFAIL:
diff --git a/ironic/conf/conductor.py b/ironic/conf/conductor.py
index 653e30f56..2452fafe7 100644
--- a/ironic/conf/conductor.py
+++ b/ironic/conf/conductor.py
@@ -349,6 +349,14 @@ opts = [
'is a global setting applying to all requests this '
'conductor receives, regardless of access rights. '
'The concurrent clean limit cannot be disabled.')),
+
+ cfg.BoolOpt('poweroff_in_cleanfail',
+ default=False,
+ help=_('If True power off nodes in the ``clean failed`` '
+ 'state. Default False. Option may be unsafe '
+ 'when using Cleaning to perform '
+ 'hardware-transformative actions such as '
+ 'firmware upgrade.')),
]
diff --git a/ironic/tests/unit/conductor/test_cleaning.py b/ironic/tests/unit/conductor/test_cleaning.py
index 34e805deb..cdfbf14ee 100644
--- a/ironic/tests/unit/conductor/test_cleaning.py
+++ b/ironic/tests/unit/conductor/test_cleaning.py
@@ -436,6 +436,36 @@ class DoNodeCleanTestCase(db_base.DbTestCase):
self.assertFalse(node.maintenance)
self.assertIsNone(node.fault)
+ @mock.patch('ironic.drivers.modules.fake.FakePower.set_power_state',
+ autospec=True)
+ @mock.patch.object(n_flat.FlatNetwork, 'validate', autospec=True)
+ @mock.patch.object(conductor_steps, 'set_node_cleaning_steps',
+ autospec=True)
+ def test_do_node_clean_steps_fail_poweroff(self, mock_steps, mock_validate,
+ mock_power, clean_steps=None,
+ invalid_exc=True):
+ if invalid_exc:
+ mock_steps.side_effect = exception.InvalidParameterValue('invalid')
+ else:
+ mock_steps.side_effect = exception.NodeCleaningFailure('failure')
+ tgt_prov_state = states.MANAGEABLE if clean_steps else states.AVAILABLE
+ self.config(poweroff_in_cleanfail=True, group='conductor')
+ node = obj_utils.create_test_node(
+ self.context, driver='fake-hardware',
+ uuid=uuidutils.generate_uuid(),
+ provision_state=states.CLEANING,
+ power_state=states.POWER_ON,
+ target_provision_state=tgt_prov_state)
+ with task_manager.acquire(
+ self.context, node.uuid, shared=False) as task:
+ cleaning.do_node_clean(task, clean_steps=clean_steps)
+ mock_validate.assert_called_once_with(mock.ANY, task)
+ node.refresh()
+ self.assertEqual(states.CLEANFAIL, node.provision_state)
+ self.assertEqual(tgt_prov_state, node.target_provision_state)
+ mock_steps.assert_called_once_with(mock.ANY, disable_ramdisk=False)
+ self.assertTrue(mock_power.called)
+
def test__do_node_clean_automated_steps_fail(self):
for invalid in (True, False):
self.__do_node_clean_steps_fail(invalid_exc=invalid)
diff --git a/releasenotes/notes/Cleanfail-power-off-13b5fdcc2727866a.yaml b/releasenotes/notes/Cleanfail-power-off-13b5fdcc2727866a.yaml
new file mode 100644
index 000000000..2856bac06
--- /dev/null
+++ b/releasenotes/notes/Cleanfail-power-off-13b5fdcc2727866a.yaml
@@ -0,0 +1,8 @@
+---
+features:
+ - |
+ Add new conductor conf option: [conductor]poweroff_in_cleanfail
+ (default: False). When True, nodes entering the clean failed state
+ will be powered off. This option may be unsafe when using
+ Cleaning to perform hardware-transformative actions such as
+ firmware upgrade.