diff options
author | Chris Krelle <ckrelle@nvidia.com> | 2023-04-12 08:58:35 -0700 |
---|---|---|
committer | Chris Krelle <ckrelle@nvidia.com> | 2023-04-24 16:20:54 -0700 |
commit | 510a612eed6dc6e893d6101574b5f84ca5757762 (patch) | |
tree | 70fa32f1e6c75ffecd16fa1ddd8e81559273a97b | |
parent | 8ef9db15704c0c2cb1342c7c1554bfa8d8a7a2e3 (diff) | |
download | ironic-510a612eed6dc6e893d6101574b5f84ca5757762.tar.gz |
Add ability to power off nodes in clean failed state
We have seen duplicate IP address issues when leaving clean failed nodes
powered on. This patch adds the [conductor]poweroff_in_cleanfail option
(default: False), which allows operators to power down nodes that
enter the clean failed state.
Change-Id: Iecb402227485fe0ba787a262121c9d6a048b0e13
-rw-r--r-- | ironic/conductor/utils.py | 5 | ||||
-rw-r--r-- | ironic/conf/conductor.py | 8 | ||||
-rw-r--r-- | ironic/tests/unit/conductor/test_cleaning.py | 30 | ||||
-rw-r--r-- | releasenotes/notes/Cleanfail-power-off-13b5fdcc2727866a.yaml | 8 |
4 files changed, 51 insertions, 0 deletions
diff --git a/ironic/conductor/utils.py b/ironic/conductor/utils.py index 2272c0df7..add9ee74d 100644 --- a/ironic/conductor/utils.py +++ b/ironic/conductor/utils.py @@ -499,6 +499,11 @@ def cleaning_error_handler(task, logmsg, errmsg=None, traceback=False, # NOTE(dtantsur): avoid overwriting existing maintenance_reason if not node.maintenance_reason and set_maintenance: node.maintenance_reason = errmsg + + if CONF.conductor.poweroff_in_cleanfail: + # NOTE(NobodyCam): Power off node in clean fail + node_power_action(task, states.POWER_OFF) + node.save() if set_fail_state and node.provision_state != states.CLEANFAIL: diff --git a/ironic/conf/conductor.py b/ironic/conf/conductor.py index 653e30f56..2452fafe7 100644 --- a/ironic/conf/conductor.py +++ b/ironic/conf/conductor.py @@ -349,6 +349,14 @@ opts = [ 'is a global setting applying to all requests this ' 'conductor receives, regardless of access rights. ' 'The concurrent clean limit cannot be disabled.')), + + cfg.BoolOpt('poweroff_in_cleanfail', + default=False, + help=_('If True power off nodes in the ``clean failed`` ' + 'state. Default False. 
Option may be unsafe ' + 'when using Cleaning to perform ' + 'hardware-transformative actions such as ' + 'firmware upgrade.')), ] diff --git a/ironic/tests/unit/conductor/test_cleaning.py b/ironic/tests/unit/conductor/test_cleaning.py index 34e805deb..cdfbf14ee 100644 --- a/ironic/tests/unit/conductor/test_cleaning.py +++ b/ironic/tests/unit/conductor/test_cleaning.py @@ -436,6 +436,36 @@ class DoNodeCleanTestCase(db_base.DbTestCase): self.assertFalse(node.maintenance) self.assertIsNone(node.fault) + @mock.patch('ironic.drivers.modules.fake.FakePower.set_power_state', + autospec=True) + @mock.patch.object(n_flat.FlatNetwork, 'validate', autospec=True) + @mock.patch.object(conductor_steps, 'set_node_cleaning_steps', + autospec=True) + def test_do_node_clean_steps_fail_poweroff(self, mock_steps, mock_validate, + mock_power, clean_steps=None, + invalid_exc=True): + if invalid_exc: + mock_steps.side_effect = exception.InvalidParameterValue('invalid') + else: + mock_steps.side_effect = exception.NodeCleaningFailure('failure') + tgt_prov_state = states.MANAGEABLE if clean_steps else states.AVAILABLE + self.config(poweroff_in_cleanfail=True, group='conductor') + node = obj_utils.create_test_node( + self.context, driver='fake-hardware', + uuid=uuidutils.generate_uuid(), + provision_state=states.CLEANING, + power_state=states.POWER_ON, + target_provision_state=tgt_prov_state) + with task_manager.acquire( + self.context, node.uuid, shared=False) as task: + cleaning.do_node_clean(task, clean_steps=clean_steps) + mock_validate.assert_called_once_with(mock.ANY, task) + node.refresh() + self.assertEqual(states.CLEANFAIL, node.provision_state) + self.assertEqual(tgt_prov_state, node.target_provision_state) + mock_steps.assert_called_once_with(mock.ANY, disable_ramdisk=False) + self.assertTrue(mock_power.called) + def test__do_node_clean_automated_steps_fail(self): for invalid in (True, False): self.__do_node_clean_steps_fail(invalid_exc=invalid) diff --git 
a/releasenotes/notes/Cleanfail-power-off-13b5fdcc2727866a.yaml b/releasenotes/notes/Cleanfail-power-off-13b5fdcc2727866a.yaml new file mode 100644 index 000000000..2856bac06 --- /dev/null +++ b/releasenotes/notes/Cleanfail-power-off-13b5fdcc2727866a.yaml @@ -0,0 +1,8 @@ +--- +features: + - | + Add new conductor conf option: [conductor]poweroff_in_cleanfail + (default: False). when True nodes entering clean failed state + will be powered off. This option may be unsafe when using + Cleaning to perform hardware-transformative actions such as + firmware upgrade. |