From 510a612eed6dc6e893d6101574b5f84ca5757762 Mon Sep 17 00:00:00 2001 From: Chris Krelle Date: Wed, 12 Apr 2023 08:58:35 -0700 Subject: Add ablity to power off nodes in clean failed We have seen duplicate ip issues when leaving clean failed nodes powered on. This patch allows operators to power down nodes that enter clean failed state. Change-Id: Iecb402227485fe0ba787a262121c9d6a048b0e13 --- ironic/conductor/utils.py | 5 ++++ ironic/conf/conductor.py | 8 ++++++ ironic/tests/unit/conductor/test_cleaning.py | 30 ++++++++++++++++++++++ .../Cleanfail-power-off-13b5fdcc2727866a.yaml | 8 ++++++ 4 files changed, 51 insertions(+) create mode 100644 releasenotes/notes/Cleanfail-power-off-13b5fdcc2727866a.yaml diff --git a/ironic/conductor/utils.py b/ironic/conductor/utils.py index 2272c0df7..add9ee74d 100644 --- a/ironic/conductor/utils.py +++ b/ironic/conductor/utils.py @@ -499,6 +499,11 @@ def cleaning_error_handler(task, logmsg, errmsg=None, traceback=False, # NOTE(dtantsur): avoid overwriting existing maintenance_reason if not node.maintenance_reason and set_maintenance: node.maintenance_reason = errmsg + + if CONF.conductor.poweroff_in_cleanfail: + # NOTE(NobodyCam): Power off node in clean fail + node_power_action(task, states.POWER_OFF) + node.save() if set_fail_state and node.provision_state != states.CLEANFAIL: diff --git a/ironic/conf/conductor.py b/ironic/conf/conductor.py index 653e30f56..2452fafe7 100644 --- a/ironic/conf/conductor.py +++ b/ironic/conf/conductor.py @@ -349,6 +349,14 @@ opts = [ 'is a global setting applying to all requests this ' 'conductor receives, regardless of access rights. ' 'The concurrent clean limit cannot be disabled.')), + + cfg.BoolOpt('poweroff_in_cleanfail', + default=False, + help=_('If True power off nodes in the ``clean failed`` ' + 'state. Default False. Option may be unsafe ' + 'when using Cleaning to perform ' + 'hardware-transformative actions such as ' + 'firmware upgrade.')), ] diff --git a/ironic/tests/unit/conductor/test_cleaning.py b/ironic/tests/unit/conductor/test_cleaning.py index 34e805deb..cdfbf14ee 100644 --- a/ironic/tests/unit/conductor/test_cleaning.py +++ b/ironic/tests/unit/conductor/test_cleaning.py @@ -436,6 +436,36 @@ class DoNodeCleanTestCase(db_base.DbTestCase): self.assertFalse(node.maintenance) self.assertIsNone(node.fault) + @mock.patch('ironic.drivers.modules.fake.FakePower.set_power_state', + autospec=True) + @mock.patch.object(n_flat.FlatNetwork, 'validate', autospec=True) + @mock.patch.object(conductor_steps, 'set_node_cleaning_steps', + autospec=True) + def test_do_node_clean_steps_fail_poweroff(self, mock_steps, mock_validate, + mock_power, clean_steps=None, + invalid_exc=True): + if invalid_exc: + mock_steps.side_effect = exception.InvalidParameterValue('invalid') + else: + mock_steps.side_effect = exception.NodeCleaningFailure('failure') + tgt_prov_state = states.MANAGEABLE if clean_steps else states.AVAILABLE + self.config(poweroff_in_cleanfail=True, group='conductor') + node = obj_utils.create_test_node( + self.context, driver='fake-hardware', + uuid=uuidutils.generate_uuid(), + provision_state=states.CLEANING, + power_state=states.POWER_ON, + target_provision_state=tgt_prov_state) + with task_manager.acquire( + self.context, node.uuid, shared=False) as task: + cleaning.do_node_clean(task, clean_steps=clean_steps) + mock_validate.assert_called_once_with(mock.ANY, task) + node.refresh() + self.assertEqual(states.CLEANFAIL, node.provision_state) + self.assertEqual(tgt_prov_state, node.target_provision_state) + mock_steps.assert_called_once_with(mock.ANY, disable_ramdisk=False) + self.assertTrue(mock_power.called) + def test__do_node_clean_automated_steps_fail(self): for invalid in (True, False): self.__do_node_clean_steps_fail(invalid_exc=invalid) diff --git a/releasenotes/notes/Cleanfail-power-off-13b5fdcc2727866a.yaml b/releasenotes/notes/Cleanfail-power-off-13b5fdcc2727866a.yaml new file mode 100644 index 000000000..2856bac06 --- /dev/null +++ b/releasenotes/notes/Cleanfail-power-off-13b5fdcc2727866a.yaml @@ -0,0 +1,8 @@ +--- +features: + - | + Add new conductor conf option: [conductor]poweroff_in_cleanfail + (default: False). when True nodes entering clean failed state + will be powered off. This option may be unsafe when using + Cleaning to perform hardware-transformative actions such as + firmware upgrade. -- cgit v1.2.1