summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--doc/source/admin/cleaning.rst12
-rw-r--r--ironic/conductor/utils.py19
-rw-r--r--ironic/tests/unit/conductor/test_cleaning.py14
-rw-r--r--ironic/tests/unit/conductor/test_manager.py11
-rw-r--r--releasenotes/notes/redundant-maintenance-09849674334f656a.yaml5
5 files changed, 48 insertions, 13 deletions
diff --git a/doc/source/admin/cleaning.rst b/doc/source/admin/cleaning.rst
index 8338a8b17..9cfd628c7 100644
--- a/doc/source/admin/cleaning.rst
+++ b/doc/source/admin/cleaning.rst
@@ -314,9 +314,15 @@ cleaning.
Troubleshooting
===============
-If cleaning fails on a node, the node will be put into ``clean failed`` state
-and placed in maintenance mode, to prevent ironic from taking actions on the
-node.
+If cleaning fails on a node, the node will be put into ``clean failed`` state.
+If the failure happens while running a clean step, the node is also placed in
+maintenance mode to prevent ironic from taking actions on the node. The
+operator should validate that no permanent damage has been done to the
+node and no processes are still running on it before removing the maintenance
+mode.
+
+.. note:: Older versions of ironic may put the node to maintenance even when
+ no clean step has been running.
Nodes in ``clean failed`` will not be powered off, as the node might be in a
state such that powering it off could damage the node or remove useful
diff --git a/ironic/conductor/utils.py b/ironic/conductor/utils.py
index 081c6ce42..fccb9261c 100644
--- a/ironic/conductor/utils.py
+++ b/ironic/conductor/utils.py
@@ -407,9 +407,9 @@ def cleanup_cleanwait_timeout(task):
def cleaning_error_handler(task, logmsg, errmsg=None, traceback=False,
- tear_down_cleaning=True,
- set_fail_state=True):
- """Put a failed node in CLEANFAIL and maintenance.
+ tear_down_cleaning=True, set_fail_state=True,
+ set_maintenance=None):
+ """Put a failed node in CLEANFAIL and maintenance (if needed).
:param task: a TaskManager instance.
:param logmsg: Message to be logged.
@@ -420,12 +420,19 @@ def cleaning_error_handler(task, logmsg, errmsg=None, traceback=False,
cleaning. Default to True.
:param set_fail_state: Whether to set node to failed state. Default to
True.
+ :param set_maintenance: Whether to set maintenance mode. If None,
+ maintenance mode will be set if and only if a clean step is being
+ executed on a node.
"""
+ if set_maintenance is None:
+ set_maintenance = bool(task.node.clean_step)
+
errmsg = errmsg or logmsg
LOG.error(logmsg, exc_info=traceback)
node = task.node
- node.fault = faults.CLEAN_FAILURE
- node.maintenance = True
+ if set_maintenance:
+ node.fault = faults.CLEAN_FAILURE
+ node.maintenance = True
if tear_down_cleaning:
try:
@@ -457,7 +464,7 @@ def cleaning_error_handler(task, logmsg, errmsg=None, traceback=False,
manual_clean = node.target_provision_state == states.MANAGEABLE
node.last_error = errmsg
# NOTE(dtantsur): avoid overwriting existing maintenance_reason
- if not node.maintenance_reason:
+ if not node.maintenance_reason and set_maintenance:
node.maintenance_reason = errmsg
node.save()
diff --git a/ironic/tests/unit/conductor/test_cleaning.py b/ironic/tests/unit/conductor/test_cleaning.py
index 2fba81687..e2e00cf0b 100644
--- a/ironic/tests/unit/conductor/test_cleaning.py
+++ b/ironic/tests/unit/conductor/test_cleaning.py
@@ -18,6 +18,7 @@ from oslo_config import cfg
from oslo_utils import uuidutils
from ironic.common import exception
+from ironic.common import faults
from ironic.common import states
from ironic.conductor import cleaning
from ironic.conductor import steps as conductor_steps
@@ -63,6 +64,8 @@ class DoNodeCleanTestCase(db_base.DbTestCase):
node.refresh()
self.assertEqual(states.CLEANFAIL, node.provision_state)
self.assertEqual(tgt_prov_state, node.target_provision_state)
+ self.assertFalse(node.maintenance)
+ self.assertIsNone(node.fault)
mock_validate.assert_called_once_with(mock.ANY, mock.ANY)
@mock.patch('ironic.drivers.modules.fake.FakePower.validate',
@@ -331,6 +334,7 @@ class DoNodeCleanTestCase(db_base.DbTestCase):
self.assertIn('is not allowed', node.last_error)
self.assertTrue(node.maintenance)
self.assertEqual('Original reason', node.maintenance_reason)
+ self.assertIsNone(node.fault) # no clean step running
self.assertFalse(mock_prep.called)
self.assertFalse(mock_tear_down.called)
@@ -356,6 +360,8 @@ class DoNodeCleanTestCase(db_base.DbTestCase):
self.assertEqual(tgt_prov_state, node.target_provision_state)
mock_prep.assert_called_once_with(mock.ANY, task)
mock_validate.assert_called_once_with(mock.ANY, task)
+ self.assertFalse(node.maintenance)
+ self.assertIsNone(node.fault)
def test__do_node_clean_automated_prepare_clean_fail(self):
self.__do_node_clean_prepare_clean_fail()
@@ -413,6 +419,8 @@ class DoNodeCleanTestCase(db_base.DbTestCase):
self.assertEqual(states.CLEANFAIL, node.provision_state)
self.assertEqual(tgt_prov_state, node.target_provision_state)
mock_steps.assert_called_once_with(mock.ANY)
+ self.assertFalse(node.maintenance)
+ self.assertIsNone(node.fault)
def test__do_node_clean_automated_steps_fail(self):
for invalid in (True, False):
@@ -460,6 +468,7 @@ class DoNodeCleanTestCase(db_base.DbTestCase):
if clean_steps:
self.assertEqual(clean_steps,
node.driver_internal_info['clean_steps'])
+ self.assertFalse(node.maintenance)
# Check that state didn't change
self.assertEqual(states.CLEANING, node.provision_state)
@@ -751,6 +760,7 @@ class DoNodeCleanTestCase(db_base.DbTestCase):
self.assertNotIn('clean_step_index', node.driver_internal_info)
self.assertIsNotNone(node.last_error)
self.assertTrue(node.maintenance)
+ self.assertEqual(faults.CLEAN_FAILURE, node.fault)
mock_execute.assert_called_once_with(
mock.ANY, mock.ANY, self.clean_steps[0])
mock_collect_logs.assert_called_once_with(mock.ANY, label='cleaning')
@@ -931,7 +941,7 @@ class DoNodeCleanTestCase(db_base.DbTestCase):
self.assertNotIn('clean_step_index', node.driver_internal_info)
self.assertIsNotNone(node.last_error)
self.assertEqual(1, tear_mock.call_count)
- self.assertTrue(node.maintenance)
+ self.assertFalse(node.maintenance) # no step is running
deploy_exec_calls = [
mock.call(mock.ANY, mock.ANY, self.clean_steps[0]),
mock.call(mock.ANY, mock.ANY, self.clean_steps[2]),
@@ -1038,7 +1048,7 @@ class DoNodeCleanTestCase(db_base.DbTestCase):
self.assertEqual({}, node.clean_step)
self.assertNotIn('clean_step_index', node.driver_internal_info)
self.assertIsNotNone(node.last_error)
- self.assertTrue(node.maintenance)
+ self.assertTrue(node.maintenance) # the 1st clean step was running
deploy_exec_mock.assert_called_once_with(mock.ANY, mock.ANY,
self.clean_steps[0])
# Make sure we don't execute any other step and return
diff --git a/ironic/tests/unit/conductor/test_manager.py b/ironic/tests/unit/conductor/test_manager.py
index e954ec178..b3d3f4251 100644
--- a/ironic/tests/unit/conductor/test_manager.py
+++ b/ironic/tests/unit/conductor/test_manager.py
@@ -36,6 +36,7 @@ from ironic.common import boot_devices
from ironic.common import components
from ironic.common import driver_factory
from ironic.common import exception
+from ironic.common import faults
from ironic.common import images
from ironic.common import indicator_states
from ironic.common import nova
@@ -1858,7 +1859,7 @@ class CheckTimeoutsTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
self.assertIsNotNone(node.last_error)
mock_cleanup.assert_called_once_with(mock.ANY, mock.ANY)
- def _check_cleanwait_timeouts(self, manual=False):
+ def _check_cleanwait_timeouts(self, manual=False, with_step=True):
self._start_service()
CONF.set_override('clean_callback_timeout', 1, group='conductor')
tgt_prov_state = states.MANAGEABLE if manual else states.AVAILABLE
@@ -1869,7 +1870,7 @@ class CheckTimeoutsTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
provision_updated_at=datetime.datetime(2000, 1, 1, 0, 0),
clean_step={
'interface': 'deploy',
- 'step': 'erase_devices'},
+ 'step': 'erase_devices'} if with_step else {},
driver_internal_info={
'cleaning_reboot': manual,
'clean_step_index': 0})
@@ -1880,6 +1881,9 @@ class CheckTimeoutsTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
self.assertEqual(states.CLEANFAIL, node.provision_state)
self.assertEqual(tgt_prov_state, node.target_provision_state)
self.assertIsNotNone(node.last_error)
+ self.assertEqual(with_step, node.maintenance)
+ self.assertEqual(faults.CLEAN_FAILURE if with_step else None,
+ node.fault)
# Test that cleaning parameters have been purged in order
# to prevent looping of the cleaning sequence
self.assertEqual({}, node.clean_step)
@@ -1892,6 +1896,9 @@ class CheckTimeoutsTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
def test__check_cleanwait_timeouts_manual_clean(self):
self._check_cleanwait_timeouts(manual=True)
+ def test__check_cleanwait_timeouts_boot_timeout(self):
+ self._check_cleanwait_timeouts(with_step=False)
+
@mock.patch('ironic.drivers.modules.fake.FakeRescue.clean_up',
autospec=True)
@mock.patch.object(conductor_utils, 'node_power_action', autospec=True)
diff --git a/releasenotes/notes/redundant-maintenance-09849674334f656a.yaml b/releasenotes/notes/redundant-maintenance-09849674334f656a.yaml
new file mode 100644
index 000000000..2d8246974
--- /dev/null
+++ b/releasenotes/notes/redundant-maintenance-09849674334f656a.yaml
@@ -0,0 +1,5 @@
+---
+fixes:
+ - |
+ Failed cleaning no longer results in maintenance mode if no clean step is
+ running, e.g. on PXE timeout or failed clean steps validation.