summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorZhenguo Niu <Niu.ZGlinux@gmail.com>2016-08-02 20:24:54 +0800
committerDmitry Tantsur <divius.inside@gmail.com>2018-02-20 09:04:07 +0000
commit8ad8c874d208e2c80be05bc64afe67d9a9c7a9ec (patch)
treed1b9f16eb7a41605cd214019a8eb73c9b16667c8
parent504a67b46d883545c320847aef6da96f6fa2c607 (diff)
downloadironic-8ad8c874d208e2c80be05bc64afe67d9a9c7a9ec.tar.gz
Clean nodes stuck in CLEANING state when ir-cond restarts
When a conductor managing a node dies abruptly mid cleaing, the node will get stuck in the CLEANING state. This also moves _start_service() before creating CLEANING nodes in tests. Finally, it adds autospec to a few places where the tests fail in a mysterious way otherwise. Change-Id: Ia7bce4dff57569707de4fcf3002eac241a5aa85b Co-Authored-By: Dmitry Tantsur <dtantsur@redhat.com> Partial-Bug: #1651092 (cherry picked from commit 2921fe685d8f096717f8795494c1032025407fe4)
-rw-r--r--ironic/conductor/base_manager.py37
-rw-r--r--ironic/tests/unit/conductor/test_base_manager.py6
-rw-r--r--ironic/tests/unit/conductor/test_manager.py57
-rw-r--r--releasenotes/notes/clean-nodes-stuck-in-cleaning-on-startup-443823ea4f937965.yaml5
4 files changed, 64 insertions, 41 deletions
diff --git a/ironic/conductor/base_manager.py b/ironic/conductor/base_manager.py
index 34319babf..a75a5161f 100644
--- a/ironic/conductor/base_manager.py
+++ b/ironic/conductor/base_manager.py
@@ -223,21 +223,14 @@ class BaseConductorManager(object):
self._periodic_tasks_worker.add_done_callback(
self._on_periodic_tasks_stop)
- # NOTE(lucasagomes): If the conductor server dies abruptly
- # mid deployment (OMM Killer, power outage, etc...) we
- # can not resume the deployment even if the conductor
- # comes back online. Cleaning the reservation of the nodes
- # (dbapi.clear_node_reservations_for_conductor) is not enough to
- # unstick it, so let's gracefully fail the deployment so the node
- # can go through the steps (deleting & cleaning) to make itself
- # available again.
- filters = {'reserved': False,
- 'provision_state': states.DEPLOYING}
- last_error = (_("The deployment can't be resumed by conductor "
- "%s. Moving to fail state.") % self.host)
- self._fail_if_in_state(ironic_context.get_admin_context(), filters,
- states.DEPLOYING, 'provision_updated_at',
- last_error=last_error)
+ self._fail_transient_state(
+ states.DEPLOYING,
+ _("The deployment can't be resumed by conductor "
+ "%s. Moving to fail state.") % self.host)
+ self._fail_transient_state(
+ states.CLEANING,
+ _("The cleaning can't be resumed by conductor "
+ "%s. Moving to fail state.") % self.host)
# Start consoles if it set enabled in a greenthread.
try:
@@ -259,6 +252,20 @@ class BaseConductorManager(object):
self._started = True
+ def _fail_transient_state(self, state, last_error):
+ """Apply "fail" transition to nodes in a transient state.
+
+ If the conductor server dies abruptly mid deployment or cleaning
+ (OMM Killer, power outage, etc...) we can not resume the process even
+ if the conductor comes back online. Cleaning the reservation of
+ the nodes (dbapi.clear_node_reservations_for_conductor) is not enough
+ to unstick it, so let's gracefully fail the process.
+ """
+ filters = {'reserved': False, 'provision_state': state}
+ self._fail_if_in_state(ironic_context.get_admin_context(), filters,
+ state, 'provision_updated_at',
+ last_error=last_error)
+
def del_host(self, deregister=True):
# Conductor deregistration fails if called on non-initialized
# conductor (e.g. when rpc server is unreachable).
diff --git a/ironic/tests/unit/conductor/test_base_manager.py b/ironic/tests/unit/conductor/test_base_manager.py
index ea2ff86aa..ff62fdb89 100644
--- a/ironic/tests/unit/conductor/test_base_manager.py
+++ b/ironic/tests/unit/conductor/test_base_manager.py
@@ -184,7 +184,8 @@ class StartStopTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
ht_mock.assert_called_once_with()
@mock.patch.object(base_manager, 'LOG')
- @mock.patch.object(base_manager.BaseConductorManager, 'del_host')
+ @mock.patch.object(base_manager.BaseConductorManager, 'del_host',
+ autospec=True)
@mock.patch.object(driver_factory, 'DriverFactory')
def test_starts_with_only_dynamic_drivers(self, df_mock, del_mock,
log_mock):
@@ -197,7 +198,8 @@ class StartStopTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
self.assertFalse(del_mock.called)
@mock.patch.object(base_manager, 'LOG')
- @mock.patch.object(base_manager.BaseConductorManager, 'del_host')
+ @mock.patch.object(base_manager.BaseConductorManager, 'del_host',
+ autospec=True)
@mock.patch.object(driver_factory, 'HardwareTypesFactory')
def test_starts_with_only_classic_drivers(self, ht_mock, del_mock,
log_mock):
diff --git a/ironic/tests/unit/conductor/test_manager.py b/ironic/tests/unit/conductor/test_manager.py
index 24f467f07..b743d42c6 100644
--- a/ironic/tests/unit/conductor/test_manager.py
+++ b/ironic/tests/unit/conductor/test_manager.py
@@ -2208,13 +2208,13 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
@mock.patch('ironic.drivers.modules.fake.FakePower.validate')
def test__do_node_clean_automated_disabled(self, mock_validate):
self.config(automated_clean=False, group='conductor')
+
+ self._start_service()
node = obj_utils.create_test_node(
self.context, driver='fake',
provision_state=states.CLEANING,
target_provision_state=states.AVAILABLE,
last_error=None)
-
- self._start_service()
with task_manager.acquire(
self.context, node.uuid, shared=False) as task:
self.service._do_node_clean(task)
@@ -2326,6 +2326,8 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
else:
tgt_prov_state = states.AVAILABLE
driver_info = {'clean_steps': self.clean_steps}
+
+ self._start_service()
node = obj_utils.create_test_node(
self.context, driver='fake',
provision_state=states.CLEANING,
@@ -2334,7 +2336,6 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
power_state=states.POWER_OFF,
driver_internal_info=driver_info)
- self._start_service()
with task_manager.acquire(
self.context, node.uuid, shared=False) as task:
self.service._do_node_clean(task, clean_steps=clean_steps)
@@ -2372,6 +2373,8 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
tgt_prov_state = states.AVAILABLE
driver_internal_info['clean_steps'] = self.clean_steps
+ self._start_service()
+
node = obj_utils.create_test_node(
self.context, driver='fake',
provision_state=states.CLEANING,
@@ -2382,8 +2385,6 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
mock_execute.return_value = return_state
expected_first_step = node.driver_internal_info['clean_steps'][0]
- self._start_service()
-
with task_manager.acquire(
self.context, node.uuid, shared=False) as task:
self.service._do_next_clean_step(task, 0)
@@ -2410,6 +2411,8 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
manual=False):
# Resume an in-progress cleaning after the first async step
tgt_prov_state = states.MANAGEABLE if manual else states.AVAILABLE
+
+ self._start_service()
node = obj_utils.create_test_node(
self.context, driver='fake',
provision_state=states.CLEANING,
@@ -2420,8 +2423,6 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
clean_step=self.clean_steps[0])
mock_execute.return_value = return_state
- self._start_service()
-
with task_manager.acquire(
self.context, node.uuid, shared=False) as task:
self.service._do_next_clean_step(task, self.next_clean_step_index)
@@ -2448,6 +2449,8 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
tgt_prov_state = states.MANAGEABLE if manual else states.AVAILABLE
info = {'clean_steps': self.clean_steps,
'clean_step_index': len(self.clean_steps) - 1}
+
+ self._start_service()
node = obj_utils.create_test_node(
self.context, driver='fake',
provision_state=states.CLEANING,
@@ -2456,8 +2459,6 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
driver_internal_info=info,
clean_step=self.clean_steps[-1])
- self._start_service()
-
with task_manager.acquire(
self.context, node.uuid, shared=False) as task:
self.service._do_next_clean_step(task, None)
@@ -2485,6 +2486,8 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
mock_power_execute, manual=False):
# Run all steps from start to finish (all synchronous)
tgt_prov_state = states.MANAGEABLE if manual else states.AVAILABLE
+
+ self._start_service()
node = obj_utils.create_test_node(
self.context, driver='fake',
provision_state=states.CLEANING,
@@ -2496,8 +2499,6 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
mock_deploy_execute.return_value = None
mock_power_execute.return_value = None
- self._start_service()
-
with task_manager.acquire(
self.context, node.uuid, shared=False) as task:
self.service._do_next_clean_step(task, 0)
@@ -2530,6 +2531,8 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
manual=False):
# When a clean step fails, go to CLEANFAIL
tgt_prov_state = states.MANAGEABLE if manual else states.AVAILABLE
+
+ self._start_service()
node = obj_utils.create_test_node(
self.context, driver='fake',
provision_state=states.CLEANING,
@@ -2540,8 +2543,6 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
clean_step={})
mock_execute.side_effect = Exception()
- self._start_service()
-
with task_manager.acquire(
self.context, node.uuid, shared=False) as task:
self.service._do_next_clean_step(task, 0)
@@ -2575,6 +2576,8 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
self, tear_mock, power_exec_mock, deploy_exec_mock, log_mock,
manual=True):
tgt_prov_state = states.MANAGEABLE if manual else states.AVAILABLE
+
+ self._start_service()
node = obj_utils.create_test_node(
self.context, driver='fake',
provision_state=states.CLEANING,
@@ -2588,8 +2591,6 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
power_exec_mock.return_value = None
tear_mock.side_effect = Exception('boom')
- self._start_service()
-
with task_manager.acquire(
self.context, node.uuid, shared=False) as task:
self.service._do_next_clean_step(task, 0)
@@ -2632,6 +2633,8 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
{'clean_steps': None}):
# Resume where there are no steps, should be a noop
tgt_prov_state = states.MANAGEABLE if manual else states.AVAILABLE
+
+ self._start_service()
node = obj_utils.create_test_node(
self.context, driver='fake',
uuid=uuidutils.generate_uuid(),
@@ -2641,8 +2644,6 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
driver_internal_info=info,
clean_step={})
- self._start_service()
-
with task_manager.acquire(
self.context, node.uuid, shared=False) as task:
self.service._do_next_clean_step(task, None)
@@ -2670,6 +2671,8 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
self, deploy_exec_mock, power_exec_mock, manual=False):
# When a clean step fails, go to CLEANFAIL
tgt_prov_state = states.MANAGEABLE if manual else states.AVAILABLE
+
+ self._start_service()
node = obj_utils.create_test_node(
self.context, driver='fake',
provision_state=states.CLEANING,
@@ -2680,8 +2683,6 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
clean_step={})
deploy_exec_mock.return_value = "foo"
- self._start_service()
-
with task_manager.acquire(
self.context, node.uuid, shared=False) as task:
self.service._do_next_clean_step(task, 0)
@@ -2930,11 +2931,19 @@ class MiscTestCase(mgr_utils.ServiceSetUpMixin, mgr_utils.CommonMixIn,
self.assertEqual([(nodes[0].uuid, 'fake', 0)], result)
mock_nodeinfo_list.assert_called_once_with(
columns=self.columns, filters=mock.sentinel.filters)
- mock_fail_if_state.assert_called_once_with(
- mock.ANY, mock.ANY,
- {'provision_state': 'deploying', 'reserved': False},
- 'deploying', 'provision_updated_at',
- last_error=mock.ANY)
+ expected_calls = [mock.call(mock.ANY, mock.ANY,
+ {'provision_state': 'deploying',
+ 'reserved': False},
+ 'deploying',
+ 'provision_updated_at',
+ last_error=mock.ANY),
+ mock.call(mock.ANY, mock.ANY,
+ {'provision_state': 'cleaning',
+ 'reserved': False},
+ 'cleaning',
+ 'provision_updated_at',
+ last_error=mock.ANY)]
+ mock_fail_if_state.assert_has_calls(expected_calls)
@mock.patch.object(dbapi.IMPL, 'get_nodeinfo_list')
def test_iter_nodes_shutdown(self, mock_nodeinfo_list):
diff --git a/releasenotes/notes/clean-nodes-stuck-in-cleaning-on-startup-443823ea4f937965.yaml b/releasenotes/notes/clean-nodes-stuck-in-cleaning-on-startup-443823ea4f937965.yaml
new file mode 100644
index 000000000..09d2a016c
--- /dev/null
+++ b/releasenotes/notes/clean-nodes-stuck-in-cleaning-on-startup-443823ea4f937965.yaml
@@ -0,0 +1,5 @@
+---
+fixes:
+ - When a conductor managing a node dies mid-cleaning the node would get stuck
+ in the CLEANING state. Now upon conductor startup nodes in the CLEANING state
+ will be moved to the CLEANFAIL state.