summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorZuul <zuul@review.openstack.org>2018-08-07 15:36:43 +0000
committerGerrit Code Review <review@openstack.org>2018-08-07 15:36:43 +0000
commit1364df4361e0746aef89ea1224f45d61f8cb9990 (patch)
treeb1d457c075fb59828c5732a73ce70ee9b410cfe3
parenta62a029f670b783ac82ec5b7e2b340666644514b (diff)
parentd70db292aeadc99434c002a042da0993150c32fc (diff)
downloadironic-1364df4361e0746aef89ea1224f45d61f8cb9990.tar.gz
Merge "Node gets stuck in ING state when conductor goes down"
-rw-r--r--ironic/common/states.py9
-rw-r--r--ironic/conductor/base_manager.py14
-rw-r--r--ironic/tests/unit/conductor/test_base_manager.py23
-rw-r--r--ironic/tests/unit/conductor/test_manager.py22
-rw-r--r--releasenotes/notes/node-stuck-when-conductor-down-3aa41a3abed9daf5.yaml7
5 files changed, 56 insertions, 19 deletions
diff --git a/ironic/common/states.py b/ironic/common/states.py
index cef2459f5..6a4d5013d 100644
--- a/ironic/common/states.py
+++ b/ironic/common/states.py
@@ -234,6 +234,15 @@ UNSTABLE_STATES = (DEPLOYING, DEPLOYWAIT, CLEANING, CLEANWAIT, VERIFYING,
RESCUEWAIT, UNRESCUING)
"""States that can be changed without external request."""
+STUCK_STATES_TREATED_AS_FAIL = (DEPLOYING, CLEANING, VERIFYING, INSPECTING,
+ ADOPTING, RESCUING, UNRESCUING)
+"""States that cannot be resumed once a conductor dies.
+
+If a node gets stuck with one of these states for some reason
+(eg. conductor goes down when executing task), node will be moved
+to fail state.
+"""
+
##############
# Power states
##############
diff --git a/ironic/conductor/base_manager.py b/ironic/conductor/base_manager.py
index 1be8274b3..5156cbc94 100644
--- a/ironic/conductor/base_manager.py
+++ b/ironic/conductor/base_manager.py
@@ -179,14 +179,12 @@ class BaseConductorManager(object):
self._periodic_tasks_worker.add_done_callback(
self._on_periodic_tasks_stop)
- self._fail_transient_state(
- states.DEPLOYING,
- _("The deployment can't be resumed by conductor "
- "%s. Moving to fail state.") % self.host)
- self._fail_transient_state(
- states.CLEANING,
- _("The cleaning can't be resumed by conductor "
- "%s. Moving to fail state.") % self.host)
+ for state in states.STUCK_STATES_TREATED_AS_FAIL:
+ self._fail_transient_state(
+ state,
+ _("The %(state)s state can't be resumed by conductor "
+ "%(host)s. Moving to fail state.") %
+ {'state': state, 'host': self.host})
# Start consoles if it set enabled in a greenthread.
try:
diff --git a/ironic/tests/unit/conductor/test_base_manager.py b/ironic/tests/unit/conductor/test_base_manager.py
index 5de426afc..0c619d912 100644
--- a/ironic/tests/unit/conductor/test_base_manager.py
+++ b/ironic/tests/unit/conductor/test_base_manager.py
@@ -13,6 +13,7 @@
"""Test class for Ironic BaseConductorManager."""
import collections
+import uuid
import eventlet
import futurist
@@ -24,6 +25,7 @@ from oslo_utils import uuidutils
from ironic.common import driver_factory
from ironic.common import exception
+from ironic.common import states
from ironic.conductor import base_manager
from ironic.conductor import manager
from ironic.conductor import notification_utils
@@ -206,6 +208,27 @@ class StartStopTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
self.assertRaisesRegex(RuntimeError, 'already running',
self.service.init_host)
+ def test_start_recover_nodes_stuck(self):
+ state_trans = [
+ (states.DEPLOYING, states.DEPLOYFAIL),
+ (states.CLEANING, states.CLEANFAIL),
+ (states.VERIFYING, states.ENROLL),
+ (states.INSPECTING, states.INSPECTFAIL),
+ (states.ADOPTING, states.ADOPTFAIL),
+ (states.RESCUING, states.RESCUEFAIL),
+ (states.UNRESCUING, states.UNRESCUEFAIL),
+ ]
+ nodes = [obj_utils.create_test_node(self.context, uuid=uuid.uuid4(),
+ driver='fake-hardware',
+ provision_state=state[0])
+ for state in state_trans]
+
+ self._start_service()
+ for node, state in zip(nodes, state_trans):
+ node.refresh()
+ self.assertEqual(state[1], node.provision_state,
+ 'Test failed when recovering from %s' % state[0])
+
@mock.patch.object(base_manager, 'LOG')
def test_warning_on_low_workers_pool(self, log_mock):
CONF.set_override('workers_pool_size', 3, 'conductor')
diff --git a/ironic/tests/unit/conductor/test_manager.py b/ironic/tests/unit/conductor/test_manager.py
index 80bd7a466..03c9982bd 100644
--- a/ironic/tests/unit/conductor/test_manager.py
+++ b/ironic/tests/unit/conductor/test_manager.py
@@ -4035,11 +4035,11 @@ class DoNodeRescueTestCase(mgr_utils.CommonMixIn, mgr_utils.ServiceSetUpMixin,
@mock.patch('ironic.drivers.modules.fake.FakeRescue.rescue')
def test__do_node_rescue_returns_rescuewait(self, mock_rescue):
+ self._start_service()
node = obj_utils.create_test_node(self.context, driver='fake-hardware',
provision_state=states.RESCUING,
instance_info={'rescue_password':
'password'})
- self._start_service()
with task_manager.TaskManager(self.context, node.uuid) as task:
mock_rescue.return_value = states.RESCUEWAIT
self.service._do_node_rescue(task)
@@ -4050,11 +4050,11 @@ class DoNodeRescueTestCase(mgr_utils.CommonMixIn, mgr_utils.ServiceSetUpMixin,
@mock.patch('ironic.drivers.modules.fake.FakeRescue.rescue')
def test__do_node_rescue_returns_rescue(self, mock_rescue):
+ self._start_service()
node = obj_utils.create_test_node(self.context, driver='fake-hardware',
provision_state=states.RESCUING,
instance_info={'rescue_password':
'password'})
- self._start_service()
with task_manager.TaskManager(self.context, node.uuid) as task:
mock_rescue.return_value = states.RESCUE
self.service._do_node_rescue(task)
@@ -4066,11 +4066,11 @@ class DoNodeRescueTestCase(mgr_utils.CommonMixIn, mgr_utils.ServiceSetUpMixin,
@mock.patch.object(manager, 'LOG')
@mock.patch('ironic.drivers.modules.fake.FakeRescue.rescue')
def test__do_node_rescue_errors(self, mock_rescue, mock_log):
+ self._start_service()
node = obj_utils.create_test_node(self.context, driver='fake-hardware',
provision_state=states.RESCUING,
instance_info={'rescue_password':
'password'})
- self._start_service()
mock_rescue.side_effect = exception.InstanceRescueFailure(
'failed to rescue')
with task_manager.TaskManager(self.context, node.uuid) as task:
@@ -4086,11 +4086,11 @@ class DoNodeRescueTestCase(mgr_utils.CommonMixIn, mgr_utils.ServiceSetUpMixin,
@mock.patch.object(manager, 'LOG')
@mock.patch('ironic.drivers.modules.fake.FakeRescue.rescue')
def test__do_node_rescue_bad_state(self, mock_rescue, mock_log):
+ self._start_service()
node = obj_utils.create_test_node(self.context, driver='fake-hardware',
provision_state=states.RESCUING,
instance_info={'rescue_password':
'password'})
- self._start_service()
mock_rescue.return_value = states.ACTIVE
with task_manager.TaskManager(self.context, node.uuid) as task:
self.service._do_node_rescue(task)
@@ -4156,11 +4156,11 @@ class DoNodeRescueTestCase(mgr_utils.CommonMixIn, mgr_utils.ServiceSetUpMixin,
@mock.patch('ironic.drivers.modules.fake.FakeRescue.unrescue')
def test__do_node_unrescue(self, mock_unrescue):
+ self._start_service()
node = obj_utils.create_test_node(self.context, driver='fake-hardware',
provision_state=states.UNRESCUING,
target_provision_state=states.ACTIVE,
instance_info={})
- self._start_service()
with task_manager.TaskManager(self.context, node.uuid) as task:
mock_unrescue.return_value = states.ACTIVE
self.service._do_node_unrescue(task)
@@ -4171,11 +4171,11 @@ class DoNodeRescueTestCase(mgr_utils.CommonMixIn, mgr_utils.ServiceSetUpMixin,
@mock.patch.object(manager, 'LOG')
@mock.patch('ironic.drivers.modules.fake.FakeRescue.unrescue')
def test__do_node_unrescue_ironic_error(self, mock_unrescue, mock_log):
+ self._start_service()
node = obj_utils.create_test_node(self.context, driver='fake-hardware',
provision_state=states.UNRESCUING,
target_provision_state=states.ACTIVE,
instance_info={})
- self._start_service()
mock_unrescue.side_effect = exception.InstanceUnrescueFailure(
'Unable to unrescue')
with task_manager.TaskManager(self.context, node.uuid) as task:
@@ -4190,11 +4190,11 @@ class DoNodeRescueTestCase(mgr_utils.CommonMixIn, mgr_utils.ServiceSetUpMixin,
@mock.patch.object(manager, 'LOG')
@mock.patch('ironic.drivers.modules.fake.FakeRescue.unrescue')
def test__do_node_unrescue_other_error(self, mock_unrescue, mock_log):
+ self._start_service()
node = obj_utils.create_test_node(self.context, driver='fake-hardware',
provision_state=states.UNRESCUING,
target_provision_state=states.ACTIVE,
instance_info={})
- self._start_service()
mock_unrescue.side_effect = RuntimeError('Some failure')
with task_manager.TaskManager(self.context, node.uuid) as task:
self.assertRaises(RuntimeError,
@@ -4207,10 +4207,10 @@ class DoNodeRescueTestCase(mgr_utils.CommonMixIn, mgr_utils.ServiceSetUpMixin,
@mock.patch('ironic.drivers.modules.fake.FakeRescue.unrescue')
def test__do_node_unrescue_bad_state(self, mock_unrescue):
+ self._start_service()
node = obj_utils.create_test_node(self.context, driver='fake-hardware',
provision_state=states.UNRESCUING,
instance_info={})
- self._start_service()
mock_unrescue.return_value = states.RESCUEWAIT
with task_manager.TaskManager(self.context, node.uuid) as task:
self.service._do_node_unrescue(task)
@@ -4272,6 +4272,7 @@ class DoNodeVerifyTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
@mock.patch('ironic.drivers.modules.fake.FakePower.validate')
def test__do_node_verify(self, mock_validate, mock_get_power_state,
mock_notif):
+ self._start_service()
mock_get_power_state.return_value = states.POWER_OFF
# Required for exception handling
mock_notif.__name__ = 'NodeCorrectedPowerStateNotification'
@@ -4282,7 +4283,6 @@ class DoNodeVerifyTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
last_error=None,
power_state=states.NOSTATE)
- self._start_service()
with task_manager.acquire(
self.context, node['id'], shared=False) as task:
self.service._do_node_verify(task)
@@ -4311,6 +4311,7 @@ class DoNodeVerifyTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
@mock.patch('ironic.drivers.modules.fake.FakePower.validate')
def test__do_node_verify_validation_fails(self, mock_validate,
mock_get_power_state):
+ self._start_service()
node = obj_utils.create_test_node(
self.context, driver='fake-hardware',
provision_state=states.VERIFYING,
@@ -4320,7 +4321,6 @@ class DoNodeVerifyTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
mock_validate.side_effect = RuntimeError("boom")
- self._start_service()
with task_manager.acquire(
self.context, node['id'], shared=False) as task:
self.service._do_node_verify(task)
@@ -4339,6 +4339,7 @@ class DoNodeVerifyTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
@mock.patch('ironic.drivers.modules.fake.FakePower.validate')
def test__do_node_verify_get_state_fails(self, mock_validate,
mock_get_power_state):
+ self._start_service()
node = obj_utils.create_test_node(
self.context, driver='fake-hardware',
provision_state=states.VERIFYING,
@@ -4348,7 +4349,6 @@ class DoNodeVerifyTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
mock_get_power_state.side_effect = RuntimeError("boom")
- self._start_service()
with task_manager.acquire(
self.context, node['id'], shared=False) as task:
self.service._do_node_verify(task)
diff --git a/releasenotes/notes/node-stuck-when-conductor-down-3aa41a3abed9daf5.yaml b/releasenotes/notes/node-stuck-when-conductor-down-3aa41a3abed9daf5.yaml
new file mode 100644
index 000000000..6a8eb5d39
--- /dev/null
+++ b/releasenotes/notes/node-stuck-when-conductor-down-3aa41a3abed9daf5.yaml
@@ -0,0 +1,7 @@
+---
+fixes:
+ - If a node gets stuck in one of the states ``deploying``, ``cleaning``,
+ ``verifying``, ``inspecting``, ``adopting``, ``rescuing``, ``unrescuing``
+ for some reason (eg. conductor goes down when executing a task), it will
+ be moved to an appropriate failure state in the next time the conductor
+ starts.