author      Zuul <zuul@review.openstack.org>            2018-03-30 21:12:57 +0000
committer   Gerrit Code Review <review@openstack.org>   2018-03-30 21:12:57 +0000
commit      039e20d8751a81061dc64d4038d18face9fad5e4 (patch)
tree        19d5a8fdd0096c2bc15aea8b670f51b878308657
parent      0c297ccbb73d1424af3ae7d2a17c4805bd23ae4b (diff)
parent      a90b999a2d8afec0476dbd82d4eb2578ece313d0 (diff)
download    ironic-10.1.2.tar.gz
Merge "Rework logic handling reserved orphaned nodes in the conductor" into stable/queens10.1.2
-rw-r--r--  ironic/conductor/base_manager.py                         9
-rw-r--r--  ironic/conductor/manager.py                             71
-rw-r--r--  ironic/conductor/utils.py                               21
-rw-r--r--  ironic/tests/unit/conductor/test_manager.py            100
-rw-r--r--  ironic/tests/unit/conductor/test_utils.py               19
-rw-r--r--  releasenotes/notes/orphan-nodes-389cb6d90c2917ec.yaml    9
6 files changed, 191 insertions, 38 deletions
diff --git a/ironic/conductor/base_manager.py b/ironic/conductor/base_manager.py
index a75a5161f..2aaaab9a5 100644
--- a/ironic/conductor/base_manager.py
+++ b/ironic/conductor/base_manager.py
@@ -22,6 +22,7 @@ from futurist import rejection
from oslo_db import exception as db_exception
from oslo_log import log
from oslo_utils import excutils
+import six
from ironic.common import context as ironic_context
from ironic.common import driver_factory
@@ -440,7 +441,8 @@ class BaseConductorManager(object):
this would process nodes whose provision_updated_at
field value was 60 or more seconds before 'now'.
:param: provision_state: provision_state that the node is in,
- for the provisioning activity to have failed.
+ for the provisioning activity to have failed,
+ either one string or a set.
:param: sort_key: the nodes are sorted based on this key.
:param: callback_method: the callback method to be invoked in a
spawned thread, for a failed node. This
@@ -457,6 +459,9 @@ class BaseConductorManager(object):
fsm.
"""
+ if isinstance(provision_state, six.string_types):
+ provision_state = {provision_state}
+
node_iter = self.iter_nodes(filters=filters,
sort_key=sort_key,
sort_dir='asc')
@@ -467,7 +472,7 @@ class BaseConductorManager(object):
with task_manager.acquire(context, node_uuid,
purpose='node state check') as task:
if (task.node.maintenance or
- task.node.provision_state != provision_state):
+ task.node.provision_state not in provision_state):
continue
target_state = (None if not keep_target_state else
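[Editor's note, not part of the patch] A minimal sketch of the provision_state normalization the base_manager.py hunks above introduce: _fail_if_in_state now accepts either a single state string or a set of states, and a lone string is wrapped into a set before the membership check. The helper name below is ours, purely for illustration; the patch inlines this logic.

    import six

    from ironic.common import states


    def _normalize_provision_state(provision_state):
        # A single state string is wrapped into a one-element set, so the
        # periodic check can use a uniform "not in" membership test.
        if isinstance(provision_state, six.string_types):
            provision_state = {provision_state}
        return provision_state


    # Both the old single-state callers and the new multi-state caller work:
    assert states.DEPLOYING in _normalize_provision_state(states.DEPLOYING)
    assert states.CLEANING in _normalize_provision_state(
        {states.DEPLOYING, states.CLEANING})
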
diff --git a/ironic/conductor/manager.py b/ironic/conductor/manager.py
index 69f70450c..efe3536d1 100644
--- a/ironic/conductor/manager.py
+++ b/ironic/conductor/manager.py
@@ -1561,15 +1561,24 @@ class ConductorManager(base_manager.BaseConductorManager):
self._fail_if_in_state(context, filters, states.DEPLOYWAIT,
sort_key, callback_method, err_handler)
- @METRICS.timer('ConductorManager._check_deploying_status')
+ @METRICS.timer('ConductorManager._check_orphan_nodes')
@periodics.periodic(spacing=CONF.conductor.check_provision_state_interval)
- def _check_deploying_status(self, context):
- """Periodically checks the status of nodes in DEPLOYING state.
+ def _check_orphan_nodes(self, context):
+ """Periodically checks the status of nodes that were taken over.
- Periodically checks the nodes in DEPLOYING and the state of the
- conductor deploying them. If we find out that a conductor that
- was provisioning the node has died we then break release the
- node and gracefully mark the deployment as failed.
+ Periodically checks the nodes that are managed by this conductor but
+ have a reservation from a conductor that went offline.
+
+ 1. Nodes in DEPLOYING state move to DEPLOY FAIL.
+
+ 2. Nodes in CLEANING state move to CLEAN FAIL with maintenance set.
+
+ 3. Nodes in a transient power state get the power operation aborted.
+
+ 4. Reservation is removed.
+
+ The latter operation happens even for nodes in maintenance mode,
+ otherwise it's not possible to move them out of maintenance.
:param context: request context.
"""
@@ -1578,12 +1587,14 @@ class ConductorManager(base_manager.BaseConductorManager):
return
node_iter = self.iter_nodes(
- fields=['id', 'reservation'],
- filters={'provision_state': states.DEPLOYING,
- 'maintenance': False,
- 'reserved_by_any_of': offline_conductors})
+ fields=['id', 'reservation', 'maintenance', 'provision_state',
+ 'target_power_state'],
+ filters={'reserved_by_any_of': offline_conductors})
+
+ state_cleanup_required = []
- for node_uuid, driver, node_id, conductor_hostname in node_iter:
+ for (node_uuid, driver, node_id, conductor_hostname,
+ maintenance, provision_state, target_power_state) in node_iter:
# NOTE(lucasagomes): Although very rare, this may lead to a
# race condition. By the time we release the lock the conductor
# that was previously managing the node could be back online.
@@ -1604,11 +1615,43 @@ class ConductorManager(base_manager.BaseConductorManager):
LOG.warning("During checking for deploying state, when "
"releasing the lock of the node %s, it was "
"already unlocked.", node_uuid)
+ else:
+ LOG.warning('Forcibly removed reservation of conductor %(old)s'
+ ' on node %(node)s as that conductor went offline',
+ {'old': conductor_hostname, 'node': node_uuid})
+
+ # TODO(dtantsur): clean up all states that are not stable and
+ # are not one of WAIT states.
+ if not maintenance and (provision_state in (states.DEPLOYING,
+ states.CLEANING) or
+ target_power_state is not None):
+ LOG.debug('Node %(node)s taken over from conductor %(old)s '
+ 'requires state clean up: provision state is '
+ '%(state)s, target power state is %(pstate)s',
+ {'node': node_uuid, 'old': conductor_hostname,
+ 'state': provision_state,
+ 'pstate': target_power_state})
+ state_cleanup_required.append(node_uuid)
+
+ for node_uuid in state_cleanup_required:
+ with task_manager.acquire(context, node_uuid,
+ purpose='power state clean up') as task:
+ if not task.node.maintenance and task.node.target_power_state:
+ old_state = task.node.target_power_state
+ task.node.target_power_state = None
+ task.node.last_error = _('Pending power operation was '
+ 'aborted due to conductor take '
+ 'over')
+ task.node.save()
+ LOG.warning('Aborted pending power operation %(op)s '
+ 'on node %(node)s due to conductor take over',
+ {'op': old_state, 'node': node_uuid})
self._fail_if_in_state(
- context, {'id': node_id}, states.DEPLOYING,
+ context, {'uuid': node_uuid},
+ {states.DEPLOYING, states.CLEANING},
'provision_updated_at',
- callback_method=utils.cleanup_after_timeout,
+ callback_method=utils.abort_on_conductor_take_over,
err_handler=utils.provisioning_error_handler)
@METRICS.timer('ConductorManager._do_adoption')
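[Editor's note, not part of the patch] A condensed sketch of the decision made by the new _check_orphan_nodes pass above, under simplified stand-in arguments rather than the manager's real node objects: every orphaned node loses its stale reservation, but state clean-up is scheduled only for non-maintenance nodes that are mid-deploy, mid-clean, or have a pending power operation.

    from ironic.common import states


    def _needs_state_cleanup(maintenance, provision_state, target_power_state):
        # Maintenance nodes only get their stale reservation removed.
        if maintenance:
            return False
        # Nodes stuck in DEPLOYING/CLEANING, or with a pending power change,
        # need their state cleaned up after the take over.
        return (provision_state in (states.DEPLOYING, states.CLEANING)
                or target_power_state is not None)


    assert _needs_state_cleanup(False, states.DEPLOYING, states.POWER_ON)
    assert _needs_state_cleanup(False, states.ACTIVE, states.POWER_OFF)
    assert not _needs_state_cleanup(True, states.CLEANING, None)
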
diff --git a/ironic/conductor/utils.py b/ironic/conductor/utils.py
index 94075a4e6..29772e0dc 100644
--- a/ironic/conductor/utils.py
+++ b/ironic/conductor/utils.py
@@ -380,6 +380,27 @@ def cleaning_error_handler(task, msg, tear_down_cleaning=True,
task.process_event('fail', target_state=target_state)
+@task_manager.require_exclusive_lock
+def abort_on_conductor_take_over(task):
+ """Set node's state when a task was aborted due to conductor take over.
+
+ :param task: a TaskManager instance.
+ """
+ msg = _('Operation was aborted due to conductor take over')
+ # By this time the "fail" event was processed, so we cannot end up in
+ # CLEANING or CLEAN WAIT, only in CLEAN FAIL.
+ if task.node.provision_state == states.CLEANFAIL:
+ cleaning_error_handler(task, msg, set_fail_state=False)
+ else:
+ # For aborted deployment (and potentially other operations), just set
+ # the last_error accordingly.
+ task.node.last_error = msg
+ task.node.save()
+
+ LOG.warning('Aborted the current operation on node %s due to '
+ 'conductor take over', task.node.uuid)
+
+
def rescuing_error_handler(task, msg, set_fail_state=True):
"""Cleanup rescue task after timeout or failure.
diff --git a/ironic/tests/unit/conductor/test_manager.py b/ironic/tests/unit/conductor/test_manager.py
index 1fe20b9d0..8e2229477 100644
--- a/ironic/tests/unit/conductor/test_manager.py
+++ b/ironic/tests/unit/conductor/test_manager.py
@@ -6252,16 +6252,17 @@ class DestroyPortgroupTestCase(mgr_utils.ServiceSetUpMixin,
@mock.patch.object(manager.ConductorManager, '_fail_if_in_state')
@mock.patch.object(manager.ConductorManager, '_mapped_to_this_conductor')
@mock.patch.object(dbapi.IMPL, 'get_offline_conductors')
-class ManagerCheckDeployingStatusTestCase(mgr_utils.ServiceSetUpMixin,
- db_base.DbTestCase):
+class ManagerCheckOrphanNodesTestCase(mgr_utils.ServiceSetUpMixin,
+ db_base.DbTestCase):
def setUp(self):
- super(ManagerCheckDeployingStatusTestCase, self).setUp()
+ super(ManagerCheckOrphanNodesTestCase, self).setUp()
self._start_service()
self.node = obj_utils.create_test_node(
self.context, id=1, uuid=uuidutils.generate_uuid(),
driver='fake', provision_state=states.DEPLOYING,
- target_provision_state=states.DEPLOYDONE,
+ target_provision_state=states.ACTIVE,
+ target_power_state=states.POWER_ON,
reservation='fake-conductor')
# create a second node in a different state to test the
@@ -6271,28 +6272,53 @@ class ManagerCheckDeployingStatusTestCase(mgr_utils.ServiceSetUpMixin,
driver='fake', provision_state=states.AVAILABLE,
target_provision_state=states.NOSTATE)
- def test__check_deploying_status(self, mock_off_cond, mock_mapped,
- mock_fail_if):
+ def test__check_orphan_nodes(self, mock_off_cond, mock_mapped,
+ mock_fail_if):
mock_off_cond.return_value = ['fake-conductor']
- self.service._check_deploying_status(self.context)
+ self.service._check_orphan_nodes(self.context)
self.node.refresh()
mock_off_cond.assert_called_once_with()
mock_mapped.assert_called_once_with(self.node.uuid, 'fake')
mock_fail_if.assert_called_once_with(
- mock.ANY, {'id': self.node.id}, states.DEPLOYING,
+ mock.ANY, {'uuid': self.node.uuid},
+ {states.DEPLOYING, states.CLEANING},
'provision_updated_at',
- callback_method=conductor_utils.cleanup_after_timeout,
+ callback_method=conductor_utils.abort_on_conductor_take_over,
err_handler=conductor_utils.provisioning_error_handler)
# assert node was released
self.assertIsNone(self.node.reservation)
+ self.assertIsNone(self.node.target_power_state)
+ self.assertIsNotNone(self.node.last_error)
- def test__check_deploying_status_alive(self, mock_off_cond,
- mock_mapped, mock_fail_if):
+ def test__check_orphan_nodes_cleaning(self, mock_off_cond, mock_mapped,
+ mock_fail_if):
+ self.node.provision_state = states.CLEANING
+ self.node.save()
+ mock_off_cond.return_value = ['fake-conductor']
+
+ self.service._check_orphan_nodes(self.context)
+
+ self.node.refresh()
+ mock_off_cond.assert_called_once_with()
+ mock_mapped.assert_called_once_with(self.node.uuid, 'fake')
+ mock_fail_if.assert_called_once_with(
+ mock.ANY, {'uuid': self.node.uuid},
+ {states.DEPLOYING, states.CLEANING},
+ 'provision_updated_at',
+ callback_method=conductor_utils.abort_on_conductor_take_over,
+ err_handler=conductor_utils.provisioning_error_handler)
+ # assert node was released
+ self.assertIsNone(self.node.reservation)
+ self.assertIsNone(self.node.target_power_state)
+ self.assertIsNotNone(self.node.last_error)
+
+ def test__check_orphan_nodes_alive(self, mock_off_cond,
+ mock_mapped, mock_fail_if):
mock_off_cond.return_value = []
- self.service._check_deploying_status(self.context)
+ self.service._check_orphan_nodes(self.context)
self.node.refresh()
mock_off_cond.assert_called_once_with()
@@ -6302,7 +6328,7 @@ class ManagerCheckDeployingStatusTestCase(mgr_utils.ServiceSetUpMixin,
self.assertIsNotNone(self.node.reservation)
@mock.patch.object(objects.Node, 'release')
- def test__check_deploying_status_release_exceptions_skipping(
+ def test__check_orphan_nodes_release_exceptions_skipping(
self, mock_release, mock_off_cond, mock_mapped, mock_fail_if):
mock_off_cond.return_value = ['fake-conductor']
# Add another node so we can check both exceptions
@@ -6315,7 +6341,7 @@ class ManagerCheckDeployingStatusTestCase(mgr_utils.ServiceSetUpMixin,
mock_mapped.return_value = True
mock_release.side_effect = [exception.NodeNotFound('not found'),
exception.NodeLocked('locked')]
- self.service._check_deploying_status(self.context)
+ self.service._check_orphan_nodes(self.context)
self.node.refresh()
mock_off_cond.assert_called_once_with()
@@ -6325,23 +6351,53 @@ class ManagerCheckDeployingStatusTestCase(mgr_utils.ServiceSetUpMixin,
# Assert we skipped and didn't try to call _fail_if_in_state
self.assertFalse(mock_fail_if.called)
- @mock.patch.object(objects.Node, 'release')
- def test__check_deploying_status_release_node_not_locked(
- self, mock_release, mock_off_cond, mock_mapped, mock_fail_if):
+ def test__check_orphan_nodes_release_node_not_locked(
+ self, mock_off_cond, mock_mapped, mock_fail_if):
+ # this simulates releasing the node elsewhere
+ count = [0]
+
+ def _fake_release(*args, **kwargs):
+ self.node.reservation = None
+ self.node.save()
+ # raise an exception only the first time release is called
+ count[0] += 1
+ if count[0] == 1:
+ raise exception.NodeNotLocked('not locked')
+
mock_off_cond.return_value = ['fake-conductor']
mock_mapped.return_value = True
- mock_release.side_effect = exception.NodeNotLocked('not locked')
- self.service._check_deploying_status(self.context)
+ with mock.patch.object(objects.Node, 'release',
+ side_effect=_fake_release) as mock_release:
+ self.service._check_orphan_nodes(self.context)
+ mock_release.assert_called_with(self.context, mock.ANY,
+ self.node.id)
- self.node.refresh()
mock_off_cond.assert_called_once_with()
mock_mapped.assert_called_once_with(self.node.uuid, 'fake')
mock_fail_if.assert_called_once_with(
- mock.ANY, {'id': self.node.id}, states.DEPLOYING,
+ mock.ANY, {'uuid': self.node.uuid},
+ {states.DEPLOYING, states.CLEANING},
'provision_updated_at',
- callback_method=conductor_utils.cleanup_after_timeout,
+ callback_method=conductor_utils.abort_on_conductor_take_over,
err_handler=conductor_utils.provisioning_error_handler)
+ def test__check_orphan_nodes_maintenance(self, mock_off_cond, mock_mapped,
+ mock_fail_if):
+ self.node.maintenance = True
+ self.node.save()
+ mock_off_cond.return_value = ['fake-conductor']
+
+ self.service._check_orphan_nodes(self.context)
+
+ self.node.refresh()
+ mock_off_cond.assert_called_once_with()
+ mock_mapped.assert_called_once_with(self.node.uuid, 'fake')
+ # assert node was released
+ self.assertIsNone(self.node.reservation)
+ # not changing states in maintenance
+ self.assertFalse(mock_fail_if.called)
+ self.assertIsNotNone(self.node.target_power_state)
+
class TestIndirectionApiConductor(db_base.DbTestCase):
diff --git a/ironic/tests/unit/conductor/test_utils.py b/ironic/tests/unit/conductor/test_utils.py
index c35e1c31a..630557489 100644
--- a/ironic/tests/unit/conductor/test_utils.py
+++ b/ironic/tests/unit/conductor/test_utils.py
@@ -1242,6 +1242,25 @@ class ErrorHandlersTestCase(tests_base.TestCase):
self.assertIn(msg, self.node.last_error)
self.assertIn(msg, self.node.maintenance_reason)
+ def test_abort_on_conductor_take_over_cleaning(self):
+ self.node.maintenance = False
+ self.node.provision_state = states.CLEANFAIL
+ conductor_utils.abort_on_conductor_take_over(self.task)
+ self.assertTrue(self.node.maintenance)
+ self.assertIn('take over', self.node.maintenance_reason)
+ self.assertIn('take over', self.node.last_error)
+ self.task.driver.deploy.tear_down_cleaning.assert_called_once_with(
+ self.task)
+ self.node.save.assert_called_once_with()
+
+ def test_abort_on_conductor_take_over_deploying(self):
+ self.node.maintenance = False
+ self.node.provision_state = states.DEPLOYFAIL
+ conductor_utils.abort_on_conductor_take_over(self.task)
+ self.assertFalse(self.node.maintenance)
+ self.assertIn('take over', self.node.last_error)
+ self.node.save.assert_called_once_with()
+
@mock.patch.object(conductor_utils, 'LOG')
def test_spawn_cleaning_error_handler_no_worker(self, log_mock):
exc = exception.NoFreeConductorWorker()
diff --git a/releasenotes/notes/orphan-nodes-389cb6d90c2917ec.yaml b/releasenotes/notes/orphan-nodes-389cb6d90c2917ec.yaml
new file mode 100644
index 000000000..c5c3f1e11
--- /dev/null
+++ b/releasenotes/notes/orphan-nodes-389cb6d90c2917ec.yaml
@@ -0,0 +1,9 @@
+---
+fixes:
+ - |
+ On node take over, any locks that are left from the old conductor are
+ cleared by the new one. Previously it only happened for nodes in
+ ``DEPLOYING`` state.
+ - |
+ On taking over nodes in ``CLEANING`` state, the new conductor moves them
+ to ``CLEAN FAIL`` and sets maintenance.