author      Zuul <zuul@review.openstack.org>            2018-03-30 21:12:57 +0000
committer   Gerrit Code Review <review@openstack.org>   2018-03-30 21:12:57 +0000
commit      039e20d8751a81061dc64d4038d18face9fad5e4 (patch)
tree        19d5a8fdd0096c2bc15aea8b670f51b878308657
parent      0c297ccbb73d1424af3ae7d2a17c4805bd23ae4b (diff)
parent      a90b999a2d8afec0476dbd82d4eb2578ece313d0 (diff)
download    ironic-10.1.2.tar.gz
Merge "Rework logic handling reserved orphaned nodes in the conductor" into stable/queens10.1.2
-rw-r--r--  ironic/conductor/base_manager.py                         9
-rw-r--r--  ironic/conductor/manager.py                             71
-rw-r--r--  ironic/conductor/utils.py                               21
-rw-r--r--  ironic/tests/unit/conductor/test_manager.py            100
-rw-r--r--  ironic/tests/unit/conductor/test_utils.py               19
-rw-r--r--  releasenotes/notes/orphan-nodes-389cb6d90c2917ec.yaml    9
6 files changed, 191 insertions, 38 deletions
diff --git a/ironic/conductor/base_manager.py b/ironic/conductor/base_manager.py
index a75a5161f..2aaaab9a5 100644
--- a/ironic/conductor/base_manager.py
+++ b/ironic/conductor/base_manager.py
@@ -22,6 +22,7 @@ from futurist import rejection
from oslo_db import exception as db_exception
from oslo_log import log
from oslo_utils import excutils
+import six
from ironic.common import context as ironic_context
from ironic.common import driver_factory
@@ -440,7 +441,8 @@ class BaseConductorManager(object):
this would process nodes whose provision_updated_at
field value was 60 or more seconds before 'now'.
:param: provision_state: provision_state that the node is in,
- for the provisioning activity to have failed.
+ for the provisioning activity to have failed,
+ either one string or a set.
:param: sort_key: the nodes are sorted based on this key.
:param: callback_method: the callback method to be invoked in a
spawned thread, for a failed node. This
@@ -457,6 +459,9 @@ class BaseConductorManager(object):
fsm.
"""
+ if isinstance(provision_state, six.string_types):
+ provision_state = {provision_state}
+
node_iter = self.iter_nodes(filters=filters,
sort_key=sort_key,
sort_dir='asc')
@@ -467,7 +472,7 @@ class BaseConductorManager(object):
with task_manager.acquire(context, node_uuid,
purpose='node state check') as task:
if (task.node.maintenance or
- task.node.provision_state != provision_state):
+ task.node.provision_state not in provision_state):
continue
target_state = (None if not keep_target_state else
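[Editor's note, not part of the patch] A minimal sketch of the provision_state normalization the base_manager.py hunks above introduce: _fail_if_in_state now accepts either a single state string or a set of states, and a lone string is wrapped into a set before the membership check. The helper name below is ours, purely for illustration; the patch inlines this logic.

    import six

    from ironic.common import states


    def _normalize_provision_state(provision_state):
        # A single state string is wrapped into a one-element set, so the
        # periodic check can use a uniform "not in" membership test.
        if isinstance(provision_state, six.string_types):
            provision_state = {provision_state}
        return provision_state


    # Both the old single-state callers and the new multi-state caller work:
    assert states.DEPLOYING in _normalize_provision_state(states.DEPLOYING)
    assert states.CLEANING in _normalize_provision_state(
        {states.DEPLOYING, states.CLEANING})
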
diff --git a/ironic/conductor/manager.py b/ironic/conductor/manager.py
index 69f70450c..efe3536d1 100644
--- a/ironic/conductor/manager.py
+++ b/ironic/conductor/manager.py
@@ -1561,15 +1561,24 @@ class ConductorManager(base_manager.BaseConductorManager):
self._fail_if_in_state(context, filters, states.DEPLOYWAIT,
sort_key, callback_method, err_handler)
- @METRICS.timer('ConductorManager._check_deploying_status')
+ @METRICS.timer('ConductorManager._check_orphan_nodes')
@periodics.periodic(spacing=CONF.conductor.check_provision_state_interval)
- def _check_deploying_status(self, context):
- """Periodically checks the status of nodes in DEPLOYING state.
+ def _check_orphan_nodes(self, context):
+ """Periodically checks the status of nodes that were taken over.
- Periodically checks the nodes in DEPLOYING and the state of the
- conductor deploying them. If we find out that a conductor that
- was provisioning the node has died we then break release the
- node and gracefully mark the deployment as failed.
+ Periodically checks the nodes that are managed by this conductor but
+ have a reservation from a conductor that went offline.
+
+ 1. Nodes in DEPLOYING state move to DEPLOY FAIL.
+
+ 2. Nodes in CLEANING state move to CLEAN FAIL with maintenance set.
+
+ 3. Nodes in a transient power state get the power operation aborted.
+
+ 4. Reservation is removed.
+
+ The latter operation happens even for nodes in maintenance mode,
+ otherwise it's not possible to move them out of maintenance.
:param context: request context.
"""
@@ -1578,12 +1587,14 @@ class ConductorManager(base_manager.BaseConductorManager):
return
node_iter = self.iter_nodes(
- fields=['id', 'reservation'],
- filters={'provision_state': states.DEPLOYING,
- 'maintenance': False,
- 'reserved_by_any_of': offline_conductors})
+ fields=['id', 'reservation', 'maintenance', 'provision_state',
+ 'target_power_state'],
+ filters={'reserved_by_any_of': offline_conductors})
+
+ state_cleanup_required = []
- for node_uuid, driver, node_id, conductor_hostname in node_iter:
+ for (node_uuid, driver, node_id, conductor_hostname,
+ maintenance, provision_state, target_power_state) in node_iter:
# NOTE(lucasagomes): Although very rare, this may lead to a
# race condition. By the time we release the lock the conductor
# that was previously managing the node could be back online.
@@ -1604,11 +1615,43 @@ class ConductorManager(base_manager.BaseConductorManager):
LOG.warning("During checking for deploying state, when "
"releasing the lock of the node %s, it was "
"already unlocked.", node_uuid)
+ else:
+ LOG.warning('Forcibly removed reservation of conductor %(old)s'
+ ' on node %(node)s as that conductor went offline',
+ {'old': conductor_hostname, 'node': node_uuid})
+
+ # TODO(dtantsur): clean up all states that are not stable and
+ # are not one of WAIT states.
+ if not maintenance and (provision_state in (states.DEPLOYING,
+ states.CLEANING) or
+ target_power_state is not None):
+ LOG.debug('Node %(node)s taken over from conductor %(old)s '
+ 'requires state clean up: provision state is '
+ '%(state)s, target power state is %(pstate)s',
+ {'node': node_uuid, 'old': conductor_hostname,
+ 'state': provision_state,
+ 'pstate': target_power_state})
+ state_cleanup_required.append(node_uuid)
+
+ for node_uuid in state_cleanup_required:
+ with task_manager.acquire(context, node_uuid,
+ purpose='power state clean up') as task:
+ if not task.node.maintenance and task.node.target_power_state:
+ old_state = task.node.target_power_state
+ task.node.target_power_state = None
+ task.node.last_error = _('Pending power operation was '
+ 'aborted due to conductor take '
+ 'over')
+ task.node.save()
+ LOG.warning('Aborted pending power operation %(op)s '
+ 'on node %(node)s due to conductor take over',
+ {'op': old_state, 'node': node_uuid})
self._fail_if_in_state(
- context, {'id': node_id}, states.DEPLOYING,
+ context, {'uuid': node_uuid},
+ {states.DEPLOYING, states.CLEANING},
'provision_updated_at',
- callback_method=utils.cleanup_after_timeout,
+ callback_method=utils.abort_on_conductor_take_over,
err_handler=utils.provisioning_error_handler)
@METRICS.timer('ConductorManager._do_adoption')
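[Editor's note, not part of the patch] A condensed sketch of the decision made by the new _check_orphan_nodes pass above, under simplified stand-in arguments rather than the manager's real node objects: every orphaned node loses its stale reservation, but state clean-up is scheduled only for non-maintenance nodes that are mid-deploy, mid-clean, or have a pending power operation.

    from ironic.common import states


    def _needs_state_cleanup(maintenance, provision_state, target_power_state):
        # Maintenance nodes only get their stale reservation removed.
        if maintenance:
            return False
        # Nodes stuck in DEPLOYING/CLEANING, or with a pending power change,
        # need their state cleaned up after the take over.
        return (provision_state in (states.DEPLOYING, states.CLEANING)
                or target_power_state is not None)


    assert _needs_state_cleanup(False, states.DEPLOYING, states.POWER_ON)
    assert _needs_state_cleanup(False, states.ACTIVE, states.POWER_OFF)
    assert not _needs_state_cleanup(True, states.CLEANING, None)
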
diff --git a/ironic/conductor/utils.py b/ironic/conductor/utils.py
index 94075a4e6..29772e0dc 100644
--- a/ironic/conductor/utils.py
+++ b/ironic/conductor/utils.py
@@ -380,6 +380,27 @@ def cleaning_error_handler(task, msg, tear_down_cleaning=True,
task.process_event('fail', target_state=target_state)
+@task_manager.require_exclusive_lock
+def abort_on_conductor_take_over(task):
+ """Set node's state when a task was aborted due to conductor take over.
+
+ :param task: a TaskManager instance.
+ """
+ msg = _('Operation was aborted due to conductor take over')
+ # By this time the "fail" event was processed, so we cannot end up in
+ # CLEANING or CLEAN WAIT, only in CLEAN FAIL.
+ if task.node.provision_state == states.CLEANFAIL:
+ cleaning_error_handler(task, msg, set_fail_state=False)
+ else:
+ # For aborted deployment (and potentially other operations), just set
+ # the last_error accordingly.
+ task.node.last_error = msg
+ task.node.save()
+
+ LOG.warning('Aborted the current operation on node %s due to '
+ 'conductor take over', task.node.uuid)
+
+
def rescuing_error_handler(task, msg, set_fail_state=True):
"""Cleanup rescue task after timeout or failure.
diff --git a/ironic/tests/unit/conductor/test_manager.py b/ironic/tests/unit/conductor/test_manager.py
index 1fe20b9d0..8e2229477 100644
--- a/ironic/tests/unit/conductor/test_manager.py
+++ b/ironic/tests/unit/conductor/test_manager.py
@@ -6252,16 +6252,17 @@ class DestroyPortgroupTestCase(mgr_utils.ServiceSetUpMixin,
@mock.patch.object(manager.ConductorManager, '_fail_if_in_state')
@mock.patch.object(manager.ConductorManager, '_mapped_to_this_conductor')
@mock.patch.object(dbapi.IMPL, 'get_offline_conductors')
-class ManagerCheckDeployingStatusTestCase(mgr_utils.ServiceSetUpMixin,
- db_base.DbTestCase):
+class ManagerCheckOrphanNodesTestCase(mgr_utils.ServiceSetUpMixin,
+ db_base.DbTestCase):
def setUp(self):
- super(ManagerCheckDeployingStatusTestCase, self).setUp()
+ super(ManagerCheckOrphanNodesTestCase, self).setUp()
self._start_service()
self.node = obj_utils.create_test_node(
self.context, id=1, uuid=uuidutils.generate_uuid(),
driver='fake', provision_state=states.DEPLOYING,
- target_provision_state=states.DEPLOYDONE,
+ target_provision_state=states.ACTIVE,
+ target_power_state=states.POWER_ON,
reservation='fake-conductor')
# create a second node in a different state to test the
@@ -6271,28 +6272,53 @@ class ManagerCheckDeployingStatusTestCase(mgr_utils.ServiceSetUpMixin,
driver='fake', provision_state=states.AVAILABLE,
target_provision_state=states.NOSTATE)
- def test__check_deploying_status(self, mock_off_cond, mock_mapped,
- mock_fail_if):
+ def test__check_orphan_nodes(self, mock_off_cond, mock_mapped,
+ mock_fail_if):
mock_off_cond.return_value = ['fake-conductor']
- self.service._check_deploying_status(self.context)
+ self.service._check_orphan_nodes(self.context)
self.node.refresh()
mock_off_cond.assert_called_once_with()
mock_mapped.assert_called_once_with(self.node.uuid, 'fake')
mock_fail_if.assert_called_once_with(
- mock.ANY, {'id': self.node.id}, states.DEPLOYING,
+ mock.ANY, {'uuid': self.node.uuid},
+ {states.DEPLOYING, states.CLEANING},
'provision_updated_at',
- callback_method=conductor_utils.cleanup_after_timeout,
+ callback_method=conductor_utils.abort_on_conductor_take_over,
err_handler=conductor_utils.provisioning_error_handler)
# assert node was released
self.assertIsNone(self.node.reservation)
+ self.assertIsNone(self.node.target_power_state)
+ self.assertIsNotNone(self.node.last_error)
- def test__check_deploying_status_alive(self, mock_off_cond,
- mock_mapped, mock_fail_if):
+ def test__check_orphan_nodes_cleaning(self, mock_off_cond, mock_mapped,
+ mock_fail_if):
+ self.node.provision_state = states.CLEANING
+ self.node.save()
+ mock_off_cond.return_value = ['fake-conductor']
+
+ self.service._check_orphan_nodes(self.context)
+
+ self.node.refresh()
+ mock_off_cond.assert_called_once_with()
+ mock_mapped.assert_called_once_with(self.node.uuid, 'fake')
+ mock_fail_if.assert_called_once_with(
+ mock.ANY, {'uuid': self.node.uuid},
+ {states.DEPLOYING, states.CLEANING},
+ 'provision_updated_at',
+ callback_method=conductor_utils.abort_on_conductor_take_over,
+ err_handler=conductor_utils.provisioning_error_handler)
+ # assert node was released
+ self.assertIsNone(self.node.reservation)
+ self.assertIsNone(self.node.target_power_state)
+ self.assertIsNotNone(self.node.last_error)
+
+ def test__check_orphan_nodes_alive(self, mock_off_cond,
+ mock_mapped, mock_fail_if):
mock_off_cond.return_value = []
- self.service._check_deploying_status(self.context)
+ self.service._check_orphan_nodes(self.context)
self.node.refresh()
mock_off_cond.assert_called_once_with()
@@ -6302,7 +6328,7 @@ class ManagerCheckDeployingStatusTestCase(mgr_utils.ServiceSetUpMixin,
self.assertIsNotNone(self.node.reservation)
@mock.patch.object(objects.Node, 'release')
- def test__check_deploying_status_release_exceptions_skipping(
+ def test__check_orphan_nodes_release_exceptions_skipping(
self, mock_release, mock_off_cond, mock_mapped, mock_fail_if):
mock_off_cond.return_value = ['fake-conductor']
# Add another node so we can check both exceptions
@@ -6315,7 +6341,7 @@ class ManagerCheckDeployingStatusTestCase(mgr_utils.ServiceSetUpMixin,
mock_mapped.return_value = True
mock_release.side_effect = [exception.NodeNotFound('not found'),
exception.NodeLocked('locked')]
- self.service._check_deploying_status(self.context)
+ self.service._check_orphan_nodes(self.context)
self.node.refresh()
mock_off_cond.assert_called_once_with()
@@ -6325,23 +6351,53 @@ class ManagerCheckDeployingStatusTestCase(mgr_utils.ServiceSetUpMixin,
# Assert we skipped and didn't try to call _fail_if_in_state
self.assertFalse(mock_fail_if.called)
- @mock.patch.object(objects.Node, 'release')
- def test__check_deploying_status_release_node_not_locked(
- self, mock_release, mock_off_cond, mock_mapped, mock_fail_if):
+ def test__check_orphan_nodes_release_node_not_locked(
+ self, mock_off_cond, mock_mapped, mock_fail_if):
+ # this simulates releasing the node elsewhere
+ count = [0]
+
+ def _fake_release(*args, **kwargs):
+ self.node.reservation = None
+ self.node.save()
+ # raise an exception only the first time release is called
+ count[0] += 1
+ if count[0] == 1:
+ raise exception.NodeNotLocked('not locked')
+
mock_off_cond.return_value = ['fake-conductor']
mock_mapped.return_value = True
- mock_release.side_effect = exception.NodeNotLocked('not locked')
- self.service._check_deploying_status(self.context)
+ with mock.patch.object(objects.Node, 'release',
+ side_effect=_fake_release) as mock_release:
+ self.service._check_orphan_nodes(self.context)
+ mock_release.assert_called_with(self.context, mock.ANY,
+ self.node.id)
- self.node.refresh()
mock_off_cond.assert_called_once_with()
mock_mapped.assert_called_once_with(self.node.uuid, 'fake')
mock_fail_if.assert_called_once_with(
- mock.ANY, {'id': self.node.id}, states.DEPLOYING,
+ mock.ANY, {'uuid': self.node.uuid},
+ {states.DEPLOYING, states.CLEANING},
'provision_updated_at',
- callback_method=conductor_utils.cleanup_after_timeout,
+ callback_method=conductor_utils.abort_on_conductor_take_over,
err_handler=conductor_utils.provisioning_error_handler)
+ def test__check_orphan_nodes_maintenance(self, mock_off_cond, mock_mapped,
+ mock_fail_if):
+ self.node.maintenance = True
+ self.node.save()
+ mock_off_cond.return_value = ['fake-conductor']
+
+ self.service._check_orphan_nodes(self.context)
+
+ self.node.refresh()
+ mock_off_cond.assert_called_once_with()
+ mock_mapped.assert_called_once_with(self.node.uuid, 'fake')
+ # assert node was released
+ self.assertIsNone(self.node.reservation)
+ # not changing states in maintenance
+ self.assertFalse(mock_fail_if.called)
+ self.assertIsNotNone(self.node.target_power_state)
+
class TestIndirectionApiConductor(db_base.DbTestCase):
diff --git a/ironic/tests/unit/conductor/test_utils.py b/ironic/tests/unit/conductor/test_utils.py
index c35e1c31a..630557489 100644
--- a/ironic/tests/unit/conductor/test_utils.py
+++ b/ironic/tests/unit/conductor/test_utils.py
@@ -1242,6 +1242,25 @@ class ErrorHandlersTestCase(tests_base.TestCase):
self.assertIn(msg, self.node.last_error)
self.assertIn(msg, self.node.maintenance_reason)
+ def test_abort_on_conductor_take_over_cleaning(self):
+ self.node.maintenance = False
+ self.node.provision_state = states.CLEANFAIL
+ conductor_utils.abort_on_conductor_take_over(self.task)
+ self.assertTrue(self.node.maintenance)
+ self.assertIn('take over', self.node.maintenance_reason)
+ self.assertIn('take over', self.node.last_error)
+ self.task.driver.deploy.tear_down_cleaning.assert_called_once_with(
+ self.task)
+ self.node.save.assert_called_once_with()
+
+ def test_abort_on_conductor_take_over_deploying(self):
+ self.node.maintenance = False
+ self.node.provision_state = states.DEPLOYFAIL
+ conductor_utils.abort_on_conductor_take_over(self.task)
+ self.assertFalse(self.node.maintenance)
+ self.assertIn('take over', self.node.last_error)
+ self.node.save.assert_called_once_with()
+
@mock.patch.object(conductor_utils, 'LOG')
def test_spawn_cleaning_error_handler_no_worker(self, log_mock):
exc = exception.NoFreeConductorWorker()
diff --git a/releasenotes/notes/orphan-nodes-389cb6d90c2917ec.yaml b/releasenotes/notes/orphan-nodes-389cb6d90c2917ec.yaml
new file mode 100644
index 000000000..c5c3f1e11
--- /dev/null
+++ b/releasenotes/notes/orphan-nodes-389cb6d90c2917ec.yaml
@@ -0,0 +1,9 @@
+---
+fixes:
+ - |
+ On node take over, any locks that are left from the old conductor are
+ cleared by the new one. Previously it only happened for nodes in
+ ``DEPLOYING`` state.
+ - |
+ On taking over nodes in ``CLEANING`` state, the new conductor moves them
+ to ``CLEAN FAIL`` and sets maintenance.