diff options
author | Lucas Alvares Gomes <lucasagomes@gmail.com> | 2014-06-18 17:19:37 +0100 |
---|---|---|
committer | Lucas Alvares Gomes <lucasagomes@gmail.com> | 2014-07-24 17:44:10 +0100 |
commit | 8b554deed62d86d19d7459ab1ac36cf2dc27745a (patch) | |
tree | 086f958d21a7668c7f10cc97194413c520d9369c | |
parent | 401eae8a89cee131198325b89a2c7208d74d94c2 (diff) | |
download | ironic-8b554deed62d86d19d7459ab1ac36cf2dc27745a.tar.gz |
Fix nodes left in an incosistent state if no workers
This patch is fixing the problem of leaving the nodes in an inconsistent
state if there's no free conductor workers available to deploy or the
tear down a node, the patch is using the set_spawn_error_hook() method
of TaskManager to run some custom code that will rollback the nodes
to the previous provision_state and target_provision_state in case
NoFreeConductorWorker is raised.
Closes-Bug: #1331494
Change-Id: I5d6e8e2c69cbdf1f9abe169afe617aa79783e57d
-rw-r--r-- | ironic/conductor/manager.py | 54 | ||||
-rw-r--r-- | ironic/tests/conductor/test_manager.py | 27 |
2 files changed, 73 insertions, 8 deletions
diff --git a/ironic/conductor/manager.py b/ironic/conductor/manager.py index d8162525d..ad81df4a9 100644 --- a/ironic/conductor/manager.py +++ b/ironic/conductor/manager.py @@ -53,6 +53,7 @@ from oslo import messaging from ironic.common import driver_factory from ironic.common import exception from ironic.common import hash_ring as hash +from ironic.common import i18n from ironic.common import neutron from ironic.common import states from ironic.conductor import task_manager @@ -60,7 +61,6 @@ from ironic.conductor import utils from ironic.db import api as dbapi from ironic import objects from ironic.openstack.common import excutils -from ironic.openstack.common.gettextutils import _LI from ironic.openstack.common import lockutils from ironic.openstack.common import log from ironic.openstack.common import periodic_task @@ -68,6 +68,9 @@ from ironic.openstack.common import periodic_task MANAGER_TOPIC = 'ironic.conductor_manager' WORKER_SPAWN_lOCK = "conductor_worker_spawn" +_LW = i18n._LW +_LI = i18n._LI + LOG = log.getLogger(__name__) conductor_opts = [ @@ -340,6 +343,34 @@ class ConductorManager(periodic_task.PeriodicTasks): method=driver_method, **info) + def _provisioning_error_handler(self, e, node, context, provision_state, + target_provision_state): + """Set the node's provisioning states if error occurs. + + This hook gets called upon an exception being raised when spawning + the worker to do the deployment or tear down of a node. + + :param e: the exception object that was raised. + :param node: an Ironic node object. + :param context: security context. + :param provision_state: the provision state to be set on + the node. + :param target_provision_state: the target provision state to be + set on the node. + + """ + if isinstance(e, exception.NoFreeConductorWorker): + node.provision_state = provision_state + node.target_provision_state = target_provision_state + node.last_error = (_("No free conductor workers available")) + node.save(context) + LOG.warning(_LW("No free conductor workers available to perform " + "an action on node %(node)s, setting node's " + "provision_state back to %(prov_state)s and " + "target_provision_state to %(tgt_prov_state)s."), + {'node': node.uuid, 'prov_state': provision_state, + 'tgt_prov_state': target_provision_state}) + @messaging.expected_exceptions(exception.NoFreeConductorWorker, exception.NodeLocked, exception.NodeInMaintenance, @@ -395,11 +426,21 @@ class ConductorManager(periodic_task.PeriodicTasks): "RPC do_node_deploy failed to validate deploy info. " "Error: %(msg)s") % {'msg': e}) + # Save the previous states so we can rollback the node to a + # consistent state in case there's no free workers to do the + # deploy work + previous_prov_state = node.provision_state + previous_tgt_provision_state = node.target_provision_state + # Set target state to expose that work is in progress node.provision_state = states.DEPLOYING node.target_provision_state = states.DEPLOYDONE node.last_error = None node.save(context) + + task.set_spawn_error_hook(self._provisioning_error_handler, node, + context, previous_prov_state, + previous_tgt_provision_state) task.spawn_after(self._spawn_worker, self._do_node_deploy, context, task) @@ -466,10 +507,21 @@ class ConductorManager(periodic_task.PeriodicTasks): "RPC do_node_tear_down failed to validate deploy info. " "Error: %(msg)s") % {'msg': e}) + # save the previous states so we can rollback the node to a + # consistent state in case there's no free workers to do the + # tear down work + previous_prov_state = node.provision_state + previous_tgt_provision_state = node.target_provision_state + + # set target state to expose that work is in progress node.provision_state = states.DELETING node.target_provision_state = states.DELETED node.last_error = None node.save(context) + + task.set_spawn_error_hook(self._provisioning_error_handler, node, + context, previous_prov_state, + previous_tgt_provision_state) task.spawn_after(self._spawn_worker, self._do_node_tear_down, context, task) diff --git a/ironic/tests/conductor/test_manager.py b/ironic/tests/conductor/test_manager.py index 2e3c20e3f..76dfeb8dd 100644 --- a/ironic/tests/conductor/test_manager.py +++ b/ironic/tests/conductor/test_manager.py @@ -720,7 +720,12 @@ class ManagerTestCase(tests_db_base.DbTestCase): mock_spawn.assert_called_once_with(mock.ANY, mock.ANY, mock.ANY) def test_do_node_deploy_worker_pool_full(self): - node = obj_utils.create_test_node(self.context, driver='fake') + prv_state = states.NOSTATE + tgt_prv_state = states.NOSTATE + node = obj_utils.create_test_node(self.context, + provision_state=prv_state, + target_provision_state=tgt_prv_state, + last_error=None, driver='fake') self._start_service() with mock.patch.object(self.service, '_spawn_worker') as mock_spawn: @@ -733,8 +738,10 @@ class ManagerTestCase(tests_db_base.DbTestCase): self.assertEqual(exception.NoFreeConductorWorker, exc.exc_info[0]) self.service._worker_pool.waitall() node.refresh() - # This is a sync operation last_error should be None. - self.assertIsNone(node.last_error) + # Make sure things were rolled back + self.assertEqual(prv_state, node.provision_state) + self.assertEqual(tgt_prv_state, node.target_provision_state) + self.assertIsNotNone(node.last_error) # Verify reservation has been cleared. self.assertIsNone(node.reservation) @@ -818,10 +825,14 @@ class ManagerTestCase(tests_db_base.DbTestCase): @mock.patch('ironic.conductor.manager.ConductorManager._spawn_worker') def test_do_node_tear_down_worker_pool_full(self, mock_spawn): + prv_state = states.ACTIVE + tgt_prv_state = states.NOSTATE fake_instance_info = {'foo': 'bar'} node = obj_utils.create_test_node(self.context, driver='fake', - provision_state=states.ACTIVE, - instance_info=fake_instance_info) + provision_state=prv_state, + target_provision_state=tgt_prv_state, + instance_info=fake_instance_info, + last_error=None) self._start_service() mock_spawn.side_effect = exception.NoFreeConductorWorker() @@ -833,10 +844,12 @@ class ManagerTestCase(tests_db_base.DbTestCase): self.assertEqual(exception.NoFreeConductorWorker, exc.exc_info[0]) self.service._worker_pool.waitall() node.refresh() - # This is a sync operation last_error should be None. - self.assertIsNone(node.last_error) # Assert instance_info was not touched self.assertEqual(fake_instance_info, node.instance_info) + # Make sure things were rolled back + self.assertEqual(prv_state, node.provision_state) + self.assertEqual(tgt_prv_state, node.target_provision_state) + self.assertIsNotNone(node.last_error) # Verify reservation has been cleared. self.assertIsNone(node.reservation) |