diff options
author | Josh Gachnang <josh@pcsforeducation.com> | 2015-04-10 15:15:33 -0700 |
---|---|---|
committer | Josh Gachnang <josh@pcsforeducation.com> | 2015-04-13 10:12:55 -0700 |
commit | af3918fb69701fa794cbbe9de9cafc69ac3e936d (patch) | |
tree | 653f9f14d55c0e709cbee723b3c0ce022a33f234 | |
parent | aa2cf65e4db36184e538885656f98e753fa6a768 (diff) | |
download | ironic-af3918fb69701fa794cbbe9de9cafc69ac3e936d.tar.gz |
Convert internal RPC continue_node_clean to a "cast"
The agent driver is using RPCs to call back from the driver to the
conductor asynchronously. When using the RPC.call() method, some nodes
would end up with stuck locks when using the agent driver during cleaning.
The agent driver would issue a call() to continue_node_clean() after
either the first heartbeat (from prepare_cleaning) or a heartbeat after
a clean step had completed. The conductor would attempt to get a lock,
but would not be able to. The node would retain its locked state (so
far as I could tell), even after the error. Other nodes would continue
and complete cleaning just fine. The exception raised by
continue_node_clean() was likely not caught by the agent driver, but
caught by vendor_passthru() in the conductor as an expected exception.
Switching to cast() avoids the issue because the errors are not sent
back to the caller. I didn't experience any more stuck locks with
this change.
Change-Id: I4dbb04ccb93199bba4e1a1614bc19b70a068a9ea
Closes-Bug: 1442810
-rw-r--r-- | ironic/conductor/manager.py | 6 | ||||
-rw-r--r-- | ironic/conductor/rpcapi.py | 16 | ||||
-rw-r--r-- | ironic/tests/conductor/test_manager.py | 16 | ||||
-rw-r--r-- | ironic/tests/conductor/test_rpcapi.py | 4 |
4 files changed, 16 insertions, 26 deletions
diff --git a/ironic/conductor/manager.py b/ironic/conductor/manager.py index 3322741ab..c2b75bca0 100644 --- a/ironic/conductor/manager.py +++ b/ironic/conductor/manager.py @@ -204,7 +204,7 @@ class ConductorManager(periodic_task.PeriodicTasks): """Ironic Conductor manager main class.""" # NOTE(rloo): This must be in sync with rpcapi.ConductorAPI's. - RPC_API_VERSION = '1.26' + RPC_API_VERSION = '1.27' target = messaging.Target(version=RPC_API_VERSION) @@ -824,10 +824,6 @@ class ConductorManager(periodic_task.PeriodicTasks): state=node.provision_state) self._do_node_clean(task) - @messaging.expected_exceptions(exception.NoFreeConductorWorker, - exception.NodeLocked, - exception.InvalidStateRequested, - exception.NodeNotFound) def continue_node_clean(self, context, node_id): """RPC method to continue cleaning a node. diff --git a/ironic/conductor/rpcapi.py b/ironic/conductor/rpcapi.py index 06c7d434c..1ca0db5ee 100644 --- a/ironic/conductor/rpcapi.py +++ b/ironic/conductor/rpcapi.py @@ -69,11 +69,12 @@ class ConductorAPI(object): | 1.24 - Added inspect_hardware method | 1.25 - Added destroy_port | 1.26 - Added continue_node_clean + | 1.27 - Convert continue_node_clean to cast """ # NOTE(rloo): This must be in sync with manager.ConductorManager's. - RPC_API_VERSION = '1.26' + RPC_API_VERSION = '1.27' def __init__(self, topic=None): super(ConductorAPI, self).__init__() @@ -329,18 +330,15 @@ class ConductorAPI(object): def continue_node_clean(self, context, node_id, topic=None): """Signal to conductor service to start the next cleaning action. + NOTE(JoshNang) this is an RPC cast, there will be no response or + exception raised by the conductor for this RPC. + :param context: request context. :param node_id: node id or uuid. :param topic: RPC topic. Defaults to self.topic. - :raises: NoFreeConductorWorker when there is no free worker to start - async task. - :raises: InvalidStateRequested if the requested action can not - be performed. 
- :raises: NodeLocked if node is locked by another conductor. - :raises: NodeNotFound if the node no longer appears in the database """ - cctxt = self.client.prepare(topic=topic or self.topic, version='1.26') - return cctxt.call(context, 'continue_node_clean', + cctxt = self.client.prepare(topic=topic or self.topic, version='1.27') + return cctxt.cast(context, 'continue_node_clean', node_id=node_id) def validate_driver_interfaces(self, context, node_id, topic=None): diff --git a/ironic/tests/conductor/test_manager.py b/ironic/tests/conductor/test_manager.py index 47a6e77f7..df0078712 100644 --- a/ironic/tests/conductor/test_manager.py +++ b/ironic/tests/conductor/test_manager.py @@ -1606,11 +1606,9 @@ class DoNodeCleanTestCase(_ServiceSetUpMixin, tests_db_base.DbTestCase): mock_spawn.side_effect = exception.NoFreeConductorWorker() - exc = self.assertRaises(messaging.rpc.ExpectedException, - self.service.continue_node_clean, - self.context, node.uuid) - # Compare true exception hidden by @messaging.expected_exceptions - self.assertEqual(exception.NoFreeConductorWorker, exc.exc_info[0]) + self.assertRaises(exception.NoFreeConductorWorker, + self.service.continue_node_clean, + self.context, node.uuid) self.service._worker_pool.waitall() node.refresh() @@ -1630,11 +1628,9 @@ class DoNodeCleanTestCase(_ServiceSetUpMixin, tests_db_base.DbTestCase): last_error=None) self._start_service() - exc = self.assertRaises(messaging.rpc.ExpectedException, - self.service.continue_node_clean, - self.context, node.uuid) - # Compare true exception hidden by @messaging.expected_exceptions - self.assertEqual(exception.InvalidStateRequested, exc.exc_info[0]) + self.assertRaises(exception.InvalidStateRequested, + self.service.continue_node_clean, + self.context, node.uuid) self.service._worker_pool.waitall() node.refresh() diff --git a/ironic/tests/conductor/test_rpcapi.py b/ironic/tests/conductor/test_rpcapi.py index 1ec18db3f..e7d27627c 100644 --- a/ironic/tests/conductor/test_rpcapi.py 
+++ b/ironic/tests/conductor/test_rpcapi.py @@ -291,6 +291,6 @@ class RPCAPITestCase(base.DbTestCase): def test_continue_node_clean(self): self._test_rpcapi('continue_node_clean', - 'call', - version='1.26', + 'cast', + version='1.27', node_id=self.fake_node['uuid']) |