Avoid n-cond startup abort for keystone failures

Conductor creates a placement client for the potential case where it needs to make a call for certain operations. A transient network or keystone failure will currently cause it to abort startup, which means it is not available for other unrelated activities, such as DB proxying for compute.

This makes conductor test the placement client on startup, but only abort startup on errors that are highly likely to be permanent configuration errors, and only warn about things like being unable to contact keystone/placement during initialization. If a non-fatal error is encountered at startup, later operations needing the placement client will retry initialization.

Conflicts:
    nova/tests/unit/conductor/test_conductor.py

NOTE(melwitt): The conflict is because change Id5b04cf2f6ca24af8e366d23f15cf0e5cac8e1cc (Use unittest.mock instead of third party mock) is not in Yoga.

Closes-Bug: #1846820
Change-Id: Idb7fcbce0c9562e7b9bd3e80f2a6d4b9bc286830
(cherry picked from commit 232684b44022f1bc4d72b07045900780de456e63)
author: Dan Smith <dansmith@redhat.com> 2022-08-11 10:18:25 -0700
committer: melanie witt <melwittt@gmail.com> 2022-09-22 23:04:14 +0000
commit: 19346082058d51c78bb157ca5e1304d15691dd9a (patch)
tree: c91b14cef97e3974a3bea98a5bf7c79647cb86dc
parent: 77273f067d96a4ec401c3b36f2922d63c4ad7103 (diff)
download: nova-19346082058d51c78bb157ca5e1304d15691dd9a.tar.gz
3 files changed, 87 insertions, 1 deletions
diff --git a/nova/conductor/manager.py b/nova/conductor/manager.py
index aaec1c99b7..53067bbef7 100644
--- a/nova/conductor/manager.py
+++ b/nova/conductor/manager.py
@@ -21,6 +21,7 @@ import eventlet
 import functools
 import sys
 
+from keystoneauth1 import exceptions as ks_exc
 from oslo_config import cfg
 from oslo_db import exception as db_exc
 from oslo_limit import exception as limit_exceptions
@@ -243,11 +244,42 @@ class ComputeTaskManager:
         self.network_api = neutron.API()
         self.servicegroup_api = servicegroup.API()
         self.query_client = query.SchedulerQueryClient()
-        self.report_client = report.report_client_singleton()
         self.notifier = rpc.get_notifier('compute')
         # Help us to record host in EventReporter
         self.host = CONF.host
 
+        try:
+            # Test our placement client during initialization
+            self.report_client
+        except (ks_exc.EndpointNotFound,
+                ks_exc.DiscoveryFailure,
+                ks_exc.RequestTimeout,
+                ks_exc.GatewayTimeout,
+                ks_exc.ConnectFailure) as e:
+            # Non-fatal, likely transient (although not definitely);
+            # continue startup but log the warning so that when things
+            # fail later, it will be clear why we can not do certain
+            # things.
+            LOG.warning('Unable to initialize placement client (%s); '
+                        'Continuing with startup, but some operations '
+                        'will not be possible.', e)
+        except (ks_exc.MissingAuthPlugin,
+                ks_exc.Unauthorized) as e:
+            # This is almost definitely fatal mis-configuration. The
+            # Unauthorized error might be transient, but it is
+            # probably reasonable to consider it fatal.
+            LOG.error('Fatal error initializing placement client; '
+                      'config is incorrect or incomplete: %s', e)
+            raise
+        except Exception as e:
+            # Unknown/unexpected errors here are fatal
+            LOG.error('Fatal error initializing placement client: %s', e)
+            raise
+
+    @property
+    def report_client(self):
+        return report.report_client_singleton()
+
     def reset(self):
         LOG.info('Reloading compute RPC API')
         compute_rpcapi.LAST_VERSION = None
diff --git a/nova/tests/unit/conductor/test_conductor.py b/nova/tests/unit/conductor/test_conductor.py
index 15aa960aad..8c954db9a7 100644
--- a/nova/tests/unit/conductor/test_conductor.py
+++ b/nova/tests/unit/conductor/test_conductor.py
@@ -17,6 +17,8 @@
 
 import copy
 
+import ddt
+from keystoneauth1 import exceptions as ks_exc
 import mock
 from oslo_db import exception as db_exc
 from oslo_limit import exception as limit_exceptions
@@ -52,6 +54,7 @@ from nova.objects import block_device as block_device_obj
 from nova.objects import fields
 from nova.objects import request_spec
 from nova.scheduler.client import query
+from nova.scheduler.client import report
 from nova.scheduler import utils as scheduler_utils
 from nova import test
 from nova.tests import fixtures
@@ -4869,3 +4872,35 @@ class ConductorTaskAPITestCase(_BaseTaskTestCase, test_compute.BaseTestCase):
             logtext)
         self.assertIn('host3\' because it is not up', logtext)
         self.assertIn('image1 failed 1 times', logtext)
+
+
+@ddt.ddt
+class TestConductorTaskManager(test.NoDBTestCase):
+    def test_placement_client_startup(self):
+        self.assertIsNone(report.PLACEMENTCLIENT)
+        conductor_manager.ComputeTaskManager()
+        self.assertIsNotNone(report.PLACEMENTCLIENT)
+
+    @ddt.data(ks_exc.MissingAuthPlugin,
+              ks_exc.Unauthorized,
+              test.TestingException)
+    def test_placement_client_startup_fatals(self, exc):
+        self.assertRaises(exc,
+                          self._test_placement_client_startup_exception, exc)
+
+    @ddt.data(ks_exc.EndpointNotFound,
+              ks_exc.DiscoveryFailure,
+              ks_exc.RequestTimeout,
+              ks_exc.GatewayTimeout,
+              ks_exc.ConnectFailure)
+    def test_placement_client_startup_non_fatal(self, exc):
+        self._test_placement_client_startup_exception(exc)
+
+    @mock.patch.object(report, 'LOG')
+    def _test_placement_client_startup_exception(self, exc, mock_log):
+        with mock.patch.object(report.SchedulerReportClient, '_create_client',
+                               side_effect=exc):
+            try:
+                conductor_manager.ComputeTaskManager()
+            finally:
+                mock_log.error.assert_called_once()
diff --git a/nova/tests/unit/scheduler/client/test_report.py b/nova/tests/unit/scheduler/client/test_report.py
index 485f187d9e..9b2f5c3a0a 100644
--- a/nova/tests/unit/scheduler/client/test_report.py
+++ b/nova/tests/unit/scheduler/client/test_report.py
@@ -185,6 +185,25 @@ class TestSingleton(test.NoDBTestCase):
             self.assertRaises(exc, report.report_client_singleton)
         mock_log.error.assert_called_once()
 
+    def test_error_then_success(self):
+        # Simulate an error
+        self._test_error(ks_exc.ConnectFailure)
+
+        # Ensure we did not set the global client
+        self.assertIsNone(report.PLACEMENTCLIENT)
+
+        # Call again, with no error
+        client = report.report_client_singleton()
+
+        # Make sure we got a client and that it was set as the global
+        # one
+        self.assertIsNotNone(client)
+        self.assertEqual(client, report.PLACEMENTCLIENT)
+
+        # Make sure we keep getting the same one
+        client2 = report.report_client_singleton()
+        self.assertEqual(client, client2)
+
 
 class TestConstructor(test.NoDBTestCase):
     def setUp(self):
author	Dan Smith <dansmith@redhat.com>	2022-08-11 10:18:25 -0700
committer	melanie witt <melwittt@gmail.com>	2022-09-22 23:04:14 +0000
commit	19346082058d51c78bb157ca5e1304d15691dd9a (patch)
tree	c91b14cef97e3974a3bea98a5bf7c79647cb86dc
parent	77273f067d96a4ec401c3b36f2922d63c4ad7103 (diff)
download	nova-19346082058d51c78bb157ca5e1304d15691dd9a.tar.gz