diff options
-rw-r--r-- | doc/source/install/enrollment.rst | 26 | ||||
-rw-r--r-- | ironic/api/controllers/v1/ramdisk.py | 6 | ||||
-rw-r--r-- | ironic/common/kickstart_utils.py | 4 | ||||
-rw-r--r-- | ironic/common/molds.py | 6 | ||||
-rw-r--r-- | ironic/common/rpc_service.py | 18 | ||||
-rw-r--r-- | ironic/conductor/base_manager.py | 17 | ||||
-rw-r--r-- | ironic/conf/default.py | 5 | ||||
-rw-r--r-- | ironic/drivers/modules/ansible/playbooks/library/stream_url.py | 3 | ||||
-rw-r--r-- | ironic/tests/unit/common/test_kickstart_utils.py | 3 | ||||
-rw-r--r-- | ironic/tests/unit/common/test_molds.py | 35 | ||||
-rw-r--r-- | ironic/tests/unit/common/test_rpc_service.py | 50 | ||||
-rw-r--r-- | releasenotes/notes/graceful_shutdown_wait-9a62627714b86726.yaml | 15 |
12 files changed, 144 insertions, 44 deletions
diff --git a/doc/source/install/enrollment.rst b/doc/source/install/enrollment.rst index 9f9355d75..40c63b4bb 100644 --- a/doc/source/install/enrollment.rst +++ b/doc/source/install/enrollment.rst @@ -81,15 +81,9 @@ affected, since the initial provision state is still ``available``. However, using API version 1.11 or above may break existing automation tooling with respect to node creation. -The default API version used by (the most recent) python-ironicclient is 1.9, -but it may change in the future and should not be relied on. - -In the examples below we will use version 1.11 of the Bare metal API. -This gives us the following advantages: - -* Explicit power credentials validation before leaving the ``enroll`` state. -* Running node cleaning before entering the ``available`` state. -* Not exposing half-configured nodes to the scheduler. +The ``openstack baremetal`` command line tool tries to negotiate the most +recent supported version, which in virtually all cases will be newer than +1.11. To set the API version for all commands, you can set the environment variable ``IRONIC_API_VERSION``. For the OpenStackClient baremetal plugin, set @@ -118,7 +112,6 @@ and may be combined if desired. .. code-block:: console - $ export OS_BAREMETAL_API_VERSION=1.11 $ baremetal node create --driver ipmi +--------------+--------------------------------------+ | Property | Value | @@ -423,12 +416,13 @@ Validating node information Making node available for deployment ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -In order for nodes to be available for deploying workloads on them, nodes -must be in the ``available`` provision state. To do this, nodes -created with API version 1.11 and above must be moved from the ``enroll`` state -to the ``manageable`` state and then to the ``available`` state. -This section can be safely skipped, if API version 1.10 or earlier is used -(which is the case by default). +In order for nodes to be available for deploying workloads on them, nodes must +be in the ``available`` provision state. To do this, nodes must be moved from +the ``enroll`` state to the ``manageable`` state and then to the ``available`` +state. + +.. note:: + This section can be skipped, if API version 1.10 or earlier is used. After creating a node and before moving it from its initial provision state of ``enroll``, basic power and port information needs to be configured on the node. diff --git a/ironic/api/controllers/v1/ramdisk.py b/ironic/api/controllers/v1/ramdisk.py index 5feef5e02..b98eb7dc2 100644 --- a/ironic/api/controllers/v1/ramdisk.py +++ b/ironic/api/controllers/v1/ramdisk.py @@ -131,13 +131,17 @@ class LookupController(rest.RestController): else: node = objects.Node.get_by_port_addresses( api.request.context, valid_addresses) - except exception.NotFound: + except exception.NotFound as e: # NOTE(dtantsur): we are reraising the same exception to make sure # we don't disclose the difference between nodes that are not found # at all and nodes in a wrong state by different error messages. + LOG.error('No node has been found during lookup: %s', e) raise exception.NotFound() if CONF.api.restrict_lookup and not self.lookup_allowed(node): + LOG.error('Lookup is not allowed for node %(node)s in the ' + 'provision state %(state)s', + {'node': node.uuid, 'state': node.provision_state}) raise exception.NotFound() if api_utils.allow_agent_token(): diff --git a/ironic/common/kickstart_utils.py b/ironic/common/kickstart_utils.py index 433cf2390..4e02e2ea7 100644 --- a/ironic/common/kickstart_utils.py +++ b/ironic/common/kickstart_utils.py @@ -23,6 +23,7 @@ import pycdlib import requests from ironic.common import exception +from ironic.conf import CONF LOG = logging.getLogger(__name__) @@ -107,7 +108,8 @@ def decode_and_extract_config_drive_iso(config_drive_iso_gz): def _fetch_config_drive_from_url(url): try: - config_drive = requests.get(url).content + config_drive = requests.get( + url, timeout=CONF.webserver_connection_timeout).content except requests.exceptions.RequestException as e: raise exception.InstanceDeployFailure( "Can't download the configdrive content from '%(url)s'. " diff --git a/ironic/common/molds.py b/ironic/common/molds.py index 234fcc6e3..a77e42a63 100644 --- a/ironic/common/molds.py +++ b/ironic/common/molds.py @@ -49,7 +49,8 @@ def save_configuration(task, url, data): ) def _request(url, data, auth_header): return requests.put( - url, data=json.dumps(data, indent=2), headers=auth_header) + url, data=json.dumps(data, indent=2), headers=auth_header, + timeout=CONF.webserver_connection_timeout) auth_header = _get_auth_header(task) response = _request(url, data, auth_header) @@ -76,7 +77,8 @@ def get_configuration(task, url): reraise=True ) def _request(url, auth_header): - return requests.get(url, headers=auth_header) + return requests.get(url, headers=auth_header, + timeout=CONF.webserver_connection_timeout) auth_header = _get_auth_header(task) response = _request(url, auth_header) diff --git a/ironic/common/rpc_service.py b/ironic/common/rpc_service.py index cb0f23c98..a74f6bab3 100644 --- a/ironic/common/rpc_service.py +++ b/ironic/common/rpc_service.py @@ -100,7 +100,8 @@ class RPCService(service.Service): seconds=CONF.hash_ring_reset_interval) try: - self.manager.del_host(deregister=self.deregister) + self.manager.del_host(deregister=self.deregister, + clear_node_reservations=False) except Exception as e: LOG.exception('Service error occurred when cleaning up ' 'the RPC manager. Error: %s', e) @@ -127,6 +128,21 @@ class RPCService(service.Service): LOG.info('Stopped RPC server for service %(service)s on host ' '%(host)s.', {'service': self.topic, 'host': self.host}) + + # Wait for reservation locks held by this conductor. + # The conductor process will end when: + # - All reservations for this conductor are released + # - CONF.graceful_shutdown_timeout has elapsed + # - The process manager (systemd, kubernetes) sends SIGKILL after the + # configured graceful period + graceful_time = initial_time + datetime.timedelta( + seconds=CONF.graceful_shutdown_timeout) + while (self.manager.has_reserved() + and graceful_time > timeutils.utcnow()): + LOG.info('Waiting for reserved nodes to clear on host %(host)s', + {'host': self.host}) + time.sleep(1) + rpc.set_global_manager(None) def _handle_signal(self, signo, frame): diff --git a/ironic/conductor/base_manager.py b/ironic/conductor/base_manager.py index 5c2e4ea95..544411e1d 100644 --- a/ironic/conductor/base_manager.py +++ b/ironic/conductor/base_manager.py @@ -298,15 +298,17 @@ class BaseConductorManager(object): # This is only used in tests currently. Delete it? self._periodic_task_callables = periodic_task_callables - def del_host(self, deregister=True): + def del_host(self, deregister=True, clear_node_reservations=True): # Conductor deregistration fails if called on non-initialized # conductor (e.g. when rpc server is unreachable). if not hasattr(self, 'conductor'): return self._shutdown = True self._keepalive_evt.set() - # clear all locks held by this conductor before deregistering - self.dbapi.clear_node_reservations_for_conductor(self.host) + + if clear_node_reservations: + # clear all locks held by this conductor before deregistering + self.dbapi.clear_node_reservations_for_conductor(self.host) if deregister: try: # Inform the cluster that this conductor is shutting down. @@ -338,6 +340,15 @@ class BaseConductorManager(object): """Return a count of currently online conductors""" return len(self.dbapi.get_online_conductors()) + def has_reserved(self): + """Determines if this host currently has any reserved nodes + + :returns: True if this host has reserved nodes + """ + return bool(self.dbapi.get_nodeinfo_list( + filters={'reserved_by_any_of': [self.host]}, + limit=1)) + def _register_and_validate_hardware_interfaces(self, hardware_types): """Register and validate hardware interfaces for this conductor. diff --git a/ironic/conf/default.py b/ironic/conf/default.py index c7aff69cc..5b40c1f31 100644 --- a/ironic/conf/default.py +++ b/ironic/conf/default.py @@ -426,8 +426,9 @@ webserver_opts = [ 'Defaults to True.')), cfg.IntOpt('webserver_connection_timeout', default=60, - help=_('Connection timeout when accessing remote web servers ' - 'with images.')), + help=_('Connection timeout when accessing/interacting with ' + 'remote web servers with images or other artifacts ' + 'being accessed.')), ] diff --git a/ironic/drivers/modules/ansible/playbooks/library/stream_url.py b/ironic/drivers/modules/ansible/playbooks/library/stream_url.py index 0da3cc4dd..786b65013 100644 --- a/ironic/drivers/modules/ansible/playbooks/library/stream_url.py +++ b/ironic/drivers/modules/ansible/playbooks/library/stream_url.py @@ -31,7 +31,8 @@ class StreamingDownloader(object): else: self.hasher = None self.chunksize = chunksize - resp = requests.get(url, stream=True, verify=verify, cert=certs) + resp = requests.get(url, stream=True, verify=verify, cert=certs, + timeout=30) if resp.status_code != 200: raise Exception('Invalid response code: %s' % resp.status_code) diff --git a/ironic/tests/unit/common/test_kickstart_utils.py b/ironic/tests/unit/common/test_kickstart_utils.py index 0dd1ac572..db6123b9d 100644 --- a/ironic/tests/unit/common/test_kickstart_utils.py +++ b/ironic/tests/unit/common/test_kickstart_utils.py @@ -129,4 +129,5 @@ echo $CONTENT | /usr/bin/base64 --decode > {file_path}\n\ task.node.instance_info = i_info task.node.save() self.assertEqual(expected, ks_utils.prepare_config_drive(task)) - mock_get.assert_called_with('http://server/fake-configdrive-url') + mock_get.assert_called_with('http://server/fake-configdrive-url', + timeout=60) diff --git a/ironic/tests/unit/common/test_molds.py b/ironic/tests/unit/common/test_molds.py index 810dd61bc..2323c2fa8 100644 --- a/ironic/tests/unit/common/test_molds.py +++ b/ironic/tests/unit/common/test_molds.py @@ -46,7 +46,8 @@ class ConfigurationMoldTestCase(db_base.DbTestCase): molds.save_configuration(task, url, data) mock_put.assert_called_once_with(url, '{\n "key": "value"\n}', - headers={'X-Auth-Token': 'token'}) + headers={'X-Auth-Token': 'token'}, + timeout=60) @mock.patch.object(swift, 'get_swift_session', autospec=True) @mock.patch.object(requests, 'put', autospec=True) @@ -77,7 +78,8 @@ class ConfigurationMoldTestCase(db_base.DbTestCase): mock_put.assert_called_once_with( url, '{\n "key": "value"\n}', - headers={'Authorization': 'Basic dXNlcjpwYXNzd29yZA=='}) + headers={'Authorization': 'Basic dXNlcjpwYXNzd29yZA=='}, + timeout=60) @mock.patch.object(requests, 'put', autospec=True) def test_save_configuration_http_noauth(self, mock_put): @@ -91,7 +93,8 @@ class ConfigurationMoldTestCase(db_base.DbTestCase): molds.save_configuration(task, url, data) mock_put.assert_called_once_with( url, '{\n "key": "value"\n}', - headers=None) + headers=None, + timeout=60) @mock.patch.object(requests, 'put', autospec=True) def test_save_configuration_http_error(self, mock_put): @@ -112,7 +115,8 @@ class ConfigurationMoldTestCase(db_base.DbTestCase): {'key': 'value'}) mock_put.assert_called_once_with( 'https://example.com/file2', '{\n "key": "value"\n}', - headers={'Authorization': 'Basic dXNlcjpwYXNzd29yZA=='}) + headers={'Authorization': 'Basic dXNlcjpwYXNzd29yZA=='}, + timeout=60) @mock.patch.object(requests, 'put', autospec=True) def test_save_configuration_connection_error(self, mock_put): @@ -132,7 +136,8 @@ class ConfigurationMoldTestCase(db_base.DbTestCase): task, 'https://example.com/file2', {'key': 'value'}) mock_put.assert_called_with( 'https://example.com/file2', '{\n "key": "value"\n}', - headers={'Authorization': 'Basic dXNlcjpwYXNzd29yZA=='}) + headers={'Authorization': 'Basic dXNlcjpwYXNzd29yZA=='}, + timeout=60) self.assertEqual(mock_put.call_count, 3) @mock.patch.object(requests, 'put', autospec=True) @@ -155,7 +160,8 @@ class ConfigurationMoldTestCase(db_base.DbTestCase): {'key': 'value'}) mock_put.assert_called_with( 'https://example.com/file2', '{\n "key": "value"\n}', - headers={'Authorization': 'Basic dXNlcjpwYXNzd29yZA=='}) + headers={'Authorization': 'Basic dXNlcjpwYXNzd29yZA=='}, + timeout=60) self.assertEqual(mock_put.call_count, 2) @mock.patch.object(swift, 'get_swift_session', autospec=True) @@ -176,7 +182,8 @@ class ConfigurationMoldTestCase(db_base.DbTestCase): result = molds.get_configuration(task, url) mock_get.assert_called_once_with( - url, headers={'X-Auth-Token': 'token'}) + url, headers={'X-Auth-Token': 'token'}, + timeout=60) self.assertJsonEqual({'key': 'value'}, result) @mock.patch.object(swift, 'get_swift_session', autospec=True) @@ -210,7 +217,8 @@ class ConfigurationMoldTestCase(db_base.DbTestCase): result = molds.get_configuration(task, url) mock_get.assert_called_once_with( - url, headers={'Authorization': 'Basic dXNlcjpwYXNzd29yZA=='}) + url, headers={'Authorization': 'Basic dXNlcjpwYXNzd29yZA=='}, + timeout=60) self.assertJsonEqual({"key": "value"}, result) @mock.patch.object(requests, 'get', autospec=True) @@ -228,7 +236,7 @@ class ConfigurationMoldTestCase(db_base.DbTestCase): with task_manager.acquire(self.context, self.node.uuid) as task: result = molds.get_configuration(task, url) - mock_get.assert_called_once_with(url, headers=None) + mock_get.assert_called_once_with(url, headers=None, timeout=60) self.assertJsonEqual({"key": "value"}, result) @mock.patch.object(requests, 'get', autospec=True) @@ -249,7 +257,8 @@ class ConfigurationMoldTestCase(db_base.DbTestCase): 'https://example.com/file2') mock_get.assert_called_once_with( 'https://example.com/file2', - headers={'Authorization': 'Basic dXNlcjpwYXNzd29yZA=='}) + headers={'Authorization': 'Basic dXNlcjpwYXNzd29yZA=='}, + timeout=60) @mock.patch.object(requests, 'get', autospec=True) def test_get_configuration_connection_error(self, mock_get): @@ -269,7 +278,8 @@ class ConfigurationMoldTestCase(db_base.DbTestCase): task, 'https://example.com/file2') mock_get.assert_called_with( 'https://example.com/file2', - headers={'Authorization': 'Basic dXNlcjpwYXNzd29yZA=='}) + headers={'Authorization': 'Basic dXNlcjpwYXNzd29yZA=='}, + timeout=60) self.assertEqual(mock_get.call_count, 3) @mock.patch.object(requests, 'get', autospec=True) @@ -291,7 +301,8 @@ class ConfigurationMoldTestCase(db_base.DbTestCase): 'https://example.com/file2') mock_get.assert_called_with( 'https://example.com/file2', - headers={'Authorization': 'Basic dXNlcjpwYXNzd29yZA=='}) + headers={'Authorization': 'Basic dXNlcjpwYXNzd29yZA=='}, + timeout=60) self.assertEqual(mock_get.call_count, 2) @mock.patch.object(requests, 'get', autospec=True) diff --git a/ironic/tests/unit/common/test_rpc_service.py b/ironic/tests/unit/common/test_rpc_service.py index 09446ecf8..752b7665a 100644 --- a/ironic/tests/unit/common/test_rpc_service.py +++ b/ironic/tests/unit/common/test_rpc_service.py @@ -22,6 +22,7 @@ from oslo_utils import timeutils from ironic.common import context from ironic.common import rpc from ironic.common import rpc_service +from ironic.common import service as ironic_service from ironic.conductor import manager from ironic.objects import base as objects_base from ironic.tests.unit.db import base as db_base @@ -39,6 +40,8 @@ class TestRPCService(db_base.DbTestCase): mgr_module = "ironic.conductor.manager" mgr_class = "ConductorManager" self.rpc_svc = rpc_service.RPCService(host, mgr_module, mgr_class) + # register oslo_service DEFAULT config options + ironic_service.process_launcher() self.rpc_svc.manager.dbapi = self.dbapi @mock.patch.object(manager.ConductorManager, 'prepare_host', autospec=True) @@ -123,7 +126,10 @@ class TestRPCService(db_base.DbTestCase): with mock.patch.object(self.dbapi, 'get_online_conductors', autospec=True) as mock_cond_list: mock_cond_list.return_value = [conductor1] - self.rpc_svc.stop() + with mock.patch.object(self.dbapi, 'get_nodeinfo_list', + autospec=True) as mock_nodeinfo_list: + mock_nodeinfo_list.return_value = [] + self.rpc_svc.stop() # single conductor so exit immediately without waiting mock_sleep.assert_not_called() @@ -139,7 +145,11 @@ class TestRPCService(db_base.DbTestCase): autospec=True) as mock_cond_list: # multiple conductors, so wait for hash_ring_reset_interval mock_cond_list.return_value = [conductor1, conductor2] - self.rpc_svc.stop() + with mock.patch.object(self.dbapi, 'get_nodeinfo_list', + autospec=True) as mock_nodeinfo_list: + mock_nodeinfo_list.return_value = [] + self.rpc_svc.stop() + mock_nodeinfo_list.assert_called_once() # wait the total CONF.hash_ring_reset_interval 15 seconds mock_sleep.assert_has_calls([mock.call(15)]) @@ -160,7 +170,11 @@ class TestRPCService(db_base.DbTestCase): autospec=True) as mock_cond_list: # multiple conductors, so wait for hash_ring_reset_interval mock_cond_list.return_value = [conductor1, conductor2] - self.rpc_svc.stop() + with mock.patch.object(self.dbapi, 'get_nodeinfo_list', + autospec=True) as mock_nodeinfo_list: + mock_nodeinfo_list.return_value = [] + self.rpc_svc.stop() + mock_nodeinfo_list.assert_called_once() # wait the remaining 10 seconds mock_sleep.assert_has_calls([mock.call(10)]) @@ -181,7 +195,35 @@ class TestRPCService(db_base.DbTestCase): autospec=True) as mock_cond_list: # multiple conductors, so wait for hash_ring_reset_interval mock_cond_list.return_value = [conductor1, conductor2] - self.rpc_svc.stop() + with mock.patch.object(self.dbapi, 'get_nodeinfo_list', + autospec=True) as mock_nodeinfo_list: + mock_nodeinfo_list.return_value = [] + self.rpc_svc.stop() + mock_nodeinfo_list.assert_called_once() # no wait required, CONF.hash_ring_reset_interval already exceeded mock_sleep.assert_not_called() + + @mock.patch.object(timeutils, 'utcnow', autospec=True) + @mock.patch.object(time, 'sleep', autospec=True) + def test_stop_has_reserved(self, mock_sleep, mock_utcnow): + mock_utcnow.return_value = datetime.datetime(2023, 2, 2, 21, 10, 0) + conductor1 = db_utils.get_test_conductor(hostname='fake_host') + conductor2 = db_utils.get_test_conductor(hostname='other_fake_host') + + with mock.patch.object(self.dbapi, 'get_online_conductors', + autospec=True) as mock_cond_list: + # multiple conductors, so wait for hash_ring_reset_interval + mock_cond_list.return_value = [conductor1, conductor2] + with mock.patch.object(self.dbapi, 'get_nodeinfo_list', + autospec=True) as mock_nodeinfo_list: + # 3 calls to manager has_reserved until all reservation locks + # are released + mock_nodeinfo_list.side_effect = [['a', 'b'], ['a'], []] + self.rpc_svc.stop() + self.assertEqual(3, mock_nodeinfo_list.call_count) + + # wait the remaining 15 seconds, then wait until has_reserved + # returns False + mock_sleep.assert_has_calls( + [mock.call(15), mock.call(1), mock.call(1)]) diff --git a/releasenotes/notes/graceful_shutdown_wait-9a62627714b86726.yaml b/releasenotes/notes/graceful_shutdown_wait-9a62627714b86726.yaml new file mode 100644 index 000000000..778b7dc6f --- /dev/null +++ b/releasenotes/notes/graceful_shutdown_wait-9a62627714b86726.yaml @@ -0,0 +1,15 @@ +--- +features: + - | + On shutdown the conductor will wait for at most + ``[DEFAULT]graceful_shutdown_timeout`` seconds for existing lock node + reservations to clear. Previously lock reservations were cleared + immediately, which in some cases would result in nodes going into a failed + state. +upgrade: + - | + ``[DEFAULT]graceful_shutdown_timeout`` defaults to 60s. Systemd + ``TimeoutStopSec`` defaults to 30s. Kubernetes + ``terminationGracePeriodSeconds`` defaults to 90s. It is recommended to + align the value of ``[DEFAULT]graceful_shutdown_timeout`` with the graceful + timeout of the process manager of the conductor process.
\ No newline at end of file |