summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--doc/source/install/enrollment.rst26
-rw-r--r--ironic/api/controllers/v1/ramdisk.py6
-rw-r--r--ironic/common/kickstart_utils.py4
-rw-r--r--ironic/common/molds.py6
-rw-r--r--ironic/common/rpc_service.py18
-rw-r--r--ironic/conductor/base_manager.py17
-rw-r--r--ironic/conf/default.py5
-rw-r--r--ironic/drivers/modules/ansible/playbooks/library/stream_url.py3
-rw-r--r--ironic/tests/unit/common/test_kickstart_utils.py3
-rw-r--r--ironic/tests/unit/common/test_molds.py35
-rw-r--r--ironic/tests/unit/common/test_rpc_service.py50
-rw-r--r--releasenotes/notes/graceful_shutdown_wait-9a62627714b86726.yaml15
12 files changed, 144 insertions, 44 deletions
diff --git a/doc/source/install/enrollment.rst b/doc/source/install/enrollment.rst
index 9f9355d75..40c63b4bb 100644
--- a/doc/source/install/enrollment.rst
+++ b/doc/source/install/enrollment.rst
@@ -81,15 +81,9 @@ affected, since the initial provision state is still ``available``.
However, using API version 1.11 or above may break existing automation tooling
with respect to node creation.
-The default API version used by (the most recent) python-ironicclient is 1.9,
-but it may change in the future and should not be relied on.
-
-In the examples below we will use version 1.11 of the Bare metal API.
-This gives us the following advantages:
-
-* Explicit power credentials validation before leaving the ``enroll`` state.
-* Running node cleaning before entering the ``available`` state.
-* Not exposing half-configured nodes to the scheduler.
+The ``openstack baremetal`` command line tool tries to negotiate the most
+recent supported version, which in virtually all cases will be newer than
+1.11.
To set the API version for all commands, you can set the environment variable
``IRONIC_API_VERSION``. For the OpenStackClient baremetal plugin, set
@@ -118,7 +112,6 @@ and may be combined if desired.
.. code-block:: console
- $ export OS_BAREMETAL_API_VERSION=1.11
$ baremetal node create --driver ipmi
+--------------+--------------------------------------+
| Property | Value |
@@ -423,12 +416,13 @@ Validating node information
Making node available for deployment
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-In order for nodes to be available for deploying workloads on them, nodes
-must be in the ``available`` provision state. To do this, nodes
-created with API version 1.11 and above must be moved from the ``enroll`` state
-to the ``manageable`` state and then to the ``available`` state.
-This section can be safely skipped, if API version 1.10 or earlier is used
-(which is the case by default).
+In order for nodes to be available for deploying workloads on them, nodes must
+be in the ``available`` provision state. To do this, nodes must be moved from
+the ``enroll`` state to the ``manageable`` state and then to the ``available``
+state.
+
+.. note::
+    This section can be skipped if API version 1.10 or earlier is used.
After creating a node and before moving it from its initial provision state of
``enroll``, basic power and port information needs to be configured on the node.
diff --git a/ironic/api/controllers/v1/ramdisk.py b/ironic/api/controllers/v1/ramdisk.py
index 5feef5e02..b98eb7dc2 100644
--- a/ironic/api/controllers/v1/ramdisk.py
+++ b/ironic/api/controllers/v1/ramdisk.py
@@ -131,13 +131,17 @@ class LookupController(rest.RestController):
else:
node = objects.Node.get_by_port_addresses(
api.request.context, valid_addresses)
- except exception.NotFound:
+ except exception.NotFound as e:
# NOTE(dtantsur): we are reraising the same exception to make sure
# we don't disclose the difference between nodes that are not found
# at all and nodes in a wrong state by different error messages.
+ LOG.error('No node has been found during lookup: %s', e)
raise exception.NotFound()
if CONF.api.restrict_lookup and not self.lookup_allowed(node):
+ LOG.error('Lookup is not allowed for node %(node)s in the '
+ 'provision state %(state)s',
+ {'node': node.uuid, 'state': node.provision_state})
raise exception.NotFound()
if api_utils.allow_agent_token():
diff --git a/ironic/common/kickstart_utils.py b/ironic/common/kickstart_utils.py
index 433cf2390..4e02e2ea7 100644
--- a/ironic/common/kickstart_utils.py
+++ b/ironic/common/kickstart_utils.py
@@ -23,6 +23,7 @@ import pycdlib
import requests
from ironic.common import exception
+from ironic.conf import CONF
LOG = logging.getLogger(__name__)
@@ -107,7 +108,8 @@ def decode_and_extract_config_drive_iso(config_drive_iso_gz):
def _fetch_config_drive_from_url(url):
try:
- config_drive = requests.get(url).content
+ config_drive = requests.get(
+ url, timeout=CONF.webserver_connection_timeout).content
except requests.exceptions.RequestException as e:
raise exception.InstanceDeployFailure(
"Can't download the configdrive content from '%(url)s'. "
diff --git a/ironic/common/molds.py b/ironic/common/molds.py
index 234fcc6e3..a77e42a63 100644
--- a/ironic/common/molds.py
+++ b/ironic/common/molds.py
@@ -49,7 +49,8 @@ def save_configuration(task, url, data):
)
def _request(url, data, auth_header):
return requests.put(
- url, data=json.dumps(data, indent=2), headers=auth_header)
+ url, data=json.dumps(data, indent=2), headers=auth_header,
+ timeout=CONF.webserver_connection_timeout)
auth_header = _get_auth_header(task)
response = _request(url, data, auth_header)
@@ -76,7 +77,8 @@ def get_configuration(task, url):
reraise=True
)
def _request(url, auth_header):
- return requests.get(url, headers=auth_header)
+ return requests.get(url, headers=auth_header,
+ timeout=CONF.webserver_connection_timeout)
auth_header = _get_auth_header(task)
response = _request(url, auth_header)
diff --git a/ironic/common/rpc_service.py b/ironic/common/rpc_service.py
index cb0f23c98..a74f6bab3 100644
--- a/ironic/common/rpc_service.py
+++ b/ironic/common/rpc_service.py
@@ -100,7 +100,8 @@ class RPCService(service.Service):
seconds=CONF.hash_ring_reset_interval)
try:
- self.manager.del_host(deregister=self.deregister)
+ self.manager.del_host(deregister=self.deregister,
+ clear_node_reservations=False)
except Exception as e:
LOG.exception('Service error occurred when cleaning up '
'the RPC manager. Error: %s', e)
@@ -127,6 +128,21 @@ class RPCService(service.Service):
LOG.info('Stopped RPC server for service %(service)s on host '
'%(host)s.',
{'service': self.topic, 'host': self.host})
+
+ # Wait for reservation locks held by this conductor.
+ # The conductor process will end when:
+ # - All reservations for this conductor are released
+ # - CONF.graceful_shutdown_timeout has elapsed
+ # - The process manager (systemd, kubernetes) sends SIGKILL after the
+ # configured graceful period
+ graceful_time = initial_time + datetime.timedelta(
+ seconds=CONF.graceful_shutdown_timeout)
+ while (self.manager.has_reserved()
+ and graceful_time > timeutils.utcnow()):
+ LOG.info('Waiting for reserved nodes to clear on host %(host)s',
+ {'host': self.host})
+ time.sleep(1)
+
rpc.set_global_manager(None)
def _handle_signal(self, signo, frame):
diff --git a/ironic/conductor/base_manager.py b/ironic/conductor/base_manager.py
index 5c2e4ea95..544411e1d 100644
--- a/ironic/conductor/base_manager.py
+++ b/ironic/conductor/base_manager.py
@@ -298,15 +298,17 @@ class BaseConductorManager(object):
# This is only used in tests currently. Delete it?
self._periodic_task_callables = periodic_task_callables
- def del_host(self, deregister=True):
+ def del_host(self, deregister=True, clear_node_reservations=True):
# Conductor deregistration fails if called on non-initialized
# conductor (e.g. when rpc server is unreachable).
if not hasattr(self, 'conductor'):
return
self._shutdown = True
self._keepalive_evt.set()
- # clear all locks held by this conductor before deregistering
- self.dbapi.clear_node_reservations_for_conductor(self.host)
+
+ if clear_node_reservations:
+ # clear all locks held by this conductor before deregistering
+ self.dbapi.clear_node_reservations_for_conductor(self.host)
if deregister:
try:
# Inform the cluster that this conductor is shutting down.
@@ -338,6 +340,15 @@ class BaseConductorManager(object):
"""Return a count of currently online conductors"""
return len(self.dbapi.get_online_conductors())
+ def has_reserved(self):
+ """Determines if this host currently has any reserved nodes
+
+ :returns: True if this host has reserved nodes
+ """
+ return bool(self.dbapi.get_nodeinfo_list(
+ filters={'reserved_by_any_of': [self.host]},
+ limit=1))
+
def _register_and_validate_hardware_interfaces(self, hardware_types):
"""Register and validate hardware interfaces for this conductor.
diff --git a/ironic/conf/default.py b/ironic/conf/default.py
index c7aff69cc..5b40c1f31 100644
--- a/ironic/conf/default.py
+++ b/ironic/conf/default.py
@@ -426,8 +426,9 @@ webserver_opts = [
'Defaults to True.')),
cfg.IntOpt('webserver_connection_timeout',
default=60,
- help=_('Connection timeout when accessing remote web servers '
- 'with images.')),
+ help=_('Connection timeout when accessing/interacting with '
+ 'remote web servers with images or other artifacts '
+ 'being accessed.')),
]
diff --git a/ironic/drivers/modules/ansible/playbooks/library/stream_url.py b/ironic/drivers/modules/ansible/playbooks/library/stream_url.py
index 0da3cc4dd..786b65013 100644
--- a/ironic/drivers/modules/ansible/playbooks/library/stream_url.py
+++ b/ironic/drivers/modules/ansible/playbooks/library/stream_url.py
@@ -31,7 +31,8 @@ class StreamingDownloader(object):
else:
self.hasher = None
self.chunksize = chunksize
- resp = requests.get(url, stream=True, verify=verify, cert=certs)
+ resp = requests.get(url, stream=True, verify=verify, cert=certs,
+ timeout=30)
if resp.status_code != 200:
raise Exception('Invalid response code: %s' % resp.status_code)
diff --git a/ironic/tests/unit/common/test_kickstart_utils.py b/ironic/tests/unit/common/test_kickstart_utils.py
index 0dd1ac572..db6123b9d 100644
--- a/ironic/tests/unit/common/test_kickstart_utils.py
+++ b/ironic/tests/unit/common/test_kickstart_utils.py
@@ -129,4 +129,5 @@ echo $CONTENT | /usr/bin/base64 --decode > {file_path}\n\
task.node.instance_info = i_info
task.node.save()
self.assertEqual(expected, ks_utils.prepare_config_drive(task))
- mock_get.assert_called_with('http://server/fake-configdrive-url')
+ mock_get.assert_called_with('http://server/fake-configdrive-url',
+ timeout=60)
diff --git a/ironic/tests/unit/common/test_molds.py b/ironic/tests/unit/common/test_molds.py
index 810dd61bc..2323c2fa8 100644
--- a/ironic/tests/unit/common/test_molds.py
+++ b/ironic/tests/unit/common/test_molds.py
@@ -46,7 +46,8 @@ class ConfigurationMoldTestCase(db_base.DbTestCase):
molds.save_configuration(task, url, data)
mock_put.assert_called_once_with(url, '{\n "key": "value"\n}',
- headers={'X-Auth-Token': 'token'})
+ headers={'X-Auth-Token': 'token'},
+ timeout=60)
@mock.patch.object(swift, 'get_swift_session', autospec=True)
@mock.patch.object(requests, 'put', autospec=True)
@@ -77,7 +78,8 @@ class ConfigurationMoldTestCase(db_base.DbTestCase):
mock_put.assert_called_once_with(
url, '{\n "key": "value"\n}',
- headers={'Authorization': 'Basic dXNlcjpwYXNzd29yZA=='})
+ headers={'Authorization': 'Basic dXNlcjpwYXNzd29yZA=='},
+ timeout=60)
@mock.patch.object(requests, 'put', autospec=True)
def test_save_configuration_http_noauth(self, mock_put):
@@ -91,7 +93,8 @@ class ConfigurationMoldTestCase(db_base.DbTestCase):
molds.save_configuration(task, url, data)
mock_put.assert_called_once_with(
url, '{\n "key": "value"\n}',
- headers=None)
+ headers=None,
+ timeout=60)
@mock.patch.object(requests, 'put', autospec=True)
def test_save_configuration_http_error(self, mock_put):
@@ -112,7 +115,8 @@ class ConfigurationMoldTestCase(db_base.DbTestCase):
{'key': 'value'})
mock_put.assert_called_once_with(
'https://example.com/file2', '{\n "key": "value"\n}',
- headers={'Authorization': 'Basic dXNlcjpwYXNzd29yZA=='})
+ headers={'Authorization': 'Basic dXNlcjpwYXNzd29yZA=='},
+ timeout=60)
@mock.patch.object(requests, 'put', autospec=True)
def test_save_configuration_connection_error(self, mock_put):
@@ -132,7 +136,8 @@ class ConfigurationMoldTestCase(db_base.DbTestCase):
task, 'https://example.com/file2', {'key': 'value'})
mock_put.assert_called_with(
'https://example.com/file2', '{\n "key": "value"\n}',
- headers={'Authorization': 'Basic dXNlcjpwYXNzd29yZA=='})
+ headers={'Authorization': 'Basic dXNlcjpwYXNzd29yZA=='},
+ timeout=60)
self.assertEqual(mock_put.call_count, 3)
@mock.patch.object(requests, 'put', autospec=True)
@@ -155,7 +160,8 @@ class ConfigurationMoldTestCase(db_base.DbTestCase):
{'key': 'value'})
mock_put.assert_called_with(
'https://example.com/file2', '{\n "key": "value"\n}',
- headers={'Authorization': 'Basic dXNlcjpwYXNzd29yZA=='})
+ headers={'Authorization': 'Basic dXNlcjpwYXNzd29yZA=='},
+ timeout=60)
self.assertEqual(mock_put.call_count, 2)
@mock.patch.object(swift, 'get_swift_session', autospec=True)
@@ -176,7 +182,8 @@ class ConfigurationMoldTestCase(db_base.DbTestCase):
result = molds.get_configuration(task, url)
mock_get.assert_called_once_with(
- url, headers={'X-Auth-Token': 'token'})
+ url, headers={'X-Auth-Token': 'token'},
+ timeout=60)
self.assertJsonEqual({'key': 'value'}, result)
@mock.patch.object(swift, 'get_swift_session', autospec=True)
@@ -210,7 +217,8 @@ class ConfigurationMoldTestCase(db_base.DbTestCase):
result = molds.get_configuration(task, url)
mock_get.assert_called_once_with(
- url, headers={'Authorization': 'Basic dXNlcjpwYXNzd29yZA=='})
+ url, headers={'Authorization': 'Basic dXNlcjpwYXNzd29yZA=='},
+ timeout=60)
self.assertJsonEqual({"key": "value"}, result)
@mock.patch.object(requests, 'get', autospec=True)
@@ -228,7 +236,7 @@ class ConfigurationMoldTestCase(db_base.DbTestCase):
with task_manager.acquire(self.context, self.node.uuid) as task:
result = molds.get_configuration(task, url)
- mock_get.assert_called_once_with(url, headers=None)
+ mock_get.assert_called_once_with(url, headers=None, timeout=60)
self.assertJsonEqual({"key": "value"}, result)
@mock.patch.object(requests, 'get', autospec=True)
@@ -249,7 +257,8 @@ class ConfigurationMoldTestCase(db_base.DbTestCase):
'https://example.com/file2')
mock_get.assert_called_once_with(
'https://example.com/file2',
- headers={'Authorization': 'Basic dXNlcjpwYXNzd29yZA=='})
+ headers={'Authorization': 'Basic dXNlcjpwYXNzd29yZA=='},
+ timeout=60)
@mock.patch.object(requests, 'get', autospec=True)
def test_get_configuration_connection_error(self, mock_get):
@@ -269,7 +278,8 @@ class ConfigurationMoldTestCase(db_base.DbTestCase):
task, 'https://example.com/file2')
mock_get.assert_called_with(
'https://example.com/file2',
- headers={'Authorization': 'Basic dXNlcjpwYXNzd29yZA=='})
+ headers={'Authorization': 'Basic dXNlcjpwYXNzd29yZA=='},
+ timeout=60)
self.assertEqual(mock_get.call_count, 3)
@mock.patch.object(requests, 'get', autospec=True)
@@ -291,7 +301,8 @@ class ConfigurationMoldTestCase(db_base.DbTestCase):
'https://example.com/file2')
mock_get.assert_called_with(
'https://example.com/file2',
- headers={'Authorization': 'Basic dXNlcjpwYXNzd29yZA=='})
+ headers={'Authorization': 'Basic dXNlcjpwYXNzd29yZA=='},
+ timeout=60)
self.assertEqual(mock_get.call_count, 2)
@mock.patch.object(requests, 'get', autospec=True)
diff --git a/ironic/tests/unit/common/test_rpc_service.py b/ironic/tests/unit/common/test_rpc_service.py
index 09446ecf8..752b7665a 100644
--- a/ironic/tests/unit/common/test_rpc_service.py
+++ b/ironic/tests/unit/common/test_rpc_service.py
@@ -22,6 +22,7 @@ from oslo_utils import timeutils
from ironic.common import context
from ironic.common import rpc
from ironic.common import rpc_service
+from ironic.common import service as ironic_service
from ironic.conductor import manager
from ironic.objects import base as objects_base
from ironic.tests.unit.db import base as db_base
@@ -39,6 +40,8 @@ class TestRPCService(db_base.DbTestCase):
mgr_module = "ironic.conductor.manager"
mgr_class = "ConductorManager"
self.rpc_svc = rpc_service.RPCService(host, mgr_module, mgr_class)
+ # register oslo_service DEFAULT config options
+ ironic_service.process_launcher()
self.rpc_svc.manager.dbapi = self.dbapi
@mock.patch.object(manager.ConductorManager, 'prepare_host', autospec=True)
@@ -123,7 +126,10 @@ class TestRPCService(db_base.DbTestCase):
with mock.patch.object(self.dbapi, 'get_online_conductors',
autospec=True) as mock_cond_list:
mock_cond_list.return_value = [conductor1]
- self.rpc_svc.stop()
+ with mock.patch.object(self.dbapi, 'get_nodeinfo_list',
+ autospec=True) as mock_nodeinfo_list:
+ mock_nodeinfo_list.return_value = []
+ self.rpc_svc.stop()
# single conductor so exit immediately without waiting
mock_sleep.assert_not_called()
@@ -139,7 +145,11 @@ class TestRPCService(db_base.DbTestCase):
autospec=True) as mock_cond_list:
# multiple conductors, so wait for hash_ring_reset_interval
mock_cond_list.return_value = [conductor1, conductor2]
- self.rpc_svc.stop()
+ with mock.patch.object(self.dbapi, 'get_nodeinfo_list',
+ autospec=True) as mock_nodeinfo_list:
+ mock_nodeinfo_list.return_value = []
+ self.rpc_svc.stop()
+ mock_nodeinfo_list.assert_called_once()
# wait the total CONF.hash_ring_reset_interval 15 seconds
mock_sleep.assert_has_calls([mock.call(15)])
@@ -160,7 +170,11 @@ class TestRPCService(db_base.DbTestCase):
autospec=True) as mock_cond_list:
# multiple conductors, so wait for hash_ring_reset_interval
mock_cond_list.return_value = [conductor1, conductor2]
- self.rpc_svc.stop()
+ with mock.patch.object(self.dbapi, 'get_nodeinfo_list',
+ autospec=True) as mock_nodeinfo_list:
+ mock_nodeinfo_list.return_value = []
+ self.rpc_svc.stop()
+ mock_nodeinfo_list.assert_called_once()
# wait the remaining 10 seconds
mock_sleep.assert_has_calls([mock.call(10)])
@@ -181,7 +195,35 @@ class TestRPCService(db_base.DbTestCase):
autospec=True) as mock_cond_list:
# multiple conductors, so wait for hash_ring_reset_interval
mock_cond_list.return_value = [conductor1, conductor2]
- self.rpc_svc.stop()
+ with mock.patch.object(self.dbapi, 'get_nodeinfo_list',
+ autospec=True) as mock_nodeinfo_list:
+ mock_nodeinfo_list.return_value = []
+ self.rpc_svc.stop()
+ mock_nodeinfo_list.assert_called_once()
# no wait required, CONF.hash_ring_reset_interval already exceeded
mock_sleep.assert_not_called()
+
+ @mock.patch.object(timeutils, 'utcnow', autospec=True)
+ @mock.patch.object(time, 'sleep', autospec=True)
+ def test_stop_has_reserved(self, mock_sleep, mock_utcnow):
+ mock_utcnow.return_value = datetime.datetime(2023, 2, 2, 21, 10, 0)
+ conductor1 = db_utils.get_test_conductor(hostname='fake_host')
+ conductor2 = db_utils.get_test_conductor(hostname='other_fake_host')
+
+ with mock.patch.object(self.dbapi, 'get_online_conductors',
+ autospec=True) as mock_cond_list:
+ # multiple conductors, so wait for hash_ring_reset_interval
+ mock_cond_list.return_value = [conductor1, conductor2]
+ with mock.patch.object(self.dbapi, 'get_nodeinfo_list',
+ autospec=True) as mock_nodeinfo_list:
+ # 3 calls to manager has_reserved until all reservation locks
+ # are released
+ mock_nodeinfo_list.side_effect = [['a', 'b'], ['a'], []]
+ self.rpc_svc.stop()
+ self.assertEqual(3, mock_nodeinfo_list.call_count)
+
+ # wait the remaining 15 seconds, then wait until has_reserved
+ # returns False
+ mock_sleep.assert_has_calls(
+ [mock.call(15), mock.call(1), mock.call(1)])
diff --git a/releasenotes/notes/graceful_shutdown_wait-9a62627714b86726.yaml b/releasenotes/notes/graceful_shutdown_wait-9a62627714b86726.yaml
new file mode 100644
index 000000000..778b7dc6f
--- /dev/null
+++ b/releasenotes/notes/graceful_shutdown_wait-9a62627714b86726.yaml
@@ -0,0 +1,15 @@
+---
+features:
+ - |
+ On shutdown the conductor will wait for at most
+    ``[DEFAULT]graceful_shutdown_timeout`` seconds for existing node lock
+    reservations to clear. Previously, lock reservations were cleared
+ immediately, which in some cases would result in nodes going into a failed
+ state.
+upgrade:
+ - |
+ ``[DEFAULT]graceful_shutdown_timeout`` defaults to 60s. Systemd
+ ``TimeoutStopSec`` defaults to 30s. Kubernetes
+ ``terminationGracePeriodSeconds`` defaults to 90s. It is recommended to
+ align the value of ``[DEFAULT]graceful_shutdown_timeout`` with the graceful
+ timeout of the process manager of the conductor process. \ No newline at end of file