diff options
-rw-r--r-- | doc/source/install/enrollment.rst | 26 | ||||
-rw-r--r-- | ironic/common/image_service.py | 35 | ||||
-rw-r--r-- | ironic/common/rpc_service.py | 18 | ||||
-rw-r--r-- | ironic/conductor/base_manager.py | 17 | ||||
-rw-r--r-- | ironic/conductor/utils.py | 5 | ||||
-rw-r--r-- | ironic/conf/conductor.py | 8 | ||||
-rw-r--r-- | ironic/db/sqlalchemy/api.py | 6 | ||||
-rw-r--r-- | ironic/tests/base.py | 15 | ||||
-rw-r--r-- | ironic/tests/unit/common/test_image_service.py | 107 | ||||
-rw-r--r-- | ironic/tests/unit/common/test_rpc_service.py | 50 | ||||
-rw-r--r-- | ironic/tests/unit/conductor/test_cleaning.py | 30 | ||||
-rw-r--r-- | ironic/tests/unit/conductor/test_manager.py | 3 | ||||
-rw-r--r-- | ironic/tests/unit/conductor/test_utils.py | 11 | ||||
-rw-r--r-- | playbooks/metal3-ci/run.yaml | 2 | ||||
-rw-r--r-- | releasenotes/notes/Cleanfail-power-off-13b5fdcc2727866a.yaml | 8 | ||||
-rw-r--r-- | releasenotes/notes/cross-link-1ffd1a4958f14fd7.yaml | 5 | ||||
-rw-r--r-- | releasenotes/notes/graceful_shutdown_wait-9a62627714b86726.yaml | 15 | ||||
-rwxr-xr-x | tools/test-setup.sh | 2 | ||||
-rw-r--r-- | tox.ini | 2 |
19 files changed, 214 insertions, 151 deletions
diff --git a/doc/source/install/enrollment.rst b/doc/source/install/enrollment.rst index 9f9355d75..40c63b4bb 100644 --- a/doc/source/install/enrollment.rst +++ b/doc/source/install/enrollment.rst @@ -81,15 +81,9 @@ affected, since the initial provision state is still ``available``. However, using API version 1.11 or above may break existing automation tooling with respect to node creation. -The default API version used by (the most recent) python-ironicclient is 1.9, -but it may change in the future and should not be relied on. - -In the examples below we will use version 1.11 of the Bare metal API. -This gives us the following advantages: - -* Explicit power credentials validation before leaving the ``enroll`` state. -* Running node cleaning before entering the ``available`` state. -* Not exposing half-configured nodes to the scheduler. +The ``openstack baremetal`` command line tool tries to negotiate the most +recent supported version, which in virtually all cases will be newer than +1.11. To set the API version for all commands, you can set the environment variable ``IRONIC_API_VERSION``. For the OpenStackClient baremetal plugin, set @@ -118,7 +112,6 @@ and may be combined if desired. .. code-block:: console - $ export OS_BAREMETAL_API_VERSION=1.11 $ baremetal node create --driver ipmi +--------------+--------------------------------------+ | Property | Value | @@ -423,12 +416,13 @@ Validating node information Making node available for deployment ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -In order for nodes to be available for deploying workloads on them, nodes -must be in the ``available`` provision state. To do this, nodes -created with API version 1.11 and above must be moved from the ``enroll`` state -to the ``manageable`` state and then to the ``available`` state. -This section can be safely skipped, if API version 1.10 or earlier is used -(which is the case by default). +In order for nodes to be available for deploying workloads on them, nodes must +be in the ``available`` provision state. To do this, nodes must be moved from +the ``enroll`` state to the ``manageable`` state and then to the ``available`` +state. + +.. note:: + This section can be skipped, if API version 1.10 or earlier is used. After creating a node and before moving it from its initial provision state of ``enroll``, basic power and port information needs to be configured on the node. diff --git a/ironic/common/image_service.py b/ironic/common/image_service.py index 5280ee6bc..4b95b5152 100644 --- a/ironic/common/image_service.py +++ b/ironic/common/image_service.py @@ -34,10 +34,6 @@ from ironic.common import utils from ironic.conf import CONF IMAGE_CHUNK_SIZE = 1024 * 1024 # 1mb -# NOTE(kaifeng) Image will be truncated to 2GiB by sendfile, -# we use a large chunk size here for a better performance -# while keep the chunk size less than the size limit. -SENDFILE_CHUNK_SIZE = 1024 * 1024 * 1024 # 1Gb LOG = log.getLogger(__name__) @@ -264,26 +260,23 @@ class FileImageService(BaseImageService): """ source_image_path = self.validate_href(image_href) dest_image_path = image_file.name - local_device = os.stat(dest_image_path).st_dev try: - # We should have read and write access to source file to create - # hard link to it. - if (local_device == os.stat(source_image_path).st_dev - and os.access(source_image_path, os.R_OK | os.W_OK)): - image_file.close() - os.remove(dest_image_path) + image_file.close() + os.remove(dest_image_path) + + try: os.link(source_image_path, dest_image_path) + except OSError as exc: + LOG.debug('Could not create a link from %(src)s to %(dest)s, ' + 'will copy the content instead. Error: %(exc)s.', + {'src': source_image_path, 'dest': dest_image_path, + 'exc': exc}) else: - filesize = os.path.getsize(source_image_path) - offset = 0 - with open(source_image_path, 'rb') as input_img: - while offset < filesize: - count = min(SENDFILE_CHUNK_SIZE, filesize - offset) - nbytes_out = os.sendfile(image_file.fileno(), - input_img.fileno(), - offset, - count) - offset += nbytes_out + return + + # NOTE(dtantsur): starting with Python 3.8, copyfile() uses + # efficient copying (i.e. sendfile) under the hood. + shutil.copyfile(source_image_path, dest_image_path) except Exception as e: raise exception.ImageDownloadFailed(image_href=image_href, reason=str(e)) diff --git a/ironic/common/rpc_service.py b/ironic/common/rpc_service.py index cb0f23c98..a74f6bab3 100644 --- a/ironic/common/rpc_service.py +++ b/ironic/common/rpc_service.py @@ -100,7 +100,8 @@ class RPCService(service.Service): seconds=CONF.hash_ring_reset_interval) try: - self.manager.del_host(deregister=self.deregister) + self.manager.del_host(deregister=self.deregister, + clear_node_reservations=False) except Exception as e: LOG.exception('Service error occurred when cleaning up ' 'the RPC manager. Error: %s', e) @@ -127,6 +128,21 @@ class RPCService(service.Service): LOG.info('Stopped RPC server for service %(service)s on host ' '%(host)s.', {'service': self.topic, 'host': self.host}) + + # Wait for reservation locks held by this conductor. + # The conductor process will end when: + # - All reservations for this conductor are released + # - CONF.graceful_shutdown_timeout has elapsed + # - The process manager (systemd, kubernetes) sends SIGKILL after the + # configured graceful period + graceful_time = initial_time + datetime.timedelta( + seconds=CONF.graceful_shutdown_timeout) + while (self.manager.has_reserved() + and graceful_time > timeutils.utcnow()): + LOG.info('Waiting for reserved nodes to clear on host %(host)s', + {'host': self.host}) + time.sleep(1) + rpc.set_global_manager(None) def _handle_signal(self, signo, frame): diff --git a/ironic/conductor/base_manager.py b/ironic/conductor/base_manager.py index 5c2e4ea95..544411e1d 100644 --- a/ironic/conductor/base_manager.py +++ b/ironic/conductor/base_manager.py @@ -298,15 +298,17 @@ class BaseConductorManager(object): # This is only used in tests currently. Delete it? self._periodic_task_callables = periodic_task_callables - def del_host(self, deregister=True): + def del_host(self, deregister=True, clear_node_reservations=True): # Conductor deregistration fails if called on non-initialized # conductor (e.g. when rpc server is unreachable). if not hasattr(self, 'conductor'): return self._shutdown = True self._keepalive_evt.set() - # clear all locks held by this conductor before deregistering - self.dbapi.clear_node_reservations_for_conductor(self.host) + + if clear_node_reservations: + # clear all locks held by this conductor before deregistering + self.dbapi.clear_node_reservations_for_conductor(self.host) if deregister: try: # Inform the cluster that this conductor is shutting down. @@ -338,6 +340,15 @@ class BaseConductorManager(object): """Return a count of currently online conductors""" return len(self.dbapi.get_online_conductors()) + def has_reserved(self): + """Determines if this host currently has any reserved nodes + + :returns: True if this host has reserved nodes + """ + return bool(self.dbapi.get_nodeinfo_list( + filters={'reserved_by_any_of': [self.host]}, + limit=1)) + def _register_and_validate_hardware_interfaces(self, hardware_types): """Register and validate hardware interfaces for this conductor. diff --git a/ironic/conductor/utils.py b/ironic/conductor/utils.py index 2272c0df7..add9ee74d 100644 --- a/ironic/conductor/utils.py +++ b/ironic/conductor/utils.py @@ -499,6 +499,11 @@ def cleaning_error_handler(task, logmsg, errmsg=None, traceback=False, # NOTE(dtantsur): avoid overwriting existing maintenance_reason if not node.maintenance_reason and set_maintenance: node.maintenance_reason = errmsg + + if CONF.conductor.poweroff_in_cleanfail: + # NOTE(NobodyCam): Power off node in clean fail + node_power_action(task, states.POWER_OFF) + node.save() if set_fail_state and node.provision_state != states.CLEANFAIL: diff --git a/ironic/conf/conductor.py b/ironic/conf/conductor.py index 653e30f56..2452fafe7 100644 --- a/ironic/conf/conductor.py +++ b/ironic/conf/conductor.py @@ -349,6 +349,14 @@ opts = [ 'is a global setting applying to all requests this ' 'conductor receives, regardless of access rights. ' 'The concurrent clean limit cannot be disabled.')), + + cfg.BoolOpt('poweroff_in_cleanfail', + default=False, + help=_('If True power off nodes in the ``clean failed`` ' + 'state. Default False. Option may be unsafe ' + 'when using Cleaning to perform ' + 'hardware-transformative actions such as ' + 'firmware upgrade.')), ] diff --git a/ironic/db/sqlalchemy/api.py b/ironic/db/sqlalchemy/api.py index 93a211fc3..31ec9647e 100644 --- a/ironic/db/sqlalchemy/api.py +++ b/ironic/db/sqlalchemy/api.py @@ -1379,12 +1379,14 @@ class Connection(api.Connection): def list_hardware_type_interfaces(self, hardware_types): with _session_for_read() as session: - query = (session.query(models.ConductorHardwareInterfaces) + query = (session.query(models.ConductorHardwareInterfaces, + models.Conductor) + .join(models.Conductor) .filter(models.ConductorHardwareInterfaces.hardware_type .in_(hardware_types))) query = _filter_active_conductors(query) - return query.all() + return [row[0] for row in query] @oslo_db_api.retry_on_deadlock def register_conductor_hardware_interfaces(self, conductor_id, interfaces): diff --git a/ironic/tests/base.py b/ironic/tests/base.py index 348f15c20..69e449d3b 100644 --- a/ironic/tests/base.py +++ b/ironic/tests/base.py @@ -102,6 +102,11 @@ class WarningsFixture(fixtures.Fixture): category=UserWarning, ) + # NOTE(gibi): The UUIDFields emits a warning if the value is not a + # valid UUID. Let's escalate that to an exception in the test to + # prevent adding violations. + warnings.filterwarnings('error', message='.* is an invalid UUID.') + # Enable deprecation warnings to capture upcoming SQLAlchemy changes warnings.filterwarnings( @@ -125,16 +130,6 @@ class WarningsFixture(fixtures.Fixture): category=sqla_exc.SAWarning, ) - # ...but filter everything out until we get around to fixing them - # TODO(stephenfin): Fix all of these - - warnings.filterwarnings( - 'ignore', - module='ironic', - message='SELECT statement has a cartesian product ', - category=sqla_exc.SAWarning, - ) - # FIXME(stephenfin): We can remove this once oslo.db is fixed # https://review.opendev.org/c/openstack/oslo.db/+/856453 warnings.filterwarnings( diff --git a/ironic/tests/unit/common/test_image_service.py b/ironic/tests/unit/common/test_image_service.py index 197518e1a..297a1b4b9 100644 --- a/ironic/tests/unit/common/test_image_service.py +++ b/ironic/tests/unit/common/test_image_service.py @@ -10,7 +10,6 @@ # License for the specific language governing permissions and limitations # under the License. -import builtins import datetime from http import client as http_client import io @@ -483,119 +482,55 @@ class FileImageServiceTestCase(base.TestCase): 'properties': {}, 'no_cache': True}, result) + @mock.patch.object(shutil, 'copyfile', autospec=True) @mock.patch.object(os, 'link', autospec=True) @mock.patch.object(os, 'remove', autospec=True) - @mock.patch.object(os, 'access', return_value=True, autospec=True) - @mock.patch.object(os, 'stat', autospec=True) @mock.patch.object(image_service.FileImageService, 'validate_href', autospec=True) - def test_download_hard_link(self, _validate_mock, stat_mock, access_mock, - remove_mock, link_mock): + def test_download_hard_link(self, _validate_mock, remove_mock, link_mock, + copy_mock): _validate_mock.return_value = self.href_path - stat_mock.return_value.st_dev = 'dev1' file_mock = mock.Mock(spec=io.BytesIO) file_mock.name = 'file' self.service.download(self.href, file_mock) _validate_mock.assert_called_once_with(mock.ANY, self.href) - self.assertEqual(2, stat_mock.call_count) - access_mock.assert_called_once_with(self.href_path, os.R_OK | os.W_OK) remove_mock.assert_called_once_with('file') link_mock.assert_called_once_with(self.href_path, 'file') + copy_mock.assert_not_called() - @mock.patch.object(os, 'sendfile', return_value=42, autospec=True) - @mock.patch.object(os.path, 'getsize', return_value=42, autospec=True) - @mock.patch.object(builtins, 'open', autospec=True) - @mock.patch.object(os, 'access', return_value=False, autospec=True) - @mock.patch.object(os, 'stat', autospec=True) - @mock.patch.object(image_service.FileImageService, 'validate_href', - autospec=True) - def test_download_copy(self, _validate_mock, stat_mock, access_mock, - open_mock, size_mock, copy_mock): - _validate_mock.return_value = self.href_path - stat_mock.return_value.st_dev = 'dev1' - file_mock = mock.MagicMock(spec=io.BytesIO) - file_mock.name = 'file' - input_mock = mock.MagicMock(spec=io.BytesIO) - open_mock.return_value = input_mock - self.service.download(self.href, file_mock) - _validate_mock.assert_called_once_with(mock.ANY, self.href) - self.assertEqual(2, stat_mock.call_count) - access_mock.assert_called_once_with(self.href_path, os.R_OK | os.W_OK) - copy_mock.assert_called_once_with(file_mock.fileno(), - input_mock.__enter__().fileno(), - 0, 42) - - @mock.patch.object(os, 'sendfile', autospec=True) - @mock.patch.object(os.path, 'getsize', return_value=42, autospec=True) - @mock.patch.object(builtins, 'open', autospec=True) - @mock.patch.object(os, 'access', return_value=False, autospec=True) - @mock.patch.object(os, 'stat', autospec=True) + @mock.patch.object(shutil, 'copyfile', autospec=True) + @mock.patch.object(os, 'link', autospec=True) + @mock.patch.object(os, 'remove', autospec=True) @mock.patch.object(image_service.FileImageService, 'validate_href', autospec=True) - def test_download_copy_segmented(self, _validate_mock, stat_mock, - access_mock, open_mock, size_mock, - copy_mock): - # Fake a 3G + 1k image - chunk_size = image_service.SENDFILE_CHUNK_SIZE - fake_image_size = chunk_size * 3 + 1024 - fake_chunk_seq = [chunk_size, chunk_size, chunk_size, 1024] + def test_download_copy(self, _validate_mock, remove_mock, link_mock, + copy_mock): _validate_mock.return_value = self.href_path - stat_mock.return_value.st_dev = 'dev1' + link_mock.side_effect = PermissionError file_mock = mock.MagicMock(spec=io.BytesIO) file_mock.name = 'file' - input_mock = mock.MagicMock(spec=io.BytesIO) - open_mock.return_value = input_mock - size_mock.return_value = fake_image_size - copy_mock.side_effect = fake_chunk_seq self.service.download(self.href, file_mock) _validate_mock.assert_called_once_with(mock.ANY, self.href) - self.assertEqual(2, stat_mock.call_count) - access_mock.assert_called_once_with(self.href_path, os.R_OK | os.W_OK) - copy_calls = [mock.call(file_mock.fileno(), - input_mock.__enter__().fileno(), - chunk_size * i, - fake_chunk_seq[i]) for i in range(4)] - copy_mock.assert_has_calls(copy_calls) - size_mock.assert_called_once_with(self.href_path) - - @mock.patch.object(os, 'remove', side_effect=OSError, autospec=True) - @mock.patch.object(os, 'access', return_value=True, autospec=True) - @mock.patch.object(os, 'stat', autospec=True) - @mock.patch.object(image_service.FileImageService, 'validate_href', - autospec=True) - def test_download_hard_link_fail(self, _validate_mock, stat_mock, - access_mock, remove_mock): - _validate_mock.return_value = self.href_path - stat_mock.return_value.st_dev = 'dev1' - file_mock = mock.MagicMock(spec=io.BytesIO) - file_mock.name = 'file' - self.assertRaises(exception.ImageDownloadFailed, - self.service.download, self.href, file_mock) - _validate_mock.assert_called_once_with(mock.ANY, self.href) - self.assertEqual(2, stat_mock.call_count) - access_mock.assert_called_once_with(self.href_path, os.R_OK | os.W_OK) + link_mock.assert_called_once_with(self.href_path, 'file') + copy_mock.assert_called_once_with(self.href_path, 'file') - @mock.patch.object(os, 'sendfile', side_effect=OSError, autospec=True) - @mock.patch.object(os.path, 'getsize', return_value=42, autospec=True) - @mock.patch.object(builtins, 'open', autospec=True) - @mock.patch.object(os, 'access', return_value=False, autospec=True) - @mock.patch.object(os, 'stat', autospec=True) + @mock.patch.object(shutil, 'copyfile', autospec=True) + @mock.patch.object(os, 'link', autospec=True) + @mock.patch.object(os, 'remove', autospec=True) @mock.patch.object(image_service.FileImageService, 'validate_href', autospec=True) - def test_download_copy_fail(self, _validate_mock, stat_mock, access_mock, - open_mock, size_mock, copy_mock): + def test_download_copy_fail(self, _validate_mock, remove_mock, link_mock, + copy_mock): _validate_mock.return_value = self.href_path - stat_mock.return_value.st_dev = 'dev1' + link_mock.side_effect = PermissionError + copy_mock.side_effect = PermissionError file_mock = mock.MagicMock(spec=io.BytesIO) file_mock.name = 'file' - input_mock = mock.MagicMock(spec=io.BytesIO) - open_mock.return_value = input_mock self.assertRaises(exception.ImageDownloadFailed, self.service.download, self.href, file_mock) _validate_mock.assert_called_once_with(mock.ANY, self.href) - self.assertEqual(2, stat_mock.call_count) - access_mock.assert_called_once_with(self.href_path, os.R_OK | os.W_OK) - size_mock.assert_called_once_with(self.href_path) + link_mock.assert_called_once_with(self.href_path, 'file') + copy_mock.assert_called_once_with(self.href_path, 'file') class ServiceGetterTestCase(base.TestCase): diff --git a/ironic/tests/unit/common/test_rpc_service.py b/ironic/tests/unit/common/test_rpc_service.py index 09446ecf8..752b7665a 100644 --- a/ironic/tests/unit/common/test_rpc_service.py +++ b/ironic/tests/unit/common/test_rpc_service.py @@ -22,6 +22,7 @@ from oslo_utils import timeutils from ironic.common import context from ironic.common import rpc from ironic.common import rpc_service +from ironic.common import service as ironic_service from ironic.conductor import manager from ironic.objects import base as objects_base from ironic.tests.unit.db import base as db_base @@ -39,6 +40,8 @@ class TestRPCService(db_base.DbTestCase): mgr_module = "ironic.conductor.manager" mgr_class = "ConductorManager" self.rpc_svc = rpc_service.RPCService(host, mgr_module, mgr_class) + # register oslo_service DEFAULT config options + ironic_service.process_launcher() self.rpc_svc.manager.dbapi = self.dbapi @mock.patch.object(manager.ConductorManager, 'prepare_host', autospec=True) @@ -123,7 +126,10 @@ class TestRPCService(db_base.DbTestCase): with mock.patch.object(self.dbapi, 'get_online_conductors', autospec=True) as mock_cond_list: mock_cond_list.return_value = [conductor1] - self.rpc_svc.stop() + with mock.patch.object(self.dbapi, 'get_nodeinfo_list', + autospec=True) as mock_nodeinfo_list: + mock_nodeinfo_list.return_value = [] + self.rpc_svc.stop() # single conductor so exit immediately without waiting mock_sleep.assert_not_called() @@ -139,7 +145,11 @@ class TestRPCService(db_base.DbTestCase): autospec=True) as mock_cond_list: # multiple conductors, so wait for hash_ring_reset_interval mock_cond_list.return_value = [conductor1, conductor2] - self.rpc_svc.stop() + with mock.patch.object(self.dbapi, 'get_nodeinfo_list', + autospec=True) as mock_nodeinfo_list: + mock_nodeinfo_list.return_value = [] + self.rpc_svc.stop() + mock_nodeinfo_list.assert_called_once() # wait the total CONF.hash_ring_reset_interval 15 seconds mock_sleep.assert_has_calls([mock.call(15)]) @@ -160,7 +170,11 @@ class TestRPCService(db_base.DbTestCase): autospec=True) as mock_cond_list: # multiple conductors, so wait for hash_ring_reset_interval mock_cond_list.return_value = [conductor1, conductor2] - self.rpc_svc.stop() + with mock.patch.object(self.dbapi, 'get_nodeinfo_list', + autospec=True) as mock_nodeinfo_list: + mock_nodeinfo_list.return_value = [] + self.rpc_svc.stop() + mock_nodeinfo_list.assert_called_once() # wait the remaining 10 seconds mock_sleep.assert_has_calls([mock.call(10)]) @@ -181,7 +195,35 @@ class TestRPCService(db_base.DbTestCase): autospec=True) as mock_cond_list: # multiple conductors, so wait for hash_ring_reset_interval mock_cond_list.return_value = [conductor1, conductor2] - self.rpc_svc.stop() + with mock.patch.object(self.dbapi, 'get_nodeinfo_list', + autospec=True) as mock_nodeinfo_list: + mock_nodeinfo_list.return_value = [] + self.rpc_svc.stop() + mock_nodeinfo_list.assert_called_once() # no wait required, CONF.hash_ring_reset_interval already exceeded mock_sleep.assert_not_called() + + @mock.patch.object(timeutils, 'utcnow', autospec=True) + @mock.patch.object(time, 'sleep', autospec=True) + def test_stop_has_reserved(self, mock_sleep, mock_utcnow): + mock_utcnow.return_value = datetime.datetime(2023, 2, 2, 21, 10, 0) + conductor1 = db_utils.get_test_conductor(hostname='fake_host') + conductor2 = db_utils.get_test_conductor(hostname='other_fake_host') + + with mock.patch.object(self.dbapi, 'get_online_conductors', + autospec=True) as mock_cond_list: + # multiple conductors, so wait for hash_ring_reset_interval + mock_cond_list.return_value = [conductor1, conductor2] + with mock.patch.object(self.dbapi, 'get_nodeinfo_list', + autospec=True) as mock_nodeinfo_list: + # 3 calls to manager has_reserved until all reservation locks + # are released + mock_nodeinfo_list.side_effect = [['a', 'b'], ['a'], []] + self.rpc_svc.stop() + self.assertEqual(3, mock_nodeinfo_list.call_count) + + # wait the remaining 15 seconds, then wait until has_reserved + # returns False + mock_sleep.assert_has_calls( + [mock.call(15), mock.call(1), mock.call(1)]) diff --git a/ironic/tests/unit/conductor/test_cleaning.py b/ironic/tests/unit/conductor/test_cleaning.py index 34e805deb..cdfbf14ee 100644 --- a/ironic/tests/unit/conductor/test_cleaning.py +++ b/ironic/tests/unit/conductor/test_cleaning.py @@ -436,6 +436,36 @@ class DoNodeCleanTestCase(db_base.DbTestCase): self.assertFalse(node.maintenance) self.assertIsNone(node.fault) + @mock.patch('ironic.drivers.modules.fake.FakePower.set_power_state', + autospec=True) + @mock.patch.object(n_flat.FlatNetwork, 'validate', autospec=True) + @mock.patch.object(conductor_steps, 'set_node_cleaning_steps', + autospec=True) + def test_do_node_clean_steps_fail_poweroff(self, mock_steps, mock_validate, + mock_power, clean_steps=None, + invalid_exc=True): + if invalid_exc: + mock_steps.side_effect = exception.InvalidParameterValue('invalid') + else: + mock_steps.side_effect = exception.NodeCleaningFailure('failure') + tgt_prov_state = states.MANAGEABLE if clean_steps else states.AVAILABLE + self.config(poweroff_in_cleanfail=True, group='conductor') + node = obj_utils.create_test_node( + self.context, driver='fake-hardware', + uuid=uuidutils.generate_uuid(), + provision_state=states.CLEANING, + power_state=states.POWER_ON, + target_provision_state=tgt_prov_state) + with task_manager.acquire( + self.context, node.uuid, shared=False) as task: + cleaning.do_node_clean(task, clean_steps=clean_steps) + mock_validate.assert_called_once_with(mock.ANY, task) + node.refresh() + self.assertEqual(states.CLEANFAIL, node.provision_state) + self.assertEqual(tgt_prov_state, node.target_provision_state) + mock_steps.assert_called_once_with(mock.ANY, disable_ramdisk=False) + self.assertTrue(mock_power.called) + def test__do_node_clean_automated_steps_fail(self): for invalid in (True, False): self.__do_node_clean_steps_fail(invalid_exc=invalid) diff --git a/ironic/tests/unit/conductor/test_manager.py b/ironic/tests/unit/conductor/test_manager.py index 7278486a5..5244e4c4b 100644 --- a/ironic/tests/unit/conductor/test_manager.py +++ b/ironic/tests/unit/conductor/test_manager.py @@ -5089,7 +5089,8 @@ class ManagerDoSyncPowerStateTestCase(db_base.DbTestCase): self.power = self.driver.power self.node = obj_utils.create_test_node( self.context, driver='fake-hardware', maintenance=False, - provision_state=states.AVAILABLE, instance_uuid=uuidutils.uuid) + provision_state=states.AVAILABLE, + instance_uuid=uuidutils.generate_uuid()) self.task = mock.Mock(spec_set=['context', 'driver', 'node', 'upgrade_lock', 'shared']) self.task.context = self.context diff --git a/ironic/tests/unit/conductor/test_utils.py b/ironic/tests/unit/conductor/test_utils.py index 52fc72436..27c4bfa86 100644 --- a/ironic/tests/unit/conductor/test_utils.py +++ b/ironic/tests/unit/conductor/test_utils.py @@ -243,11 +243,12 @@ class NodePowerActionTestCase(db_base.DbTestCase): self.config(host='my-host') # Required for exception handling mock_notif.__name__ = 'NodeSetPowerStateNotification' - node = obj_utils.create_test_node(self.context, - uuid=uuidutils.generate_uuid(), - driver='fake-hardware', - instance_uuid=uuidutils.uuid, - power_state=states.POWER_OFF) + node = obj_utils.create_test_node( + self.context, + uuid=uuidutils.generate_uuid(), + driver='fake-hardware', + instance_uuid=uuidutils.generate_uuid(), + power_state=states.POWER_OFF) task = task_manager.TaskManager(self.context, node.uuid) get_power_mock.return_value = states.POWER_OFF diff --git a/playbooks/metal3-ci/run.yaml b/playbooks/metal3-ci/run.yaml index d49ddc6de..fdee0ffde 100644 --- a/playbooks/metal3-ci/run.yaml +++ b/playbooks/metal3-ci/run.yaml @@ -9,6 +9,8 @@ CONTROL_PLANE_MACHINE_COUNT: 1 IMAGE_OS: ubuntu IMAGE_USERNAME: zuul + IRONIC_FROM_SOURCE: "true" + IRONIC_SOURCE: "/home/zuul/src/opendev.org/openstack/ironic" # NOTE(dtantsur): we don't have enough resources to provision even # a 2-node cluster, so only provision a control plane node. NUM_NODES: 2 diff --git a/releasenotes/notes/Cleanfail-power-off-13b5fdcc2727866a.yaml b/releasenotes/notes/Cleanfail-power-off-13b5fdcc2727866a.yaml new file mode 100644 index 000000000..2856bac06 --- /dev/null +++ b/releasenotes/notes/Cleanfail-power-off-13b5fdcc2727866a.yaml @@ -0,0 +1,8 @@ +--- +features: + - | + Add new conductor conf option: [conductor]poweroff_in_cleanfail + (default: False). when True nodes entering clean failed state + will be powered off. This option may be unsafe when using + Cleaning to perform hardware-transformative actions such as + firmware upgrade. diff --git a/releasenotes/notes/cross-link-1ffd1a4958f14fd7.yaml b/releasenotes/notes/cross-link-1ffd1a4958f14fd7.yaml new file mode 100644 index 000000000..caac13dd4 --- /dev/null +++ b/releasenotes/notes/cross-link-1ffd1a4958f14fd7.yaml @@ -0,0 +1,5 @@ +--- +fixes: + - | + Fixes ``Invalid cross-device link`` in some cases when using ``file://`` + image URLs. diff --git a/releasenotes/notes/graceful_shutdown_wait-9a62627714b86726.yaml b/releasenotes/notes/graceful_shutdown_wait-9a62627714b86726.yaml new file mode 100644 index 000000000..778b7dc6f --- /dev/null +++ b/releasenotes/notes/graceful_shutdown_wait-9a62627714b86726.yaml @@ -0,0 +1,15 @@ +--- +features: + - | + On shutdown the conductor will wait for at most + ``[DEFAULT]graceful_shutdown_timeout`` seconds for existing lock node + reservations to clear. Previously lock reservations were cleared + immediately, which in some cases would result in nodes going into a failed + state. +upgrade: + - | + ``[DEFAULT]graceful_shutdown_timeout`` defaults to 60s. Systemd + ``TimeoutStopSec`` defaults to 30s. Kubernetes + ``terminationGracePeriodSeconds`` defaults to 90s. It is recommended to + align the value of ``[DEFAULT]graceful_shutdown_timeout`` with the graceful + timeout of the process manager of the conductor process.
\ No newline at end of file diff --git a/tools/test-setup.sh b/tools/test-setup.sh index 16974adb5..924f0c824 100755 --- a/tools/test-setup.sh +++ b/tools/test-setup.sh @@ -83,5 +83,5 @@ EOF chmod 0600 $HOME/.pgpass # Now create our database -psql -h 127.0.0.1 -U $DB_USER -d template1 -c "DROP DATABASE IF EXISTS openstack_citest" +psql -h 127.0.0.1 -U $DB_USER -d postgres -c "DROP DATABASE IF EXISTS openstack_citest" createdb -h 127.0.0.1 -U $DB_USER -l C -T template0 -E utf8 openstack_citest @@ -132,7 +132,7 @@ commands = {posargs} # [W503] Line break before binary operator. ignore = E129,E741,W503 filename = *.py,app.wsgi -exclude = .venv,.git,.tox,dist,doc,*lib/python*,*egg,build +exclude=.*,dist,doc,*lib/python*,*egg,build import-order-style = pep8 application-import-names = ironic max-complexity=19 |