diff options
author | Zuul <zuul@review.opendev.org> | 2020-05-29 10:53:15 +0000 |
---|---|---|
committer | Gerrit Code Review <review@openstack.org> | 2020-05-29 10:53:15 +0000 |
commit | 911bc51a103b98621a803a84b00a96c09a610ac8 (patch) | |
tree | 2b00cbb5b53b60f68b88f33b74a25652d589f658 | |
parent | 64fcd40406ba3ffec1cdb84cd796eca09a3246ee (diff) | |
parent | f2eac72e888595615bd0fd79648c538bb889be5f (diff) | |
download | ironic-911bc51a103b98621a803a84b00a96c09a610ac8.tar.gz |
Merge "redfish: handle hardware that is unable to set persistent boot" into stable/train
-rw-r--r-- | ironic/common/utils.py | 28 | ||||
-rw-r--r-- | ironic/drivers/modules/redfish/management.py | 107 | ||||
-rw-r--r-- | ironic/drivers/modules/redfish/power.py | 11 | ||||
-rw-r--r-- | ironic/tests/unit/drivers/modules/redfish/test_management.py | 82 | ||||
-rw-r--r-- | ironic/tests/unit/drivers/modules/redfish/test_power.py | 36 | ||||
-rw-r--r-- | releasenotes/notes/redfish-sadness-6e2a37b3f45ef1aa.yaml | 18 |
6 files changed, 258 insertions, 24 deletions
diff --git a/ironic/common/utils.py b/ironic/common/utils.py index ee34041f0..037ec78cd 100644 --- a/ironic/common/utils.py +++ b/ironic/common/utils.py @@ -537,3 +537,31 @@ def validate_conductor_group(conductor_group): raise exception.InvalidConductorGroup(group=conductor_group) if not re.match(r'^[a-zA-Z0-9_\-\.]*$', conductor_group): raise exception.InvalidConductorGroup(group=conductor_group) + + +def set_node_nested_field(node, collection, field, value): + """Set a value in a dictionary field of a node. + + :param node: Node object. + :param collection: Name of the field with the dictionary. + :param field: Nested field name. + :param value: New value. + """ + col = getattr(node, collection) + col[field] = value + setattr(node, collection, col) + + +def pop_node_nested_field(node, collection, field, default=None): + """Pop a value from a dictionary field of a node. + + :param node: Node object. + :param collection: Name of the field with the dictionary. + :param field: Nested field name. + :param default: The default value to return. + :return: The removed value or the default. + """ + col = getattr(node, collection) + result = col.pop(field, default) + setattr(node, collection, col) + return result diff --git a/ironic/drivers/modules/redfish/management.py b/ironic/drivers/modules/redfish/management.py index 32f6b8094..1b238bd60 100644 --- a/ironic/drivers/modules/redfish/management.py +++ b/ironic/drivers/modules/redfish/management.py @@ -22,6 +22,7 @@ from ironic.common import boot_devices from ironic.common import boot_modes from ironic.common import exception from ironic.common.i18n import _ +from ironic.common import utils from ironic.conductor import task_manager from ironic.drivers import base from ironic.drivers.modules.redfish import utils as redfish_utils @@ -56,6 +57,63 @@ if sushy: BOOT_DEVICE_PERSISTENT_MAP.items()} +def _set_boot_device(task, system, device, + enabled=sushy.BOOT_SOURCE_ENABLED_ONCE): + """An internal routine to set the boot device. + + :param task: a task from TaskManager. + :param system: a Redfish System object. + :param device: the Redfish boot device. + :param enabled: Redfish boot device persistence value. + :raises: SushyError on an error from the Sushy library + """ + desired_enabled = enabled + current_enabled = system.boot.get('enabled') + + # NOTE(etingof): this can be racy, esp if BMC is not RESTful + new_enabled = (desired_enabled + if desired_enabled != current_enabled else None) + + use_new_set_system_boot = True + + try: + try: + system.set_system_boot_options(device, enabled=new_enabled) + except AttributeError: + new_enabled = enabled + use_new_set_system_boot = False + system.set_system_boot_source(device, enabled=new_enabled) + except sushy.exceptions.SushyError as e: + if new_enabled == sushy.BOOT_SOURCE_ENABLED_CONTINUOUS: + # NOTE(dtantsur): continuous boot device settings have been + # removed from Redfish, and some vendors stopped supporting + # it before an alternative was provided. As a work around, + # use one-time boot and restore the boot device on every + # reboot via RedfishPower. + LOG.debug('Error %(error)s when trying to set a ' + 'persistent boot device on node %(node)s, ' + 'falling back to one-time boot settings', + {'error': e, 'node': task.node.uuid}) + + if use_new_set_system_boot: + system.set_system_boot_options( + device, enabled=sushy.BOOT_SOURCE_ENABLED_ONCE) + else: + system.set_system_boot_source( + device, enabled=sushy.BOOT_SOURCE_ENABLED_ONCE) + + LOG.warning('Could not set persistent boot device to ' + '%(dev)s for node %(node)s, using one-time ' + 'boot device instead', + {'dev': device, 'node': task.node.uuid}) + utils.set_node_nested_field( + task.node, 'driver_internal_info', + 'redfish_boot_device', device) + task.node.save() + else: + raise + + class RedfishManagement(base.ManagementInterface): def __init__(self): @@ -96,6 +154,33 @@ class RedfishManagement(base.ManagementInterface): return list(BOOT_DEVICE_MAP_REV) @task_manager.require_exclusive_lock + def restore_boot_device(self, task, system): + """Restore boot device if needed. + + Checks the redfish_boot_device internal flag and sets the one-time + boot device accordingly. A warning is issued if it fails. + + This method is supposed to be called from the Redfish power interface + and should be considered private to the Redfish hardware type. + + :param task: a task from TaskManager. + :param system: a Redfish System object. + """ + device = task.node.driver_internal_info.get('redfish_boot_device') + if not device: + return + + LOG.debug('Restoring boot device %(dev)s on node %(node)s', + {'dev': device, 'node': task.node.uuid}) + try: + _set_boot_device(task, system, device) + except sushy.exceptions.SushyError as e: + LOG.warning('Unable to recover boot device %(dev)s for node ' + '%(node)s, relying on the pre-configured boot order. ' + 'Error: %(error)s', + {'dev': device, 'node': task.node.uuid, 'error': e}) + + @task_manager.require_exclusive_lock def set_boot_device(self, task, device, persistent=False): """Set the boot device for a node. @@ -112,24 +197,16 @@ class RedfishManagement(base.ManagementInterface): :raises: RedfishConnectionError when it fails to connect to Redfish :raises: RedfishError on an error from the Sushy library """ - system = redfish_utils.get_system(task.node) + utils.pop_node_nested_field( + task.node, 'driver_internal_info', 'redfish_boot_device') + task.node.save() - desired_persistence = BOOT_DEVICE_PERSISTENT_MAP_REV[persistent] - current_persistence = system.boot.get('enabled') - - # NOTE(etingof): this can be racy, esp if BMC is not RESTful - enabled = (desired_persistence - if desired_persistence != current_persistence else None) + system = redfish_utils.get_system(task.node) try: - try: - system.set_system_boot_options( - BOOT_DEVICE_MAP_REV[device], enabled=enabled) - except AttributeError: - system.set_system_boot_source( - BOOT_DEVICE_MAP_REV[device], - enabled=BOOT_DEVICE_PERSISTENT_MAP_REV[persistent]) - + _set_boot_device( + task, system, BOOT_DEVICE_MAP_REV[device], + enabled=BOOT_DEVICE_PERSISTENT_MAP_REV[persistent]) except sushy.exceptions.SushyError as e: error_msg = (_('Redfish set boot device failed for node ' '%(node)s. Error: %(error)s') % diff --git a/ironic/drivers/modules/redfish/power.py b/ironic/drivers/modules/redfish/power.py index cb944a427..0a6f3f338 100644 --- a/ironic/drivers/modules/redfish/power.py +++ b/ironic/drivers/modules/redfish/power.py @@ -22,6 +22,7 @@ from ironic.common import states from ironic.conductor import task_manager from ironic.conductor import utils as cond_utils from ironic.drivers import base +from ironic.drivers.modules.redfish import management as redfish_mgmt from ironic.drivers.modules.redfish import utils as redfish_utils LOG = log.getLogger(__name__) @@ -123,6 +124,12 @@ class RedfishPower(base.PowerInterface): :raises: RedfishError on an error from the Sushy library """ system = redfish_utils.get_system(task.node) + + if (power_state in (states.POWER_ON, states.SOFT_REBOOT, states.REBOOT) + and isinstance(task.driver.management, + redfish_mgmt.RedfishManagement)): + task.driver.management.restore_boot_device(task, system) + try: _set_power_state(task, system, power_state, timeout=timeout) except sushy.exceptions.SushyError as e: @@ -151,6 +158,10 @@ class RedfishPower(base.PowerInterface): next_state = states.POWER_OFF _set_power_state(task, system, next_state, timeout=timeout) + if isinstance(task.driver.management, + redfish_mgmt.RedfishManagement): + task.driver.management.restore_boot_device(task, system) + next_state = states.POWER_ON _set_power_state(task, system, next_state, timeout=timeout) except sushy.exceptions.SushyError as e: diff --git a/ironic/tests/unit/drivers/modules/redfish/test_management.py b/ironic/tests/unit/drivers/modules/redfish/test_management.py index bf9887f5c..9dc0a5c65 100644 --- a/ironic/tests/unit/drivers/modules/redfish/test_management.py +++ b/ironic/tests/unit/drivers/modules/redfish/test_management.py @@ -118,6 +118,8 @@ class RedfishManagementTestCase(db_base.DbTestCase): fake_system.set_system_boot_source.assert_called_once_with( expected, enabled=sushy.BOOT_SOURCE_ENABLED_ONCE) mock_get_system.assert_called_once_with(task.node) + self.assertNotIn('redfish_boot_device', + task.node.driver_internal_info) # Reset mocks fake_system.set_system_boot_options.reset_mock() @@ -141,6 +143,8 @@ class RedfishManagementTestCase(db_base.DbTestCase): fake_system.set_system_boot_options.assert_called_once_with( sushy.BOOT_SOURCE_TARGET_PXE, enabled=expected) mock_get_system.assert_called_once_with(task.node) + self.assertNotIn('redfish_boot_device', + task.node.driver_internal_info) # Reset mocks fake_system.set_system_boot_options.reset_mock() @@ -188,6 +192,84 @@ class RedfishManagementTestCase(db_base.DbTestCase): sushy.BOOT_SOURCE_TARGET_PXE, enabled=sushy.BOOT_SOURCE_ENABLED_ONCE) mock_get_system.assert_called_once_with(task.node) + self.assertNotIn('redfish_boot_device', + task.node.driver_internal_info) + + @mock.patch.object(sushy, 'Sushy', autospec=True) + @mock.patch.object(redfish_utils, 'get_system', autospec=True) + def test_set_boot_device_persistence_fallback(self, mock_get_system, + mock_sushy): + fake_system = mock.Mock() + fake_system.set_system_boot_options.side_effect = [ + sushy.exceptions.SushyError(), + None, + ] + mock_get_system.return_value = fake_system + with task_manager.acquire(self.context, self.node.uuid, + shared=False) as task: + task.driver.management.set_boot_device( + task, boot_devices.PXE, persistent=True) + fake_system.set_system_boot_options.assert_has_calls([ + mock.call(sushy.BOOT_SOURCE_TARGET_PXE, + enabled=sushy.BOOT_SOURCE_ENABLED_CONTINUOUS), + mock.call(sushy.BOOT_SOURCE_TARGET_PXE, + enabled=sushy.BOOT_SOURCE_ENABLED_ONCE), + ]) + mock_get_system.assert_called_once_with(task.node) + + task.node.refresh() + self.assertEqual( + sushy.BOOT_SOURCE_TARGET_PXE, + task.node.driver_internal_info['redfish_boot_device']) + + def test_restore_boot_device(self): + fake_system = mock.Mock() + with task_manager.acquire(self.context, self.node.uuid, + shared=False) as task: + task.node.driver_internal_info['redfish_boot_device'] = ( + sushy.BOOT_SOURCE_TARGET_HDD + ) + + task.driver.management.restore_boot_device(task, fake_system) + + fake_system.set_system_boot_options.assert_called_once_with( + sushy.BOOT_SOURCE_TARGET_HDD, + enabled=sushy.BOOT_SOURCE_ENABLED_ONCE) + # The stored boot device is kept intact + self.assertEqual( + sushy.BOOT_SOURCE_TARGET_HDD, + task.node.driver_internal_info['redfish_boot_device']) + + def test_restore_boot_device_noop(self): + fake_system = mock.Mock() + with task_manager.acquire(self.context, self.node.uuid, + shared=False) as task: + task.driver.management.restore_boot_device(task, fake_system) + + self.assertFalse(fake_system.set_system_boot_options.called) + + @mock.patch.object(redfish_mgmt.LOG, 'warning', autospec=True) + def test_restore_boot_device_failure(self, mock_log): + fake_system = mock.Mock() + fake_system.set_system_boot_options.side_effect = ( + sushy.exceptions.SushyError() + ) + with task_manager.acquire(self.context, self.node.uuid, + shared=False) as task: + task.node.driver_internal_info['redfish_boot_device'] = ( + sushy.BOOT_SOURCE_TARGET_HDD + ) + + task.driver.management.restore_boot_device(task, fake_system) + + fake_system.set_system_boot_options.assert_called_once_with( + sushy.BOOT_SOURCE_TARGET_HDD, + enabled=sushy.BOOT_SOURCE_ENABLED_ONCE) + self.assertTrue(mock_log.called) + # The stored boot device is kept intact + self.assertEqual( + sushy.BOOT_SOURCE_TARGET_HDD, + task.node.driver_internal_info['redfish_boot_device']) @mock.patch.object(redfish_utils, 'get_system', autospec=True) def test_get_boot_device(self, mock_get_system): diff --git a/ironic/tests/unit/drivers/modules/redfish/test_power.py b/ironic/tests/unit/drivers/modules/redfish/test_power.py index 031ca42f1..bbbd57184 100644 --- a/ironic/tests/unit/drivers/modules/redfish/test_power.py +++ b/ironic/tests/unit/drivers/modules/redfish/test_power.py @@ -19,6 +19,7 @@ from oslo_utils import importutils from ironic.common import exception from ironic.common import states from ironic.conductor import task_manager +from ironic.drivers.modules.redfish import management as redfish_mgmt from ironic.drivers.modules.redfish import power as redfish_power from ironic.drivers.modules.redfish import utils as redfish_utils from ironic.tests.unit.db import base as db_base @@ -82,19 +83,21 @@ class RedfishPowerTestCase(db_base.DbTestCase): mock_get_system.assert_called_once_with(task.node) mock_get_system.reset_mock() + @mock.patch.object(redfish_mgmt.RedfishManagement, 'restore_boot_device', + autospec=True) @mock.patch.object(redfish_utils, 'get_system', autospec=True) - def test_set_power_state(self, mock_get_system): + def test_set_power_state(self, mock_get_system, mock_restore_bootdev): with task_manager.acquire(self.context, self.node.uuid, shared=False) as task: expected_values = [ - (states.POWER_ON, sushy.RESET_ON), - (states.POWER_OFF, sushy.RESET_FORCE_OFF), - (states.REBOOT, sushy.RESET_FORCE_RESTART), - (states.SOFT_REBOOT, sushy.RESET_GRACEFUL_RESTART), - (states.SOFT_POWER_OFF, sushy.RESET_GRACEFUL_SHUTDOWN) + (states.POWER_ON, sushy.RESET_ON, True), + (states.POWER_OFF, sushy.RESET_FORCE_OFF, False), + (states.REBOOT, sushy.RESET_FORCE_RESTART, True), + (states.SOFT_REBOOT, sushy.RESET_GRACEFUL_RESTART, True), + (states.SOFT_POWER_OFF, sushy.RESET_GRACEFUL_SHUTDOWN, False) ] - for target, expected in expected_values: + for target, expected, restore_bootdev in expected_values: if target in (states.POWER_OFF, states.SOFT_POWER_OFF): final = sushy.SYSTEM_POWER_STATE_OFF transient = sushy.SYSTEM_POWER_STATE_ON @@ -113,9 +116,15 @@ class RedfishPowerTestCase(db_base.DbTestCase): system_result[0].reset_system.assert_called_once_with(expected) mock_get_system.assert_called_with(task.node) self.assertEqual(4, mock_get_system.call_count) + if restore_bootdev: + mock_restore_bootdev.assert_called_once_with( + task.driver.management, task, system_result[0]) + else: + self.assertFalse(mock_restore_bootdev.called) # Reset mocks mock_get_system.reset_mock() + mock_restore_bootdev.reset_mock() @mock.patch.object(redfish_utils, 'get_system', autospec=True) def test_set_power_state_not_reached(self, mock_get_system): @@ -164,8 +173,11 @@ class RedfishPowerTestCase(db_base.DbTestCase): sushy.RESET_ON) mock_get_system.assert_called_once_with(task.node) + @mock.patch.object(redfish_mgmt.RedfishManagement, 'restore_boot_device', + autospec=True) @mock.patch.object(redfish_utils, 'get_system', autospec=True) - def test_reboot_from_power_off(self, mock_get_system): + def test_reboot_from_power_off(self, mock_get_system, + mock_restore_bootdev): with task_manager.acquire(self.context, self.node.uuid, shared=False) as task: system_result = [ @@ -185,9 +197,13 @@ class RedfishPowerTestCase(db_base.DbTestCase): sushy.RESET_ON) mock_get_system.assert_called_with(task.node) self.assertEqual(3, mock_get_system.call_count) + mock_restore_bootdev.assert_called_once_with( + task.driver.management, task, system_result[0]) + @mock.patch.object(redfish_mgmt.RedfishManagement, 'restore_boot_device', + autospec=True) @mock.patch.object(redfish_utils, 'get_system', autospec=True) - def test_reboot_from_power_on(self, mock_get_system): + def test_reboot_from_power_on(self, mock_get_system, mock_restore_bootdev): with task_manager.acquire(self.context, self.node.uuid, shared=False) as task: system_result = [ @@ -209,6 +225,8 @@ class RedfishPowerTestCase(db_base.DbTestCase): ]) mock_get_system.assert_called_with(task.node) self.assertEqual(3, mock_get_system.call_count) + mock_restore_bootdev.assert_called_once_with( + task.driver.management, task, system_result[0]) @mock.patch.object(redfish_utils, 'get_system', autospec=True) def test_reboot_not_reached(self, mock_get_system): diff --git a/releasenotes/notes/redfish-sadness-6e2a37b3f45ef1aa.yaml b/releasenotes/notes/redfish-sadness-6e2a37b3f45ef1aa.yaml new file mode 100644 index 000000000..a8cb499be --- /dev/null +++ b/releasenotes/notes/redfish-sadness-6e2a37b3f45ef1aa.yaml @@ -0,0 +1,18 @@ +--- +fixes: + - | + Provides a workaround for hardware that does not support persistent boot + device setting with the ``redfish`` hardware type. When such situation is + detected, ironic will fall back to one-time boot device setting, restoring + it on every reboot. +issues: + - | + Some redfish-enabled hardware is known not to support persistent boot + device setting that is used by the Bare Metal service for deployed + instances. The ``redfish`` hardware type tries to work around this problem, + but rebooting such an instance in-band may cause it to boot incorrectly. + A predictable boot order should be configured in the node's boot firmware + to avoid issues and at least metadata cleaning must be enabled. + See `this mailing list thread + <http://lists.openstack.org/pipermail/openstack-discuss/2020-April/014543.html>`_ + for technical details. |