summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorZuul <zuul@review.opendev.org>2021-12-09 09:38:39 +0000
committerGerrit Code Review <review@openstack.org>2021-12-09 09:38:39 +0000
commit8abc930d974e426025b97ffb35400f1b57511d25 (patch)
tree25d5a9a0b058d305288fedf4c601c46a517ccd45
parent3cd964fa847cfc24ce217b74d264b11b6facf2b4 (diff)
parentc6b1cb1c325d37374eaffb1517e8c6c388e7f807 (diff)
downloadironic-python-agent-8abc930d974e426025b97ffb35400f1b57511d25.tar.gz
Merge "Burn-in: Add SMART self test to disk burn-in"
-rw-r--r--ironic_python_agent/burnin.py84
-rw-r--r--ironic_python_agent/tests/unit/test_burnin.py77
-rw-r--r--releasenotes/notes/add-smart-test-to-disk-burnin-d02d31e23e5efa9a.yaml7
3 files changed, 168 insertions, 0 deletions
diff --git a/ironic_python_agent/burnin.py b/ironic_python_agent/burnin.py
index c97eea96..841dbfb1 100644
--- a/ironic_python_agent/burnin.py
+++ b/ironic_python_agent/burnin.py
@@ -10,6 +10,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
+import json
import time
from ironic_lib import utils
@@ -84,6 +85,83 @@ def stress_ng_vm(node):
raise errors.CommandExecutionError(error_msg)
+def _smart_test_status(device):
+ """Get the SMART test status of a device
+
+ :param device: The device to check.
+ :raises: CommandExecutionError if the execution of smartctl fails.
+ :returns: A string with the SMART test status of the device and
+ None if the status is not available.
+ """
+ args = ['smartctl', '-ja', device.name]
+ try:
+ out, _ = utils.execute(*args)
+ smart_info = json.loads(out)
+ if smart_info:
+ return smart_info['ata_smart_data'][
+ 'self_test']['status']['string']
+ except (processutils.ProcessExecutionError, OSError, KeyError) as e:
+ LOG.error('SMART test on %(device)s failed with '
+ '%(err)s', {'device': device.name, 'err': e})
+ return None
+
+
+def _run_smart_test(devices):
+ """Launch a SMART test on the passed devices
+
+ :param devices: A list of device objects to check.
+ :raises: CommandExecutionError if the execution of smartctl fails.
+ :raises: CleaningError if the SMART test on any of the devices fails.
+ """
+ failed_devices = []
+ for device in devices:
+ args = ['smartctl', '-t', 'long', device.name]
+ LOG.info('SMART self test command: %s',
+ ' '.join(map(str, args)))
+ try:
+ utils.execute(*args)
+ except (processutils.ProcessExecutionError, OSError) as e:
+ LOG.error("Starting SMART test on %(device)s failed with: "
+ "%(err)s", {'device': device.name, 'err': e})
+ failed_devices.append(device.name)
+ if failed_devices:
+ error_msg = ("fio (disk) failed to start SMART self test on %s",
+ ', '.join(failed_devices))
+ raise errors.CleaningError(error_msg)
+
+ # wait for the test to finish and report the test results
+ failed_devices = []
+ while True:
+ for device in list(devices):
+ status = _smart_test_status(device)
+ if status is None:
+ devices.remove(device)
+ continue
+ if "in progress" in status:
+ msg = "SMART test still running on %s ..." % device.name
+ LOG.debug(msg)
+ continue
+ if "completed without error" in status:
+ msg = "%s passed SMART test" % device.name
+ LOG.info(msg)
+ devices.remove(device)
+ continue
+ failed_devices.append(device.name)
+ LOG.warning("%(device)s failed SMART test with: %(err)s",
+ {'device': device.name, 'err': status})
+ devices.remove(device)
+ if not devices:
+ break
+ LOG.info("SMART tests still running ...")
+ time.sleep(30)
+
+ # fail the clean step if the SMART test has failed
+ if failed_devices:
+ msg = ('fio (disk) SMART test failed for %s' % ' '.join(
+ map(str, failed_devices)))
+ raise errors.CleaningError(msg)
+
+
def fio_disk(node):
"""Burn-in the disks with fio
@@ -118,6 +196,12 @@ def fio_disk(node):
LOG.error(error_msg)
raise errors.CommandExecutionError(error_msg)
+ # if configured, run a smart self test on all devices and fail the
+ # step if any of the devices reports an error
+ smart_test = info.get('agent_burnin_fio_disk_smart_test', False)
+ if smart_test:
+ _run_smart_test(devices)
+
def _do_fio_network(writer, runtime, partner):
diff --git a/ironic_python_agent/tests/unit/test_burnin.py b/ironic_python_agent/tests/unit/test_burnin.py
index 2258352e..16aea375 100644
--- a/ironic_python_agent/tests/unit/test_burnin.py
+++ b/ironic_python_agent/tests/unit/test_burnin.py
@@ -21,6 +21,39 @@ from ironic_python_agent import hardware
from ironic_python_agent.tests.unit import base
+SMART_OUTPUT_JSON_COMPLETED = ("""
+{
+ "ata_smart_data": {
+ "self_test": {
+ "status": {
+ "value": 0,
+ "string": "completed without error",
+ "passed": true
+ },
+ "polling_minutes": {
+ "short": 1,
+ "extended": 2,
+ "conveyance": 2
+ }
+ }
+ }
+}
+""")
+
+SMART_OUTPUT_JSON_MISSING = ("""
+{
+ "ata_smart_data": {
+ "self_test": {
+ "status": {
+ "value": 0,
+ "passed": true
+ }
+ }
+ }
+}
+""")
+
+
@mock.patch.object(utils, 'execute', autospec=True)
class TestBurnin(base.IronicAgentTest):
@@ -133,6 +166,50 @@ class TestBurnin(base.IronicAgentTest):
'--loops', 5, '--runtime', 600, '--time_based', '--name',
'/dev/sdj', '--name', '/dev/hdaa')
+ def test__smart_test_status(self, mock_execute):
+ device = hardware.BlockDevice('/dev/sdj', 'big', 1073741824, True)
+ mock_execute.return_value = ([SMART_OUTPUT_JSON_COMPLETED, 'err'])
+
+ status = burnin._smart_test_status(device)
+
+ mock_execute.assert_called_once_with('smartctl', '-ja', '/dev/sdj')
+ self.assertEqual(status, "completed without error")
+
+ def test__smart_test_status_missing(self, mock_execute):
+ device = hardware.BlockDevice('/dev/sdj', 'big', 1073741824, True)
+ mock_execute.return_value = ([SMART_OUTPUT_JSON_MISSING, 'err'])
+
+ status = burnin._smart_test_status(device)
+
+ mock_execute.assert_called_once_with('smartctl', '-ja', '/dev/sdj')
+ self.assertIsNone(status)
+
+ @mock.patch.object(burnin, '_smart_test_status', autospec=True)
+ @mock.patch.object(hardware, 'list_all_block_devices', autospec=True)
+ def test_fio_disk_smart_test(self, mock_list, mock_status, mock_execute):
+
+ node = {'driver_info': {'agent_burnin_fio_disk_smart_test': True}}
+
+ mock_list.return_value = [
+ hardware.BlockDevice('/dev/sdj', 'big', 1073741824, True),
+ hardware.BlockDevice('/dev/hdaa', 'small', 65535, False),
+ ]
+ mock_status.return_value = "completed without error"
+ mock_execute.return_value = (['out', 'err'])
+
+ burnin.fio_disk(node)
+
+ expected_calls = [
+ mock.call('fio', '--rw', 'readwrite', '--bs', '4k', '--direct', 1,
+ '--ioengine', 'libaio', '--iodepth', '32', '--verify',
+ 'crc32c', '--verify_dump', 1, '--continue_on_error',
+ 'verify', '--loops', 4, '--runtime', 0, '--time_based',
+ '--name', '/dev/sdj', '--name', '/dev/hdaa'),
+ mock.call('smartctl', '-t', 'long', '/dev/sdj'),
+ mock.call('smartctl', '-t', 'long', '/dev/hdaa')
+ ]
+ mock_execute.assert_has_calls(expected_calls)
+
@mock.patch.object(hardware, 'list_all_block_devices', autospec=True)
def test_fio_disk_no_fio(self, mock_list, mock_execute):
diff --git a/releasenotes/notes/add-smart-test-to-disk-burnin-d02d31e23e5efa9a.yaml b/releasenotes/notes/add-smart-test-to-disk-burnin-d02d31e23e5efa9a.yaml
new file mode 100644
index 00000000..0e95b911
--- /dev/null
+++ b/releasenotes/notes/add-smart-test-to-disk-burnin-d02d31e23e5efa9a.yaml
@@ -0,0 +1,7 @@
+---
+features:
+ - |
+ Add 'agent_burnin_fio_disk_smart_test' option in driver-info for disk
+ burn-in. If set to True, this option will launch a parallel SMART self
+ test on all devices after the disk burn-in and fail the disk burn-in
+ clean step if any of the tests fail.