diff options
author | Arne Wiebalck <Arne.Wiebalck@cern.ch> | 2021-06-28 10:49:15 +0200 |
---|---|---|
committer | Arne Wiebalck <Arne.Wiebalck@cern.ch> | 2021-07-13 11:36:31 +0200 |
commit | cacdd9bab3e46ee5717c34c4facc27c14158bd70 (patch) | |
tree | 3c0374c70704debc31f2ff969abf5c43c35a7d94 | |
parent | 20e145e4da853cd759387e8d8727086f399e51b3 (diff) | |
download | ironic-python-agent-cacdd9bab3e46ee5717c34c4facc27c14158bd70.tar.gz |
Burn-in: Add network step
Add a clean step for network burn-in via fio. Get basic
run parameters from the node's driver_info.
Story: #2007523
Task: #42385
Change-Id: I2861696740b2de9ec38f7e9fc2c5e448c009d0bf
-rw-r--r-- | doc/source/admin/hardware_managers.rst | 3 | ||||
-rw-r--r-- | ironic_python_agent/burnin.py | 74 | ||||
-rw-r--r-- | ironic_python_agent/hardware.py | 15 | ||||
-rw-r--r-- | ironic_python_agent/tests/unit/test_burnin.py | 74 | ||||
-rw-r--r-- | ironic_python_agent/tests/unit/test_hardware.py | 7 | ||||
-rw-r--r-- | releasenotes/notes/add_burnin_network-4856153d21c25f4a.yaml | 6 |
6 files changed, 179 insertions, 0 deletions
diff --git a/doc/source/admin/hardware_managers.rst b/doc/source/admin/hardware_managers.rst index 4228dd85..290fcf0f 100644 --- a/doc/source/admin/hardware_managers.rst +++ b/doc/source/admin/hardware_managers.rst @@ -82,6 +82,9 @@ Clean steps ``deploy.burnin_memory`` Stress-test the memory of a node via stress-ng for a configurable amount of time. Disabled by default. +``deploy.burnin_network`` + Stress-test the network of a pair of nodes via fio for a configurable + amount of time. Disabled by default. ``deploy.erase_devices`` Securely erases all information from all recognized disk devices. Relatively fast when secure ATA erase is available, otherwise can take diff --git a/ironic_python_agent/burnin.py b/ironic_python_agent/burnin.py index 77f83ac4..375f11d3 100644 --- a/ironic_python_agent/burnin.py +++ b/ironic_python_agent/burnin.py @@ -10,6 +10,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +import time + from ironic_lib import utils from oslo_concurrency import processutils from oslo_log import log @@ -19,6 +21,9 @@ from ironic_python_agent import hardware LOG = log.getLogger(__name__) +NETWORK_BURNIN_ROLES = frozenset(['writer', 'reader']) +NETWORK_READER_CYCLE = 30 + def stress_ng_cpu(node): """Burn-in the CPU with stress-ng @@ -115,3 +120,72 @@ def fio_disk(node): {'err': e}) LOG.error(error_msg) raise errors.CommandExecutionError(error_msg) + + +def _do_fio_network(writer, runtime, partner): + + args = ['fio', '--ioengine', 'net', '--port', '9000', '--fill_device', 1, + '--group_reporting', '--gtod_reduce', 1, '--numjobs', 16] + if writer: + xargs = ['--name', 'writer', '--rw', 'write', '--runtime', runtime, + '--time_based', '--listen'] + else: + xargs = ['--name', 'reader', '--rw', 'read', '--hostname', partner] + args.extend(xargs) + + while True: + LOG.info('Burn-in fio network command: %s', ' '.join(map(str, args))) + try: + out, err = utils.execute(*args) + # fio reports on stdout + LOG.info(out) + break + except (processutils.ProcessExecutionError, OSError) as e: + error_msg = ("fio (network) failed with error %(err)s", + {'err': e}) + LOG.error(error_msg) + # while the writer blocks in fio, the reader fails with + # 'Connection {refused, timeout}' errors if the partner + # is not ready, so we need to wait explicitly + if not writer and 'Connection' in str(e): + LOG.info("fio (network): reader retrying in %s seconds ...", + NETWORK_READER_CYCLE) + time.sleep(NETWORK_READER_CYCLE) + else: + raise errors.CommandExecutionError(error_msg) + + +def fio_network(node): + """Burn-in the network with fio + + Run an fio network job for a pair of nodes for a configurable + amount of time. The pair is statically defined in driver_info + via 'agent_burnin_fio_network_config'. + The writer will wait for the reader to connect, then write to the + network. Upon completion, the roles are swapped. + + Note (arne_wiebalck): Initial version. The plan is to make the + match making dynamic by posting availability + on a distributed backend, e.g. via tooz. + + :param node: Ironic node object + :raises: CommandExecutionError if the execution of fio fails. + :raises: CleaningError if the configuration is incomplete. + """ + + info = node.get('driver_info', {}) + runtime = info.get('agent_burnin_fio_network_runtime', 21600) + + # get our role and identify our partner + config = info.get('agent_burnin_fio_network_config') + if not config: + error_msg = ("fio (network) failed to find " + "'agent_burnin_fio_network_config' in driver_info") + raise errors.CleaningError(error_msg) + LOG.debug("agent_burnin_fio_network_config is %s", str(config)) + role = config.get('role') + partner = config.get('partner') + + _do_fio_network(role == 'writer', runtime, partner) + LOG.debug("fio (network): first direction done, swapping roles ...") + _do_fio_network(not role == 'writer', runtime, partner) diff --git a/ironic_python_agent/hardware.py b/ironic_python_agent/hardware.py index cc75c5ce..04426419 100644 --- a/ironic_python_agent/hardware.py +++ b/ironic_python_agent/hardware.py @@ -1418,6 +1418,14 @@ class GenericHardwareManager(HardwareManager): """ burnin.stress_ng_vm(node) + def burnin_network(self, node, ports): + """Burn-in the network + + :param node: Ironic node object + :param ports: list of Ironic port objects + """ + burnin.fio_network(node) + def _shred_block_device(self, node, block_device): """Erase a block device using shred. @@ -1912,6 +1920,13 @@ class GenericHardwareManager(HardwareManager): 'reboot_requested': False, 'abortable': True }, + { + 'step': 'burnin_network', + 'priority': 0, + 'interface': 'deploy', + 'reboot_requested': False, + 'abortable': True + }, ] def get_deploy_steps(self, node, ports): diff --git a/ironic_python_agent/tests/unit/test_burnin.py b/ironic_python_agent/tests/unit/test_burnin.py index 316aaebd..18025b41 100644 --- a/ironic_python_agent/tests/unit/test_burnin.py +++ b/ironic_python_agent/tests/unit/test_burnin.py @@ -144,3 +144,77 @@ class TestBurnin(base.IronicAgentTest): self.assertRaises(errors.CommandExecutionError, burnin.fio_disk, node) + + def test_fio_network_reader(self, mock_execute): + + node = {'driver_info': {'agent_burnin_fio_network_runtime': 600, + 'agent_burnin_fio_network_config': + {'partner': 'host-002', + 'role': 'reader'}}} + mock_execute.return_value = (['out', 'err']) + + burnin.fio_network(node) + + expected_calls = [ + mock.call('fio', '--ioengine', 'net', '--port', '9000', + '--fill_device', 1, '--group_reporting', + '--gtod_reduce', 1, '--numjobs', 16, '--name', + 'reader', '--rw', 'read', '--hostname', 'host-002'), + mock.call('fio', '--ioengine', 'net', '--port', '9000', + '--fill_device', 1, '--group_reporting', + '--gtod_reduce', 1, '--numjobs', 16, '--name', 'writer', + '--rw', 'write', '--runtime', 600, '--time_based', + '--listen')] + mock_execute.assert_has_calls(expected_calls) + + def test_fio_network_writer(self, mock_execute): + + node = {'driver_info': {'agent_burnin_fio_network_runtime': 600, + 'agent_burnin_fio_network_config': + {'partner': 'host-001', + 'role': 'writer'}}} + mock_execute.return_value = (['out', 'err']) + + burnin.fio_network(node) + + expected_calls = [ + mock.call('fio', '--ioengine', 'net', '--port', '9000', + '--fill_device', 1, '--group_reporting', + '--gtod_reduce', 1, '--numjobs', 16, '--name', 'writer', + '--rw', 'write', '--runtime', 600, '--time_based', + '--listen'), + mock.call('fio', '--ioengine', 'net', '--port', '9000', + '--fill_device', 1, '--group_reporting', + '--gtod_reduce', 1, '--numjobs', 16, '--name', + 'reader', '--rw', 'read', '--hostname', 'host-001')] + mock_execute.assert_has_calls(expected_calls) + + def test_fio_network_no_fio(self, mock_execute): + + node = {'driver_info': {'agent_burnin_fio_network_config': + {'partner': 'host-003', 'role': 'reader'}}} + mock_execute.side_effect = processutils.ProcessExecutionError('boom') + + self.assertRaises(errors.CommandExecutionError, + burnin.fio_network, node) + + @mock.patch('time.sleep', autospec=True) + def test_fio_network_reader_loop(self, mock_time, mock_execute): + + node = {'driver_info': {'agent_burnin_fio_network_config': + {'partner': 'host-004', 'role': 'reader'}}} + # mock the infinite loop + mock_execute.side_effect = (processutils.ProcessExecutionError( + 'Connection timeout'), + processutils.ProcessExecutionError( + 'Connection timeout'), + processutils.ProcessExecutionError( + 'Connection refused'), + ['out', 'err'], # connected! + ['out', 'err']) # reversed roles + + burnin.fio_network(node) + + # we loop 3 times, then do the 2 fio calls + self.assertEqual(5, mock_execute.call_count) + self.assertEqual(3, mock_time.call_count) diff --git a/ironic_python_agent/tests/unit/test_hardware.py b/ironic_python_agent/tests/unit/test_hardware.py index fbb8e650..61001260 100644 --- a/ironic_python_agent/tests/unit/test_hardware.py +++ b/ironic_python_agent/tests/unit/test_hardware.py @@ -170,6 +170,13 @@ class TestGenericHardwareManager(base.IronicAgentTest): 'interface': 'deploy', 'reboot_requested': False, 'abortable': True + }, + { + 'step': 'burnin_network', + 'priority': 0, + 'interface': 'deploy', + 'reboot_requested': False, + 'abortable': True } ] clean_steps = self.hardware.get_clean_steps(self.node, []) diff --git a/releasenotes/notes/add_burnin_network-4856153d21c25f4a.yaml b/releasenotes/notes/add_burnin_network-4856153d21c25f4a.yaml new file mode 100644 index 00000000..e197624a --- /dev/null +++ b/releasenotes/notes/add_burnin_network-4856153d21c25f4a.yaml @@ -0,0 +1,6 @@ +--- +features: + - | + Adds a burn-in cleaning step 'burnin_network' to stress test the + network interface for a configurable amount of time with fio. To + use this step, fio needs to be installed on the RAM disk. |