summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorArne Wiebalck <Arne.Wiebalck@cern.ch>2021-06-28 10:49:15 +0200
committerArne Wiebalck <Arne.Wiebalck@cern.ch>2021-07-13 11:36:31 +0200
commitcacdd9bab3e46ee5717c34c4facc27c14158bd70 (patch)
tree3c0374c70704debc31f2ff969abf5c43c35a7d94
parent20e145e4da853cd759387e8d8727086f399e51b3 (diff)
downloadironic-python-agent-cacdd9bab3e46ee5717c34c4facc27c14158bd70.tar.gz
Burn-in: Add network step
Add a clean step for network burn-in via fio. Get basic run parameters
from the node's driver_info.

Story: #2007523
Task: #42385
Change-Id: I2861696740b2de9ec38f7e9fc2c5e448c009d0bf
-rw-r--r--doc/source/admin/hardware_managers.rst3
-rw-r--r--ironic_python_agent/burnin.py74
-rw-r--r--ironic_python_agent/hardware.py15
-rw-r--r--ironic_python_agent/tests/unit/test_burnin.py74
-rw-r--r--ironic_python_agent/tests/unit/test_hardware.py7
-rw-r--r--releasenotes/notes/add_burnin_network-4856153d21c25f4a.yaml6
6 files changed, 179 insertions, 0 deletions
diff --git a/doc/source/admin/hardware_managers.rst b/doc/source/admin/hardware_managers.rst
index 4228dd85..290fcf0f 100644
--- a/doc/source/admin/hardware_managers.rst
+++ b/doc/source/admin/hardware_managers.rst
@@ -82,6 +82,9 @@ Clean steps
``deploy.burnin_memory``
Stress-test the memory of a node via stress-ng for a configurable
amount of time. Disabled by default.
+``deploy.burnin_network``
+ Stress-test the network of a pair of nodes via fio for a configurable
+ amount of time. Disabled by default.
``deploy.erase_devices``
Securely erases all information from all recognized disk devices.
Relatively fast when secure ATA erase is available, otherwise can take
diff --git a/ironic_python_agent/burnin.py b/ironic_python_agent/burnin.py
index 77f83ac4..375f11d3 100644
--- a/ironic_python_agent/burnin.py
+++ b/ironic_python_agent/burnin.py
@@ -10,6 +10,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
+import time
+
from ironic_lib import utils
from oslo_concurrency import processutils
from oslo_log import log
@@ -19,6 +21,9 @@ from ironic_python_agent import hardware
LOG = log.getLogger(__name__)
+# Role names a node can be given for the network burn-in.
+NETWORK_BURNIN_ROLES = frozenset(['writer', 'reader'])
+# Seconds a reader sleeps before retrying fio after a 'Connection'
+# error (see _do_fio_network).
+NETWORK_READER_CYCLE = 30
+
def stress_ng_cpu(node):
"""Burn-in the CPU with stress-ng
@@ -115,3 +120,72 @@ def fio_disk(node):
{'err': e})
LOG.error(error_msg)
raise errors.CommandExecutionError(error_msg)
+
+
+def _do_fio_network(writer, runtime, partner):
+    """Run one direction of the fio network burn-in.
+
+    Assemble the fio command line for either the listening writer or the
+    connecting reader and execute it. A reader whose fio run fails with
+    an error mentioning 'Connection' sleeps NETWORK_READER_CYCLE seconds
+    and tries again; there is no upper bound on the number of retries.
+
+    :param writer: True to run fio as the writer, False as the reader.
+    :param runtime: value for fio's --runtime option (writer only).
+    :param partner: value for fio's --hostname option (reader only).
+    :raises: CommandExecutionError if the execution of fio fails with
+             anything but a reader-side connection error.
+    """
+
+    args = ['fio', '--ioengine', 'net', '--port', '9000', '--fill_device', 1,
+            '--group_reporting', '--gtod_reduce', 1, '--numjobs', 16]
+    if writer:
+        xargs = ['--name', 'writer', '--rw', 'write', '--runtime', runtime,
+                 '--time_based', '--listen']
+    else:
+        xargs = ['--name', 'reader', '--rw', 'read', '--hostname', partner]
+    args.extend(xargs)
+
+    while True:
+        LOG.info('Burn-in fio network command: %s', ' '.join(map(str, args)))
+        try:
+            out, err = utils.execute(*args)
+            # fio reports on stdout
+            LOG.info(out)
+            break
+        except (processutils.ProcessExecutionError, OSError) as e:
+            # interpolate the error into the message; a bare
+            # (format, mapping) tuple would be logged and raised as-is
+            error_msg = ("fio (network) failed with error %(err)s" %
+                         {'err': e})
+            LOG.error(error_msg)
+            # while the writer blocks in fio, the reader fails with
+            # 'Connection {refused, timeout}' errors if the partner
+            # is not ready, so we need to wait explicitly
+            if not writer and 'Connection' in str(e):
+                LOG.info("fio (network): reader retrying in %s seconds ...",
+                         NETWORK_READER_CYCLE)
+                time.sleep(NETWORK_READER_CYCLE)
+            else:
+                raise errors.CommandExecutionError(error_msg)
+
+
+def fio_network(node):
+    """Burn-in the network with fio
+
+    Run an fio network job for a pair of nodes for a configurable
+    amount of time. The pair is statically defined in driver_info
+    via 'agent_burnin_fio_network_config', a mapping with the keys
+    'role' (one of NETWORK_BURNIN_ROLES) and 'partner' (passed to fio
+    as the host to connect to). The run time is read from
+    'agent_burnin_fio_network_runtime' and defaults to 21600.
+    The writer will wait for the reader to connect, then write to the
+    network. Upon completion, the roles are swapped.
+
+    Note (arne_wiebalck): Initial version. The plan is to make the
+                          match making dynamic by posting availability
+                          on a distributed backend, e.g. via tooz.
+
+    :param node: Ironic node object
+    :raises: CommandExecutionError if the execution of fio fails.
+    :raises: CleaningError if the configuration is incomplete, i.e. the
+             config is missing, the role is unknown or no partner is set.
+    """
+
+    info = node.get('driver_info', {})
+    runtime = info.get('agent_burnin_fio_network_runtime', 21600)
+
+    # get our role and identify our partner
+    config = info.get('agent_burnin_fio_network_config')
+    if not config:
+        error_msg = ("fio (network) failed to find "
+                     "'agent_burnin_fio_network_config' in driver_info")
+        raise errors.CleaningError(error_msg)
+    LOG.debug("agent_burnin_fio_network_config is %s", str(config))
+
+    # an unknown role would silently be treated as 'reader'; if both
+    # nodes of a pair did that, they would retry forever
+    role = config.get('role')
+    if role not in NETWORK_BURNIN_ROLES:
+        error_msg = "fio (network) found an unknown role: %s" % role
+        raise errors.CleaningError(error_msg)
+
+    # the reader needs a host to connect to
+    partner = config.get('partner')
+    if not partner:
+        error_msg = ("fio (network) failed to find 'partner' in "
+                     "'agent_burnin_fio_network_config'")
+        raise errors.CleaningError(error_msg)
+
+    starts_as_writer = role == 'writer'
+    _do_fio_network(starts_as_writer, runtime, partner)
+    LOG.debug("fio (network): first direction done, swapping roles ...")
+    _do_fio_network(not starts_as_writer, runtime, partner)
diff --git a/ironic_python_agent/hardware.py b/ironic_python_agent/hardware.py
index cc75c5ce..04426419 100644
--- a/ironic_python_agent/hardware.py
+++ b/ironic_python_agent/hardware.py
@@ -1418,6 +1418,14 @@ class GenericHardwareManager(HardwareManager):
"""
burnin.stress_ng_vm(node)
+ def burnin_network(self, node, ports):
+ """Burn-in the network
+
+ Delegates to burnin.fio_network(); ``ports`` is accepted but not
+ used by this method.
+
+ :param node: Ironic node object
+ :param ports: list of Ironic port objects
+ """
+ burnin.fio_network(node)
+
def _shred_block_device(self, node, block_device):
"""Erase a block device using shred.
@@ -1912,6 +1920,13 @@ class GenericHardwareManager(HardwareManager):
'reboot_requested': False,
'abortable': True
},
+ {
+ 'step': 'burnin_network',
+ 'priority': 0,
+ 'interface': 'deploy',
+ 'reboot_requested': False,
+ 'abortable': True
+ },
]
def get_deploy_steps(self, node, ports):
diff --git a/ironic_python_agent/tests/unit/test_burnin.py b/ironic_python_agent/tests/unit/test_burnin.py
index 316aaebd..18025b41 100644
--- a/ironic_python_agent/tests/unit/test_burnin.py
+++ b/ironic_python_agent/tests/unit/test_burnin.py
@@ -144,3 +144,77 @@ class TestBurnin(base.IronicAgentTest):
self.assertRaises(errors.CommandExecutionError,
burnin.fio_disk, node)
+
+    def test_fio_network_reader(self, mock_execute):
+        """A reader node first reads from its partner, then turns writer."""
+        burnin_config = {'partner': 'host-002', 'role': 'reader'}
+        node = {'driver_info': {
+            'agent_burnin_fio_network_runtime': 600,
+            'agent_burnin_fio_network_config': burnin_config}}
+        mock_execute.return_value = ['out', 'err']
+
+        burnin.fio_network(node)
+
+        # options shared by both directions of the burn-in
+        common = ('fio', '--ioengine', 'net', '--port', '9000',
+                  '--fill_device', 1, '--group_reporting',
+                  '--gtod_reduce', 1, '--numjobs', 16)
+        as_reader = mock.call(*common, '--name', 'reader', '--rw', 'read',
+                              '--hostname', 'host-002')
+        as_writer = mock.call(*common, '--name', 'writer', '--rw', 'write',
+                              '--runtime', 600, '--time_based', '--listen')
+        mock_execute.assert_has_calls([as_reader, as_writer])
+
+    def test_fio_network_writer(self, mock_execute):
+        """A writer node first listens and writes, then turns reader."""
+        burnin_config = {'partner': 'host-001', 'role': 'writer'}
+        node = {'driver_info': {
+            'agent_burnin_fio_network_runtime': 600,
+            'agent_burnin_fio_network_config': burnin_config}}
+        mock_execute.return_value = ['out', 'err']
+
+        burnin.fio_network(node)
+
+        # options shared by both directions of the burn-in
+        common = ('fio', '--ioengine', 'net', '--port', '9000',
+                  '--fill_device', 1, '--group_reporting',
+                  '--gtod_reduce', 1, '--numjobs', 16)
+        as_writer = mock.call(*common, '--name', 'writer', '--rw', 'write',
+                              '--runtime', 600, '--time_based', '--listen')
+        as_reader = mock.call(*common, '--name', 'reader', '--rw', 'read',
+                              '--hostname', 'host-001')
+        mock_execute.assert_has_calls([as_writer, as_reader])
+
+    def test_fio_network_no_fio(self, mock_execute):
+        """A failing fio execution surfaces as CommandExecutionError."""
+        mock_execute.side_effect = processutils.ProcessExecutionError('boom')
+        burnin_config = {'partner': 'host-003', 'role': 'reader'}
+        node = {'driver_info': {
+            'agent_burnin_fio_network_config': burnin_config}}
+
+        with self.assertRaises(errors.CommandExecutionError):
+            burnin.fio_network(node)
+
+    @mock.patch('time.sleep', autospec=True)
+    def test_fio_network_reader_loop(self, mock_time, mock_execute):
+        """The reader retries on connection errors until fio succeeds."""
+        burnin_config = {'partner': 'host-004', 'role': 'reader'}
+        node = {'driver_info': {
+            'agent_burnin_fio_network_config': burnin_config}}
+        # mock the infinite loop: three failed connection attempts, then
+        # the successful reader run and the run with reversed roles
+        failures = [processutils.ProcessExecutionError(reason)
+                    for reason in ('Connection timeout',
+                                   'Connection timeout',
+                                   'Connection refused')]
+        mock_execute.side_effect = failures + [['out', 'err'],
+                                               ['out', 'err']]
+
+        burnin.fio_network(node)
+
+        # we loop 3 times, then do the 2 fio calls
+        self.assertEqual(3, mock_time.call_count)
+        self.assertEqual(5, mock_execute.call_count)
diff --git a/ironic_python_agent/tests/unit/test_hardware.py b/ironic_python_agent/tests/unit/test_hardware.py
index fbb8e650..61001260 100644
--- a/ironic_python_agent/tests/unit/test_hardware.py
+++ b/ironic_python_agent/tests/unit/test_hardware.py
@@ -170,6 +170,13 @@ class TestGenericHardwareManager(base.IronicAgentTest):
'interface': 'deploy',
'reboot_requested': False,
'abortable': True
+ },
+ {
+ 'step': 'burnin_network',
+ 'priority': 0,
+ 'interface': 'deploy',
+ 'reboot_requested': False,
+ 'abortable': True
}
]
clean_steps = self.hardware.get_clean_steps(self.node, [])
diff --git a/releasenotes/notes/add_burnin_network-4856153d21c25f4a.yaml b/releasenotes/notes/add_burnin_network-4856153d21c25f4a.yaml
new file mode 100644
index 00000000..e197624a
--- /dev/null
+++ b/releasenotes/notes/add_burnin_network-4856153d21c25f4a.yaml
@@ -0,0 +1,6 @@
+---
+features:
+ - |
+ Adds a burn-in cleaning step 'burnin_network' to stress test the
+ network interface for a configurable amount of time with fio. To
+ use this step, fio needs to be installed on the RAM disk.