summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorZuul <zuul@review.opendev.org>2022-01-20 21:17:38 +0000
committerGerrit Code Review <review@openstack.org>2022-01-20 21:17:38 +0000
commite06dd22e7862a6bbb4ea97499d64d6de576f2836 (patch)
tree419eb4b2d6a7b926270d2fb61506e21ba982cd1c
parent0b69890c114e6527d55b0c9ce4817bdb66faaede (diff)
parent7f15455d8d106a7d7ecdbc90e96f06d7e322ac4f (diff)
downloadironic-python-agent-e06dd22e7862a6bbb4ea97499d64d6de576f2836.tar.gz
Merge "Burn-in: Dynamic network pairing"
-rw-r--r--ironic_python_agent/burnin.py149
-rw-r--r--ironic_python_agent/tests/unit/test_burnin.py136
-rw-r--r--lower-constraints.txt2
-rw-r--r--releasenotes/notes/add_burnin_dynamic_network_pairing-33e398255050eb98.yaml17
-rw-r--r--requirements.txt1
-rw-r--r--setup.cfg4
6 files changed, 293 insertions, 16 deletions
diff --git a/ironic_python_agent/burnin.py b/ironic_python_agent/burnin.py
index 79844538..2af51b2a 100644
--- a/ironic_python_agent/burnin.py
+++ b/ironic_python_agent/burnin.py
@@ -11,11 +11,13 @@
# limitations under the License.
import json
+import socket
import time
from ironic_lib import utils
from oslo_concurrency import processutils
from oslo_log import log
+from tooz import coordination
from ironic_python_agent import errors
from ironic_python_agent import hardware
@@ -252,45 +254,160 @@ def _do_fio_network(writer, runtime, partner, outputfile):
raise errors.CommandExecutionError(error_msg)
+def _find_network_burnin_partner_and_role(backend_url, group_name, timeout):
+ """Find a partner node for network burn-in and get our role.
+
+ :param backend_url: The tooz backend url.
+ :param group_name: The tooz group name for pairing.
+ :param timeout:Timeout in seconds for a node to wait for a partner.
+ :returns: A set with the partner node and the role of the local node.
+ """
+
+ member_id = socket.gethostname()
+ coordinator = coordination.get_coordinator(backend_url, member_id)
+ coordinator.start(start_heart=True)
+
+ groups = coordinator.get_groups()
+ for group in groups.get():
+ if group_name == group.decode('utf-8'):
+ LOG.debug("Found group %s", group_name)
+ break
+ else:
+ LOG.info("Creating group %s", group_name)
+ coordinator.create_group(group_name)
+
+ def join_group(group_name):
+ request = coordinator.join_group(group_name)
+ request.get()
+
+ def leave_group(group_name):
+ request = coordinator.leave_group(group_name)
+ request.get()
+
+ # Attempt to get the pairing lock. The lock is released when:
+ # a) a node enters the group and is the first to join, or
+ # b) a node enters second, finished pairing, sees
+ # the pairing node exiting, and left itself.
+ # The lock 'walls' all nodes willing to pair.
+ group_lock = coordinator.get_lock("group_lock")
+ with group_lock:
+ # we need the initial members in order to know the first
+ # node (which may leave quickly when we join)
+ init_members = coordinator.get_members(group_name)
+ LOG.info("Original group members are %s", init_members.get())
+ members_cnt = len(init_members.get())
+
+ join_group(group_name)
+
+ # we assign the first node the writer role since it will
+ # leave the group first, it may be ready once the second
+ # node leaves the group, and we save one wait cycle
+ if not members_cnt:
+ first = True
+ role = "writer"
+ group_lock.release() # allow second node to enter
+ else:
+ first = False
+ role = "reader"
+
+ partner = None
+ start_pairing = time.time()
+ while time.time() - start_pairing < timeout:
+ if first:
+ # we are the first and therefore need to wait
+ # for another node to join
+ members = coordinator.get_members(group_name)
+ members_cnt = len(members.get())
+ else:
+ # use the initial members in case the other
+ # node leaves before we get an updated list
+ members = init_members
+
+ assert members_cnt < 3
+
+ if members_cnt == 2 or not first:
+ LOG.info("Two members, start pairing...")
+ for member in members.get():
+ node = member.decode('utf-8')
+ if node != member_id:
+ partner = node
+ if not partner:
+ error_msg = ("fio (network) no partner to pair found")
+ raise errors.CleaningError(error_msg)
+
+ # if you are the second to enter, wait for the first to exit
+ if not first:
+ members = coordinator.get_members(group_name)
+ while (len(members.get()) == 2):
+ time.sleep(0.2)
+ members = coordinator.get_members(group_name)
+ leave_group(group_name)
+ group_lock.release()
+ else:
+ leave_group(group_name)
+ break
+ else:
+ LOG.info("One member, waiting for second node to join ...")
+ time.sleep(1)
+ else:
+ leave_group(group_name)
+ error_msg = ("fio (network) timed out to find partner")
+ raise errors.CleaningError(error_msg)
+
+ return (partner, role)
+
+
def fio_network(node):
"""Burn-in the network with fio
Run an fio network job for a pair of nodes for a configurable
- amount of time. The pair is statically defined in driver_info
- via 'agent_burnin_fio_network_config'.
+ amount of time. The pair is either statically defined in
+ driver_info via 'agent_burnin_fio_network_config' or the role
+ and partner is found dynamically via a tooz backend.
+
The writer will wait for the reader to connect, then write to the
network. Upon completion, the roles are swapped.
- Note (arne_wiebalck): Initial version. The plan is to make the
- match making dynamic by posting availability
- on a distributed backend, e.g. via tooz.
-
:param node: Ironic node object
:raises: CommandExecutionError if the execution of fio fails.
:raises: CleaningError if the configuration is incomplete.
"""
-
info = node.get('driver_info', {})
runtime = info.get('agent_burnin_fio_network_runtime', 21600)
outputfile = info.get('agent_burnin_fio_network_outputfile', None)
# get our role and identify our partner
config = info.get('agent_burnin_fio_network_config')
- if not config:
- error_msg = ("fio (network) failed to find "
- "'agent_burnin_fio_network_config' in driver_info")
- raise errors.CleaningError(error_msg)
- LOG.debug("agent_burnin_fio_network_config is %s", str(config))
+ if config:
+ LOG.debug("static agent_burnin_fio_network_config is %s",
+ config)
+ role = config.get('role')
+ partner = config.get('partner')
+ else:
+ timeout = info.get(
+ 'agent_burnin_fio_network_pairing_timeout', 900)
+ group_name = info.get(
+ 'agent_burnin_fio_network_pairing_group_name',
+ 'ironic.network-burnin')
+ backend_url = info.get(
+ 'agent_burnin_fio_network_pairing_backend_url', None)
+ if not backend_url:
+ msg = ('fio (network): dynamic pairing config is missing '
+ 'agent_burnin_fio_network_pairing_backend_url')
+ raise errors.CleaningError(msg)
+ LOG.info("dynamic pairing for network burn-in ...")
+ (partner, role) = _find_network_burnin_partner_and_role(
+ backend_url=backend_url,
+ group_name=group_name,
+ timeout=timeout)
- role = config.get('role')
if role not in NETWORK_BURNIN_ROLES:
error_msg = "fio (network) found an unknown role: %s" % role
raise errors.CleaningError(error_msg)
-
- partner = config.get('partner')
if not partner:
- error_msg = ("fio (network) failed to find partner")
+ error_msg = "fio (network) failed to find partner"
raise errors.CleaningError(error_msg)
+ LOG.info("fio (network): partner %s, role is %s", partner, role)
logfilename = None
if outputfile:
diff --git a/ironic_python_agent/tests/unit/test_burnin.py b/ironic_python_agent/tests/unit/test_burnin.py
index 32708130..25d334e7 100644
--- a/ironic_python_agent/tests/unit/test_burnin.py
+++ b/ironic_python_agent/tests/unit/test_burnin.py
@@ -14,6 +14,7 @@ from unittest import mock
from ironic_lib import utils
from oslo_concurrency import processutils
+from tooz import coordination
from ironic_python_agent import burnin
from ironic_python_agent import errors
@@ -379,3 +380,138 @@ class TestBurnin(base.IronicAgentTest):
# we loop 3 times, then do the 2 fio calls
self.assertEqual(5, mock_execute.call_count)
self.assertEqual(3, mock_time.call_count)
+
+ def test_fio_network_dynamic_pairing_raise_missing_config(self,
+ mock_execute):
+ node = {'driver_info': {}}
+ self.assertRaises(errors.CleaningError, burnin.fio_network, node)
+
+ def test_fio_network_dynamic_pairing_raise_wrong_config(self,
+ mock_execute):
+ node = {'driver_info': {
+ 'backend_url': 'zookeeper://zookeeper-host-01:2181',
+ 'group_name': 'ironic.dynamic-network-burnin',
+ 'timeout': 600}}
+ self.assertRaises(errors.CleaningError, burnin.fio_network, node)
+
+ @mock.patch.object(burnin, '_find_network_burnin_partner_and_role',
+ autospec=True)
+ def test_fio_network_dynamic_pairing_defaults(self, mock_find,
+ mock_execute):
+ node = {'driver_info': {
+ 'agent_burnin_fio_network_pairing_backend_url':
+ 'zookeeper://zookeeper-host-01:2181'}}
+ mock_find.return_value = ['partner-host', 'reader']
+ mock_execute.return_value = (['out', 'err'])
+
+ burnin.fio_network(node)
+
+ mock_find.assert_called_once_with(
+ backend_url='zookeeper://zookeeper-host-01:2181',
+ group_name='ironic.network-burnin',
+ timeout=900)
+
+ @mock.patch.object(burnin, '_find_network_burnin_partner_and_role',
+ autospec=True)
+ def test_fio_network_dynamic_pairing_no_defaults(self, mock_find,
+ mock_execute):
+ node = {'driver_info': {
+ 'agent_burnin_fio_network_pairing_backend_url':
+ 'zookeeper://zookeeper-host-01:2181',
+ 'agent_burnin_fio_network_pairing_group_name':
+ 'ironic.special-group',
+ 'agent_burnin_fio_network_pairing_timeout': 600}}
+ mock_find.return_value = ['partner-host', 'reader']
+ mock_execute.return_value = (['out', 'err'])
+
+ burnin.fio_network(node)
+
+ mock_find.assert_called_once_with(
+ backend_url='zookeeper://zookeeper-host-01:2181',
+ group_name='ironic.special-group',
+ timeout=600)
+
+ @mock.patch.object(coordination, 'get_coordinator', autospec=True)
+ def test_fio_network_dynamic_find_timeout(self, mock_get_coordinator,
+ mock_execute):
+ mock_coordinator = mock.MagicMock()
+ mock_get_coordinator.return_value = mock_coordinator
+
+ # timeout since no other node is joining
+ self.assertRaises(errors.CleaningError,
+ burnin._find_network_burnin_partner_and_role,
+ "zk://xyz", 'group', 2)
+
+ # group did not exist, so we created it
+ mock_coordinator.create_group.assert_called_once_with('group')
+ mock_coordinator.join_group.assert_called_once()
+ # get_members is called initially, then every second
+ # up to the timeout
+ self.assertEqual(3, mock_coordinator.get_members.call_count)
+
+ @mock.patch.object(coordination, 'get_coordinator', autospec=True)
+ def test_fio_network_dynamic_find_pair_1st(self, mock_get_coordinator,
+ mock_execute):
+ mock_coordinator = mock.MagicMock()
+ mock_get_coordinator.return_value = mock_coordinator
+
+ class Members:
+ def __init__(self, members=[]):
+ self.members = members
+
+ def get(self):
+ return self.members
+
+ # we are the first node to enter, so no other host
+ # initially until the second one appears after some
+ # interations
+ mock_coordinator.get_members.side_effect = \
+ [Members(), Members([b'host1']), Members([b'host1']),
+ Members([b'host1']), Members([b'host1', b'host2'])]
+
+ (partner, role) = \
+ burnin._find_network_burnin_partner_and_role("zk://xyz",
+ "group", 10)
+
+ # ... so we will leave first and be the writer
+ self.assertEqual((partner, role), ("host2", "writer"))
+
+ # group did not exist, so we created it
+ mock_coordinator.create_group.assert_called_once_with('group')
+ mock_coordinator.join_group.assert_called_once()
+ # get_members is called initially, then every second
+ # up to the timeout
+ self.assertEqual(5, mock_coordinator.get_members.call_count)
+
+ @mock.patch.object(coordination, 'get_coordinator', autospec=True)
+ def test_fio_network_dynamic_find_pair_2nd(self, mock_get_coordinator,
+ mock_execute):
+ mock_coordinator = mock.MagicMock()
+ mock_get_coordinator.return_value = mock_coordinator
+
+ class Members:
+ def __init__(self, members=[]):
+ self.members = members
+
+ def get(self):
+ return self.members
+
+ # we are the second node to enter, host1 is there before us ...
+ mock_coordinator.get_members.side_effect = \
+ [Members([b'host1']),
+ Members([b'host1', b'host2']),
+ Members([b'host2'])]
+
+ (partner, role) = \
+ burnin._find_network_burnin_partner_and_role("zk://xyz",
+ "group", 10)
+
+ # ... so we will leave second and be the reader
+ self.assertEqual((partner, role), ("host1", "reader"))
+
+ # group did not exist, so we created it
+ mock_coordinator.create_group.assert_called_once_with('group')
+ mock_coordinator.join_group.assert_called_once()
+ # get_members is called initially, then every second until the
+ # other node appears
+ self.assertEqual(3, mock_coordinator.get_members.call_count)
diff --git a/lower-constraints.txt b/lower-constraints.txt
index 8fbb7f13..64365d59 100644
--- a/lower-constraints.txt
+++ b/lower-constraints.txt
@@ -7,6 +7,7 @@ dogpile.cache==0.9.2
eventlet==0.18.2
importlib_metadata==1.7.0;python_version<'3.8'
ironic-lib==5.1.0
+kazoo==2.8.0
netifaces==0.10.4
openstacksdk==0.49.0
oslo.concurrency==3.26.0
@@ -24,3 +25,4 @@ stestr==1.0.0
stevedore==1.20.0
tenacity==6.2.0
testtools==2.2.0
+tooz==2.7.2
diff --git a/releasenotes/notes/add_burnin_dynamic_network_pairing-33e398255050eb98.yaml b/releasenotes/notes/add_burnin_dynamic_network_pairing-33e398255050eb98.yaml
new file mode 100644
index 00000000..1802f7ec
--- /dev/null
+++ b/releasenotes/notes/add_burnin_dynamic_network_pairing-33e398255050eb98.yaml
@@ -0,0 +1,17 @@
+---
+features:
+ - |
+ For network burn-in, nodes can now be paired dynamically via a
+ distributed coordination backend (as an alternative to a static
+ configuration). This allows burn-in to proceed on a 'first come
+ first served' basis with the nodes available, rather than a node
+ being blocked since the static partner is currently delayed.
+ In order to configure this dynamic pairing, the nodes will need
+ at least 'agent_burnin_fio_network_pairing_backend_url' in their
+ driver-info (the URL for the coordination backend). In order to
+ separate different hardware types, which may be using different
+ networks and shall be burnt-in separately, the nodes can in
+ addition define 'agent_burnin_fio_network_pairing_group_name' to
+ have pairing only happening between nodes in the same group. An
+ additional 'agent_burnin_fio_network_pairing_timeout' allows to
+ limit the time given to the nodes to wait for a partner.
diff --git a/requirements.txt b/requirements.txt
index cadb5c30..3badcd0e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -20,3 +20,4 @@ tenacity>=6.2.0 # Apache-2.0
ironic-lib>=5.1.0 # Apache-2.0
Werkzeug>=1.0.1 # BSD License
cryptography>=2.3 # BSD/Apache-2.0
+tooz>=2.7.2 # Apache-2.0
diff --git a/setup.cfg b/setup.cfg
index 3ab4d5e2..010cd4ed 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -66,3 +66,7 @@ api_doc_dir = contributor/api
tag_build =
tag_date = 0
tag_svn_revision = 0
+
+[extras]
+burnin-network-kazoo =
+ kazoo>=2.8.0 # Apache-2.0