summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJulia Kreger <juliaashleykreger@gmail.com>2020-03-26 09:14:30 -0700
committerDmitry Tantsur <dtantsur@protonmail.com>2020-03-27 14:40:35 +0000
commit9c116c4ef3d10a2eadb0fb0ddd13eae24a470f99 (patch)
tree1dc565b020c3d5a75a837a904541bc61c6dc890d
parent5101808e8daceeef8cb486e71124c6b1efd4b319 (diff)
downloadironic-9c116c4ef3d10a2eadb0fb0ddd13eae24a470f99.tar.gz
Retry agent get_command_status upon failures
The agent command status code lacks any retry mechanism which meant if any intermittent failure such as a dropped packet or an overloaded firewall could potentially begin to cause the entire deployment or cleaning process to derail and fail. This fix addes logic to ensure we retry upon such failures. Worth noting, the exact same logic has been used elsewhere in the agent client code for the exact same problem when issuing commands. Change-Id: I4f6581b7fb895ed2b1d505b9947e363665551b57 Story: 2007470 Task: 39158 (cherry picked from commit 242775ae184582ce51a006e5797e7017133617f3)
-rw-r--r--ironic/drivers/modules/agent_client.py15
-rw-r--r--ironic/tests/unit/drivers/modules/test_agent_client.py11
-rw-r--r--releasenotes/notes/agent-command-status-retry-f9b6f53a823c6b01.yaml6
3 files changed, 31 insertions, 1 deletions
diff --git a/ironic/drivers/modules/agent_client.py b/ironic/drivers/modules/agent_client.py
index 0e3fcb57b..94191799b 100644
--- a/ironic/drivers/modules/agent_client.py
+++ b/ironic/drivers/modules/agent_client.py
@@ -133,6 +133,10 @@ class AgentClient(object):
return result
@METRICS.timer('AgentClient.get_commands_status')
+ @retrying.retry(
+ retry_on_exception=(
+ lambda e: isinstance(e, exception.AgentConnectionFailed)),
+ stop_max_attempt_number=CONF.agent.max_command_attempts)
def get_commands_status(self, node):
"""Get command status from agent.
@@ -162,7 +166,16 @@ class AgentClient(object):
"""
url = self._get_command_url(node)
LOG.debug('Fetching status of agent commands for node %s', node.uuid)
- resp = self.session.get(url, timeout=CONF.agent.command_timeout)
+ try:
+ resp = self.session.get(url, timeout=CONF.agent.command_timeout)
+ except (requests.ConnectionError, requests.Timeout) as e:
+ msg = (_('Failed to connect to the agent running on node %(node)s '
+ 'to collect commands status. '
+ 'Error: %(error)s') %
+ {'node': node.uuid, 'error': e})
+ LOG.error(msg)
+ raise exception.AgentConnectionFailed(reason=msg)
+
result = resp.json()['commands']
status = '; '.join('%(cmd)s: result "%(res)s", error "%(err)s"' %
{'cmd': r.get('command_name'),
diff --git a/ironic/tests/unit/drivers/modules/test_agent_client.py b/ironic/tests/unit/drivers/modules/test_agent_client.py
index 4a5d6f89f..b309b1d04 100644
--- a/ironic/tests/unit/drivers/modules/test_agent_client.py
+++ b/ironic/tests/unit/drivers/modules/test_agent_client.py
@@ -192,6 +192,17 @@ class TestAgentClient(base.TestCase):
'api_version': CONF.agent.agent_api_version},
timeout=CONF.agent.command_timeout)
+ def test_get_commands_status_retries(self):
+ with mock.patch.object(self.client.session, 'get',
+ autospec=True) as mock_get:
+ res = mock.MagicMock(spec_set=['json'])
+ res.json.return_value = {'commands': []}
+ mock_get.side_effect = [
+ requests.ConnectionError('boom'),
+ res]
+ self.assertEqual([], self.client.get_commands_status(self.node))
+ self.assertEqual(2, mock_get.call_count)
+
def test_prepare_image(self):
self.client._command = mock.MagicMock(spec_set=[])
image_info = {'image_id': 'image'}
diff --git a/releasenotes/notes/agent-command-status-retry-f9b6f53a823c6b01.yaml b/releasenotes/notes/agent-command-status-retry-f9b6f53a823c6b01.yaml
new file mode 100644
index 000000000..df4920252
--- /dev/null
+++ b/releasenotes/notes/agent-command-status-retry-f9b6f53a823c6b01.yaml
@@ -0,0 +1,6 @@
+---
+fixes:
+ - |
+ Fixes an issue with the agent client code where checks of the agent
+ command status had no logic to prevent an intermittent or transient
+ connection failure from causing the entire operation to fail.