summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Iury Gregory Melo Ferreira <imelofer@redhat.com> 2018-11-23 16:19:50 +0100
committer: Julia Kreger <juliaashleykreger@gmail.com> 2020-08-25 18:43:33 +0000
commit: ad3cd1ba769864014d7e38947650e8fba893b467 (patch)
tree: c36105cc474b4a7dbdce008d6717ab57dc1eb453
parent: 800a5c4d147aaa41834f159c210417e7ed5a7eb2 (diff)
download: ironic-stable/queens.tar.gz
Retries and timeout for IPA command (branch: stable/queens)
Adds retries and timeout configuration parameters for IPA commands that can fail in case of network glitches.

Change-Id: I817a07bf38c0ee1dd7e8599cf4d646a22ab7027f
Story: #2004420
Task: #28071
(cherry picked from commit a2cffd8c4a12c2967c8424ccc2cca0189a70be75)
(cherry picked from commit 5673679f69e2c0fe35150ffed127c96d282f3655)
-rw-r--r-- ironic/conf/agent.py | 8
-rw-r--r-- ironic/drivers/modules/agent_client.py | 16
-rw-r--r-- ironic/tests/unit/drivers/modules/test_agent_client.py | 77
-rw-r--r-- releasenotes/notes/ipa-command-retries-and-timeout-29b0be3f2c21328c.yaml | 5
4 files changed, 95 insertions, 11 deletions
diff --git a/ironic/conf/agent.py b/ironic/conf/agent.py
index 81ced891e..d7ef8853d 100644
--- a/ironic/conf/agent.py
+++ b/ironic/conf/agent.py
@@ -84,6 +84,14 @@ opts = [
'forever or until manually deleted. Used when the '
'deploy_logs_storage_backend is configured to '
'"swift".')),
+ cfg.IntOpt('command_timeout',
+ default=60,
+ help=_('Timeout (in seconds) for IPA commands')),
+ cfg.IntOpt('max_command_attempts',
+ default=3,
+ help=_('This is the maximum number of attempts that will be '
+ 'done for IPA commands that fails due to network '
+ 'problems')),
]
diff --git a/ironic/drivers/modules/agent_client.py b/ironic/drivers/modules/agent_client.py
index f54837801..53cca01cf 100644
--- a/ironic/drivers/modules/agent_client.py
+++ b/ironic/drivers/modules/agent_client.py
@@ -16,6 +16,7 @@ from ironic_lib import metrics_utils
from oslo_log import log
from oslo_serialization import jsonutils
import requests
+import retrying
from six.moves import http_client
from ironic.common import exception
@@ -53,6 +54,10 @@ class AgentClient(object):
})
@METRICS.timer('AgentClient._command')
+ @retrying.retry(
+ retry_on_exception=(
+ lambda e: isinstance(e, exception.AgentConnectionFailed)),
+ stop_max_attempt_number=CONF.agent.max_command_attempts)
def _command(self, node, method, params, wait=False):
url = self._get_command_url(node)
body = self._get_command_body(method, params)
@@ -63,11 +68,12 @@ class AgentClient(object):
{'node': node.uuid, 'method': method})
try:
- response = self.session.post(url, params=request_params, data=body)
- except requests.ConnectionError as e:
- msg = (_('Failed to invoke agent command %(method)s for node '
- '%(node)s. Error: %(error)s') %
- {'method': method, 'node': node.uuid, 'error': e})
+ response = self.session.post(url, params=request_params, data=body,
+ timeout=CONF.agent.command_timeout)
+ except (requests.ConnectionError, requests.Timeout) as e:
+ msg = (_('Failed to connect to the agent running on node %(node)s '
+ 'for invoking command %(method)s. Error: %(error)s') %
+ {'node': node.uuid, 'method': method, 'error': e})
LOG.error(msg)
raise exception.AgentConnectionFailed(reason=msg)
except requests.RequestException as e:
diff --git a/ironic/tests/unit/drivers/modules/test_agent_client.py b/ironic/tests/unit/drivers/modules/test_agent_client.py
index 6ddc7d2fd..b52d73252 100644
--- a/ironic/tests/unit/drivers/modules/test_agent_client.py
+++ b/ironic/tests/unit/drivers/modules/test_agent_client.py
@@ -96,7 +96,8 @@ class TestAgentClient(base.TestCase):
self.client.session.post.assert_called_once_with(
url,
data=body,
- params={'wait': 'false'})
+ params={'wait': 'false'},
+ timeout=60)
def test__command_fail_json(self):
response_text = 'this be not json matey!'
@@ -114,7 +115,8 @@ class TestAgentClient(base.TestCase):
self.client.session.post.assert_called_once_with(
url,
data=body,
- params={'wait': 'false'})
+ params={'wait': 'false'},
+ timeout=60)
def test__command_fail_post(self):
error = 'Boom'
@@ -145,12 +147,74 @@ class TestAgentClient(base.TestCase):
e = self.assertRaises(exception.AgentConnectionFailed,
self.client._command,
self.node, method, params)
- self.assertEqual('Connection to agent failed: Failed to invoke '
- 'agent command %(method)s for node %(node)s. '
- 'Error: %(error)s' %
+ self.assertEqual('Connection to agent failed: Failed to connect to '
+ 'the agent running on node %(node)s for invoking '
+ 'command %(method)s. Error: %(error)s' %
{'method': method, 'node': self.node.uuid,
'error': error}, str(e))
+ def test__command_fail_all_attempts(self):
+ error = 'Connection Timeout'
+ method = 'standby.run_image'
+ image_info = {'image_id': 'test_image'}
+ params = {'image_info': image_info}
+ self.client.session.post.side_effect = [requests.Timeout(error),
+ requests.Timeout(error),
+ requests.Timeout(error),
+ requests.Timeout(error)]
+ self.client._get_command_url(self.node)
+ self.client._get_command_body(method, params)
+
+ e = self.assertRaises(exception.AgentConnectionFailed,
+ self.client._command,
+ self.node, method, params)
+ self.assertEqual('Connection to agent failed: Failed to connect to '
+ 'the agent running on node %(node)s for invoking '
+ 'command %(method)s. Error: %(error)s' %
+ {'method': method, 'node': self.node.uuid,
+ 'error': error}, str(e))
+ self.assertEqual(3, self.client.session.post.call_count)
+
+ def test__command_succeed_after_two_timeouts(self):
+ error = 'Connection Timeout'
+ response_data = {'status': 'ok'}
+ response_text = json.dumps(response_data)
+ method = 'standby.run_image'
+ image_info = {'image_id': 'test_image'}
+ params = {'image_info': image_info}
+ self.client.session.post.side_effect = [requests.Timeout(error),
+ requests.Timeout(error),
+ MockResponse(response_text)]
+
+ response = self.client._command(self.node, method, params)
+ self.assertEqual(3, self.client.session.post.call_count)
+ self.assertEqual(response, response_data)
+ self.client.session.post.assert_called_with(
+ self.client._get_command_url(self.node),
+ data=self.client._get_command_body(method, params),
+ params={'wait': 'false'},
+ timeout=60)
+
+ def test__command_succeed_after_one_timeout(self):
+ error = 'Connection Timeout'
+ response_data = {'status': 'ok'}
+ response_text = json.dumps(response_data)
+ method = 'standby.run_image'
+ image_info = {'image_id': 'test_image'}
+ params = {'image_info': image_info}
+ self.client.session.post.side_effect = [requests.Timeout(error),
+ MockResponse(response_text),
+ requests.Timeout(error)]
+
+ response = self.client._command(self.node, method, params)
+ self.assertEqual(2, self.client.session.post.call_count)
+ self.assertEqual(response, response_data)
+ self.client.session.post.assert_called_with(
+ self.client._get_command_url(self.node),
+ data=self.client._get_command_body(method, params),
+ params={'wait': 'false'},
+ timeout=60)
+
def test__command_error_code(self):
response_text = '{"faultstring": "you dun goofd"}'
self.client.session.post.return_value = MockResponse(
@@ -168,7 +232,8 @@ class TestAgentClient(base.TestCase):
self.client.session.post.assert_called_once_with(
url,
data=body,
- params={'wait': 'false'})
+ params={'wait': 'false'},
+ timeout=60)
def test_get_commands_status(self):
with mock.patch.object(self.client.session, 'get',
diff --git a/releasenotes/notes/ipa-command-retries-and-timeout-29b0be3f2c21328c.yaml b/releasenotes/notes/ipa-command-retries-and-timeout-29b0be3f2c21328c.yaml
new file mode 100644
index 000000000..f9e87c377
--- /dev/null
+++ b/releasenotes/notes/ipa-command-retries-and-timeout-29b0be3f2c21328c.yaml
@@ -0,0 +1,5 @@
+---
+fixes:
+ - |
+ Adds ``command_timeout`` and ``max_command_attempts`` configuration options
+ to IPA, so when connection errors occur the command will be executed again.