diff options
author | Iury Gregory Melo Ferreira <imelofer@redhat.com> | 2018-11-23 16:19:50 +0100 |
---|---|---|
committer | Julia Kreger <juliaashleykreger@gmail.com> | 2020-08-25 18:43:33 +0000 |
commit | ad3cd1ba769864014d7e38947650e8fba893b467 (patch) | |
tree | c36105cc474b4a7dbdce008d6717ab57dc1eb453 /ironic | |
parent | 800a5c4d147aaa41834f159c210417e7ed5a7eb2 (diff) | |
download | ironic-stable/queens.tar.gz |
Retries and timeout for IPA command (stable/queens)
Adds retries and timeout configuration parameters for IPA
commands that can fail in case of network glitches.
Change-Id: I817a07bf38c0ee1dd7e8599cf4d646a22ab7027f
Story: #2004420
Task: #28071
(cherry picked from commit a2cffd8c4a12c2967c8424ccc2cca0189a70be75)
(cherry picked from commit 5673679f69e2c0fe35150ffed127c96d282f3655)
Diffstat (limited to 'ironic')
-rw-r--r-- | ironic/conf/agent.py | 8 | ||||
-rw-r--r-- | ironic/drivers/modules/agent_client.py | 16 | ||||
-rw-r--r-- | ironic/tests/unit/drivers/modules/test_agent_client.py | 77 |
3 files changed, 90 insertions, 11 deletions
diff --git a/ironic/conf/agent.py b/ironic/conf/agent.py index 81ced891e..d7ef8853d 100644 --- a/ironic/conf/agent.py +++ b/ironic/conf/agent.py @@ -84,6 +84,14 @@ opts = [ 'forever or until manually deleted. Used when the ' 'deploy_logs_storage_backend is configured to ' '"swift".')), + cfg.IntOpt('command_timeout', + default=60, + help=_('Timeout (in seconds) for IPA commands')), + cfg.IntOpt('max_command_attempts', + default=3, + help=_('This is the maximum number of attempts that will be ' + 'done for IPA commands that fails due to network ' + 'problems')), ] diff --git a/ironic/drivers/modules/agent_client.py b/ironic/drivers/modules/agent_client.py index f54837801..53cca01cf 100644 --- a/ironic/drivers/modules/agent_client.py +++ b/ironic/drivers/modules/agent_client.py @@ -16,6 +16,7 @@ from ironic_lib import metrics_utils from oslo_log import log from oslo_serialization import jsonutils import requests +import retrying from six.moves import http_client from ironic.common import exception @@ -53,6 +54,10 @@ class AgentClient(object): }) @METRICS.timer('AgentClient._command') + @retrying.retry( + retry_on_exception=( + lambda e: isinstance(e, exception.AgentConnectionFailed)), + stop_max_attempt_number=CONF.agent.max_command_attempts) def _command(self, node, method, params, wait=False): url = self._get_command_url(node) body = self._get_command_body(method, params) @@ -63,11 +68,12 @@ class AgentClient(object): {'node': node.uuid, 'method': method}) try: - response = self.session.post(url, params=request_params, data=body) - except requests.ConnectionError as e: - msg = (_('Failed to invoke agent command %(method)s for node ' - '%(node)s. 
Error: %(error)s') % - {'method': method, 'node': node.uuid, 'error': e}) + response = self.session.post(url, params=request_params, data=body, + timeout=CONF.agent.command_timeout) + except (requests.ConnectionError, requests.Timeout) as e: + msg = (_('Failed to connect to the agent running on node %(node)s ' + 'for invoking command %(method)s. Error: %(error)s') % + {'node': node.uuid, 'method': method, 'error': e}) LOG.error(msg) raise exception.AgentConnectionFailed(reason=msg) except requests.RequestException as e: diff --git a/ironic/tests/unit/drivers/modules/test_agent_client.py b/ironic/tests/unit/drivers/modules/test_agent_client.py index 6ddc7d2fd..b52d73252 100644 --- a/ironic/tests/unit/drivers/modules/test_agent_client.py +++ b/ironic/tests/unit/drivers/modules/test_agent_client.py @@ -96,7 +96,8 @@ class TestAgentClient(base.TestCase): self.client.session.post.assert_called_once_with( url, data=body, - params={'wait': 'false'}) + params={'wait': 'false'}, + timeout=60) def test__command_fail_json(self): response_text = 'this be not json matey!' @@ -114,7 +115,8 @@ class TestAgentClient(base.TestCase): self.client.session.post.assert_called_once_with( url, data=body, - params={'wait': 'false'}) + params={'wait': 'false'}, + timeout=60) def test__command_fail_post(self): error = 'Boom' @@ -145,12 +147,74 @@ class TestAgentClient(base.TestCase): e = self.assertRaises(exception.AgentConnectionFailed, self.client._command, self.node, method, params) - self.assertEqual('Connection to agent failed: Failed to invoke ' - 'agent command %(method)s for node %(node)s. ' - 'Error: %(error)s' % + self.assertEqual('Connection to agent failed: Failed to connect to ' + 'the agent running on node %(node)s for invoking ' + 'command %(method)s. 
Error: %(error)s' % {'method': method, 'node': self.node.uuid, 'error': error}, str(e)) + def test__command_fail_all_attempts(self): + error = 'Connection Timeout' + method = 'standby.run_image' + image_info = {'image_id': 'test_image'} + params = {'image_info': image_info} + self.client.session.post.side_effect = [requests.Timeout(error), + requests.Timeout(error), + requests.Timeout(error), + requests.Timeout(error)] + self.client._get_command_url(self.node) + self.client._get_command_body(method, params) + + e = self.assertRaises(exception.AgentConnectionFailed, + self.client._command, + self.node, method, params) + self.assertEqual('Connection to agent failed: Failed to connect to ' + 'the agent running on node %(node)s for invoking ' + 'command %(method)s. Error: %(error)s' % + {'method': method, 'node': self.node.uuid, + 'error': error}, str(e)) + self.assertEqual(3, self.client.session.post.call_count) + + def test__command_succeed_after_two_timeouts(self): + error = 'Connection Timeout' + response_data = {'status': 'ok'} + response_text = json.dumps(response_data) + method = 'standby.run_image' + image_info = {'image_id': 'test_image'} + params = {'image_info': image_info} + self.client.session.post.side_effect = [requests.Timeout(error), + requests.Timeout(error), + MockResponse(response_text)] + + response = self.client._command(self.node, method, params) + self.assertEqual(3, self.client.session.post.call_count) + self.assertEqual(response, response_data) + self.client.session.post.assert_called_with( + self.client._get_command_url(self.node), + data=self.client._get_command_body(method, params), + params={'wait': 'false'}, + timeout=60) + + def test__command_succeed_after_one_timeout(self): + error = 'Connection Timeout' + response_data = {'status': 'ok'} + response_text = json.dumps(response_data) + method = 'standby.run_image' + image_info = {'image_id': 'test_image'} + params = {'image_info': image_info} + self.client.session.post.side_effect = 
[requests.Timeout(error), + MockResponse(response_text), + requests.Timeout(error)] + + response = self.client._command(self.node, method, params) + self.assertEqual(2, self.client.session.post.call_count) + self.assertEqual(response, response_data) + self.client.session.post.assert_called_with( + self.client._get_command_url(self.node), + data=self.client._get_command_body(method, params), + params={'wait': 'false'}, + timeout=60) + def test__command_error_code(self): response_text = '{"faultstring": "you dun goofd"}' self.client.session.post.return_value = MockResponse( @@ -168,7 +232,8 @@ class TestAgentClient(base.TestCase): self.client.session.post.assert_called_once_with( url, data=body, - params={'wait': 'false'}) + params={'wait': 'false'}, + timeout=60) def test_get_commands_status(self): with mock.patch.object(self.client.session, 'get', |