Fix deployment when executing a command fails after the command starts

If the agent accepts a command, but is unable to reply to Ironic (which sporadically happens before of the eventlet's TLS implementation), we currently retry the request and fail because the command is already executing. Ironic now detects this situation by checking the list of executing commands after receiving a connection error. If the requested command is last one, we assume that the command request succeeded. Ideally, we should pass a request ID to IPA and then compare it. Such a change would affect the API contract between the agent and Ironic and thus would not be backportable. Change-Id: I2ea21c9ec440fa7ddf8578cf7b34d6d0ebbb5dc8
author: Dmitry Tantsur <dtantsur@protonmail.com> 2021-05-07 10:38:16 +0200
committer: Dmitry Tantsur <dtantsur@protonmail.com> 2021-05-07 17:50:06 +0200
commit: abfe383cc32d63dd3df2b90f0feace08d73b8364 (patch)
tree: 310f748209d92fbc2f19844623e22bc97efb8970
parent: 2e5e2e5eb57bb152eeb77880dbdc2add7111b932 (diff)
download: ironic-abfe383cc32d63dd3df2b90f0feace08d73b8364.tar.gz
3 files changed, 203 insertions, 21 deletions
diff --git a/ironic/drivers/modules/agent_client.py b/ironic/drivers/modules/agent_client.py
index a3e1048db..29172a5cc 100644
--- a/ironic/drivers/modules/agent_client.py
+++ b/ironic/drivers/modules/agent_client.py
@@ -179,11 +179,9 @@ class AgentClient(object):
                 verify=self._get_verify(node),
                 timeout=CONF.agent.command_timeout)
         except (requests.ConnectionError, requests.Timeout) as e:
-            msg = (_('Failed to connect to the agent running on node %(node)s '
-                     'for invoking command %(method)s. Error: %(error)s') %
-                   {'node': node.uuid, 'method': method, 'error': e})
-            LOG.error(msg)
-            raise exception.AgentConnectionFailed(reason=msg)
+            result = self._handle_timeout_on_command_execution(node, method,
+                                                               params, e)
+            response = None
         except requests.RequestException as e:
             msg = (_('Error invoking agent command %(method)s for node '
                      '%(node)s. Error: %(error)s') %
@@ -191,28 +189,31 @@ class AgentClient(object):
             LOG.error(msg)
             raise exception.IronicException(msg)
 
-        # TODO(russellhaering): real error handling
-        try:
-            result = response.json()
-        except ValueError:
-            msg = _(
-                'Unable to decode response as JSON.\n'
-                'Request URL: %(url)s\nRequest body: "%(body)s"\n'
-                'Response status code: %(code)s\n'
-                'Response: "%(response)s"'
-            ) % ({'response': response.text, 'body': body, 'url': url,
-                  'code': response.status_code})
-            LOG.error(msg)
-            raise exception.IronicException(msg)
+        if response is not None:
+            # TODO(russellhaering): real error handling
+            try:
+                result = response.json()
+            except ValueError:
+                msg = _(
+                    'Unable to decode response as JSON.\n'
+                    'Request URL: %(url)s\nRequest body: "%(body)s"\n'
+                    'Response status code: %(code)s\n'
+                    'Response: "%(response)s"'
+                ) % ({'response': response.text, 'body': body, 'url': url,
+                      'code': response.status_code})
+                LOG.error(msg)
+                raise exception.IronicException(msg)
 
         error = result.get('command_error')
         LOG.debug('Agent command %(method)s for node %(node)s returned '
-                  'result %(res)s, error %(error)s, HTTP status code %(code)d',
+                  'result %(res)s, error %(error)s, HTTP status code %(code)s',
                   {'node': node.uuid, 'method': method,
                    'res': result.get('command_result'),
                    'error': error,
-                   'code': response.status_code})
-        if response.status_code >= http_client.BAD_REQUEST:
+                   'code': response.status_code if response is not None
+                   else 'unknown'})
+        if (response is not None
+                and response.status_code >= http_client.BAD_REQUEST):
             faultstring = result.get('faultstring')
             if 'agent_token' in faultstring:
                 LOG.error('Agent command %(method)s for node %(node)s '
@@ -319,6 +320,61 @@ class AgentClient(object):
                   {'node': node.uuid, 'status': status})
         return result
 
+    def _status_if_last_command_matches(self, node, method, params):
+        """Return the status of the given command if it's the last running."""
+        try:
+            method = method.split('.', 1)[1]
+        except IndexError:
+            pass
+
+        commands = self.get_commands_status(node)
+        if not commands:
+            return None
+
+        # TODO(dtantsur): a more reliable way to detect repeated execution
+        # would be to pass a sort of require ID to the agent.
+
+        command = commands[-1]
+        if command['command_name'] != method:
+            LOG.debug('Command %(cmd)s is not currently executing, the last '
+                      'command is %(curr)s',
+                      {'cmd': method, 'curr': command['command_name']})
+            return None
+
+        if command['command_status'] != 'RUNNING':
+            LOG.debug('Command %(cmd)s is not currently executing, its status '
+                      'is %(curr)s',
+                      {'cmd': method, 'curr': command['command_status']})
+            return None
+
+        return command
+
+    def _handle_timeout_on_command_execution(self, node, method, params,
+                                             error):
+        result = None
+        # NOTE(dtantsur): it is possible, especially with eventlet+TLS, that
+        # agent receives a command but fails to return the result to Ironic.
+        # To avoid a failure, check if the last command is the one we're trying
+        # to execute.
+        try:
+            result = self._status_if_last_command_matches(node, method, params)
+        except Exception as e:
+            msg = (_('Failed to connect to the agent running on node '
+                     '%(node)s for checking the last command status '
+                     'after failing to invoke command %(method)s. '
+                     'Error: %(error)s') %
+                   {'node': node.uuid, 'method': method, 'error': e})
+            LOG.error(msg)
+
+        if result is None:
+            msg = (_('Failed to connect to the agent running on node %(node)s '
+                     'for invoking command %(method)s. Error: %(error)s') %
+                   {'node': node.uuid, 'method': method, 'error': error})
+            LOG.error(msg)
+            raise exception.AgentConnectionFailed(reason=msg)
+
+        return result
+
     def get_last_command_status(self, node, method):
         """Get the last status for the given command.
 
diff --git a/ironic/tests/unit/drivers/modules/test_agent_client.py b/ironic/tests/unit/drivers/modules/test_agent_client.py
index 59f76aa15..193316576 100644
--- a/ironic/tests/unit/drivers/modules/test_agent_client.py
+++ b/ironic/tests/unit/drivers/modules/test_agent_client.py
@@ -193,6 +193,123 @@ class TestAgentClient(base.TestCase):
             timeout=60,
             verify=True)
         self.assertEqual(3, self.client.session.post.call_count)
+        self.assertTrue(self.client.session.get.called)
+
+    def test__command_fail_connect_no_command_running(self):
+        error = 'Boom'
+        self.client.session.post.side_effect = requests.ConnectionError(error)
+        self.client.session.get.return_value.json.return_value = {
+            'commands': []
+        }
+        method = 'foo.bar'
+        params = {}
+
+        url = self.client._get_command_url(self.node)
+        self.client._get_command_body(method, params)
+
+        e = self.assertRaises(exception.AgentConnectionFailed,
+                              self.client._command,
+                              self.node, method, params)
+        self.assertEqual('Connection to agent failed: Failed to connect to '
+                         'the agent running on node %(node)s for invoking '
+                         'command %(method)s. Error: %(error)s' %
+                         {'method': method, 'node': self.node.uuid,
+                          'error': error}, str(e))
+        self.client.session.post.assert_called_with(
+            url,
+            data=mock.ANY,
+            params={'wait': 'false'},
+            timeout=60,
+            verify=True)
+        self.assertEqual(3, self.client.session.post.call_count)
+        self.assertTrue(self.client.session.get.called)
+
+    def test__command_fail_connect_wrong_command_running(self):
+        error = 'Boom'
+        self.client.session.post.side_effect = requests.ConnectionError(error)
+        self.client.session.get.return_value.json.return_value = {
+            'commands': [
+                {'command_name': 'meow', 'command_status': 'RUNNING'},
+            ]
+        }
+        method = 'foo.bar'
+        params = {}
+
+        url = self.client._get_command_url(self.node)
+        self.client._get_command_body(method, params)
+
+        e = self.assertRaises(exception.AgentConnectionFailed,
+                              self.client._command,
+                              self.node, method, params)
+        self.assertEqual('Connection to agent failed: Failed to connect to '
+                         'the agent running on node %(node)s for invoking '
+                         'command %(method)s. Error: %(error)s' %
+                         {'method': method, 'node': self.node.uuid,
+                          'error': error}, str(e))
+        self.client.session.post.assert_called_with(
+            url,
+            data=mock.ANY,
+            params={'wait': 'false'},
+            timeout=60,
+            verify=True)
+        self.assertEqual(3, self.client.session.post.call_count)
+        self.assertTrue(self.client.session.get.called)
+
+    def test__command_fail_connect_command_not_running(self):
+        error = 'Boom'
+        self.client.session.post.side_effect = requests.ConnectionError(error)
+        self.client.session.get.return_value.json.return_value = {
+            'commands': [
+                {'command_name': 'bar', 'command_status': 'FINISHED'},
+            ]
+        }
+        method = 'foo.bar'
+        params = {}
+
+        url = self.client._get_command_url(self.node)
+        self.client._get_command_body(method, params)
+
+        e = self.assertRaises(exception.AgentConnectionFailed,
+                              self.client._command,
+                              self.node, method, params)
+        self.assertEqual('Connection to agent failed: Failed to connect to '
+                         'the agent running on node %(node)s for invoking '
+                         'command %(method)s. Error: %(error)s' %
+                         {'method': method, 'node': self.node.uuid,
+                          'error': error}, str(e))
+        self.client.session.post.assert_called_with(
+            url,
+            data=mock.ANY,
+            params={'wait': 'false'},
+            timeout=60,
+            verify=True)
+        self.assertEqual(3, self.client.session.post.call_count)
+        self.assertTrue(self.client.session.get.called)
+
+    def test__command_fail_connect_command_is_running(self):
+        error = 'Boom'
+        self.client.session.post.side_effect = requests.ConnectionError(error)
+        self.client.session.get.return_value.json.return_value = {
+            'commands': [
+                {'command_name': 'bar', 'command_status': 'RUNNING'},
+            ]
+        }
+        method = 'foo.bar'
+        params = {}
+
+        url = self.client._get_command_url(self.node)
+        self.client._get_command_body(method, params)
+
+        result = self.client._command(self.node, method, params)
+        self.assertEqual({'command_name': 'bar', 'command_status': 'RUNNING'},
+                         result)
+        self.client.session.post.assert_called_once_with(
+            url,
+            data=mock.ANY,
+            params={'wait': 'false'},
+            timeout=60,
+            verify=True)
+        self.assertTrue(self.client.session.get.called)
 
     def test__command_error_code(self):
         response_text = {"faultstring": "you dun goofd"}
diff --git a/releasenotes/notes/agent-last-command-4ec6967c995ba84a.yaml b/releasenotes/notes/agent-last-command-4ec6967c995ba84a.yaml
new file mode 100644
index 000000000..1ab3a4846
--- /dev/null
+++ b/releasenotes/notes/agent-last-command-4ec6967c995ba84a.yaml
@@ -0,0 +1,9 @@
+---
+fixes:
+  - |
+    If the agent accepts a command, but is unable to reply to Ironic (which
+    sporadically happens before of the eventlet's TLS implementation), we
+    currently retry the request and fail because the command is already
+    executing. Ironic now detects this situation by checking the list of
+    executing commands after receiving a connection error. If the requested
+    command is the last one, we assume that the command request succeeded.
author	Dmitry Tantsur <dtantsur@protonmail.com>	2021-05-07 10:38:16 +0200
committer	Dmitry Tantsur <dtantsur@protonmail.com>	2021-05-07 17:50:06 +0200
commit	abfe383cc32d63dd3df2b90f0feace08d73b8364 (patch)
tree	310f748209d92fbc2f19844623e22bc97efb8970
parent	2e5e2e5eb57bb152eeb77880dbdc2add7111b932 (diff)
download	ironic-abfe383cc32d63dd3df2b90f0feace08d73b8364.tar.gz