summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJulia Kreger <juliaashleykreger@gmail.com>2020-10-27 06:53:42 -0700
committerJulia Kreger <juliaashleykreger@gmail.com>2021-03-02 19:37:05 +0000
commitd1ffc6a557c6b3f3641bee2cfc9ec3be4649bdbe (patch)
tree8ff52b768ddc963b6f5ee733de657f00c4a99e37
parent90da180a17852eab6af4b038fbd92989feccb5f8 (diff)
downloadironic-d1ffc6a557c6b3f3641bee2cfc9ec3be4649bdbe.tar.gz
Handle agent still doing the prior command
The agent command exec model is based upon an incoming heartbeat, however heartbeats are independent and commands can take a long time. For example, software RAID setup in CI can encounter this. From an IPA log: [-] Picked root device /dev/md0 for node c6ca0af2-baec-40d6-879d-cbb5c751aafb based on root device hints {'name': '/dev/md0'} [-] Attempting to download image from http://199.204.45.248:3928/agent_images/ c6ca0af2-baec-40d6-879d-cbb5c751aafb [-] Executing command: standby.get_partition_uuids with args: {} execute_command /usr/local/lib/python3.6/site-packages/ironic_python_agent/extensions/base.py:255 [-] Tried to execute standby.get_partition_uuids, agent is still executing Command name: execute_deploy_step, params: {'step': {'interface': 'deploy', 'step': 'write_image', 'args': {'image_info': {'id': 'cb9e199a-af1b-4a6f-b00e-f284008b8046', 'urls': ['http://199.204.45.248:3928/agent_images/c6ca0af2-baec-40d6-879d-cbb5c751aafb'], 'disk_format': 'raw', 'container_format': 'bare', 'stream_raw_images': True, 'os_hash_algo': 'sha512', 'os_hash_value':<trimed> This was with code built on master, using master images. Inside the conductor log, it notes that it is likely an out of date agent because only AgentAPIError is evaluated, however any API error is evaluated this way. In reality, we need to explicitly flag *when* we have an error that is because we've tried to soon as something is already being worked upon. The result, is to evaluate and return an exception indicating work is already in flight. Update - It looks like, the original fix to prevent busy agent recognition did not fully detect all cases as getting steps is a command which can get skipped by accident with a busy agent, under certain circumstances. Change I5d86878b5ed6142ed2630adee78c0867c49b663f in ironic-python-agent also changed the string that was being checked for the previous handling, where we really should have just made the string we were checking lower case in ironic. Oh well! This should fix things right up. Story: 2008167 Task: 41175 Change-Id: Ia169640b7084d17d26f22e457c7af512db6d21d6 (cherry picked from commit 545dc2106b757e69d473c2997bbb2ac34fc131d3)
-rw-r--r--ironic/common/exception.py5
-rw-r--r--ironic/conductor/cleaning.py11
-rw-r--r--ironic/conductor/deployments.py10
-rw-r--r--ironic/drivers/modules/agent_base.py10
-rw-r--r--ironic/drivers/modules/agent_client.py37
-rw-r--r--ironic/tests/unit/conductor/test_cleaning.py31
-rw-r--r--ironic/tests/unit/conductor/test_deployments.py37
-rw-r--r--ironic/tests/unit/drivers/modules/test_agent_base.py4
-rw-r--r--ironic/tests/unit/drivers/modules/test_agent_client.py45
-rw-r--r--releasenotes/notes/fix-busy-agent-check-3cf75242b4783009.yaml6
10 files changed, 188 insertions, 8 deletions
diff --git a/ironic/common/exception.py b/ironic/common/exception.py
index 912121a96..c2e5030e8 100644
--- a/ironic/common/exception.py
+++ b/ironic/common/exception.py
@@ -813,3 +813,8 @@ class UnknownAttribute(ClientSideError):
else:
self.fieldname = name
super(UnknownAttribute, self).__init__(self.msg)
+
+
+class AgentInProgress(IronicException):
+ _msg_fmt = _('Node %(node)s command "%(command)s" failed. Agent is '
+ 'presently executing a command. Error %(error)s')
diff --git a/ironic/conductor/cleaning.py b/ironic/conductor/cleaning.py
index f4e6d0b68..a612f74bc 100644
--- a/ironic/conductor/cleaning.py
+++ b/ironic/conductor/cleaning.py
@@ -177,6 +177,17 @@ def do_next_clean_step(task, step_index):
else None)
task.process_event('wait', target_state=target_state)
return
+ if isinstance(e, exception.AgentInProgress):
+ LOG.info('Conductor attempted to process clean step for '
+ 'node %(node)s. Agent indicated it is presently '
+ 'executing a command. Error: %(error)s',
+ {'node': task.node.uuid,
+ 'error': e})
+ driver_internal_info['skip_current_clean_step'] = False
+ node.driver_internal_info = driver_internal_info
+ target_state = states.MANAGEABLE if manual_clean else None
+ task.process_event('wait', target_state=target_state)
+ return
msg = (_('Node %(node)s failed step %(step)s: '
'%(exc)s') %
diff --git a/ironic/conductor/deployments.py b/ironic/conductor/deployments.py
index 145f79db5..1c93d11ea 100644
--- a/ironic/conductor/deployments.py
+++ b/ironic/conductor/deployments.py
@@ -257,6 +257,16 @@ def do_next_deploy_step(task, step_index):
node.driver_internal_info = driver_internal_info
task.process_event('wait')
return
+ if isinstance(e, exception.AgentInProgress):
+ LOG.info('Conductor attempted to process deploy step for '
+ 'node %(node)s. Agent indicated it is presently '
+ 'executing a command. Error: %(error)s',
+ {'node': task.node.uuid,
+ 'error': e})
+ driver_internal_info['skip_current_deploy_step'] = False
+ node.driver_internal_info = driver_internal_info
+ task.process_event('wait')
+ return
log_msg = ('Node %(node)s failed deploy step %(step)s. Error: '
'%(err)s' %
{'node': node.uuid, 'step': node.deploy_step, 'err': e})
diff --git a/ironic/drivers/modules/agent_base.py b/ironic/drivers/modules/agent_base.py
index 4227389e7..c826e6a25 100644
--- a/ironic/drivers/modules/agent_base.py
+++ b/ironic/drivers/modules/agent_base.py
@@ -902,11 +902,11 @@ class AgentDeployMixin(HeartbeatMixin, AgentOobStepsMixin):
call = getattr(self._client, 'get_%s_steps' % step_type)
try:
agent_result = call(node, task.ports).get('command_result', {})
- except exception.AgentAPIError as exc:
- if 'agent is busy' in str(exc):
- LOG.debug('Agent is busy with a command, will refresh steps '
- 'on the next heartbeat')
- return
+ except exception.AgentInProgress as exc:
+ LOG.debug('Agent for node %(node)s is busy with a command, '
+ 'will refresh steps on the next heartbeat.',
+ {'node': task.node.uuid})
+ return
# TODO(dtantsur): change to just 'raise'
if step_type == 'clean':
diff --git a/ironic/drivers/modules/agent_client.py b/ironic/drivers/modules/agent_client.py
index a6f9083c3..f0c411c76 100644
--- a/ironic/drivers/modules/agent_client.py
+++ b/ironic/drivers/modules/agent_client.py
@@ -153,6 +153,8 @@ class AgentClient(object):
:raises: IronicException when failed to issue the request or there was
a malformed response from the agent.
:raises: AgentAPIError when agent failed to execute specified command.
+ :raises: AgentInProgress when the command fails to execute as the agent
+ is presently executing the prior command.
:returns: A dict containing command result from agent, see
get_commands_status for a sample.
"""
@@ -223,6 +225,17 @@ class AgentClient(object):
'Expected 2xx HTTP status code, got %(code)d.',
{'method': method, 'node': node.uuid,
'code': response.status_code})
+ if (response.status_code == http_client.CONFLICT
+ or 'agent is busy' in faultstring.lower()):
+ # HTTP 409 check as an explicit check of if the agent
+ # is already busy.
+ # NOTE(TheJulia): The agent sends upper case A as of
+ # late victoria, but lower case the entire message
+ # for compatability with pre-late victoria agents
+ # which returns HTTP 409.
+ raise exception.AgentInProgress(node=node.uuid,
+ command=method,
+ error=faultstring)
raise exception.AgentAPIError(node=node.uuid,
status=response.status_code,
error=faultstring)
@@ -337,6 +350,8 @@ class AgentClient(object):
:raises: IronicException when failed to issue the request or there was
a malformed response from the agent.
:raises: AgentAPIError when agent failed to execute specified command.
+ :raises: AgentInProgress when the command fails to execute as the agent
+ is presently executing the prior command.
:returns: A dict containing command status from agent.
See :func:`get_commands_status` for a command result sample.
"""
@@ -370,6 +385,8 @@ class AgentClient(object):
:raises: IronicException when failed to issue the request or there was
a malformed response from the agent.
:raises: AgentAPIError when agent failed to execute specified command.
+ :raises: AgentInProgress when the command fails to execute as the agent
+ is presently executing the prior command.
:returns: A dict containing command response from agent.
See :func:`get_commands_status` for a command result sample.
"""
@@ -400,6 +417,8 @@ class AgentClient(object):
:raises: IronicException when failed to issue the request or there was
a malformed response from the agent.
:raises: AgentAPIError when agent failed to execute specified command.
+ :raises: AgentInProgress when the command fails to execute as the agent
+ is presently executing the prior command.
:returns: A dict containing command response from agent.
See :func:`get_commands_status` for a command result sample.
"""
@@ -456,6 +475,8 @@ class AgentClient(object):
:raises: IronicException when failed to issue the request or there was
a malformed response from the agent.
:raises: AgentAPIError when agent failed to execute specified command.
+ :raises: AgentInProgress when the command fails to execute as the agent
+ is presently executing the prior command.
:returns: A dict containing command response from agent.
See :func:`get_commands_status` for a command result sample.
The value of key command_result is in the form of:
@@ -487,6 +508,8 @@ class AgentClient(object):
:raises: IronicException when failed to issue the request or there was
a malformed response from the agent.
:raises: AgentAPIError when agent failed to execute specified command.
+ :raises: AgentInProgress when the command fails to execute as the agent
+ is presently executing the prior command.
:returns: A dict containing command response from agent.
See :func:`get_commands_status` for a command result sample.
The value of key command_result is in the form of:
@@ -550,6 +573,8 @@ class AgentClient(object):
:raises: IronicException when failed to issue the request or there was
a malformed response from the agent.
:raises: AgentAPIError when agent failed to execute specified command.
+ :raises: AgentInProgress when the command fails to execute as the agent
+ is presently executing the prior command.
:returns: A dict containing command response from agent.
See :func:`get_commands_status` for a command result sample.
The value of key command_result is in the form of:
@@ -581,6 +606,8 @@ class AgentClient(object):
:raises: IronicException when failed to issue the request or there was
a malformed response from the agent.
:raises: AgentAPIError when agent failed to execute specified command.
+ :raises: AgentInProgress when the command fails to execute as the agent
+ is presently executing the prior command.
:returns: A dict containing command response from agent.
"""
@@ -597,6 +624,8 @@ class AgentClient(object):
:raises: IronicException when failed to issue the request or there was
a malformed response from the agent.
:raises: AgentAPIError when agent failed to execute specified command.
+ :raises: AgentInProgress when the command fails to execute as the agent
+ is presently executing the prior command.
:returns: A dict containing command response from agent.
See :func:`get_commands_status` for a command result sample.
"""
@@ -612,6 +641,8 @@ class AgentClient(object):
:raises: IronicException when failed to issue the request or there was
a malformed response from the agent.
:raises: AgentAPIError when agent failed to execute specified command.
+ :raises: AgentInProgress when the command fails to execute as the agent
+ is presently executing the prior command.
:returns: A dict containing command response from agent.
See :func:`get_commands_status` for a command result sample.
"""
@@ -627,6 +658,8 @@ class AgentClient(object):
:raises: IronicException when failed to issue the request or there was
a malformed response from the agent.
:raises: AgentAPIError when agent failed to execute specified command.
+ :raises: AgentInProgress when the command fails to execute as the agent
+ is presently executing the prior command.
:returns: A dict containing command response from agent.
See :func:`get_commands_status` for a command result sample.
"""
@@ -643,6 +676,8 @@ class AgentClient(object):
:raises: IronicException when failed to issue the request or there was
a malformed response from the agent.
:raises: AgentAPIError when agent failed to execute specified command.
+ :raises: AgentInProgress when the command fails to execute as the agent
+ is presently executing the prior command.
:returns: A dict containing command response from agent.
See :func:`get_commands_status` for a command result sample.
"""
@@ -660,6 +695,8 @@ class AgentClient(object):
to issue the request, or there was a malformed response from
the agent.
:raises: AgentAPIError when agent failed to execute specified command.
+ :raises: AgentInProgress when the command fails to execute as the agent
+ is presently executing the prior command.
:raises: InstanceRescueFailure when the agent ramdisk is too old
to support transmission of the rescue password.
:returns: A dict containing command response from agent.
diff --git a/ironic/tests/unit/conductor/test_cleaning.py b/ironic/tests/unit/conductor/test_cleaning.py
index 59cf9e37b..685c8f6bd 100644
--- a/ironic/tests/unit/conductor/test_cleaning.py
+++ b/ironic/tests/unit/conductor/test_cleaning.py
@@ -783,6 +783,37 @@ class DoNodeCleanTestCase(db_base.DbTestCase):
@mock.patch('ironic.drivers.modules.fake.FakeDeploy.execute_clean_step',
autospec=True)
+ def test_do_next_clean_step_agent_busy(self, mock_execute):
+ # When a clean step fails, go to CLEANWAIT
+ tgt_prov_state = states.MANAGEABLE
+
+ node = obj_utils.create_test_node(
+ self.context, driver='fake-hardware',
+ provision_state=states.CLEANING,
+ target_provision_state=tgt_prov_state,
+ last_error=None,
+ driver_internal_info={'clean_steps': self.clean_steps,
+ 'clean_step_index': None,
+ 'cleaning_reboot': True},
+ clean_step={})
+ mock_execute.side_effect = exception.AgentInProgress(
+ reason='still meowing')
+ with task_manager.acquire(
+ self.context, node.uuid, shared=False) as task:
+ cleaning.do_next_clean_step(task, 0)
+
+ node.refresh()
+ # Make sure we go to CLEANWAIT
+ self.assertEqual(states.CLEANWAIT, node.provision_state)
+ self.assertEqual(tgt_prov_state, node.target_provision_state)
+ self.assertEqual(self.clean_steps[0], node.clean_step)
+ self.assertEqual(0, node.driver_internal_info['clean_step_index'])
+ self.assertFalse(node.driver_internal_info['skip_current_clean_step'])
+ mock_execute.assert_called_once_with(
+ mock.ANY, mock.ANY, self.clean_steps[0])
+
+ @mock.patch('ironic.drivers.modules.fake.FakeDeploy.execute_clean_step',
+ autospec=True)
def test_do_next_clean_step_oob_reboot_last_step(self, mock_execute):
# Resume where last_step is the last cleaning step
tgt_prov_state = states.MANAGEABLE
diff --git a/ironic/tests/unit/conductor/test_deployments.py b/ironic/tests/unit/conductor/test_deployments.py
index e305d63ff..5999a5ade 100644
--- a/ironic/tests/unit/conductor/test_deployments.py
+++ b/ironic/tests/unit/conductor/test_deployments.py
@@ -767,7 +767,42 @@ class DoNextDeployStepTestCase(mgr_utils.ServiceSetUpMixin,
self._stop_service()
node.refresh()
- # Make sure we go to CLEANWAIT
+ # Make sure we go to DEPLOYWAIT
+ self.assertEqual(states.DEPLOYWAIT, node.provision_state)
+ self.assertEqual(tgt_prov_state, node.target_provision_state)
+ self.assertEqual(self.deploy_steps[0], node.deploy_step)
+ self.assertEqual(0, node.driver_internal_info['deploy_step_index'])
+ self.assertFalse(node.driver_internal_info['skip_current_deploy_step'])
+ mock_execute.assert_called_once_with(
+ mock.ANY, mock.ANY, self.deploy_steps[0])
+
+ @mock.patch('ironic.drivers.modules.fake.FakeDeploy.execute_deploy_step',
+ autospec=True)
+ def test_do_next_deploy_step_agent_busy(self, mock_execute):
+ # When a deploy step fails, go to DEPLOYWAIT
+ tgt_prov_state = states.ACTIVE
+
+ self._start_service()
+ node = obj_utils.create_test_node(
+ self.context, driver='fake-hardware',
+ provision_state=states.DEPLOYING,
+ target_provision_state=tgt_prov_state,
+ last_error=None,
+ driver_internal_info={'deploy_steps': self.deploy_steps,
+ 'deploy_step_index': None,
+ 'deployment_reboot': True},
+ clean_step={})
+ mock_execute.side_effect = exception.AgentInProgress(
+ reason='meow')
+
+ with task_manager.acquire(
+ self.context, node.uuid, shared=False) as task:
+ deployments.do_next_deploy_step(task, 0)
+
+ self._stop_service()
+ node.refresh()
+
+ # Make sure we go to DEPLOYWAIT
self.assertEqual(states.DEPLOYWAIT, node.provision_state)
self.assertEqual(tgt_prov_state, node.target_provision_state)
self.assertEqual(self.deploy_steps[0], node.deploy_step)
diff --git a/ironic/tests/unit/drivers/modules/test_agent_base.py b/ironic/tests/unit/drivers/modules/test_agent_base.py
index b533e9f8c..8c1b3cfdb 100644
--- a/ironic/tests/unit/drivers/modules/test_agent_base.py
+++ b/ironic/tests/unit/drivers/modules/test_agent_base.py
@@ -2181,8 +2181,8 @@ class TestRefreshCleanSteps(AgentDeployMixinBaseTest):
@mock.patch.object(agent_client.AgentClient, 'get_deploy_steps',
autospec=True)
def test_refresh_steps_busy(self, client_mock, log_mock):
- client_mock.side_effect = exception.AgentAPIError(
- node="node", status="500", error='agent is busy')
+ client_mock.side_effect = exception.AgentInProgress(
+ node="node", error='Agent is busy : maximum meowing')
with task_manager.acquire(
self.context, self.node.uuid, shared=False) as task:
diff --git a/ironic/tests/unit/drivers/modules/test_agent_client.py b/ironic/tests/unit/drivers/modules/test_agent_client.py
index e5463e7c1..0823d75a8 100644
--- a/ironic/tests/unit/drivers/modules/test_agent_client.py
+++ b/ironic/tests/unit/drivers/modules/test_agent_client.py
@@ -237,6 +237,51 @@ class TestAgentClient(base.TestCase):
timeout=60,
verify=True)
+ def test__command_error_code_agent_busy(self):
+ # Victoria and previous busy status check.
+ response_text = {"faultstring": "Agent is busy - meowing"}
+ self.client.session.post.return_value = MockResponse(
+ response_text, status_code=http_client.BAD_REQUEST)
+ method = 'standby.run_image'
+ image_info = {'image_id': 'test_image'}
+ params = {'image_info': image_info}
+
+ url = self.client._get_command_url(self.node)
+ body = self.client._get_command_body(method, params)
+
+ self.assertRaises(exception.AgentInProgress,
+ self.client._command,
+ self.node, method, params)
+ self.client.session.post.assert_called_once_with(
+ url,
+ data=body,
+ params={'wait': 'false'},
+ timeout=60,
+ verify=True)
+
+ def test__command_error_code_agent_busy_conflict(self):
+ # Post Wallaby logic as the IPA return code changed to
+ # better delineate the state.
+ response_text = {"faultstring": "lolcat says busy meowing"}
+ self.client.session.post.return_value = MockResponse(
+ response_text, status_code=http_client.CONFLICT)
+ method = 'standby.run_image'
+ image_info = {'image_id': 'test_image'}
+ params = {'image_info': image_info}
+
+ url = self.client._get_command_url(self.node)
+ body = self.client._get_command_body(method, params)
+
+ self.assertRaises(exception.AgentInProgress,
+ self.client._command,
+ self.node, method, params)
+ self.client.session.post.assert_called_once_with(
+ url,
+ data=body,
+ params={'wait': 'false'},
+ timeout=60,
+ verify=True)
+
@mock.patch('os.path.exists', autospec=True, return_value=True)
def test__command_verify(self, mock_exists):
response_data = {'status': 'ok'}
diff --git a/releasenotes/notes/fix-busy-agent-check-3cf75242b4783009.yaml b/releasenotes/notes/fix-busy-agent-check-3cf75242b4783009.yaml
new file mode 100644
index 000000000..df490b178
--- /dev/null
+++ b/releasenotes/notes/fix-busy-agent-check-3cf75242b4783009.yaml
@@ -0,0 +1,6 @@
+---
+fixes:
+ - |
+ Fixes recognition of a busy agent to also handle recognition
+ during deployment steps by more uniformly detecting and
+ identifying when the ``ironic-python-agent`` service is busy.