summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJulia Kreger <juliaashleykreger@gmail.com>2020-08-25 14:37:16 -0400
committerJulia Kreger <juliaashleykreger@gmail.com>2020-08-25 14:37:16 -0400
commit800a5c4d147aaa41834f159c210417e7ed5a7eb2 (patch)
tree6134dd073dac7a54d01f44c46bee70c277b50202
parent918478b4e14876dbc5dd9a6f40951979751276fd (diff)
downloadironic-800a5c4d147aaa41834f159c210417e7ed5a7eb2.tar.gz
Fix for failure in cleaning
The cleaning operation may fail, if an in-band clean step were to execute after the completion of out-of-band clean step that performs reboot of the node. The failure is caused because of race condition where in cleaning is resumed before the Ironic Python Agent(IPA) is ready to execute clean steps. Story: #2002731 Task: #22580 Change-Id: Idaacb9fbb1ea3ac82cdb6769df05d8206660c8cb (cherry picked from commit 7c5a04c1149f14900f504f32e000a7b4e69e661f)
-rw-r--r--ironic/common/exception.py4
-rw-r--r--ironic/conductor/manager.py14
-rw-r--r--ironic/drivers/modules/agent_base_vendor.py1
-rw-r--r--ironic/drivers/modules/agent_client.py6
-rw-r--r--ironic/drivers/modules/drac/raid.py4
-rw-r--r--ironic/tests/unit/conductor/test_manager.py108
-rw-r--r--ironic/tests/unit/drivers/modules/test_agent_client.py18
-rw-r--r--releasenotes/notes/resume-cleaning-post-oob-reboot-b76c23f98219a8d2.yaml10
8 files changed, 165 insertions, 0 deletions
diff --git a/ironic/common/exception.py b/ironic/common/exception.py
index 17c07db8d..50f7a6fe6 100644
--- a/ironic/common/exception.py
+++ b/ironic/common/exception.py
@@ -774,3 +774,7 @@ class InstanceRescueFailure(IronicException):
class InstanceUnrescueFailure(IronicException):
_msg_fmt = _('Failed to unrescue instance %(instance)s for node '
'%(node)s: %(reason)s')
+
+
+class AgentConnectionFailed(IronicException):
+ _msg_fmt = _("Connection to agent failed: %(reason)s")
diff --git a/ironic/conductor/manager.py b/ironic/conductor/manager.py
index c1d23fe7c..37c5f72f9 100644
--- a/ironic/conductor/manager.py
+++ b/ironic/conductor/manager.py
@@ -1253,6 +1253,19 @@ class ConductorManager(base_manager.BaseConductorManager):
try:
result = interface.execute_clean_step(task, step)
except Exception as e:
+ if isinstance(e, exception.AgentConnectionFailed):
+ if task.node.driver_internal_info.get('cleaning_reboot'):
+ LOG.info('Agent is not yet running on node %(node)s '
+ 'after cleaning reboot, waiting for agent to '
+ 'come up to run next clean step %(step)s.',
+ {'node': node.uuid, 'step': step})
+ driver_internal_info['skip_current_clean_step'] = False
+ node.driver_internal_info = driver_internal_info
+ target_state = (states.MANAGEABLE if manual_clean
+ else None)
+ task.process_event('wait', target_state=target_state)
+ return
+
msg = (_('Node %(node)s failed step %(step)s: '
'%(exc)s') %
{'node': node.uuid, 'exc': e,
@@ -1286,6 +1299,7 @@ class ConductorManager(base_manager.BaseConductorManager):
node.clean_step = None
driver_internal_info['clean_steps'] = None
driver_internal_info.pop('clean_step_index', None)
+ driver_internal_info.pop('cleaning_reboot', None)
node.driver_internal_info = driver_internal_info
node.save()
try:
diff --git a/ironic/drivers/modules/agent_base_vendor.py b/ironic/drivers/modules/agent_base_vendor.py
index 8dd50d378..830d2951d 100644
--- a/ironic/drivers/modules/agent_base_vendor.py
+++ b/ironic/drivers/modules/agent_base_vendor.py
@@ -472,6 +472,7 @@ class AgentDeployMixin(HeartbeatMixin):
info = task.node.driver_internal_info
info.pop('cleaning_reboot', None)
task.node.driver_internal_info = info
+ task.node.save()
_notify_conductor_resume_clean(task)
return
else:
diff --git a/ironic/drivers/modules/agent_client.py b/ironic/drivers/modules/agent_client.py
index e9624b00a..f54837801 100644
--- a/ironic/drivers/modules/agent_client.py
+++ b/ironic/drivers/modules/agent_client.py
@@ -64,6 +64,12 @@ class AgentClient(object):
try:
response = self.session.post(url, params=request_params, data=body)
+ except requests.ConnectionError as e:
+ msg = (_('Failed to invoke agent command %(method)s for node '
+ '%(node)s. Error: %(error)s') %
+ {'method': method, 'node': node.uuid, 'error': e})
+ LOG.error(msg)
+ raise exception.AgentConnectionFailed(reason=msg)
except requests.RequestException as e:
msg = (_('Error invoking agent command %(method)s for node '
'%(node)s. Error: %(error)s') %
diff --git a/ironic/drivers/modules/drac/raid.py b/ironic/drivers/modules/drac/raid.py
index bd2a3f5ac..584dd5aaf 100644
--- a/ironic/drivers/modules/drac/raid.py
+++ b/ironic/drivers/modules/drac/raid.py
@@ -895,4 +895,8 @@ class DracRAID(base.RAIDInterface):
def _resume_cleaning(self, task):
raid_common.update_raid_info(
task.node, self.get_logical_disks(task))
+ driver_internal_info = task.node.driver_internal_info
+ driver_internal_info['cleaning_reboot'] = True
+ task.node.driver_internal_info = driver_internal_info
+ task.node.save()
agent_base_vendor._notify_conductor_resume_clean(task)
diff --git a/ironic/tests/unit/conductor/test_manager.py b/ironic/tests/unit/conductor/test_manager.py
index 29f2bc527..cf57f65bc 100644
--- a/ironic/tests/unit/conductor/test_manager.py
+++ b/ironic/tests/unit/conductor/test_manager.py
@@ -2780,6 +2780,114 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
def test__do_next_clean_step_manual_execute_fail(self):
self._do_next_clean_step_execute_fail(manual=True)
+ @mock.patch('ironic.drivers.modules.fake.FakeDeploy.execute_clean_step',
+ autospec=True)
+ def test_do_next_clean_step_oob_reboot(self, mock_execute):
+ # When a clean step fails, go to CLEANWAIT
+ tgt_prov_state = states.MANAGEABLE
+
+ self._start_service()
+ node = obj_utils.create_test_node(
+ self.context, driver='fake-hardware',
+ provision_state=states.CLEANING,
+ target_provision_state=tgt_prov_state,
+ last_error=None,
+ driver_internal_info={'clean_steps': self.clean_steps,
+ 'clean_step_index': None,
+ 'cleaning_reboot': True},
+ clean_step={})
+ mock_execute.side_effect = exception.AgentConnectionFailed(
+ reason='failed')
+
+ with task_manager.acquire(
+ self.context, node.uuid, shared=False) as task:
+ self.service._do_next_clean_step(task, 0)
+
+ self._stop_service()
+ node.refresh()
+
+ # Make sure we go to CLEANWAIT
+ self.assertEqual(states.CLEANWAIT, node.provision_state)
+ self.assertEqual(tgt_prov_state, node.target_provision_state)
+ self.assertEqual(self.clean_steps[0], node.clean_step)
+ self.assertEqual(0, node.driver_internal_info['clean_step_index'])
+ self.assertFalse(node.driver_internal_info['skip_current_clean_step'])
+ mock_execute.assert_called_once_with(
+ mock.ANY, mock.ANY, self.clean_steps[0])
+
+ @mock.patch('ironic.drivers.modules.fake.FakeDeploy.execute_clean_step',
+ autospec=True)
+ def test_do_next_clean_step_oob_reboot_last_step(self, mock_execute):
+ # Resume where last_step is the last cleaning step
+ tgt_prov_state = states.MANAGEABLE
+ info = {'clean_steps': self.clean_steps,
+ 'cleaning_reboot': True,
+ 'clean_step_index': len(self.clean_steps) - 1}
+
+ self._start_service()
+ node = obj_utils.create_test_node(
+ self.context, driver='fake-hardware',
+ provision_state=states.CLEANING,
+ target_provision_state=tgt_prov_state,
+ last_error=None,
+ driver_internal_info=info,
+ clean_step=self.clean_steps[-1])
+
+ with task_manager.acquire(
+ self.context, node.uuid, shared=False) as task:
+ self.service._do_next_clean_step(task, None)
+
+ self._stop_service()
+ node.refresh()
+
+ # Cleaning should be complete without calling additional steps
+ self.assertEqual(tgt_prov_state, node.provision_state)
+ self.assertEqual(states.NOSTATE, node.target_provision_state)
+ self.assertEqual({}, node.clean_step)
+ self.assertNotIn('clean_step_index', node.driver_internal_info)
+ self.assertNotIn('cleaning_reboot', node.driver_internal_info)
+ self.assertIsNone(node.driver_internal_info['clean_steps'])
+ self.assertFalse(mock_execute.called)
+
+ @mock.patch('ironic.drivers.modules.fake.FakeDeploy.execute_clean_step',
+ autospec=True)
+ @mock.patch.object(fake.FakeDeploy, 'tear_down_cleaning', autospec=True)
+ def test_do_next_clean_step_oob_reboot_fail(self, tear_mock,
+ mock_execute):
+ # When a clean step fails with no reboot requested go to CLEANFAIL
+ tgt_prov_state = states.MANAGEABLE
+
+ self._start_service()
+ node = obj_utils.create_test_node(
+ self.context, driver='fake-hardware',
+ provision_state=states.CLEANING,
+ target_provision_state=tgt_prov_state,
+ last_error=None,
+ driver_internal_info={'clean_steps': self.clean_steps,
+ 'clean_step_index': None},
+ clean_step={})
+ mock_execute.side_effect = exception.AgentConnectionFailed(
+ reason='failed')
+
+ with task_manager.acquire(
+ self.context, node.uuid, shared=False) as task:
+ self.service._do_next_clean_step(task, 0)
+ tear_mock.assert_called_once_with(task.driver.deploy, task)
+
+ self._stop_service()
+ node.refresh()
+
+ # Make sure we go to CLEANFAIL, clear clean_steps
+ self.assertEqual(states.CLEANFAIL, node.provision_state)
+ self.assertEqual(tgt_prov_state, node.target_provision_state)
+ self.assertEqual({}, node.clean_step)
+ self.assertNotIn('clean_step_index', node.driver_internal_info)
+ self.assertNotIn('skip_current_clean_step', node.driver_internal_info)
+ self.assertIsNotNone(node.last_error)
+ self.assertTrue(node.maintenance)
+ mock_execute.assert_called_once_with(
+ mock.ANY, mock.ANY, self.clean_steps[0])
+
@mock.patch.object(manager, 'LOG', autospec=True)
@mock.patch('ironic.drivers.modules.fake.FakeDeploy.execute_clean_step',
autospec=True)
diff --git a/ironic/tests/unit/drivers/modules/test_agent_client.py b/ironic/tests/unit/drivers/modules/test_agent_client.py
index f8396b3ce..6ddc7d2fd 100644
--- a/ironic/tests/unit/drivers/modules/test_agent_client.py
+++ b/ironic/tests/unit/drivers/modules/test_agent_client.py
@@ -133,6 +133,24 @@ class TestAgentClient(base.TestCase):
{'method': method, 'node': self.node.uuid,
'error': error}, str(e))
+ def test__command_fail_connect(self):
+ error = 'Boom'
+ self.client.session.post.side_effect = requests.ConnectionError(error)
+ method = 'foo.bar'
+ params = {}
+
+ self.client._get_command_url(self.node)
+ self.client._get_command_body(method, params)
+
+ e = self.assertRaises(exception.AgentConnectionFailed,
+ self.client._command,
+ self.node, method, params)
+ self.assertEqual('Connection to agent failed: Failed to invoke '
+ 'agent command %(method)s for node %(node)s. '
+ 'Error: %(error)s' %
+ {'method': method, 'node': self.node.uuid,
+ 'error': error}, str(e))
+
def test__command_error_code(self):
response_text = '{"faultstring": "you dun goofd"}'
self.client.session.post.return_value = MockResponse(
diff --git a/releasenotes/notes/resume-cleaning-post-oob-reboot-b76c23f98219a8d2.yaml b/releasenotes/notes/resume-cleaning-post-oob-reboot-b76c23f98219a8d2.yaml
new file mode 100644
index 000000000..4b1e36a9c
--- /dev/null
+++ b/releasenotes/notes/resume-cleaning-post-oob-reboot-b76c23f98219a8d2.yaml
@@ -0,0 +1,10 @@
+---
+fixes:
+ - |
+ The cleaning operation may fail, if an in-band clean step were to
+ execute after the completion of out-of-band clean step that
+ performs reboot of the node. The failure is caused because of race
+ condition where in cleaning is resumed before the Ironic Python
+ Agent(IPA) is ready to execute clean steps. This has been fixed.
+ For more information, see `bug 2002731
+ <https://storyboard.openstack.org/#!/story/2002731>`_.