summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorShivanand Tendulker <stendulker@gmail.com>2019-08-01 08:20:00 -0400
committerShivanand Tendulker <stendulker@gmail.com>2019-08-20 03:56:31 -0400
commitba0e73fa1550339f696431490fdb7d7723ef0903 (patch)
tree45cfc1b8ba3753c5e81b2605f1e6b693aeacdfbd
parentcdf5bca715cf5f1382614539426ec5536cb4e314 (diff)
downloadironic-ba0e73fa1550339f696431490fdb7d7723ef0903.tar.gz
Asynchronous out of band deploy steps fails to execute
Asynchronous out of band steps in a deploy template fails to execute. This commit fixes that issue. Asynchronous steps can set 'skip_current_deploy_step' flag to False in 'driver_internal_info' to make sure that upon reboot same step is re-executed. Also it can set 'deployment_reboot' flag to True in 'driver_internal_info' to signal that it has rebooted the node. Co-Authored-By: Mark Goddard <mark@stackhpc.com> Change-Id: If6217afb5453c311d5ca71ba37458a9b97c18395 Story: 2006342 Task: 36095 (cherry picked from commit 8f907886a1ed0de70c34aef84ba892c3e6a5cd49)
-rw-r--r--ironic/conductor/manager.py30
-rw-r--r--ironic/conductor/utils.py2
-rw-r--r--ironic/drivers/modules/agent.py9
-rw-r--r--ironic/drivers/modules/agent_base_vendor.py3
-rw-r--r--ironic/drivers/modules/iscsi_deploy.py11
-rw-r--r--ironic/tests/unit/conductor/test_manager.py117
-rw-r--r--ironic/tests/unit/conductor/test_utils.py1
-rw-r--r--ironic/tests/unit/drivers/modules/test_agent.py17
-rw-r--r--ironic/tests/unit/drivers/modules/test_agent_base_vendor.py6
-rw-r--r--ironic/tests/unit/drivers/modules/test_iscsi_deploy.py21
-rw-r--r--releasenotes/notes/fixes-execution-of-out-of-band-deploy-steps-1f5967e7bfcabbf9.yaml6
11 files changed, 213 insertions, 10 deletions
diff --git a/ironic/conductor/manager.py b/ironic/conductor/manager.py
index 7070077ac..466e96c53 100644
--- a/ironic/conductor/manager.py
+++ b/ironic/conductor/manager.py
@@ -875,8 +875,9 @@ class ConductorManager(base_manager.BaseConductorManager):
action=event, node=task.node.uuid,
state=task.node.provision_state)
- def _get_node_next_deploy_steps(self, task):
- return self._get_node_next_steps(task, 'deploy')
+ def _get_node_next_deploy_steps(self, task, skip_current_step=True):
+ return self._get_node_next_steps(task, 'deploy',
+ skip_current_step=skip_current_step)
@METRICS.timer('ConductorManager.continue_node_deploy')
def continue_node_deploy(self, context, node_id):
@@ -914,7 +915,17 @@ class ConductorManager(base_manager.BaseConductorManager):
'state': node.provision_state,
'deploy_state': ', '.join(expected_states)})
- next_step_index = self._get_node_next_deploy_steps(task)
+ info = node.driver_internal_info
+ try:
+ skip_current_step = info.pop('skip_current_deploy_step')
+ except KeyError:
+ skip_current_step = True
+ else:
+ node.driver_internal_info = info
+ node.save()
+
+ next_step_index = self._get_node_next_deploy_steps(
+ task, skip_current_step=skip_current_step)
# TODO(rloo): When deprecation period is over and node is in
# states.DEPLOYWAIT only, delete the 'if' and always 'resume'.
@@ -3836,6 +3847,16 @@ def _do_next_deploy_step(task, step_index, conductor_id):
try:
result = interface.execute_deploy_step(task, step)
except exception.IronicException as e:
+ if isinstance(e, exception.AgentConnectionFailed):
+ if task.node.driver_internal_info.get('deployment_reboot'):
+ LOG.info('Agent is not yet running on node %(node)s after '
+ 'deployment reboot, waiting for agent to come up '
+ 'to run next deploy step %(step)s.',
+ {'node': node.uuid, 'step': step})
+ driver_internal_info['skip_current_deploy_step'] = False
+ node.driver_internal_info = driver_internal_info
+ task.process_event('wait')
+ return
log_msg = ('Node %(node)s failed deploy step %(step)s. Error: '
'%(err)s' %
{'node': node.uuid, 'step': node.deploy_step, 'err': e})
@@ -3860,7 +3881,7 @@ def _do_next_deploy_step(task, step_index, conductor_id):
node.save()
# Check if the step is done or not. The step should return
- # states.CLEANWAIT if the step is still being executed, or
+ # states.DEPLOYWAIT if the step is still being executed, or
# None if the step is done.
# NOTE(deva): Some drivers may return states.DEPLOYWAIT
# eg. if they are waiting for a callback
@@ -3890,6 +3911,7 @@ def _do_next_deploy_step(task, step_index, conductor_id):
driver_internal_info = node.driver_internal_info
driver_internal_info['deploy_steps'] = None
driver_internal_info.pop('deploy_step_index', None)
+ driver_internal_info.pop('deployment_reboot', None)
node.driver_internal_info = driver_internal_info
node.save()
diff --git a/ironic/conductor/utils.py b/ironic/conductor/utils.py
index 88d9b3814..6778b208e 100644
--- a/ironic/conductor/utils.py
+++ b/ironic/conductor/utils.py
@@ -461,6 +461,8 @@ def deploying_error_handler(task, logmsg, errmsg, traceback=False,
node.deploy_step = {}
info = node.driver_internal_info
info.pop('deploy_step_index', None)
+ # Clear any leftover metadata about deployment reboots
+ info.pop('deployment_reboot', None)
node.driver_internal_info = info
if cleanup_err:
diff --git a/ironic/drivers/modules/agent.py b/ironic/drivers/modules/agent.py
index b4f640e30..9e1a067a0 100644
--- a/ironic/drivers/modules/agent.py
+++ b/ironic/drivers/modules/agent.py
@@ -463,7 +463,14 @@ class AgentDeploy(AgentDeployMixin, base.DeployInterface):
task.process_event('wait')
self.continue_deploy(task)
elif task.driver.storage.should_write_image(task):
- manager_utils.node_power_action(task, states.REBOOT)
+ # Check if the driver has already performed a reboot in a previous
+ # deploy step.
+ if not task.node.driver_internal_info.get('deployment_reboot'):
+ manager_utils.node_power_action(task, states.REBOOT)
+ info = task.node.driver_internal_info
+ info.pop('deployment_reboot', None)
+ task.node.driver_internal_info = info
+ task.node.save()
return states.DEPLOYWAIT
else:
# TODO(TheJulia): At some point, we should de-dupe this code
diff --git a/ironic/drivers/modules/agent_base_vendor.py b/ironic/drivers/modules/agent_base_vendor.py
index fff521673..1857e82c4 100644
--- a/ironic/drivers/modules/agent_base_vendor.py
+++ b/ironic/drivers/modules/agent_base_vendor.py
@@ -366,6 +366,9 @@ class HeartbeatMixin(object):
else:
node.touch_provisioning()
else:
+ # The exceptions from RPC are not possible as we using cast
+ # here
+ manager_utils.notify_conductor_resume_deploy(task)
node.touch_provisioning()
elif node.provision_state == states.CLEANWAIT:
node.touch_provisioning()
diff --git a/ironic/drivers/modules/iscsi_deploy.py b/ironic/drivers/modules/iscsi_deploy.py
index b663db697..e6932a19a 100644
--- a/ironic/drivers/modules/iscsi_deploy.py
+++ b/ironic/drivers/modules/iscsi_deploy.py
@@ -421,7 +421,16 @@ class ISCSIDeploy(AgentDeployMixin, base.DeployInterface):
# Standard deploy process
deploy_utils.cache_instance_image(task.context, node)
check_image_size(task)
- manager_utils.node_power_action(task, states.REBOOT)
+ # Check if the driver has already performed a reboot in a previous
+ # deploy step.
+ if not task.node.driver_internal_info.get('deployment_reboot',
+ False):
+ manager_utils.node_power_action(task, states.REBOOT)
+ info = task.node.driver_internal_info
+ info.pop('deployment_reboot', None)
+ task.node.driver_internal_info = info
+ task.node.save()
+
return states.DEPLOYWAIT
else:
# Boot to an Storage Volume
diff --git a/ironic/tests/unit/conductor/test_manager.py b/ironic/tests/unit/conductor/test_manager.py
index c0c2f6645..5d6b059ac 100644
--- a/ironic/tests/unit/conductor/test_manager.py
+++ b/ironic/tests/unit/conductor/test_manager.py
@@ -1845,6 +1845,109 @@ class ContinueNodeDeployTestCase(mgr_utils.ServiceSetUpMixin,
mock.ANY, 1, mock.ANY)
self.assertFalse(mock_event.called)
+ @mock.patch('ironic.conductor.manager.ConductorManager._spawn_worker',
+ autospec=True)
+ def _continue_node_deploy_skip_step(self, mock_spawn, skip=True):
+ # test that skipping current step mechanism works
+ driver_info = {'deploy_steps': self.deploy_steps,
+ 'deploy_step_index': 0}
+ if not skip:
+ driver_info['skip_current_deploy_step'] = skip
+ node = obj_utils.create_test_node(
+ self.context, driver='fake-hardware',
+ provision_state=states.DEPLOYWAIT,
+ target_provision_state=states.MANAGEABLE,
+ driver_internal_info=driver_info, deploy_step=self.deploy_steps[0])
+ self._start_service()
+ self.service.continue_node_deploy(self.context, node.uuid)
+ self._stop_service()
+ node.refresh()
+ if skip:
+ expected_step_index = 1
+ else:
+ self.assertNotIn(
+ 'skip_current_deploy_step', node.driver_internal_info)
+ expected_step_index = 0
+ mock_spawn.assert_called_with(mock.ANY, manager._do_next_deploy_step,
+ mock.ANY, expected_step_index, mock.ANY)
+
+ def test_continue_node_deploy_skip_step(self):
+ self._continue_node_deploy_skip_step()
+
+ def test_continue_node_deploy_no_skip_step(self):
+ self._continue_node_deploy_skip_step(skip=False)
+
+ @mock.patch('ironic.drivers.modules.fake.FakeDeploy.execute_deploy_step',
+ autospec=True)
+ def test_do_next_deploy_step_oob_reboot(self, mock_execute):
+ # When a deploy step fails, go to DEPLOYWAIT
+ tgt_prov_state = states.ACTIVE
+
+ self._start_service()
+ node = obj_utils.create_test_node(
+ self.context, driver='fake-hardware',
+ provision_state=states.DEPLOYING,
+ target_provision_state=tgt_prov_state,
+ last_error=None,
+ driver_internal_info={'deploy_steps': self.deploy_steps,
+ 'deploy_step_index': None,
+ 'deployment_reboot': True},
+ clean_step={})
+ mock_execute.side_effect = exception.AgentConnectionFailed(
+ reason='failed')
+
+ with task_manager.acquire(
+ self.context, node.uuid, shared=False) as task:
+ manager._do_next_deploy_step(task, 0, mock.ANY)
+
+ self._stop_service()
+ node.refresh()
+
+ # Make sure we go to CLEANWAIT
+ self.assertEqual(states.DEPLOYWAIT, node.provision_state)
+ self.assertEqual(tgt_prov_state, node.target_provision_state)
+ self.assertEqual(self.deploy_steps[0], node.deploy_step)
+ self.assertEqual(0, node.driver_internal_info['deploy_step_index'])
+ self.assertFalse(node.driver_internal_info['skip_current_deploy_step'])
+ mock_execute.assert_called_once_with(
+ mock.ANY, mock.ANY, self.deploy_steps[0])
+
+ @mock.patch('ironic.drivers.modules.fake.FakeDeploy.execute_deploy_step',
+ autospec=True)
+ def test_do_next_clean_step_oob_reboot_fail(self,
+ mock_execute):
+ # When a deploy step fails with no reboot requested go to DEPLOYFAIL
+ tgt_prov_state = states.ACTIVE
+
+ self._start_service()
+ node = obj_utils.create_test_node(
+ self.context, driver='fake-hardware',
+ provision_state=states.DEPLOYING,
+ target_provision_state=tgt_prov_state,
+ last_error=None,
+ driver_internal_info={'deploy_steps': self.deploy_steps,
+ 'deploy_step_index': None},
+ deploy_step={})
+ mock_execute.side_effect = exception.AgentConnectionFailed(
+ reason='failed')
+
+ with task_manager.acquire(
+ self.context, node.uuid, shared=False) as task:
+ manager._do_next_deploy_step(task, 0, mock.ANY)
+
+ self._stop_service()
+ node.refresh()
+
+ # Make sure we go to DEPLOYFAIL, clear deploy_steps
+ self.assertEqual(states.DEPLOYFAIL, node.provision_state)
+ self.assertEqual(tgt_prov_state, node.target_provision_state)
+ self.assertEqual({}, node.deploy_step)
+ self.assertNotIn('deploy_step_index', node.driver_internal_info)
+ self.assertNotIn('skip_current_deploy_step', node.driver_internal_info)
+ self.assertIsNotNone(node.last_error)
+ mock_execute.assert_called_once_with(
+ mock.ANY, mock.ANY, self.deploy_steps[0])
+
@mgr_utils.mock_record_keepalive
class DoNodeDeployTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
@@ -2641,7 +2744,7 @@ class DoNextDeployStepTestCase(mgr_utils.ServiceSetUpMixin,
mock_execute.assert_called_once_with(mock.ANY, mock.ANY,
self.deploy_steps[0])
- def test__get_node_next_deploy_steps(self):
+ def _test__get_node_next_deploy_steps(self, skip=True):
driver_internal_info = {'deploy_steps': self.deploy_steps,
'deploy_step_index': 0}
node = obj_utils.create_test_node(
@@ -2653,8 +2756,16 @@ class DoNextDeployStepTestCase(mgr_utils.ServiceSetUpMixin,
deploy_step=self.deploy_steps[0])
with task_manager.acquire(self.context, node.uuid) as task:
- step_index = self.service._get_node_next_deploy_steps(task)
- self.assertEqual(1, step_index)
+ step_index = self.service._get_node_next_deploy_steps(
+ task, skip_current_step=skip)
+ expected_index = 1 if skip else 0
+ self.assertEqual(expected_index, step_index)
+
+ def test__get_node_next_deploy_steps(self):
+ self._test__get_node_next_deploy_steps()
+
+ def test__get_node_next_deploy_steps_no_skip(self):
+ self._test__get_node_next_deploy_steps(skip=False)
def test__get_node_next_deploy_steps_unset_deploy_step(self):
driver_internal_info = {'deploy_steps': self.deploy_steps,
diff --git a/ironic/tests/unit/conductor/test_utils.py b/ironic/tests/unit/conductor/test_utils.py
index dc2e4b3d0..5c5fb68b3 100644
--- a/ironic/tests/unit/conductor/test_utils.py
+++ b/ironic/tests/unit/conductor/test_utils.py
@@ -917,6 +917,7 @@ class DeployingErrorHandlerTestCase(tests_base.TestCase):
self.assertEqual(self.errmsg, self.node.last_error)
self.assertEqual({}, self.node.deploy_step)
self.assertNotIn('deploy_step_index', self.node.driver_internal_info)
+ self.assertNotIn('deployment_reboot', self.node.driver_internal_info)
self.task.process_event.assert_called_once_with('fail')
def _test_deploying_error_handler_cleanup(self, exc, expected_str):
diff --git a/ironic/tests/unit/drivers/modules/test_agent.py b/ironic/tests/unit/drivers/modules/test_agent.py
index e804f3d17..15d5b675a 100644
--- a/ironic/tests/unit/drivers/modules/test_agent.py
+++ b/ironic/tests/unit/drivers/modules/test_agent.py
@@ -322,6 +322,23 @@ class TestAgentDeploy(db_base.DbTestCase):
self.assertFalse(mock_pxe_instance.called)
@mock.patch.object(pxe.PXEBoot, 'prepare_instance', autospec=True)
+ @mock.patch('ironic.conductor.utils.node_power_action', autospec=True)
+ def test_deploy_with_deployment_reboot(self, power_mock,
+ mock_pxe_instance):
+ driver_internal_info = self.node.driver_internal_info
+ driver_internal_info['deployment_reboot'] = True
+ self.node.driver_internal_info = driver_internal_info
+ self.node.save()
+ with task_manager.acquire(
+ self.context, self.node['uuid'], shared=False) as task:
+ driver_return = self.driver.deploy(task)
+ self.assertEqual(driver_return, states.DEPLOYWAIT)
+ self.assertFalse(power_mock.called)
+ self.assertFalse(mock_pxe_instance.called)
+ self.assertNotIn(
+ 'deployment_reboot', task.node.driver_internal_info)
+
+ @mock.patch.object(pxe.PXEBoot, 'prepare_instance', autospec=True)
@mock.patch.object(noop_storage.NoopStorage, 'should_write_image',
autospec=True)
def test_deploy_storage_should_write_image_false(self, mock_write,
diff --git a/ironic/tests/unit/drivers/modules/test_agent_base_vendor.py b/ironic/tests/unit/drivers/modules/test_agent_base_vendor.py
index 5a21ffe28..eec26f9b6 100644
--- a/ironic/tests/unit/drivers/modules/test_agent_base_vendor.py
+++ b/ironic/tests/unit/drivers/modules/test_agent_base_vendor.py
@@ -138,6 +138,8 @@ class HeartbeatMixinTest(AgentDeployMixinBaseTest):
self.assertFalse(cd_mock.called)
rti_mock.assert_called_once_with(self.deploy, task)
+ @mock.patch.object(manager_utils,
+ 'notify_conductor_resume_deploy', autospec=True)
@mock.patch.object(agent_base_vendor.HeartbeatMixin,
'in_core_deploy_step', autospec=True)
@mock.patch.object(agent_base_vendor.HeartbeatMixin,
@@ -151,7 +153,8 @@ class HeartbeatMixinTest(AgentDeployMixinBaseTest):
def test_heartbeat_not_in_core_deploy_step(self, rti_mock, cd_mock,
deploy_is_done_mock,
deploy_started_mock,
- in_deploy_mock):
+ in_deploy_mock,
+ in_resume_deploy_mock):
# Check that heartbeats do not trigger deployment actions when not in
# the deploy.deploy step.
in_deploy_mock.return_value = False
@@ -170,6 +173,7 @@ class HeartbeatMixinTest(AgentDeployMixinBaseTest):
self.assertFalse(deploy_is_done_mock.called)
self.assertFalse(cd_mock.called)
self.assertFalse(rti_mock.called)
+ self.assertTrue(in_resume_deploy_mock.called)
@mock.patch.object(agent_base_vendor.HeartbeatMixin, 'continue_deploy',
autospec=True)
diff --git a/ironic/tests/unit/drivers/modules/test_iscsi_deploy.py b/ironic/tests/unit/drivers/modules/test_iscsi_deploy.py
index 054a81e50..e36431b54 100644
--- a/ironic/tests/unit/drivers/modules/test_iscsi_deploy.py
+++ b/ironic/tests/unit/drivers/modules/test_iscsi_deploy.py
@@ -795,6 +795,27 @@ class ISCSIDeployTestCase(db_base.DbTestCase):
mock_check_image_size.assert_called_once_with(task)
mock_node_power_action.assert_called_once_with(task, states.REBOOT)
+ @mock.patch.object(manager_utils, 'node_power_action', autospec=True)
+ @mock.patch.object(iscsi_deploy, 'check_image_size', autospec=True)
+ @mock.patch.object(deploy_utils, 'cache_instance_image', autospec=True)
+ def test_deploy_with_deployment_reboot(self, mock_cache_instance_image,
+ mock_check_image_size,
+ mock_node_power_action):
+ driver_internal_info = self.node.driver_internal_info
+ driver_internal_info['deployment_reboot'] = True
+ self.node.driver_internal_info = driver_internal_info
+ self.node.save()
+ with task_manager.acquire(self.context,
+ self.node.uuid, shared=False) as task:
+ state = task.driver.deploy.deploy(task)
+ self.assertEqual(state, states.DEPLOYWAIT)
+ mock_cache_instance_image.assert_called_once_with(
+ self.context, task.node)
+ mock_check_image_size.assert_called_once_with(task)
+ self.assertFalse(mock_node_power_action.called)
+ self.assertNotIn(
+ 'deployment_reboot', task.node.driver_internal_info)
+
@mock.patch.object(noop_storage.NoopStorage, 'should_write_image',
autospec=True)
@mock.patch.object(flat_network.FlatNetwork,
diff --git a/releasenotes/notes/fixes-execution-of-out-of-band-deploy-steps-1f5967e7bfcabbf9.yaml b/releasenotes/notes/fixes-execution-of-out-of-band-deploy-steps-1f5967e7bfcabbf9.yaml
new file mode 100644
index 000000000..68a1686ab
--- /dev/null
+++ b/releasenotes/notes/fixes-execution-of-out-of-band-deploy-steps-1f5967e7bfcabbf9.yaml
@@ -0,0 +1,6 @@
+---
+fixes:
+ - |
+ Fixes an issue wherein asynchronous out-of-band deploy steps in
+ deployment template fails to execute. See `story 2006342
+ <https://storyboard.openstack.org/#!/story/2006342>`__ for details.