diff options
author | Shivanand Tendulker <stendulker@gmail.com> | 2019-08-01 08:20:00 -0400 |
---|---|---|
committer | Shivanand Tendulker <stendulker@gmail.com> | 2019-08-20 03:56:31 -0400 |
commit | ba0e73fa1550339f696431490fdb7d7723ef0903 (patch) | |
tree | 45cfc1b8ba3753c5e81b2605f1e6b693aeacdfbd | |
parent | cdf5bca715cf5f1382614539426ec5536cb4e314 (diff) | |
download | ironic-ba0e73fa1550339f696431490fdb7d7723ef0903.tar.gz |
Asynchronous out of band deploy steps fails to execute
Asynchronous out of band steps in a deploy template fails to
execute. This commit fixes that issue. Asynchronous steps can
set 'skip_current_deploy_step' flag to False in
'driver_internal_info' to make sure that upon reboot same step
is re-executed. Also it can set 'deployment_reboot' flag to True
in 'driver_internal_info' to signal that it has rebooted the node.
Co-Authored-By: Mark Goddard <mark@stackhpc.com>
Change-Id: If6217afb5453c311d5ca71ba37458a9b97c18395
Story: 2006342
Task: 36095
(cherry picked from commit 8f907886a1ed0de70c34aef84ba892c3e6a5cd49)
-rw-r--r-- | ironic/conductor/manager.py | 30 | ||||
-rw-r--r-- | ironic/conductor/utils.py | 2 | ||||
-rw-r--r-- | ironic/drivers/modules/agent.py | 9 | ||||
-rw-r--r-- | ironic/drivers/modules/agent_base_vendor.py | 3 | ||||
-rw-r--r-- | ironic/drivers/modules/iscsi_deploy.py | 11 | ||||
-rw-r--r-- | ironic/tests/unit/conductor/test_manager.py | 117 | ||||
-rw-r--r-- | ironic/tests/unit/conductor/test_utils.py | 1 | ||||
-rw-r--r-- | ironic/tests/unit/drivers/modules/test_agent.py | 17 | ||||
-rw-r--r-- | ironic/tests/unit/drivers/modules/test_agent_base_vendor.py | 6 | ||||
-rw-r--r-- | ironic/tests/unit/drivers/modules/test_iscsi_deploy.py | 21 | ||||
-rw-r--r-- | releasenotes/notes/fixes-execution-of-out-of-band-deploy-steps-1f5967e7bfcabbf9.yaml | 6 |
11 files changed, 213 insertions, 10 deletions
diff --git a/ironic/conductor/manager.py b/ironic/conductor/manager.py index 7070077ac..466e96c53 100644 --- a/ironic/conductor/manager.py +++ b/ironic/conductor/manager.py @@ -875,8 +875,9 @@ class ConductorManager(base_manager.BaseConductorManager): action=event, node=task.node.uuid, state=task.node.provision_state) - def _get_node_next_deploy_steps(self, task): - return self._get_node_next_steps(task, 'deploy') + def _get_node_next_deploy_steps(self, task, skip_current_step=True): + return self._get_node_next_steps(task, 'deploy', + skip_current_step=skip_current_step) @METRICS.timer('ConductorManager.continue_node_deploy') def continue_node_deploy(self, context, node_id): @@ -914,7 +915,17 @@ class ConductorManager(base_manager.BaseConductorManager): 'state': node.provision_state, 'deploy_state': ', '.join(expected_states)}) - next_step_index = self._get_node_next_deploy_steps(task) + info = node.driver_internal_info + try: + skip_current_step = info.pop('skip_current_deploy_step') + except KeyError: + skip_current_step = True + else: + node.driver_internal_info = info + node.save() + + next_step_index = self._get_node_next_deploy_steps( + task, skip_current_step=skip_current_step) # TODO(rloo): When deprecation period is over and node is in # states.DEPLOYWAIT only, delete the 'if' and always 'resume'. @@ -3836,6 +3847,16 @@ def _do_next_deploy_step(task, step_index, conductor_id): try: result = interface.execute_deploy_step(task, step) except exception.IronicException as e: + if isinstance(e, exception.AgentConnectionFailed): + if task.node.driver_internal_info.get('deployment_reboot'): + LOG.info('Agent is not yet running on node %(node)s after ' + 'deployment reboot, waiting for agent to come up ' + 'to run next deploy step %(step)s.', + {'node': node.uuid, 'step': step}) + driver_internal_info['skip_current_deploy_step'] = False + node.driver_internal_info = driver_internal_info + task.process_event('wait') + return log_msg = ('Node %(node)s failed deploy step %(step)s. Error: ' '%(err)s' % {'node': node.uuid, 'step': node.deploy_step, 'err': e}) @@ -3860,7 +3881,7 @@ def _do_next_deploy_step(task, step_index, conductor_id): node.save() # Check if the step is done or not. The step should return - # states.CLEANWAIT if the step is still being executed, or + # states.DEPLOYWAIT if the step is still being executed, or # None if the step is done. # NOTE(deva): Some drivers may return states.DEPLOYWAIT # eg. if they are waiting for a callback @@ -3890,6 +3911,7 @@ def _do_next_deploy_step(task, step_index, conductor_id): driver_internal_info = node.driver_internal_info driver_internal_info['deploy_steps'] = None driver_internal_info.pop('deploy_step_index', None) + driver_internal_info.pop('deployment_reboot', None) node.driver_internal_info = driver_internal_info node.save() diff --git a/ironic/conductor/utils.py b/ironic/conductor/utils.py index 88d9b3814..6778b208e 100644 --- a/ironic/conductor/utils.py +++ b/ironic/conductor/utils.py @@ -461,6 +461,8 @@ def deploying_error_handler(task, logmsg, errmsg, traceback=False, node.deploy_step = {} info = node.driver_internal_info info.pop('deploy_step_index', None) + # Clear any leftover metadata about deployment reboots + info.pop('deployment_reboot', None) node.driver_internal_info = info if cleanup_err: diff --git a/ironic/drivers/modules/agent.py b/ironic/drivers/modules/agent.py index b4f640e30..9e1a067a0 100644 --- a/ironic/drivers/modules/agent.py +++ b/ironic/drivers/modules/agent.py @@ -463,7 +463,14 @@ class AgentDeploy(AgentDeployMixin, base.DeployInterface): task.process_event('wait') self.continue_deploy(task) elif task.driver.storage.should_write_image(task): - manager_utils.node_power_action(task, states.REBOOT) + # Check if the driver has already performed a reboot in a previous + # deploy step. + if not task.node.driver_internal_info.get('deployment_reboot'): + manager_utils.node_power_action(task, states.REBOOT) + info = task.node.driver_internal_info + info.pop('deployment_reboot', None) + task.node.driver_internal_info = info + task.node.save() return states.DEPLOYWAIT else: # TODO(TheJulia): At some point, we should de-dupe this code diff --git a/ironic/drivers/modules/agent_base_vendor.py b/ironic/drivers/modules/agent_base_vendor.py index fff521673..1857e82c4 100644 --- a/ironic/drivers/modules/agent_base_vendor.py +++ b/ironic/drivers/modules/agent_base_vendor.py @@ -366,6 +366,9 @@ class HeartbeatMixin(object): else: node.touch_provisioning() else: + # The exceptions from RPC are not possible as we using cast + # here + manager_utils.notify_conductor_resume_deploy(task) node.touch_provisioning() elif node.provision_state == states.CLEANWAIT: node.touch_provisioning() diff --git a/ironic/drivers/modules/iscsi_deploy.py b/ironic/drivers/modules/iscsi_deploy.py index b663db697..e6932a19a 100644 --- a/ironic/drivers/modules/iscsi_deploy.py +++ b/ironic/drivers/modules/iscsi_deploy.py @@ -421,7 +421,16 @@ class ISCSIDeploy(AgentDeployMixin, base.DeployInterface): # Standard deploy process deploy_utils.cache_instance_image(task.context, node) check_image_size(task) - manager_utils.node_power_action(task, states.REBOOT) + # Check if the driver has already performed a reboot in a previous + # deploy step. + if not task.node.driver_internal_info.get('deployment_reboot', + False): + manager_utils.node_power_action(task, states.REBOOT) + info = task.node.driver_internal_info + info.pop('deployment_reboot', None) + task.node.driver_internal_info = info + task.node.save() + return states.DEPLOYWAIT else: # Boot to an Storage Volume diff --git a/ironic/tests/unit/conductor/test_manager.py b/ironic/tests/unit/conductor/test_manager.py index c0c2f6645..5d6b059ac 100644 --- a/ironic/tests/unit/conductor/test_manager.py +++ b/ironic/tests/unit/conductor/test_manager.py @@ -1845,6 +1845,109 @@ class ContinueNodeDeployTestCase(mgr_utils.ServiceSetUpMixin, mock.ANY, 1, mock.ANY) self.assertFalse(mock_event.called) + @mock.patch('ironic.conductor.manager.ConductorManager._spawn_worker', + autospec=True) + def _continue_node_deploy_skip_step(self, mock_spawn, skip=True): + # test that skipping current step mechanism works + driver_info = {'deploy_steps': self.deploy_steps, + 'deploy_step_index': 0} + if not skip: + driver_info['skip_current_deploy_step'] = skip + node = obj_utils.create_test_node( + self.context, driver='fake-hardware', + provision_state=states.DEPLOYWAIT, + target_provision_state=states.MANAGEABLE, + driver_internal_info=driver_info, deploy_step=self.deploy_steps[0]) + self._start_service() + self.service.continue_node_deploy(self.context, node.uuid) + self._stop_service() + node.refresh() + if skip: + expected_step_index = 1 + else: + self.assertNotIn( + 'skip_current_deploy_step', node.driver_internal_info) + expected_step_index = 0 + mock_spawn.assert_called_with(mock.ANY, manager._do_next_deploy_step, + mock.ANY, expected_step_index, mock.ANY) + + def test_continue_node_deploy_skip_step(self): + self._continue_node_deploy_skip_step() + + def test_continue_node_deploy_no_skip_step(self): + self._continue_node_deploy_skip_step(skip=False) + + @mock.patch('ironic.drivers.modules.fake.FakeDeploy.execute_deploy_step', + autospec=True) + def test_do_next_deploy_step_oob_reboot(self, mock_execute): + # When a deploy step fails, go to DEPLOYWAIT + tgt_prov_state = states.ACTIVE + + self._start_service() + node = obj_utils.create_test_node( + self.context, driver='fake-hardware', + provision_state=states.DEPLOYING, + target_provision_state=tgt_prov_state, + last_error=None, + driver_internal_info={'deploy_steps': self.deploy_steps, + 'deploy_step_index': None, + 'deployment_reboot': True}, + clean_step={}) + mock_execute.side_effect = exception.AgentConnectionFailed( + reason='failed') + + with task_manager.acquire( + self.context, node.uuid, shared=False) as task: + manager._do_next_deploy_step(task, 0, mock.ANY) + + self._stop_service() + node.refresh() + + # Make sure we go to CLEANWAIT + self.assertEqual(states.DEPLOYWAIT, node.provision_state) + self.assertEqual(tgt_prov_state, node.target_provision_state) + self.assertEqual(self.deploy_steps[0], node.deploy_step) + self.assertEqual(0, node.driver_internal_info['deploy_step_index']) + self.assertFalse(node.driver_internal_info['skip_current_deploy_step']) + mock_execute.assert_called_once_with( + mock.ANY, mock.ANY, self.deploy_steps[0]) + + @mock.patch('ironic.drivers.modules.fake.FakeDeploy.execute_deploy_step', + autospec=True) + def test_do_next_clean_step_oob_reboot_fail(self, + mock_execute): + # When a deploy step fails with no reboot requested go to DEPLOYFAIL + tgt_prov_state = states.ACTIVE + + self._start_service() + node = obj_utils.create_test_node( + self.context, driver='fake-hardware', + provision_state=states.DEPLOYING, + target_provision_state=tgt_prov_state, + last_error=None, + driver_internal_info={'deploy_steps': self.deploy_steps, + 'deploy_step_index': None}, + deploy_step={}) + mock_execute.side_effect = exception.AgentConnectionFailed( + reason='failed') + + with task_manager.acquire( + self.context, node.uuid, shared=False) as task: + manager._do_next_deploy_step(task, 0, mock.ANY) + + self._stop_service() + node.refresh() + + # Make sure we go to DEPLOYFAIL, clear deploy_steps + self.assertEqual(states.DEPLOYFAIL, node.provision_state) + self.assertEqual(tgt_prov_state, node.target_provision_state) + self.assertEqual({}, node.deploy_step) + self.assertNotIn('deploy_step_index', node.driver_internal_info) + self.assertNotIn('skip_current_deploy_step', node.driver_internal_info) + self.assertIsNotNone(node.last_error) + mock_execute.assert_called_once_with( + mock.ANY, mock.ANY, self.deploy_steps[0]) + @mgr_utils.mock_record_keepalive class DoNodeDeployTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase): @@ -2641,7 +2744,7 @@ class DoNextDeployStepTestCase(mgr_utils.ServiceSetUpMixin, mock_execute.assert_called_once_with(mock.ANY, mock.ANY, self.deploy_steps[0]) - def test__get_node_next_deploy_steps(self): + def _test__get_node_next_deploy_steps(self, skip=True): driver_internal_info = {'deploy_steps': self.deploy_steps, 'deploy_step_index': 0} node = obj_utils.create_test_node( @@ -2653,8 +2756,16 @@ class DoNextDeployStepTestCase(mgr_utils.ServiceSetUpMixin, deploy_step=self.deploy_steps[0]) with task_manager.acquire(self.context, node.uuid) as task: - step_index = self.service._get_node_next_deploy_steps(task) - self.assertEqual(1, step_index) + step_index = self.service._get_node_next_deploy_steps( + task, skip_current_step=skip) + expected_index = 1 if skip else 0 + self.assertEqual(expected_index, step_index) + + def test__get_node_next_deploy_steps(self): + self._test__get_node_next_deploy_steps() + + def test__get_node_next_deploy_steps_no_skip(self): + self._test__get_node_next_deploy_steps(skip=False) def test__get_node_next_deploy_steps_unset_deploy_step(self): driver_internal_info = {'deploy_steps': self.deploy_steps, diff --git a/ironic/tests/unit/conductor/test_utils.py b/ironic/tests/unit/conductor/test_utils.py index dc2e4b3d0..5c5fb68b3 100644 --- a/ironic/tests/unit/conductor/test_utils.py +++ b/ironic/tests/unit/conductor/test_utils.py @@ -917,6 +917,7 @@ class DeployingErrorHandlerTestCase(tests_base.TestCase): self.assertEqual(self.errmsg, self.node.last_error) self.assertEqual({}, self.node.deploy_step) self.assertNotIn('deploy_step_index', self.node.driver_internal_info) + self.assertNotIn('deployment_reboot', self.node.driver_internal_info) self.task.process_event.assert_called_once_with('fail') def _test_deploying_error_handler_cleanup(self, exc, expected_str): diff --git a/ironic/tests/unit/drivers/modules/test_agent.py b/ironic/tests/unit/drivers/modules/test_agent.py index e804f3d17..15d5b675a 100644 --- a/ironic/tests/unit/drivers/modules/test_agent.py +++ b/ironic/tests/unit/drivers/modules/test_agent.py @@ -322,6 +322,23 @@ class TestAgentDeploy(db_base.DbTestCase): self.assertFalse(mock_pxe_instance.called) @mock.patch.object(pxe.PXEBoot, 'prepare_instance', autospec=True) + @mock.patch('ironic.conductor.utils.node_power_action', autospec=True) + def test_deploy_with_deployment_reboot(self, power_mock, + mock_pxe_instance): + driver_internal_info = self.node.driver_internal_info + driver_internal_info['deployment_reboot'] = True + self.node.driver_internal_info = driver_internal_info + self.node.save() + with task_manager.acquire( + self.context, self.node['uuid'], shared=False) as task: + driver_return = self.driver.deploy(task) + self.assertEqual(driver_return, states.DEPLOYWAIT) + self.assertFalse(power_mock.called) + self.assertFalse(mock_pxe_instance.called) + self.assertNotIn( + 'deployment_reboot', task.node.driver_internal_info) + + @mock.patch.object(pxe.PXEBoot, 'prepare_instance', autospec=True) @mock.patch.object(noop_storage.NoopStorage, 'should_write_image', autospec=True) def test_deploy_storage_should_write_image_false(self, mock_write, diff --git a/ironic/tests/unit/drivers/modules/test_agent_base_vendor.py b/ironic/tests/unit/drivers/modules/test_agent_base_vendor.py index 5a21ffe28..eec26f9b6 100644 --- a/ironic/tests/unit/drivers/modules/test_agent_base_vendor.py +++ b/ironic/tests/unit/drivers/modules/test_agent_base_vendor.py @@ -138,6 +138,8 @@ class HeartbeatMixinTest(AgentDeployMixinBaseTest): self.assertFalse(cd_mock.called) rti_mock.assert_called_once_with(self.deploy, task) + @mock.patch.object(manager_utils, + 'notify_conductor_resume_deploy', autospec=True) @mock.patch.object(agent_base_vendor.HeartbeatMixin, 'in_core_deploy_step', autospec=True) @mock.patch.object(agent_base_vendor.HeartbeatMixin, @@ -151,7 +153,8 @@ class HeartbeatMixinTest(AgentDeployMixinBaseTest): def test_heartbeat_not_in_core_deploy_step(self, rti_mock, cd_mock, deploy_is_done_mock, deploy_started_mock, - in_deploy_mock): + in_deploy_mock, + in_resume_deploy_mock): # Check that heartbeats do not trigger deployment actions when not in # the deploy.deploy step. in_deploy_mock.return_value = False @@ -170,6 +173,7 @@ class HeartbeatMixinTest(AgentDeployMixinBaseTest): self.assertFalse(deploy_is_done_mock.called) self.assertFalse(cd_mock.called) self.assertFalse(rti_mock.called) + self.assertTrue(in_resume_deploy_mock.called) @mock.patch.object(agent_base_vendor.HeartbeatMixin, 'continue_deploy', autospec=True) diff --git a/ironic/tests/unit/drivers/modules/test_iscsi_deploy.py b/ironic/tests/unit/drivers/modules/test_iscsi_deploy.py index 054a81e50..e36431b54 100644 --- a/ironic/tests/unit/drivers/modules/test_iscsi_deploy.py +++ b/ironic/tests/unit/drivers/modules/test_iscsi_deploy.py @@ -795,6 +795,27 @@ class ISCSIDeployTestCase(db_base.DbTestCase): mock_check_image_size.assert_called_once_with(task) mock_node_power_action.assert_called_once_with(task, states.REBOOT) + @mock.patch.object(manager_utils, 'node_power_action', autospec=True) + @mock.patch.object(iscsi_deploy, 'check_image_size', autospec=True) + @mock.patch.object(deploy_utils, 'cache_instance_image', autospec=True) + def test_deploy_with_deployment_reboot(self, mock_cache_instance_image, + mock_check_image_size, + mock_node_power_action): + driver_internal_info = self.node.driver_internal_info + driver_internal_info['deployment_reboot'] = True + self.node.driver_internal_info = driver_internal_info + self.node.save() + with task_manager.acquire(self.context, + self.node.uuid, shared=False) as task: + state = task.driver.deploy.deploy(task) + self.assertEqual(state, states.DEPLOYWAIT) + mock_cache_instance_image.assert_called_once_with( + self.context, task.node) + mock_check_image_size.assert_called_once_with(task) + self.assertFalse(mock_node_power_action.called) + self.assertNotIn( + 'deployment_reboot', task.node.driver_internal_info) + @mock.patch.object(noop_storage.NoopStorage, 'should_write_image', autospec=True) @mock.patch.object(flat_network.FlatNetwork, diff --git a/releasenotes/notes/fixes-execution-of-out-of-band-deploy-steps-1f5967e7bfcabbf9.yaml b/releasenotes/notes/fixes-execution-of-out-of-band-deploy-steps-1f5967e7bfcabbf9.yaml new file mode 100644 index 000000000..68a1686ab --- /dev/null +++ b/releasenotes/notes/fixes-execution-of-out-of-band-deploy-steps-1f5967e7bfcabbf9.yaml @@ -0,0 +1,6 @@ +--- +fixes: + - | + Fixes an issue wherein asynchronous out-of-band deploy steps in + deployment template fails to execute. See `story 2006342 + <https://storyboard.openstack.org/#!/story/2006342>`__ for details. |