From fbf2515b4c1dc5f279bb3df2fcb2193a1b52673a Mon Sep 17 00:00:00 2001 From: Dan Smith Date: Fri, 14 Apr 2023 12:58:56 -0700 Subject: Remove silent failure to find a node on rebuild We have been ignoring the case where a rebuild or evacuate is triggered and we fail to find *any* node for our host. This appears to be a very old behavior, which I traced back ten years to this: https://review.opendev.org/c/openstack/nova/+/35851 which was merely fixing the failure to reset instance.node during an evacuate (which re-uses rebuild, which before that was a single-host operation). That patch intended to make a failure to find a node for our host a non-fatal error, but it just means we fall through that check with no node selected, which means we never update instance.node *and* run ResourceTracker code that will fail to find the node later. So, this makes it an explicit error, where we stop further processing, set the migration for the evacuation to 'failed', and send a notification for it. This is the same behavior as happens further down if we find that the instance has been deleted underneath us. Change-Id: I88b962aaeaa0554da4ab00906ac4d9e6deb43589 --- nova/compute/manager.py | 14 +++++++++++++- nova/tests/unit/compute/test_compute.py | 19 +++++++++++++------ 2 files changed, 26 insertions(+), 7 deletions(-) diff --git a/nova/compute/manager.py b/nova/compute/manager.py index 5ea71827fc..5c42aa4d89 100644 --- a/nova/compute/manager.py +++ b/nova/compute/manager.py @@ -3791,9 +3791,21 @@ class ComputeManager(manager.Manager): try: compute_node = self._get_compute_info(context, self.host) scheduled_node = compute_node.hypervisor_hostname - except exception.ComputeHostNotFound: + except exception.ComputeHostNotFound as e: + # This means we were asked to rebuild one of our own + # instances, or another instance as a target of an + # evacuation, but we are unable to find a matching compute + # node. LOG.exception('Failed to get compute_info for %s', self.host) + self._set_migration_status(migration, 'failed') + self._notify_instance_rebuild_error(context, instance, e, + bdms) + raise exception.InstanceFaultRollback( + inner_exception=exception.BuildAbortException( + instance_uuid=instance.uuid, + reason=e.format_message())) + else: scheduled_node = instance.node diff --git a/nova/tests/unit/compute/test_compute.py b/nova/tests/unit/compute/test_compute.py index 49cf15ec17..bb9b726912 100644 --- a/nova/tests/unit/compute/test_compute.py +++ b/nova/tests/unit/compute/test_compute.py @@ -13513,7 +13513,8 @@ class EvacuateHostTestCase(BaseTestCase): super(EvacuateHostTestCase, self).tearDown() def _rebuild(self, on_shared_storage=True, migration=None, - send_node=False, vm_states_is_stopped=False): + send_node=False, vm_states_is_stopped=False, + expect_error=False): network_api = self.compute.network_api ctxt = context.get_admin_context() @@ -13560,6 +13561,11 @@ class EvacuateHostTestCase(BaseTestCase): action='power_off', phase='start'), mock.call(ctxt, self.inst, self.inst.host, action='power_off', phase='end')]) + elif expect_error: + mock_notify_rebuild.assert_has_calls([ + mock.call(ctxt, self.inst, self.compute.host, + phase='error', exception=mock.ANY, bdms=bdms)]) + return else: mock_notify_rebuild.assert_has_calls([ mock.call(ctxt, self.inst, self.inst.host, phase='start', @@ -13614,14 +13620,15 @@ class EvacuateHostTestCase(BaseTestCase): mock.patch.object(self.compute, '_get_compute_info', side_effect=fake_get_compute_info) ) as (mock_inst, mock_get): - self._rebuild() + self.assertRaises(exception.InstanceFaultRollback, + self._rebuild, expect_error=True) # Should be on destination host instance = db.instance_get(self.context, self.inst.id) - self.assertEqual(instance['host'], self.compute.host) - self.assertIsNone(instance['node']) - self.assertTrue(mock_inst.called) - self.assertTrue(mock_get.called) + self.assertEqual('fake_host_2', instance['host']) + self.assertEqual('fakenode2', instance['node']) + mock_inst.assert_not_called() + mock_get.assert_called_once_with(mock.ANY, self.compute.host) def test_rebuild_on_host_node_passed(self): patch_get_info = mock.patch.object(self.compute, '_get_compute_info') -- cgit v1.2.1