summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDan Smith <dansmith@redhat.com>2023-04-14 12:58:56 -0700
committerDan Smith <dansmith@redhat.com>2023-04-24 15:26:52 -0700
commitfbf2515b4c1dc5f279bb3df2fcb2193a1b52673a (patch)
treed867904ce721e60bbca0d8cab33afcd3a19ff3de
parent2f86a8a0883be7306b0631fdea1e481d0b0863ac (diff)
downloadnova-fbf2515b4c1dc5f279bb3df2fcb2193a1b52673a.tar.gz
Remove silent failure to find a node on rebuild
We have been ignoring the case where a rebuild or evacuate is triggered and we fail to find *any* node for our host. This appears to be a very old behavior, which I traced back ten years to this: https://review.opendev.org/c/openstack/nova/+/35851 which was merely fixing the failure to reset instance.node during an evacuate (which re-uses rebuild, which before that was a single-host operation). That patch intended to make a failure to find a node for our host a non-fatal error, but it just means we fall through that check with no node selected, which means we never update instance.node *and* run ResourceTracker code that will fail to find the node later. So, this makes it an explicit error, where we stop further processing, set the migration for the evacuation to 'failed', and send a notification for it. This is the same behavior as happens further down if we find that the instance has been deleted underneath us. Change-Id: I88b962aaeaa0554da4ab00906ac4d9e6deb43589
-rw-r--r--nova/compute/manager.py14
-rw-r--r--nova/tests/unit/compute/test_compute.py19
2 files changed, 26 insertions, 7 deletions
diff --git a/nova/compute/manager.py b/nova/compute/manager.py
index 5ea71827fc..5c42aa4d89 100644
--- a/nova/compute/manager.py
+++ b/nova/compute/manager.py
@@ -3791,9 +3791,21 @@ class ComputeManager(manager.Manager):
try:
compute_node = self._get_compute_info(context, self.host)
scheduled_node = compute_node.hypervisor_hostname
- except exception.ComputeHostNotFound:
+ except exception.ComputeHostNotFound as e:
+ # This means we were asked to rebuild one of our own
+ # instances, or another instance as a target of an
+ # evacuation, but we are unable to find a matching compute
+ # node.
LOG.exception('Failed to get compute_info for %s',
self.host)
+ self._set_migration_status(migration, 'failed')
+ self._notify_instance_rebuild_error(context, instance, e,
+ bdms)
+ raise exception.InstanceFaultRollback(
+ inner_exception=exception.BuildAbortException(
+ instance_uuid=instance.uuid,
+ reason=e.format_message()))
+
else:
scheduled_node = instance.node
diff --git a/nova/tests/unit/compute/test_compute.py b/nova/tests/unit/compute/test_compute.py
index 49cf15ec17..bb9b726912 100644
--- a/nova/tests/unit/compute/test_compute.py
+++ b/nova/tests/unit/compute/test_compute.py
@@ -13513,7 +13513,8 @@ class EvacuateHostTestCase(BaseTestCase):
super(EvacuateHostTestCase, self).tearDown()
def _rebuild(self, on_shared_storage=True, migration=None,
- send_node=False, vm_states_is_stopped=False):
+ send_node=False, vm_states_is_stopped=False,
+ expect_error=False):
network_api = self.compute.network_api
ctxt = context.get_admin_context()
@@ -13560,6 +13561,11 @@ class EvacuateHostTestCase(BaseTestCase):
action='power_off', phase='start'),
mock.call(ctxt, self.inst, self.inst.host,
action='power_off', phase='end')])
+ elif expect_error:
+ mock_notify_rebuild.assert_has_calls([
+ mock.call(ctxt, self.inst, self.compute.host,
+ phase='error', exception=mock.ANY, bdms=bdms)])
+ return
else:
mock_notify_rebuild.assert_has_calls([
mock.call(ctxt, self.inst, self.inst.host, phase='start',
@@ -13614,14 +13620,15 @@ class EvacuateHostTestCase(BaseTestCase):
mock.patch.object(self.compute, '_get_compute_info',
side_effect=fake_get_compute_info)
) as (mock_inst, mock_get):
- self._rebuild()
+ self.assertRaises(exception.InstanceFaultRollback,
+ self._rebuild, expect_error=True)
# Should be on destination host
instance = db.instance_get(self.context, self.inst.id)
- self.assertEqual(instance['host'], self.compute.host)
- self.assertIsNone(instance['node'])
- self.assertTrue(mock_inst.called)
- self.assertTrue(mock_get.called)
+ self.assertEqual('fake_host_2', instance['host'])
+ self.assertEqual('fakenode2', instance['node'])
+ mock_inst.assert_not_called()
+ mock_get.assert_called_once_with(mock.ANY, self.compute.host)
def test_rebuild_on_host_node_passed(self):
patch_get_info = mock.patch.object(self.compute, '_get_compute_info')