author     Dan Smith <dansmith@redhat.com>   2023-04-14 10:56:48 -0700
committer  Dan Smith <dansmith@redhat.com>   2023-04-24 15:26:52 -0700
commit     82deb0ce4be588d675882a28ef7cacb97d8ffd1b (patch)
tree       82307238083cf9cff4b859de9c7f01a88336c489
parent     fbf2515b4c1dc5f279bb3df2fcb2193a1b52673a (diff)
download   nova-82deb0ce4be588d675882a28ef7cacb97d8ffd1b.tar.gz
Stop ignoring missing compute nodes in claims
The resource tracker will silently ignore attempts to claim resources
when the node requested is not managed by this host. The misleading
"self.disabled(nodename)" check will fail if the nodename is not known
to the resource tracker, causing us to bail early with a NopClaim. That
means we also don't do additional setup like creating a migration
context for the instance, claim resources in placement, and handle
PCI/NUMA things.

This behavior is quite old, and clearly doesn't make sense in a world
with things like placement. The bulk of the test changes here are due
to the fact that a lot of tests were relying on this silent ignoring of
a mismatching node, because they were passing node names that weren't
even tracked. This change makes us raise an error if this happens so
that we can actually catch it, and avoid silently continuing with no
resource claim.

Change-Id: I416126ee5d10428c296fe618aa877cca0e8dffcf
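In outline, the first hunk below replaces the silent NopClaim fallback
with a hard failure. A minimal sketch of the new behavior, condensed
from the diff (the real instance_claim() takes more parameters and does
the actual claim accounting after this check):

    from nova import exception

    class ResourceTrackerSketch(object):
        # Sketch only; self.disabled(nodename) is True when nodename is
        # not in this tracker's compute_nodes mapping.
        def instance_claim(self, context, instance, nodename,
                           allocations, limits=None):
            if self.disabled(nodename):
                # Previously: set instance.host/node anyway and return a
                # claims.NopClaim(), silently skipping placement
                # allocations, the migration context, and PCI/NUMA setup.
                raise exception.ComputeResourcesUnavailable(
                    'Attempt to claim resources for instance %(inst)s '
                    'on unknown node %(node)s failed' % {
                        'inst': instance.uuid, 'node': nodename})
            # Normal claim path continues here.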
-rw-r--r--  nova/compute/resource_tracker.py                 | 38
-rw-r--r--  nova/tests/unit/compute/test_compute_mgr.py      | 43
-rw-r--r--  nova/tests/unit/compute/test_resource_tracker.py | 19
-rw-r--r--  nova/tests/unit/compute/test_shelve.py           |  4
4 files changed, 66 insertions(+), 38 deletions(-)
diff --git a/nova/compute/resource_tracker.py b/nova/compute/resource_tracker.py
index 3f911f3708..9ee6670c17 100644
--- a/nova/compute/resource_tracker.py
+++ b/nova/compute/resource_tracker.py
@@ -146,16 +146,20 @@ class ResourceTracker(object):
during the instance build.
"""
if self.disabled(nodename):
- # instance_claim() was called before update_available_resource()
- # (which ensures that a compute node exists for nodename). We
- # shouldn't get here but in case we do, just set the instance's
- # host and nodename attribute (probably incorrect) and return a
- # NoopClaim.
- # TODO(jaypipes): Remove all the disabled junk from the resource
- # tracker. Servicegroup API-level active-checking belongs in the
- # nova-compute manager.
- self._set_instance_host_and_node(instance, nodename)
- return claims.NopClaim()
+ # If we get here, it means we are trying to claim for an instance
+ # that was scheduled to a node that we do not have in our list,
+ # or is in some other way unmanageable by this node. This would
+ # mean that we are unable to account for resources, create
+ # allocations in placement, or do any of the other accounting
+ # necessary for this to work. In the past, this situation was
+ # effectively ignored silently, but in a world where we track
+ # resources with placement and instance assignment to compute nodes
+ # by service, we can no longer be leaky.
+ raise exception.ComputeResourcesUnavailable(
+ ('Attempt to claim resources for instance %(inst)s '
+ 'on unknown node %(node)s failed') % {
+ 'inst': instance.uuid,
+ 'node': nodename})
# sanity checks:
if instance.host:
@@ -280,9 +284,17 @@ class ResourceTracker(object):
context, instance, new_flavor, nodename, move_type)
if self.disabled(nodename):
- # compute_driver doesn't support resource tracking, just
- # generate the migration record and continue the resize:
- return claims.NopClaim(migration=migration)
+ # This means we were asked to accept an incoming migration to a
+ # node that we do not own or track. We really should not get here,
+ # but if we do, we must refuse to continue with the migration
+ # process, since we cannot account for those resources, create
+ # allocations in placement, etc. This has been a silent resource
+ # leak in the past, but it must be a hard failure now.
+ raise exception.ComputeResourcesUnavailable(
+ ('Attempt to claim move resources for instance %(inst)s on '
+ 'unknown node %(node)s failed') % {
+ 'inst': instance.uuid,
+ 'node': nodename})
cn = self.compute_nodes[nodename]
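With both claim paths failing hard, a caller hitting an unknown node now
sees ComputeResourcesUnavailable instead of a usable-looking NopClaim. A
hypothetical caller-side sketch (this handling is illustrative and not
part of the diff):

    try:
        claim = self.rt.instance_claim(context, instance, nodename,
                                       allocations, limits)
    except exception.ComputeResourcesUnavailable:
        # The node is not tracked by this host: surface the failure so
        # the build is aborted or rescheduled rather than continuing
        # with no resource accounting.
        raise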
diff --git a/nova/tests/unit/compute/test_compute_mgr.py b/nova/tests/unit/compute/test_compute_mgr.py
index 1c69cd8f1c..73c9d32197 100644
--- a/nova/tests/unit/compute/test_compute_mgr.py
+++ b/nova/tests/unit/compute/test_compute_mgr.py
@@ -2560,10 +2560,11 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase,
self.assertFalse(mock_get_info.called)
self.assertFalse(mock_sync_power_state.called)
+ @mock.patch('nova.compute.resource_tracker.ResourceTracker.instance_claim')
@mock.patch('nova.compute.manager.ComputeManager.'
'_sync_instance_power_state')
def test_query_driver_power_state_and_sync_not_found_driver(
- self, mock_sync_power_state):
+ self, mock_sync_power_state, mock_claim):
error = exception.InstanceNotFound(instance_id=1)
with mock.patch.object(self.compute.driver,
'get_info', side_effect=error) as mock_get_info:
@@ -6568,6 +6569,8 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
fake_rt = fake_resource_tracker.FakeResourceTracker(self.compute.host,
self.compute.driver)
self.compute.rt = fake_rt
+ self.compute.driver._set_nodes([self.node])
+ self.compute.rt.compute_nodes = {self.node: objects.ComputeNode()}
self.allocations = {
uuids.provider1: {
@@ -6857,6 +6860,7 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
mock_get_arqs.assert_called_once_with(
self.instance.uuid, only_resolved=True)
+ @mock.patch('nova.compute.resource_tracker.ResourceTracker.instance_claim')
@mock.patch.object(fake_driver.FakeDriver, 'spawn')
@mock.patch('nova.objects.Instance.save')
@mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
@@ -6868,7 +6872,7 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
@mock.patch.object(manager.ComputeManager, '_notify_about_instance_usage')
def test_spawn_called_with_accel_info(self, mock_ins_usage,
mock_ins_create, mock_dev_tag, mock_certs, mock_req_group_map,
- mock_get_allocations, mock_ins_save, mock_spawn):
+ mock_get_allocations, mock_ins_save, mock_spawn, mock_claim):
accel_info = [{'k1': 'v1', 'k2': 'v2'}]
@@ -7142,13 +7146,15 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
self.security_groups, self.block_device_mapping,
request_spec={}, host_lists=[fake_host_list])
+ @mock.patch('nova.compute.resource_tracker.ResourceTracker.instance_claim')
@mock.patch.object(manager.ComputeManager, '_shutdown_instance')
@mock.patch.object(manager.ComputeManager, '_build_networks_for_instance')
@mock.patch.object(fake_driver.FakeDriver, 'spawn')
@mock.patch.object(objects.Instance, 'save')
@mock.patch.object(manager.ComputeManager, '_notify_about_instance_usage')
def test_rescheduled_exception_with_non_ascii_exception(self,
- mock_notify, mock_save, mock_spawn, mock_build, mock_shutdown):
+ mock_notify, mock_save, mock_spawn, mock_build, mock_shutdown,
+ mock_claim):
exc = exception.NovaException(u's\xe9quence')
mock_build.return_value = self.network_info
@@ -7164,7 +7170,6 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
self.accel_uuids)
mock_save.assert_has_calls([
mock.call(),
- mock.call(),
mock.call(expected_task_state='block_device_mapping'),
])
mock_notify.assert_has_calls([
@@ -7670,6 +7675,7 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
self.assertEqual(10, mock_failed.call_count)
mock_succeeded.assert_not_called()
+ @mock.patch('nova.compute.resource_tracker.ResourceTracker.instance_claim')
@mock.patch.object(manager.ComputeManager, '_shutdown_instance')
@mock.patch.object(manager.ComputeManager, '_build_networks_for_instance')
@mock.patch.object(fake_driver.FakeDriver, 'spawn')
@@ -7677,7 +7683,7 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
@mock.patch.object(manager.ComputeManager, '_notify_about_instance_usage')
def _test_instance_exception(self, exc, raised_exc,
mock_notify, mock_save, mock_spawn,
- mock_build, mock_shutdown):
+ mock_build, mock_shutdown, mock_claim):
"""This method test the instance related InstanceNotFound
and reschedule on exception errors. The test cases get from
arguments.
@@ -7700,7 +7706,6 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
mock_save.assert_has_calls([
mock.call(),
- mock.call(),
mock.call(expected_task_state='block_device_mapping')])
mock_notify.assert_has_calls([
mock.call(self.context, self.instance, 'create.start',
@@ -7811,11 +7816,12 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
'_shutdown_instance'),
mock.patch.object(self.compute,
'_validate_instance_group_policy'),
+ mock.patch.object(self.compute.rt, 'instance_claim'),
mock.patch('nova.compute.utils.notify_about_instance_create')
) as (spawn, save,
_build_networks_for_instance, _notify_about_instance_usage,
_shutdown_instance, _validate_instance_group_policy,
- mock_notify):
+ mock_claim, mock_notify):
self.assertRaises(exception.BuildAbortException,
self.compute._build_and_run_instance, self.context,
@@ -7846,7 +7852,6 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
save.assert_has_calls([
mock.call(),
- mock.call(),
mock.call(
expected_task_state=task_states.BLOCK_DEVICE_MAPPING)])
@@ -7908,11 +7913,12 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
request_spec={}, host_lists=[fake_host_list])
mock_nil.assert_called_once_with(self.instance)
+ @mock.patch('nova.compute.resource_tracker.ResourceTracker.instance_claim')
@mock.patch.object(manager.ComputeManager, '_build_resources')
@mock.patch.object(objects.Instance, 'save')
@mock.patch.object(manager.ComputeManager, '_notify_about_instance_usage')
def test_build_resources_buildabort_reraise(self, mock_notify, mock_save,
- mock_build):
+ mock_build, mock_claim):
exc = exception.BuildAbortException(
instance_uuid=self.instance.uuid, reason='')
mock_build.side_effect = exc
@@ -7926,7 +7932,6 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
self.node, self.limits, self.filter_properties,
request_spec=[], accel_uuids=self.accel_uuids)
- mock_save.assert_called_once_with()
mock_notify.assert_has_calls([
mock.call(self.context, self.instance, 'create.start',
extra_usage_info={'image_name': self.image.get('name')}),
@@ -8581,10 +8586,11 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
ctxt, instance, req_networks)
warning_mock.assert_not_called()
+ @mock.patch('nova.compute.resource_tracker.ResourceTracker.instance_claim')
@mock.patch('nova.compute.utils.notify_about_instance_create')
@mock.patch.object(manager.ComputeManager, '_instance_update')
def test_launched_at_in_create_end_notification(self,
- mock_instance_update, mock_notify_instance_create):
+ mock_instance_update, mock_notify_instance_create, mock_claim):
def fake_notify(*args, **kwargs):
if args[2] == 'create.end':
@@ -8624,6 +8630,7 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
self.flags(default_access_ip_network_name='test1')
instance = fake_instance.fake_db_instance()
+ @mock.patch.object(self.compute.rt, 'instance_claim')
@mock.patch.object(db, 'instance_update_and_get_original',
return_value=({}, instance))
@mock.patch.object(self.compute.driver, 'spawn')
@@ -8632,7 +8639,7 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
@mock.patch.object(db, 'instance_extra_update_by_uuid')
@mock.patch.object(self.compute, '_notify_about_instance_usage')
def _check_access_ip(mock_notify, mock_extra, mock_networks,
- mock_spawn, mock_db_update):
+ mock_spawn, mock_db_update, mock_claim):
self.compute._build_and_run_instance(self.context, self.instance,
self.image, self.injected_files, self.admin_pass,
self.requested_networks, self.security_groups,
@@ -8653,8 +8660,10 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
_check_access_ip()
+ @mock.patch('nova.compute.resource_tracker.ResourceTracker.instance_claim')
@mock.patch.object(manager.ComputeManager, '_instance_update')
- def test_create_error_on_instance_delete(self, mock_instance_update):
+ def test_create_error_on_instance_delete(self, mock_instance_update,
+ mock_claim):
def fake_notify(*args, **kwargs):
if args[2] == 'create.error':
@@ -8668,7 +8677,7 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
mock.patch.object(self.compute,
'_build_networks_for_instance', return_value=[]),
mock.patch.object(self.instance, 'save',
- side_effect=[None, None, None, exc]),
+ side_effect=[None, None, exc]),
mock.patch.object(self.compute, '_notify_about_instance_usage',
side_effect=fake_notify)
) as (mock_spawn, mock_networks, mock_save, mock_notify):
@@ -8697,7 +8706,8 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
mock.patch.object(
self.compute, '_build_networks_for_instance', return_value=[]),
mock.patch.object(self.instance, 'save'),
- ) as (mock_spawn, mock_networks, mock_save):
+ mock.patch.object(self.compute.rt, 'instance_claim'),
+ ) as (mock_spawn, mock_networks, mock_save, mock_claim):
self.compute._build_and_run_instance(
self.context,
self.instance, self.image, self.injected_files,
@@ -8747,7 +8757,8 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
mock.patch.object(self.instance, 'save'),
mock.patch('nova.scheduler.client.report.'
'SchedulerReportClient._get_resource_provider'),
- ) as (mock_spawn, mock_networks, mock_save, mock_get_rp):
+ mock.patch.object(self.compute.rt, 'instance_claim'),
+ ) as (mock_spawn, mock_networks, mock_save, mock_get_rp, mock_claim):
mock_get_rp.return_value = {
'uuid': uuids.rp1,
'name': 'compute1:sriov-agent:ens3'
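Nearly all of the churn above follows one pattern: these tests drive
_build_and_run_instance() with a node name the fake resource tracker
never tracked, so they used to slide through the silent NopClaim path.
Each now stubs out the claim instead; a representative sketch (the test
name and body are illustrative):

    @mock.patch('nova.compute.resource_tracker.ResourceTracker.'
                'instance_claim')
    def test_build_with_stubbed_claim(self, mock_claim):
        # With instance_claim mocked, the build proceeds as before
        # instead of raising ComputeResourcesUnavailable for the
        # untracked node.
        self.compute._build_and_run_instance(
            self.context, self.instance, self.image, self.injected_files,
            self.admin_pass, self.requested_networks,
            self.security_groups, self.block_device_mapping, self.node,
            self.limits, self.filter_properties, request_spec={},
            accel_uuids=self.accel_uuids)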
diff --git a/nova/tests/unit/compute/test_resource_tracker.py b/nova/tests/unit/compute/test_resource_tracker.py
index cd36b8987f..b6770cb5b8 100644
--- a/nova/tests/unit/compute/test_resource_tracker.py
+++ b/nova/tests/unit/compute/test_resource_tracker.py
@@ -2231,14 +2231,19 @@ class TestInstanceClaim(BaseTestCase):
self.rt.compute_nodes = {}
self.assertTrue(self.rt.disabled(_NODENAME))
- with mock.patch.object(self.instance, 'save'):
- claim = self.rt.instance_claim(mock.sentinel.ctx, self.instance,
- _NODENAME, self.allocations, None)
+ # Reset all changes to the instance to make sure that we can detect
+ # any manipulation after the failure.
+ self.instance.obj_reset_changes(recursive=True)
- self.assertEqual(self.rt.host, self.instance.host)
- self.assertEqual(self.rt.host, self.instance.launched_on)
- self.assertEqual(_NODENAME, self.instance.node)
- self.assertIsInstance(claim, claims.NopClaim)
+ with mock.patch.object(self.instance, 'save') as mock_save:
+ self.assertRaises(exc.ComputeResourcesUnavailable,
+ self.rt.instance_claim,
+ mock.sentinel.ctx, self.instance,
+ _NODENAME, self.allocations, None)
+ mock_save.assert_not_called()
+
+ # Make sure the instance was not touched by the failed claim process
+ self.assertEqual(set(), self.instance.obj_what_changed())
@mock.patch('nova.compute.utils.is_volume_backed_instance')
@mock.patch('nova.objects.MigrationList.get_in_progress_and_error')
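The rewritten test above leans on the oslo.versionedobjects
change-tracking API to prove a failed claim leaves the instance
untouched. A minimal sketch of that pattern, assuming an rt and instance
set up as in the test:

    instance.obj_reset_changes(recursive=True)  # clear the dirty-field set
    try:
        rt.instance_claim(ctx, instance, nodename, allocations, None)
    except exc.ComputeResourcesUnavailable:
        pass  # expected: the node is not tracked
    # obj_what_changed() returns the set of fields modified since the reset
    assert instance.obj_what_changed() == set()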
diff --git a/nova/tests/unit/compute/test_shelve.py b/nova/tests/unit/compute/test_shelve.py
index 0a1e3f54fc..62321bddec 100644
--- a/nova/tests/unit/compute/test_shelve.py
+++ b/nova/tests/unit/compute/test_shelve.py
@@ -646,7 +646,7 @@ class ShelveComputeManagerTestCase(test_compute.BaseTestCase):
self.compute.unshelve_instance(
self.context, instance, image=None,
- filter_properties={}, node='fake-node', request_spec=request_spec,
+ filter_properties={}, node='fakenode2', request_spec=request_spec,
accel_uuids=[])
mock_update_pci.assert_called_once_with(
@@ -700,7 +700,7 @@ class ShelveComputeManagerTestCase(test_compute.BaseTestCase):
self.assertRaises(test.TestingException,
self.compute.unshelve_instance, self.context, instance,
image=shelved_image, filter_properties={},
- node='fake-node', request_spec=fake_spec, accel_uuids=[])
+ node='fakenode2', request_spec=fake_spec, accel_uuids=[])
self.assertEqual(instance.image_ref, initial_image_ref)
@mock.patch.object(objects.InstanceList, 'get_by_filters')