summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorZuul <zuul@review.opendev.org>2023-04-26 17:01:42 +0000
committerGerrit Code Review <review@openstack.org>2023-04-26 17:01:42 +0000
commit66f085f946752f7f84804efa22f739a7119a6efc (patch)
tree35543b6d2c1da86c07d553d5bf09d4a4c7148768
parent316e3e0a3b24fcb0597fd3193ec4d8f6fc0abc44 (diff)
parent82deb0ce4be588d675882a28ef7cacb97d8ffd1b (diff)
downloadnova-66f085f946752f7f84804efa22f739a7119a6efc.tar.gz
Merge "Stop ignoring missing compute nodes in claims"
-rw-r--r--nova/compute/resource_tracker.py38
-rw-r--r--nova/tests/unit/compute/test_compute_mgr.py43
-rw-r--r--nova/tests/unit/compute/test_resource_tracker.py19
-rw-r--r--nova/tests/unit/compute/test_shelve.py4
4 files changed, 66 insertions, 38 deletions
diff --git a/nova/compute/resource_tracker.py b/nova/compute/resource_tracker.py
index 3f911f3708..9ee6670c17 100644
--- a/nova/compute/resource_tracker.py
+++ b/nova/compute/resource_tracker.py
@@ -146,16 +146,20 @@ class ResourceTracker(object):
during the instance build.
"""
if self.disabled(nodename):
- # instance_claim() was called before update_available_resource()
- # (which ensures that a compute node exists for nodename). We
- # shouldn't get here but in case we do, just set the instance's
- # host and nodename attribute (probably incorrect) and return a
- # NoopClaim.
- # TODO(jaypipes): Remove all the disabled junk from the resource
- # tracker. Servicegroup API-level active-checking belongs in the
- # nova-compute manager.
- self._set_instance_host_and_node(instance, nodename)
- return claims.NopClaim()
+ # If we get here, it means we are trying to claim for an instance
+ # that was scheduled to a node that we do not have in our list,
+ # or is in some other way unmanageable by this node. This would
+ # mean that we are unable to account for resources, create
+ # allocations in placement, or do any of the other accounting
+ # necessary for this to work. In the past, this situation was
+ # effectively ignored silently, but in a world where we track
+ # resources with placement and instance assignment to compute nodes
+ # by service, we can no longer be leaky.
+ raise exception.ComputeResourcesUnavailable(
+ ('Attempt to claim resources for instance %(inst)s '
+ 'on unknown node %(node)s failed') % {
+ 'inst': instance.uuid,
+ 'node': nodename})
# sanity checks:
if instance.host:
@@ -280,9 +284,17 @@ class ResourceTracker(object):
context, instance, new_flavor, nodename, move_type)
if self.disabled(nodename):
- # compute_driver doesn't support resource tracking, just
- # generate the migration record and continue the resize:
- return claims.NopClaim(migration=migration)
+ # This means we were asked to accept an incoming migration to a
+ # node that we do not own or track. We really should not get here,
+ # but if we do, we must refuse to continue with the migration
+ # process, since we cannot account for those resources, create
+ # allocations in placement, etc. This has been a silent resource
+ # leak in the past, but it must be a hard failure now.
+ raise exception.ComputeResourcesUnavailable(
+ ('Attempt to claim move resources for instance %(inst)s on '
+ 'unknown node %(node)s failed') % {
+ 'inst': instance.uuid,
+ 'node': 'nodename'})
cn = self.compute_nodes[nodename]
diff --git a/nova/tests/unit/compute/test_compute_mgr.py b/nova/tests/unit/compute/test_compute_mgr.py
index 1c69cd8f1c..73c9d32197 100644
--- a/nova/tests/unit/compute/test_compute_mgr.py
+++ b/nova/tests/unit/compute/test_compute_mgr.py
@@ -2560,10 +2560,11 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase,
self.assertFalse(mock_get_info.called)
self.assertFalse(mock_sync_power_state.called)
+ @mock.patch('nova.compute.resource_tracker.ResourceTracker.instance_claim')
@mock.patch('nova.compute.manager.ComputeManager.'
'_sync_instance_power_state')
def test_query_driver_power_state_and_sync_not_found_driver(
- self, mock_sync_power_state):
+ self, mock_sync_power_state, mock_claim):
error = exception.InstanceNotFound(instance_id=1)
with mock.patch.object(self.compute.driver,
'get_info', side_effect=error) as mock_get_info:
@@ -6568,6 +6569,8 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
fake_rt = fake_resource_tracker.FakeResourceTracker(self.compute.host,
self.compute.driver)
self.compute.rt = fake_rt
+ self.compute.driver._set_nodes([self.node])
+ self.compute.rt.compute_nodes = {self.node: objects.ComputeNode()}
self.allocations = {
uuids.provider1: {
@@ -6857,6 +6860,7 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
mock_get_arqs.assert_called_once_with(
self.instance.uuid, only_resolved=True)
+ @mock.patch('nova.compute.resource_tracker.ResourceTracker.instance_claim')
@mock.patch.object(fake_driver.FakeDriver, 'spawn')
@mock.patch('nova.objects.Instance.save')
@mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
@@ -6868,7 +6872,7 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
@mock.patch.object(manager.ComputeManager, '_notify_about_instance_usage')
def test_spawn_called_with_accel_info(self, mock_ins_usage,
mock_ins_create, mock_dev_tag, mock_certs, mock_req_group_map,
- mock_get_allocations, mock_ins_save, mock_spawn):
+ mock_get_allocations, mock_ins_save, mock_spawn, mock_claim):
accel_info = [{'k1': 'v1', 'k2': 'v2'}]
@@ -7142,13 +7146,15 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
self.security_groups, self.block_device_mapping,
request_spec={}, host_lists=[fake_host_list])
+ @mock.patch('nova.compute.resource_tracker.ResourceTracker.instance_claim')
@mock.patch.object(manager.ComputeManager, '_shutdown_instance')
@mock.patch.object(manager.ComputeManager, '_build_networks_for_instance')
@mock.patch.object(fake_driver.FakeDriver, 'spawn')
@mock.patch.object(objects.Instance, 'save')
@mock.patch.object(manager.ComputeManager, '_notify_about_instance_usage')
def test_rescheduled_exception_with_non_ascii_exception(self,
- mock_notify, mock_save, mock_spawn, mock_build, mock_shutdown):
+ mock_notify, mock_save, mock_spawn, mock_build, mock_shutdown,
+ mock_claim):
exc = exception.NovaException(u's\xe9quence')
mock_build.return_value = self.network_info
@@ -7164,7 +7170,6 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
self.accel_uuids)
mock_save.assert_has_calls([
mock.call(),
- mock.call(),
mock.call(expected_task_state='block_device_mapping'),
])
mock_notify.assert_has_calls([
@@ -7670,6 +7675,7 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
self.assertEqual(10, mock_failed.call_count)
mock_succeeded.assert_not_called()
+ @mock.patch('nova.compute.resource_tracker.ResourceTracker.instance_claim')
@mock.patch.object(manager.ComputeManager, '_shutdown_instance')
@mock.patch.object(manager.ComputeManager, '_build_networks_for_instance')
@mock.patch.object(fake_driver.FakeDriver, 'spawn')
@@ -7677,7 +7683,7 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
@mock.patch.object(manager.ComputeManager, '_notify_about_instance_usage')
def _test_instance_exception(self, exc, raised_exc,
mock_notify, mock_save, mock_spawn,
- mock_build, mock_shutdown):
+ mock_build, mock_shutdown, mock_claim):
"""This method test the instance related InstanceNotFound
and reschedule on exception errors. The test cases get from
arguments.
@@ -7700,7 +7706,6 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
mock_save.assert_has_calls([
mock.call(),
- mock.call(),
mock.call(expected_task_state='block_device_mapping')])
mock_notify.assert_has_calls([
mock.call(self.context, self.instance, 'create.start',
@@ -7811,11 +7816,12 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
'_shutdown_instance'),
mock.patch.object(self.compute,
'_validate_instance_group_policy'),
+ mock.patch.object(self.compute.rt, 'instance_claim'),
mock.patch('nova.compute.utils.notify_about_instance_create')
) as (spawn, save,
_build_networks_for_instance, _notify_about_instance_usage,
_shutdown_instance, _validate_instance_group_policy,
- mock_notify):
+ mock_claim, mock_notify):
self.assertRaises(exception.BuildAbortException,
self.compute._build_and_run_instance, self.context,
@@ -7846,7 +7852,6 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
save.assert_has_calls([
mock.call(),
- mock.call(),
mock.call(
expected_task_state=task_states.BLOCK_DEVICE_MAPPING)])
@@ -7908,11 +7913,12 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
request_spec={}, host_lists=[fake_host_list])
mock_nil.assert_called_once_with(self.instance)
+ @mock.patch('nova.compute.resource_tracker.ResourceTracker.instance_claim')
@mock.patch.object(manager.ComputeManager, '_build_resources')
@mock.patch.object(objects.Instance, 'save')
@mock.patch.object(manager.ComputeManager, '_notify_about_instance_usage')
def test_build_resources_buildabort_reraise(self, mock_notify, mock_save,
- mock_build):
+ mock_build, mock_claim):
exc = exception.BuildAbortException(
instance_uuid=self.instance.uuid, reason='')
mock_build.side_effect = exc
@@ -7926,7 +7932,6 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
self.node, self.limits, self.filter_properties,
request_spec=[], accel_uuids=self.accel_uuids)
- mock_save.assert_called_once_with()
mock_notify.assert_has_calls([
mock.call(self.context, self.instance, 'create.start',
extra_usage_info={'image_name': self.image.get('name')}),
@@ -8581,10 +8586,11 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
ctxt, instance, req_networks)
warning_mock.assert_not_called()
+ @mock.patch('nova.compute.resource_tracker.ResourceTracker.instance_claim')
@mock.patch('nova.compute.utils.notify_about_instance_create')
@mock.patch.object(manager.ComputeManager, '_instance_update')
def test_launched_at_in_create_end_notification(self,
- mock_instance_update, mock_notify_instance_create):
+ mock_instance_update, mock_notify_instance_create, mock_claim):
def fake_notify(*args, **kwargs):
if args[2] == 'create.end':
@@ -8624,6 +8630,7 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
self.flags(default_access_ip_network_name='test1')
instance = fake_instance.fake_db_instance()
+ @mock.patch.object(self.compute.rt, 'instance_claim')
@mock.patch.object(db, 'instance_update_and_get_original',
return_value=({}, instance))
@mock.patch.object(self.compute.driver, 'spawn')
@@ -8632,7 +8639,7 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
@mock.patch.object(db, 'instance_extra_update_by_uuid')
@mock.patch.object(self.compute, '_notify_about_instance_usage')
def _check_access_ip(mock_notify, mock_extra, mock_networks,
- mock_spawn, mock_db_update):
+ mock_spawn, mock_db_update, mock_claim):
self.compute._build_and_run_instance(self.context, self.instance,
self.image, self.injected_files, self.admin_pass,
self.requested_networks, self.security_groups,
@@ -8653,8 +8660,10 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
_check_access_ip()
+ @mock.patch('nova.compute.resource_tracker.ResourceTracker.instance_claim')
@mock.patch.object(manager.ComputeManager, '_instance_update')
- def test_create_error_on_instance_delete(self, mock_instance_update):
+ def test_create_error_on_instance_delete(self, mock_instance_update,
+ mock_claim):
def fake_notify(*args, **kwargs):
if args[2] == 'create.error':
@@ -8668,7 +8677,7 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
mock.patch.object(self.compute,
'_build_networks_for_instance', return_value=[]),
mock.patch.object(self.instance, 'save',
- side_effect=[None, None, None, exc]),
+ side_effect=[None, None, exc]),
mock.patch.object(self.compute, '_notify_about_instance_usage',
side_effect=fake_notify)
) as (mock_spawn, mock_networks, mock_save, mock_notify):
@@ -8697,7 +8706,8 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
mock.patch.object(
self.compute, '_build_networks_for_instance', return_value=[]),
mock.patch.object(self.instance, 'save'),
- ) as (mock_spawn, mock_networks, mock_save):
+ mock.patch.object(self.compute.rt, 'instance_claim'),
+ ) as (mock_spawn, mock_networks, mock_save, mock_claim):
self.compute._build_and_run_instance(
self.context,
self.instance, self.image, self.injected_files,
@@ -8747,7 +8757,8 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
mock.patch.object(self.instance, 'save'),
mock.patch('nova.scheduler.client.report.'
'SchedulerReportClient._get_resource_provider'),
- ) as (mock_spawn, mock_networks, mock_save, mock_get_rp):
+ mock.patch.object(self.compute.rt, 'instance_claim'),
+ ) as (mock_spawn, mock_networks, mock_save, mock_get_rp, mock_claim):
mock_get_rp.return_value = {
'uuid': uuids.rp1,
'name': 'compute1:sriov-agent:ens3'
diff --git a/nova/tests/unit/compute/test_resource_tracker.py b/nova/tests/unit/compute/test_resource_tracker.py
index cd36b8987f..b6770cb5b8 100644
--- a/nova/tests/unit/compute/test_resource_tracker.py
+++ b/nova/tests/unit/compute/test_resource_tracker.py
@@ -2231,14 +2231,19 @@ class TestInstanceClaim(BaseTestCase):
self.rt.compute_nodes = {}
self.assertTrue(self.rt.disabled(_NODENAME))
- with mock.patch.object(self.instance, 'save'):
- claim = self.rt.instance_claim(mock.sentinel.ctx, self.instance,
- _NODENAME, self.allocations, None)
+ # Reset all changes to the instance to make sure that we can detect
+ # any manipulation after the failure.
+ self.instance.obj_reset_changes(recursive=True)
- self.assertEqual(self.rt.host, self.instance.host)
- self.assertEqual(self.rt.host, self.instance.launched_on)
- self.assertEqual(_NODENAME, self.instance.node)
- self.assertIsInstance(claim, claims.NopClaim)
+ with mock.patch.object(self.instance, 'save') as mock_save:
+ self.assertRaises(exc.ComputeResourcesUnavailable,
+ self.rt.instance_claim,
+ mock.sentinel.ctx, self.instance,
+ _NODENAME, self.allocations, None)
+ mock_save.assert_not_called()
+
+ # Make sure the instance was not touched by the failed claim process
+ self.assertEqual(set(), self.instance.obj_what_changed())
@mock.patch('nova.compute.utils.is_volume_backed_instance')
@mock.patch('nova.objects.MigrationList.get_in_progress_and_error')
diff --git a/nova/tests/unit/compute/test_shelve.py b/nova/tests/unit/compute/test_shelve.py
index 0a1e3f54fc..62321bddec 100644
--- a/nova/tests/unit/compute/test_shelve.py
+++ b/nova/tests/unit/compute/test_shelve.py
@@ -646,7 +646,7 @@ class ShelveComputeManagerTestCase(test_compute.BaseTestCase):
self.compute.unshelve_instance(
self.context, instance, image=None,
- filter_properties={}, node='fake-node', request_spec=request_spec,
+ filter_properties={}, node='fakenode2', request_spec=request_spec,
accel_uuids=[])
mock_update_pci.assert_called_once_with(
@@ -700,7 +700,7 @@ class ShelveComputeManagerTestCase(test_compute.BaseTestCase):
self.assertRaises(test.TestingException,
self.compute.unshelve_instance, self.context, instance,
image=shelved_image, filter_properties={},
- node='fake-node', request_spec=fake_spec, accel_uuids=[])
+ node='fakenode2', request_spec=fake_spec, accel_uuids=[])
self.assertEqual(instance.image_ref, initial_image_ref)
@mock.patch.object(objects.InstanceList, 'get_by_filters')