author    Balazs Gibizer <balazs.gibizer@est.tech>  2021-09-23 20:05:45 +0200
committer Balazs Gibizer <balazs.gibizer@est.tech>  2021-09-24 15:52:21 +0200
commit    0b1fa9b4ae01114299c5225d66e1f6eba25be43e (patch)
tree      56db1d9fab962946cde8de8712021eab620ec8f3
parent    7f00f7be226511840747643919d167c97a021bea (diff)
download  nova-0b1fa9b4ae01114299c5225d66e1f6eba25be43e.tar.gz
Reproduce bug 1944759
Add functional tests to reproduce the race between resize_instance()
and update_available_resources().

Related-Bug: #1944759
Change-Id: Icb7e3379248fe00f9a94f9860181b5de44902379
(cherry picked from commit 3e4e4489b7a6e9cdefcc6ff02ed99a0a70420fca)
(cherry picked from commit e6c6880465824f1e327a54143f32bb5a5816ff6c)
(cherry picked from commit 140ae45d98dabd30aef5c0ac075346de4eabcea1)
-rw-r--r--  nova/tests/functional/libvirt/test_numa_servers.py  121
1 file changed, 121 insertions, 0 deletions
diff --git a/nova/tests/functional/libvirt/test_numa_servers.py b/nova/tests/functional/libvirt/test_numa_servers.py
index 28f8463aea..90afeb763c 100644
--- a/nova/tests/functional/libvirt/test_numa_servers.py
+++ b/nova/tests/functional/libvirt/test_numa_servers.py
@@ -711,6 +711,127 @@ class NUMAServersTest(NUMAServersTestBase):
server = self._wait_for_state_change(server, 'ACTIVE')
+ def _assert_pinned_cpus(self, hostname, expected_number_of_pinned):
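+ # Helper: load the compute node's NUMA topology from the DB and check
+ # how many CPUs are tracked as pinned in its first cell.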
+ numa_topology = objects.NUMATopology.obj_from_db_obj(
+ objects.ComputeNode.get_by_nodename(
+ self.ctxt, hostname,
+ ).numa_topology,
+ )
+ self.assertEqual(
+ expected_number_of_pinned, len(numa_topology.cells[0].pinned_cpus))
+
+ def _create_server_and_resize_bug_1944759(self):
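+ # Boot a server with 2 dedicated (pinned) CPUs, then resize it to a
+ # similar pinned flavor while injecting an update_available_resource
+ # periodic run into the finish_resize RPC call, reproducing the race
+ # from bug 1944759. Returns the server and the source / destination
+ # host names.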
+ self.flags(
+ cpu_dedicated_set='0-3', cpu_shared_set='4-7', group='compute')
+ self.flags(vcpu_pin_set=None)
+
+ # start services
+ self.start_compute(hostname='test_compute0')
+ self.start_compute(hostname='test_compute1')
+
+ flavor_a_id = self._create_flavor(
+ vcpu=2, extra_spec={'hw:cpu_policy': 'dedicated'})
+ server = self._create_server(flavor_id=flavor_a_id)
+
+ src_host = server['OS-EXT-SRV-ATTR:host']
+ self._assert_pinned_cpus(src_host, 2)
+
+ # we don't really care what the new flavor is, so long as the old
+ # flavor is using pinning. We use a similar flavor for simplicity.
+ flavor_b_id = self._create_flavor(
+ vcpu=2, extra_spec={'hw:cpu_policy': 'dedicated'})
+
+ orig_rpc_finish_resize = nova.compute.rpcapi.ComputeAPI.finish_resize
+
+ # Simulate that the finish_resize call overlaps with an
+ # update_available_resource periodic job
+ def inject_periodic_to_finish_resize(*args, **kwargs):
+ self._run_periodics()
+ return orig_rpc_finish_resize(*args, **kwargs)
+
+ self.stub_out(
+ 'nova.compute.rpcapi.ComputeAPI.finish_resize',
+ inject_periodic_to_finish_resize,
+ )
+
+ # TODO(stephenfin): The mock of 'migrate_disk_and_power_off' should
+ # probably be less...dumb
+ with mock.patch(
+ 'nova.virt.libvirt.driver.LibvirtDriver'
+ '.migrate_disk_and_power_off', return_value='{}',
+ ):
+ post = {'resize': {'flavorRef': flavor_b_id}}
+ self.api.post_server_action(server['id'], post)
+ server = self._wait_for_state_change(server, 'VERIFY_RESIZE')
+
+ dst_host = server['OS-EXT-SRV-ATTR:host']
+
+ # This is a resource accounting bug: we should have 2 CPUs pinned on
+ # both computes. The source should have them due to the outbound
+ # migration and the destination due to the instance running there.
+ self._assert_pinned_cpus(src_host, 0)
+ self._assert_pinned_cpus(dst_host, 2)
+
+ return server, src_host, dst_host
+
+ def test_resize_confirm_bug_1944759(self):
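+ # Reproduce the race and then confirm the resize; the confirm fails
+ # with CPUUnpinningInvalid because the source allocation was already
+ # freed by the injected periodic job.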
+ server, src_host, dst_host = (
+ self._create_server_and_resize_bug_1944759())
+
+ # Now confirm the resize
+ post = {'confirmResize': None}
+
+ # FIXME(gibi): This is bug 1944759: during resize, the resize_instance()
+ # call on the source node, at the point where it calls finish_resize,
+ # overlaps with an update_available_resources() periodic job. As a
+ # result the periodic job tracks neither the migration nor the
+ # instance, and therefore frees the resource allocation. When the
+ # resize is later confirmed, confirm_resize on the source compute also
+ # tries to free up the resources, the pinned CPUs, and it fails as
+ # they are already freed.
+ exc = self.assertRaises(
+ client.OpenStackApiException,
+ self.api.post_server_action, server['id'], post
+ )
+ self.assertEqual(500, exc.response.status_code)
+ self.assertIn('CPUUnpinningInvalid', str(exc))
+
+ # confirm failed above but the resource allocation reflects that the
+ # VM is running on the dest node
+ self._assert_pinned_cpus(src_host, 0)
+ self._assert_pinned_cpus(dst_host, 2)
+
+ self._run_periodics()
+
+ # and this allocation situation is stable, so as a recovery the VM can
+ # be reset-state'd to ACTIVE without problem.
+ self._assert_pinned_cpus(src_host, 0)
+ self._assert_pinned_cpus(dst_host, 2)
+
+ def test_resize_revert_bug_1944759(self):
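+ # Reproduce the race and then revert the resize; the revert succeeds
+ # but leaves no CPUs pinned on either host until the periodic job
+ # corrects the accounting.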
+ server, src_host, dst_host = (
+ self._create_server_and_resize_bug_1944759())
+
+ # Now revert the resize
+ post = {'revertResize': None}
+
+ # the revert actually succeeds (unlike confirm), but the resource
+ # accounting is still wrong
+ self.api.post_server_action(server['id'], post)
+ self._wait_for_state_change(server, 'ACTIVE')
+
+ # This is a resource accounting bug. After the revert the source host
+ # should have 2 CPUs pinned due to the instance running there, but it
+ # has none.
+ self._assert_pinned_cpus(src_host, 0)
+ self._assert_pinned_cpus(dst_host, 0)
+
+ # running the periodic job will fix the resource accounting
+ self._run_periodics()
+
+ # this is now correct
+ self._assert_pinned_cpus(src_host, 2)
+ self._assert_pinned_cpus(dst_host, 0)
+
class NUMAServerTestWithCountingQuotaFromPlacement(NUMAServersTest):