summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJenkins <jenkins@review.openstack.org>2017-08-25 02:58:09 +0000
committerGerrit Code Review <review@openstack.org>2017-08-25 02:58:09 +0000
commit8b162ba21f06272f915070473640d493dd70beec (patch)
tree83966c1696cced0a853f624aa353cbff4ea26710
parentd79c4f91386430dd07da44d1805edfb43c940949 (diff)
parent1dafea91bf6070c414c54ec0c26ebb4b6cb096c0 (diff)
downloadnova-16.0.0.tar.gz
Merge "Cleanup allocations in failed prep_resize" into stable/pike16.0.0.0rc216.0.0
-rw-r--r--nova/compute/manager.py18
-rw-r--r--nova/compute/resource_tracker.py24
-rw-r--r--nova/scheduler/client/report.py14
-rw-r--r--nova/tests/functional/test_servers.py105
4 files changed, 122 insertions, 39 deletions
diff --git a/nova/compute/manager.py b/nova/compute/manager.py
index ee4a1abd4b..0d98f33692 100644
--- a/nova/compute/manager.py
+++ b/nova/compute/manager.py
@@ -3786,6 +3786,7 @@ class ComputeManager(manager.Manager):
current_period=True)
self._notify_about_instance_usage(
context, instance, "resize.prep.start")
+ failed = False
try:
self._prep_resize(context, image, instance,
instance_type, filter_properties,
@@ -3794,14 +3795,31 @@ class ComputeManager(manager.Manager):
# instance to be migrated is backed by LVM.
# Remove when LVM migration is implemented.
except exception.MigrationPreCheckError:
+ # TODO(mriedem): How is it even possible to get here?
+ # _prep_resize does not call the driver. The resize_instance
+ # method does, but we RPC cast to the source node to do that
+ # so we shouldn't even get this exception...
+ failed = True
raise
except Exception:
+ failed = True
# try to re-schedule the resize elsewhere:
exc_info = sys.exc_info()
self._reschedule_resize_or_reraise(context, image, instance,
exc_info, instance_type, request_spec,
filter_properties)
finally:
+ if failed:
+ # Since we hit a failure, we're either rescheduling or dead
+ # and either way we need to cleanup any allocations created
+ # by the scheduler for the destination node. Note that for
+ # a resize to the same host, the scheduler will merge the
+ # flavors, so here we'd be subtracting the new flavor from
+ # the allocated resources on this node.
+ rt = self._get_resource_tracker()
+ rt.delete_allocation_for_failed_resize(
+ instance, node, instance_type)
+
extra_usage_info = dict(
new_instance_type=instance_type.name,
new_instance_type_id=instance_type.id)
diff --git a/nova/compute/resource_tracker.py b/nova/compute/resource_tracker.py
index 437c765d2e..41a8047f02 100644
--- a/nova/compute/resource_tracker.py
+++ b/nova/compute/resource_tracker.py
@@ -1237,6 +1237,30 @@ class ResourceTracker(object):
'on the source node %s', cn.uuid,
instance=instance)
+ def delete_allocation_for_failed_resize(self, instance, node, flavor):
+ """Delete instance allocations for the node during a failed resize
+
+ :param instance: The instance being resized/migrated.
+ :param node: The node provider on which the instance should have
+ allocations to remove. If this is a resize to the same host, then
+ the new_flavor resources are subtracted from the single allocation.
+ :param flavor: This is the new_flavor during a resize.
+ """
+ resources = scheduler_utils.resources_from_flavor(instance, flavor)
+ cn = self.compute_nodes[node]
+ res = self.reportclient.remove_provider_from_instance_allocation(
+ instance.uuid, cn.uuid, instance.user_id, instance.project_id,
+ resources)
+ if not res:
+ if instance.instance_type_id == flavor.id:
+ operation = 'migration'
+ else:
+ operation = 'resize'
+ LOG.error('Failed to clean allocation after a failed '
+ '%(operation)s on node %(node)s',
+ {'operation': operation, 'node': cn.uuid},
+ instance=instance)
+
def _find_orphaned_instances(self):
"""Given the set of instances and migrations already account for
by resource tracker, sanity check the hypervisor to determine
diff --git a/nova/scheduler/client/report.py b/nova/scheduler/client/report.py
index dcbd59a74e..08b5235188 100644
--- a/nova/scheduler/client/report.py
+++ b/nova/scheduler/client/report.py
@@ -1014,11 +1014,15 @@ class SchedulerReportClient(object):
then PUTs the resulting allocation back to the placement API for the
consumer.
- We call this method on two occasions: on the source host during a
- confirm_resize() operation and on the destination host during a
- revert_resize() operation. This is important to reconcile the
- "doubled-up" allocation that the scheduler constructs when claiming
- resources against the destination host during a move operation.
+ We call this method on three occasions:
+
+ 1. On the source host during a confirm_resize() operation.
+ 2. On the destination host during a revert_resize() operation.
+ 3. On the destination host when prep_resize fails.
+
+ This is important to reconcile the "doubled-up" allocation that the
+ scheduler constructs when claiming resources against the destination
+ host during a move operation.
If the move was between hosts, the entire allocation for rp_uuid will
be dropped. If the move is a resize on the same host, then we will
diff --git a/nova/tests/functional/test_servers.py b/nova/tests/functional/test_servers.py
index d932906bd5..1e9efb1b56 100644
--- a/nova/tests/functional/test_servers.py
+++ b/nova/tests/functional/test_servers.py
@@ -1953,39 +1953,17 @@ class ServerMovingTests(test.TestCase, integrated_helpers.InstanceHelperMixin):
self.assertFlavorMatchesAllocation(
{'vcpus': 0, 'ram': 0, 'disk': 0}, failed_usages)
- def test_rescheduling_when_migrating_instance(self):
- """Tests that allocations are removed from the destination node by
- the compute service when a cold migrate / resize fails and a reschedule
- request is sent back to conductor.
+ def _wait_for_prep_resize_fail_completion(self, server, expected_action):
+ """Polls instance action events for the given instance and action
+ until it finds the compute_prep_resize action event with an error
+ result.
"""
- source_hostname = self.compute1.manager.host
- server = self._boot_and_check_allocations(
- self.flavor1, source_hostname)
-
- def fake_prep_resize(*args, **kwargs):
- raise test.TestingException('Simulated _prep_resize failure.')
-
- # Yes this isn't great in a functional test, but it's simple.
- self.stub_out('nova.compute.manager.ComputeManager._prep_resize',
- fake_prep_resize)
-
- # Now migrate the server which is going to fail on the destination.
- self.api.post_server_action(server['id'], {'migrate': None})
-
- # When the destination compute calls conductor and conductor calls
- # the scheduler to get another host, it's going to fail since there
- # are only two hosts and we started on the first and failed on the
- # second, so the scheduler will raise NoValidHost and a fault will
- # be recorded on the instance. However, the instance is not put into
- # ERROR state since it's still OK on the source host, but faults are
- # only shown in the API for ERROR/DELETED instances, so we can't poll
- # for the fault. So we poll for instance action events instead.
completion_event = None
for attempt in range(10):
actions = self.api.get_instance_actions(server['id'])
# Look for the migrate action.
for action in actions:
- if action['action'] == 'migrate':
+ if action['action'] == expected_action:
events = (
self.api.api_get(
'/servers/%s/os-instance-actions/%s' %
@@ -2009,19 +1987,78 @@ class ServerMovingTests(test.TestCase, integrated_helpers.InstanceHelperMixin):
self.fail('Timed out waiting for compute_prep_resize failure '
'event. Current instance actions: %s' % actions)
+ def test_rescheduling_when_migrating_instance(self):
+ """Tests that allocations are removed from the destination node by
+ the compute service when a cold migrate / resize fails and a reschedule
+ request is sent back to conductor.
+ """
+ source_hostname = self.compute1.manager.host
+ server = self._boot_and_check_allocations(
+ self.flavor1, source_hostname)
+
+ def fake_prep_resize(*args, **kwargs):
+ raise test.TestingException('Simulated _prep_resize failure.')
+
+ # Yes this isn't great in a functional test, but it's simple.
+ self.stub_out('nova.compute.manager.ComputeManager._prep_resize',
+ fake_prep_resize)
+
+ # Now migrate the server which is going to fail on the destination.
+ self.api.post_server_action(server['id'], {'migrate': None})
+
+ self._wait_for_prep_resize_fail_completion(server, 'migrate')
+
dest_hostname = self._other_hostname(source_hostname)
dest_rp_uuid = self._get_provider_uuid_by_host(dest_hostname)
failed_usages = self._get_provider_usages(dest_rp_uuid)
- # FIXME(mriedem): Due to bug 1712850 the allocations are not removed
- # from the destination node before the reschedule. Remove this once
- # the bug is fixed as it should be:
- # # Expects no allocation records on the failed host.
- # self.assertFlavorMatchesAllocation(
- # {'vcpus': 0, 'ram': 0, 'disk': 0}, failed_usages)
- self.assertFlavorMatchesAllocation(self.flavor1, failed_usages)
+ # Expects no allocation records on the failed host.
+ self.assertFlavorMatchesAllocation(
+ {'vcpus': 0, 'ram': 0, 'disk': 0}, failed_usages)
# Ensure the allocation records still exist on the source host.
source_rp_uuid = self._get_provider_uuid_by_host(source_hostname)
source_usages = self._get_provider_usages(source_rp_uuid)
self.assertFlavorMatchesAllocation(self.flavor1, source_usages)
+
+ def test_resize_to_same_host_prep_resize_fails(self):
+ """Tests that when we resize to the same host and resize fails in
+ the prep_resize method, we cleanup the allocations before rescheduling.
+ """
+ # make sure that the test only uses a single host
+ compute2_service_id = self.admin_api.get_services(
+ host=self.compute2.host, binary='nova-compute')[0]['id']
+ self.admin_api.put_service(compute2_service_id, {'status': 'disabled'})
+
+ hostname = self.compute1.manager.host
+ rp_uuid = self._get_provider_uuid_by_host(hostname)
+
+ server = self._boot_and_check_allocations(self.flavor1, hostname)
+
+ def fake_prep_resize(*args, **kwargs):
+ # Ensure the allocations are doubled now before we fail.
+ usages = self._get_provider_usages(rp_uuid)
+ self.assertFlavorsMatchAllocation(
+ self.flavor1, self.flavor2, usages)
+ raise test.TestingException('Simulated _prep_resize failure.')
+
+ # Yes this isn't great in a functional test, but it's simple.
+ self.stub_out('nova.compute.manager.ComputeManager._prep_resize',
+ fake_prep_resize)
+
+ self.flags(allow_resize_to_same_host=True)
+ resize_req = {
+ 'resize': {
+ 'flavorRef': self.flavor2['id']
+ }
+ }
+ self.api.post_server_action(server['id'], resize_req)
+
+ self._wait_for_prep_resize_fail_completion(server, 'resize')
+
+ # Ensure the allocation records still exist on the host.
+ source_rp_uuid = self._get_provider_uuid_by_host(hostname)
+ source_usages = self._get_provider_usages(source_rp_uuid)
+ # The new_flavor should have been subtracted from the doubled
+ # allocation which just leaves us with the original flavor.
+ self.assertFlavorMatchesAllocation(self.flavor1, source_usages)