Merge "Resource tracker compatibility with Ocata and Pike"16.0.0.0rc1

author: Jenkins <jenkins@review.openstack.org> 2017-08-11 07:06:48 +0000
committer: Gerrit Code Review <review@openstack.org> 2017-08-11 07:06:48 +0000
commit: 4cd6a3a1b473795a63f076c7986d1b48d83af99d (patch)
tree: 0095c11d7daa078cfbd4c85f81655b311c6683f4
parent: c58d4422d87af8b1e8583972d3eef5c7db327264 (diff)
parent: ddef9dcfc0f7b8f80c6f67f2bfcaf752558a7788 (diff)
download: nova-16.0.0.0rc1.tar.gz
10 files changed, 136 insertions, 43 deletions
diff --git a/doc/notification_samples/service-update.json b/doc/notification_samples/service-update.json
index e080ab9c65..9e7220e73a 100644
--- a/doc/notification_samples/service-update.json
+++ b/doc/notification_samples/service-update.json
@@ -13,7 +13,7 @@
             "disabled_reason": null,
             "report_count": 1,
             "forced_down": false,
-            "version": 21,
+            "version": 22,
             "availability_zone": null,
             "uuid": "fa69c544-906b-4a6a-a9c6-c1f7a8078c73"
         }
diff --git a/nova/cmd/compute.py b/nova/cmd/compute.py
index 53342e2f07..eb48853d45 100644
--- a/nova/cmd/compute.py
+++ b/nova/cmd/compute.py
@@ -52,7 +52,7 @@ def main():
 
     cmd_common.block_db_access('nova-compute')
     objects_base.NovaObject.indirection_api = conductor_rpcapi.ConductorAPI()
-
+    objects.Service.enable_min_version_cache()
     server = service.Service.create(binary='nova-compute',
                                     topic=compute_rpcapi.RPC_TOPIC)
     service.serve(server)
diff --git a/nova/compute/resource_tracker.py b/nova/compute/resource_tracker.py
index 2ab30161ca..994f1ca82f 100644
--- a/nova/compute/resource_tracker.py
+++ b/nova/compute/resource_tracker.py
@@ -1010,7 +1010,7 @@ class ResourceTracker(object):
                 continue
 
     def _update_usage_from_instance(self, context, instance, nodename,
-                                    is_removed=False):
+            is_removed=False, has_ocata_computes=False):
         """Update usage for a single instance."""
 
         uuid = instance['uuid']
@@ -1038,7 +1038,40 @@ class ResourceTracker(object):
                 self.pci_tracker.update_pci_for_instance(context,
                                                          instance,
                                                          sign=sign)
-            self.reportclient.update_instance_allocation(cn, instance, sign)
+            if has_ocata_computes:
+                LOG.debug("We're on a Pike compute host in a deployment "
+                          "with Ocata compute hosts. Auto-correcting "
+                          "allocations to handle Ocata-style assumptions.")
+                self.reportclient.update_instance_allocation(cn, instance,
+                                                             sign)
+            else:
+                # NOTE(jaypipes): We're on a Pike compute host or later in
+                # a deployment with all compute hosts upgraded to Pike or
+                # later
+                #
+                # If that is the case, then we know that the scheduler will
+                # have properly created an allocation and that the compute
+                # hosts have not attempted to overwrite allocations
+                # **during the periodic update_available_resource() call**.
+                # However, Pike compute hosts may still rework an
+                # allocation for an instance in a move operation during
+                # confirm_resize() on the source host which will remove the
+                # source resource provider from any allocation for an
+                # instance.
+                #
+                # In Queens and beyond, the scheduler will understand when
+                # a move operation has been requested and instead of
+                # creating a doubled-up allocation that contains both the
+                # source and destination host, the scheduler will take the
+                # original allocation (against the source host) and change
+                # the consumer ID of that allocation to be the migration
+                # UUID and not the instance UUID. The scheduler will
+                # allocate the resources for the destination host to the
+                # instance UUID.
+                LOG.debug("We're on a Pike compute host in a deployment "
+                          "with all Pike compute hosts. Skipping "
+                          "auto-correction of allocations.")
+
             # new instance, update compute node resource usage:
             self._update_usage(self._get_usage_dict(instance), nodename,
                                sign=sign)
@@ -1068,9 +1101,44 @@ class ResourceTracker(object):
         cn.current_workload = 0
         cn.running_vms = 0
 
+        # NOTE(jaypipes): In Pike, we need to be tolerant of Ocata compute
+        # nodes that overwrite placement allocations to look like what the
+        # resource tracker *thinks* is correct. When an instance is
+        # migrated from an Ocata compute node to a Pike compute node, the
+        # Pike scheduler will have created a "doubled-up" allocation that
+        # contains allocated resources against both the source and
+        # destination hosts. The Ocata source compute host, during its
+        # update_available_resource() periodic call will find the instance
+        # in its list of known instances and will call
+        # update_instance_allocation() in the report client. That call will
+        # pull the allocations for the instance UUID which will contain
+        # both the source and destination host providers in the allocation
+        # set. Seeing that this is different from what the Ocata source
+        # host thinks it should be and will overwrite the allocation to
+        # only be an allocation against itself.
+        #
+        # And therefore, here we need to have Pike compute hosts
+        # "correct" the improper healing that the Ocata source host did
+        # during its periodic interval. When the instance is fully migrated
+        # to the Pike compute host, the Ocata compute host will find an
+        # allocation that refers to itself for an instance it no longer
+        # controls and will *delete* all allocations that refer to that
+        # instance UUID, assuming that the instance has been deleted. We
+        # need the destination Pike compute host to recreate that
+        # allocation to refer to its own resource provider UUID.
+        #
+        # For Pike compute nodes that migrate to either a Pike compute host
+        # or a Queens compute host, we do NOT want the Pike compute host to
+        # be "healing" allocation information. Instead, we rely on the Pike
+        # scheduler to properly create allocations during scheduling.
+        compute_version = objects.Service.get_minimum_version(
+            context, 'nova-compute')
+        has_ocata_computes = compute_version < 22
+
         for instance in instances:
             if instance.vm_state not in vm_states.ALLOW_RESOURCE_REMOVAL:
-                self._update_usage_from_instance(context, instance, nodename)
+                self._update_usage_from_instance(context, instance, nodename,
+                    has_ocata_computes=has_ocata_computes)
 
         self._remove_deleted_instances_allocations(context, cn)
 
@@ -1141,7 +1209,6 @@ class ResourceTracker(object):
                           "There are allocations remaining against the source "
                           "host that might need to be removed: %s.",
                           instance_uuid, instance.host, instance.node, alloc)
-                continue
 
     def _find_orphaned_instances(self):
         """Given the set of instances and migrations already account for
diff --git a/nova/objects/service.py b/nova/objects/service.py
index 6a7c8ea8a5..ff9058910b 100644
--- a/nova/objects/service.py
+++ b/nova/objects/service.py
@@ -32,7 +32,7 @@ LOG = logging.getLogger(__name__)
 
 
 # NOTE(danms): This is the global service version counter
-SERVICE_VERSION = 21
+SERVICE_VERSION = 22
 
 
 # NOTE(danms): This is our SERVICE_VERSION history. The idea is that any
@@ -110,6 +110,9 @@ SERVICE_VERSION_HISTORY = (
     {'compute_rpc': '4.16'},
     # Version 21: Compute RPC version 4.17
     {'compute_rpc': '4.17'},
+    # Version 22: A marker for the behaviour change of auto-healing code on the
+    # compute host regarding allocations against an instance
+    {'compute_rpc': '4.17'},
 )
 
 
diff --git a/nova/test.py b/nova/test.py
index c4fc62a10f..24d65105db 100644
--- a/nova/test.py
+++ b/nova/test.py
@@ -260,6 +260,7 @@ class TestCase(testtools.TestCase):
         self._base_test_obj_backup = copy.copy(
             objects_base.NovaObjectRegistry._registry._obj_classes)
         self.addCleanup(self._restore_obj_registry)
+        objects.Service.clear_min_version_cache()
 
         # NOTE(danms): Reset the cached list of cells
         from nova.compute import api
diff --git a/nova/tests/functional/compute/test_resource_tracker.py b/nova/tests/functional/compute/test_resource_tracker.py
index ee7d98e472..9559ceaee2 100644
--- a/nova/tests/functional/compute/test_resource_tracker.py
+++ b/nova/tests/functional/compute/test_resource_tracker.py
@@ -290,12 +290,17 @@ class IronicResourceTrackerTest(test.TestCase):
                 self.assertEqual(3, len(inv))
 
             # Now "spawn" an instance to the first compute node by calling the
-            # RT's instance_claim(), which should, in the case of an Ironic
-            # instance, grab the full compute node for the instance and write
-            # allocation records for VCPU, MEMORY_MB, and DISK_GB
+            # RT's instance_claim().
             cn1_obj = self.COMPUTE_NODE_FIXTURES[uuids.cn1]
             cn1_nodename = cn1_obj.hypervisor_hostname
             inst = self.INSTANCE_FIXTURES[uuids.instance1]
+            # Since we're pike, the scheduler would have created our
+            # allocation for us. So, we can use our old update routine
+            # here to mimic that before we go do the compute RT claim,
+            # and then the checks below.
+            self.rt.reportclient.update_instance_allocation(cn1_obj,
+                                                            inst,
+                                                            1)
             with self.rt.instance_claim(self.ctx, inst, cn1_nodename):
                 pass
 
diff --git a/nova/tests/functional/regressions/test_bug_1679750.py b/nova/tests/functional/regressions/test_bug_1679750.py
index 6cbaf9934f..b93ce5107c 100644
--- a/nova/tests/functional/regressions/test_bug_1679750.py
+++ b/nova/tests/functional/regressions/test_bug_1679750.py
@@ -41,7 +41,8 @@ class TestLocalDeleteAllocations(test.TestCase,
         self.start_service('conductor')
         self.start_service('consoleauth')
 
-        self.flags(group='scheduler', driver='chance_scheduler')
+        self.flags(enabled_filters=['RetryFilter', 'ComputeFilter'],
+                   group='filter_scheduler')
         self.start_service('scheduler')
 
         self.compute = self.start_service('compute')
diff --git a/nova/tests/functional/test_servers.py b/nova/tests/functional/test_servers.py
index 503c9da25b..7c2433dc6f 100644
--- a/nova/tests/functional/test_servers.py
+++ b/nova/tests/functional/test_servers.py
@@ -1081,11 +1081,7 @@ class ServerMovingTests(test.TestCase, integrated_helpers.InstanceHelperMixin):
 
         self.useFixture(policy_fixture.RealPolicyFixture())
         self.useFixture(nova_fixtures.NeutronFixture(self))
-
-        # NOTE(gibi): After fix 1707071 we need to set the service version to
-        # pike to test pike only interactions. We need separate tests for
-        # ocata - pike interactions
-        # self.useFixture(nova_fixtures.AllServicesCurrent())
+        self.useFixture(nova_fixtures.AllServicesCurrent())
 
         placement = self.useFixture(nova_fixtures.PlacementFixture())
         self.placement_api = placement.api
@@ -1308,7 +1304,6 @@ class ServerMovingTests(test.TestCase, integrated_helpers.InstanceHelperMixin):
         source_allocation = allocations[source_rp_uuid]['resources']
         self.assertFlavorMatchesAllocation(old_flavor, source_allocation)
 
-        self.assertEqual(2, len(allocations))
         dest_allocation = allocations[dest_rp_uuid]['resources']
         self.assertFlavorMatchesAllocation(new_flavor, dest_allocation)
 
@@ -1394,6 +1389,7 @@ class ServerMovingTests(test.TestCase, integrated_helpers.InstanceHelperMixin):
         # destination host
         allocations = self._get_allocations_by_server_uuid(server['id'])
 
+        # and the server allocates only from the target host
         self.assertEqual(1, len(allocations))
 
         source_usages = self._get_provider_usages(source_rp_uuid)
@@ -1407,6 +1403,12 @@ class ServerMovingTests(test.TestCase, integrated_helpers.InstanceHelperMixin):
                          'The source host %s still has usages after the '
                          'resize has been confirmed' % source_hostname)
 
+        # and the target host allocation should be according to the new flavor
+        self.assertFlavorMatchesAllocation(self.flavor2, dest_usages)
+
+        dest_allocation = allocations[dest_rp_uuid]['resources']
+        self.assertFlavorMatchesAllocation(self.flavor2, dest_allocation)
+
         self._run_periodics()
 
         # Check we're still accurate after running the periodics
@@ -1462,29 +1464,13 @@ class ServerMovingTests(test.TestCase, integrated_helpers.InstanceHelperMixin):
 
         usages = self._get_provider_usages(rp_uuid)
 
-        # NOTE(gibi): This is bug 1707071 where the compute "healing" periodic
-        # tramples on the doubled allocations created in the scheduler.
-        self.assertFlavorMatchesAllocation(new_flavor, usages)
-
-        # NOTE(gibi): After fixing bug 1707252 the following is expected
-        # self.assertEqual(old_flavor['vcpus'] + new_flavor['vcpus'],
-        #                  usages['VCPU'])
-        # self.assertEqual(old_flavor['ram'] + new_flavor['ram'],
-        #                  usages['MEMORY_MB'])
-        # self.assertEqual(old_flavor['disk'] + new_flavor['disk'],
-        #                  usages['DISK_GB'])
+        self.assertFlavorsMatchAllocation(old_flavor, new_flavor, usages)
 
         allocations = self._get_allocations_by_server_uuid(server['id'])
         self.assertEqual(1, len(allocations))
         allocation = allocations[rp_uuid]['resources']
 
-        # NOTE(gibi): After fixing bug 1707252 the following is expected
-        # self.assertEqual(old_flavor['vcpus'] + new_flavor['vcpus'],
-        #                  allocation['VCPU'])
-        # self.assertEqual(old_flavor['ram'] + new_flavor['ram'],
-        #                  allocation['MEMORY_MB'])
-        # self.assertEqual(old_flavor['disk'] + new_flavor['disk'],
-        #                  allocation['DISK_GB'])
+        self.assertFlavorsMatchAllocation(old_flavor, new_flavor, allocation)
 
     def test_resize_revert_same_host(self):
         # make sure that the test only uses a single host
diff --git a/nova/tests/unit/compute/test_compute_mgr.py b/nova/tests/unit/compute/test_compute_mgr.py
index a66182d067..beb955d2ab 100644
--- a/nova/tests/unit/compute/test_compute_mgr.py
+++ b/nova/tests/unit/compute/test_compute_mgr.py
@@ -5643,6 +5643,8 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase):
                 self.instance, self.instance.node)
             self.assertIsNotNone(self.instance.migration_context)
 
+        @mock.patch('nova.objects.Service.get_minimum_version',
+                    return_value=22)
         @mock.patch.object(self.compute, "_notify_about_instance_usage")
         @mock.patch.object(self.compute, "_set_instance_info")
         @mock.patch.object(self.instance, 'save')
@@ -5665,7 +5667,8 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase):
                                     mock_mig_save,
                                     mock_inst_save,
                                     mock_set,
-                                    mock_notify):
+                                    mock_notify,
+                                    mock_version):
             self.compute.finish_revert_resize(context=self.context,
                                               instance=self.instance,
                                               reservations=None,
diff --git a/nova/tests/unit/compute/test_resource_tracker.py b/nova/tests/unit/compute/test_resource_tracker.py
index 6c456b7e4f..e2c726c2bc 100644
--- a/nova/tests/unit/compute/test_resource_tracker.py
+++ b/nova/tests/unit/compute/test_resource_tracker.py
@@ -31,6 +31,7 @@ from nova.objects import base as obj_base
 from nova.objects import fields as obj_fields
 from nova.objects import pci_device
 from nova.pci import manager as pci_manager
+from nova.scheduler import utils as sched_utils
 from nova import test
 from nova.tests.unit.objects import test_pci_device as fake_pci_device
 from nova.tests import uuidsentinel as uuids
@@ -467,7 +468,9 @@ class BaseTestCase(test.NoDBTestCase):
 
 class TestUpdateAvailableResources(BaseTestCase):
 
-    def _update_available_resources(self):
+    @mock.patch('nova.objects.Service.get_minimum_version',
+                return_value=22)
+    def _update_available_resources(self, version_mock):
         # We test RT._update separately, since the complexity
         # of the update_available_resource() function is high enough as
         # it is, we just want to focus here on testing the resources
@@ -1688,6 +1691,8 @@ class TestInstanceClaim(BaseTestCase):
 class TestResize(BaseTestCase):
     @mock.patch('nova.compute.utils.is_volume_backed_instance',
                 return_value=False)
+    @mock.patch('nova.objects.Service.get_minimum_version',
+                return_value=22)
     @mock.patch('nova.objects.InstancePCIRequests.get_by_instance',
                 return_value=objects.InstancePCIRequests(requests=[]))
     @mock.patch('nova.objects.PciDeviceList.get_by_compute_node',
@@ -1698,7 +1703,7 @@ class TestResize(BaseTestCase):
     @mock.patch('nova.objects.ComputeNode.save')
     def test_resize_claim_same_host(self, save_mock, get_mock, migr_mock,
                                     get_cn_mock, pci_mock, instance_pci_mock,
-                                    is_bfv_mock):
+                                    version_mock, is_bfv_mock):
         # Resize an existing instance from its current flavor (instance type
         # 1) to a new flavor (instance type 2) and verify that the compute
         # node's resources are appropriately updated to account for the new
@@ -1788,6 +1793,8 @@ class TestResize(BaseTestCase):
 
     @mock.patch('nova.compute.utils.is_volume_backed_instance',
                 return_value=False)
+    @mock.patch('nova.objects.Service.get_minimum_version',
+                return_value=22)
     @mock.patch('nova.objects.InstancePCIRequests.get_by_instance_uuid',
                 return_value=objects.InstancePCIRequests(requests=[]))
     @mock.patch('nova.objects.InstancePCIRequests.get_by_instance',
@@ -1809,8 +1816,10 @@ class TestResize(BaseTestCase):
                                     pci_get_by_compute_node_mock,
                                     pci_get_by_instance_mock,
                                     pci_get_by_instance_uuid_mock,
+                                    version_mock,
                                     is_bfv_mock,
                                     revert=False):
+
         self.flags(reserved_host_disk_mb=0,
                    reserved_host_memory_mb=0)
         virt_resources = copy.deepcopy(_VIRT_DRIVER_AVAIL_RESOURCES)
@@ -1893,12 +1902,23 @@ class TestResize(BaseTestCase):
             ignore=['stats']
         ))
 
-    def test_instance_build_resize_confirm(self):
-        self._test_instance_build_resize()
+        cn = self.rt.compute_nodes[_NODENAME]
+        cn_uuid = cn.uuid
+        rc = self.sched_client_mock.reportclient
+        remove_method = rc.remove_provider_from_instance_allocation
+        expected_resources = sched_utils.resources_from_flavor(instance,
+            flavor)
+        remove_method.assert_called_once_with(instance.uuid, cn_uuid,
+            instance.user_id, instance.project_id, expected_resources)
 
     def test_instance_build_resize_revert(self):
         self._test_instance_build_resize(revert=True)
 
+    def test_instance_build_resize_confirm(self):
+        self._test_instance_build_resize()
+
+    @mock.patch('nova.objects.Service.get_minimum_version',
+                return_value=22)
     @mock.patch('nova.pci.stats.PciDeviceStats.support_requests',
                 return_value=True)
     @mock.patch('nova.objects.PciDevice.save')
@@ -1911,7 +1931,7 @@ class TestResize(BaseTestCase):
     @mock.patch('nova.objects.ComputeNode.save')
     def test_resize_claim_dest_host_with_pci(self, save_mock, get_mock,
             migr_mock, get_cn_mock, pci_mock, pci_req_mock, pci_claim_mock,
-            pci_dev_save_mock, pci_supports_mock):
+            pci_dev_save_mock, pci_supports_mock, version_mock):
         # Starting from an empty destination compute node, perform a resize
         # operation for an instance containing SR-IOV PCI devices on the
         # original host.
@@ -2038,7 +2058,12 @@ class TestResize(BaseTestCase):
                 self.rt.drop_move_claim(ctx, instance, _NODENAME)
                 mock_pci_free_device.assert_called_once_with(
                     pci_dev, mock.ANY)
+                # Check that we grabbed resourced for the right flavor...
+                mock_resources.assert_called_once_with(instance,
+                    instance.flavor)
 
+    @mock.patch('nova.objects.Service.get_minimum_version',
+                return_value=22)
     @mock.patch('nova.objects.InstancePCIRequests.get_by_instance',
                 return_value=objects.InstancePCIRequests(requests=[]))
     @mock.patch('nova.objects.PciDeviceList.get_by_compute_node',
@@ -2048,7 +2073,7 @@ class TestResize(BaseTestCase):
     @mock.patch('nova.objects.InstanceList.get_by_host_and_node')
     @mock.patch('nova.objects.ComputeNode.save')
     def test_resize_claim_two_instances(self, save_mock, get_mock, migr_mock,
-            get_cn_mock, pci_mock, instance_pci_mock):
+            get_cn_mock, pci_mock, instance_pci_mock, version_mock):
         # Issue two resize claims against a destination host with no prior
         # instances on it and validate that the accounting for resources is
         # correct.
@@ -2163,6 +2188,8 @@ class TestResize(BaseTestCase):
 
 
 class TestRebuild(BaseTestCase):
+    @mock.patch('nova.objects.Service.get_minimum_version',
+                return_value=22)
     @mock.patch('nova.objects.InstancePCIRequests.get_by_instance',
                 return_value=objects.InstancePCIRequests(requests=[]))
     @mock.patch('nova.objects.PciDeviceList.get_by_compute_node',
@@ -2172,7 +2199,7 @@ class TestRebuild(BaseTestCase):
     @mock.patch('nova.objects.InstanceList.get_by_host_and_node')
     @mock.patch('nova.objects.ComputeNode.save')
     def test_rebuild_claim(self, save_mock, get_mock, migr_mock, get_cn_mock,
-            pci_mock, instance_pci_mock):
+            pci_mock, instance_pci_mock, version_mock):
         # Rebuild an instance, emulating an evacuate command issued against the
         # original instance. The rebuild operation uses the resource tracker's
         # _move_claim() method, but unlike with resize_claim(), rebuild_claim()
author	Jenkins <jenkins@review.openstack.org>	2017-08-11 07:06:48 +0000
committer	Gerrit Code Review <review@openstack.org>	2017-08-11 07:06:48 +0000
commit	4cd6a3a1b473795a63f076c7986d1b48d83af99d (patch)
tree	0095c11d7daa078cfbd4c85f81655b311c6683f4
parent	c58d4422d87af8b1e8583972d3eef5c7db327264 (diff)
parent	ddef9dcfc0f7b8f80c6f67f2bfcaf752558a7788 (diff)
download	nova-16.0.0.0rc1.tar.gz