Diffstat (limited to 'nova/compute/resource_tracker.py')
-rw-r--r-- | nova/compute/resource_tracker.py | 283 |
1 files changed, 185 insertions, 98 deletions
diff --git a/nova/compute/resource_tracker.py b/nova/compute/resource_tracker.py
index 1c492bcb27..9ee6670c17 100644
--- a/nova/compute/resource_tracker.py
+++ b/nova/compute/resource_tracker.py
@@ -30,6 +30,7 @@ import retrying

 from nova.compute import claims
 from nova.compute import monitors
+from nova.compute import pci_placement_translator
 from nova.compute import provider_config
 from nova.compute import stats as compute_stats
 from nova.compute import task_states
@@ -48,6 +49,7 @@ from nova import rpc
 from nova.scheduler.client import report
 from nova import utils
 from nova.virt import hardware
+from nova.virt import node

 CONF = nova.conf.CONF

@@ -103,7 +105,7 @@ class ResourceTracker(object):
         monitor_handler = monitors.MonitorHandler(self)
         self.monitors = monitor_handler.monitors
         self.old_resources = collections.defaultdict(objects.ComputeNode)
-        self.reportclient = reportclient or report.SchedulerReportClient()
+        self.reportclient = reportclient or report.report_client_singleton()
         self.ram_allocation_ratio = CONF.ram_allocation_ratio
         self.cpu_allocation_ratio = CONF.cpu_allocation_ratio
         self.disk_allocation_ratio = CONF.disk_allocation_ratio
@@ -144,16 +146,20 @@ class ResourceTracker(object):
             during the instance build.
         """
         if self.disabled(nodename):
-            # instance_claim() was called before update_available_resource()
-            # (which ensures that a compute node exists for nodename). We
-            # shouldn't get here but in case we do, just set the instance's
-            # host and nodename attribute (probably incorrect) and return a
-            # NoopClaim.
-            # TODO(jaypipes): Remove all the disabled junk from the resource
-            # tracker. Servicegroup API-level active-checking belongs in the
-            # nova-compute manager.
-            self._set_instance_host_and_node(instance, nodename)
-            return claims.NopClaim()
+            # If we get here, it means we are trying to claim for an instance
+            # that was scheduled to a node that we do not have in our list,
+            # or is in some other way unmanageable by this node. This would
+            # mean that we are unable to account for resources, create
+            # allocations in placement, or do any of the other accounting
+            # necessary for this to work. In the past, this situation was
+            # effectively ignored silently, but in a world where we track
+            # resources with placement and instance assignment to compute nodes
+            # by service, we can no longer be leaky.
+            raise exception.ComputeResourcesUnavailable(
+                ('Attempt to claim resources for instance %(inst)s '
+                 'on unknown node %(node)s failed') % {
+                     'inst': instance.uuid,
+                     'node': nodename})

         # sanity checks:
         if instance.host:
@@ -278,9 +284,17 @@ class ResourceTracker(object):
             context, instance, new_flavor, nodename, move_type)

         if self.disabled(nodename):
-            # compute_driver doesn't support resource tracking, just
-            # generate the migration record and continue the resize:
-            return claims.NopClaim(migration=migration)
+            # This means we were asked to accept an incoming migration to a
+            # node that we do not own or track. We really should not get here,
+            # but if we do, we must refuse to continue with the migration
+            # process, since we cannot account for those resources, create
+            # allocations in placement, etc. This has been a silent resource
+            # leak in the past, but it must be a hard failure now.
+            raise exception.ComputeResourcesUnavailable(
+                ('Attempt to claim move resources for instance %(inst)s on '
+                 'unknown node %(node)s failed') % {
+                     'inst': instance.uuid,
+                     'node': nodename})

         cn = self.compute_nodes[nodename]

@@ -328,7 +342,12 @@ class ResourceTracker(object):
             migration_id=migration.id,
             old_numa_topology=instance.numa_topology,
             new_numa_topology=claim.claimed_numa_topology,
-            old_pci_devices=instance.pci_devices,
+            # NOTE(gibi): the _update_usage_from_migration call below appends
+            # the newly claimed pci devices to the instance.pci_devices list
+            # to keep the migration context independent we need to make a copy
+            # of that list here. We need a deep copy as we need to duplicate
+            # the instance.pci_devices.objects list
+            old_pci_devices=copy.deepcopy(instance.pci_devices),
             new_pci_devices=claimed_pci_devices,
             old_pci_requests=instance.pci_requests,
             new_pci_requests=new_pci_requests,
@@ -613,18 +632,11 @@ class ResourceTracker(object):
         :param prefix: Prefix to use when accessing migration context
             attributes. 'old_' or 'new_', with 'new_' being the default.
         """
-        # Remove usage for an instance that is tracked in migrations, such as
-        # on the dest node during revert resize.
-        if instance['uuid'] in self.tracked_migrations:
-            migration = self.tracked_migrations.pop(instance['uuid'])
+        if instance["uuid"] in self.tracked_migrations:
             if not flavor:
-                flavor = self._get_flavor(instance, prefix, migration)
-        # Remove usage for an instance that is not tracked in migrations (such
-        # as on the source node after a migration).
-        # NOTE(lbeliveau): On resize on the same node, the instance is
-        # included in both tracked_migrations and tracked_instances.
-        elif instance['uuid'] in self.tracked_instances:
-            self.tracked_instances.remove(instance['uuid'])
+                flavor = self._get_flavor(
+                    instance, prefix, self.tracked_migrations[instance["uuid"]]
+                )

         if flavor is not None:
             numa_topology = self._get_migration_context_resource(
@@ -640,6 +652,15 @@ class ResourceTracker(object):
             ctxt = context.elevated()
             self._update(ctxt, self.compute_nodes[nodename])

+        # Remove usage for an instance that is tracked in migrations, such as
+        # on the dest node during revert resize.
+        self.tracked_migrations.pop(instance['uuid'], None)
+        # Remove usage for an instance that is not tracked in migrations (such
+        # as on the source node after a migration).
+        # NOTE(lbeliveau): On resize on the same node, the instance is
+        # included in both tracked_migrations and tracked_instances.
+        self.tracked_instances.discard(instance['uuid'])
+
     @utils.synchronized(COMPUTE_RESOURCE_SEMAPHORE, fair=True)
     def update_usage(self, context, instance, nodename):
         """Update the resource usage and stats after a change in an
@@ -660,50 +681,6 @@ class ResourceTracker(object):
         return (nodename not in self.compute_nodes or
                 not self.driver.node_is_available(nodename))

-    def _check_for_nodes_rebalance(self, context, resources, nodename):
-        """Check if nodes rebalance has happened.
-
-        The ironic driver maintains a hash ring mapping bare metal nodes
-        to compute nodes. If a compute dies, the hash ring is rebuilt, and
-        some of its bare metal nodes (more precisely, those not in ACTIVE
-        state) are assigned to other computes.
-
-        This method checks for this condition and adjusts the database
-        accordingly.
-
-        :param context: security context
-        :param resources: initial values
-        :param nodename: node name
-        :returns: True if a suitable compute node record was found, else False
-        """
-        if not self.driver.rebalances_nodes:
-            return False
-
-        # Its possible ironic just did a node re-balance, so let's
-        # check if there is a compute node that already has the correct
-        # hypervisor_hostname. We can re-use that rather than create a
-        # new one and have to move existing placement allocations
-        cn_candidates = objects.ComputeNodeList.get_by_hypervisor(
-            context, nodename)
-
-        if len(cn_candidates) == 1:
-            cn = cn_candidates[0]
-            LOG.info("ComputeNode %(name)s moving from %(old)s to %(new)s",
-                     {"name": nodename, "old": cn.host, "new": self.host})
-            cn.host = self.host
-            self.compute_nodes[nodename] = cn
-            self._copy_resources(cn, resources)
-            self._setup_pci_tracker(context, cn, resources)
-            self._update(context, cn)
-            return True
-        elif len(cn_candidates) > 1:
-            LOG.error(
-                "Found more than one ComputeNode for nodename %s. "
-                "Please clean up the orphaned ComputeNode records in your DB.",
-                nodename)
-
-        return False
-
     def _init_compute_node(self, context, resources):
         """Initialize the compute node if it does not already exist.

@@ -721,6 +698,7 @@ class ResourceTracker(object):
             False otherwise
         """
         nodename = resources['hypervisor_hostname']
+        node_uuid = resources['uuid']

         # if there is already a compute node just use resources
         # to initialize
@@ -732,23 +710,43 @@ class ResourceTracker(object):

         # now try to get the compute node record from the
         # database. If we get one we use resources to initialize
-        cn = self._get_compute_node(context, nodename)
+
+        # We use read_deleted=True so that we will find and recover a deleted
+        # node object, if necessary.
+        with utils.temporary_mutation(context, read_deleted='yes'):
+            cn = self._get_compute_node(context, node_uuid)
+            if cn and cn.deleted:
+                # Undelete and save this right now so that everything below
+                # can continue without read_deleted=yes
+                LOG.info('Undeleting compute node %s', cn.uuid)
+                cn.deleted = False
+                cn.deleted_at = None
+                cn.save()
         if cn:
+            if cn.host != self.host:
+                LOG.info("ComputeNode %(name)s moving from %(old)s to %(new)s",
+                         {"name": nodename, "old": cn.host, "new": self.host})
+                cn.host = self.host
+                self._update(context, cn)
+
             self.compute_nodes[nodename] = cn
             self._copy_resources(cn, resources)
             self._setup_pci_tracker(context, cn, resources)
             return False

-        if self._check_for_nodes_rebalance(context, resources, nodename):
-            return False
-
         # there was no local copy and none in the database
         # so we need to create a new compute node. This needs
         # to be initialized with resource values.
         cn = objects.ComputeNode(context)
         cn.host = self.host
         self._copy_resources(cn, resources, initial=True)
-        cn.create()
+        try:
+            cn.create()
+        except exception.DuplicateRecord:
+            raise exception.InvalidConfiguration(
+                'Duplicate compute node record found for host %s node %s' % (
+                    cn.host, cn.hypervisor_hostname))
+
         # Only map the ComputeNode into compute_nodes if create() was OK
         # because if create() fails, on the next run through here nodename
         # would be in compute_nodes and we won't try to create again (because
@@ -881,6 +879,14 @@ class ResourceTracker(object):
         # contains a non-None value, even for non-Ironic nova-compute hosts. It
         # is this value that will be populated in the compute_nodes table.
         resources['host_ip'] = CONF.my_ip
+        if 'uuid' not in resources:
+            # NOTE(danms): Any driver that does not provide a uuid per
+            # node gets the locally-persistent compute_id. Only ironic
+            # should be setting the per-node uuid (and returning
+            # multiple nodes in general). If this is the first time we
+            # are creating a compute node on this host, we will
+            # generate and persist this uuid for the future.
+            resources['uuid'] = node.get_local_node_uuid()

         # We want the 'cpu_info' to be None from the POV of the
         # virt driver, but the DB requires it to be non-null so
@@ -985,8 +991,6 @@ class ResourceTracker(object):
             # notified when instances are deleted, we need remove all usages
             # from deleted instances.
             self.pci_tracker.clean_usage(instances, migrations)
-            dev_pools_obj = self.pci_tracker.stats.to_device_pools_obj()
-            cn.pci_device_pools = dev_pools_obj

         self._report_final_resource_view(nodename)

@@ -1008,14 +1012,13 @@ class ResourceTracker(object):
         if startup:
             self._check_resources(context)

-    def _get_compute_node(self, context, nodename):
+    def _get_compute_node(self, context, node_uuid):
         """Returns compute node for the host and nodename."""
         try:
-            return objects.ComputeNode.get_by_host_and_nodename(
-                context, self.host, nodename)
+            return objects.ComputeNode.get_by_uuid(context, node_uuid)
         except exception.NotFound:
             LOG.warning("No compute node record for %(host)s:%(node)s",
-                        {'host': self.host, 'node': nodename})
+                        {'host': self.host, 'node': node_uuid})

     def _report_hypervisor_resource_view(self, resources):
         """Log the hypervisor's view of free resources.
@@ -1126,6 +1129,28 @@ class ResourceTracker(object):
             LOG.error('Unable to find services table record for nova-compute '
                       'host %s', self.host)

+    def _should_expose_remote_managed_ports_trait(self,
+                                                   is_supported: bool):
+        """Determine whether COMPUTE_REMOTE_MANAGED_PORTS should be exposed.
+
+        Determines if the COMPUTE_REMOTE_MANAGED_PORTS trait needs to be
+        exposed based on the respective compute driver capability and
+        the presence of remote managed devices on a given host. Whether such
+        devices are present or not depends on the Whitelist configuration
+        (presence of a remote_managed tag association with some PCI devices)
+        and their physical presence (plugged in, enumerated by the OS).
+
+        The aim of having this check is to optimize host lookup by prefiltering
+        hosts that have compute driver support but no hardware. The check
+        does not consider free device count - just the presence of device
+        pools since device availability may change between a prefilter check
+        and a later check in PciPassthroughFilter.
+
+        :param bool is_supported: Is the trait supported by the compute driver
+        """
+        return (is_supported and
+                self.pci_tracker.pci_stats.has_remote_managed_device_pools())
+
     def _get_traits(self, context, nodename, provider_tree):
         """Synchronizes internal and external traits for the node provider.

@@ -1149,7 +1174,11 @@ class ResourceTracker(object):
         # traits that are missing, and remove any existing set traits
         # that are not currently supported.
         for trait, supported in self.driver.capabilities_as_traits().items():
-            if supported:
+            add_trait = supported
+            if trait == os_traits.COMPUTE_REMOTE_MANAGED_PORTS:
+                add_trait &= self._should_expose_remote_managed_ports_trait(
+                    supported)
+            if add_trait:
                 traits.add(trait)
             elif trait in traits:
                 traits.remove(trait)
@@ -1163,9 +1192,16 @@ class ResourceTracker(object):

         return list(traits)

-    @retrying.retry(stop_max_attempt_number=4,
-                    retry_on_exception=lambda e: isinstance(
-                        e, exception.ResourceProviderUpdateConflict))
+    @retrying.retry(
+        stop_max_attempt_number=4,
+        retry_on_exception=lambda e: isinstance(
+            e,
+            (
+                exception.ResourceProviderUpdateConflict,
+                exception.PlacementReshapeConflict,
+            ),
+        ),
+    )
     def _update_to_placement(self, context, compute_node, startup):
         """Send resource and inventory changes to placement."""
         # NOTE(jianghuaw): Some resources(e.g. VGPU) are not saved in the
@@ -1185,7 +1221,9 @@ class ResourceTracker(object):
             context, compute_node.uuid, name=compute_node.hypervisor_hostname)
         # Let the virt driver rearrange the provider tree and set/update
         # the inventory, traits, and aggregates throughout.
-        allocs = None
+        allocs = self.reportclient.get_allocations_for_provider_tree(
+            context, nodename)
+        driver_reshaped = False
         try:
             self.driver.update_provider_tree(prov_tree, nodename)
         except exception.ReshapeNeeded:
@@ -1196,10 +1234,9 @@ class ResourceTracker(object):
             LOG.info("Performing resource provider inventory and "
                      "allocation data migration during compute service "
                      "startup or fast-forward upgrade.")
-            allocs = self.reportclient.get_allocations_for_provider_tree(
-                context, nodename)
-            self.driver.update_provider_tree(prov_tree, nodename,
-                                             allocations=allocs)
+            self.driver.update_provider_tree(
+                prov_tree, nodename, allocations=allocs)
+            driver_reshaped = True

         # Inject driver capabilities traits into the provider
         # tree. We need to determine the traits that the virt
@@ -1220,25 +1257,77 @@ class ResourceTracker(object):
             context, nodename, provider_tree=prov_tree)
         prov_tree.update_traits(nodename, traits)

+        instances_under_same_host_resize = [
+            migration.instance_uuid
+            for migration in self.tracked_migrations.values()
+            if migration.is_same_host_resize
+        ]
+        # NOTE(gibi): Tracking PCI in placement is different from other
+        # resources.
+        #
+        # While driver.update_provider_tree is used to let the virt driver
+        # create any kind of placement model for a resource the PCI data
+        # modelling is done virt driver independently by the PCI tracker.
+        # So the placement reporting needs to be also done here in the resource
+        # tracker independently of the virt driver.
+        #
+        # Additionally, when PCI tracking in placement was introduced there was
+        # already PCI allocations in nova. So both the PCI inventories and
+        # allocations needs to be healed. Moreover, to support rolling upgrade
+        # the placement prefilter for PCI devices was not turned on by default
+        # at the first release of this feature. Therefore, there could be new
+        # PCI allocation without placement being involved until the prefilter
+        # is enabled. So we need to be ready to heal PCI allocations at
+        # every call not just at startup.
+        pci_reshaped = pci_placement_translator.update_provider_tree_for_pci(
+            prov_tree,
+            nodename,
+            self.pci_tracker,
+            allocs,
+            instances_under_same_host_resize,
+        )
+
         self.provider_tree = prov_tree

         # This merges in changes from the provider config files loaded in init
         self._merge_provider_configs(self.provider_configs, prov_tree)

-        # Flush any changes. If we processed ReshapeNeeded above, allocs is not
-        # None, and this will hit placement's POST /reshaper route.
-        self.reportclient.update_from_provider_tree(context, prov_tree,
-                                                    allocations=allocs)
+        try:
+            # Flush any changes. If we either processed ReshapeNeeded above or
+            # update_provider_tree_for_pci did reshape, then we need to pass
+            # allocs to update_from_provider_tree to hit placement's POST
+            # /reshaper route.
+            self.reportclient.update_from_provider_tree(
+                context,
+                prov_tree,
+                allocations=allocs if driver_reshaped or pci_reshaped else None
+            )
+        except exception.InventoryInUse as e:
+            # This means an inventory reconfiguration (e.g.: removing a parent
+            # PF and adding a VF under that parent) was not possible due to
+            # existing allocations. Translate the exception to prevent the
+            # compute service to start
+            raise exception.PlacementPciException(error=str(e))

     def _update(self, context, compute_node, startup=False):
         """Update partial stats locally and populate them to Scheduler."""
+
+        self._update_to_placement(context, compute_node, startup)
+
+        if self.pci_tracker:
+            # sync PCI device pool state stored in the compute node with
+            # the actual state from the PCI tracker as we commit changes in
+            # the DB and in the PCI tracker below
+            dev_pools_obj = self.pci_tracker.stats.to_device_pools_obj()
+            compute_node.pci_device_pools = dev_pools_obj
+
         # _resource_change will update self.old_resources if it detects changes
         # but we want to restore those if compute_node.save() fails.
         nodename = compute_node.hypervisor_hostname
         old_compute = self.old_resources[nodename]
         if self._resource_change(compute_node):
             # If the compute_node's resource changed, update to DB. Note that
-            # _update_to_placement below does not supersede the need to do this
+            # _update_to_placement above does not supersede the need to do this
             # because there are stats-related fields in the ComputeNode object
             # which could have changed and still need to be reported to the
             # scheduler filters/weighers (which could be out of tree as well).
@@ -1251,8 +1340,6 @@ class ResourceTracker(object):
                 with excutils.save_and_reraise_exception(logger=LOG):
                     self.old_resources[nodename] = old_compute

-        self._update_to_placement(context, compute_node, startup)
-
         if self.pci_tracker:
             self.pci_tracker.save(context)

@@ -1825,7 +1912,7 @@ class ResourceTracker(object):
                     raise ValueError(_(
                         "Provider config '%(source_file_name)s' attempts "
                         "to define a trait that is owned by the "
-                        "virt driver or specified via the placment api. "
+                        "virt driver or specified via the placement api. "
                         "Invalid traits '%(invalid)s' must be removed "
                         "from '%(source_file_name)s'.") % {
                             'source_file_name': source_file_name,
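
A note on the reportclient change near the top of the diff: report.report_client_singleton() replaces a per-ResourceTracker report.SchedulerReportClient(). The sketch below is illustrative only and is not the Nova implementation (which lives in nova/scheduler/client/report.py); it just shows the general shape of a lazily created, process-wide singleton accessor.

# Illustrative sketch only: a simplified stand-in for a module-level
# singleton accessor such as report.report_client_singleton().
import threading


class SchedulerReportClient:
    pass


_CLIENT = None
_CLIENT_LOCK = threading.Lock()


def report_client_singleton():
    """Return the process-wide report client, creating it on first use."""
    global _CLIENT
    if _CLIENT is None:
        with _CLIENT_LOCK:
            if _CLIENT is None:
                _CLIENT = SchedulerReportClient()
    return _CLIENT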
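
On the old_pci_devices=copy.deepcopy(instance.pci_devices) change in resize_claim: the NOTE(gibi) comment in that hunk explains that _update_usage_from_migration later appends newly claimed devices to instance.pci_devices. A minimal sketch, using a made-up FakeDevice class instead of Nova's PciDevice objects, of why a shared reference would leak those appends into the migration context's "old" list:

# Illustrative sketch only: deep copy keeps the "old" device list independent.
import copy


class FakeDevice:
    def __init__(self, address):
        self.address = address


instance_pci_devices = [FakeDevice('0000:81:00.1')]

old_by_reference = instance_pci_devices                  # shared list
old_by_deepcopy = copy.deepcopy(instance_pci_devices)    # independent objects

instance_pci_devices.append(FakeDevice('0000:81:00.2'))  # newly claimed device

assert len(old_by_reference) == 2   # the shared "old" view leaked the append
assert len(old_by_deepcopy) == 1    # the copied "old" view stays unchanged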
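
On tracked_instances.discard(instance['uuid']) in the _update_usage_from_instance cleanup: unlike remove(), discard() is a no-op when the element is absent, which is what makes the new unconditional cleanup safe. A tiny illustration (the uuid value is made up):

# Illustrative sketch only: set.discard() versus set.remove().
tracked_instances = {'11111111-2222-3333-4444-555555555555'}

tracked_instances.discard('unknown-uuid')    # absent element: silently ignored
try:
    tracked_instances.remove('unknown-uuid')
except KeyError:
    print('remove() raises when the element is absent')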
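
On utils.temporary_mutation(context, read_deleted='yes') in _init_compute_node: the helper temporarily flips an attribute on the request context so a soft-deleted ComputeNode record can be found and undeleted. The sketch below is a simplified stand-in for that pattern, not Nova's actual helper in nova/utils.py.

# Illustrative sketch only: temporarily set attributes, then restore them.
import contextlib


@contextlib.contextmanager
def temporary_mutation(obj, **kwargs):
    """Set the given attributes on obj for the duration of the block."""
    sentinel = object()
    saved = {name: getattr(obj, name, sentinel) for name in kwargs}
    try:
        for name, value in kwargs.items():
            setattr(obj, name, value)
        yield
    finally:
        for name, old in saved.items():
            if old is sentinel:
                delattr(obj, name)      # attribute did not exist before
            else:
                setattr(obj, name, old)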
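
On the expanded @retrying.retry decorator around _update_to_placement: retry_on_exception is called with each raised exception and decides whether another attempt is made, up to stop_max_attempt_number attempts in total. A self-contained sketch with a stand-in Conflict exception (assumes the retrying package is installed):

# Illustrative sketch only: retry on a specific exception type.
import retrying


class Conflict(Exception):
    pass


attempts = {'count': 0}


@retrying.retry(
    stop_max_attempt_number=4,
    retry_on_exception=lambda e: isinstance(e, Conflict),
)
def flaky_update():
    attempts['count'] += 1
    if attempts['count'] < 3:
        raise Conflict('provider generation changed underneath us')
    return 'updated'


print(flaky_update())     # succeeds on the third attempt
print(attempts['count'])  # 3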