1 files changed, 353 insertions, 38 deletions
diff --git a/nova/pci/stats.py b/nova/pci/stats.py
index e8e810fa4f..c6e4844b34 100644
--- a/nova/pci/stats.py
+++ b/nova/pci/stats.py
@@ -13,17 +13,19 @@
 #    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 #    License for the specific language governing permissions and limitations
 #    under the License.
-
+import collections
 import copy
 import typing as ty
 
 from oslo_config import cfg
 from oslo_log import log as logging
+from oslo_utils import strutils
 
 from nova import exception
 from nova import objects
 from nova.objects import fields
 from nova.objects import pci_device_pool
+from nova.pci.request import PCI_REMOTE_MANAGED_TAG
 from nova.pci import utils
 from nova.pci import whitelist
 
@@ -62,12 +64,25 @@ class PciDeviceStats(object):
     """
 
     pool_keys = ['product_id', 'vendor_id', 'numa_node', 'dev_type']
+    # these can be specified in the [pci]device_spec and can be requested via
+    # the PCI alias, but they are matched by the placement
+    # allocation_candidates query, so we can ignore them during pool creation
+    # and during filtering here
+    ignored_spec_tags = ignored_pool_tags = ['resource_class', 'traits']
+    # rp_uuids is a metadata key in the spec that is matched specially in
+    # _filter_pools_based_on_placement_allocation, so the general matching
+    # logic ignores it. Rebind instead of += to not alias the pool list.
+    ignored_spec_tags = ignored_spec_tags + ['rp_uuids']
+    # rp_uuid is a metadata key in the pool that is matched specially in
+    # _filter_pools_based_on_placement_allocation, so the general matching
+    # logic ignores it. Rebind instead of += to not alias the spec list.
+    ignored_pool_tags = ignored_pool_tags + ['rp_uuid']
 
     def __init__(
         self,
         numa_topology: 'objects.NUMATopology',
         stats: 'objects.PCIDevicePoolList' = None,
-        dev_filter: whitelist.Whitelist = None,
+        dev_filter: ty.Optional[whitelist.Whitelist] = None,
     ) -> None:
         self.numa_topology = numa_topology
         self.pools = (
@@ -75,7 +90,7 @@ class PciDeviceStats(object):
         )
         self.pools.sort(key=lambda item: len(item))
         self.dev_filter = dev_filter or whitelist.Whitelist(
-            CONF.pci.passthrough_whitelist)
+            CONF.pci.device_spec)
 
     def _equal_properties(
         self, dev: Pool, entry: Pool, matching_keys: ty.List[str],
@@ -95,6 +110,28 @@ class PciDeviceStats(object):
 
         return None
 
+    @staticmethod
+    def _ensure_remote_managed_tag(
+            dev: 'objects.PciDevice', pool: Pool) -> None:
+        """Default the remote_managed tag of a network device pool.
+
+        Network devices may be managed remotely, e.g. by a SmartNIC DPU. For
+        SR-IOV VF, SR-IOV PF and VDPA devices the pool is updated in place:
+        a missing tag is set to 'false', an explicit tag is left untouched.
+        """
+        if dev.dev_type not in (fields.PciDeviceType.SRIOV_VF,
+                                fields.PciDeviceType.SRIOV_PF,
+                                fields.PciDeviceType.VDPA):
+            return
+
+        # The tag is added here rather than at the client side to avoid an
+        # issue with objects that do not have this tag specified during an
+        # upgrade to the first version that supports handling this tag.
+        if pool.get(PCI_REMOTE_MANAGED_TAG) is None:
+            # NOTE: tags are compared as strings case-insensitively, see
+            # pci_device_prop_match in nova/pci/utils.py.
+            pool[PCI_REMOTE_MANAGED_TAG] = 'false'
+
     def _create_pool_keys_from_dev(
         self, dev: 'objects.PciDevice',
     ) -> ty.Optional[Pool]:
@@ -110,8 +147,22 @@ class PciDeviceStats(object):
             return None
         tags = devspec.get_tags()
         pool = {k: getattr(dev, k) for k in self.pool_keys}
+
         if tags:
-            pool.update(tags)
+            pool.update(
+                {
+                    k: v
+                    for k, v in tags.items()
+                    if k not in self.ignored_pool_tags
+                }
+            )
+        # NOTE(gibi): since PCI in placement maps a PCI dev or a PF to a
+        # single RP and the scheduler allocates from a specific RP we need
+        # to split the pools by PCI or PF address. We can still keep
+        # the VFs from the same parent PF in a single pool though as they
+        # are equivalent from placement perspective.
+        pool['address'] = dev.parent_addr or dev.address
+
         # NOTE(gibi): parent_ifname acts like a tag during pci claim but
         # not provided as part of the whitelist spec as it is auto detected
         # by the virt driver.
@@ -120,6 +171,9 @@ class PciDeviceStats(object):
         # already in placement.
         if dev.extra_info.get('parent_ifname'):
             pool['parent_ifname'] = dev.extra_info['parent_ifname']
+
+        self._ensure_remote_managed_tag(dev, pool)
+
         return pool
 
     def _get_pool_with_device_type_mismatch(
@@ -197,6 +251,17 @@ class PciDeviceStats(object):
             free_devs.extend(pool['devices'])
         return free_devs
 
+    def _allocate_devs(
+        self, pool: Pool, num: int, request_id: str
+    ) -> ty.List["objects.PciDevice"]:  # pop num devs for request_id
+        alloc_devices = []  # devices taken from the pool, in pop order
+        for _ in range(num):
+            pci_dev = pool['devices'].pop()  # pool['count'] is not touched
+            self._handle_device_dependents(pci_dev)
+            pci_dev.request_id = request_id  # tie the device to the request
+            alloc_devices.append(pci_dev)
+        return alloc_devices
+
     def consume_requests(
         self,
         pci_requests: 'objects.InstancePCIRequests',
@@ -208,7 +273,10 @@ class PciDeviceStats(object):
         for request in pci_requests:
             count = request.count
 
-            pools = self._filter_pools(self.pools, request, numa_cells)
+            rp_uuids = self._get_rp_uuids_for_request(
+                request=request, provider_mapping=None)
+            pools = self._filter_pools(
+                self.pools, request, numa_cells, rp_uuids=rp_uuids)
 
             # Failed to allocate the required number of devices. Return the
             # devices already allocated during previous iterations back to
@@ -222,22 +290,31 @@ class PciDeviceStats(object):
                           "on the compute node semaphore.")
                 for d in range(len(alloc_devices)):
                     self.add_device(alloc_devices.pop())
-                return None
-
-            for pool in pools:
-                if pool['count'] >= count:
-                    num_alloc = count
-                else:
-                    num_alloc = pool['count']
-                count -= num_alloc
-                pool['count'] -= num_alloc
-                for d in range(num_alloc):
-                    pci_dev = pool['devices'].pop()
-                    self._handle_device_dependents(pci_dev)
-                    pci_dev.request_id = request.request_id
-                    alloc_devices.append(pci_dev)
-                if count == 0:
-                    break
+                raise exception.PciDeviceRequestFailed(requests=pci_requests)
+
+            if not rp_uuids:
+                # if there is no placement allocation then we are free to
+                # consume from the pools in any order:
+                for pool in pools:
+                    if pool['count'] >= count:
+                        num_alloc = count
+                    else:
+                        num_alloc = pool['count']
+                    count -= num_alloc
+                    pool['count'] -= num_alloc
+                    alloc_devices += self._allocate_devs(
+                        pool, num_alloc, request.request_id)
+                    if count == 0:
+                        break
+            else:
+                # but if there is placement allocation then we have to follow
+                # it
+                requested_devs_per_pool_rp = collections.Counter(rp_uuids)
+                for pool in pools:
+                    count = requested_devs_per_pool_rp[pool['rp_uuid']]
+                    pool['count'] -= count
+                    alloc_devices += self._allocate_devs(
+                        pool, count, request.request_id)
 
         return alloc_devices
 
@@ -252,8 +329,12 @@ class PciDeviceStats(object):
         if pci_dev.dev_type == fields.PciDeviceType.SRIOV_PF:
             vfs_list = pci_dev.child_devices
             if vfs_list:
+                free_devs = self.get_free_devs()
                 for vf in vfs_list:
-                    self.remove_device(vf)
+                    # NOTE(gibi): do not try to remove devices that are
+                    # already removed
+                    if vf in free_devs:
+                        self.remove_device(vf)
         elif pci_dev.dev_type in (
             fields.PciDeviceType.SRIOV_VF,
             fields.PciDeviceType.VDPA,
@@ -282,7 +363,15 @@ class PciDeviceStats(object):
         :returns: A list of pools that can be used to support the request if
             this is possible.
         """
-        request_specs = request.spec
+
+        def ignore_keys(spec):
+            return {
+                k: v
+                for k, v in spec.items()
+                if k not in self.ignored_spec_tags
+            }
+
+        request_specs = [ignore_keys(spec) for spec in request.spec]
         return [
             pool for pool in pools
             if utils.pci_device_prop_match(pool, request_specs)
@@ -458,11 +547,73 @@ class PciDeviceStats(object):
             ]
         return pools
 
+    def _filter_pools_for_unrequested_remote_managed_devices(
+        self, pools: ty.List[Pool], request: 'objects.InstancePCIRequest',
+    ) -> ty.List[Pool]:
+        """Filter out pools with remote_managed devices, unless requested.
+
+        Remote-managed devices are not usable for legacy SR-IOV or hardware
+        offload scenarios and must be excluded from allocation.
+
+        :param pools: A list of PCI device pool dicts
+        :param request: An InstancePCIRequest object; only the remote_managed
+            tag of its spec dicts is inspected here.
+        :returns: The pools unchanged if any spec requests remote-managed
+            devices, else only the pools not tagged as remote-managed.
+        """
+        if all(not strutils.bool_from_string(spec.get(PCI_REMOTE_MANAGED_TAG))
+               for spec in request.spec):
+            pools = [pool for pool in pools
+                     if not strutils.bool_from_string(
+                         pool.get(PCI_REMOTE_MANAGED_TAG))]
+        return pools
+
+    def _filter_pools_based_on_placement_allocation(
+        self,
+        pools: ty.List[Pool],
+        request: 'objects.InstancePCIRequest',  # unused here
+        rp_uuids: ty.List[str],
+    ) -> ty.List[Pool]:
+        if not rp_uuids:
+            # If there is no placement allocation then there is nothing to
+            # filter by. This happens if the instance only has neutron port
+            # based InstancePCIRequests, as those currently have no placement
+            # allocation (except for QoS ports, but that is handled in a
+            # separate codepath), or if the [filter_scheduler]pci_in_placement
+            # configuration option is not enabled in the scheduler.
+            return pools
+
+        requested_dev_count_per_rp = collections.Counter(rp_uuids)
+        matching_pools = []
+        for pool in pools:
+            rp_uuid = pool.get('rp_uuid')
+            if rp_uuid is None:
+                # NOTE(gibi): As rp_uuids is not empty the scheduler allocated
+                # PCI resources on this host, so we know that
+                # [pci]report_in_placement is enabled on this host. But this
+                # pool has no RP mapping which can only happen if the pool
+                # contains PCI devices with physical_network tag, as those
+                # devices are not yet reported in placement. But if they are
+                # not reported then we can ignore them here too.
+                continue
+
+            if (
+                # the placement allocation contains this pool
+                rp_uuid in requested_dev_count_per_rp and
+                # the amount of devs allocated in placement can be consumed
+                # from the pool
+                pool["count"] >= requested_dev_count_per_rp[rp_uuid]
+            ):
+                matching_pools.append(pool)
+
+        return matching_pools
+
     def _filter_pools(
         self,
         pools: ty.List[Pool],
         request: 'objects.InstancePCIRequest',
         numa_cells: ty.Optional[ty.List['objects.InstanceNUMACell']],
+        rp_uuids: ty.List[str],
     ) -> ty.Optional[ty.List[Pool]]:
         """Determine if an individual PCI request can be met.
 
@@ -477,6 +628,9 @@ class PciDeviceStats(object):
             quantity and required NUMA affinity of device(s) we want.
         :param numa_cells: A list of InstanceNUMACell objects whose ``id``
             corresponds to the ``id`` of host NUMACell objects.
+        :param rp_uuids: A list of RP uuids this request is fulfilled from in
+            placement. So here we have to consider only the pools matching
+            these RP uuids
         :returns: A list of pools that can be used to support the request if
             this is possible, else None.
         """
@@ -547,6 +701,33 @@ class PciDeviceStats(object):
                 before_count - after_count
             )
 
+        # If we're not requesting remote_managed devices then we should not
+        # use these either. Exclude them.
+        before_count = after_count
+        pools = self._filter_pools_for_unrequested_remote_managed_devices(
+            pools, request)
+        after_count = sum([pool['count'] for pool in pools])
+
+        if after_count < before_count:
+            LOG.debug(
+                'Dropped %d device(s) as they are remote-managed devices which'
+                ' we have not requested',
+                before_count - after_count
+            )
+
+        # if there is placement allocation for the request then we have to
+        # remove the pools that are not in the placement allocation
+        before_count = after_count
+        pools = self._filter_pools_based_on_placement_allocation(
+            pools, request, rp_uuids)
+        after_count = sum([pool['count'] for pool in pools])
+        if after_count < before_count:
+            LOG.debug(
+                'Dropped %d device(s) that are not part of the placement '
+                'allocation',
+                before_count - after_count
+            )
+
         if after_count < request.count:
             LOG.debug('Not enough PCI devices left to satisfy request')
             return None
@@ -556,6 +737,7 @@ class PciDeviceStats(object):
     def support_requests(
         self,
         requests: ty.List['objects.InstancePCIRequest'],
+        provider_mapping: ty.Optional[ty.Dict[str, ty.List[str]]],
         numa_cells: ty.Optional[ty.List['objects.InstanceNUMACell']] = None,
     ) -> bool:
         """Determine if the PCI requests can be met.
@@ -569,20 +751,38 @@ class PciDeviceStats(object):
         :param requests: A list of InstancePCIRequest object describing the
             types, quantities and required NUMA affinities of devices we want.
         :type requests: nova.objects.InstancePCIRequests
+        :param provider_mapping: A dict keyed by RequestGroup requester_id,
+            to a list of resource provider UUIDs which provide resource
+            for that RequestGroup. If it is None then it signals that the
+            InstancePCIRequest objects already store a mapping per request.
+            I.e.: we are called _after_ the scheduler made allocations for this
+            request in placement.
         :param numa_cells: A list of InstanceNUMACell objects whose ``id``
             corresponds to the ``id`` of host NUMACells, or None.
         :returns: Whether this compute node can satisfy the given request.
         """
-        # NOTE(yjiang5): this function has high possibility to fail,
-        # so no exception should be triggered for performance reason.
-        return all(
-            self._filter_pools(self.pools, r, numa_cells) for r in requests
-        )
+
+        # Try to apply the requests on a copy of the stats. If they apply
+        # cleanly then we know that the requests are supported. We call apply
+        # only on a copy as we don't want to actually consume resources from
+        # the pools, as at this point this is just a test during host
+        # filtering. Later the scheduler will call apply_requests to consume
+        # on the selected host. The compute will call consume_requests during
+        # PCI claim to consume not just from the pools but also consume
+        # PciDevice objects.
+        stats = copy.deepcopy(self)
+        try:
+            stats.apply_requests(requests, provider_mapping, numa_cells)
+        except exception.PciDeviceRequestFailed:
+            return False
+
+        return True
 
     def _apply_request(
         self,
         pools: ty.List[Pool],
         request: 'objects.InstancePCIRequest',
+        rp_uuids: ty.List[str],
         numa_cells: ty.Optional[ty.List['objects.InstanceNUMACell']] = None,
     ) -> bool:
         """Apply an individual PCI request.
@@ -596,6 +796,8 @@ class PciDeviceStats(object):
         :param pools: A list of PCI device pool dicts
         :param request: An InstancePCIRequest object describing the type,
             quantity and required NUMA affinity of device(s) we want.
+        :param rp_uuids: A list of RP uuids this request is fulfilled from in
+            placement
         :param numa_cells: A list of InstanceNUMACell objects whose ``id``
             corresponds to the ``id`` of host NUMACell objects.
         :returns: True if the request was applied against the provided pools
@@ -605,22 +807,77 @@ class PciDeviceStats(object):
         # Two concurrent requests may succeed when called support_requests
         # because this method does not remove related devices from the pools
 
-        filtered_pools = self._filter_pools(pools, request, numa_cells)
+        filtered_pools = self._filter_pools(
+            pools, request, numa_cells, rp_uuids)
 
         if not filtered_pools:
             return False
 
-        count = request.count
-        for pool in filtered_pools:
-            count = self._decrease_pool_count(pools, pool, count)
-            if not count:
-                break
+        if not rp_uuids:
+            # If there is no placement allocation for this request then we are
+            # free to consume from the filtered pools in any order
+            count = request.count
+            for pool in filtered_pools:
+                count = self._decrease_pool_count(pools, pool, count)
+                if not count:
+                    break
+        else:
+            # but if there is placement allocation then we have to follow that
+            requested_devs_per_pool_rp = collections.Counter(rp_uuids)
+            for pool in filtered_pools:
+                count = requested_devs_per_pool_rp[pool['rp_uuid']]
+                pool['count'] -= count
+                if pool['count'] == 0:
+                    pools.remove(pool)
 
         return True
 
+    def _get_rp_uuids_for_request(
+        self,
+        provider_mapping: ty.Optional[ty.Dict[str, ty.List[str]]],
+        request: 'objects.InstancePCIRequest'
+    ) -> ty.List[str]:
+        """Return the list of RP uuids that are fulfilling the request.
+
+        An RP appears in the list once for every device that needs to be
+        allocated from it. An empty list means any pool can be used.
+        """
+
+        if request.source == objects.InstancePCIRequest.NEUTRON_PORT:
+            # TODO(gibi): support neutron based requests in a later cycle
+            # an empty list will signal that any PCI pool can be used for this
+            # request
+            return []
+
+        if not provider_mapping:
+            # NOTE(gibi): AFAIK specs is always a list of a single dict
+            # but the object is hard to change retroactively
+            rp_uuids = request.spec[0].get('rp_uuids')
+            if not rp_uuids:
+                # This can happen if [filter_scheduler]pci_in_placement is not
+                # enabled yet.
+                # An empty list will signal that any PCI pool can be used for
+                # this request
+                return []
+
+            # TODO(gibi): this is not nice, but spec is a dict of strings so
+            #  the list is stored as a comma separated string
+            return rp_uuids.split(',')
+
+        # NOTE(gibi): the PCI prefilter generates RequestGroup suffixes from
+        # InstancePCIRequests in the form of {request_id}-{count_index}
+        # NOTE(gibi): a suffixed request group is always fulfilled from a
+        # single RP, so only the first uuid of each matching group is used
+        return [
+            rp_uuids[0]
+            for group_id, rp_uuids in provider_mapping.items()
+            if group_id.startswith(request.request_id)
+        ]
+
     def apply_requests(
         self,
         requests: ty.List['objects.InstancePCIRequest'],
+        provider_mapping: ty.Optional[ty.Dict[str, ty.List[str]]],
         numa_cells: ty.Optional[ty.List['objects.InstanceNUMACell']] = None,
     ) -> None:
         """Apply PCI requests to the PCI stats.
@@ -634,15 +891,23 @@ class PciDeviceStats(object):
         :param requests: A list of InstancePCIRequest object describing the
             types, quantities and required NUMA affinities of devices we want.
         :type requests: nova.objects.InstancePCIRequests
+        :param provider_mapping: A dict keyed by RequestGroup requester_id,
+            to a list of resource provider UUIDs which provide resource
+            for that RequestGroup. If it is None then it signals that the
+            InstancePCIRequest objects already store a mapping per request.
+            I.e.: we are called _after_ the scheduler made allocations for this
+            request in placement.
         :param numa_cells: A list of InstanceNUMACell objects whose ``id``
             corresponds to the ``id`` of host NUMACells, or None.
         :raises: exception.PciDeviceRequestFailed if this compute node cannot
             satisfy the given request.
         """
-        if not all(
-            self._apply_request(self.pools, r, numa_cells) for r in requests
-        ):
-            raise exception.PciDeviceRequestFailed(requests=requests)
+
+        for r in requests:
+            rp_uuids = self._get_rp_uuids_for_request(provider_mapping, r)
+
+            if not self._apply_request(self.pools, r, rp_uuids, numa_cells):
+                raise exception.PciDeviceRequestFailed(requests=requests)
 
     def __iter__(self) -> ty.Iterator[Pool]:
         pools: ty.List[Pool] = []
@@ -667,3 +932,53 @@ class PciDeviceStats(object):
         """Return the contents of the pools as a PciDevicePoolList object."""
         stats = [x for x in self]
         return pci_device_pool.from_pci_stats(stats)
+
+    def has_remote_managed_device_pools(self) -> bool:
+        """Determine whether remote managed device pools are present on a host.
+
+        The check only matches pool tags against a zero-count dummy request,
+        so neither free device counts nor NUMA cells are considered here.
+        """
+        dummy_req = objects.InstancePCIRequest(
+            count=0,  # only the spec is used for the matching below
+            spec=[{'remote_managed': True}]
+        )
+        pools = self._filter_pools_for_spec(self.pools, dummy_req)
+        return bool(pools)
+
+    def populate_pools_metadata_from_assigned_devices(self) -> None:
+        """Set the rp_uuid of each pool from the devices assigned to it.
+
+        Only call this on the compute, where the device - pool assignment is
+        known, never from the scheduler. Pools without an RP are skipped.
+
+        :raises ValueError: if the devices of a pool map to more than one RP
+        """
+        # PciDevices are tracked in placement and flavor based PCI requests
+        # are scheduled and allocated there. To correlate what is allocated
+        # in placement with what is consumed in nova we map device pools to
+        # RPs via the RP UUID stored in the extra_info of each PciDevice.
+        # NOTE(gibi): We cannot do this when the device is originally added to
+        # the pool as the device -> placement translation, that creates the
+        # RPs, runs after all the devices are created and assigned to pools.
+        for pool in self.pools:
+            pool_rps = {
+                dev.extra_info["rp_uuid"]
+                for dev in pool["devices"]
+                if "rp_uuid" in dev.extra_info
+            }
+            if len(pool_rps) >= 2:
+                # FIXME(gibi): Do we have a 1:1 pool - RP mapping even
+                #  if two PFs providing very similar VFs?
+                raise ValueError(
+                    "We have a pool %s connected to more than one RPs %s in "
+                    "placement via devs %s" % (pool, pool_rps, pool["devices"])
+                )
+
+            if not pool_rps:
+                # this can happen if the nova-compute is upgraded to have the
+                # PCI in placement inventory handling code but
+                # [pci]report_in_placement is not turned on yet.
+                continue
+
+            pool['rp_uuid'] = next(iter(pool_rps))  # exactly one RP here