diff options
Diffstat (limited to 'nova/pci/stats.py')
-rw-r--r-- | nova/pci/stats.py | 391 |
1 files changed, 353 insertions, 38 deletions
diff --git a/nova/pci/stats.py b/nova/pci/stats.py index e8e810fa4f..c6e4844b34 100644 --- a/nova/pci/stats.py +++ b/nova/pci/stats.py @@ -13,17 +13,19 @@ # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. - +import collections import copy import typing as ty from oslo_config import cfg from oslo_log import log as logging +from oslo_utils import strutils from nova import exception from nova import objects from nova.objects import fields from nova.objects import pci_device_pool +from nova.pci.request import PCI_REMOTE_MANAGED_TAG from nova.pci import utils from nova.pci import whitelist @@ -62,12 +64,25 @@ class PciDeviceStats(object): """ pool_keys = ['product_id', 'vendor_id', 'numa_node', 'dev_type'] + # these can be specified in the [pci]device_spec and can be requested via + # the PCI alias, but they are matched by the placement + # allocation_candidates query, so we can ignore them during pool creation + # and during filtering here + ignored_spec_tags = ignored_pool_tags = ['resource_class', 'traits'] + # this is a metadata key in the spec that is matched + # specially in _filter_pools_based_on_placement_allocation. So we can + # ignore them in the general matching logic. + ignored_spec_tags += ['rp_uuids'] + # this is a metadata key in the pool that is matched + # specially in _filter_pools_based_on_placement_allocation. So we can + # ignore them in the general matching logic. 
+ ignored_pool_tags += ['rp_uuid'] def __init__( self, numa_topology: 'objects.NUMATopology', stats: 'objects.PCIDevicePoolList' = None, - dev_filter: whitelist.Whitelist = None, + dev_filter: ty.Optional[whitelist.Whitelist] = None, ) -> None: self.numa_topology = numa_topology self.pools = ( @@ -75,7 +90,7 @@ class PciDeviceStats(object): ) self.pools.sort(key=lambda item: len(item)) self.dev_filter = dev_filter or whitelist.Whitelist( - CONF.pci.passthrough_whitelist) + CONF.pci.device_spec) def _equal_properties( self, dev: Pool, entry: Pool, matching_keys: ty.List[str], @@ -95,6 +110,28 @@ class PciDeviceStats(object): return None + @staticmethod + def _ensure_remote_managed_tag( + dev: 'objects.PciDevice', pool: Pool): + """Add a remote_managed tag depending on a device type if needed. + + Network devices may be managed remotely, e.g. by a SmartNIC DPU. If + a tag has not been explicitly provided, populate it by assuming that + a device is not remote managed by default. + """ + if dev.dev_type not in (fields.PciDeviceType.SRIOV_VF, + fields.PciDeviceType.SRIOV_PF, + fields.PciDeviceType.VDPA): + return + + # A tag is added here rather than at the client side to avoid an + # issue with having objects without this tag specified during an + # upgrade to the first version that supports handling this tag. + if pool.get(PCI_REMOTE_MANAGED_TAG) is None: + # NOTE: tags are compared as strings case-insensitively, see + # pci_device_prop_match in nova/pci/utils.py. 
+ pool[PCI_REMOTE_MANAGED_TAG] = 'false' + def _create_pool_keys_from_dev( self, dev: 'objects.PciDevice', ) -> ty.Optional[Pool]: @@ -110,8 +147,22 @@ class PciDeviceStats(object): return None tags = devspec.get_tags() pool = {k: getattr(dev, k) for k in self.pool_keys} + if tags: - pool.update(tags) + pool.update( + { + k: v + for k, v in tags.items() + if k not in self.ignored_pool_tags + } + ) + # NOTE(gibi): since PCI in placement maps a PCI dev or a PF to a + # single RP and the scheduler allocates from a specific RP we need + # to split the pools by PCI or PF address. We can still keep + # the VFs from the same parent PF in a single pool though as they + # are equivalent from placement perspective. + pool['address'] = dev.parent_addr or dev.address + # NOTE(gibi): parent_ifname acts like a tag during pci claim but # not provided as part of the whitelist spec as it is auto detected # by the virt driver. @@ -120,6 +171,9 @@ class PciDeviceStats(object): # already in placement. if dev.extra_info.get('parent_ifname'): pool['parent_ifname'] = dev.extra_info['parent_ifname'] + + self._ensure_remote_managed_tag(dev, pool) + return pool def _get_pool_with_device_type_mismatch( @@ -197,6 +251,17 @@ class PciDeviceStats(object): free_devs.extend(pool['devices']) return free_devs + def _allocate_devs( + self, pool: Pool, num: int, request_id: str + ) -> ty.List["objects.PciDevice"]: + alloc_devices = [] + for _ in range(num): + pci_dev = pool['devices'].pop() + self._handle_device_dependents(pci_dev) + pci_dev.request_id = request_id + alloc_devices.append(pci_dev) + return alloc_devices + def consume_requests( self, pci_requests: 'objects.InstancePCIRequests', @@ -208,7 +273,10 @@ class PciDeviceStats(object): for request in pci_requests: count = request.count - pools = self._filter_pools(self.pools, request, numa_cells) + rp_uuids = self._get_rp_uuids_for_request( + request=request, provider_mapping=None) + pools = self._filter_pools( + self.pools, request, 
numa_cells, rp_uuids=rp_uuids) # Failed to allocate the required number of devices. Return the # devices already allocated during previous iterations back to @@ -222,22 +290,31 @@ class PciDeviceStats(object): "on the compute node semaphore.") for d in range(len(alloc_devices)): self.add_device(alloc_devices.pop()) - return None - - for pool in pools: - if pool['count'] >= count: - num_alloc = count - else: - num_alloc = pool['count'] - count -= num_alloc - pool['count'] -= num_alloc - for d in range(num_alloc): - pci_dev = pool['devices'].pop() - self._handle_device_dependents(pci_dev) - pci_dev.request_id = request.request_id - alloc_devices.append(pci_dev) - if count == 0: - break + raise exception.PciDeviceRequestFailed(requests=pci_requests) + + if not rp_uuids: + # if there is no placement allocation then we are free to + # consume from the pools in any order: + for pool in pools: + if pool['count'] >= count: + num_alloc = count + else: + num_alloc = pool['count'] + count -= num_alloc + pool['count'] -= num_alloc + alloc_devices += self._allocate_devs( + pool, num_alloc, request.request_id) + if count == 0: + break + else: + # but if there is placement allocation then we have to follow + # it + requested_devs_per_pool_rp = collections.Counter(rp_uuids) + for pool in pools: + count = requested_devs_per_pool_rp[pool['rp_uuid']] + pool['count'] -= count + alloc_devices += self._allocate_devs( + pool, count, request.request_id) return alloc_devices @@ -252,8 +329,12 @@ class PciDeviceStats(object): if pci_dev.dev_type == fields.PciDeviceType.SRIOV_PF: vfs_list = pci_dev.child_devices if vfs_list: + free_devs = self.get_free_devs() for vf in vfs_list: - self.remove_device(vf) + # NOTE(gibi): do not try to remove a device that are + # already removed + if vf in free_devs: + self.remove_device(vf) elif pci_dev.dev_type in ( fields.PciDeviceType.SRIOV_VF, fields.PciDeviceType.VDPA, @@ -282,7 +363,15 @@ class PciDeviceStats(object): :returns: A list of pools that can 
be used to support the request if this is possible. """ - request_specs = request.spec + + def ignore_keys(spec): + return { + k: v + for k, v in spec.items() + if k not in self.ignored_spec_tags + } + + request_specs = [ignore_keys(spec) for spec in request.spec] return [ pool for pool in pools if utils.pci_device_prop_match(pool, request_specs) @@ -458,11 +547,73 @@ class PciDeviceStats(object): ] return pools + def _filter_pools_for_unrequested_remote_managed_devices( + self, pools: ty.List[Pool], request: 'objects.InstancePCIRequest', + ) -> ty.List[Pool]: + """Filter out pools with remote_managed devices, unless requested. + + Remote-managed devices are not usable for legacy SR-IOV or hardware + offload scenarios and must be excluded from allocation. + + :param pools: A list of PCI device pool dicts + :param request: An InstancePCIRequest object describing the type, + quantity and required NUMA affinity of device(s) we want. + :returns: A list of pools that can be used to support the request if + this is possible. + """ + if all(not strutils.bool_from_string(spec.get(PCI_REMOTE_MANAGED_TAG)) + for spec in request.spec): + pools = [pool for pool in pools + if not strutils.bool_from_string( + pool.get(PCI_REMOTE_MANAGED_TAG))] + return pools + + def _filter_pools_based_on_placement_allocation( + self, + pools: ty.List[Pool], + request: 'objects.InstancePCIRequest', + rp_uuids: ty.List[str], + ) -> ty.List[Pool]: + if not rp_uuids: + # If there is no placement allocation then we don't need to filter + # by it. This could happen if the instance only has neutron port + # based InstancePCIRequest as that is currently not having + # placement allocation (except for QoS ports, but that handled in a + # separate codepath) or if the [filter_scheduler]pci_in_placement + # configuration option is not enabled in the scheduler. 
+ return pools + + requested_dev_count_per_rp = collections.Counter(rp_uuids) + matching_pools = [] + for pool in pools: + rp_uuid = pool.get('rp_uuid') + if rp_uuid is None: + # NOTE(gibi): As rp_uuids is not empty the scheduler allocated + # PCI resources on this host, so we know that + # [pci]report_in_placement is enabled on this host. But this + # pool has no RP mapping which can only happen if the pool + # contains PCI devices with physical_network tag, as those + # devices not yet reported in placement. But if they are not + # reported then we can ignore them here too. + continue + + if ( + # the placement allocation contains this pool + rp_uuid in requested_dev_count_per_rp and + # the amount of dev allocated in placement can be consumed + # from the pool + pool["count"] >= requested_dev_count_per_rp[rp_uuid] + ): + matching_pools.append(pool) + + return matching_pools + def _filter_pools( self, pools: ty.List[Pool], request: 'objects.InstancePCIRequest', numa_cells: ty.Optional[ty.List['objects.InstanceNUMACell']], + rp_uuids: ty.List[str], ) -> ty.Optional[ty.List[Pool]]: """Determine if an individual PCI request can be met. @@ -477,6 +628,9 @@ class PciDeviceStats(object): quantity and required NUMA affinity of device(s) we want. :param numa_cells: A list of InstanceNUMACell objects whose ``id`` corresponds to the ``id`` of host NUMACell objects. + :param rp_uuids: A list of RP uuids this request is fulfilled from in + placement. So here we have to consider only the pools matching with + these RP uuids :returns: A list of pools that can be used to support the request if this is possible, else None. """ @@ -547,6 +701,33 @@ class PciDeviceStats(object): before_count - after_count ) + # If we're not requesting remote_managed devices then we should not + # use these either. Exclude them. 
+ before_count = after_count + pools = self._filter_pools_for_unrequested_remote_managed_devices( + pools, request) + after_count = sum([pool['count'] for pool in pools]) + + if after_count < before_count: + LOG.debug( + 'Dropped %d device(s) as they are remote-managed devices which' + 'we have not requested', + before_count - after_count + ) + + # if there is placement allocation for the request then we have to + # remove the pools that are not in the placement allocation + before_count = after_count + pools = self._filter_pools_based_on_placement_allocation( + pools, request, rp_uuids) + after_count = sum([pool['count'] for pool in pools]) + if after_count < before_count: + LOG.debug( + 'Dropped %d device(s) that are not part of the placement ' + 'allocation', + before_count - after_count + ) + if after_count < request.count: LOG.debug('Not enough PCI devices left to satisfy request') return None @@ -556,6 +737,7 @@ class PciDeviceStats(object): def support_requests( self, requests: ty.List['objects.InstancePCIRequest'], + provider_mapping: ty.Optional[ty.Dict[str, ty.List[str]]], numa_cells: ty.Optional[ty.List['objects.InstanceNUMACell']] = None, ) -> bool: """Determine if the PCI requests can be met. @@ -569,20 +751,38 @@ class PciDeviceStats(object): :param requests: A list of InstancePCIRequest object describing the types, quantities and required NUMA affinities of devices we want. :type requests: nova.objects.InstancePCIRequests + :param provider_mapping: A dict keyed by RequestGroup requester_id, + to a list of resource provider UUIDs which provide resource + for that RequestGroup. If it is None then it signals that the + InstancePCIRequest objects already stores a mapping per request. + I.e.: we are called _after_ the scheduler made allocations for this + request in placement. :param numa_cells: A list of InstanceNUMACell objects whose ``id`` corresponds to the ``id`` of host NUMACells, or None. 
:returns: Whether this compute node can satisfy the given request. """ - # NOTE(yjiang5): this function has high possibility to fail, - # so no exception should be triggered for performance reason. - return all( - self._filter_pools(self.pools, r, numa_cells) for r in requests - ) + + # try to apply the requests on the copy of the stats if it applies + # cleanly then we know that the requests are supported. We call apply + # only on a copy as we don't want to actually consume resources from + # the pool as at this point this is just a test during host filtering. + # Later the scheduler will call apply_request to consume on the + # selected host. The compute will call consume_request during PCI claim + # to consume not just from the pools but also consume PciDevice + # objects. + stats = copy.deepcopy(self) + try: + stats.apply_requests(requests, provider_mapping, numa_cells) + except exception.PciDeviceRequestFailed: + return False + + return True def _apply_request( self, pools: ty.List[Pool], request: 'objects.InstancePCIRequest', + rp_uuids: ty.List[str], numa_cells: ty.Optional[ty.List['objects.InstanceNUMACell']] = None, ) -> bool: """Apply an individual PCI request. @@ -596,6 +796,8 @@ class PciDeviceStats(object): :param pools: A list of PCI device pool dicts :param request: An InstancePCIRequest object describing the type, quantity and required NUMA affinity of device(s) we want. + :param rp_uuids: A list of RP uuids this request is fulfilled from in + placement :param numa_cells: A list of InstanceNUMACell objects whose ``id`` corresponds to the ``id`` of host NUMACell objects. 
:returns: True if the request was applied against the provided pools @@ -605,22 +807,77 @@ class PciDeviceStats(object): # Two concurrent requests may succeed when called support_requests # because this method does not remove related devices from the pools - filtered_pools = self._filter_pools(pools, request, numa_cells) + filtered_pools = self._filter_pools( + pools, request, numa_cells, rp_uuids) if not filtered_pools: return False - count = request.count - for pool in filtered_pools: - count = self._decrease_pool_count(pools, pool, count) - if not count: - break + if not rp_uuids: + # If there is no placement allocation for this request then we are + # free to consume from the filtered pools in any order + count = request.count + for pool in filtered_pools: + count = self._decrease_pool_count(pools, pool, count) + if not count: + break + else: + # but if there is placement allocation then we have to follow that + requested_devs_per_pool_rp = collections.Counter(rp_uuids) + for pool in filtered_pools: + count = requested_devs_per_pool_rp[pool['rp_uuid']] + pool['count'] -= count + if pool['count'] == 0: + pools.remove(pool) return True + def _get_rp_uuids_for_request( + self, + provider_mapping: ty.Optional[ty.Dict[str, ty.List[str]]], + request: 'objects.InstancePCIRequest' + ) -> ty.List[str]: + """Return the list of RP uuids that are fulfilling the request. + + An RP will be in the list as many times as many devices needs to + be allocated from that RP. 
+ """ + + if request.source == objects.InstancePCIRequest.NEUTRON_PORT: + # TODO(gibi): support neutron based requests in a later cycle + # an empty list will signal that any PCI pool can be used for this + # request + return [] + + if not provider_mapping: + # NOTE(gibi): AFAIK specs is always a list of a single dict + # but the object is hard to change retroactively + rp_uuids = request.spec[0].get('rp_uuids') + if not rp_uuids: + # This can happen if [filter_scheduler]pci_in_placement is not + # enabled yet + # An empty list will signal that any PCI pool can be used for + # this request + return [] + + # TODO(gibi): this is baaad but spec is a dict of string so + # the list is serialized + return rp_uuids.split(',') + + # NOTE(gibi): the PCI prefilter generates RequestGroup suffixes from + # InstancePCIRequests in the form of {request_id}-{count_index} + # NOTE(gibi): a suffixed request group always fulfilled from a single + # RP + return [ + rp_uuids[0] + for group_id, rp_uuids in provider_mapping.items() + if group_id.startswith(request.request_id) + ] + def apply_requests( self, requests: ty.List['objects.InstancePCIRequest'], + provider_mapping: ty.Optional[ty.Dict[str, ty.List[str]]], numa_cells: ty.Optional[ty.List['objects.InstanceNUMACell']] = None, ) -> None: """Apply PCI requests to the PCI stats. @@ -634,15 +891,23 @@ class PciDeviceStats(object): :param requests: A list of InstancePCIRequest object describing the types, quantities and required NUMA affinities of devices we want. :type requests: nova.objects.InstancePCIRequests + :param provider_mapping: A dict keyed by RequestGroup requester_id, + to a list of resource provider UUIDs which provide resource + for that RequestGroup. If it is None then it signals that the + InstancePCIRequest objects already stores a mapping per request. + I.e.: we are called _after_ the scheduler made allocations for this + request in placement. 
:param numa_cells: A list of InstanceNUMACell objects whose ``id`` corresponds to the ``id`` of host NUMACells, or None. :raises: exception.PciDeviceRequestFailed if this compute node cannot satisfy the given request. """ - if not all( - self._apply_request(self.pools, r, numa_cells) for r in requests - ): - raise exception.PciDeviceRequestFailed(requests=requests) + + for r in requests: + rp_uuids = self._get_rp_uuids_for_request(provider_mapping, r) + + if not self._apply_request(self.pools, r, rp_uuids, numa_cells): + raise exception.PciDeviceRequestFailed(requests=requests) def __iter__(self) -> ty.Iterator[Pool]: pools: ty.List[Pool] = [] @@ -667,3 +932,53 @@ class PciDeviceStats(object): """Return the contents of the pools as a PciDevicePoolList object.""" stats = [x for x in self] return pci_device_pool.from_pci_stats(stats) + + def has_remote_managed_device_pools(self) -> bool: + """Determine whether remote managed device pools are present on a host. + + The check is pool-based, not free device-based and is NUMA cell + agnostic. + """ + dummy_req = objects.InstancePCIRequest( + count=0, + spec=[{'remote_managed': True}] + ) + pools = self._filter_pools_for_spec(self.pools, dummy_req) + return bool(pools) + + def populate_pools_metadata_from_assigned_devices(self): + """Populate the rp_uuid of each pool based on the rp_uuid of the + devices assigned to the pool. This can only be called from the compute + where devices are assigned to each pool. This should not be called from + the scheduler as there device - pool assignment is not known. + """ + # PciDevices are tracked in placement and flavor based PCI requests + # are scheduled and allocated in placement. To be able to correlate + # what is allocated in placement and what is consumed in nova we + # need to map device pools to RPs. We can do that as the PciDevice + # contains the RP UUID that represents it in placement. 
+ # NOTE(gibi): We cannot do this when the device is originally added to + # the pool as the device -> placement translation, that creates the + # RPs, runs after all the devices are created and assigned to pools. + for pool in self.pools: + pool_rps = { + dev.extra_info.get("rp_uuid") + for dev in pool["devices"] + if "rp_uuid" in dev.extra_info + } + if len(pool_rps) >= 2: + # FIXME(gibi): Do we have a 1:1 pool - RP mapping even + # if two PFs providing very similar VFs? + raise ValueError( + "We have a pool %s connected to more than one RPs %s in " + "placement via devs %s" % (pool, pool_rps, pool["devices"]) + ) + + if not pool_rps: + # this can happen if the nova-compute is upgraded to have the + # PCI in placement inventory handling code but + # [pci]report_in_placement is not turned on yet. + continue + + if pool_rps: # now we know that it is a single RP + pool['rp_uuid'] = next(iter(pool_rps)) |