Diffstat (limited to 'nova/pci/stats.py')
-rw-r--r-- | nova/pci/stats.py | 285 |
1 files changed, 255 insertions, 30 deletions
diff --git a/nova/pci/stats.py b/nova/pci/stats.py
index 3518b95289..c6e4844b34 100644
--- a/nova/pci/stats.py
+++ b/nova/pci/stats.py
@@ -13,7 +13,7 @@
 #    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 #    License for the specific language governing permissions and limitations
 #    under the License.
-
+import collections
 import copy
 import typing as ty
 
@@ -64,12 +64,25 @@ class PciDeviceStats(object):
     """
 
     pool_keys = ['product_id', 'vendor_id', 'numa_node', 'dev_type']
+    # these can be specified in the [pci]device_spec and can be requested via
+    # the PCI alias, but they are matched by the placement
+    # allocation_candidates query, so we can ignore them during pool creation
+    # and during filtering here
+    ignored_spec_tags = ignored_pool_tags = ['resource_class', 'traits']
+    # this is a metadata key in the spec that is matched
+    # specially in _filter_pools_based_on_placement_allocation. So we can
+    # ignore it in the general matching logic.
+    ignored_spec_tags += ['rp_uuids']
+    # this is a metadata key in the pool that is matched
+    # specially in _filter_pools_based_on_placement_allocation. So we can
+    # ignore it in the general matching logic.
+    ignored_pool_tags += ['rp_uuid']
 
     def __init__(
         self,
         numa_topology: 'objects.NUMATopology',
         stats: 'objects.PCIDevicePoolList' = None,
-        dev_filter: whitelist.Whitelist = None,
+        dev_filter: ty.Optional[whitelist.Whitelist] = None,
     ) -> None:
         self.numa_topology = numa_topology
         self.pools = (
@@ -134,8 +147,22 @@ class PciDeviceStats(object):
             return None
         tags = devspec.get_tags()
         pool = {k: getattr(dev, k) for k in self.pool_keys}
+
         if tags:
-            pool.update(tags)
+            pool.update(
+                {
+                    k: v
+                    for k, v in tags.items()
+                    if k not in self.ignored_pool_tags
+                }
+            )
+        # NOTE(gibi): since PCI in placement maps a PCI dev or a PF to a
+        # single RP and the scheduler allocates from a specific RP we need
+        # to split the pools by PCI or PF address. We can still keep
+        # the VFs from the same parent PF in a single pool though as they
+        # are equivalent from placement perspective.
+        pool['address'] = dev.parent_addr or dev.address
+
         # NOTE(gibi): parent_ifname acts like a tag during pci claim but
         # not provided as part of the whitelist spec as it is auto detected
         # by the virt driver.
@@ -224,6 +251,17 @@ class PciDeviceStats(object):
             free_devs.extend(pool['devices'])
         return free_devs
 
+    def _allocate_devs(
+        self, pool: Pool, num: int, request_id: str
+    ) -> ty.List["objects.PciDevice"]:
+        alloc_devices = []
+        for _ in range(num):
+            pci_dev = pool['devices'].pop()
+            self._handle_device_dependents(pci_dev)
+            pci_dev.request_id = request_id
+            alloc_devices.append(pci_dev)
+        return alloc_devices
+
     def consume_requests(
         self,
         pci_requests: 'objects.InstancePCIRequests',
@@ -235,7 +273,10 @@ class PciDeviceStats(object):
 
         for request in pci_requests:
             count = request.count
-            pools = self._filter_pools(self.pools, request, numa_cells)
+            rp_uuids = self._get_rp_uuids_for_request(
+                request=request, provider_mapping=None)
+            pools = self._filter_pools(
+                self.pools, request, numa_cells, rp_uuids=rp_uuids)
 
             # Failed to allocate the required number of devices. Return the
             # devices already allocated during previous iterations back to
@@ -251,20 +292,29 @@ class PciDeviceStats(object):
                     self.add_device(alloc_devices.pop())
                 raise exception.PciDeviceRequestFailed(requests=pci_requests)
 
-            for pool in pools:
-                if pool['count'] >= count:
-                    num_alloc = count
-                else:
-                    num_alloc = pool['count']
-                count -= num_alloc
-                pool['count'] -= num_alloc
-                for d in range(num_alloc):
-                    pci_dev = pool['devices'].pop()
-                    self._handle_device_dependents(pci_dev)
-                    pci_dev.request_id = request.request_id
-                    alloc_devices.append(pci_dev)
-                if count == 0:
-                    break
+            if not rp_uuids:
+                # if there is no placement allocation then we are free to
+                # consume from the pools in any order:
+                for pool in pools:
+                    if pool['count'] >= count:
+                        num_alloc = count
+                    else:
+                        num_alloc = pool['count']
+                    count -= num_alloc
+                    pool['count'] -= num_alloc
+                    alloc_devices += self._allocate_devs(
+                        pool, num_alloc, request.request_id)
+                    if count == 0:
+                        break
+            else:
+                # but if there is a placement allocation then we have to
+                # follow it
+                requested_devs_per_pool_rp = collections.Counter(rp_uuids)
+                for pool in pools:
+                    count = requested_devs_per_pool_rp[pool['rp_uuid']]
+                    pool['count'] -= count
+                    alloc_devices += self._allocate_devs(
+                        pool, count, request.request_id)
 
         return alloc_devices
 
@@ -313,7 +363,15 @@ class PciDeviceStats(object):
         :returns: A list of pools that can be used to support the request if
             this is possible.
         """
-        request_specs = request.spec
+
+        def ignore_keys(spec):
+            return {
+                k: v
+                for k, v in spec.items()
+                if k not in self.ignored_spec_tags
+            }
+
+        request_specs = [ignore_keys(spec) for spec in request.spec]
         return [
             pool for pool in pools
             if utils.pci_device_prop_match(pool, request_specs)
@@ -510,11 +568,52 @@ class PciDeviceStats(object):
                         pool.get(PCI_REMOTE_MANAGED_TAG))]
         return pools
 
+    def _filter_pools_based_on_placement_allocation(
+        self,
+        pools: ty.List[Pool],
+        request: 'objects.InstancePCIRequest',
+        rp_uuids: ty.List[str],
+    ) -> ty.List[Pool]:
+        if not rp_uuids:
+            # If there is no placement allocation then we don't need to filter
+            # by it. This could happen if the instance only has neutron port
+            # based InstancePCIRequests, as those currently have no placement
+            # allocation (except for QoS ports, which are handled in a
+            # separate codepath), or if the [filter_scheduler]pci_in_placement
+            # configuration option is not enabled in the scheduler.
+            return pools
+
+        requested_dev_count_per_rp = collections.Counter(rp_uuids)
+        matching_pools = []
+        for pool in pools:
+            rp_uuid = pool.get('rp_uuid')
+            if rp_uuid is None:
+                # NOTE(gibi): As rp_uuids is not empty the scheduler allocated
+                # PCI resources on this host, so we know that
+                # [pci]report_in_placement is enabled on this host. But this
+                # pool has no RP mapping, which can only happen if the pool
+                # contains PCI devices with a physical_network tag, as those
+                # devices are not yet reported in placement. If they are not
+                # reported then we can ignore them here too.
+                continue
+
+            if (
+                # the placement allocation contains this pool
+                rp_uuid in requested_dev_count_per_rp and
+                # the number of devices allocated in placement can be
+                # consumed from the pool
+                pool["count"] >= requested_dev_count_per_rp[rp_uuid]
+            ):
+                matching_pools.append(pool)
+
+        return matching_pools
+
     def _filter_pools(
         self,
         pools: ty.List[Pool],
         request: 'objects.InstancePCIRequest',
         numa_cells: ty.Optional[ty.List['objects.InstanceNUMACell']],
+        rp_uuids: ty.List[str],
     ) -> ty.Optional[ty.List[Pool]]:
         """Determine if an individual PCI request can be met.
 
@@ -529,6 +628,9 @@ class PciDeviceStats(object):
             quantity and required NUMA affinity of device(s) we want.
         :param numa_cells: A list of InstanceNUMACell objects whose ``id``
             corresponds to the ``id`` of host NUMACell objects.
+        :param rp_uuids: A list of RP uuids this request is fulfilled from
+            in placement. So here we only have to consider the pools
+            matching these RP uuids.
         :returns: A list of pools that can be used to support the request if
             this is possible, else None.
         """
@@ -613,6 +715,19 @@ class PciDeviceStats(object):
                 before_count - after_count
             )
 
+        # if there is a placement allocation for the request then we have
+        # to remove the pools that are not in the placement allocation
+        before_count = after_count
+        pools = self._filter_pools_based_on_placement_allocation(
+            pools, request, rp_uuids)
+        after_count = sum([pool['count'] for pool in pools])
+        if after_count < before_count:
+            LOG.debug(
+                'Dropped %d device(s) that are not part of the placement '
+                'allocation',
+                before_count - after_count
+            )
+
         if after_count < request.count:
             LOG.debug('Not enough PCI devices left to satisfy request')
             return None
@@ -622,6 +737,7 @@ class PciDeviceStats(object):
     def support_requests(
         self,
         requests: ty.List['objects.InstancePCIRequest'],
+        provider_mapping: ty.Optional[ty.Dict[str, ty.List[str]]],
        numa_cells: ty.Optional[ty.List['objects.InstanceNUMACell']] = None,
     ) -> bool:
         """Determine if the PCI requests can be met.
@@ -635,6 +751,12 @@ class PciDeviceStats(object):
         :param requests: A list of InstancePCIRequest object describing the
             types, quantities and required NUMA affinities of devices we want.
         :type requests: nova.objects.InstancePCIRequests
+        :param provider_mapping: A dict keyed by RequestGroup requester_id,
+            to a list of resource provider UUIDs which provide resources
+            for that RequestGroup. If it is None then it signals that the
+            InstancePCIRequest objects already store a mapping per request.
+            I.e.: we are called _after_ the scheduler made allocations for
+            this request in placement.
         :param numa_cells: A list of InstanceNUMACell objects whose ``id``
             corresponds to the ``id`` of host NUMACells, or None.
         :returns: Whether this compute node can satisfy the given request.
@@ -650,7 +772,7 @@ class PciDeviceStats(object):
         # objects.
         stats = copy.deepcopy(self)
         try:
-            stats.apply_requests(requests, numa_cells)
+            stats.apply_requests(requests, provider_mapping, numa_cells)
         except exception.PciDeviceRequestFailed:
             return False
 
@@ -660,6 +782,7 @@ class PciDeviceStats(object):
         self,
         pools: ty.List[Pool],
         request: 'objects.InstancePCIRequest',
+        rp_uuids: ty.List[str],
         numa_cells: ty.Optional[ty.List['objects.InstanceNUMACell']] = None,
     ) -> bool:
         """Apply an individual PCI request.
@@ -673,6 +796,8 @@ class PciDeviceStats(object):
         :param pools: A list of PCI device pool dicts
         :param request: An InstancePCIRequest object describing the type,
             quantity and required NUMA affinity of device(s) we want.
+        :param rp_uuids: A list of RP uuids this request is fulfilled from
+            in placement
         :param numa_cells: A list of InstanceNUMACell objects whose ``id``
             corresponds to the ``id`` of host NUMACell objects.
         :returns: True if the request was applied against the provided pools
@@ -682,22 +807,77 @@ class PciDeviceStats(object):
         # Two concurrent requests may succeed when called support_requests
         # because this method does not remove related devices from the pools
 
-        filtered_pools = self._filter_pools(pools, request, numa_cells)
+        filtered_pools = self._filter_pools(
+            pools, request, numa_cells, rp_uuids)
 
         if not filtered_pools:
             return False
 
-        count = request.count
-        for pool in filtered_pools:
-            count = self._decrease_pool_count(pools, pool, count)
-            if not count:
-                break
+        if not rp_uuids:
+            # If there is no placement allocation for this request then we
+            # are free to consume from the filtered pools in any order
+            count = request.count
+            for pool in filtered_pools:
+                count = self._decrease_pool_count(pools, pool, count)
+                if not count:
+                    break
+        else:
+            # but if there is a placement allocation we have to follow that
+            requested_devs_per_pool_rp = collections.Counter(rp_uuids)
+            for pool in filtered_pools:
+                count = requested_devs_per_pool_rp[pool['rp_uuid']]
+                pool['count'] -= count
+                if pool['count'] == 0:
+                    pools.remove(pool)
 
         return True
 
+    def _get_rp_uuids_for_request(
+        self,
+        provider_mapping: ty.Optional[ty.Dict[str, ty.List[str]]],
+        request: 'objects.InstancePCIRequest'
+    ) -> ty.List[str]:
+        """Return the list of RP uuids that are fulfilling the request.
+
+        An RP will be in the list as many times as the number of devices
+        that need to be allocated from that RP.
+        """
+
+        if request.source == objects.InstancePCIRequest.NEUTRON_PORT:
+            # TODO(gibi): support neutron based requests in a later cycle
+            # an empty list will signal that any PCI pool can be used for this
+            # request
+            return []
+
+        if not provider_mapping:
+            # NOTE(gibi): AFAIK specs is always a list of a single dict
+            # but the object is hard to change retroactively
+            rp_uuids = request.spec[0].get('rp_uuids')
+            if not rp_uuids:
+                # This can happen if [filter_scheduler]pci_in_placement is
+                # not enabled yet.
+                # An empty list will signal that any PCI pool can be used
+                # for this request.
+                return []
+
+            # TODO(gibi): this is baaad but spec is a dict of string so
+            # the list is serialized
+            return rp_uuids.split(',')
+
+        # NOTE(gibi): the PCI prefilter generates RequestGroup suffixes from
+        # InstancePCIRequests in the form of {request_id}-{count_index}
+        # NOTE(gibi): a suffixed request group is always fulfilled from a
+        # single RP
+        return [
+            rp_uuids[0]
+            for group_id, rp_uuids in provider_mapping.items()
+            if group_id.startswith(request.request_id)
+        ]
+
     def apply_requests(
         self,
         requests: ty.List['objects.InstancePCIRequest'],
+        provider_mapping: ty.Optional[ty.Dict[str, ty.List[str]]],
         numa_cells: ty.Optional[ty.List['objects.InstanceNUMACell']] = None,
     ) -> None:
         """Apply PCI requests to the PCI stats.
@@ -711,15 +891,23 @@ class PciDeviceStats(object):
         :param requests: A list of InstancePCIRequest object describing the
             types, quantities and required NUMA affinities of devices we want.
         :type requests: nova.objects.InstancePCIRequests
+        :param provider_mapping: A dict keyed by RequestGroup requester_id,
+            to a list of resource provider UUIDs which provide resources
+            for that RequestGroup. If it is None then it signals that the
+            InstancePCIRequest objects already store a mapping per request.
+            I.e.: we are called _after_ the scheduler made allocations for
+            this request in placement.
         :param numa_cells: A list of InstanceNUMACell objects whose ``id``
             corresponds to the ``id`` of host NUMACells, or None.
         :raises: exception.PciDeviceRequestFailed if this compute node cannot
             satisfy the given request.
         """
-        if not all(
-            self._apply_request(self.pools, r, numa_cells) for r in requests
-        ):
-            raise exception.PciDeviceRequestFailed(requests=requests)
+
+        for r in requests:
+            rp_uuids = self._get_rp_uuids_for_request(provider_mapping, r)
+
+            if not self._apply_request(self.pools, r, rp_uuids, numa_cells):
+                raise exception.PciDeviceRequestFailed(requests=requests)
 
     def __iter__(self) -> ty.Iterator[Pool]:
         pools: ty.List[Pool] = []
@@ -757,3 +945,40 @@ class PciDeviceStats(object):
         )
         pools = self._filter_pools_for_spec(self.pools, dummy_req)
         return bool(pools)
+
+    def populate_pools_metadata_from_assigned_devices(self):
+        """Populate the rp_uuid of each pool based on the rp_uuid of the
+        devices assigned to the pool. This can only be called on the compute
+        where devices are assigned to each pool. It should not be called on
+        the scheduler as there the device - pool assignment is not known.
+        """
+        # PciDevices are tracked in placement and flavor based PCI requests
+        # are scheduled and allocated in placement. To be able to correlate
+        # what is allocated in placement and what is consumed in nova we
+        # need to map device pools to RPs. We can do that as the PciDevice
+        # contains the RP UUID that represents it in placement.
+        # NOTE(gibi): We cannot do this when the device is originally added to
+        # the pool as the device -> placement translation, which creates the
+        # RPs, runs after all the devices are created and assigned to pools.
+        for pool in self.pools:
+            pool_rps = {
+                dev.extra_info.get("rp_uuid")
+                for dev in pool["devices"]
+                if "rp_uuid" in dev.extra_info
+            }
+            if len(pool_rps) >= 2:
+                # FIXME(gibi): Do we have a 1:1 pool - RP mapping even
+                # if two PFs provide very similar VFs?
+                raise ValueError(
+                    "We have a pool %s connected to more than one RP %s in "
+                    "placement via devs %s" % (pool, pool_rps, pool["devices"])
+                )
+
+            if not pool_rps:
+                # this can happen if the nova-compute is upgraded to have the
+                # PCI in placement inventory handling code but
+                # [pci]report_in_placement is not turned on yet.
+                continue
+
+            if pool_rps:  # now we know that it is a single RP
+                pool['rp_uuid'] = next(iter(pool_rps))
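The heart of the change is that, once the scheduler has produced a placement allocation, devices are no longer consumed from the pools in arbitrary order but strictly per resource provider: the RP UUID list contains one entry per requested device, and a Counter of it tells each pool how many devices to give up. A minimal standalone sketch of that bookkeeping follows; the pool dicts, device names and RP UUID strings are invented for illustration and are not nova objects.

import collections

def consume_by_placement_allocation(pools, rp_uuids):
    # Take exactly as many devices from each pool as the number of times
    # its rp_uuid appears in the placement allocation.
    requested_per_rp = collections.Counter(rp_uuids)
    consumed = []
    for pool in pools:
        count = requested_per_rp.get(pool['rp_uuid'], 0)
        for _ in range(count):
            consumed.append(pool['devices'].pop())
        pool['count'] -= count
    return consumed

pools = [
    {'rp_uuid': 'rp-1', 'count': 2, 'devices': ['dev-a', 'dev-b']},
    {'rp_uuid': 'rp-2', 'count': 1, 'devices': ['dev-c']},
]
# The scheduler allocated two devices from rp-1 and one from rp-2.
print(consume_by_placement_allocation(pools, ['rp-1', 'rp-1', 'rp-2']))
# ['dev-b', 'dev-a', 'dev-c']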
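The per-request RP list itself comes from one of two places, as _get_rp_uuids_for_request in the patch shows: either the serialized, comma separated 'rp_uuids' key stored in the request spec, or the suffixed RequestGroup mapping handed over by the scheduler. A simplified sketch, assuming plain dicts stand in for the request spec and for provider_mapping:

def rp_uuids_for_request(request_id, spec, provider_mapping=None):
    if provider_mapping is None:
        # Without a scheduler-provided mapping, read the RP list back from
        # the serialized, comma separated 'rp_uuids' spec key (empty if
        # pci_in_placement is not enabled).
        serialized = spec.get('rp_uuids')
        return serialized.split(',') if serialized else []
    # With a provider mapping, request groups are suffixed as
    # '<request_id>-<count_index>' and each group maps to a single RP.
    return [
        rps[0]
        for group_id, rps in provider_mapping.items()
        if group_id.startswith(request_id)
    ]

# spec based path, e.g. on the compute during a claim without a mapping:
print(rp_uuids_for_request('req-1', {'rp_uuids': 'rp-1,rp-1,rp-2'}))
# ['rp-1', 'rp-1', 'rp-2']

# provider_mapping based path, e.g. right after scheduling:
mapping = {'req-1-0': ['rp-1'], 'req-1-1': ['rp-2'], 'other-group': ['rp-9']}
print(rp_uuids_for_request('req-1', {}, mapping))
# ['rp-1', 'rp-2']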
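Finally, populate_pools_metadata_from_assigned_devices derives each pool's rp_uuid from the devices already assigned to it and refuses a pool that spans more than one provider. A rough sketch of that rule, with plain dicts standing in for PciDevice objects and their extra_info:

import uuid

def pool_rp_uuid(pool):
    # Collect the RP UUIDs reported by the devices in this pool; devices not
    # yet reported to placement have no 'rp_uuid' in their extra_info.
    rps = {
        dev['extra_info']['rp_uuid']
        for dev in pool['devices']
        if 'rp_uuid' in dev['extra_info']
    }
    if len(rps) >= 2:
        raise ValueError('pool is connected to more than one RP in placement')
    # None: nothing reported yet, e.g. report_in_placement is still disabled.
    return next(iter(rps)) if rps else None

rp = str(uuid.uuid4())
pool = {'devices': [{'extra_info': {'rp_uuid': rp}},
                    {'extra_info': {'rp_uuid': rp}}]}
print(pool_rp_uuid(pool) == rp)  # True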