Diffstat (limited to 'nova/pci/stats.py')
-rw-r--r--  nova/pci/stats.py | 391
1 file changed, 353 insertions(+), 38 deletions(-)
diff --git a/nova/pci/stats.py b/nova/pci/stats.py
index e8e810fa4f..c6e4844b34 100644
--- a/nova/pci/stats.py
+++ b/nova/pci/stats.py
@@ -13,17 +13,19 @@
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
-
+import collections
import copy
import typing as ty
from oslo_config import cfg
from oslo_log import log as logging
+from oslo_utils import strutils
from nova import exception
from nova import objects
from nova.objects import fields
from nova.objects import pci_device_pool
+from nova.pci.request import PCI_REMOTE_MANAGED_TAG
from nova.pci import utils
from nova.pci import whitelist
@@ -62,12 +64,25 @@ class PciDeviceStats(object):
"""
pool_keys = ['product_id', 'vendor_id', 'numa_node', 'dev_type']
+ # these can be specified in the [pci]device_spec and can be requested via
+ # the PCI alias, but they are matched by the placement
+ # allocation_candidates query, so we can ignore them during pool creation
+ # and during filtering here
+ ignored_spec_tags = ignored_pool_tags = ['resource_class', 'traits']
+ # this is a metadata key in the spec that is matched
+ # specially in _filter_pools_based_on_placement_allocation, so we can
+ # ignore it in the general matching logic.
+ ignored_spec_tags += ['rp_uuids']
+ # this is a metadata key in the pool that is matched
+ # specially in _filter_pools_based_on_placement_allocation, so we can
+ # ignore it in the general matching logic.
+ ignored_pool_tags += ['rp_uuid']
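For readers unfamiliar with the tag handling, a minimal self-contained sketch (not the Nova code itself; vendor/product IDs are made up) of what these ignore lists do: keys that placement has already matched are stripped before the generic pool/spec comparison runs.

    ignored_spec_tags = ['resource_class', 'traits', 'rp_uuids']

    spec = {
        'vendor_id': '15b3',
        'product_id': '101e',
        'resource_class': 'CUSTOM_FOO',  # matched by placement, not here
        'rp_uuids': 'uuid-1,uuid-2',     # handled by the placement-based filter
    }

    filtered = {k: v for k, v in spec.items() if k not in ignored_spec_tags}
    assert filtered == {'vendor_id': '15b3', 'product_id': '101e'}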
def __init__(
self,
numa_topology: 'objects.NUMATopology',
stats: 'objects.PCIDevicePoolList' = None,
- dev_filter: whitelist.Whitelist = None,
+ dev_filter: ty.Optional[whitelist.Whitelist] = None,
) -> None:
self.numa_topology = numa_topology
self.pools = (
@@ -75,7 +90,7 @@ class PciDeviceStats(object):
)
self.pools.sort(key=lambda item: len(item))
self.dev_filter = dev_filter or whitelist.Whitelist(
- CONF.pci.passthrough_whitelist)
+ CONF.pci.device_spec)
def _equal_properties(
self, dev: Pool, entry: Pool, matching_keys: ty.List[str],
@@ -95,6 +110,28 @@ class PciDeviceStats(object):
return None
+ @staticmethod
+ def _ensure_remote_managed_tag(
+ dev: 'objects.PciDevice', pool: Pool):
+ """Add a remote_managed tag depending on a device type if needed.
+
+ Network devices may be managed remotely, e.g. by a SmartNIC DPU. If
+ a tag has not been explicitly provided, populate it by assuming that
+ a device is not remote managed by default.
+ """
+ if dev.dev_type not in (fields.PciDeviceType.SRIOV_VF,
+ fields.PciDeviceType.SRIOV_PF,
+ fields.PciDeviceType.VDPA):
+ return
+
+ # A tag is added here rather than at the client side to avoid an
+ # issue with having objects without this tag specified during an
+ # upgrade to the first version that supports handling this tag.
+ if pool.get(PCI_REMOTE_MANAGED_TAG) is None:
+ # NOTE: tags are compared as strings case-insensitively, see
+ # pci_device_prop_match in nova/pci/utils.py.
+ pool[PCI_REMOTE_MANAGED_TAG] = 'false'
+
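A short illustration of the defaulting behaviour above, using plain dicts and assuming PCI_REMOTE_MANAGED_TAG resolves to the string 'remote_managed' as defined in nova/pci/request.py: a VF/PF/VDPA pool without an explicit tag is treated as not remote managed.

    PCI_REMOTE_MANAGED_TAG = 'remote_managed'

    pool = {'vendor_id': '8086', 'product_id': '154c', 'dev_type': 'VF'}
    if pool.get(PCI_REMOTE_MANAGED_TAG) is None:
        # tags are compared as strings, so the default is the string 'false'
        pool[PCI_REMOTE_MANAGED_TAG] = 'false'
    assert pool[PCI_REMOTE_MANAGED_TAG] == 'false'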
def _create_pool_keys_from_dev(
self, dev: 'objects.PciDevice',
) -> ty.Optional[Pool]:
@@ -110,8 +147,22 @@ class PciDeviceStats(object):
return None
tags = devspec.get_tags()
pool = {k: getattr(dev, k) for k in self.pool_keys}
+
if tags:
- pool.update(tags)
+ pool.update(
+ {
+ k: v
+ for k, v in tags.items()
+ if k not in self.ignored_pool_tags
+ }
+ )
+ # NOTE(gibi): since PCI in placement maps a PCI dev or a PF to a
+ # single RP and the scheduler allocates from a specific RP we need
+ # to split the pools by PCI or PF address. We can still keep
+ # the VFs from the same parent PF in a single pool though as they
+ # are equivalent from placement perspective.
+ pool['address'] = dev.parent_addr or dev.address
+
# NOTE(gibi): parent_ifname acts like a tag during pci claim but
# not provided as part of the whitelist spec as it is auto detected
# by the virt driver.
@@ -120,6 +171,9 @@ class PciDeviceStats(object):
# already in placement.
if dev.extra_info.get('parent_ifname'):
pool['parent_ifname'] = dev.extra_info['parent_ifname']
+
+ self._ensure_remote_managed_tag(dev, pool)
+
return pool
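To illustrate the pool-splitting note above (a sketch with fabricated PCI addresses): VFs of the same PF key to the same pool address, while devices under different PFs end up in different pools.

    devs = [
        {'address': '0000:81:00.2', 'parent_addr': '0000:81:00.0'},  # VF of PF 00.0
        {'address': '0000:81:00.3', 'parent_addr': '0000:81:00.0'},  # VF of PF 00.0
        {'address': '0000:81:01.2', 'parent_addr': '0000:81:01.0'},  # VF of PF 01.0
    ]
    pool_addresses = {d['parent_addr'] or d['address'] for d in devs}
    assert pool_addresses == {'0000:81:00.0', '0000:81:01.0'}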
def _get_pool_with_device_type_mismatch(
@@ -197,6 +251,17 @@ class PciDeviceStats(object):
free_devs.extend(pool['devices'])
return free_devs
+ def _allocate_devs(
+ self, pool: Pool, num: int, request_id: str
+ ) -> ty.List["objects.PciDevice"]:
+ alloc_devices = []
+ for _ in range(num):
+ pci_dev = pool['devices'].pop()
+ self._handle_device_dependents(pci_dev)
+ pci_dev.request_id = request_id
+ alloc_devices.append(pci_dev)
+ return alloc_devices
+
def consume_requests(
self,
pci_requests: 'objects.InstancePCIRequests',
@@ -208,7 +273,10 @@ class PciDeviceStats(object):
for request in pci_requests:
count = request.count
- pools = self._filter_pools(self.pools, request, numa_cells)
+ rp_uuids = self._get_rp_uuids_for_request(
+ request=request, provider_mapping=None)
+ pools = self._filter_pools(
+ self.pools, request, numa_cells, rp_uuids=rp_uuids)
# Failed to allocate the required number of devices. Return the
# devices already allocated during previous iterations back to
@@ -222,22 +290,31 @@ class PciDeviceStats(object):
"on the compute node semaphore.")
for d in range(len(alloc_devices)):
self.add_device(alloc_devices.pop())
- return None
-
- for pool in pools:
- if pool['count'] >= count:
- num_alloc = count
- else:
- num_alloc = pool['count']
- count -= num_alloc
- pool['count'] -= num_alloc
- for d in range(num_alloc):
- pci_dev = pool['devices'].pop()
- self._handle_device_dependents(pci_dev)
- pci_dev.request_id = request.request_id
- alloc_devices.append(pci_dev)
- if count == 0:
- break
+ raise exception.PciDeviceRequestFailed(requests=pci_requests)
+
+ if not rp_uuids:
+ # if there is no placement allocation then we are free to
+ # consume from the pools in any order:
+ for pool in pools:
+ if pool['count'] >= count:
+ num_alloc = count
+ else:
+ num_alloc = pool['count']
+ count -= num_alloc
+ pool['count'] -= num_alloc
+ alloc_devices += self._allocate_devs(
+ pool, num_alloc, request.request_id)
+ if count == 0:
+ break
+ else:
+ # but if there is placement allocation then we have to follow
+ # it
+ requested_devs_per_pool_rp = collections.Counter(rp_uuids)
+ for pool in pools:
+ count = requested_devs_per_pool_rp[pool['rp_uuid']]
+ pool['count'] -= count
+ alloc_devices += self._allocate_devs(
+ pool, count, request.request_id)
return alloc_devices
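The placement-driven branch above can be pictured with a small standalone example (made-up RP uuids): the allocation lists one RP per requested device, so a Counter gives the exact number of devices to take from each pool.

    import collections

    rp_uuids = ['rp-a', 'rp-a', 'rp-b']        # 2 devices from rp-a, 1 from rp-b
    per_pool = collections.Counter(rp_uuids)

    pools = [{'rp_uuid': 'rp-a', 'count': 4}, {'rp_uuid': 'rp-b', 'count': 1}]
    for pool in pools:
        pool['count'] -= per_pool[pool['rp_uuid']]
    assert [p['count'] for p in pools] == [2, 0]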
@@ -252,8 +329,12 @@ class PciDeviceStats(object):
if pci_dev.dev_type == fields.PciDeviceType.SRIOV_PF:
vfs_list = pci_dev.child_devices
if vfs_list:
+ free_devs = self.get_free_devs()
for vf in vfs_list:
- self.remove_device(vf)
+ # NOTE(gibi): do not try to remove a device that is
+ # already removed
+ if vf in free_devs:
+ self.remove_device(vf)
elif pci_dev.dev_type in (
fields.PciDeviceType.SRIOV_VF,
fields.PciDeviceType.VDPA,
@@ -282,7 +363,15 @@ class PciDeviceStats(object):
:returns: A list of pools that can be used to support the request if
this is possible.
"""
- request_specs = request.spec
+
+ def ignore_keys(spec):
+ return {
+ k: v
+ for k, v in spec.items()
+ if k not in self.ignored_spec_tags
+ }
+
+ request_specs = [ignore_keys(spec) for spec in request.spec]
return [
pool for pool in pools
if utils.pci_device_prop_match(pool, request_specs)
@@ -458,11 +547,73 @@ class PciDeviceStats(object):
]
return pools
+ def _filter_pools_for_unrequested_remote_managed_devices(
+ self, pools: ty.List[Pool], request: 'objects.InstancePCIRequest',
+ ) -> ty.List[Pool]:
+ """Filter out pools with remote_managed devices, unless requested.
+
+ Remote-managed devices are not usable for legacy SR-IOV or hardware
+ offload scenarios and must be excluded from allocation.
+
+ :param pools: A list of PCI device pool dicts
+ :param request: An InstancePCIRequest object describing the type,
+ quantity and required NUMA affinity of device(s) we want.
+ :returns: A list of pools that can be used to support the request if
+ this is possible.
+ """
+ if all(not strutils.bool_from_string(spec.get(PCI_REMOTE_MANAGED_TAG))
+ for spec in request.spec):
+ pools = [pool for pool in pools
+ if not strutils.bool_from_string(
+ pool.get(PCI_REMOTE_MANAGED_TAG))]
+ return pools
+
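The filter above leans on the lenient string-to-bool conversion in oslo.utils; a quick check of the behaviour it relies on (with oslo.utils installed): an absent tag and the string 'false' both mean "not remote managed".

    from oslo_utils import strutils

    assert strutils.bool_from_string('true') is True
    assert strutils.bool_from_string('False') is False
    # a missing tag (None) falls back to the default, False
    assert strutils.bool_from_string(None) is False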
+ def _filter_pools_based_on_placement_allocation(
+ self,
+ pools: ty.List[Pool],
+ request: 'objects.InstancePCIRequest',
+ rp_uuids: ty.List[str],
+ ) -> ty.List[Pool]:
+ if not rp_uuids:
+ # If there is no placement allocation then we don't need to filter
+ # by it. This could happen if the instance only has neutron port
+ # based InstancePCIRequests, as those currently have no placement
+ # allocation (except for QoS ports, but those are handled in a
+ # separate codepath), or if the [filter_scheduler]pci_in_placement
+ # configuration option is not enabled in the scheduler.
+ return pools
+
+ requested_dev_count_per_rp = collections.Counter(rp_uuids)
+ matching_pools = []
+ for pool in pools:
+ rp_uuid = pool.get('rp_uuid')
+ if rp_uuid is None:
+ # NOTE(gibi): As rp_uuids is not empty the scheduler allocated
+ # PCI resources on this host, so we know that
+ # [pci]report_in_placement is enabled on this host. But this
+ # pool has no RP mapping which can only happen if the pool
+ # contains PCI devices with the physical_network tag, as those
+ # devices are not yet reported in placement. But if they are not
+ # reported then we can ignore them here too.
+ continue
+
+ if (
+ # the placement allocation contains this pool
+ rp_uuid in requested_dev_count_per_rp and
+ # the amount of dev allocated in placement can be consumed
+ # from the pool
+ pool["count"] >= requested_dev_count_per_rp[rp_uuid]
+ ):
+ matching_pools.append(pool)
+
+ return matching_pools
+
def _filter_pools(
self,
pools: ty.List[Pool],
request: 'objects.InstancePCIRequest',
numa_cells: ty.Optional[ty.List['objects.InstanceNUMACell']],
+ rp_uuids: ty.List[str],
) -> ty.Optional[ty.List[Pool]]:
"""Determine if an individual PCI request can be met.
@@ -477,6 +628,9 @@ class PciDeviceStats(object):
quantity and required NUMA affinity of device(s) we want.
:param numa_cells: A list of InstanceNUMACell objects whose ``id``
corresponds to the ``id`` of host NUMACell objects.
+ :param rp_uuids: A list of RP uuids this request is fulfilled from in
+ placement, so here only the pools matching these RP uuids have to be
+ considered.
:returns: A list of pools that can be used to support the request if
this is possible, else None.
"""
@@ -547,6 +701,33 @@ class PciDeviceStats(object):
before_count - after_count
)
+ # If we're not requesting remote_managed devices then we should not
+ # use these either. Exclude them.
+ before_count = after_count
+ pools = self._filter_pools_for_unrequested_remote_managed_devices(
+ pools, request)
+ after_count = sum([pool['count'] for pool in pools])
+
+ if after_count < before_count:
+ LOG.debug(
+ 'Dropped %d device(s) as they are remote-managed devices '
+ 'which we have not requested',
+ before_count - after_count
+ )
+
+ # if there is placement allocation for the request then we have to
+ # remove the pools that are not in the placement allocation
+ before_count = after_count
+ pools = self._filter_pools_based_on_placement_allocation(
+ pools, request, rp_uuids)
+ after_count = sum([pool['count'] for pool in pools])
+ if after_count < before_count:
+ LOG.debug(
+ 'Dropped %d device(s) that are not part of the placement '
+ 'allocation',
+ before_count - after_count
+ )
+
if after_count < request.count:
LOG.debug('Not enough PCI devices left to satisfy request')
return None
@@ -556,6 +737,7 @@ class PciDeviceStats(object):
def support_requests(
self,
requests: ty.List['objects.InstancePCIRequest'],
+ provider_mapping: ty.Optional[ty.Dict[str, ty.List[str]]],
numa_cells: ty.Optional[ty.List['objects.InstanceNUMACell']] = None,
) -> bool:
"""Determine if the PCI requests can be met.
@@ -569,20 +751,38 @@ class PciDeviceStats(object):
:param requests: A list of InstancePCIRequest object describing the
types, quantities and required NUMA affinities of devices we want.
:type requests: nova.objects.InstancePCIRequests
+ :param provider_mapping: A dict keyed by RequestGroup requester_id,
+ mapping to a list of resource provider UUIDs which provide resources
+ for that RequestGroup. If it is None then it signals that each
+ InstancePCIRequest object already stores a mapping per request,
+ i.e. we are called _after_ the scheduler made allocations for this
+ request in placement.
:param numa_cells: A list of InstanceNUMACell objects whose ``id``
corresponds to the ``id`` of host NUMACells, or None.
:returns: Whether this compute node can satisfy the given request.
"""
- # NOTE(yjiang5): this function has high possibility to fail,
- # so no exception should be triggered for performance reason.
- return all(
- self._filter_pools(self.pools, r, numa_cells) for r in requests
- )
+
+ # Try to apply the requests on a copy of the stats; if they apply
+ # cleanly then we know that the requests are supported. We call apply
+ # only on a copy as we don't want to actually consume resources from
+ # the pool, since at this point this is just a test during host
+ # filtering. Later the scheduler will call apply_requests to consume on
+ # the selected host. The compute will call consume_requests during the
+ # PCI claim to consume not just from the pools but also the PciDevice
+ # objects.
+ stats = copy.deepcopy(self)
+ try:
+ stats.apply_requests(requests, provider_mapping, numa_cells)
+ except exception.PciDeviceRequestFailed:
+ return False
+
+ return True
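The deepcopy-then-apply idea in support_requests, reduced to a standalone sketch (the helper and pool shape are invented for illustration): the candidate copy is mutated while the real stats are left untouched.

    import copy

    pools = [{'rp_uuid': 'rp-a', 'count': 2}]

    def try_consume(candidate_pools, num):
        # stand-in for apply_requests: raise if the request does not fit
        if candidate_pools[0]['count'] < num:
            raise ValueError('not enough devices')
        candidate_pools[0]['count'] -= num

    candidate = copy.deepcopy(pools)
    try:
        try_consume(candidate, 2)
        supported = True
    except ValueError:
        supported = False

    assert supported and pools[0]['count'] == 2  # original pools untouched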
def _apply_request(
self,
pools: ty.List[Pool],
request: 'objects.InstancePCIRequest',
+ rp_uuids: ty.List[str],
numa_cells: ty.Optional[ty.List['objects.InstanceNUMACell']] = None,
) -> bool:
"""Apply an individual PCI request.
@@ -596,6 +796,8 @@ class PciDeviceStats(object):
:param pools: A list of PCI device pool dicts
:param request: An InstancePCIRequest object describing the type,
quantity and required NUMA affinity of device(s) we want.
+ :param rp_uuids: A list of RP uuids this request is fulfilled from in
+ placement
:param numa_cells: A list of InstanceNUMACell objects whose ``id``
corresponds to the ``id`` of host NUMACell objects.
:returns: True if the request was applied against the provided pools
@@ -605,22 +807,77 @@ class PciDeviceStats(object):
# Two concurrent requests may succeed when called support_requests
# because this method does not remove related devices from the pools
- filtered_pools = self._filter_pools(pools, request, numa_cells)
+ filtered_pools = self._filter_pools(
+ pools, request, numa_cells, rp_uuids)
if not filtered_pools:
return False
- count = request.count
- for pool in filtered_pools:
- count = self._decrease_pool_count(pools, pool, count)
- if not count:
- break
+ if not rp_uuids:
+ # If there is no placement allocation for this request then we are
+ # free to consume from the filtered pools in any order
+ count = request.count
+ for pool in filtered_pools:
+ count = self._decrease_pool_count(pools, pool, count)
+ if not count:
+ break
+ else:
+ # but if there is placement allocation then we have to follow that
+ requested_devs_per_pool_rp = collections.Counter(rp_uuids)
+ for pool in filtered_pools:
+ count = requested_devs_per_pool_rp[pool['rp_uuid']]
+ pool['count'] -= count
+ if pool['count'] == 0:
+ pools.remove(pool)
return True
+ def _get_rp_uuids_for_request(
+ self,
+ provider_mapping: ty.Optional[ty.Dict[str, ty.List[str]]],
+ request: 'objects.InstancePCIRequest'
+ ) -> ty.List[str]:
+ """Return the list of RP uuids that are fulfilling the request.
+
+ An RP appears in the list as many times as the number of devices
+ that need to be allocated from that RP.
+ """
+
+ if request.source == objects.InstancePCIRequest.NEUTRON_PORT:
+ # TODO(gibi): support neutron based requests in a later cycle
+ # an empty list will signal that any PCI pool can be used for this
+ # request
+ return []
+
+ if not provider_mapping:
+ # NOTE(gibi): AFAIK spec is always a list with a single dict
+ # but the object is hard to change retroactively
+ rp_uuids = request.spec[0].get('rp_uuids')
+ if not rp_uuids:
+ # This can happen if [filter_scheduler]pci_in_placement is not
+ # enabled yet
+ # An empty list will signal that any PCI pool can be used for
+ # this request
+ return []
+
+ # TODO(gibi): this is baaad but spec is a dict of strings so
+ # the list is serialized
+ return rp_uuids.split(',')
+
+ # NOTE(gibi): the PCI prefilter generates RequestGroup suffixes from
+ # InstancePCIRequests in the form of {request_id}-{count_index}
+ # NOTE(gibi): a suffixed request group is always fulfilled from a
+ # single RP
+ return [
+ rp_uuids[0]
+ for group_id, rp_uuids in provider_mapping.items()
+ if group_id.startswith(request.request_id)
+ ]
+
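The two data shapes handled by _get_rp_uuids_for_request, sketched with invented uuids and request ids: a comma-separated string stored in the request spec, and a provider_mapping keyed by '{request_id}-{count_index}' suffixed group names.

    spec_rp_uuids = 'rp-a,rp-a,rp-b'               # serialized form in spec[0]
    assert spec_rp_uuids.split(',') == ['rp-a', 'rp-a', 'rp-b']

    request_id = 'req-1'
    provider_mapping = {
        'req-1-0': ['rp-a'],                       # one RP per suffixed group
        'req-1-1': ['rp-b'],
        'other-0': ['rp-c'],                       # belongs to another request
    }
    rp_uuids = [
        uuids[0] for group, uuids in provider_mapping.items()
        if group.startswith(request_id)
    ]
    assert sorted(rp_uuids) == ['rp-a', 'rp-b']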
def apply_requests(
self,
requests: ty.List['objects.InstancePCIRequest'],
+ provider_mapping: ty.Optional[ty.Dict[str, ty.List[str]]],
numa_cells: ty.Optional[ty.List['objects.InstanceNUMACell']] = None,
) -> None:
"""Apply PCI requests to the PCI stats.
@@ -634,15 +891,23 @@ class PciDeviceStats(object):
:param requests: A list of InstancePCIRequest object describing the
types, quantities and required NUMA affinities of devices we want.
:type requests: nova.objects.InstancePCIRequests
+ :param provider_mapping: A dict keyed by RequestGroup requester_id,
+ mapping to a list of resource provider UUIDs which provide resources
+ for that RequestGroup. If it is None then it signals that each
+ InstancePCIRequest object already stores a mapping per request,
+ i.e. we are called _after_ the scheduler made allocations for this
+ request in placement.
:param numa_cells: A list of InstanceNUMACell objects whose ``id``
corresponds to the ``id`` of host NUMACells, or None.
:raises: exception.PciDeviceRequestFailed if this compute node cannot
satisfy the given request.
"""
- if not all(
- self._apply_request(self.pools, r, numa_cells) for r in requests
- ):
- raise exception.PciDeviceRequestFailed(requests=requests)
+
+ for r in requests:
+ rp_uuids = self._get_rp_uuids_for_request(provider_mapping, r)
+
+ if not self._apply_request(self.pools, r, rp_uuids, numa_cells):
+ raise exception.PciDeviceRequestFailed(requests=requests)
def __iter__(self) -> ty.Iterator[Pool]:
pools: ty.List[Pool] = []
@@ -667,3 +932,53 @@ class PciDeviceStats(object):
"""Return the contents of the pools as a PciDevicePoolList object."""
stats = [x for x in self]
return pci_device_pool.from_pci_stats(stats)
+
+ def has_remote_managed_device_pools(self) -> bool:
+ """Determine whether remote managed device pools are present on a host.
+
+ The check is pool-based, not free-device-based, and is NUMA cell
+ agnostic.
+ """
+ dummy_req = objects.InstancePCIRequest(
+ count=0,
+ spec=[{'remote_managed': True}]
+ )
+ pools = self._filter_pools_for_spec(self.pools, dummy_req)
+ return bool(pools)
+
+ def populate_pools_metadata_from_assigned_devices(self):
+ """Populate the rp_uuid of each pool based on the rp_uuid of the
+ devices assigned to the pool. This can only be called from the compute
+ where devices are assigned to each pool. This should not be called
+ from the scheduler as the device - pool assignment is not known there.
+ """
+ # PciDevices are tracked in placement and flavor based PCI requests
+ # are scheduled and allocated in placement. To be able to correlate
+ # what is allocated in placement and what is consumed in nova we
+ # need to map device pools to RPs. We can do that as the PciDevice
+ # contains the RP UUID that represents it in placement.
+ # NOTE(gibi): We cannot do this when the device is originally added to
+ # the pool as the device -> placement translation, which creates the
+ # RPs, runs after all the devices are created and assigned to pools.
+ for pool in self.pools:
+ pool_rps = {
+ dev.extra_info.get("rp_uuid")
+ for dev in pool["devices"]
+ if "rp_uuid" in dev.extra_info
+ }
+ if len(pool_rps) >= 2:
+ # FIXME(gibi): Do we have a 1:1 pool - RP mapping even
+ # if two PFs provide very similar VFs?
+ raise ValueError(
+ "We have a pool %s connected to more than one RPs %s in "
+ "placement via devs %s" % (pool, pool_rps, pool["devices"])
+ )
+
+ if not pool_rps:
+ # this can happen if the nova-compute is upgraded to have the
+ # PCI in placement inventory handling code but
+ # [pci]report_in_placement is not turned on yet.
+ continue
+
+ # now we know that the pool is connected to exactly one RP
+ pool['rp_uuid'] = next(iter(pool_rps))
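The derivation in populate_pools_metadata_from_assigned_devices, as a reduced standalone example with plain dicts and a made-up RP uuid: the pool inherits the single RP uuid shared by its reported devices, and devices not yet reported in placement are simply skipped.

    pool = {
        'devices': [
            {'extra_info': {'rp_uuid': 'rp-a'}},
            {'extra_info': {'rp_uuid': 'rp-a'}},
            {'extra_info': {}},                   # not yet reported in placement
        ]
    }
    pool_rps = {
        dev['extra_info']['rp_uuid']
        for dev in pool['devices']
        if 'rp_uuid' in dev['extra_info']
    }
    if len(pool_rps) == 1:
        pool['rp_uuid'] = next(iter(pool_rps))
    assert pool['rp_uuid'] == 'rp-a'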