author    Balazs Gibizer <gibi@redhat.com>    2022-09-01 16:56:20 +0200
committer Balazs Gibizer <gibi@redhat.com>    2023-01-05 17:40:02 +0100
commit    8911da6923c72290e0d556545f6c2e17aaf2e2d4 (patch)
tree      a0b9acd6ae2947e7a81f0f517841c4466a94afd7
parent    01b5d6ca42020c5f8ced76a9886ba48deff8dac2 (diff)
download  nova-8911da6923c72290e0d556545f6c2e17aaf2e2d4.tar.gz
Strictly follow placement allocation during PCI claim
The current PCI claim logic only ensures that the PCI devices are
allocated from the pools that are also used by the placement allocation.
It does not make sure that the correct number of devices is consumed
from each pool. This could lead to a situation where the placement
allocation used one dev from RP1 and two devs from RP2, while the PCI
claim did the opposite and allocated two devs from the pool mapped to
RP1 and one from the other. This patch fixes the logic in the stats
module to consider not only the pools but also the number of devices
per pool based on the placement allocation (candidate).

blueprint: pci-device-tracking-in-placement
Change-Id: Ibc8cacc85a09ffc2985c1eb637ae35014b8e595e
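To make the fix concrete, here is a minimal standalone sketch (not the
Nova code itself; the pool dicts and the RP names are illustrative) of
how counting the RP UUIDs in the allocation candidate pins down how
many devices must be consumed from each pool:

    import collections

    # Hypothetical pools keyed by the placement RP that represents
    # them; 'count' is the number of free devices in each pool.
    pools = [
        {"rp_uuid": "RP1", "count": 2},
        {"rp_uuid": "RP2", "count": 2},
    ]

    # The allocation candidate lists one RP entry per device to claim,
    # e.g. one device from RP1 and two devices from RP2.
    rp_uuids = ["RP1", "RP2", "RP2"]

    # Counting the RP entries tells us exactly how many devices to take
    # from each pool, instead of greedily draining pools in an
    # arbitrary order as the old claim logic did.
    requested_devs_per_pool_rp = collections.Counter(rp_uuids)
    for pool in pools:
        pool["count"] -= requested_devs_per_pool_rp[pool["rp_uuid"]]

    print(pools)
    # [{'rp_uuid': 'RP1', 'count': 1}, {'rp_uuid': 'RP2', 'count': 0}]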
-rw-r--r--nova/compute/utils.py2
-rw-r--r--nova/pci/stats.py111
-rw-r--r--nova/tests/functional/libvirt/test_pci_in_placement.py39
-rw-r--r--nova/tests/unit/compute/test_utils.py2
-rw-r--r--nova/tests/unit/pci/test_stats.py101
5 files changed, 192 insertions, 63 deletions
diff --git a/nova/compute/utils.py b/nova/compute/utils.py
index fa78292170..30efc24fc7 100644
--- a/nova/compute/utils.py
+++ b/nova/compute/utils.py
@@ -1536,7 +1536,7 @@ def update_pci_request_with_placement_allocations(
for spec in pci_request.spec:
# FIXME(gibi): this is baaad but spec is a dict of strings so
# we need to serialize
- spec['rp_uuids'] = ','.join(set(mapping.values()))
+ spec['rp_uuids'] = ','.join(mapping.values())
elif needs_update_due_to_qos(pci_request, provider_mapping):
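The set() removal above matters because an RP that fulfills more than
one device of the same request must appear in the serialized spec once
per device. A hedged sketch, with made-up group ids and RP uuid strings:

    # Hypothetical mapping of suffixed request groups to the RP uuid
    # fulfilling each requested device; pf2 fulfills two devices.
    mapping = {
        "req-1-0": "pf1-rp-uuid",
        "req-1-1": "pf2-rp-uuid",
        "req-1-2": "pf2-rp-uuid",
    }

    # With set() the duplicate pf2 entry collapses, losing the
    # per-device multiplicity (and any ordering):
    ",".join(set(mapping.values()))  # e.g. 'pf1-rp-uuid,pf2-rp-uuid'

    # Without set() every device keeps its own RP entry:
    ",".join(mapping.values())
    # 'pf1-rp-uuid,pf2-rp-uuid,pf2-rp-uuid'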
diff --git a/nova/pci/stats.py b/nova/pci/stats.py
index dfc37e22f5..a64baced00 100644
--- a/nova/pci/stats.py
+++ b/nova/pci/stats.py
@@ -13,7 +13,7 @@
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
-
+import collections
import copy
import typing as ty
@@ -244,6 +244,17 @@ class PciDeviceStats(object):
free_devs.extend(pool['devices'])
return free_devs
+ def _allocate_devs(
+ self, pool: Pool, num: int, request_id: str
+ ) -> ty.List["objects.PciDevice"]:
+ alloc_devices = []
+ for _ in range(num):
+ pci_dev = pool['devices'].pop()
+ self._handle_device_dependents(pci_dev)
+ pci_dev.request_id = request_id
+ alloc_devices.append(pci_dev)
+ return alloc_devices
+
def consume_requests(
self,
pci_requests: 'objects.InstancePCIRequests',
@@ -274,20 +285,29 @@ class PciDeviceStats(object):
self.add_device(alloc_devices.pop())
raise exception.PciDeviceRequestFailed(requests=pci_requests)
- for pool in pools:
- if pool['count'] >= count:
- num_alloc = count
- else:
- num_alloc = pool['count']
- count -= num_alloc
- pool['count'] -= num_alloc
- for d in range(num_alloc):
- pci_dev = pool['devices'].pop()
- self._handle_device_dependents(pci_dev)
- pci_dev.request_id = request.request_id
- alloc_devices.append(pci_dev)
- if count == 0:
- break
+ if not rp_uuids:
+ # if there is no placement allocation then we are free to
+ # consume from the pools in any order:
+ for pool in pools:
+ if pool['count'] >= count:
+ num_alloc = count
+ else:
+ num_alloc = pool['count']
+ count -= num_alloc
+ pool['count'] -= num_alloc
+ alloc_devices += self._allocate_devs(
+ pool, num_alloc, request.request_id)
+ if count == 0:
+ break
+ else:
+ # but if there is a placement allocation then we have to
+ # follow it
+ requested_devs_per_pool_rp = collections.Counter(rp_uuids)
+ for pool in pools:
+ count = requested_devs_per_pool_rp[pool['rp_uuid']]
+ pool['count'] -= count
+ alloc_devices += self._allocate_devs(
+ pool, count, request.request_id)
return alloc_devices
@@ -543,7 +563,7 @@ class PciDeviceStats(object):
self,
pools: ty.List[Pool],
request: 'objects.InstancePCIRequest',
- rp_uuids: ty.Set[str],
+ rp_uuids: ty.List[str],
) -> ty.List[Pool]:
if not rp_uuids:
# If there is no placement allocation then we don't need to filter
@@ -554,6 +574,7 @@ class PciDeviceStats(object):
# configuration option is not enabled in the scheduler.
return pools
+ requested_dev_count_per_rp = collections.Counter(rp_uuids)
matching_pools = []
for pool in pools:
rp_uuid = pool.get('rp_uuid')
@@ -572,7 +593,13 @@ class PciDeviceStats(object):
"scheduler. This pool is ignored now.", pool)
continue
- if rp_uuid in rp_uuids:
+ if (
+ # the placement allocation contains this pool
+ rp_uuid in requested_dev_count_per_rp and
+ # the number of devices allocated in placement can be
+ # consumed from the pool
+ pool["count"] >= requested_dev_count_per_rp[rp_uuid]
+ ):
matching_pools.append(pool)
return matching_pools
@@ -582,7 +609,7 @@ class PciDeviceStats(object):
pools: ty.List[Pool],
request: 'objects.InstancePCIRequest',
numa_cells: ty.Optional[ty.List['objects.InstanceNUMACell']],
- rp_uuids: ty.Set[str],
+ rp_uuids: ty.List[str],
) -> ty.Optional[ty.List[Pool]]:
"""Determine if an individual PCI request can be met.
@@ -751,7 +778,7 @@ class PciDeviceStats(object):
self,
pools: ty.List[Pool],
request: 'objects.InstancePCIRequest',
- rp_uuids: ty.Set[str],
+ rp_uuids: ty.List[str],
numa_cells: ty.Optional[ty.List['objects.InstanceNUMACell']] = None,
) -> bool:
"""Apply an individual PCI request.
@@ -782,11 +809,22 @@ class PciDeviceStats(object):
if not filtered_pools:
return False
- count = request.count
- for pool in filtered_pools:
- count = self._decrease_pool_count(pools, pool, count)
- if not count:
- break
+ if not rp_uuids:
+ # If there is no placement allocation for this request then we are
+ # free to consume from the filtered pools in any order
+ count = request.count
+ for pool in filtered_pools:
+ count = self._decrease_pool_count(pools, pool, count)
+ if not count:
+ break
+ else:
+ # but if there is a placement allocation then we have to follow it
+ requested_devs_per_pool_rp = collections.Counter(rp_uuids)
+ for pool in filtered_pools:
+ count = requested_devs_per_pool_rp[pool['rp_uuid']]
+ pool['count'] -= count
+ if pool['count'] == 0:
+ pools.remove(pool)
return True
@@ -794,13 +832,18 @@ class PciDeviceStats(object):
self,
provider_mapping: ty.Optional[ty.Dict[str, ty.List[str]]],
request: 'objects.InstancePCIRequest'
- ) -> ty.Set[str]:
- """Return the list of RP uuids that are fulfilling the request"""
+ ) -> ty.List[str]:
+ """Return the list of RP uuids that are fulfilling the request.
+
+ An RP will appear in the list as many times as the number of
+ devices that need to be allocated from that RP.
+ """
if request.source == objects.InstancePCIRequest.NEUTRON_PORT:
# TODO(gibi): support neutron based requests in a later cycle
- # set() will signal that any PCI pool can be used for this request
- return set()
+ # an empty list will signal that any PCI pool can be used for this
+ # request
+ return []
if not provider_mapping:
# NOTE(gibi): AFAIK specs is always a list of a single dict
@@ -809,23 +852,23 @@ class PciDeviceStats(object):
if not rp_uuids:
# This can happen if [filter_scheduler]pci_in_placement is not
# enabled yet
- # set() will signal that any PCI pool can be used for this
- # request
- return set()
+ # An empty list will signal that any PCI pool can be used for
+ # this request
+ return []
# TODO(gibi): this is baaad but spec is a dict of string so
# the list is serialized
- return set(rp_uuids.split(','))
+ return rp_uuids.split(',')
# NOTE(gibi): the PCI prefilter generates RequestGroup suffixes from
# InstancePCIRequests in the form of {request_id}-{count_index}
# NOTE(gibi): a suffixed request group always fulfilled from a single
# RP
- return {
+ return [
rp_uuids[0]
for group_id, rp_uuids in provider_mapping.items()
if group_id.startswith(request.request_id)
- }
+ ]
def apply_requests(
self,
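For context on the hunk above, a hedged sketch (group ids and RP uuids
are made up) of how the suffixed request groups in provider_mapping
expand into the per-device RP list that _get_rp_uuids_for_request now
returns:

    # The PCI prefilter names request groups {request_id}-{count_index}
    # and each suffixed group is always fulfilled from a single RP.
    request_id = "req-1"
    provider_mapping = {
        "req-1-0": ["pf1-rp-uuid"],
        "req-1-1": ["pf2-rp-uuid"],
        "req-1-2": ["pf2-rp-uuid"],
        "other-req-0": ["unrelated-rp-uuid"],
    }

    # A list (not a set) keeps one entry per device: pf2 appears twice
    # because two devices must be allocated from that RP.
    rp_uuids = [
        rps[0]
        for group_id, rps in provider_mapping.items()
        if group_id.startswith(request_id)
    ]
    print(rp_uuids)  # ['pf1-rp-uuid', 'pf2-rp-uuid', 'pf2-rp-uuid']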
diff --git a/nova/tests/functional/libvirt/test_pci_in_placement.py b/nova/tests/functional/libvirt/test_pci_in_placement.py
index ebeaeb0bd3..5cf554c3fe 100644
--- a/nova/tests/functional/libvirt/test_pci_in_placement.py
+++ b/nova/tests/functional/libvirt/test_pci_in_placement.py
@@ -1978,27 +1978,18 @@ class RCAndTraitBasedPCIAliasTests(PlacementPCIReportingTests):
"compute1_0000:81:00.0", "CUSTOM_MY_VF", 1)
extra_spec = {"pci_passthrough:alias": "a-vf:3"}
flavor_id = self._create_flavor(extra_spec=extra_spec)
- # We expect this to fit, but it does not. The pool filtering logic
- # only considers which pools can be used based on the allocation
- # candidate, but does not consider how much device needs to be used
- # from which pool. So the PCI claim logic sees both PF pools as usable
- # but allocates 2 dev from 81:00 in nova. Then the PCI allocation
- # healing logic sees the difference between the placement allocation
- # and the nova allocation and fails when trys to correct it.
- self._create_server(
- flavor_id=flavor_id, networks=[], expected_state="ERROR"
- )
- # server_3vf = self._create_server(flavor_id=flavor_id, networks=[])
- #
- # self.assertPCIDeviceCounts('compute1', total=4, free=1)
- # compute1_expected_placement_view["usages"] = {
- # "0000:81:00.0": {"CUSTOM_MY_VF": 1},
- # "0000:81:01.0": {"CUSTOM_MY_VF": 2},
- # }
- # compute1_expected_placement_view["allocations"][server_3vf["id"]] = {
- # "0000:81:00.0": {"CUSTOM_MY_VF": 1},
- # "0000:81:01.0": {"CUSTOM_MY_VF": 2},
- # }
- # self.assert_placement_pci_view(
- # "compute1", **compute1_expected_placement_view)
- # self.assert_no_pci_healing("compute1")
+ # We expect this to fit.
+ server_3vf = self._create_server(flavor_id=flavor_id, networks=[])
+
+ self.assertPCIDeviceCounts('compute1', total=4, free=1)
+ compute1_expected_placement_view["usages"] = {
+ "0000:81:00.0": {"CUSTOM_MY_VF": 1},
+ "0000:81:01.0": {"CUSTOM_MY_VF": 2},
+ }
+ compute1_expected_placement_view["allocations"][server_3vf["id"]] = {
+ "0000:81:00.0": {"CUSTOM_MY_VF": 1},
+ "0000:81:01.0": {"CUSTOM_MY_VF": 2},
+ }
+ self.assert_placement_pci_view(
+ "compute1", **compute1_expected_placement_view)
+ self.assert_no_pci_healing("compute1")
diff --git a/nova/tests/unit/compute/test_utils.py b/nova/tests/unit/compute/test_utils.py
index c77e39ceac..dd10ecd7df 100644
--- a/nova/tests/unit/compute/test_utils.py
+++ b/nova/tests/unit/compute/test_utils.py
@@ -1597,7 +1597,7 @@ class PciRequestUpdateTestCase(test.NoDBTestCase):
provider_mapping)
self.assertEqual(
- ",".join({uuids.rp1, uuids.rp2}), req.spec[0]["rp_uuids"]
+ {uuids.rp1, uuids.rp2}, set(req.spec[0]["rp_uuids"].split(','))
)
def test_pci_request_has_no_mapping(self):
diff --git a/nova/tests/unit/pci/test_stats.py b/nova/tests/unit/pci/test_stats.py
index 2d48513e0e..7eb43a05f4 100644
--- a/nova/tests/unit/pci/test_stats.py
+++ b/nova/tests/unit/pci/test_stats.py
@@ -12,7 +12,7 @@
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
-
+import collections
from unittest import mock
from oslo_config import cfg
@@ -1039,9 +1039,9 @@ class PciDeviceStatsProviderMappingTestCase(test.NoDBTestCase):
),
]
self.flags(device_spec=device_spec, group="pci")
- dev_filter = whitelist.Whitelist(device_spec)
+ self.dev_filter = whitelist.Whitelist(device_spec)
self.pci_stats = stats.PciDeviceStats(
- objects.NUMATopology(), dev_filter=dev_filter
+ objects.NUMATopology(), dev_filter=self.dev_filter
)
# add devices represented by different RPs in placement
# two VFs on the same PF
@@ -1384,6 +1384,101 @@ class PciDeviceStatsProviderMappingTestCase(test.NoDBTestCase):
{pool['rp_uuid'] for pool in self.pci_stats.pools},
)
+ def _create_two_pools_with_two_vfs(self):
+ # create two pools (PFs) with two VFs each
+ self.pci_stats = stats.PciDeviceStats(
+ objects.NUMATopology(), dev_filter=self.dev_filter
+ )
+ for pf_index in [1, 2]:
+ for vf_index in [1, 2]:
+ dev = objects.PciDevice(
+ compute_node_id=1,
+ vendor_id="dead",
+ product_id="beef",
+ address=f"0000:81:0{pf_index}.{vf_index}",
+ parent_addr=f"0000:81:0{pf_index}.0",
+ numa_node=0,
+ dev_type="type-VF",
+ )
+ self.pci_stats.add_device(dev)
+ dev.extra_info = {'rp_uuid': getattr(uuids, f"pf{pf_index}")}
+
+ # populate the RP -> pool mapping from the devices to its pools
+ self.pci_stats.populate_pools_metadata_from_assigned_devices()
+
+ # we have 2 pools and 4 devs in total
+ self.num_pools = 2
+ self.assertEqual(self.num_pools, len(self.pci_stats.pools))
+ self.num_devs = 4
+ self.assertEqual(
+ self.num_devs, sum(pool["count"] for pool in self.pci_stats.pools)
+ )
+
+ def test_apply_asymmetric_allocation(self):
+ self._create_two_pools_with_two_vfs()
+ # ask for 3 VFs
+ vf_req = objects.InstancePCIRequest(
+ count=3,
+ alias_name='a-vf',
+ request_id=uuids.vf_req,
+ spec=[
+ {
+ "vendor_id": "dead",
+ "product_id": "beef",
+ "dev_type": "type-VF",
+ }
+ ],
+ )
+
+ # Simulate that placement returned an allocation candidate where 1 VF
+ # is consumed from PF1 and two from PF2
+ mapping = {
+ # the VF is represented by the parent PF RP
+ f"{uuids.vf_req}-0": [uuids.pf1],
+ f"{uuids.vf_req}-1": [uuids.pf2],
+ f"{uuids.vf_req}-2": [uuids.pf2],
+ }
+ # This should fit
+ self.assertTrue(
+ self.pci_stats.support_requests([vf_req], mapping)
+ )
+ # and when consumed, the consumption from the pools should be in sync
+ # with the placement allocation. So the PF2 pool is expected to
+ # disappear as it is fully consumed and the PF1 pool should have
+ # one free device.
+ self.pci_stats.apply_requests([vf_req], mapping)
+ self.assertEqual(1, len(self.pci_stats.pools))
+ self.assertEqual(uuids.pf1, self.pci_stats.pools[0]['rp_uuid'])
+ self.assertEqual(1, self.pci_stats.pools[0]['count'])
+
+ def test_consume_asymmetric_allocation(self):
+ self._create_two_pools_with_two_vfs()
+ # ask for 3 VFs
+ vf_req = objects.InstancePCIRequest(
+ count=3,
+ alias_name='a-vf',
+ request_id=uuids.vf_req,
+ spec=[
+ {
+ "vendor_id": "dead",
+ "product_id": "beef",
+ "dev_type": "type-VF",
+ # Simulate that the scheduler already allocated a candidate
+ # and the mapping is stored in the request.
+ # In placement 1 VF is allocated from PF1 and two from PF2
+ "rp_uuids": ",".join([uuids.pf1, uuids.pf2, uuids.pf2])
+ }
+ ],
+ )
+
+ # So when the PCI claim consumes devices based on this request we
+ # expect that nova follows what is allocated in placement.
+ devs = self.pci_stats.consume_requests([vf_req])
+ self.assertEqual(
+ {"0000:81:01.0": 1, "0000:81:02.0": 2},
+ collections.Counter(dev.parent_addr for dev in devs),
+ )
+
def test_consume_restricted_by_allocation(self):
pf_req = objects.InstancePCIRequest(
count=1,