#
#    Licensed under the Apache License, Version 2.0 (the "License"); you may
#    not use this file except in compliance with the License. You may obtain
#    a copy of the License at
#
#         http://www.apache.org/licenses/LICENSE-2.0
#
#    Unless required by applicable law or agreed to in writing, software
#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
#    License for the specific language governing permissions and limitations
#    under the License.

import io
from unittest import mock

from oslo_config import cfg
from oslo_log import log as logging

from nova import context
from nova import objects
from nova.tests.fixtures import libvirt as fakelibvirt
from nova.tests.functional.libvirt import base
from nova.virt.libvirt import utils


CONF = cfg.CONF
LOG = logging.getLogger(__name__)


class VGPUReshapeTests(base.ServersTestBase):

    def test_create_servers_with_vgpu(self):
        """Verify that vgpu reshape works with libvirt driver

        1) create two servers with an old tree where the VGPU resource is on
           the compute provider
        2) trigger a reshape
        3) check that the allocations of the servers are still valid
        4) create another server now against the new tree
        """
        self.mock_file_open.side_effect = [
            io.BytesIO(b''), io.BytesIO(b''), io.BytesIO(b'')]
        # NOTE(gibi): We cannot simply ask the virt driver to create an old
        # RP tree with VGPU on the root RP as that code path does not exist
        # any more, so we have to hack a bit. We create a compute service
        # without vGPU support to have the compute RP ready, then we manually
        # add the VGPU resources to that RP in placement. We also make sure
        # that during the instance claim the virt driver does not detect the
        # old tree, as that would be a bad time for a reshape. Later, when
        # the compute service is restarted, the driver will do the reshape.

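        # Three fake mediated devices of the same NVIDIA-11 vGPU type, each
        # backed by a different pGPU (parent PCI address), so the reshape
        # later in the test ends up with one child RP per pGPU.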
        mdevs = {
            'mdev_4b20d080_1b54_4048_85b3_a6a62d165c01':
                fakelibvirt.FakeMdevDevice(
                    dev_name='mdev_4b20d080_1b54_4048_85b3_a6a62d165c01',
                    type_id=fakelibvirt.NVIDIA_11_VGPU_TYPE,
                    parent=fakelibvirt.MDEVCAP_DEV1_PCI_ADDR),
            'mdev_4b20d080_1b54_4048_85b3_a6a62d165c02':
                fakelibvirt.FakeMdevDevice(
                    dev_name='mdev_4b20d080_1b54_4048_85b3_a6a62d165c02',
                    type_id=fakelibvirt.NVIDIA_11_VGPU_TYPE,
                    parent=fakelibvirt.MDEVCAP_DEV2_PCI_ADDR),
            'mdev_4b20d080_1b54_4048_85b3_a6a62d165c03':
                fakelibvirt.FakeMdevDevice(
                    dev_name='mdev_4b20d080_1b54_4048_85b3_a6a62d165c03',
                    type_id=fakelibvirt.NVIDIA_11_VGPU_TYPE,
                    parent=fakelibvirt.MDEVCAP_DEV3_PCI_ADDR),
        }

        # start a compute service with vGPU support disabled so the driver
        # ignores the content of the above HostMdevDevicesInfo
        self.flags(enabled_mdev_types='', group='devices')

        self.hostname = self.start_compute(
            hostname='compute1',
            mdev_info=fakelibvirt.HostMdevDevicesInfo(devices=mdevs),
        )
        self.compute = self.computes[self.hostname]

        # manually create the VGPU inventory on the compute RP in placement
        compute_rp_uuid = self.placement.get(
            '/resource_providers?name=compute1').body[
            'resource_providers'][0]['uuid']
        inventories = self.placement.get(
            '/resource_providers/%s/inventories' % compute_rp_uuid).body
        inventories['inventories']['VGPU'] = {
            'allocation_ratio': 1.0,
            'max_unit': 3,
            'min_unit': 1,
            'reserved': 0,
            'step_size': 1,
            'total': 3}
        self.placement.put(
            '/resource_providers/%s/inventories' % compute_rp_uuid,
            inventories)

        # enable vGPU support
        self.flags(
            enabled_mdev_types=fakelibvirt.NVIDIA_11_VGPU_TYPE,
            group='devices')
        # We don't want to restart the compute service as that would trigger
        # a reshape, but we still want to accept some vGPU types, so we call
        # the needed method directly.
        self.compute.driver.supported_vgpu_types = (
            self.compute.driver._get_supported_vgpu_types())

        # now we boot two servers with vgpu
        extra_spec = {"resources:VGPU": 1}
        flavor_id = self._create_flavor(extra_spec=extra_spec)
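        # the flavor created here is expected to resolve to 2 VCPUs, 2048 MB
        # of RAM and 10 GB root + 10 GB ephemeral disk (see the allocation
        # assertions below), plus the 1 VGPU requested via the extra spec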

        server_req = self._build_server(flavor_id=flavor_id)

        # NOTE(gibi): during instance_claim() there is a
        # driver.update_provider_tree() call that would detect the old tree
        # and fail, as this is not a good time to reshape. To avoid that, we
        # temporarily mock out update_provider_tree here.
        with mock.patch('nova.virt.libvirt.driver.LibvirtDriver.'
                        'update_provider_tree'):
            created_server1 = self.api.post_server({'server': server_req})
            server1 = self._wait_for_state_change(created_server1, 'ACTIVE')
            created_server2 = self.api.post_server({'server': server_req})
            server2 = self._wait_for_state_change(created_server2, 'ACTIVE')

        # Determine which device is associated with which instance
        # { inst.uuid: pgpu_name }
        inst_to_pgpu = {}
        ctx = context.get_admin_context()
        for server in (server1, server2):
            inst = objects.Instance.get_by_uuid(ctx, server['id'])
            assigned_mdevs = list(
                self.compute.driver._get_all_assigned_mediated_devices(inst))
            self.assertEqual(1, len(assigned_mdevs))
            mdev_uuid = assigned_mdevs[0]
            mdev_info = self.compute.driver._get_mediated_device_information(
                utils.mdev_uuid2name(mdev_uuid))
            inst_to_pgpu[inst.uuid] = mdev_info['parent']
        # The VGPUs should have come from different pGPUs
        self.assertNotEqual(*list(inst_to_pgpu.values()))

        # verify that the inventory, usages and allocation are correct before
        # the reshape
        compute_inventory = self.placement.get(
            '/resource_providers/%s/inventories' % compute_rp_uuid).body[
            'inventories']
        self.assertEqual(3, compute_inventory['VGPU']['total'])
        compute_usages = self.placement.get(
            '/resource_providers/%s/usages' % compute_rp_uuid).body[
            'usages']
        self.assertEqual(2, compute_usages['VGPU'])

        for server in (server1, server2):
            allocations = self.placement.get(
                '/allocations/%s' % server['id']).body['allocations']
            # the flavor has disk=10 and ephemeral=10
            self.assertEqual(
                {'DISK_GB': 20, 'MEMORY_MB': 2048, 'VCPU': 2, 'VGPU': 1},
                allocations[compute_rp_uuid]['resources'])

        # restart compute which will trigger a reshape
        self.compute = self.restart_compute_service(self.hostname)

        # verify that the inventory, usages and allocation are correct after
        # the reshape
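        # Expected provider tree after the reshape (as asserted below):
        #   compute1 (root RP, no more VGPU inventory)
        #     +- compute1_<MDEVCAP_DEV1_PCI_ADDR> with VGPU: total=1
        #     +- compute1_<MDEVCAP_DEV2_PCI_ADDR> with VGPU: total=1
        #     +- compute1_<MDEVCAP_DEV3_PCI_ADDR> with VGPU: total=1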
        compute_inventory = self.placement.get(
            '/resource_providers/%s/inventories' % compute_rp_uuid).body[
            'inventories']
        self.assertNotIn('VGPU', compute_inventory)

        # NOTE(sbauza): The two instances will use two different pGPUs.
        # Accordingly, we need to check all the pGPU inventories to know
        # which ones are used.
        usages = {}
        pgpu_uuid_to_name = {}
        for pci_device in [fakelibvirt.MDEVCAP_DEV1_PCI_ADDR,
                           fakelibvirt.MDEVCAP_DEV2_PCI_ADDR,
                           fakelibvirt.MDEVCAP_DEV3_PCI_ADDR]:
            gpu_rp_uuid = self.placement.get(
                '/resource_providers?name=compute1_%s' % pci_device).body[
                'resource_providers'][0]['uuid']
            pgpu_uuid_to_name[gpu_rp_uuid] = pci_device
            gpu_inventory = self.placement.get(
                '/resource_providers/%s/inventories' % gpu_rp_uuid).body[
                'inventories']
            self.assertEqual(1, gpu_inventory['VGPU']['total'])

            gpu_usages = self.placement.get(
                '/resource_providers/%s/usages' % gpu_rp_uuid).body[
                'usages']
            usages[pci_device] = gpu_usages['VGPU']
        # Make sure that both instances are using different pGPUs
        used_devices = [dev for dev, usage in usages.items() if usage == 1]
        avail_devices = list(set(usages.keys()) - set(used_devices))
        self.assertEqual(2, len(used_devices))
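        # ... which leaves exactly one of the three pGPUs still available
        # for the third server created later in the test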
        # Make sure that both instances are using the correct pGPUs
        for server in [server1, server2]:
            allocations = self.placement.get(
                '/allocations/%s' % server['id']).body[
                'allocations']
            self.assertEqual(
                {'DISK_GB': 20, 'MEMORY_MB': 2048, 'VCPU': 2},
                allocations[compute_rp_uuid]['resources'])
            rp_uuids = list(allocations.keys())
            # We only have two RPs, the compute RP (the root) and the child
            # pGPU RP
            gpu_rp_uuid = (rp_uuids[1] if rp_uuids[0] == compute_rp_uuid
                           else rp_uuids[0])
            self.assertEqual(
                {'VGPU': 1},
                allocations[gpu_rp_uuid]['resources'])
            # The pGPU's RP name contains the pGPU name
            self.assertIn(inst_to_pgpu[server['id']],
                          pgpu_uuid_to_name[gpu_rp_uuid])

        # now create one more instance with vgpu against the reshaped tree
        created_server = self.api.post_server({'server': server_req})
        server3 = self._wait_for_state_change(created_server, 'ACTIVE')

        # find the pGPU that wasn't used before we created the third instance:
        # the new server should have taken that previously available pGPU
        device = avail_devices[0]
        gpu_rp_uuid = self.placement.get(
            '/resource_providers?name=compute1_%s' % device).body[
            'resource_providers'][0]['uuid']
        gpu_usages = self.placement.get(
            '/resource_providers/%s/usages' % gpu_rp_uuid).body[
            'usages']
        self.assertEqual(1, gpu_usages['VGPU'])

        allocations = self.placement.get(
            '/allocations/%s' % server3['id']).body[
            'allocations']
        self.assertEqual(
            {'DISK_GB': 20, 'MEMORY_MB': 2048, 'VCPU': 2},
            allocations[compute_rp_uuid]['resources'])
        self.assertEqual(
            {'VGPU': 1},
            allocations[gpu_rp_uuid]['resources'])