author     zhangbailin <zhangbailin@inspur.com>  2020-05-20 18:34:59 +0800
committer  zhangbailin <zhangbailin@inspur.com>  2021-01-15 03:21:17 +0000
commit     7fbd787b1b5106f3c4143b9de51980ede0167d3e (patch)
tree       944797571ca958e6d20b3337e702aabcd9d25b75
parent     261416aeb0187cc7d420bb74d8b330aa66cc37b6 (diff)
download   nova-7fbd787b1b5106f3c4143b9de51980ede0167d3e.tar.gz
Cyborg shelve/unshelve support
This change extends the conductor manager to append the cyborg resource request to the request spec when performing an unshelve. On shelve offload, the instance's ARQ binding info is deleted so that the bound ARQs are freed in the Cyborg service. During unshelve, the ARQs are passed to spawn.

This change also extends the ``shelve_instance``, ``shelve_offload_instance`` and ``unshelve_instance`` rpcapi functions to carry the arq_uuids.

Co-Authored-By: Wenping Song <songwenping@inspur.com>

Implements: blueprint cyborg-shelve-and-unshelve
Change-Id: I258df4d77f6d86df1d867a8fe27360731c21d237
-rw-r--r--  api-guide/source/accelerator-support.rst | 6
-rw-r--r--  nova/api/openstack/compute/shelve.py | 4
-rw-r--r--  nova/compute/api.py | 25
-rw-r--r--  nova/compute/manager.py | 56
-rw-r--r--  nova/compute/rpcapi.py | 56
-rw-r--r--  nova/conductor/manager.py | 74
-rw-r--r--  nova/objects/service.py | 6
-rw-r--r--  nova/tests/fixtures.py | 7
-rw-r--r--  nova/tests/functional/integrated_helpers.py | 10
-rw-r--r--  nova/tests/functional/test_servers.py | 99
-rw-r--r--  nova/tests/unit/compute/test_api.py | 121
-rw-r--r--  nova/tests/unit/compute/test_rpcapi.py | 61
-rw-r--r--  nova/tests/unit/compute/test_shelve.py | 91
-rw-r--r--  nova/tests/unit/conductor/test_conductor.py | 288
-rw-r--r--  releasenotes/notes/bp-cyborg-rebuild-and-evacuate-97bba59988b8b072.yaml | 12
15 files changed, 827 insertions, 89 deletions
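
For orientation before the per-file hunks, here is a condensed, stand-alone Python sketch of the ARQ lifecycle this patch wires through shelve, shelve offload and unshelve. It is illustrative only: FakeCyborgClient and the three helper functions are simplified stand-ins for nova.accelerator.cyborg's client and for the compute API/manager and conductor paths changed below, not the real Nova code.

# Illustrative simplification of the ARQ handling added by this patch.
class FakeCyborgClient(object):
    def __init__(self):
        self.bound = {}  # instance uuid -> list of bound ARQ uuids

    def get_arq_uuids_for_instance(self, instance):
        return list(self.bound.get(instance['uuid'], []))

    def delete_arqs_for_instance(self, instance_uuid):
        # Shelve offload path: free the accelerators held in Cyborg.
        self.bound.pop(instance_uuid, None)

    def create_and_bind_arqs(self, instance_uuid, device_profile):
        # Unshelve path (via the conductor): re-create and bind ARQs
        # against the newly selected host.
        arqs = ['%s-arq-0' % device_profile]
        self.bound[instance_uuid] = arqs
        return arqs


def shelve(cyclient, instance):
    # API layer: collect the bound ARQ uuids only when the flavor requests
    # a device profile, then carry them on the new 5.13 RPC calls.
    accel_uuids = []
    if instance['extra_specs'].get('accel:device_profile'):
        accel_uuids = cyclient.get_arq_uuids_for_instance(instance)
    return accel_uuids


def shelve_offload(cyclient, instance, accel_uuids):
    # Compute manager: on offload the ARQ bindings are deleted so that the
    # accelerator resources can be reused by other instances.
    if accel_uuids:
        cyclient.delete_arqs_for_instance(instance['uuid'])


def unshelve(cyclient, instance):
    # Conductor: ARQs are re-created and bound, and their uuids are handed
    # to the compute manager, which passes the resolved accel_info to spawn.
    dp = instance['extra_specs'].get('accel:device_profile')
    return cyclient.create_and_bind_arqs(instance['uuid'], dp) if dp else []
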
diff --git a/api-guide/source/accelerator-support.rst b/api-guide/source/accelerator-support.rst
index f7c82944ed..8ad60a0f56 100644
--- a/api-guide/source/accelerator-support.rst
+++ b/api-guide/source/accelerator-support.rst
@@ -43,12 +43,12 @@ The lists of supported and unsupported operations are as below:
* Rescue and unrescue.
* Rebuild.
* Evacuate.
+ * Shelve and unshelve.
* Unsupported operations
* Resize.
* Suspend and resume.
- * Shelve and unshelve.
* Cold migration.
* Live migration.
@@ -56,6 +56,10 @@ The lists of supported and unsupported operations are as below:
Added support for rebuild and evacuate operations.
+.. versionchanged:: 23.0.0 (Wallaby)
+
+ Added support for shelve and unshelve operations.
+
Some operations, such as lock and unlock, work as they are effectively
no-ops for accelerators.
diff --git a/nova/api/openstack/compute/shelve.py b/nova/api/openstack/compute/shelve.py
index bb4a7b92bc..acd5b41413 100644
--- a/nova/api/openstack/compute/shelve.py
+++ b/nova/api/openstack/compute/shelve.py
@@ -37,7 +37,7 @@ class ShelveController(wsgi.Controller):
self.network_api = neutron.API()
@wsgi.response(202)
- @wsgi.expected_errors((404, 409))
+ @wsgi.expected_errors((404, 403, 409))
@wsgi.action('shelve')
def _shelve(self, req, id, body):
"""Move an instance into shelved mode."""
@@ -55,6 +55,8 @@ class ShelveController(wsgi.Controller):
exception.UnexpectedTaskStateError,
) as e:
raise exc.HTTPConflict(explanation=e.format_message())
+ except exception.ForbiddenWithAccelerators as e:
+ raise exc.HTTPForbidden(explanation=e.format_message())
except exception.InstanceInvalidState as state_error:
common.raise_http_conflict_for_instance_invalid_state(state_error,
'shelve', id)
diff --git a/nova/compute/api.py b/nova/compute/api.py
index 0b2e8248c8..a03f5f6827 100644
--- a/nova/compute/api.py
+++ b/nova/compute/api.py
@@ -4158,7 +4158,7 @@ class API(base.Base):
return allow_same_host
@reject_vtpm_instances(instance_actions.SHELVE)
- @block_accelerators()
+ @block_accelerators(until_service=54)
@check_instance_lock
@check_instance_state(vm_state=[vm_states.ACTIVE, vm_states.STOPPED,
vm_states.PAUSED, vm_states.SUSPENDED])
@@ -4181,16 +4181,23 @@ class API(base.Base):
self._record_action_start(context, instance, instance_actions.SHELVE)
+ accel_uuids = []
+ if instance.flavor.extra_specs.get('accel:device_profile'):
+ cyclient = cyborg.get_client(context)
+ accel_uuids = cyclient.get_arq_uuids_for_instance(instance)
+
if not compute_utils.is_volume_backed_instance(context, instance):
name = '%s-shelved' % instance.display_name
image_meta = compute_utils.create_image(
context, instance, name, 'snapshot', self.image_api)
image_id = image_meta['id']
self.compute_rpcapi.shelve_instance(context, instance=instance,
- image_id=image_id, clean_shutdown=clean_shutdown)
+ image_id=image_id, clean_shutdown=clean_shutdown,
+ accel_uuids=accel_uuids)
else:
- self.compute_rpcapi.shelve_offload_instance(context,
- instance=instance, clean_shutdown=clean_shutdown)
+ self.compute_rpcapi.shelve_offload_instance(
+ context, instance=instance, clean_shutdown=clean_shutdown,
+ accel_uuids=accel_uuids)
@check_instance_lock
@check_instance_state(vm_state=[vm_states.SHELVED])
@@ -4202,8 +4209,14 @@ class API(base.Base):
self._record_action_start(context, instance,
instance_actions.SHELVE_OFFLOAD)
- self.compute_rpcapi.shelve_offload_instance(context, instance=instance,
- clean_shutdown=clean_shutdown)
+ accel_uuids = []
+ if instance.flavor.extra_specs.get('accel:device_profile'):
+ cyclient = cyborg.get_client(context)
+ accel_uuids = cyclient.get_arq_uuids_for_instance(instance)
+
+ self.compute_rpcapi.shelve_offload_instance(
+ context, instance=instance,
+ clean_shutdown=clean_shutdown, accel_uuids=accel_uuids)
def _validate_unshelve_az(self, context, instance, availability_zone):
"""Verify the specified availability_zone during unshelve.
diff --git a/nova/compute/manager.py b/nova/compute/manager.py
index 7ae5d3531a..6a296e15e0 100644
--- a/nova/compute/manager.py
+++ b/nova/compute/manager.py
@@ -528,7 +528,7 @@ class ComputeVirtAPI(virtapi.VirtAPI):
class ComputeManager(manager.Manager):
"""Manages the running instances from creation to destruction."""
- target = messaging.Target(version='5.12')
+ target = messaging.Target(version='5.13')
def __init__(self, compute_driver=None, *args, **kwargs):
"""Load configuration options and connect to the hypervisor."""
@@ -6326,7 +6326,7 @@ class ComputeManager(manager.Manager):
@wrap_instance_event(prefix='compute')
@wrap_instance_fault
def shelve_instance(self, context, instance, image_id,
- clean_shutdown):
+ clean_shutdown, accel_uuids=None):
"""Shelve an instance.
This should be used when you want to take a snapshot of the instance.
@@ -6337,15 +6337,17 @@ class ComputeManager(manager.Manager):
:param instance: an Instance object
:param image_id: an image id to snapshot to.
:param clean_shutdown: give the GuestOS a chance to stop
+ :param accel_uuids: the accelerator UUIDs for the instance
"""
@utils.synchronized(instance.uuid)
def do_shelve_instance():
- self._shelve_instance(context, instance, image_id, clean_shutdown)
+ self._shelve_instance(context, instance, image_id, clean_shutdown,
+ accel_uuids)
do_shelve_instance()
def _shelve_instance(self, context, instance, image_id,
- clean_shutdown):
+ clean_shutdown, accel_uuids=None):
LOG.info('Shelving', instance=instance)
offload = CONF.shelved_offload_time == 0
if offload:
@@ -6400,14 +6402,16 @@ class ComputeManager(manager.Manager):
phase=fields.NotificationPhase.END, bdms=bdms)
if offload:
- self._shelve_offload_instance(context, instance,
- clean_shutdown=False, bdms=bdms)
+ self._shelve_offload_instance(
+ context, instance, clean_shutdown=False, bdms=bdms,
+ accel_uuids=accel_uuids)
@wrap_exception()
@reverts_task_state
@wrap_instance_event(prefix='compute')
@wrap_instance_fault
- def shelve_offload_instance(self, context, instance, clean_shutdown):
+ def shelve_offload_instance(self, context, instance, clean_shutdown,
+ accel_uuids=None):
"""Remove a shelved instance from the hypervisor.
This frees up those resources for use by other instances, but may lead
@@ -6418,15 +6422,17 @@ class ComputeManager(manager.Manager):
:param context: request context
:param instance: nova.objects.instance.Instance
:param clean_shutdown: give the GuestOS a chance to stop
+ :param accel_uuids: the accelerator UUIDs for the instance
"""
@utils.synchronized(instance.uuid)
def do_shelve_offload_instance():
- self._shelve_offload_instance(context, instance, clean_shutdown)
+ self._shelve_offload_instance(context, instance, clean_shutdown,
+ accel_uuids=accel_uuids)
do_shelve_offload_instance()
def _shelve_offload_instance(self, context, instance, clean_shutdown,
- bdms=None):
+ bdms=None, accel_uuids=None):
LOG.info('Shelve offloading', instance=instance)
if bdms is None:
bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
@@ -6451,6 +6457,12 @@ class ComputeManager(manager.Manager):
# terminate all the connections with the volume server and the host
self._terminate_volume_connections(context, instance, bdms)
+ # NOTE(brinzhang): Free up the accelerator resource occupied
+ # in the cyborg service.
+ if accel_uuids:
+ cyclient = cyborg.get_client(context)
+ cyclient.delete_arqs_for_instance(instance.uuid)
+
# Free up the resource allocations in the placement service.
# This should happen *before* the vm_state is changed to
# SHELVED_OFFLOADED in case client-side code is polling the API to
@@ -6490,8 +6502,9 @@ class ComputeManager(manager.Manager):
@reverts_task_state
@wrap_instance_event(prefix='compute')
@wrap_instance_fault
- def unshelve_instance(self, context, instance, image,
- filter_properties, node, request_spec=None):
+ def unshelve_instance(
+ self, context, instance, image, filter_properties, node,
+ request_spec=None, accel_uuids=None):
"""Unshelve the instance.
:param context: request context
@@ -6502,6 +6515,7 @@ class ComputeManager(manager.Manager):
:param node: target compute node
:param request_spec: the RequestSpec object used to schedule the
instance
+ :param accel_uuids: the accelerator UUIDs for the instance
"""
if filter_properties is None:
filter_properties = {}
@@ -6510,7 +6524,7 @@ class ComputeManager(manager.Manager):
def do_unshelve_instance():
self._unshelve_instance(
context, instance, image, filter_properties, node,
- request_spec)
+ request_spec, accel_uuids)
do_unshelve_instance()
def _unshelve_instance_key_scrub(self, instance):
@@ -6527,7 +6541,7 @@ class ComputeManager(manager.Manager):
instance.update(keys)
def _unshelve_instance(self, context, instance, image, filter_properties,
- node, request_spec):
+ node, request_spec, accel_uuids):
LOG.info('Unshelving', instance=instance)
bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
context, instance.uuid)
@@ -6574,6 +6588,19 @@ class ComputeManager(manager.Manager):
provider_mappings=provider_mappings)
network_info = self.network_api.get_instance_nw_info(
context, instance)
+
+ accel_info = []
+ if accel_uuids:
+ try:
+ accel_info = self._get_bound_arq_resources(
+ context, instance, accel_uuids)
+ except (Exception, eventlet.timeout.Timeout) as exc:
+ LOG.exception('Failure getting accelerator requests '
+ 'with the exception: %s', exc,
+ instance=instance)
+ self._build_resources_cleanup(instance, network_info)
+ raise
+
with self.rt.instance_claim(context, instance, node, allocations,
limits):
self.driver.spawn(context, instance, image_meta,
@@ -6581,7 +6608,8 @@ class ComputeManager(manager.Manager):
admin_password=None,
allocations=allocations,
network_info=network_info,
- block_device_info=block_device_info)
+ block_device_info=block_device_info,
+ accel_info=accel_info)
except Exception:
with excutils.save_and_reraise_exception(logger=LOG):
LOG.exception('Instance failed to spawn',
diff --git a/nova/compute/rpcapi.py b/nova/compute/rpcapi.py
index 3558c9f147..6e9675248c 100644
--- a/nova/compute/rpcapi.py
+++ b/nova/compute/rpcapi.py
@@ -380,6 +380,9 @@ class ComputeAPI(object):
build_and_run_instance()
* 5.12 - Add accel_uuids (accelerator requests) parameter to
rebuild_instance()
+ * 5.13 - Add accel_uuids (accelerator requests) parameter to
+ shelve_instance(), shelve_offload_instance() and
+ unshelve_instance()
'''
VERSION_ALIASES = {
@@ -1350,40 +1353,65 @@ class ComputeAPI(object):
cctxt.cast(ctxt, 'restore_instance', instance=instance)
def shelve_instance(self, ctxt, instance, image_id=None,
- clean_shutdown=True):
- version = '5.0'
- cctxt = self.router.client(ctxt).prepare(
+ clean_shutdown=True, accel_uuids=None):
+ msg_kwargs = {
+ 'instance': instance,
+ 'image_id': image_id,
+ 'clean_shutdown': clean_shutdown,
+ 'accel_uuids': accel_uuids,
+ }
+ client = self.router.client(ctxt)
+ version = '5.13'
+ if not client.can_send_version(version):
+ if accel_uuids:
+ LOG.error("Shelve with accelerators is not supported as "
+ "RPC version is too old.")
+ raise exception.ForbiddenWithAccelerators()
+ else:
+ msg_kwargs.pop('accel_uuids')
+ version = '5.0'
+ cctxt = client.prepare(
server=_compute_host(None, instance), version=version)
- cctxt.cast(ctxt, 'shelve_instance', instance=instance,
- image_id=image_id, clean_shutdown=clean_shutdown)
+ cctxt.cast(ctxt, 'shelve_instance', **msg_kwargs)
def shelve_offload_instance(self, ctxt, instance,
- clean_shutdown=True):
- version = '5.0'
- cctxt = self.router.client(ctxt).prepare(
+ clean_shutdown=True, accel_uuids=None):
+ msg_kwargs = {
+ 'instance': instance,
+ 'clean_shutdown': clean_shutdown,
+ 'accel_uuids': accel_uuids,
+ }
+ client = self.router.client(ctxt)
+ version = '5.13'
+ if not client.can_send_version(version):
+ msg_kwargs.pop('accel_uuids')
+ version = '5.0'
+ cctxt = client.prepare(
server=_compute_host(None, instance), version=version)
- cctxt.cast(ctxt, 'shelve_offload_instance', instance=instance,
- clean_shutdown=clean_shutdown)
+ cctxt.cast(ctxt, 'shelve_offload_instance', **msg_kwargs)
def unshelve_instance(self, ctxt, instance, host, request_spec, image=None,
- filter_properties=None, node=None):
- version = '5.2'
+ filter_properties=None, node=None, accel_uuids=None):
+ version = '5.13'
msg_kwargs = {
'instance': instance,
'image': image,
'filter_properties': filter_properties,
'node': node,
'request_spec': request_spec,
+ 'accel_uuids': accel_uuids,
}
client = self.router.client(ctxt)
if not client.can_send_version(version):
+ msg_kwargs.pop('accel_uuids')
+ version = '5.2'
+ if not client.can_send_version(version):
msg_kwargs.pop('request_spec')
version = '5.0'
- cctxt = client.prepare(
- server=host, version=version)
+ cctxt = client.prepare(server=host, version=version)
cctxt.cast(ctxt, 'unshelve_instance', **msg_kwargs)
def volume_snapshot_create(self, ctxt, instance, volume_id,
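
The three client-side methods above follow Nova's usual pin-and-fallback pattern: try 5.13 and, if the cell is pinned lower, drop the new accel_uuids argument, or refuse outright when dropping it would silently lose accelerator information. Below is a minimal, self-contained sketch of that pattern; PinnedClient is a hypothetical stand-in for the oslo.messaging client (only can_send_version() is assumed), and RPCTooOldForAccelerators stands in for exception.ForbiddenWithAccelerators.

class RPCTooOldForAccelerators(Exception):
    """Stand-in for exception.ForbiddenWithAccelerators."""


class PinnedClient(object):
    """Hypothetical client exposing just can_send_version()."""
    def __init__(self, pinned_version):
        self.pin = tuple(int(p) for p in pinned_version.split('.'))

    def can_send_version(self, version):
        return tuple(int(p) for p in version.split('.')) <= self.pin


def negotiate_shelve(client, accel_uuids):
    # Mirrors the shape of ComputeAPI.shelve_instance() above: fall back to
    # 5.0 when the computes are pinned, but never drop ARQs silently.
    kwargs = {'accel_uuids': accel_uuids}
    version = '5.13'
    if not client.can_send_version(version):
        if accel_uuids:
            raise RPCTooOldForAccelerators()
        kwargs.pop('accel_uuids')
        version = '5.0'
    return version, kwargs


# A cell pinned to 5.2 without accelerators falls back cleanly:
assert negotiate_shelve(PinnedClient('5.2'), []) == ('5.0', {})
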
diff --git a/nova/conductor/manager.py b/nova/conductor/manager.py
index 4824a1a3c1..605608e79f 100644
--- a/nova/conductor/manager.py
+++ b/nova/conductor/manager.py
@@ -838,7 +838,7 @@ class ComputeTaskManager(base.Base):
try:
accel_uuids = self._create_and_bind_arq_for_instance(
- context, instance, host, local_reqspec)
+ context, instance, host.nodename, local_reqspec)
except Exception as exc:
LOG.exception('Failed to reschedule. Reason: %s', exc)
self._cleanup_when_reschedule_fails(
@@ -858,7 +858,7 @@ class ComputeTaskManager(base.Base):
limits=host.limits, host_list=host_list,
accel_uuids=accel_uuids)
- def _create_and_bind_arq_for_instance(self, context, instance, host,
+ def _create_and_bind_arq_for_instance(self, context, instance, hostname,
request_spec):
try:
resource_provider_mapping = (
@@ -867,7 +867,7 @@ class ComputeTaskManager(base.Base):
# http://lists.openstack.org/pipermail/openstack-discuss/2019-November/011044.html # noqa
return self._create_and_bind_arqs(
context, instance.uuid, instance.flavor.extra_specs,
- host.nodename, resource_provider_mapping)
+ hostname, resource_provider_mapping)
except exception.AcceleratorRequestBindingFailed as exc:
# If anything failed here we need to cleanup and bail out.
cyclient = cyborg.get_client(context)
@@ -963,14 +963,19 @@ class ComputeTaskManager(base.Base):
filter_properties = request_spec.\
to_legacy_filter_properties_dict()
- port_res_req = (
+ external_resources = (
self.network_api.get_requested_resource_for_instance(
context, instance.uuid))
- # NOTE(gibi): When cyborg or other module wants to handle
- # similar non-nova resources then here we have to collect
- # all the external resource requests in a single list and
+ extra_specs = request_spec.flavor.extra_specs
+ device_profile = extra_specs.get('accel:device_profile')
+ external_resources.extend(
+ cyborg.get_device_profile_request_groups(
+ context, device_profile) if device_profile else [])
+ # NOTE(gibi): When other modules want to handle similar
+ # non-nova resources then here we have to collect all
+ # the external resource requests in a single list and
# add them to the RequestSpec.
- request_spec.requested_resources = port_res_req
+ request_spec.requested_resources = external_resources
# NOTE(cfriesen): Ensure that we restrict the scheduler to
# the cell specified by the instance mapping.
@@ -996,9 +1001,14 @@ class ComputeTaskManager(base.Base):
scheduler_utils.fill_provider_mapping(
request_spec, selection)
+ # NOTE(brinzhang): For the unshelve operation we should
+ # re-create and bind the arqs for the instance.
+ accel_uuids = self._create_and_bind_arq_for_instance(
+ context, instance, node, request_spec)
self.compute_rpcapi.unshelve_instance(
context, instance, host, request_spec, image=image,
- filter_properties=filter_properties, node=node)
+ filter_properties=filter_properties, node=node,
+ accel_uuids=accel_uuids)
except (exception.NoValidHost,
exception.UnsupportedPolicyException):
instance.task_state = None
@@ -1006,7 +1016,11 @@ class ComputeTaskManager(base.Base):
LOG.warning("No valid host found for unshelve instance",
instance=instance)
return
- except Exception:
+ except Exception as exc:
+ if isinstance(exc, exception.AcceleratorRequestBindingFailed):
+ cyclient = cyborg.get_client(context)
+ cyclient.delete_arqs_by_uuid(exc.arqs)
+ LOG.exception('Failed to unshelve. Reason: %s', exc)
with excutils.save_and_reraise_exception():
instance.task_state = None
instance.save()
@@ -1225,12 +1239,24 @@ class ComputeTaskManager(base.Base):
instance.availability_zone = (
availability_zones.get_host_availability_zone(
context, host))
+ accel_uuids = []
try:
- accel_uuids = self._rebuild_cyborg_arq(
- context, instance, host, request_spec, evacuate)
- except exception.AcceleratorRequestBindingFailed as exc:
- cyclient = cyborg.get_client(context)
- cyclient.delete_arqs_by_uuid(exc.arqs)
+ if instance.flavor.extra_specs.get('accel:device_profile'):
+ cyclient = cyborg.get_client(context)
+ if evacuate:
+ # NOTE(brinzhang): For the evacuate operation we should
+ # delete the bound arqs, then re-create and bind the
+ # arqs for the instance.
+ cyclient.delete_arqs_for_instance(instance.uuid)
+ accel_uuids = self._create_and_bind_arq_for_instance(
+ context, instance, node, request_spec)
+ else:
+ accel_uuids = cyclient.get_arq_uuids_for_instance(
+ instance)
+ except Exception as exc:
+ if isinstance(exc, exception.AcceleratorRequestBindingFailed):
+ cyclient = cyborg.get_client(context)
+ cyclient.delete_arqs_by_uuid(exc.arqs)
LOG.exception('Failed to rebuild. Reason: %s', exc)
raise exc
@@ -1253,22 +1279,6 @@ class ComputeTaskManager(base.Base):
request_spec=request_spec,
accel_uuids=accel_uuids)
- def _rebuild_cyborg_arq(
- self, context, instance, host, request_spec, evacuate):
- dp_name = instance.flavor.extra_specs.get('accel:device_profile')
- if not dp_name:
- return []
-
- cyclient = cyborg.get_client(context)
- if not evacuate:
- return cyclient.get_arq_uuids_for_instance(instance)
-
- cyclient.delete_arqs_for_instance(instance.uuid)
- resource_provider_mapping = request_spec.get_request_group_mapping()
- return self._create_and_bind_arqs(
- context, instance.uuid, instance.flavor.extra_specs,
- host, resource_provider_mapping)
-
def _validate_image_traits_for_rebuild(self, context, instance, image_ref):
"""Validates that the traits specified in the image can be satisfied
by the providers of the current allocations for the instance during
@@ -1668,7 +1678,7 @@ class ComputeTaskManager(base.Base):
try:
accel_uuids = self._create_and_bind_arq_for_instance(
- context, instance, host, request_spec)
+ context, instance, host.nodename, request_spec)
except Exception as exc:
with excutils.save_and_reraise_exception():
self._cleanup_build_artifacts(
diff --git a/nova/objects/service.py b/nova/objects/service.py
index 5ce447f536..d314fa3aa1 100644
--- a/nova/objects/service.py
+++ b/nova/objects/service.py
@@ -31,7 +31,7 @@ LOG = logging.getLogger(__name__)
# NOTE(danms): This is the global service version counter
-SERVICE_VERSION = 53
+SERVICE_VERSION = 54
# NOTE(danms): This is our SERVICE_VERSION history. The idea is that any
@@ -190,6 +190,10 @@ SERVICE_VERSION_HISTORY = (
# Version 53: Compute RPC v5.12:
# Add accel_uuids (accelerator requests) param to rebuild_instance
{'compute_rpc': '5.12'},
+ # Version 54: Compute RPC v5.13:
+ # Add accel_uuids (accelerator requests) param to shelve_instance and
+ # shelve_offload_instance and unshelve_instance
+ {'compute_rpc': '5.13'},
)
# This is used to raise an error at service startup if older than N-1 computes
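
Bumping SERVICE_VERSION to 54 is what the API-side @block_accelerators(until_service=54) change in nova/compute/api.py keys off: shelving an instance whose flavor requests a device profile is rejected while any compute service is older than 54 (see test_shelve_until_service_forbidden below). A rough sketch of that gate, under the assumption that the decorator simply compares the deployment's minimum compute service version against the threshold; the names here are illustrative stand-ins.

MIN_SERVICE_FOR_ACCEL_SHELVE = 54  # matches SERVICE_VERSION above


class ForbiddenWithAccelerators(Exception):
    """Stand-in for nova.exception.ForbiddenWithAccelerators."""


def check_accel_shelve_allowed(flavor_extra_specs, minimum_compute_version):
    # Simplified: refuse the operation when the instance uses accelerators
    # and at least one compute service is too old to accept the new
    # accel_uuids RPC parameters.
    uses_accelerators = bool(flavor_extra_specs.get('accel:device_profile'))
    if (uses_accelerators and
            minimum_compute_version < MIN_SERVICE_FOR_ACCEL_SHELVE):
        raise ForbiddenWithAccelerators()


# e.g. with an old compute (version 53) and a device profile in the flavor:
# check_accel_shelve_allowed({'accel:device_profile': 'mydp'}, 53)  -> raises
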
diff --git a/nova/tests/fixtures.py b/nova/tests/fixtures.py
index 931f5f497b..58fe876b4a 100644
--- a/nova/tests/fixtures.py
+++ b/nova/tests/fixtures.py
@@ -3095,7 +3095,10 @@ class CyborgFixture(fixtures.Fixture):
This function uses bindings indexed by instance UUID to
populate the bound ARQ templates in CyborgFixture.bound_arq_list.
"""
- arq_host_rp_list = CyborgFixture.bindings_by_instance[instance_uuid]
+ arq_host_rp_list = CyborgFixture.bindings_by_instance.get(
+ instance_uuid)
+ if not arq_host_rp_list:
+ return []
# The above looks like:
# [{'hostname': $hostname,
# 'device_rp_uuid': $device_rp_uuid,
@@ -3117,7 +3120,7 @@ class CyborgFixture(fixtures.Fixture):
@staticmethod
def fake_delete_arqs_for_instance(instance_uuid):
- return None
+ CyborgFixture.bindings_by_instance.pop(instance_uuid, None)
def setUp(self):
super(CyborgFixture, self).setUp()
diff --git a/nova/tests/functional/integrated_helpers.py b/nova/tests/functional/integrated_helpers.py
index c5207e6801..49440010a6 100644
--- a/nova/tests/functional/integrated_helpers.py
+++ b/nova/tests/functional/integrated_helpers.py
@@ -486,9 +486,17 @@ class InstanceHelperMixin:
self.api.post_server_action(server['id'], {'shelve': {}})
return self._wait_for_state_change(server, expected_state)
+ def _shelve_offload_server(self, server):
+ """Shelve offload a server."""
+ self.api.post_server_action(server['id'], {'shelveOffload': {}})
+ self._wait_for_server_parameter(server,
+ {'status': 'SHELVED_OFFLOADED',
+ 'OS-EXT-SRV-ATTR:host': None,
+ 'OS-EXT-AZ:availability_zone': ''})
+
def _unshelve_server(self, server, expected_state='ACTIVE'):
"""Unshelve a server."""
- self.api.post_server_action(server['id'], {'unshelve': {}})
+ self.api.post_server_action(server['id'], {'unshelve': None})
return self._wait_for_state_change(server, expected_state)
def _evacuate_server(
diff --git a/nova/tests/functional/test_servers.py b/nova/tests/functional/test_servers.py
index 3f23b8ed92..dd7c3418c9 100644
--- a/nova/tests/functional/test_servers.py
+++ b/nova/tests/functional/test_servers.py
@@ -33,6 +33,7 @@ from oslo_utils import timeutils
from nova.compute import api as compute_api
from nova.compute import instance_actions
from nova.compute import manager as compute_manager
+from nova.compute import rpcapi as compute_rpcapi
from nova import context
from nova import exception
from nova.network import constants
@@ -7890,6 +7891,29 @@ class AcceleratorServerOpsTest(AcceleratorServerBase):
self._wait_for_state_change(self.server, 'ACTIVE')
self._check_allocations_usage(self.server)
+ def test_shelve_and_unshelve_ok(self):
+ self.flags(shelved_offload_time=1)
+ arqs = self.cyborg.fake_get_arqs_for_instance(self.server['id'])
+ self.assertEqual(len(arqs), 1)
+ self._shelve_server(self.server, 'SHELVED')
+ arqs = self.cyborg.fake_get_arqs_for_instance(self.server['id'])
+ self.assertEqual(len(arqs), 1)
+ self._unshelve_server(self.server)
+ arqs = self.cyborg.fake_get_arqs_for_instance(self.server['id'])
+ self.assertEqual(len(arqs), 1)
+
+ def test_shelve_offload_and_unshelve_ok(self):
+ self.flags(shelved_offload_time=-1)
+ arqs = self.cyborg.fake_get_arqs_for_instance(self.server['id'])
+ self.assertEqual(len(arqs), 1)
+ self._shelve_server(self.server, 'SHELVED')
+ self._shelve_offload_server(self.server)
+ arqs = self.cyborg.fake_get_arqs_for_instance(self.server['id'])
+ self.assertEqual(len(arqs), 0)
+ self._unshelve_server(self.server)
+ arqs = self.cyborg.fake_get_arqs_for_instance(self.server['id'])
+ self.assertEqual(len(arqs), 1)
+
def test_resize_fails(self):
ex = self.assertRaises(client.OpenStackApiException,
self.api.post_server_action, self.server['id'],
@@ -7949,6 +7973,81 @@ class AcceleratorServerOpsTest(AcceleratorServerBase):
self.assertEqual(403, ex.response.status_code)
self._check_allocations_usage(self.server)
+ @mock.patch.object(objects.service, 'get_minimum_version_all_cells')
+ def test_shelve_old_compute(self, old_compute_version):
+ """Tests when the source compute service is too old to call
+ shelve so OpenStackApiException is raised.
+ """
+ old_compute_version.return_value = 53
+ ex = self.assertRaises(client.OpenStackApiException,
+ self.api.post_server_action, self.server['id'],
+ {'shelve': {}})
+ self.assertEqual(403, ex.response.status_code)
+ self._check_allocations_usage(self.server)
+
+ def _test_shelve_instance_with_compute_rpc_pin(
+ self, version_cap, body=None):
+ self.flags(compute=version_cap, group='upgrade_levels')
+
+ self.flags(shelved_offload_time=-1)
+ self.api.post_server_action(self.server['id'], body)
+ self._wait_for_state_change(self.server, 'SHELVED')
+
+ def test_shelve_with_compute_rpc_pin_5_0(self):
+ self.flags(compute=5.0, group='upgrade_levels')
+ compute_rpcapi.reset_globals()
+ ex = self.assertRaises(
+ client.OpenStackApiException, self.api.post_server_action,
+ self.server['id'], {'shelve': {}})
+ self.assertEqual(403, ex.response.status_code)
+
+ def test_shelve_instance_5_13(self):
+ body = {'shelve': {}}
+ self._test_shelve_instance_with_compute_rpc_pin(
+ '5.13', body=body)
+
+ def _test_shelve_offload_instance_with_compute_rpc_pin(self, version_cap):
+ self.flags(compute=version_cap, group='upgrade_levels')
+
+ self.flags(shelved_offload_time=2)
+ self.api.post_server_action(self.server['id'], {'shelve': {}})
+ self._wait_for_state_change(self.server, 'SHELVED')
+
+ self.api.post_server_action(self.server['id'], {'shelveOffload': {}})
+ self._wait_for_state_change(self.server, 'SHELVED_OFFLOADED')
+
+ def test_shelve_offload_instance_5_0(self):
+ self._test_shelve_offload_instance_with_compute_rpc_pin('5.0')
+
+ def test_shelve_offload_instance_5_13(self):
+ self._test_shelve_offload_instance_with_compute_rpc_pin('5.13')
+
+ def _test_unshelve_instance_with_compute_rpc_pin(
+ self, version_cap, body=None):
+ self.flags(compute=version_cap, group='upgrade_levels')
+
+ self.api.microversion = '2.87'
+ self.api.post_server_action(self.server['id'], {'shelve': {}})
+ self._wait_for_state_change(self.server, 'SHELVED_OFFLOADED')
+
+ self.api.post_server_action(self.server['id'], body)
+ self._wait_for_state_change(self.server, 'ACTIVE')
+
+ def test_unshelve_instance_5_0(self):
+ body = {'unshelve': None}
+ self._test_unshelve_instance_with_compute_rpc_pin(
+ '5.0', body=body)
+
+ def test_unshelve_instance_5_2(self):
+ body = {'unshelve': None}
+ self._test_unshelve_instance_with_compute_rpc_pin(
+ '5.2', body=body)
+
+ def test_unshelve_instance_5_13(self):
+ body = {'unshelve': None}
+ self._test_unshelve_instance_with_compute_rpc_pin(
+ '5.13', body=body)
+
class CrossCellResizeWithQoSPort(PortResourceRequestBasedSchedulingTestBase):
NUMBER_OF_CELLS = 2
diff --git a/nova/tests/unit/compute/test_api.py b/nova/tests/unit/compute/test_api.py
index 36aa81f2a1..f450fa60ed 100644
--- a/nova/tests/unit/compute/test_api.py
+++ b/nova/tests/unit/compute/test_api.py
@@ -7637,6 +7637,127 @@ class ComputeAPIUnitTestCase(_ComputeAPIUnitTestMixIn, test.NoDBTestCase):
# myfunc was not called
self.assertEqual({}, args_info)
+ @mock.patch('nova.accelerator.cyborg._CyborgClient.'
+ 'get_arq_uuids_for_instance')
+ @mock.patch.object(compute_utils, 'create_image')
+ def test_get_arqs_in_shelve(self, mock_create_img, mock_get_arq_uuids):
+ """Test get_arq_uuids_for_instance() was called if the
+ 'accel:device_profile' exist in the flavor for shelve operation.
+ """
+ extra_specs = {'accel:device_profile': 'mydp'}
+ flavor = self._create_flavor(extra_specs=extra_specs)
+ params = dict(display_name="vm1")
+ instance = self._create_instance_obj(params=params, flavor=flavor)
+ mock_create_img.return_value = dict(id='fake-image-id')
+ mock_get_arq_uuids.return_value = (
+ ['983d1742-c8ca-4f8a-9f3c-3f54bbafaa7d'])
+ accel_uuids = mock_get_arq_uuids.return_value
+
+ with test.nested(
+ mock.patch('nova.objects.service.get_minimum_version_all_cells',
+ return_value=54),
+ mock.patch('nova.compute.utils.is_volume_backed_instance',
+ return_value=False),
+ mock.patch.object(objects.Instance, 'save'),
+ mock.patch.object(self.compute_api, '_record_action_start'),
+ mock.patch.object(compute_rpcapi.ComputeAPI, 'shelve_instance'),
+ ) as (
+ mock_get_min, mock_volume_backend,
+ mock_instance_save, mock_record_action, mock_shelve_service
+ ):
+ self.compute_api.shelve(self.context, instance)
+ mock_get_arq_uuids.assert_called_once()
+ mock_shelve_service.assert_called_once_with(
+ self.context, instance=instance, image_id='fake-image-id',
+ clean_shutdown=True, accel_uuids=accel_uuids)
+
+ @mock.patch('nova.accelerator.cyborg._CyborgClient.'
+ 'get_arq_uuids_for_instance')
+ @mock.patch.object(compute_utils, 'create_image')
+ def test_shelve_with_unsupport_accelerators(
+ self, mock_create_img, mock_get_arq_uuids):
+ """Test get_arq_uuids_for_instance() was called if the
+ 'accel:device_profile' exist in the flavor for shelve operation.
+ """
+ extra_specs = {'accel:device_profile': 'mydp'}
+ flavor = self._create_flavor(extra_specs=extra_specs)
+ params = dict(display_name="vm1")
+ instance = self._create_instance_obj(params=params, flavor=flavor)
+
+ with test.nested(
+ mock.patch('nova.objects.service.get_minimum_version_all_cells',
+ return_value=54),
+ mock.patch('nova.compute.utils.is_volume_backed_instance',
+ return_value=False),
+ mock.patch.object(objects.Instance, 'save'),
+ mock.patch.object(self.compute_api, '_record_action_start'),
+ mock.patch.object(compute_rpcapi.ComputeAPI, 'shelve_instance'),
+ ) as (
+ mock_get_min, mock_volume_backend,
+ mock_instance_save, mock_record_action, mock_shelve_service
+ ):
+ mock_shelve_service.side_effect = (
+ exception.ForbiddenWithAccelerators(
+ oldest_supported_version=54,
+ scope=mock.ANY,
+ min_service_level=54,
+ oldest_supported_service=54))
+ self.assertRaises(
+ exception.ForbiddenWithAccelerators,
+ self.compute_api.shelve,
+ self.context,
+ instance)
+
+ @mock.patch('nova.accelerator.cyborg._CyborgClient.'
+ 'get_arq_uuids_for_instance')
+ @mock.patch('nova.objects.service.get_minimum_version_all_cells',
+ return_value=53)
+ def test_shelve_until_service_forbidden(self, mock_get_min,
+ mock_get_arq_uuuids):
+ """Ensure a 'ForbiddenWithAccelerators' exception raises if any
+ compute service less than the version of 54.
+ """
+ extra_specs = {'accel:device_profile': 'mydp'}
+ flavor = self._create_flavor(extra_specs=extra_specs)
+ instance = self._create_instance_obj(flavor=flavor)
+ self.assertRaisesRegex(exception.ForbiddenWithAccelerators,
+ 'Forbidden with instances that have accelerators.',
+ self.compute_api.shelve, self.context, instance)
+ mock_get_arq_uuuids.assert_not_called()
+
+ @mock.patch('nova.accelerator.cyborg._CyborgClient.'
+ 'get_arq_uuids_for_instance')
+ @mock.patch.object(compute_utils, 'create_image')
+ def test_get_arqs_in_shelve_offload(self, mock_create_img,
+ mock_get_arq_uuids):
+ """Test get_arq_uuids_for_instance() was called if the
+ 'accel:device_profile' exist in the flavor for
+ shelve offload operation.
+ """
+ extra_specs = {'accel:device_profile': 'mydp'}
+ flavor = self._create_flavor(extra_specs=extra_specs)
+ params = {'vm_state': vm_states.SHELVED}
+ instance = self._create_instance_obj(params=params, flavor=flavor)
+ mock_create_img.return_value = dict(id='fake-image-id')
+ mock_get_arq_uuids.return_value = [uuids.fake]
+ accel_uuids = mock_get_arq_uuids.return_value
+ with test.nested(
+ mock.patch('nova.objects.service.get_minimum_version_all_cells',
+ return_value=54),
+ mock.patch.object(objects.Instance, 'save'),
+ mock.patch.object(self.compute_api, '_record_action_start'),
+ mock.patch.object(compute_rpcapi.ComputeAPI,
+ 'shelve_offload_instance'),
+ ) as (
+ mock_get_min, mock_instance_save,
+ mock_record_action, mock_shelve_service
+ ):
+ self.compute_api.shelve_offload(self.context, instance)
+ mock_get_arq_uuids.assert_called_once()
+ mock_shelve_service.assert_called_once_with(
+ self.context, instance=instance, clean_shutdown=True,
+ accel_uuids=accel_uuids)
+
# TODO(huaqiang): Remove in Wallaby
@mock.patch('nova.objects.service.get_minimum_version_all_cells')
def test__check_compute_service_for_mixed_instance(self, mock_ver):
diff --git a/nova/tests/unit/compute/test_rpcapi.py b/nova/tests/unit/compute/test_rpcapi.py
index f2655eddfc..24ae515822 100644
--- a/nova/tests/unit/compute/test_rpcapi.py
+++ b/nova/tests/unit/compute/test_rpcapi.py
@@ -897,19 +897,67 @@ class ComputeRpcAPITestCase(test.NoDBTestCase):
def test_shelve_instance(self):
self._test_compute_api('shelve_instance', 'cast',
instance=self.fake_instance_obj, image_id='image_id',
- clean_shutdown=True, version='5.0')
+ clean_shutdown=True, accel_uuids=None, version='5.13')
+
+ def test_shelve_instance_old_rpcapi(self):
+ # With rpcapi < 5.13, accel_uuids must be dropped in the client call.
+ ctxt = context.RequestContext('fake_user', 'fake_project')
+ compute_api = compute_rpcapi.ComputeAPI()
+ compute_api.router.client = mock.Mock()
+ mock_client = mock.MagicMock()
+ compute_api.router.client.return_value = mock_client
+ # Force can_send_version to False, so that 5.0 version is used.
+ mock_client.can_send_version.return_value = False
+ mock_cctx = mock.MagicMock()
+ mock_client.prepare.return_value = mock_cctx
+ compute_api.shelve_instance(
+ ctxt, instance=self.fake_instance_obj,
+ accel_uuids=[],
+ image_id='image_id', clean_shutdown=True)
+
+ mock_client.can_send_version.assert_called_once_with('5.13')
+ mock_client.prepare.assert_called_with(
+ server=self.fake_instance_obj.host, version='5.0')
+ mock_cctx.cast.assert_called_with( # No accel_uuids
+ ctxt, 'shelve_instance',
+ instance=self.fake_instance_obj,
+ image_id='image_id', clean_shutdown=True)
def test_shelve_offload_instance(self):
self._test_compute_api('shelve_offload_instance', 'cast',
instance=self.fake_instance_obj,
- clean_shutdown=True, version='5.0')
+ clean_shutdown=True, accel_uuids=None, version='5.13')
+
+ def test_shelve_offload_instance_old_rpcapi(self):
+ # With rpcapi < 5.13, accel_uuids must be dropped in the client call.
+ ctxt = context.RequestContext('fake_user', 'fake_project')
+ compute_api = compute_rpcapi.ComputeAPI()
+ compute_api.router.client = mock.Mock()
+ mock_client = mock.MagicMock()
+ compute_api.router.client.return_value = mock_client
+ # Force can_send_version to False, so that 5.0 version is used.
+ mock_client.can_send_version.return_value = False
+ mock_cctx = mock.MagicMock()
+ mock_client.prepare.return_value = mock_cctx
+ compute_api.shelve_offload_instance(
+ ctxt, instance=self.fake_instance_obj,
+ accel_uuids=['938af7f9-f136-4e5a-bdbe-3b6feab54311'],
+ clean_shutdown=True,)
+
+ mock_client.can_send_version.assert_called_once_with('5.13')
+ mock_client.prepare.assert_called_with(
+ server=self.fake_instance_obj.host, version='5.0')
+ mock_cctx.cast.assert_called_with( # No accel_uuids
+ ctxt, 'shelve_offload_instance',
+ instance=self.fake_instance_obj,
+ clean_shutdown=True)
def test_unshelve_instance(self):
self._test_compute_api('unshelve_instance', 'cast',
instance=self.fake_instance_obj, host='host', image='image',
filter_properties={'fakeprop': 'fakeval'}, node='node',
- request_spec=self.fake_request_spec_obj,
- version='5.2')
+ request_spec=self.fake_request_spec_obj, accel_uuids=None,
+ version='5.13')
def test_cache_image(self):
self._test_compute_api('cache_images', 'call',
@@ -943,7 +991,10 @@ class ComputeRpcAPITestCase(test.NoDBTestCase):
host='host', request_spec=self.fake_request_spec_obj,
image='image')
- mock_client.can_send_version.assert_called_once_with('5.2')
+ mock_client.can_send_version.assert_has_calls([
+ mock.call('5.13'),
+ mock.call('5.2'),
+ ])
mock_client.prepare.assert_called_with(
server='host', version='5.0')
mock_cctx.cast.assert_called_with(
diff --git a/nova/tests/unit/compute/test_shelve.py b/nova/tests/unit/compute/test_shelve.py
index fba5a8876e..eeda476c62 100644
--- a/nova/tests/unit/compute/test_shelve.py
+++ b/nova/tests/unit/compute/test_shelve.py
@@ -10,6 +10,7 @@
# License for the specific language governing permissions and limitations
# under the License.
+import eventlet
import mock
from oslo_utils import fixture as utils_fixture
from oslo_utils.fixture import uuidsentinel as uuids
@@ -63,7 +64,8 @@ class ShelveComputeManagerTestCase(test_compute.BaseTestCase):
mock_notify_instance_usage, mock_get_power_state,
mock_snapshot, mock_power_off, mock_terminate,
mock_get_bdms, clean_shutdown=True,
- guest_power_state=power_state.RUNNING):
+ guest_power_state=power_state.RUNNING,
+ accel_uuids=None):
mock_get_power_state.return_value = 123
CONF.set_override('shelved_offload_time', shelved_offload_time)
@@ -127,7 +129,8 @@ class ShelveComputeManagerTestCase(test_compute.BaseTestCase):
mock_save.side_effect = check_save
self.compute.shelve_instance(self.context, instance,
image_id=image_id,
- clean_shutdown=clean_shutdown)
+ clean_shutdown=clean_shutdown,
+ accel_uuids=accel_uuids)
mock_notify.assert_has_calls([
mock.call(self.context, instance, 'fake-mini',
action='shelve', phase='start', bdms=fake_bdms),
@@ -181,6 +184,18 @@ class ShelveComputeManagerTestCase(test_compute.BaseTestCase):
def test_shelve_paused_instance(self):
self._shelve_instance(-1, guest_power_state=power_state.PAUSED)
+ @mock.patch('nova.accelerator.cyborg._CyborgClient.'
+ 'delete_arqs_for_instance')
+ def test_shelve_and_offload_with_accel_uuids(self, mock_del_arqs):
+ self._shelve_instance(0, accel_uuids=[uuids.fake])
+ mock_del_arqs.assert_called_once()
+
+ @mock.patch('nova.accelerator.cyborg._CyborgClient.'
+ 'delete_arqs_for_instance')
+ def test_shelve_and_with_accel_uuids(self, mock_del_arqs):
+ self._shelve_instance(-1, accel_uuids=[uuids.fake])
+ mock_del_arqs.assert_not_called()
+
@mock.patch.object(nova.virt.fake.SmallFakeDriver, 'power_off')
def test_shelve_offload(self, mock_power_off):
instance = self._shelve_offload()
@@ -279,10 +294,12 @@ class ShelveComputeManagerTestCase(test_compute.BaseTestCase):
mock_get_power_state, mock_spawn,
mock_prep_block_device, mock_notify_instance_usage,
mock_notify_instance_action,
- mock_get_bdms):
+ mock_get_bdms,
+ accel_uuids=None,
+ instance=None):
mock_bdms = mock.Mock()
mock_get_bdms.return_value = mock_bdms
- instance = self._create_fake_instance_obj()
+ instance = instance or self._create_fake_instance_obj()
instance.task_state = task_states.UNSHELVING
instance.save()
image = {'id': uuids.image_id}
@@ -340,7 +357,8 @@ class ShelveComputeManagerTestCase(test_compute.BaseTestCase):
self.compute.unshelve_instance(
self.context, instance, image=image,
filter_properties=filter_properties,
- node=node, request_spec=objects.RequestSpec())
+ node=node, request_spec=objects.RequestSpec(),
+ accel_uuids=accel_uuids)
mock_notify_instance_action.assert_has_calls([
mock.call(self.context, instance, 'fake-mini',
@@ -362,7 +380,7 @@ class ShelveComputeManagerTestCase(test_compute.BaseTestCase):
mock_spawn.assert_called_once_with(self.context, instance,
test.MatchType(objects.ImageMeta), injected_files=[],
admin_password=None, allocations={}, network_info=[],
- block_device_info='fake_bdm')
+ block_device_info='fake_bdm', accel_info=mock.ANY)
self.mock_get_allocations.assert_called_once_with(self.context,
instance.uuid)
mock_get_power_state.assert_called_once_with(instance)
@@ -380,6 +398,57 @@ class ShelveComputeManagerTestCase(test_compute.BaseTestCase):
self.assertEqual(self.compute.host, instance.host)
self.assertFalse(instance.auto_disk_config)
+ @mock.patch.object(nova.compute.manager.ComputeManager,
+ '_get_bound_arq_resources')
+ def test_unshelve_with_arqs(self, mock_get_arqs):
+ self.test_unshelve(accel_uuids=[uuids.fake])
+ mock_get_arqs.assert_called_once()
+
+ @mock.patch.object(nova.compute.manager.ComputeManager,
+ '_nil_out_instance_obj_host_and_node')
+ @mock.patch.object(nova.compute.manager.ComputeManager,
+ '_terminate_volume_connections')
+ @mock.patch.object(nova.compute.manager.ComputeManager,
+ '_build_resources_cleanup')
+ @mock.patch.object(nova.compute.manager.ComputeManager,
+ '_get_bound_arq_resources')
+ def test_unshelve_with_arqs_failes_cleanup_resources(
+ self, mock_get_arqs, mock_cleanup_resources,
+ mock_bdms, mock_nil):
+ instance = self._create_fake_instance_obj()
+
+ mock_get_arqs.side_effect = exception.AcceleratorRequestOpFailed(
+ instance_uuid=instance.uuid, op='get',
+ msg='Failure getting accelerator requests.')
+
+ exc = self.assertRaises(exception.AcceleratorRequestOpFailed,
+ self.test_unshelve,
+ accel_uuids=[uuids.fake],
+ instance=instance)
+ self.assertIn('Failure getting accelerator requests.', str(exc))
+ mock_cleanup_resources.assert_called_once()
+
+ @mock.patch.object(nova.compute.manager.ComputeManager,
+ '_nil_out_instance_obj_host_and_node')
+ @mock.patch.object(nova.compute.manager.ComputeManager,
+ '_terminate_volume_connections')
+ @mock.patch.object(nova.compute.manager.ComputeManager,
+ '_build_resources_cleanup')
+ @mock.patch.object(nova.compute.manager.ComputeManager,
+ '_get_bound_arq_resources')
+ def test_unshelve_with_arqs_failes_cleanup_resources_timeout(
+ self, mock_get_arqs, mock_cleanup_resources,
+ mock_bdms, mock_nil):
+ instance = self._create_fake_instance_obj()
+
+ mock_get_arqs.side_effect = eventlet.timeout.Timeout()
+
+ self.assertRaises(eventlet.timeout.Timeout,
+ self.test_unshelve,
+ accel_uuids=[uuids.fake],
+ instance=instance)
+ mock_cleanup_resources.assert_called_once()
+
@mock.patch('nova.objects.BlockDeviceMappingList.get_by_instance_uuid')
@mock.patch('nova.compute.utils.notify_about_instance_action')
@mock.patch.object(nova.compute.resource_tracker.ResourceTracker,
@@ -464,7 +533,7 @@ class ShelveComputeManagerTestCase(test_compute.BaseTestCase):
{}, limits)
mock_spawn.assert_called_once_with(self.context, instance,
test.MatchType(objects.ImageMeta),
- injected_files=[], admin_password=None,
+ injected_files=[], admin_password=None, accel_info=[],
allocations={}, network_info=[], block_device_info='fake_bdm')
self.mock_get_allocations.assert_called_once_with(self.context,
instance.uuid)
@@ -551,7 +620,7 @@ class ShelveComputeManagerTestCase(test_compute.BaseTestCase):
{}, limits)
mock_spawn.assert_called_once_with(
self.context, instance, test.MatchType(objects.ImageMeta),
- injected_files=[], admin_password=None,
+ injected_files=[], admin_password=None, accel_info=[],
allocations={}, network_info=[], block_device_info='fake_bdm')
mock_terminate_volume_connections.assert_called_once_with(
self.context, instance, mock_bdms)
@@ -767,11 +836,11 @@ class ShelveComputeAPITestCase(test_compute.BaseTestCase):
if boot_from_volume:
rpcapi_shelve_offload_instance.assert_called_once_with(
self.context, instance=instance,
- clean_shutdown=clean_shutdown)
+ clean_shutdown=clean_shutdown, accel_uuids=[])
else:
rpcapi_shelve_instance.assert_called_once_with(
self.context, instance=instance, image_id='fake-image-id',
- clean_shutdown=clean_shutdown)
+ clean_shutdown=clean_shutdown, accel_uuids=[])
db.instance_destroy(self.context, instance['uuid'])
@@ -831,7 +900,7 @@ class ShelveComputeAPITestCase(test_compute.BaseTestCase):
instance_save.assert_called_once_with(expected_task_state=[None])
rpcapi_shelve_offload_instance.assert_called_once_with(
self.context, instance=fake_instance,
- clean_shutdown=clean_shutdown)
+ clean_shutdown=clean_shutdown, accel_uuids=[])
record.assert_called_once_with(self.context, fake_instance,
instance_actions.SHELVE_OFFLOAD)
diff --git a/nova/tests/unit/conductor/test_conductor.py b/nova/tests/unit/conductor/test_conductor.py
index 9dd46a4932..485ae166e0 100644
--- a/nova/tests/unit/conductor/test_conductor.py
+++ b/nova/tests/unit/conductor/test_conductor.py
@@ -1340,6 +1340,7 @@ class _BaseTaskTestCase(object):
system_metadata['shelved_host'] = 'fake-mini'
fake_spec = fake_request_spec.fake_spec_obj()
+ fake_spec.flavor = instance.flavor
# FIXME(sbauza): Modify the fake RequestSpec object to either add a
# non-empty SchedulerRetries object or nullify the field
fake_spec.retry = None
@@ -1409,6 +1410,7 @@ class _BaseTaskTestCase(object):
unshelve_instance.assert_called_once_with(
self.context, mock.ANY, host['host'],
test.MatchType(objects.RequestSpec), image=mock.ANY,
+ accel_uuids=[],
filter_properties=filter_properties, node=host['nodename']
)
@@ -1450,6 +1452,7 @@ class _BaseTaskTestCase(object):
instance.task_state = task_states.UNSHELVING
# 'shelved_image_id' is None for volumebacked instance
instance.system_metadata['shelved_image_id'] = None
+ self.request_spec.flavor = instance.flavor
with test.nested(
mock.patch.object(self.conductor_manager,
@@ -1487,6 +1490,7 @@ class _BaseTaskTestCase(object):
mock_im.return_value = objects.InstanceMapping(
cell_mapping=cell_mapping)
instance = self._create_fake_instance_obj()
+ fake_spec.flavor = instance.flavor
instance.vm_state = vm_states.SHELVED_OFFLOADED
instance.save()
system_metadata = instance.system_metadata
@@ -1506,6 +1510,7 @@ class _BaseTaskTestCase(object):
filter_properties=dict(
# populate_filter_properties adds limits={}
fake_spec.to_legacy_filter_properties_dict(), limits={}),
+ accel_uuids=[],
node='fake_node')
def test_unshelve_instance_schedule_and_rebuild_novalid_host(self):
@@ -1513,6 +1518,7 @@ class _BaseTaskTestCase(object):
instance.vm_state = vm_states.SHELVED_OFFLOADED
instance.save()
system_metadata = instance.system_metadata
+ self.request_spec.flavor = instance.flavor
def fake_schedule_instances(context, request_spec, *instances,
**kwargs):
@@ -1559,6 +1565,7 @@ class _BaseTaskTestCase(object):
system_metadata['shelved_at'] = timeutils.utcnow()
system_metadata['shelved_image_id'] = 'fake_image_id'
system_metadata['shelved_host'] = 'fake-mini'
+ self.request_spec.flavor = instance.flavor
self.assertRaises(messaging.MessagingTimeout,
self.conductor_manager.unshelve_instance,
self.context, instance, self.request_spec)
@@ -1582,6 +1589,7 @@ class _BaseTaskTestCase(object):
cell_mapping=objects.CellMapping.get_by_uuid(self.context,
uuids.cell1))
instance = self._create_fake_instance_obj()
+ fake_spec.flavor = instance.flavor
instance.vm_state = vm_states.SHELVED_OFFLOADED
instance.save()
system_metadata = instance.system_metadata
@@ -1594,7 +1602,7 @@ class _BaseTaskTestCase(object):
self.context, fake_spec, [instance.uuid], return_alternates=False)
mock_unshelve.assert_called_once_with(
self.context, instance, 'fake_host', fake_spec, image=None,
- filter_properties={'limits': {}}, node='fake_node')
+ accel_uuids=[], filter_properties={'limits': {}}, node='fake_node')
@mock.patch('nova.scheduler.utils.fill_provider_mapping')
@mock.patch('nova.network.neutron.API.get_requested_resource_for_instance')
@@ -1607,6 +1615,7 @@ class _BaseTaskTestCase(object):
instance.save()
request_spec = objects.RequestSpec()
+ request_spec.flavor = instance.flavor
selection = objects.Selection(
service_host='fake_host',
@@ -1629,6 +1638,87 @@ class _BaseTaskTestCase(object):
mock_fill_provider_mapping.assert_called_once_with(
request_spec, selection)
+ @mock.patch('nova.accelerator.cyborg.get_device_profile_request_groups')
+ @mock.patch.object(conductor_manager.ComputeTaskManager,
+ '_create_and_bind_arq_for_instance', )
+ @mock.patch('nova.scheduler.utils.fill_provider_mapping')
+ @mock.patch('nova.network.neutron.API.get_requested_resource_for_instance')
+ @mock.patch.object(conductor_manager.ComputeTaskManager,
+ '_schedule_instances', )
+ def test_unshelve_instance_resource_request_with_device_profile(
+ self, mock_schedule, mock_get_res_req, mock_fill_provider_mapping,
+ mock_get_arq, mock_get_dp):
+ instance = self._create_fake_instance_obj()
+ instance.vm_state = vm_states.SHELVED_OFFLOADED
+ instance.save()
+
+ request_spec = objects.RequestSpec()
+ request_spec.flavor = instance.flavor
+ request_spec.flavor.extra_specs = {'accel:device_profile': 'mydp'}
+
+ selection = objects.Selection(
+ service_host='fake_host',
+ nodename='fake_node',
+ limits=None)
+ mock_schedule.return_value = [[selection]]
+
+ mock_get_res_req.return_value = []
+
+ dp_groups = [objects.RequestGroup(requester_id='deviceprofile2'),
+ objects.RequestGroup(requester_id='deviceprofile3')]
+ mock_get_dp.return_value = dp_groups
+ mock_get_arq.return_value = []
+
+ self.conductor_manager.unshelve_instance(
+ self.context, instance, request_spec)
+
+ self.assertEqual(dp_groups, request_spec.requested_resources)
+
+ mock_get_res_req.assert_called_once_with(self.context, instance.uuid)
+ mock_schedule.assert_called_once_with(
+ self.context, request_spec, [instance.uuid],
+ return_alternates=False)
+ mock_fill_provider_mapping.assert_called_once_with(
+ request_spec, selection)
+
+ @mock.patch('nova.accelerator.cyborg._CyborgClient.delete_arqs_by_uuid')
+ @mock.patch('nova.accelerator.cyborg.get_device_profile_request_groups')
+ @mock.patch.object(conductor_manager.ComputeTaskManager,
+ '_create_and_bind_arq_for_instance', )
+ @mock.patch('nova.scheduler.utils.fill_provider_mapping')
+ @mock.patch('nova.network.neutron.API.get_requested_resource_for_instance')
+ @mock.patch.object(conductor_manager.ComputeTaskManager,
+ '_schedule_instances', )
+ def test_unshelve_instance_resource_request_with_arq_bind_fail(
+ self, mock_schedule, mock_get_res_req, mock_fill_provider_mapping,
+ mock_get_arqs, mock_get_dp, mock_del_arqs):
+ instance = self._create_fake_instance_obj()
+ instance.vm_state = vm_states.SHELVED_OFFLOADED
+ instance.save()
+
+ request_spec = objects.RequestSpec()
+ request_spec.flavor = instance.flavor
+ request_spec.flavor.extra_specs = {'accel:device_profile': 'mydp'}
+
+ selection = objects.Selection(
+ service_host='fake_host',
+ nodename='fake_node',
+ limits=None)
+ mock_schedule.return_value = [[selection]]
+ dp_groups = [objects.RequestGroup(requester_id='deviceprofile2'),
+ objects.RequestGroup(requester_id='deviceprofile3')]
+ mock_get_dp.return_value = dp_groups
+ mock_get_res_req.return_value = []
+ arqs = ["fake-arq-uuid"]
+ mock_get_arqs.side_effect = exc.AcceleratorRequestBindingFailed(
+ arqs=arqs, msg='')
+ ex = self.assertRaises(exc.AcceleratorRequestBindingFailed,
+ self.conductor_manager.unshelve_instance,
+ context=self.context, instance=instance,
+ request_spec=request_spec)
+ self.assertIn('Failed to bind accelerator requests', ex.message)
+ mock_del_arqs.assert_called_with(arqs)
+
def test_rebuild_instance(self):
inst_obj = self._create_fake_instance_obj()
rebuild_args, compute_args = self._prepare_rebuild_args(
@@ -1698,6 +1788,57 @@ class _BaseTaskTestCase(object):
self.context, inst_obj, 'thebesthost', action='rebuild_scheduled',
source='nova-conductor')
+ @mock.patch('nova.accelerator.cyborg._CyborgClient.'
+ 'get_arq_uuids_for_instance')
+ @mock.patch('nova.compute.utils.notify_about_instance_rebuild')
+ def test_rebuild_instance_with_device_profile(self, mock_notify,
+ mock_get_arqs):
+ inst_obj = self._create_fake_instance_obj()
+ inst_obj.flavor.extra_specs = {'accel:device_profile': 'mydp'}
+ inst_obj.host = 'noselect'
+ expected_host = 'thebesthost'
+ expected_node = 'thebestnode'
+ expected_limits = None
+ fake_selection = objects.Selection(service_host=expected_host,
+ nodename=expected_node, limits=None)
+ rebuild_args, compute_args = self._prepare_rebuild_args(
+ {'host': None, 'node': expected_node, 'limits': expected_limits})
+ fake_spec = objects.RequestSpec()
+ rebuild_args['request_spec'] = fake_spec
+ inst_uuids = [inst_obj.uuid]
+ arqs = ['fake-arq-uuid']
+ mock_get_arqs.return_value = arqs
+ with test.nested(
+ mock.patch.object(self.conductor_manager.compute_rpcapi,
+ 'rebuild_instance'),
+ mock.patch.object(scheduler_utils, 'setup_instance_group',
+ return_value=False),
+ mock.patch.object(self.conductor_manager.query_client,
+ 'select_destinations',
+ return_value=[[fake_selection]])
+ ) as (rebuild_mock, sig_mock, select_dest_mock):
+ self.conductor_manager.rebuild_instance(context=self.context,
+ instance=inst_obj,
+ **rebuild_args)
+ self.ensure_network_metadata_mock.assert_called_once_with(
+ inst_obj)
+ self.heal_reqspec_is_bfv_mock.assert_called_once_with(
+ self.context, fake_spec, inst_obj)
+ select_dest_mock.assert_called_once_with(self.context, fake_spec,
+ inst_uuids, return_objects=True, return_alternates=False)
+ compute_args['host'] = expected_host
+ compute_args['request_spec'] = fake_spec
+ compute_args['accel_uuids'] = arqs
+ rebuild_mock.assert_called_once_with(self.context,
+ instance=inst_obj,
+ **compute_args)
+ self.assertEqual(inst_obj.project_id, fake_spec.project_id)
+ self.assertEqual('compute.instance.rebuild.scheduled',
+ fake_notifier.NOTIFICATIONS[0].event_type)
+ mock_notify.assert_called_once_with(
+ self.context, inst_obj, 'thebesthost', action='rebuild_scheduled',
+ source='nova-conductor')
+
def test_rebuild_instance_with_scheduler_no_host(self):
inst_obj = self._create_fake_instance_obj()
inst_obj.host = 'noselect'
@@ -1897,6 +2038,151 @@ class _BaseTaskTestCase(object):
source='nova-conductor')
@mock.patch(
+ 'nova.accelerator.cyborg._CyborgClient.delete_arqs_for_instance')
+ @mock.patch.object(conductor_manager.ComputeTaskManager,
+ '_create_and_bind_arq_for_instance', )
+ @mock.patch('nova.accelerator.cyborg.get_device_profile_request_groups')
+ @mock.patch('nova.compute.utils.notify_about_instance_rebuild')
+ def test_evacuate_instance_with_request_spec_device_profile(
+ self, mock_notify, mock_get_dp, mock_get_arqs, mock_del_arqs):
+ inst_obj = self._create_fake_instance_obj()
+ inst_obj.host = 'noselect'
+ expected_host = 'thebesthost'
+ expected_node = 'thebestnode'
+ expected_limits = None
+ fake_selection = objects.Selection(service_host=expected_host,
+ nodename=expected_node, limits=None)
+ fake_spec = objects.RequestSpec(ignore_hosts=[uuids.ignored_host])
+ fake_spec.flavor = inst_obj.flavor
+ fake_spec.flavor.extra_specs = {'accel:device_profile': 'mydp'}
+ rebuild_args, compute_args = self._prepare_rebuild_args(
+ {'host': None, 'node': expected_node, 'limits': expected_limits,
+ 'request_spec': fake_spec, 'recreate': True})
+ dp_groups = [objects.RequestGroup(requester_id='deviceprofile2'),
+ objects.RequestGroup(requester_id='deviceprofile3')]
+ mock_get_dp.return_value = dp_groups
+ arqs = ['fake-arq-uuid']
+ mock_get_arqs.return_value = arqs
+ with test.nested(
+ mock.patch.object(self.conductor_manager.compute_rpcapi,
+ 'rebuild_instance'),
+ mock.patch.object(scheduler_utils, 'setup_instance_group',
+ return_value=False),
+ mock.patch.object(self.conductor_manager.query_client,
+ 'select_destinations',
+ return_value=[[fake_selection]]),
+ mock.patch.object(fake_spec, 'reset_forced_destinations'),
+ mock.patch('nova.scheduler.utils.fill_provider_mapping'),
+ mock.patch('nova.network.neutron.API.'
+ 'get_requested_resource_for_instance',
+ return_value=[])
+ ) as (rebuild_mock, sig_mock, select_dest_mock, reset_fd,
+ fill_rp_mapping_mock, get_req_res_mock):
+ self.conductor_manager.rebuild_instance(context=self.context,
+ instance=inst_obj,
+ **rebuild_args)
+ reset_fd.assert_called_once_with()
+ # The RequestSpec.ignore_hosts field should be overwritten.
+ self.assertEqual([inst_obj.host], fake_spec.ignore_hosts)
+ # The RequestSpec.requested_destination.cell field should be set.
+ self.assertIn('requested_destination', fake_spec)
+ self.assertIn('cell', fake_spec.requested_destination)
+ self.assertIsNotNone(fake_spec.requested_destination.cell)
+ select_dest_mock.assert_called_once_with(self.context,
+ fake_spec, [inst_obj.uuid], return_objects=True,
+ return_alternates=False)
+ compute_args['host'] = expected_host
+ compute_args['request_spec'] = fake_spec
+ compute_args['accel_uuids'] = arqs
+ rebuild_mock.assert_called_once_with(self.context,
+ instance=inst_obj,
+ **compute_args)
+ get_req_res_mock.assert_called_once_with(
+ self.context, inst_obj.uuid)
+ fill_rp_mapping_mock.assert_called_once_with(
+ fake_spec, fake_selection)
+ mock_del_arqs.assert_called_once()
+
+ self.assertEqual('compute.instance.rebuild.scheduled',
+ fake_notifier.NOTIFICATIONS[0].event_type)
+ mock_notify.assert_called_once_with(
+ self.context, inst_obj, 'thebesthost', action='rebuild_scheduled',
+ source='nova-conductor')
+
+ @mock.patch('nova.accelerator.cyborg._CyborgClient.delete_arqs_by_uuid')
+ @mock.patch(
+ 'nova.accelerator.cyborg._CyborgClient.delete_arqs_for_instance')
+ @mock.patch.object(conductor_manager.ComputeTaskManager,
+ '_create_and_bind_arq_for_instance', )
+ @mock.patch('nova.accelerator.cyborg.get_device_profile_request_groups')
+ @mock.patch('nova.compute.utils.notify_about_instance_rebuild')
+ def test_evacuate_instance_with_request_spec_arq_bind_fail(
+ self, mock_notify, mock_get_dp, mock_get_arqs,
+ mock_del_arqs_instance, mock_del_arqs):
+ inst_obj = self._create_fake_instance_obj()
+ inst_obj.host = 'noselect'
+ expected_host = 'thebesthost'
+ expected_node = 'thebestnode'
+ expected_limits = None
+ fake_selection = objects.Selection(service_host=expected_host,
+ nodename=expected_node, limits=None)
+ fake_spec = objects.RequestSpec(ignore_hosts=[uuids.ignored_host])
+ fake_spec.flavor = inst_obj.flavor
+ fake_spec.flavor.extra_specs = {'accel:device_profile': 'mydp'}
+ dp_groups = [objects.RequestGroup(requester_id='deviceprofile2'),
+ objects.RequestGroup(requester_id='deviceprofile3')]
+ mock_get_dp.return_value = dp_groups
+ fake_spec.requested_resources = dp_groups
+ rebuild_args, compute_args = self._prepare_rebuild_args(
+ {'host': None, 'node': expected_node, 'limits': expected_limits,
+ 'request_spec': fake_spec, 'recreate': True})
+ arqs = ['fake-arq-uuid']
+ mock_get_arqs.side_effect = exc.AcceleratorRequestBindingFailed(
+ arqs=arqs, msg='')
+ with test.nested(
+ mock.patch.object(scheduler_utils, 'setup_instance_group',
+ return_value=False),
+ mock.patch.object(self.conductor_manager.query_client,
+ 'select_destinations',
+ return_value=[[fake_selection]]),
+ mock.patch.object(fake_spec, 'reset_forced_destinations'),
+ mock.patch('nova.scheduler.utils.fill_provider_mapping'),
+ mock.patch('nova.network.neutron.API.'
+ 'get_requested_resource_for_instance',
+ return_value=[])
+ ) as (sig_mock, select_dest_mock, reset_fd,
+ fill_rp_mapping_mock, get_req_res_mock):
+ ex = self.assertRaises(exc.AcceleratorRequestBindingFailed,
+ self.conductor_manager.rebuild_instance,
+ context=self.context, instance=inst_obj,
+ **rebuild_args)
+ reset_fd.assert_called_once_with()
+ # The RequestSpec.ignore_hosts field should be overwritten.
+ self.assertEqual([inst_obj.host], fake_spec.ignore_hosts)
+ # The RequestSpec.requested_destination.cell field should be set.
+ self.assertIn('requested_destination', fake_spec)
+ self.assertIn('cell', fake_spec.requested_destination)
+ self.assertIsNotNone(fake_spec.requested_destination.cell)
+ select_dest_mock.assert_called_once_with(self.context,
+ fake_spec, [inst_obj.uuid], return_objects=True,
+ return_alternates=False)
+ compute_args['host'] = expected_host
+ compute_args['request_spec'] = fake_spec
+ get_req_res_mock.assert_called_once_with(
+ self.context, inst_obj.uuid)
+ fill_rp_mapping_mock.assert_called_once_with(
+ fake_spec, fake_selection)
+ self.assertIn('Failed to bind accelerator requests', ex.message)
+ mock_del_arqs.assert_called_with(arqs)
+ mock_del_arqs_instance.assert_called_once()
+
+ self.assertEqual('compute.instance.rebuild.scheduled',
+ fake_notifier.NOTIFICATIONS[0].event_type)
+ mock_notify.assert_called_once_with(
+ self.context, inst_obj, 'thebesthost', action='rebuild_scheduled',
+ source='nova-conductor')
+
+ @mock.patch(
'nova.conductor.tasks.cross_cell_migrate.ConfirmResizeTask.execute')
def test_confirm_snapshot_based_resize(self, mock_execute):
instance = self._create_fake_instance_obj(ctxt=self.context)
diff --git a/releasenotes/notes/bp-cyborg-rebuild-and-evacuate-97bba59988b8b072.yaml b/releasenotes/notes/bp-cyborg-rebuild-and-evacuate-97bba59988b8b072.yaml
new file mode 100644
index 0000000000..7f31811760
--- /dev/null
+++ b/releasenotes/notes/bp-cyborg-rebuild-and-evacuate-97bba59988b8b072.yaml
@@ -0,0 +1,12 @@
+---
+features:
+ - |
+ Add Cyborg shelve/unshelve support.
+
+ After a shelve, the ARQs remain bound to the instance.
+
+ After a shelve offload, the ARQs of the instance are freed
+ in Cyborg.
+
+ During an unshelve, the ARQs are re-created and bound to
+ the instance if needed.