-rw-r--r--  .zuul.yaml | 30
-rw-r--r--  doc/source/admin/secure-live-migration-with-qemu-native-tls.rst | 11
-rw-r--r--  nova/api/openstack/compute/aggregates.py | 41
-rw-r--r--  nova/compute/manager.py | 110
-rw-r--r--  nova/db/sqlalchemy/api.py | 25
-rw-r--r--  nova/exception.py | 5
-rw-r--r--  nova/image/glance.py | 2
-rw-r--r--  nova/monkey_patch.py | 23
-rw-r--r--  nova/network/neutronv2/api.py | 8
-rw-r--r--  nova/tests/functional/db/test_archive.py | 15
-rwxr-xr-x  nova/tests/live_migration/hooks/ceph.sh | 1
-rwxr-xr-x  nova/tests/live_migration/hooks/run_tests.sh | 5
-rw-r--r--  nova/tests/unit/api/openstack/compute/test_aggregates.py | 65
-rw-r--r--  nova/tests/unit/compute/test_compute_mgr.py | 112
-rw-r--r--  nova/tests/unit/db/test_db_api.py | 8
-rw-r--r--  nova/tests/unit/image/test_glance.py | 5
-rw-r--r--  releasenotes/notes/bug-1821755-7bd03319e34b6b10.yaml | 11
-rw-r--r--  tox.ini | 10
18 files changed, 423 insertions, 64 deletions
diff --git a/.zuul.yaml b/.zuul.yaml
index 971c406e18..8149f71bc1 100644
--- a/.zuul.yaml
+++ b/.zuul.yaml
@@ -105,6 +105,17 @@
timeout: 3600
- job:
+ name: nova-tox-validate-backport
+ parent: openstack-tox
+ description: |
+ Determine whether a backport is ready to be merged by checking whether it
+ has already been merged to master or more recent stable branches.
+
+ Uses tox with the ``validate-backport`` environment.
+ vars:
+ tox_envlist: validate-backport
+
+- job:
name: nova-cells-v1
parent: nova-dsvm-base
run: playbooks/legacy/nova-cells-v1/run.yaml
@@ -267,6 +278,8 @@
- nova-next
- nova-tox-functional
- nova-tox-functional-py35
+ - nova-tox-validate-backport:
+ voting: false
- tempest-full-py3:
irrelevant-files: *dsvm-irrelevant-files
- tempest-multinode-full:
@@ -274,6 +287,14 @@
irrelevant-files: *dsvm-irrelevant-files
- tempest-slow-py3:
irrelevant-files: *dsvm-irrelevant-files
+ vars:
+ devstack_localrc:
+ # To workaround https://bugs.launchpad.net/neutron/+bug/1914037
+ # tempest is pinned to 26.0.0 on stable/stein in devstack, so
+ # we cannot consume the tempest fix.
+ IPV6_PUBLIC_RANGE: 2001:db8:0:10::/64
+ IPV6_PUBLIC_NETWORK_GATEWAY: 2001:db8:0:10::2
+ IPV6_ROUTER_GW_IP: 2001:db8:0:10::1
- grenade-py3:
irrelevant-files: *dsvm-irrelevant-files
gate:
@@ -283,10 +304,19 @@
- nova-tox-functional
- nova-tox-functional-py35
- nova-next
+ - nova-tox-validate-backport
- tempest-full-py3:
irrelevant-files: *dsvm-irrelevant-files
- tempest-slow-py3:
irrelevant-files: *dsvm-irrelevant-files
+ vars:
+ devstack_localrc:
+ # To workaround https://bugs.launchpad.net/neutron/+bug/1914037
+ # tempest is pinned to 26.0.0 on stable/stein in devstack, so
+ # we cannot consume the tempest fix.
+ IPV6_PUBLIC_RANGE: 2001:db8:0:10::/64
+ IPV6_PUBLIC_NETWORK_GATEWAY: 2001:db8:0:10::2
+ IPV6_ROUTER_GW_IP: 2001:db8:0:10::1
- grenade-py3:
irrelevant-files: *dsvm-irrelevant-files
experimental:
diff --git a/doc/source/admin/secure-live-migration-with-qemu-native-tls.rst b/doc/source/admin/secure-live-migration-with-qemu-native-tls.rst
index 012d78e93b..fb76f656af 100644
--- a/doc/source/admin/secure-live-migration-with-qemu-native-tls.rst
+++ b/doc/source/admin/secure-live-migration-with-qemu-native-tls.rst
@@ -120,10 +120,13 @@ Performing the migration
(1) On all relevant compute nodes, enable the
:oslo.config:option:`libvirt.live_migration_with_native_tls`
- configuration attribute::
+ configuration attribute and set the
+ :oslo.config:option:`libvirt.live_migration_scheme`
+ configuration attribute to tls::
[libvirt]
live_migration_with_native_tls = true
+ live_migration_scheme = tls
.. note::
Setting both
@@ -131,6 +134,12 @@ Performing the migration
:oslo.config:option:`libvirt.live_migration_tunnelled` at the
same time is invalid (and disallowed).
+ .. note::
+ Not setting
+ :oslo.config:option:`libvirt.live_migration_scheme` to ``tls``
+ will result in libvirt using the unencrypted TCP connection
+ without displaying any error or a warning in the logs.
+
And restart the ``nova-compute`` service::
$ systemctl restart openstack-nova-compute
diff --git a/nova/api/openstack/compute/aggregates.py b/nova/api/openstack/compute/aggregates.py
index e44b449e76..619fecfb37 100644
--- a/nova/api/openstack/compute/aggregates.py
+++ b/nova/api/openstack/compute/aggregates.py
@@ -28,6 +28,7 @@ from nova.compute import api as compute_api
from nova import exception
from nova.i18n import _
from nova.policies import aggregates as aggr_policies
+from nova import utils
def _get_context(req):
@@ -84,11 +85,17 @@ class AggregateController(wsgi.Controller):
return agg
- @wsgi.expected_errors(404)
+ @wsgi.expected_errors((400, 404))
def show(self, req, id):
"""Shows the details of an aggregate, hosts and metadata included."""
context = _get_context(req)
context.can(aggr_policies.POLICY_ROOT % 'show')
+
+ try:
+ utils.validate_integer(id, 'id')
+ except exception.InvalidInput as e:
+ raise exc.HTTPBadRequest(explanation=e.format_message())
+
try:
aggregate = self.api.get_aggregate(context, id)
except exception.AggregateNotFound as e:
@@ -107,6 +114,11 @@ class AggregateController(wsgi.Controller):
updates['name'] = common.normalize_name(updates['name'])
try:
+ utils.validate_integer(id, 'id')
+ except exception.InvalidInput as e:
+ raise exc.HTTPBadRequest(explanation=e.format_message())
+
+ try:
aggregate = self.api.update_aggregate(context, id, updates)
except exception.AggregateNameExists as e:
raise exc.HTTPConflict(explanation=e.format_message())
@@ -125,6 +137,12 @@ class AggregateController(wsgi.Controller):
"""Removes an aggregate by id."""
context = _get_context(req)
context.can(aggr_policies.POLICY_ROOT % 'delete')
+
+ try:
+ utils.validate_integer(id, 'id')
+ except exception.InvalidInput as e:
+ raise exc.HTTPBadRequest(explanation=e.format_message())
+
try:
self.api.delete_aggregate(context, id)
except exception.AggregateNotFound as e:
@@ -135,7 +153,7 @@ class AggregateController(wsgi.Controller):
# NOTE(gmann): Returns 200 for backwards compatibility but should be 202
# for representing async API as this API just accepts the request and
# request hypervisor driver to complete the same in async mode.
- @wsgi.expected_errors((404, 409))
+ @wsgi.expected_errors((400, 404, 409))
@wsgi.action('add_host')
@validation.schema(aggregates.add_host)
def _add_host(self, req, id, body):
@@ -144,6 +162,12 @@ class AggregateController(wsgi.Controller):
context = _get_context(req)
context.can(aggr_policies.POLICY_ROOT % 'add_host')
+
+ try:
+ utils.validate_integer(id, 'id')
+ except exception.InvalidInput as e:
+ raise exc.HTTPBadRequest(explanation=e.format_message())
+
try:
aggregate = self.api.add_host_to_aggregate(context, id, host)
except (exception.AggregateNotFound,
@@ -158,7 +182,7 @@ class AggregateController(wsgi.Controller):
# NOTE(gmann): Returns 200 for backwards compatibility but should be 202
# for representing async API as this API just accepts the request and
# request hypervisor driver to complete the same in async mode.
- @wsgi.expected_errors((404, 409))
+ @wsgi.expected_errors((400, 404, 409))
@wsgi.action('remove_host')
@validation.schema(aggregates.remove_host)
def _remove_host(self, req, id, body):
@@ -167,6 +191,12 @@ class AggregateController(wsgi.Controller):
context = _get_context(req)
context.can(aggr_policies.POLICY_ROOT % 'remove_host')
+
+ try:
+ utils.validate_integer(id, 'id')
+ except exception.InvalidInput as e:
+ raise exc.HTTPBadRequest(explanation=e.format_message())
+
try:
aggregate = self.api.remove_host_from_aggregate(context, id, host)
except (exception.AggregateNotFound, exception.AggregateHostNotFound,
@@ -188,6 +218,11 @@ class AggregateController(wsgi.Controller):
context = _get_context(req)
context.can(aggr_policies.POLICY_ROOT % 'set_metadata')
+ try:
+ utils.validate_integer(id, 'id')
+ except exception.InvalidInput as e:
+ raise exc.HTTPBadRequest(explanation=e.format_message())
+
metadata = body["set_metadata"]["metadata"]
try:
aggregate = self.api.update_aggregate_metadata(context,
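
The aggregates.py hunks above all apply the same guard: reject a non-integer aggregate id with HTTP 400 before touching the database. A minimal standalone sketch of that pattern, with a simplified stand-in for nova.utils.validate_integer (the helper and return shape below are illustrative, not nova's implementation)::

    # Validate the id up front so a bogus value becomes a 400 Bad Request
    # rather than an error deeper in the stack. validate_integer() here is a
    # simplified stand-in; nova's helper raises exception.InvalidInput.
    def validate_integer(value, name):
        try:
            return int(value)
        except (TypeError, ValueError):
            raise ValueError("%s must be an integer, got %r" % (name, value))


    def show_aggregate(aggregate_id):
        try:
            agg_id = validate_integer(aggregate_id, 'id')
        except ValueError as e:
            # The controller raises webob.exc.HTTPBadRequest here (HTTP 400).
            return {'badRequest': str(e)}, 400
        # ... normal lookup continues, e.g. api.get_aggregate(context, agg_id)
        return {'aggregate': {'id': agg_id}}, 200


    assert show_aggregate('foo')[1] == 400
    assert show_aggregate('1')[1] == 200
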
diff --git a/nova/compute/manager.py b/nova/compute/manager.py
index c7a0f37e92..1c29aa278f 100644
--- a/nova/compute/manager.py
+++ b/nova/compute/manager.py
@@ -1474,7 +1474,11 @@ class ComputeManager(manager.Manager):
return [_decode(f) for f in injected_files]
def _validate_instance_group_policy(self, context, instance,
- scheduler_hints):
+ scheduler_hints=None):
+
+ if CONF.workarounds.disable_group_policy_check_upcall:
+ return
+
# NOTE(russellb) Instance group policy is enforced by the scheduler.
# However, there is a race condition with the enforcement of
# the policy. Since more than one instance may be scheduled at the
@@ -1483,29 +1487,63 @@ class ComputeManager(manager.Manager):
# multiple instances with an affinity policy could end up on different
# hosts. This is a validation step to make sure that starting the
# instance here doesn't violate the policy.
- group_hint = scheduler_hints.get('group')
- if not group_hint:
- return
-
- # The RequestSpec stores scheduler_hints as key=list pairs so we need
- # to check the type on the value and pull the single entry out. The
- # API request schema validates that the 'group' hint is a single value.
- if isinstance(group_hint, list):
- group_hint = group_hint[0]
+ if scheduler_hints is not None:
+ # only go through here if scheduler_hints is provided, even if it
+ # is empty.
+ group_hint = scheduler_hints.get('group')
+ if not group_hint:
+ return
+ else:
+ # The RequestSpec stores scheduler_hints as key=list pairs so
+ # we need to check the type on the value and pull the single
+ # entry out. The API request schema validates that
+ # the 'group' hint is a single value.
+ if isinstance(group_hint, list):
+ group_hint = group_hint[0]
+
+ group = objects.InstanceGroup.get_by_hint(context, group_hint)
+ else:
+ # TODO(ganso): a call to DB can be saved by adding request_spec
+ # to rpcapi payload of live_migration, pre_live_migration and
+ # check_can_live_migrate_destination
+ try:
+ group = objects.InstanceGroup.get_by_instance_uuid(
+ context, instance.uuid)
+ except exception.InstanceGroupNotFound:
+ return
- @utils.synchronized(group_hint)
- def _do_validation(context, instance, group_hint):
- group = objects.InstanceGroup.get_by_hint(context, group_hint)
+ @utils.synchronized(group['uuid'])
+ def _do_validation(context, instance, group):
if group.policy and 'anti-affinity' == group.policy:
+
+ # instances on host
instances_uuids = objects.InstanceList.get_uuids_by_host(
context, self.host)
ins_on_host = set(instances_uuids)
+
+ # instance param is just for logging, the nodename obtained is
+ # not actually related to the instance at all
+ nodename = self._get_nodename(instance)
+
+ # instances being migrated to host
+ migrations = (
+ objects.MigrationList.get_in_progress_by_host_and_node(
+ context, self.host, nodename))
+ migration_vm_uuids = set([mig['instance_uuid']
+ for mig in migrations])
+
+ total_instances = migration_vm_uuids | ins_on_host
+
+ # refresh group to get updated members within locked block
+ group = objects.InstanceGroup.get_by_uuid(context,
+ group['uuid'])
members = set(group.members)
# Determine the set of instance group members on this host
# which are not the instance in question. This is used to
# determine how many other members from the same anti-affinity
# group can be on this host.
- members_on_host = ins_on_host & members - set([instance.uuid])
+ members_on_host = (total_instances & members -
+ set([instance.uuid]))
rules = group.rules
if rules and 'max_server_per_host' in rules:
max_server = rules['max_server_per_host']
@@ -1517,6 +1555,12 @@ class ComputeManager(manager.Manager):
raise exception.RescheduledException(
instance_uuid=instance.uuid,
reason=msg)
+
+ # NOTE(ganso): The check for affinity below does not work and it
+ # can easily be violated because the lock happens in different
+ # compute hosts.
+ # The only fix seems to be a DB lock to perform the check whenever
+ # setting the host field to an instance.
elif group.policy and 'affinity' == group.policy:
group_hosts = group.get_hosts(exclude=[instance.uuid])
if group_hosts and self.host not in group_hosts:
@@ -1525,8 +1569,7 @@ class ComputeManager(manager.Manager):
instance_uuid=instance.uuid,
reason=msg)
- if not CONF.workarounds.disable_group_policy_check_upcall:
- _do_validation(context, instance, group_hint)
+ _do_validation(context, instance, group)
def _log_original_error(self, exc_info, instance_uuid):
LOG.error('Error: %s', exc_info[1], instance_uuid=instance_uuid,
@@ -4646,10 +4689,24 @@ class ComputeManager(manager.Manager):
with self._error_out_instance_on_exception(
context, instance, instance_state=instance_state),\
errors_out_migration_ctxt(migration):
+
self._send_prep_resize_notifications(
context, instance, fields.NotificationPhase.START,
instance_type)
try:
+ scheduler_hints = self._get_scheduler_hints(filter_properties,
+ request_spec)
+ # Error out if this host cannot accept the new instance due
+ # to anti-affinity. At this point the migration is already
+ # in-progress, so this is the definitive moment to abort due to
+ # the policy violation. Also, exploding here is covered by the
+ # cleanup methods in except block.
+ try:
+ self._validate_instance_group_policy(context, instance,
+ scheduler_hints)
+ except exception.RescheduledException as e:
+ raise exception.InstanceFaultRollback(inner_exception=e)
+
self._prep_resize(context, image, instance,
instance_type, filter_properties,
node, migration, clean_shutdown)
@@ -6505,6 +6562,20 @@ class ComputeManager(manager.Manager):
if None, ignore disk usage checking
:returns: a dict containing migration info
"""
+
+ # Error out if this host cannot accept the new instance due
+ # to anti-affinity. This check at this moment is not very accurate, as
+ # multiple requests may be happening concurrently and miss the lock,
+ # but when it works it provides a better user experience by failing
+ # earlier. Also, it should be safe to explode here, error becomes
+ # NoValidHost and instance status remains ACTIVE.
+ try:
+ self._validate_instance_group_policy(ctxt, instance)
+ except exception.RescheduledException as e:
+ msg = ("Failed to validate instance group policy "
+ "due to: {}".format(e))
+ raise exception.MigrationPreCheckError(reason=msg)
+
src_compute_info = obj_base.obj_to_primitive(
self._get_compute_info(ctxt, instance.host))
dst_compute_info = obj_base.obj_to_primitive(
@@ -6567,6 +6638,13 @@ class ComputeManager(manager.Manager):
"""
LOG.debug('pre_live_migration data is %s', migrate_data)
+ # Error out if this host cannot accept the new instance due
+ # to anti-affinity. At this point the migration is already in-progress,
+ # so this is the definitive moment to abort due to the policy
+ # violation. Also, it should be safe to explode here. The instance
+ # status remains ACTIVE, migration status failed.
+ self._validate_instance_group_policy(context, instance)
+
migrate_data.old_vol_attachment_ids = {}
bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
context, instance.uuid)
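
The reworked _validate_instance_group_policy above now counts both instances already on the host and instances migrating to it before checking anti-affinity membership. A hedged standalone sketch of that membership check (the function name and data shapes are simplified assumptions, not nova's objects)::

    # Instances resident on the host are combined with instances currently
    # migrating to it, and the union is compared against the server group's
    # membership (minus the instance being validated).
    def violates_anti_affinity(group_members, instances_on_host,
                               migrating_in_uuids, this_instance_uuid,
                               max_server_per_host=1):
        total_on_host = set(instances_on_host) | set(migrating_in_uuids)
        members_on_host = (total_on_host &
                           (set(group_members) - {this_instance_uuid}))
        return len(members_on_host) >= max_server_per_host


    # One member already resident and one migrating in: a further
    # anti-affinity member cannot land here when max_server_per_host is 1.
    assert violates_anti_affinity(group_members={'a', 'b', 'c'},
                                  instances_on_host={'a'},
                                  migrating_in_uuids={'c'},
                                  this_instance_uuid='b')
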
diff --git a/nova/db/sqlalchemy/api.py b/nova/db/sqlalchemy/api.py
index a0535aa2a5..a4d9be0271 100644
--- a/nova/db/sqlalchemy/api.py
+++ b/nova/db/sqlalchemy/api.py
@@ -48,6 +48,7 @@ from sqlalchemy.orm import aliased
from sqlalchemy.orm import contains_eager
from sqlalchemy.orm import joinedload
from sqlalchemy.orm import noload
+from sqlalchemy.orm import subqueryload
from sqlalchemy.orm import undefer
from sqlalchemy.schema import Table
from sqlalchemy import sql
@@ -1936,13 +1937,27 @@ def _build_instance_get(context, columns_to_join=None):
continue
if 'extra.' in column:
query = query.options(undefer(column))
+ elif column in ['metadata', 'system_metadata']:
+ # NOTE(melwitt): We use subqueryload() instead of joinedload() for
+ # metadata and system_metadata because of the one-to-many
+ # relationship of the data. Directly joining these columns can
+ # result in a large number of additional rows being queried if an
+ # instance has a large number of (system_)metadata items, resulting
+ # in a large data transfer. Instead, the subqueryload() will
+ # perform additional queries to obtain metadata and system_metadata
+ # for the instance.
+ query = query.options(subqueryload(column))
else:
query = query.options(joinedload(column))
# NOTE(alaski) Stop lazy loading of columns not needed.
for col in ['metadata', 'system_metadata']:
if col not in columns_to_join:
query = query.options(noload(col))
- return query
+ # NOTE(melwitt): We need to use order_by(<unique column>) so that the
+ # additional queries emitted by subqueryload() include the same ordering as
+ # used by the parent query.
+ # https://docs.sqlalchemy.org/en/13/orm/loading_relationships.html#the-importance-of-ordering
+ return query.order_by(models.Instance.id)
def _instances_fill_metadata(context, instances, manual_joins=None):
@@ -5625,8 +5640,12 @@ def _archive_deleted_rows_for_table(tablename, max_rows):
"%(tablename)s: %(error)s",
{'tablename': tablename, 'error': six.text_type(ex)})
- if ((max_rows is None or rows_archived < max_rows)
- and 'instance_uuid' in columns):
+ if ((max_rows is None or rows_archived < max_rows) and
+ # NOTE(melwitt): The pci_devices table uses the 'instance_uuid'
+ # column to track the allocated association of a PCI device and its
+ # records are not tied to the lifecycles of instance records.
+ (tablename != 'pci_devices' and
+ 'instance_uuid' in columns)):
instances = models.BASE.metadata.tables['instances']
limit = max_rows - rows_archived if max_rows is not None else None
extra = _archive_if_instance_deleted(table, shadow_table, instances,
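
For readers unfamiliar with the loading strategies discussed in the NOTE above, a minimal SQLAlchemy sketch of the joinedload()-versus-subqueryload() trade-off, using a hypothetical two-table model rather than nova's schema::

    # joinedload() fetches parent and children in one query, so a parent with
    # many children transfers many duplicated parent rows; subqueryload()
    # issues a second, smaller query instead, which is why the hunk above
    # switches metadata/system_metadata to it and adds a deterministic
    # ORDER BY on the parent query.
    from sqlalchemy import Column, ForeignKey, Integer, String, create_engine
    from sqlalchemy.ext.declarative import declarative_base
    from sqlalchemy.orm import Session, joinedload, relationship, subqueryload

    Base = declarative_base()


    class Instance(Base):
        __tablename__ = 'instances'
        id = Column(Integer, primary_key=True)
        metadata_items = relationship('MetadataItem')


    class MetadataItem(Base):
        __tablename__ = 'metadata_items'
        id = Column(Integer, primary_key=True)
        instance_id = Column(Integer, ForeignKey('instances.id'))
        key = Column(String(255))


    engine = create_engine('sqlite://')
    Base.metadata.create_all(engine)
    session = Session(engine)
    session.add(Instance(id=1, metadata_items=[MetadataItem(key=str(i))
                                               for i in range(100)]))
    session.commit()

    # One query, ~100 joined rows transferred for a single parent.
    joined = session.query(Instance).options(
        joinedload(Instance.metadata_items)).all()

    # Two queries, each returning only the rows actually needed; ordered so
    # the child query lines up with the parent query.
    sub = session.query(Instance).options(
        subqueryload(Instance.metadata_items)).order_by(Instance.id).all()

    assert len(joined) == len(sub) == 1
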
diff --git a/nova/exception.py b/nova/exception.py
index 11d26f55fa..1679a74f80 100644
--- a/nova/exception.py
+++ b/nova/exception.py
@@ -613,6 +613,11 @@ class ImageBadRequest(Invalid):
"%(response)s")
+class ImageQuotaExceeded(NovaException):
+ msg_fmt = _("Quota exceeded or out of space for image %(image_id)s "
+ "in the image service.")
+
+
class InstanceUnacceptable(Invalid):
msg_fmt = _("Instance %(instance_id)s is unacceptable: %(reason)s")
diff --git a/nova/image/glance.py b/nova/image/glance.py
index 39970d0498..459699531a 100644
--- a/nova/image/glance.py
+++ b/nova/image/glance.py
@@ -954,6 +954,8 @@ def _translate_image_exception(image_id, exc_value):
if isinstance(exc_value, glanceclient.exc.BadRequest):
return exception.ImageBadRequest(image_id=image_id,
response=six.text_type(exc_value))
+ if isinstance(exc_value, glanceclient.exc.HTTPOverLimit):
+ return exception.ImageQuotaExceeded(image_id=image_id)
return exc_value
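
The new mapping can be exercised in isolation; this short sketch mirrors the unit test added below and shows what callers of the translation helper now receive for an over-quota response (the image id is a placeholder)::

    # glanceclient's HTTPOverLimit now translates to nova's ImageQuotaExceeded
    # instead of leaking the raw client error to callers.
    import glanceclient.exc

    from nova import exception
    from nova.image import glance

    translated = glance._translate_image_exception(
        'some-image-id', glanceclient.exc.HTTPOverLimit('over quota'))
    assert isinstance(translated, exception.ImageQuotaExceeded)
    print(translated.format_message())
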
diff --git a/nova/monkey_patch.py b/nova/monkey_patch.py
index 6d0cacd6c7..be2a70ab4b 100644
--- a/nova/monkey_patch.py
+++ b/nova/monkey_patch.py
@@ -22,6 +22,19 @@ import os
def _monkey_patch():
+ # See https://bugs.launchpad.net/nova/+bug/1164822
+ # TODO(mdbooth): This feature was deprecated and removed in eventlet at
+ # some point but brought back in version 0.21.0, presumably because some
+ # users still required it to work round issues. However, there have been a
+ # number of greendns fixes in eventlet since then. Specifically, it looks
+ # as though the originally reported IPv6 issue may have been fixed in
+ # version 0.24.0. We should remove this when we can confirm that the
+ # original issue is fixed.
+ # NOTE(artom) eventlet processes environment variables at import-time. We
+ # therefore set this here, before importing eventlet, in order to correctly
+ # disable greendns.
+ os.environ['EVENTLET_NO_GREENDNS'] = 'yes'
+
# NOTE(mdbooth): Anything imported here will not be monkey patched. It is
# important to take care not to import anything here which requires monkey
# patching.
@@ -39,16 +52,6 @@ def _monkey_patch():
problems = (set(['urllib3', 'oslo_context.context']) &
set(sys.modules.keys()))
- # See https://bugs.launchpad.net/nova/+bug/1164822
- # TODO(mdbooth): This feature was deprecated and removed in eventlet at
- # some point but brought back in version 0.21.0, presumably because some
- # users still required it to work round issues. However, there have been a
- # number of greendns fixes in eventlet since then. Specifically, it looks
- # as though the originally reported IPv6 issue may have been fixed in
- # version 0.24.0. We should remove this when we can confirm that the
- # original issue is fixed.
- os.environ['EVENTLET_NO_GREENDNS'] = 'yes'
-
if debugger.enabled():
# turn off thread patching to enable the remote debugger
eventlet.monkey_patch(thread=False)
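
The ordering constraint described in the NOTE(artom) comment above can be shown with a toy script; only the EVENTLET_NO_GREENDNS variable name comes from the patch, the rest is illustrative::

    # eventlet reads EVENTLET_NO_GREENDNS when it is first imported, so the
    # variable has to be set before the import -- which is why the hunk above
    # moves the assignment to the top of _monkey_patch(), ahead of
    # "import eventlet".
    import os

    os.environ['EVENTLET_NO_GREENDNS'] = 'yes'  # must happen before the import

    import eventlet  # noqa: E402

    # Setting the variable after this point would have no effect on greendns;
    # the module-level decision has already been made.
    eventlet.monkey_patch()
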
diff --git a/nova/network/neutronv2/api.py b/nova/network/neutronv2/api.py
index 8a60291910..2f4e39b1cb 100644
--- a/nova/network/neutronv2/api.py
+++ b/nova/network/neutronv2/api.py
@@ -828,9 +828,15 @@ class API(base_api.NetworkAPI):
# TODO(arosen) Should optimize more to do direct query for security
# group if len(security_groups) == 1
if len(security_groups):
+ # NOTE(slaweq): fields other than name and id aren't really needed
+ # so asking only about those fields will allow Neutron to not
+ # prepare list of rules for each found security group. That may
+ # speed processing of this request a lot in case when tenant has
+ # got many security groups
+ sg_fields = ['id', 'name']
search_opts = {'tenant_id': instance.project_id}
user_security_groups = neutron.list_security_groups(
- **search_opts).get('security_groups')
+ fields=sg_fields, **search_opts).get('security_groups')
for security_group in security_groups:
name_match = None
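
A hedged sketch of the narrowed Neutron query (the wrapper function is an illustration; the fields filter is what the hunk above adds)::

    # Asking Neutron for only 'id' and 'name' spares it from serializing every
    # rule of every security group, which is the speed-up described in the
    # NOTE above. "neutron" is any python-neutronclient v2.0 Client instance
    # (construction not shown).
    def list_security_group_names(neutron, project_id):
        sg_fields = ['id', 'name']
        groups = neutron.list_security_groups(
            fields=sg_fields, tenant_id=project_id).get('security_groups')
        return {sg['id']: sg['name'] for sg in groups}
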
diff --git a/nova/tests/functional/db/test_archive.py b/nova/tests/functional/db/test_archive.py
index 432d814d91..a8b4df6f16 100644
--- a/nova/tests/functional/db/test_archive.py
+++ b/nova/tests/functional/db/test_archive.py
@@ -135,6 +135,19 @@ class TestDatabaseArchive(test_servers.ServersTestBase):
# Verify we have some system_metadata since we'll check that later.
self.assertTrue(len(instance.system_metadata),
'No system_metadata for instance: %s' % server_id)
+ # Create a pci_devices record to simulate an instance that had a PCI
+ # device allocated at the time it was deleted. There is a window of
+ # time between deletion of the instance record and freeing of the PCI
+ # device in nova-compute's _complete_deletion method during RT update.
+ db.pci_device_update(admin_context, 1, 'fake-address',
+ {'compute_node_id': 1,
+ 'address': 'fake-address',
+ 'vendor_id': 'fake',
+ 'product_id': 'fake',
+ 'dev_type': 'fake',
+ 'label': 'fake',
+ 'status': 'allocated',
+ 'instance_uuid': instance.uuid})
# Now try and archive the soft deleted records.
results, deleted_instance_uuids = db.archive_deleted_rows(max_rows=100)
# verify system_metadata was dropped
@@ -147,6 +160,8 @@ class TestDatabaseArchive(test_servers.ServersTestBase):
# by the archive
self.assertIn('instance_actions', results)
self.assertIn('instance_actions_events', results)
+ # Verify that the pci_devices record has not been dropped
+ self.assertNotIn('pci_devices', results)
def _get_table_counts(self):
engine = sqlalchemy_api.get_engine()
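
The exclusion this functional test exercises reduces to a small predicate; the following standalone restatement of the condition added in nova/db/sqlalchemy/api.py uses an illustrative function name::

    # Tables referencing instance_uuid are normally archived along with their
    # deleted instance, but pci_devices uses instance_uuid to track an
    # allocation rather than an instance lifecycle, so it is skipped.
    def archives_with_deleted_instance(tablename, columns, rows_archived,
                                       max_rows):
        under_limit = max_rows is None or rows_archived < max_rows
        tied_to_instance = (tablename != 'pci_devices' and
                            'instance_uuid' in columns)
        return under_limit and tied_to_instance


    assert archives_with_deleted_instance(
        'instance_actions', ['instance_uuid'], 0, 100)
    assert not archives_with_deleted_instance(
        'pci_devices', ['instance_uuid'], 0, 100)
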
diff --git a/nova/tests/live_migration/hooks/ceph.sh b/nova/tests/live_migration/hooks/ceph.sh
index ff94967748..2a2ec0202c 100755
--- a/nova/tests/live_migration/hooks/ceph.sh
+++ b/nova/tests/live_migration/hooks/ceph.sh
@@ -8,6 +8,7 @@ function prepare_ceph {
configure_ceph
#install ceph-common package on compute nodes
$ANSIBLE subnodes --become -f 5 -i "$WORKSPACE/inventory" -m raw -a "executable=/bin/bash
+ export CEPH_RELEASE=nautilus
source $BASE/new/devstack/functions
source $BASE/new/devstack/functions-common
git clone https://git.openstack.org/openstack/devstack-plugin-ceph /tmp/devstack-plugin-ceph
diff --git a/nova/tests/live_migration/hooks/run_tests.sh b/nova/tests/live_migration/hooks/run_tests.sh
index ff005520b8..0f7c5f0a3f 100755
--- a/nova/tests/live_migration/hooks/run_tests.sh
+++ b/nova/tests/live_migration/hooks/run_tests.sh
@@ -46,6 +46,11 @@ echo '2. NFS testing is skipped due to setup failures with Ubuntu 16.04'
echo '3. test with Ceph for root + ephemeral disks'
# Discover and set variables for the OS version so the devstack-plugin-ceph
# scripts can find the correct repository to install the ceph packages.
+# NOTE(lyarwood): Pin the CEPH_RELEASE to nautilus here as was the case
+# prior to https://review.opendev.org/c/openstack/devstack-plugin-ceph/+/777232
+# landing in the branchless plugin, we also have to pin in ceph.sh when
+# configuring ceph on a remote node via ansible.
+export CEPH_RELEASE=nautilus
GetOSVersion
prepare_ceph
GLANCE_API_CONF=${GLANCE_API_CONF:-/etc/glance/glance-api.conf}
diff --git a/nova/tests/unit/api/openstack/compute/test_aggregates.py b/nova/tests/unit/api/openstack/compute/test_aggregates.py
index eb185849a6..c14f562dc7 100644
--- a/nova/tests/unit/api/openstack/compute/test_aggregates.py
+++ b/nova/tests/unit/api/openstack/compute/test_aggregates.py
@@ -299,7 +299,7 @@ class AggregateTestCaseV21(test.NoDBTestCase):
self.controller.show,
self.user_req, "1")
- def test_show_with_invalid_id(self):
+ def test_show_with_bad_aggregate(self):
side_effect = exception.AggregateNotFound(aggregate_id='2')
with mock.patch.object(self.controller.api, 'get_aggregate',
side_effect=side_effect) as mock_get:
@@ -307,6 +307,10 @@ class AggregateTestCaseV21(test.NoDBTestCase):
self.req, "2")
mock_get.assert_called_once_with(self.context, '2')
+ def test_show_with_invalid_id(self):
+ self.assertRaises(exc.HTTPBadRequest, self.controller.show,
+ self.req, 'foo')
+
def test_update(self):
body = {"aggregate": {"name": "new_name",
"availability_zone": "nova1"}}
@@ -390,10 +394,11 @@ class AggregateTestCaseV21(test.NoDBTestCase):
@mock.patch('nova.compute.api.AggregateAPI.update_aggregate')
def test_update_with_none_availability_zone(self, mock_update_agg):
- agg_id = uuidsentinel.aggregate
+ agg_id = 173
mock_update_agg.return_value = objects.Aggregate(self.context,
name='test',
- uuid=agg_id,
+ uuid=uuidsentinel.agg,
+ id=agg_id,
hosts=[],
metadata={})
body = {"aggregate": {"name": "test",
@@ -414,6 +419,11 @@ class AggregateTestCaseV21(test.NoDBTestCase):
mock_update.assert_called_once_with(self.context, '2',
body["aggregate"])
+ def test_update_with_invalid_id(self):
+ body = {"aggregate": {"name": "test_name"}}
+ self.assertRaises(exc.HTTPBadRequest, self.controller.update,
+ self.req, 'foo', body=body)
+
def test_update_with_duplicated_name(self):
body = {"aggregate": {"name": "test_name"}}
side_effect = exception.AggregateNameExists(aggregate_name="test_name")
@@ -433,7 +443,7 @@ class AggregateTestCaseV21(test.NoDBTestCase):
def test_update_with_invalid_action(self):
with mock.patch.object(self.controller.api, "update_aggregate",
side_effect=exception.InvalidAggregateAction(
- action='invalid', aggregate_id='agg1', reason= "not empty")):
+ action='invalid', aggregate_id='1', reason= "not empty")):
body = {"aggregate": {"availability_zone": "nova"}}
self.assertRaises(exc.HTTPBadRequest, self.controller.update,
self.req, "1", body=body)
@@ -467,15 +477,20 @@ class AggregateTestCaseV21(test.NoDBTestCase):
def test_add_host_with_bad_aggregate(self):
side_effect = exception.AggregateNotFound(
- aggregate_id="bogus_aggregate")
+ aggregate_id="2")
with mock.patch.object(self.controller.api, 'add_host_to_aggregate',
side_effect=side_effect) as mock_add:
self.assertRaises(exc.HTTPNotFound, eval(self.add_host),
- self.req, "bogus_aggregate",
+ self.req, "2",
body={"add_host": {"host": "host1"}})
- mock_add.assert_called_once_with(self.context, "bogus_aggregate",
+ mock_add.assert_called_once_with(self.context, "2",
"host1")
+ def test_add_host_with_invalid_id(self):
+ body = {"add_host": {"host": "host1"}}
+ self.assertRaises(exc.HTTPBadRequest, eval(self.add_host),
+ self.req, 'foo', body=body)
+
def test_add_host_with_bad_host(self):
side_effect = exception.ComputeHostNotFound(host="bogus_host")
with mock.patch.object(self.controller.api, 'add_host_to_aggregate',
@@ -534,16 +549,21 @@ class AggregateTestCaseV21(test.NoDBTestCase):
def test_remove_host_with_bad_aggregate(self):
side_effect = exception.AggregateNotFound(
- aggregate_id="bogus_aggregate")
+ aggregate_id="2")
with mock.patch.object(self.controller.api,
'remove_host_from_aggregate',
side_effect=side_effect) as mock_rem:
self.assertRaises(exc.HTTPNotFound, eval(self.remove_host),
- self.req, "bogus_aggregate",
+ self.req, "2",
body={"remove_host": {"host": "host1"}})
- mock_rem.assert_called_once_with(self.context, "bogus_aggregate",
+ mock_rem.assert_called_once_with(self.context, "2",
"host1")
+ def test_remove_host_with_invalid_id(self):
+ body = {"remove_host": {"host": "host1"}}
+ self.assertRaises(exc.HTTPBadRequest, eval(self.remove_host),
+ self.req, 'foo', body=body)
+
def test_remove_host_with_host_not_in_aggregate(self):
side_effect = exception.AggregateHostNotFound(aggregate_id="1",
host="host1")
@@ -621,16 +641,21 @@ class AggregateTestCaseV21(test.NoDBTestCase):
def test_set_metadata_with_bad_aggregate(self):
body = {"set_metadata": {"metadata": {"foo": "bar"}}}
- side_effect = exception.AggregateNotFound(aggregate_id="bad_aggregate")
+ side_effect = exception.AggregateNotFound(aggregate_id="2")
with mock.patch.object(self.controller.api,
'update_aggregate_metadata',
side_effect=side_effect) as mock_update:
self.assertRaises(exc.HTTPNotFound, eval(self.set_metadata),
- self.req, "bad_aggregate", body=body)
- mock_update.assert_called_once_with(self.context, "bad_aggregate",
+ self.req, "2", body=body)
+ mock_update.assert_called_once_with(self.context, "2",
body["set_metadata"]['metadata'])
+ def test_set_metadata_with_invalid_id(self):
+ body = {"set_metadata": {"metadata": {"foo": "bar"}}}
+ self.assertRaises(exc.HTTPBadRequest, eval(self.set_metadata),
+ self.req, 'foo', body=body)
+
def test_set_metadata_with_missing_metadata(self):
body = {"asdf": {"foo": "bar"}}
self.assertRaises(self.bad_request, eval(self.set_metadata),
@@ -679,21 +704,25 @@ class AggregateTestCaseV21(test.NoDBTestCase):
def test_delete_aggregate_with_bad_aggregate(self):
side_effect = exception.AggregateNotFound(
- aggregate_id="bogus_aggregate")
+ aggregate_id="2")
with mock.patch.object(self.controller.api, 'delete_aggregate',
side_effect=side_effect) as mock_del:
self.assertRaises(exc.HTTPNotFound, self.controller.delete,
- self.req, "bogus_aggregate")
- mock_del.assert_called_once_with(self.context, "bogus_aggregate")
+ self.req, "2")
+ mock_del.assert_called_once_with(self.context, "2")
+
+ def test_delete_with_invalid_id(self):
+ self.assertRaises(exc.HTTPBadRequest, self.controller.delete,
+ self.req, 'foo')
def test_delete_aggregate_with_host(self):
with mock.patch.object(self.controller.api, "delete_aggregate",
side_effect=exception.InvalidAggregateAction(
- action="delete", aggregate_id="agg1",
+ action="delete", aggregate_id="2",
reason="not empty")):
self.assertRaises(exc.HTTPBadRequest,
self.controller.delete,
- self.req, "agg1")
+ self.req, "2")
def test_marshall_aggregate(self):
# _marshall_aggregate() just basically turns the aggregate returned
diff --git a/nova/tests/unit/compute/test_compute_mgr.py b/nova/tests/unit/compute/test_compute_mgr.py
index 1c7b5de37a..ef048e1d68 100644
--- a/nova/tests/unit/compute/test_compute_mgr.py
+++ b/nova/tests/unit/compute/test_compute_mgr.py
@@ -2915,14 +2915,36 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase,
self.context, 'compute_check_can_live_migrate_destination',
CONF.host, instance.uuid)
+ @mock.patch('nova.objects.InstanceGroup.get_by_instance_uuid', mock.Mock(
+ side_effect=exception.InstanceGroupNotFound(group_uuid='')))
def test_check_can_live_migrate_destination_success(self):
self._test_check_can_live_migrate_destination()
+ @mock.patch('nova.objects.InstanceGroup.get_by_instance_uuid', mock.Mock(
+ side_effect=exception.InstanceGroupNotFound(group_uuid='')))
def test_check_can_live_migrate_destination_fail(self):
self.assertRaises(
- test.TestingException,
- self._test_check_can_live_migrate_destination,
- do_raise=True)
+ test.TestingException,
+ self._test_check_can_live_migrate_destination,
+ do_raise=True)
+
+ @mock.patch.object(compute_utils, 'add_instance_fault_from_exc')
+ def test_check_can_live_migrate_destination_fail_group_policy(
+ self, mock_fail_db):
+
+ instance = fake_instance.fake_instance_obj(
+ self.context, host=self.compute.host, vm_state=vm_states.ACTIVE,
+ node='fake-node')
+
+ ex = exception.RescheduledException(
+ instance_uuid=instance.uuid, reason="policy violated")
+
+ with mock.patch.object(self.compute, '_validate_instance_group_policy',
+ side_effect=ex):
+ self.assertRaises(
+ exception.MigrationPreCheckError,
+ self.compute.check_can_live_migrate_destination,
+ self.context, instance, None, None)
@mock.patch('nova.compute.manager.InstanceEvents._lock_name')
def test_prepare_for_instance_event(self, lock_name_mock):
@@ -6447,7 +6469,8 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
def test_validate_policy_honors_workaround_disabled(self, mock_get):
instance = objects.Instance(uuid=uuids.instance)
hints = {'group': 'foo'}
- mock_get.return_value = objects.InstanceGroup(policy=None)
+ mock_get.return_value = objects.InstanceGroup(policy=None,
+ uuid=uuids.group)
self.compute._validate_instance_group_policy(self.context,
instance, hints)
mock_get.assert_called_once_with(self.context, 'foo')
@@ -6473,10 +6496,14 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
instance, hints)
mock_get.assert_called_once_with(self.context, uuids.group_hint)
+ @mock.patch('nova.objects.InstanceGroup.get_by_uuid')
@mock.patch('nova.objects.InstanceList.get_uuids_by_host')
@mock.patch('nova.objects.InstanceGroup.get_by_hint')
- def test_validate_instance_group_policy_with_rules(self, mock_get_by_hint,
- mock_get_by_host):
+ @mock.patch.object(fake_driver.FakeDriver, 'get_available_nodes')
+ @mock.patch('nova.objects.MigrationList.get_in_progress_by_host_and_node')
+ def test_validate_instance_group_policy_with_rules(
+ self, migration_list, nodes, mock_get_by_hint, mock_get_by_host,
+ mock_get_by_uuid):
# Create 2 instance in same host, inst2 created before inst1
instance = objects.Instance(uuid=uuids.inst1)
hints = {'group': [uuids.group_hint]}
@@ -6485,17 +6512,26 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
mock_get_by_host.return_value = existing_insts
# if group policy rules limit to 1, raise RescheduledException
- mock_get_by_hint.return_value = objects.InstanceGroup(
+ group = objects.InstanceGroup(
policy='anti-affinity', rules={'max_server_per_host': '1'},
- hosts=['host1'], members=members_uuids)
+ hosts=['host1'], members=members_uuids,
+ uuid=uuids.group)
+ mock_get_by_hint.return_value = group
+ mock_get_by_uuid.return_value = group
+ nodes.return_value = ['nodename']
+ migration_list.return_value = [objects.Migration(
+ uuid=uuids.migration, instance_uuid=uuids.instance)]
self.assertRaises(exception.RescheduledException,
self.compute._validate_instance_group_policy,
self.context, instance, hints)
# if group policy rules limit change to 2, validate OK
- mock_get_by_hint.return_value = objects.InstanceGroup(
+ group2 = objects.InstanceGroup(
policy='anti-affinity', rules={'max_server_per_host': 2},
- hosts=['host1'], members=members_uuids)
+ hosts=['host1'], members=members_uuids,
+ uuid=uuids.group)
+ mock_get_by_hint.return_value = group2
+ mock_get_by_uuid.return_value = group2
self.compute._validate_instance_group_policy(self.context,
instance, hints)
@@ -7966,6 +8002,8 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase,
else:
mock_log.warning.assert_not_called()
+ @mock.patch('nova.objects.InstanceGroup.get_by_instance_uuid', mock.Mock(
+ side_effect=exception.InstanceGroupNotFound(group_uuid='')))
def test_pre_live_migration_cinder_v3_api(self):
# This tests that pre_live_migration with a bdm with an
# attachment_id, will create a new attachment and update
@@ -8043,6 +8081,8 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase,
_test()
+ @mock.patch('nova.objects.InstanceGroup.get_by_instance_uuid', mock.Mock(
+ side_effect=exception.InstanceGroupNotFound(group_uuid='')))
def test_pre_live_migration_exception_cinder_v3_api(self):
# The instance in this test has 2 attachments. The second attach_create
# will throw an exception. This will test that the first attachment
@@ -8112,6 +8152,8 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase,
self.assertGreater(len(m.mock_calls), 0)
_test()
+ @mock.patch('nova.objects.InstanceGroup.get_by_instance_uuid', mock.Mock(
+ side_effect=exception.InstanceGroupNotFound(group_uuid='')))
def test_pre_live_migration_exceptions_delete_attachments(self):
# The instance in this test has 2 attachments. The call to
# driver.pre_live_migration will raise an exception. This will test
@@ -9297,7 +9339,7 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase,
self.compute.prep_resize,
self.context, mock.sentinel.image,
instance, flavor,
- mock.sentinel.request_spec,
+ objects.RequestSpec(),
{}, 'node', False,
migration, [])
@@ -9375,6 +9417,54 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase,
# (_error_out_instance_on_exception will set to ACTIVE by default).
self.assertEqual(vm_states.STOPPED, instance.vm_state)
+ @mock.patch('nova.compute.utils.notify_usage_exists')
+ @mock.patch('nova.compute.manager.ComputeManager.'
+ '_notify_about_instance_usage')
+ @mock.patch('nova.compute.utils.notify_about_resize_prep_instance')
+ @mock.patch('nova.objects.Instance.save')
+ @mock.patch('nova.compute.manager.ComputeManager._revert_allocation')
+ @mock.patch('nova.compute.manager.ComputeManager.'
+ '_reschedule_resize_or_reraise')
+ @mock.patch('nova.compute.utils.add_instance_fault_from_exc')
+ # this is almost copy-paste from test_prep_resize_fails_rollback
+ def test_prep_resize_fails_group_validation(
+ self, add_instance_fault_from_exc, _reschedule_resize_or_reraise,
+ _revert_allocation, mock_instance_save,
+ notify_about_resize_prep_instance, _notify_about_instance_usage,
+ notify_usage_exists):
+ """Tests that if _validate_instance_group_policy raises
+ InstanceFaultRollback, the instance.vm_state is reset properly in
+ _error_out_instance_on_exception
+ """
+ instance = fake_instance.fake_instance_obj(
+ self.context, host=self.compute.host, vm_state=vm_states.STOPPED,
+ node='fake-node', expected_attrs=['system_metadata', 'flavor'])
+ migration = mock.MagicMock(spec='nova.objects.Migration')
+ request_spec = mock.MagicMock(spec='nova.objects.RequestSpec')
+ ex = exception.RescheduledException(
+ instance_uuid=instance.uuid, reason="policy violated")
+ ex2 = exception.InstanceFaultRollback(
+ inner_exception=ex)
+
+ def fake_reschedule_resize_or_reraise(*args, **kwargs):
+ raise ex2
+
+ _reschedule_resize_or_reraise.side_effect = (
+ fake_reschedule_resize_or_reraise)
+
+ with mock.patch.object(self.compute, '_validate_instance_group_policy',
+ side_effect=ex):
+ self.assertRaises(
+ # _error_out_instance_on_exception should reraise the
+ # RescheduledException inside InstanceFaultRollback.
+ exception.RescheduledException, self.compute.prep_resize,
+ self.context, instance.image_meta, instance, instance.flavor,
+ request_spec, filter_properties={}, node=instance.node,
+ clean_shutdown=True, migration=migration, host_list=[])
+ # The instance.vm_state should remain unchanged
+ # (_error_out_instance_on_exception will set to ACTIVE by default).
+ self.assertEqual(vm_states.STOPPED, instance.vm_state)
+
def test_get_updated_nw_info_with_pci_mapping(self):
old_dev = objects.PciDevice(address='0000:04:00.2')
new_dev = objects.PciDevice(address='0000:05:00.3')
diff --git a/nova/tests/unit/db/test_db_api.py b/nova/tests/unit/db/test_db_api.py
index 608c3fd0c8..c3c04870ed 100644
--- a/nova/tests/unit/db/test_db_api.py
+++ b/nova/tests/unit/db/test_db_api.py
@@ -1957,6 +1957,14 @@ class InstanceTestCase(test.TestCase, ModelsObjectComparatorMixin):
sys_meta = utils.metadata_to_dict(inst['system_metadata'])
self.assertEqual(sys_meta, self.sample_data['system_metadata'])
+ def test_instance_get_with_meta(self):
+ inst_id = self.create_instance_with_args().id
+ inst = db.instance_get(self.ctxt, inst_id)
+ meta = utils.metadata_to_dict(inst['metadata'])
+ self.assertEqual(meta, self.sample_data['metadata'])
+ sys_meta = utils.metadata_to_dict(inst['system_metadata'])
+ self.assertEqual(sys_meta, self.sample_data['system_metadata'])
+
def test_instance_update(self):
instance = self.create_instance_with_args()
metadata = {'host': 'bar', 'key2': 'wuff'}
diff --git a/nova/tests/unit/image/test_glance.py b/nova/tests/unit/image/test_glance.py
index 865efc23a6..6d90051ea1 100644
--- a/nova/tests/unit/image/test_glance.py
+++ b/nova/tests/unit/image/test_glance.py
@@ -303,6 +303,11 @@ class TestExceptionTranslations(test.NoDBTestCase):
out_exc = glance._translate_image_exception('123', in_exc)
self.assertIsInstance(out_exc, exception.ImageNotFound)
+ def test_client_httpoverlimit_converts_to_imagequotaexceeded(self):
+ in_exc = glanceclient.exc.HTTPOverLimit('123')
+ out_exc = glance._translate_image_exception('123', in_exc)
+ self.assertIsInstance(out_exc, exception.ImageQuotaExceeded)
+
class TestGlanceSerializer(test.NoDBTestCase):
def test_serialize(self):
diff --git a/releasenotes/notes/bug-1821755-7bd03319e34b6b10.yaml b/releasenotes/notes/bug-1821755-7bd03319e34b6b10.yaml
new file mode 100644
index 0000000000..4c6135311b
--- /dev/null
+++ b/releasenotes/notes/bug-1821755-7bd03319e34b6b10.yaml
@@ -0,0 +1,11 @@
+---
+fixes:
+ - |
+ Improved detection of anti-affinity policy violation when performing live
+ and cold migrations. Most of the violations caused by race conditions due
+ to performing concurrent live or cold migrations should now be addressed
+ by extra checks in the compute service. Upon detection, cold migration
+ operations are automatically rescheduled, while live migrations have two
+ checks and will be rescheduled if detected by the first one, otherwise the
+ live migration will fail cleanly and revert the instance state back to its
+ previous value.
diff --git a/tox.ini b/tox.ini
index a6ee0d51fb..5bf028768b 100644
--- a/tox.ini
+++ b/tox.ini
@@ -61,7 +61,6 @@ commands =
bash -c "! find doc/ -type f -name *.json | xargs grep -U -n $'\r'"
# Check that all included JSON files are valid JSON
bash -c '! find doc/ -type f -name *.json | xargs -t -n1 python -m json.tool 2>&1 > /dev/null | grep -B1 -v ^python'
- bash tools/check-cherry-picks.sh
[testenv:fast8]
description =
@@ -70,6 +69,15 @@ envdir = {toxworkdir}/shared
commands =
bash tools/flake8wrap.sh -HEAD
+[testenv:validate-backport]
+description =
+ Determine whether a backport is ready to be merged by checking whether it has
+ already been merged to master or more recent stable branches.
+deps =
+skipsdist = true
+commands =
+ bash tools/check-cherry-picks.sh
+
[testenv:functional]
# TODO(melwitt): This can be removed when functional tests are gating with
# python 3.x