-rw-r--r--  .zuul.yaml | 30
-rw-r--r--  doc/source/admin/secure-live-migration-with-qemu-native-tls.rst | 11
-rw-r--r--  nova/api/openstack/compute/aggregates.py | 41
-rw-r--r--  nova/compute/manager.py | 110
-rw-r--r--  nova/db/sqlalchemy/api.py | 25
-rw-r--r--  nova/exception.py | 5
-rw-r--r--  nova/image/glance.py | 2
-rw-r--r--  nova/monkey_patch.py | 23
-rw-r--r--  nova/network/neutronv2/api.py | 8
-rw-r--r--  nova/tests/functional/db/test_archive.py | 15
-rwxr-xr-x  nova/tests/live_migration/hooks/ceph.sh | 1
-rwxr-xr-x  nova/tests/live_migration/hooks/run_tests.sh | 5
-rw-r--r--  nova/tests/unit/api/openstack/compute/test_aggregates.py | 65
-rw-r--r--  nova/tests/unit/compute/test_compute_mgr.py | 112
-rw-r--r--  nova/tests/unit/db/test_db_api.py | 8
-rw-r--r--  nova/tests/unit/image/test_glance.py | 5
-rw-r--r--  releasenotes/notes/bug-1821755-7bd03319e34b6b10.yaml | 11
-rw-r--r--  tox.ini | 10
18 files changed, 423 insertions, 64 deletions
diff --git a/.zuul.yaml b/.zuul.yaml
index 971c406e18..8149f71bc1 100644
--- a/.zuul.yaml
+++ b/.zuul.yaml
@@ -105,6 +105,17 @@
timeout: 3600
- job:
+ name: nova-tox-validate-backport
+ parent: openstack-tox
+ description: |
+ Determine whether a backport is ready to be merged by checking whether it
+ has already been merged to master or more recent stable branches.
+
+ Uses tox with the ``validate-backport`` environment.
+ vars:
+ tox_envlist: validate-backport
+
+- job:
name: nova-cells-v1
parent: nova-dsvm-base
run: playbooks/legacy/nova-cells-v1/run.yaml
@@ -267,6 +278,8 @@
- nova-next
- nova-tox-functional
- nova-tox-functional-py35
+ - nova-tox-validate-backport:
+ voting: false
- tempest-full-py3:
irrelevant-files: *dsvm-irrelevant-files
- tempest-multinode-full:
@@ -274,6 +287,14 @@
irrelevant-files: *dsvm-irrelevant-files
- tempest-slow-py3:
irrelevant-files: *dsvm-irrelevant-files
+ vars:
+ devstack_localrc:
+ # To workaround https://bugs.launchpad.net/neutron/+bug/1914037
+ # tempest is pinned to 26.0.0 on stable/stein in devstack, so
+ # we cannot consume the tempest fix.
+ IPV6_PUBLIC_RANGE: 2001:db8:0:10::/64
+ IPV6_PUBLIC_NETWORK_GATEWAY: 2001:db8:0:10::2
+ IPV6_ROUTER_GW_IP: 2001:db8:0:10::1
- grenade-py3:
irrelevant-files: *dsvm-irrelevant-files
gate:
@@ -283,10 +304,19 @@
- nova-tox-functional
- nova-tox-functional-py35
- nova-next
+ - nova-tox-validate-backport
- tempest-full-py3:
irrelevant-files: *dsvm-irrelevant-files
- tempest-slow-py3:
irrelevant-files: *dsvm-irrelevant-files
+ vars:
+ devstack_localrc:
+ # To workaround https://bugs.launchpad.net/neutron/+bug/1914037
+ # tempest is pinned to 26.0.0 on stable/stein in devstack, so
+ # we cannot consume the tempest fix.
+ IPV6_PUBLIC_RANGE: 2001:db8:0:10::/64
+ IPV6_PUBLIC_NETWORK_GATEWAY: 2001:db8:0:10::2
+ IPV6_ROUTER_GW_IP: 2001:db8:0:10::1
- grenade-py3:
irrelevant-files: *dsvm-irrelevant-files
experimental:
diff --git a/doc/source/admin/secure-live-migration-with-qemu-native-tls.rst b/doc/source/admin/secure-live-migration-with-qemu-native-tls.rst
index 012d78e93b..fb76f656af 100644
--- a/doc/source/admin/secure-live-migration-with-qemu-native-tls.rst
+++ b/doc/source/admin/secure-live-migration-with-qemu-native-tls.rst
@@ -120,10 +120,13 @@ Performing the migration
(1) On all relevant compute nodes, enable the
:oslo.config:option:`libvirt.live_migration_with_native_tls`
- configuration attribute::
+ configuration attribute and set the
+ :oslo.config:option:`libvirt.live_migration_scheme`
+ configuration attribute to tls::
[libvirt]
live_migration_with_native_tls = true
+ live_migration_scheme = tls
.. note::
Setting both
@@ -131,6 +134,12 @@ Performing the migration
:oslo.config:option:`libvirt.live_migration_tunnelled` at the
same time is invalid (and disallowed).
+ .. note::
+ Not setting
+ :oslo.config:option:`libvirt.live_migration_scheme` to ``tls``
+ will result in libvirt using the unencrypted TCP connection
+ without displaying any error or a warning in the logs.
+
And restart the ``nova-compute`` service::
$ systemctl restart openstack-nova-compute
diff --git a/nova/api/openstack/compute/aggregates.py b/nova/api/openstack/compute/aggregates.py
index e44b449e76..619fecfb37 100644
--- a/nova/api/openstack/compute/aggregates.py
+++ b/nova/api/openstack/compute/aggregates.py
@@ -28,6 +28,7 @@ from nova.compute import api as compute_api
from nova import exception
from nova.i18n import _
from nova.policies import aggregates as aggr_policies
+from nova import utils
def _get_context(req):
@@ -84,11 +85,17 @@ class AggregateController(wsgi.Controller):
return agg
- @wsgi.expected_errors(404)
+ @wsgi.expected_errors((400, 404))
def show(self, req, id):
"""Shows the details of an aggregate, hosts and metadata included."""
context = _get_context(req)
context.can(aggr_policies.POLICY_ROOT % 'show')
+
+ try:
+ utils.validate_integer(id, 'id')
+ except exception.InvalidInput as e:
+ raise exc.HTTPBadRequest(explanation=e.format_message())
+
try:
aggregate = self.api.get_aggregate(context, id)
except exception.AggregateNotFound as e:
@@ -107,6 +114,11 @@ class AggregateController(wsgi.Controller):
updates['name'] = common.normalize_name(updates['name'])
try:
+ utils.validate_integer(id, 'id')
+ except exception.InvalidInput as e:
+ raise exc.HTTPBadRequest(explanation=e.format_message())
+
+ try:
aggregate = self.api.update_aggregate(context, id, updates)
except exception.AggregateNameExists as e:
raise exc.HTTPConflict(explanation=e.format_message())
@@ -125,6 +137,12 @@ class AggregateController(wsgi.Controller):
"""Removes an aggregate by id."""
context = _get_context(req)
context.can(aggr_policies.POLICY_ROOT % 'delete')
+
+ try:
+ utils.validate_integer(id, 'id')
+ except exception.InvalidInput as e:
+ raise exc.HTTPBadRequest(explanation=e.format_message())
+
try:
self.api.delete_aggregate(context, id)
except exception.AggregateNotFound as e:
@@ -135,7 +153,7 @@ class AggregateController(wsgi.Controller):
# NOTE(gmann): Returns 200 for backwards compatibility but should be 202
# for representing async API as this API just accepts the request and
# request hypervisor driver to complete the same in async mode.
- @wsgi.expected_errors((404, 409))
+ @wsgi.expected_errors((400, 404, 409))
@wsgi.action('add_host')
@validation.schema(aggregates.add_host)
def _add_host(self, req, id, body):
@@ -144,6 +162,12 @@ class AggregateController(wsgi.Controller):
context = _get_context(req)
context.can(aggr_policies.POLICY_ROOT % 'add_host')
+
+ try:
+ utils.validate_integer(id, 'id')
+ except exception.InvalidInput as e:
+ raise exc.HTTPBadRequest(explanation=e.format_message())
+
try:
aggregate = self.api.add_host_to_aggregate(context, id, host)
except (exception.AggregateNotFound,
@@ -158,7 +182,7 @@ class AggregateController(wsgi.Controller):
# NOTE(gmann): Returns 200 for backwards compatibility but should be 202
# for representing async API as this API just accepts the request and
# request hypervisor driver to complete the same in async mode.
- @wsgi.expected_errors((404, 409))
+ @wsgi.expected_errors((400, 404, 409))
@wsgi.action('remove_host')
@validation.schema(aggregates.remove_host)
def _remove_host(self, req, id, body):
@@ -167,6 +191,12 @@ class AggregateController(wsgi.Controller):
context = _get_context(req)
context.can(aggr_policies.POLICY_ROOT % 'remove_host')
+
+ try:
+ utils.validate_integer(id, 'id')
+ except exception.InvalidInput as e:
+ raise exc.HTTPBadRequest(explanation=e.format_message())
+
try:
aggregate = self.api.remove_host_from_aggregate(context, id, host)
except (exception.AggregateNotFound, exception.AggregateHostNotFound,
@@ -188,6 +218,11 @@ class AggregateController(wsgi.Controller):
context = _get_context(req)
context.can(aggr_policies.POLICY_ROOT % 'set_metadata')
+ try:
+ utils.validate_integer(id, 'id')
+ except exception.InvalidInput as e:
+ raise exc.HTTPBadRequest(explanation=e.format_message())
+
metadata = body["set_metadata"]["metadata"]
try:
aggregate = self.api.update_aggregate_metadata(context,
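
The aggregates.py hunks above all apply the same guard: reject a non-integer aggregate id with HTTP 400 before touching the database. A minimal standalone sketch of that pattern, with a simplified stand-in for nova.utils.validate_integer (the helper and return shape below are illustrative, not nova's implementation)::

    # Validate the id up front so a bogus value becomes a 400 Bad Request
    # rather than an error deeper in the stack. validate_integer() here is a
    # simplified stand-in; nova's helper raises exception.InvalidInput.
    def validate_integer(value, name):
        try:
            return int(value)
        except (TypeError, ValueError):
            raise ValueError("%s must be an integer, got %r" % (name, value))


    def show_aggregate(aggregate_id):
        try:
            agg_id = validate_integer(aggregate_id, 'id')
        except ValueError as e:
            # The controller raises webob.exc.HTTPBadRequest here (HTTP 400).
            return {'badRequest': str(e)}, 400
        # ... normal lookup continues, e.g. api.get_aggregate(context, agg_id)
        return {'aggregate': {'id': agg_id}}, 200


    assert show_aggregate('foo')[1] == 400
    assert show_aggregate('1')[1] == 200
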
diff --git a/nova/compute/manager.py b/nova/compute/manager.py
index c7a0f37e92..1c29aa278f 100644
--- a/nova/compute/manager.py
+++ b/nova/compute/manager.py
@@ -1474,7 +1474,11 @@ class ComputeManager(manager.Manager):
return [_decode(f) for f in injected_files]
def _validate_instance_group_policy(self, context, instance,
- scheduler_hints):
+ scheduler_hints=None):
+
+ if CONF.workarounds.disable_group_policy_check_upcall:
+ return
+
# NOTE(russellb) Instance group policy is enforced by the scheduler.
# However, there is a race condition with the enforcement of
# the policy. Since more than one instance may be scheduled at the
@@ -1483,29 +1487,63 @@ class ComputeManager(manager.Manager):
# multiple instances with an affinity policy could end up on different
# hosts. This is a validation step to make sure that starting the
# instance here doesn't violate the policy.
- group_hint = scheduler_hints.get('group')
- if not group_hint:
- return
-
- # The RequestSpec stores scheduler_hints as key=list pairs so we need
- # to check the type on the value and pull the single entry out. The
- # API request schema validates that the 'group' hint is a single value.
- if isinstance(group_hint, list):
- group_hint = group_hint[0]
+ if scheduler_hints is not None:
+ # only go through here if scheduler_hints is provided, even if it
+ # is empty.
+ group_hint = scheduler_hints.get('group')
+ if not group_hint:
+ return
+ else:
+ # The RequestSpec stores scheduler_hints as key=list pairs so
+ # we need to check the type on the value and pull the single
+ # entry out. The API request schema validates that
+ # the 'group' hint is a single value.
+ if isinstance(group_hint, list):
+ group_hint = group_hint[0]
+
+ group = objects.InstanceGroup.get_by_hint(context, group_hint)
+ else:
+ # TODO(ganso): a call to DB can be saved by adding request_spec
+ # to rpcapi payload of live_migration, pre_live_migration and
+ # check_can_live_migrate_destination
+ try:
+ group = objects.InstanceGroup.get_by_instance_uuid(
+ context, instance.uuid)
+ except exception.InstanceGroupNotFound:
+ return
- @utils.synchronized(group_hint)
- def _do_validation(context, instance, group_hint):
- group = objects.InstanceGroup.get_by_hint(context, group_hint)
+ @utils.synchronized(group['uuid'])
+ def _do_validation(context, instance, group):
if group.policy and 'anti-affinity' == group.policy:
+
+ # instances on host
instances_uuids = objects.InstanceList.get_uuids_by_host(
context, self.host)
ins_on_host = set(instances_uuids)
+
+ # instance param is just for logging, the nodename obtained is
+ # not actually related to the instance at all
+ nodename = self._get_nodename(instance)
+
+ # instances being migrated to host
+ migrations = (
+ objects.MigrationList.get_in_progress_by_host_and_node(
+ context, self.host, nodename))
+ migration_vm_uuids = set([mig['instance_uuid']
+ for mig in migrations])
+
+ total_instances = migration_vm_uuids | ins_on_host
+
+ # refresh group to get updated members within locked block
+ group = objects.InstanceGroup.get_by_uuid(context,
+ group['uuid'])
members = set(group.members)
# Determine the set of instance group members on this host
# which are not the instance in question. This is used to
# determine how many other members from the same anti-affinity
# group can be on this host.
- members_on_host = ins_on_host & members - set([instance.uuid])
+ members_on_host = (total_instances & members -
+ set([instance.uuid]))
rules = group.rules
if rules and 'max_server_per_host' in rules:
max_server = rules['max_server_per_host']
@@ -1517,6 +1555,12 @@ class ComputeManager(manager.Manager):
raise exception.RescheduledException(
instance_uuid=instance.uuid,
reason=msg)
+
+ # NOTE(ganso): The check for affinity below does not work and it
+ # can easily be violated because the lock happens in different
+ # compute hosts.
+ # The only fix seems to be a DB lock to perform the check whenever
+ # setting the host field to an instance.
elif group.policy and 'affinity' == group.policy:
group_hosts = group.get_hosts(exclude=[instance.uuid])
if group_hosts and self.host not in group_hosts:
@@ -1525,8 +1569,7 @@ class ComputeManager(manager.Manager):
instance_uuid=instance.uuid,
reason=msg)
- if not CONF.workarounds.disable_group_policy_check_upcall:
- _do_validation(context, instance, group_hint)
+ _do_validation(context, instance, group)
def _log_original_error(self, exc_info, instance_uuid):
LOG.error('Error: %s', exc_info[1], instance_uuid=instance_uuid,
@@ -4646,10 +4689,24 @@ class ComputeManager(manager.Manager):
with self._error_out_instance_on_exception(
context, instance, instance_state=instance_state),\
errors_out_migration_ctxt(migration):
+
self._send_prep_resize_notifications(
context, instance, fields.NotificationPhase.START,
instance_type)
try:
+ scheduler_hints = self._get_scheduler_hints(filter_properties,
+ request_spec)
+ # Error out if this host cannot accept the new instance due
+ # to anti-affinity. At this point the migration is already
+ # in-progress, so this is the definitive moment to abort due to
+ # the policy violation. Also, exploding here is covered by the
+ # cleanup methods in except block.
+ try:
+ self._validate_instance_group_policy(context, instance,
+ scheduler_hints)
+ except exception.RescheduledException as e:
+ raise exception.InstanceFaultRollback(inner_exception=e)
+
self._prep_resize(context, image, instance,
instance_type, filter_properties,
node, migration, clean_shutdown)
@@ -6505,6 +6562,20 @@ class ComputeManager(manager.Manager):
if None, ignore disk usage checking
:returns: a dict containing migration info
"""
+
+ # Error out if this host cannot accept the new instance due
+ # to anti-affinity. This check at this moment is not very accurate, as
+ # multiple requests may be happening concurrently and miss the lock,
+ # but when it works it provides a better user experience by failing
+ # earlier. Also, it should be safe to explode here, error becomes
+ # NoValidHost and instance status remains ACTIVE.
+ try:
+ self._validate_instance_group_policy(ctxt, instance)
+ except exception.RescheduledException as e:
+ msg = ("Failed to validate instance group policy "
+ "due to: {}".format(e))
+ raise exception.MigrationPreCheckError(reason=msg)
+
src_compute_info = obj_base.obj_to_primitive(
self._get_compute_info(ctxt, instance.host))
dst_compute_info = obj_base.obj_to_primitive(
@@ -6567,6 +6638,13 @@ class ComputeManager(manager.Manager):
"""
LOG.debug('pre_live_migration data is %s', migrate_data)
+ # Error out if this host cannot accept the new instance due
+ # to anti-affinity. At this point the migration is already in-progress,
+ # so this is the definitive moment to abort due to the policy
+ # violation. Also, it should be safe to explode here. The instance
+ # status remains ACTIVE, migration status failed.
+ self._validate_instance_group_policy(context, instance)
+
migrate_data.old_vol_attachment_ids = {}
bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
context, instance.uuid)
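
The reworked _validate_instance_group_policy above now counts both instances already on the host and instances migrating to it before checking anti-affinity membership. A hedged standalone sketch of that membership check (the function name and data shapes are simplified assumptions, not nova's objects)::

    # Instances resident on the host are combined with instances currently
    # migrating to it, and the union is compared against the server group's
    # membership (minus the instance being validated).
    def violates_anti_affinity(group_members, instances_on_host,
                               migrating_in_uuids, this_instance_uuid,
                               max_server_per_host=1):
        total_on_host = set(instances_on_host) | set(migrating_in_uuids)
        members_on_host = (total_on_host &
                           (set(group_members) - {this_instance_uuid}))
        return len(members_on_host) >= max_server_per_host


    # One member already resident and one migrating in: a further
    # anti-affinity member cannot land here when max_server_per_host is 1.
    assert violates_anti_affinity(group_members={'a', 'b', 'c'},
                                  instances_on_host={'a'},
                                  migrating_in_uuids={'c'},
                                  this_instance_uuid='b')
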
diff --git a/nova/db/sqlalchemy/api.py b/nova/db/sqlalchemy/api.py
index a0535aa2a5..a4d9be0271 100644
--- a/nova/db/sqlalchemy/api.py
+++ b/nova/db/sqlalchemy/api.py
@@ -48,6 +48,7 @@ from sqlalchemy.orm import aliased
from sqlalchemy.orm import contains_eager
from sqlalchemy.orm import joinedload
from sqlalchemy.orm import noload
+from sqlalchemy.orm import subqueryload
from sqlalchemy.orm import undefer
from sqlalchemy.schema import Table
from sqlalchemy import sql
@@ -1936,13 +1937,27 @@ def _build_instance_get(context, columns_to_join=None):
continue
if 'extra.' in column:
query = query.options(undefer(column))
+ elif column in ['metadata', 'system_metadata']:
+ # NOTE(melwitt): We use subqueryload() instead of joinedload() for
+ # metadata and system_metadata because of the one-to-many
+ # relationship of the data. Directly joining these columns can
+ # result in a large number of additional rows being queried if an
+ # instance has a large number of (system_)metadata items, resulting
+ # in a large data transfer. Instead, the subqueryload() will
+ # perform additional queries to obtain metadata and system_metadata
+ # for the instance.
+ query = query.options(subqueryload(column))
else:
query = query.options(joinedload(column))
# NOTE(alaski) Stop lazy loading of columns not needed.
for col in ['metadata', 'system_metadata']:
if col not in columns_to_join:
query = query.options(noload(col))
- return query
+ # NOTE(melwitt): We need to use order_by(<unique column>) so that the
+ # additional queries emitted by subqueryload() include the same ordering as
+ # used by the parent query.
+ # https://docs.sqlalchemy.org/en/13/orm/loading_relationships.html#the-importance-of-ordering
+ return query.order_by(models.Instance.id)
def _instances_fill_metadata(context, instances, manual_joins=None):
@@ -5625,8 +5640,12 @@ def _archive_deleted_rows_for_table(tablename, max_rows):
"%(tablename)s: %(error)s",
{'tablename': tablename, 'error': six.text_type(ex)})
- if ((max_rows is None or rows_archived < max_rows)
- and 'instance_uuid' in columns):
+ if ((max_rows is None or rows_archived < max_rows) and
+ # NOTE(melwitt): The pci_devices table uses the 'instance_uuid'
+ # column to track the allocated association of a PCI device and its
+ # records are not tied to the lifecycles of instance records.
+ (tablename != 'pci_devices' and
+ 'instance_uuid' in columns)):
instances = models.BASE.metadata.tables['instances']
limit = max_rows - rows_archived if max_rows is not None else None
extra = _archive_if_instance_deleted(table, shadow_table, instances,
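
For readers unfamiliar with the loading strategies discussed in the NOTE above, a minimal SQLAlchemy sketch of the joinedload()-versus-subqueryload() trade-off, using a hypothetical two-table model rather than nova's schema::

    # joinedload() fetches parent and children in one query, so a parent with
    # many children transfers many duplicated parent rows; subqueryload()
    # issues a second, smaller query instead, which is why the hunk above
    # switches metadata/system_metadata to it and adds a deterministic
    # ORDER BY on the parent query.
    from sqlalchemy import Column, ForeignKey, Integer, String, create_engine
    from sqlalchemy.ext.declarative import declarative_base
    from sqlalchemy.orm import Session, joinedload, relationship, subqueryload

    Base = declarative_base()


    class Instance(Base):
        __tablename__ = 'instances'
        id = Column(Integer, primary_key=True)
        metadata_items = relationship('MetadataItem')


    class MetadataItem(Base):
        __tablename__ = 'metadata_items'
        id = Column(Integer, primary_key=True)
        instance_id = Column(Integer, ForeignKey('instances.id'))
        key = Column(String(255))


    engine = create_engine('sqlite://')
    Base.metadata.create_all(engine)
    session = Session(engine)
    session.add(Instance(id=1, metadata_items=[MetadataItem(key=str(i))
                                               for i in range(100)]))
    session.commit()

    # One query, ~100 joined rows transferred for a single parent.
    joined = session.query(Instance).options(
        joinedload(Instance.metadata_items)).all()

    # Two queries, each returning only the rows actually needed; ordered so
    # the child query lines up with the parent query.
    sub = session.query(Instance).options(
        subqueryload(Instance.metadata_items)).order_by(Instance.id).all()

    assert len(joined) == len(sub) == 1
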
diff --git a/nova/exception.py b/nova/exception.py
index 11d26f55fa..1679a74f80 100644
--- a/nova/exception.py
+++ b/nova/exception.py
@@ -613,6 +613,11 @@ class ImageBadRequest(Invalid):
"%(response)s")
+class ImageQuotaExceeded(NovaException):
+ msg_fmt = _("Quota exceeded or out of space for image %(image_id)s "
+ "in the image service.")
+
+
class InstanceUnacceptable(Invalid):
msg_fmt = _("Instance %(instance_id)s is unacceptable: %(reason)s")
diff --git a/nova/image/glance.py b/nova/image/glance.py
index 39970d0498..459699531a 100644
--- a/nova/image/glance.py
+++ b/nova/image/glance.py
@@ -954,6 +954,8 @@ def _translate_image_exception(image_id, exc_value):
if isinstance(exc_value, glanceclient.exc.BadRequest):
return exception.ImageBadRequest(image_id=image_id,
response=six.text_type(exc_value))
+ if isinstance(exc_value, glanceclient.exc.HTTPOverLimit):
+ return exception.ImageQuotaExceeded(image_id=image_id)
return exc_value
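
The new mapping can be exercised in isolation; this short sketch mirrors the unit test added below and shows what callers of the translation helper now receive for an over-quota response (the image id is a placeholder)::

    # glanceclient's HTTPOverLimit now translates to nova's ImageQuotaExceeded
    # instead of leaking the raw client error to callers.
    import glanceclient.exc

    from nova import exception
    from nova.image import glance

    translated = glance._translate_image_exception(
        'some-image-id', glanceclient.exc.HTTPOverLimit('over quota'))
    assert isinstance(translated, exception.ImageQuotaExceeded)
    print(translated.format_message())
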
diff --git a/nova/monkey_patch.py b/nova/monkey_patch.py
index 6d0cacd6c7..be2a70ab4b 100644
--- a/nova/monkey_patch.py
+++ b/nova/monkey_patch.py
@@ -22,6 +22,19 @@ import os
def _monkey_patch():
+ # See https://bugs.launchpad.net/nova/+bug/1164822
+ # TODO(mdbooth): This feature was deprecated and removed in eventlet at
+ # some point but brought back in version 0.21.0, presumably because some
+ # users still required it to work round issues. However, there have been a
+ # number of greendns fixes in eventlet since then. Specifically, it looks
+ # as though the originally reported IPv6 issue may have been fixed in
+ # version 0.24.0. We should remove this when we can confirm that the
+ # original issue is fixed.
+ # NOTE(artom) eventlet processes environment variables at import-time. We
+ # therefore set this here, before importing eventlet, in order to correctly
+ # disable greendns.
+ os.environ['EVENTLET_NO_GREENDNS'] = 'yes'
+
# NOTE(mdbooth): Anything imported here will not be monkey patched. It is
# important to take care not to import anything here which requires monkey
# patching.
@@ -39,16 +52,6 @@ def _monkey_patch():
problems = (set(['urllib3', 'oslo_context.context']) &
set(sys.modules.keys()))
- # See https://bugs.launchpad.net/nova/+bug/1164822
- # TODO(mdbooth): This feature was deprecated and removed in eventlet at
- # some point but brought back in version 0.21.0, presumably because some
- # users still required it to work round issues. However, there have been a
- # number of greendns fixes in eventlet since then. Specifically, it looks
- # as though the originally reported IPv6 issue may have been fixed in
- # version 0.24.0. We should remove this when we can confirm that the
- # original issue is fixed.
- os.environ['EVENTLET_NO_GREENDNS'] = 'yes'
-
if debugger.enabled():
# turn off thread patching to enable the remote debugger
eventlet.monkey_patch(thread=False)
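
The ordering constraint described in the NOTE(artom) comment above can be shown with a toy script; only the EVENTLET_NO_GREENDNS variable name comes from the patch, the rest is illustrative::

    # eventlet reads EVENTLET_NO_GREENDNS when it is first imported, so the
    # variable has to be set before the import -- which is why the hunk above
    # moves the assignment to the top of _monkey_patch(), ahead of
    # "import eventlet".
    import os

    os.environ['EVENTLET_NO_GREENDNS'] = 'yes'  # must happen before the import

    import eventlet  # noqa: E402

    # Setting the variable after this point would have no effect on greendns;
    # the module-level decision has already been made.
    eventlet.monkey_patch()
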
diff --git a/nova/network/neutronv2/api.py b/nova/network/neutronv2/api.py
index 8a60291910..2f4e39b1cb 100644
--- a/nova/network/neutronv2/api.py
+++ b/nova/network/neutronv2/api.py
@@ -828,9 +828,15 @@ class API(base_api.NetworkAPI):
# TODO(arosen) Should optimize more to do direct query for security
# group if len(security_groups) == 1
if len(security_groups):
+ # NOTE(slaweq): fields other than name and id aren't really needed
+ # so asking only about those fields will allow Neutron to not
+ # prepare list of rules for each found security group. That may
+ # speed processing of this request a lot in case when tenant has
+ # got many security groups
+ sg_fields = ['id', 'name']
search_opts = {'tenant_id': instance.project_id}
user_security_groups = neutron.list_security_groups(
- **search_opts).get('security_groups')
+ fields=sg_fields, **search_opts).get('security_groups')
for security_group in security_groups:
name_match = None
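
A hedged sketch of the narrowed Neutron query (the wrapper function is an illustration; the fields filter is what the hunk above adds)::

    # Asking Neutron for only 'id' and 'name' spares it from serializing every
    # rule of every security group, which is the speed-up described in the
    # NOTE above. "neutron" is any python-neutronclient v2.0 Client instance
    # (construction not shown).
    def list_security_group_names(neutron, project_id):
        sg_fields = ['id', 'name']
        groups = neutron.list_security_groups(
            fields=sg_fields, tenant_id=project_id).get('security_groups')
        return {sg['id']: sg['name'] for sg in groups}
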
diff --git a/nova/tests/functional/db/test_archive.py b/nova/tests/functional/db/test_archive.py
index 432d814d91..a8b4df6f16 100644
--- a/nova/tests/functional/db/test_archive.py
+++ b/nova/tests/functional/db/test_archive.py
@@ -135,6 +135,19 @@ class TestDatabaseArchive(test_servers.ServersTestBase):
# Verify we have some system_metadata since we'll check that later.
self.assertTrue(len(instance.system_metadata),
'No system_metadata for instance: %s' % server_id)
+ # Create a pci_devices record to simulate an instance that had a PCI
+ # device allocated at the time it was deleted. There is a window of
+ # time between deletion of the instance record and freeing of the PCI
+ # device in nova-compute's _complete_deletion method during RT update.
+ db.pci_device_update(admin_context, 1, 'fake-address',
+ {'compute_node_id': 1,
+ 'address': 'fake-address',
+ 'vendor_id': 'fake',
+ 'product_id': 'fake',
+ 'dev_type': 'fake',
+ 'label': 'fake',
+ 'status': 'allocated',
+ 'instance_uuid': instance.uuid})
# Now try and archive the soft deleted records.
results, deleted_instance_uuids = db.archive_deleted_rows(max_rows=100)
# verify system_metadata was dropped
@@ -147,6 +160,8 @@ class TestDatabaseArchive(test_servers.ServersTestBase):
# by the archive
self.assertIn('instance_actions', results)
self.assertIn('instance_actions_events', results)
+ # Verify that the pci_devices record has not been dropped
+ self.assertNotIn('pci_devices', results)
def _get_table_counts(self):
engine = sqlalchemy_api.get_engine()
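
The exclusion this functional test exercises reduces to a small predicate; the following standalone restatement of the condition added in nova/db/sqlalchemy/api.py uses an illustrative function name::

    # Tables referencing instance_uuid are normally archived along with their
    # deleted instance, but pci_devices uses instance_uuid to track an
    # allocation rather than an instance lifecycle, so it is skipped.
    def archives_with_deleted_instance(tablename, columns, rows_archived,
                                       max_rows):
        under_limit = max_rows is None or rows_archived < max_rows
        tied_to_instance = (tablename != 'pci_devices' and
                            'instance_uuid' in columns)
        return under_limit and tied_to_instance


    assert archives_with_deleted_instance(
        'instance_actions', ['instance_uuid'], 0, 100)
    assert not archives_with_deleted_instance(
        'pci_devices', ['instance_uuid'], 0, 100)
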
diff --git a/nova/tests/live_migration/hooks/ceph.sh b/nova/tests/live_migration/hooks/ceph.sh
index ff94967748..2a2ec0202c 100755
--- a/nova/tests/live_migration/hooks/ceph.sh
+++ b/nova/tests/live_migration/hooks/ceph.sh
@@ -8,6 +8,7 @@ function prepare_ceph {
configure_ceph
#install ceph-common package on compute nodes
$ANSIBLE subnodes --become -f 5 -i "$WORKSPACE/inventory" -m raw -a "executable=/bin/bash
+ export CEPH_RELEASE=nautilus
source $BASE/new/devstack/functions
source $BASE/new/devstack/functions-common
git clone https://git.openstack.org/openstack/devstack-plugin-ceph /tmp/devstack-plugin-ceph
diff --git a/nova/tests/live_migration/hooks/run_tests.sh b/nova/tests/live_migration/hooks/run_tests.sh
index ff005520b8..0f7c5f0a3f 100755
--- a/nova/tests/live_migration/hooks/run_tests.sh
+++ b/nova/tests/live_migration/hooks/run_tests.sh
@@ -46,6 +46,11 @@ echo '2. NFS testing is skipped due to setup failures with Ubuntu 16.04'
echo '3. test with Ceph for root + ephemeral disks'
# Discover and set variables for the OS version so the devstack-plugin-ceph
# scripts can find the correct repository to install the ceph packages.
+# NOTE(lyarwood): Pin the CEPH_RELEASE to nautilus here as was the case
+# prior to https://review.opendev.org/c/openstack/devstack-plugin-ceph/+/777232
+# landing in the branchless plugin, we also have to pin in ceph.sh when
+# configuring ceph on a remote node via ansible.
+export CEPH_RELEASE=nautilus
GetOSVersion
prepare_ceph
GLANCE_API_CONF=${GLANCE_API_CONF:-/etc/glance/glance-api.conf}
diff --git a/nova/tests/unit/api/openstack/compute/test_aggregates.py b/nova/tests/unit/api/openstack/compute/test_aggregates.py
index eb185849a6..c14f562dc7 100644
--- a/nova/tests/unit/api/openstack/compute/test_aggregates.py
+++ b/nova/tests/unit/api/openstack/compute/test_aggregates.py
@@ -299,7 +299,7 @@ class AggregateTestCaseV21(test.NoDBTestCase):
self.controller.show,
self.user_req, "1")
- def test_show_with_invalid_id(self):
+ def test_show_with_bad_aggregate(self):
side_effect = exception.AggregateNotFound(aggregate_id='2')
with mock.patch.object(self.controller.api, 'get_aggregate',
side_effect=side_effect) as mock_get:
@@ -307,6 +307,10 @@ class AggregateTestCaseV21(test.NoDBTestCase):
self.req, "2")
mock_get.assert_called_once_with(self.context, '2')
+ def test_show_with_invalid_id(self):
+ self.assertRaises(exc.HTTPBadRequest, self.controller.show,
+ self.req, 'foo')
+
def test_update(self):
body = {"aggregate": {"name": "new_name",
"availability_zone": "nova1"}}
@@ -390,10 +394,11 @@ class AggregateTestCaseV21(test.NoDBTestCase):
@mock.patch('nova.compute.api.AggregateAPI.update_aggregate')
def test_update_with_none_availability_zone(self, mock_update_agg):
- agg_id = uuidsentinel.aggregate
+ agg_id = 173
mock_update_agg.return_value = objects.Aggregate(self.context,
name='test',
- uuid=agg_id,
+ uuid=uuidsentinel.agg,
+ id=agg_id,
hosts=[],
metadata={})
body = {"aggregate": {"name": "test",
@@ -414,6 +419,11 @@ class AggregateTestCaseV21(test.NoDBTestCase):
mock_update.assert_called_once_with(self.context, '2',
body["aggregate"])
+ def test_update_with_invalid_id(self):
+ body = {"aggregate": {"name": "test_name"}}
+ self.assertRaises(exc.HTTPBadRequest, self.controller.update,
+ self.req, 'foo', body=body)
+
def test_update_with_duplicated_name(self):
body = {"aggregate": {"name": "test_name"}}
side_effect = exception.AggregateNameExists(aggregate_name="test_name")
@@ -433,7 +443,7 @@ class AggregateTestCaseV21(test.NoDBTestCase):
def test_update_with_invalid_action(self):
with mock.patch.object(self.controller.api, "update_aggregate",
side_effect=exception.InvalidAggregateAction(
- action='invalid', aggregate_id='agg1', reason= "not empty")):
+ action='invalid', aggregate_id='1', reason= "not empty")):
body = {"aggregate": {"availability_zone": "nova"}}
self.assertRaises(exc.HTTPBadRequest, self.controller.update,
self.req, "1", body=body)
@@ -467,15 +477,20 @@ class AggregateTestCaseV21(test.NoDBTestCase):
def test_add_host_with_bad_aggregate(self):
side_effect = exception.AggregateNotFound(
- aggregate_id="bogus_aggregate")
+ aggregate_id="2")
with mock.patch.object(self.controller.api, 'add_host_to_aggregate',
side_effect=side_effect) as mock_add:
self.assertRaises(exc.HTTPNotFound, eval(self.add_host),
- self.req, "bogus_aggregate",
+ self.req, "2",
body={"add_host": {"host": "host1"}})
- mock_add.assert_called_once_with(self.context, "bogus_aggregate",
+ mock_add.assert_called_once_with(self.context, "2",
"host1")
+ def test_add_host_with_invalid_id(self):
+ body = {"add_host": {"host": "host1"}}
+ self.assertRaises(exc.HTTPBadRequest, eval(self.add_host),
+ self.req, 'foo', body=body)
+
def test_add_host_with_bad_host(self):
side_effect = exception.ComputeHostNotFound(host="bogus_host")
with mock.patch.object(self.controller.api, 'add_host_to_aggregate',
@@ -534,16 +549,21 @@ class AggregateTestCaseV21(test.NoDBTestCase):
def test_remove_host_with_bad_aggregate(self):
side_effect = exception.AggregateNotFound(
- aggregate_id="bogus_aggregate")
+ aggregate_id="2")
with mock.patch.object(self.controller.api,
'remove_host_from_aggregate',
side_effect=side_effect) as mock_rem:
self.assertRaises(exc.HTTPNotFound, eval(self.remove_host),
- self.req, "bogus_aggregate",
+ self.req, "2",
body={"remove_host": {"host": "host1"}})
- mock_rem.assert_called_once_with(self.context, "bogus_aggregate",
+ mock_rem.assert_called_once_with(self.context, "2",
"host1")
+ def test_remove_host_with_invalid_id(self):
+ body = {"remove_host": {"host": "host1"}}
+ self.assertRaises(exc.HTTPBadRequest, eval(self.remove_host),
+ self.req, 'foo', body=body)
+
def test_remove_host_with_host_not_in_aggregate(self):
side_effect = exception.AggregateHostNotFound(aggregate_id="1",
host="host1")
@@ -621,16 +641,21 @@ class AggregateTestCaseV21(test.NoDBTestCase):
def test_set_metadata_with_bad_aggregate(self):
body = {"set_metadata": {"metadata": {"foo": "bar"}}}
- side_effect = exception.AggregateNotFound(aggregate_id="bad_aggregate")
+ side_effect = exception.AggregateNotFound(aggregate_id="2")
with mock.patch.object(self.controller.api,
'update_aggregate_metadata',
side_effect=side_effect) as mock_update:
self.assertRaises(exc.HTTPNotFound, eval(self.set_metadata),
- self.req, "bad_aggregate", body=body)
- mock_update.assert_called_once_with(self.context, "bad_aggregate",
+ self.req, "2", body=body)
+ mock_update.assert_called_once_with(self.context, "2",
body["set_metadata"]['metadata'])
+ def test_set_metadata_with_invalid_id(self):
+ body = {"set_metadata": {"metadata": {"foo": "bar"}}}
+ self.assertRaises(exc.HTTPBadRequest, eval(self.set_metadata),
+ self.req, 'foo', body=body)
+
def test_set_metadata_with_missing_metadata(self):
body = {"asdf": {"foo": "bar"}}
self.assertRaises(self.bad_request, eval(self.set_metadata),
@@ -679,21 +704,25 @@ class AggregateTestCaseV21(test.NoDBTestCase):
def test_delete_aggregate_with_bad_aggregate(self):
side_effect = exception.AggregateNotFound(
- aggregate_id="bogus_aggregate")
+ aggregate_id="2")
with mock.patch.object(self.controller.api, 'delete_aggregate',
side_effect=side_effect) as mock_del:
self.assertRaises(exc.HTTPNotFound, self.controller.delete,
- self.req, "bogus_aggregate")
- mock_del.assert_called_once_with(self.context, "bogus_aggregate")
+ self.req, "2")
+ mock_del.assert_called_once_with(self.context, "2")
+
+ def test_delete_with_invalid_id(self):
+ self.assertRaises(exc.HTTPBadRequest, self.controller.delete,
+ self.req, 'foo')
def test_delete_aggregate_with_host(self):
with mock.patch.object(self.controller.api, "delete_aggregate",
side_effect=exception.InvalidAggregateAction(
- action="delete", aggregate_id="agg1",
+ action="delete", aggregate_id="2",
reason="not empty")):
self.assertRaises(exc.HTTPBadRequest,
self.controller.delete,
- self.req, "agg1")
+ self.req, "2")
def test_marshall_aggregate(self):
# _marshall_aggregate() just basically turns the aggregate returned
diff --git a/nova/tests/unit/compute/test_compute_mgr.py b/nova/tests/unit/compute/test_compute_mgr.py
index 1c7b5de37a..ef048e1d68 100644
--- a/nova/tests/unit/compute/test_compute_mgr.py
+++ b/nova/tests/unit/compute/test_compute_mgr.py
@@ -2915,14 +2915,36 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase,
self.context, 'compute_check_can_live_migrate_destination',
CONF.host, instance.uuid)
+ @mock.patch('nova.objects.InstanceGroup.get_by_instance_uuid', mock.Mock(
+ side_effect=exception.InstanceGroupNotFound(group_uuid='')))
def test_check_can_live_migrate_destination_success(self):
self._test_check_can_live_migrate_destination()
+ @mock.patch('nova.objects.InstanceGroup.get_by_instance_uuid', mock.Mock(
+ side_effect=exception.InstanceGroupNotFound(group_uuid='')))
def test_check_can_live_migrate_destination_fail(self):
self.assertRaises(
- test.TestingException,
- self._test_check_can_live_migrate_destination,
- do_raise=True)
+ test.TestingException,
+ self._test_check_can_live_migrate_destination,
+ do_raise=True)
+
+ @mock.patch.object(compute_utils, 'add_instance_fault_from_exc')
+ def test_check_can_live_migrate_destination_fail_group_policy(
+ self, mock_fail_db):
+
+ instance = fake_instance.fake_instance_obj(
+ self.context, host=self.compute.host, vm_state=vm_states.ACTIVE,
+ node='fake-node')
+
+ ex = exception.RescheduledException(
+ instance_uuid=instance.uuid, reason="policy violated")
+
+ with mock.patch.object(self.compute, '_validate_instance_group_policy',
+ side_effect=ex):
+ self.assertRaises(
+ exception.MigrationPreCheckError,
+ self.compute.check_can_live_migrate_destination,
+ self.context, instance, None, None)
@mock.patch('nova.compute.manager.InstanceEvents._lock_name')
def test_prepare_for_instance_event(self, lock_name_mock):
@@ -6447,7 +6469,8 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
def test_validate_policy_honors_workaround_disabled(self, mock_get):
instance = objects.Instance(uuid=uuids.instance)
hints = {'group': 'foo'}
- mock_get.return_value = objects.InstanceGroup(policy=None)
+ mock_get.return_value = objects.InstanceGroup(policy=None,
+ uuid=uuids.group)
self.compute._validate_instance_group_policy(self.context,
instance, hints)
mock_get.assert_called_once_with(self.context, 'foo')
@@ -6473,10 +6496,14 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
instance, hints)
mock_get.assert_called_once_with(self.context, uuids.group_hint)
+ @mock.patch('nova.objects.InstanceGroup.get_by_uuid')
@mock.patch('nova.objects.InstanceList.get_uuids_by_host')
@mock.patch('nova.objects.InstanceGroup.get_by_hint')
- def test_validate_instance_group_policy_with_rules(self, mock_get_by_hint,
- mock_get_by_host):
+ @mock.patch.object(fake_driver.FakeDriver, 'get_available_nodes')
+ @mock.patch('nova.objects.MigrationList.get_in_progress_by_host_and_node')
+ def test_validate_instance_group_policy_with_rules(
+ self, migration_list, nodes, mock_get_by_hint, mock_get_by_host,
+ mock_get_by_uuid):
# Create 2 instance in same host, inst2 created before inst1
instance = objects.Instance(uuid=uuids.inst1)
hints = {'group': [uuids.group_hint]}
@@ -6485,17 +6512,26 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
mock_get_by_host.return_value = existing_insts
# if group policy rules limit to 1, raise RescheduledException
- mock_get_by_hint.return_value = objects.InstanceGroup(
+ group = objects.InstanceGroup(
policy='anti-affinity', rules={'max_server_per_host': '1'},
- hosts=['host1'], members=members_uuids)
+ hosts=['host1'], members=members_uuids,
+ uuid=uuids.group)
+ mock_get_by_hint.return_value = group
+ mock_get_by_uuid.return_value = group
+ nodes.return_value = ['nodename']
+ migration_list.return_value = [objects.Migration(
+ uuid=uuids.migration, instance_uuid=uuids.instance)]
self.assertRaises(exception.RescheduledException,
self.compute._validate_instance_group_policy,
self.context, instance, hints)
# if group policy rules limit change to 2, validate OK
- mock_get_by_hint.return_value = objects.InstanceGroup(
+ group2 = objects.InstanceGroup(
policy='anti-affinity', rules={'max_server_per_host': 2},
- hosts=['host1'], members=members_uuids)
+ hosts=['host1'], members=members_uuids,
+ uuid=uuids.group)
+ mock_get_by_hint.return_value = group2
+ mock_get_by_uuid.return_value = group2
self.compute._validate_instance_group_policy(self.context,
instance, hints)
@@ -7966,6 +8002,8 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase,
else:
mock_log.warning.assert_not_called()
+ @mock.patch('nova.objects.InstanceGroup.get_by_instance_uuid', mock.Mock(
+ side_effect=exception.InstanceGroupNotFound(group_uuid='')))
def test_pre_live_migration_cinder_v3_api(self):
# This tests that pre_live_migration with a bdm with an
# attachment_id, will create a new attachment and update
@@ -8043,6 +8081,8 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase,
_test()
+ @mock.patch('nova.objects.InstanceGroup.get_by_instance_uuid', mock.Mock(
+ side_effect=exception.InstanceGroupNotFound(group_uuid='')))
def test_pre_live_migration_exception_cinder_v3_api(self):
# The instance in this test has 2 attachments. The second attach_create
# will throw an exception. This will test that the first attachment
@@ -8112,6 +8152,8 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase,
self.assertGreater(len(m.mock_calls), 0)
_test()
+ @mock.patch('nova.objects.InstanceGroup.get_by_instance_uuid', mock.Mock(
+ side_effect=exception.InstanceGroupNotFound(group_uuid='')))
def test_pre_live_migration_exceptions_delete_attachments(self):
# The instance in this test has 2 attachments. The call to
# driver.pre_live_migration will raise an exception. This will test
@@ -9297,7 +9339,7 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase,
self.compute.prep_resize,
self.context, mock.sentinel.image,
instance, flavor,
- mock.sentinel.request_spec,
+ objects.RequestSpec(),
{}, 'node', False,
migration, [])
@@ -9375,6 +9417,54 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase,
# (_error_out_instance_on_exception will set to ACTIVE by default).
self.assertEqual(vm_states.STOPPED, instance.vm_state)
+ @mock.patch('nova.compute.utils.notify_usage_exists')
+ @mock.patch('nova.compute.manager.ComputeManager.'
+ '_notify_about_instance_usage')
+ @mock.patch('nova.compute.utils.notify_about_resize_prep_instance')
+ @mock.patch('nova.objects.Instance.save')
+ @mock.patch('nova.compute.manager.ComputeManager._revert_allocation')
+ @mock.patch('nova.compute.manager.ComputeManager.'
+ '_reschedule_resize_or_reraise')
+ @mock.patch('nova.compute.utils.add_instance_fault_from_exc')
+ # this is almost copy-paste from test_prep_resize_fails_rollback
+ def test_prep_resize_fails_group_validation(
+ self, add_instance_fault_from_exc, _reschedule_resize_or_reraise,
+ _revert_allocation, mock_instance_save,
+ notify_about_resize_prep_instance, _notify_about_instance_usage,
+ notify_usage_exists):
+ """Tests that if _validate_instance_group_policy raises
+ InstanceFaultRollback, the instance.vm_state is reset properly in
+ _error_out_instance_on_exception
+ """
+ instance = fake_instance.fake_instance_obj(
+ self.context, host=self.compute.host, vm_state=vm_states.STOPPED,
+ node='fake-node', expected_attrs=['system_metadata', 'flavor'])
+ migration = mock.MagicMock(spec='nova.objects.Migration')
+ request_spec = mock.MagicMock(spec='nova.objects.RequestSpec')
+ ex = exception.RescheduledException(
+ instance_uuid=instance.uuid, reason="policy violated")
+ ex2 = exception.InstanceFaultRollback(
+ inner_exception=ex)
+
+ def fake_reschedule_resize_or_reraise(*args, **kwargs):
+ raise ex2
+
+ _reschedule_resize_or_reraise.side_effect = (
+ fake_reschedule_resize_or_reraise)
+
+ with mock.patch.object(self.compute, '_validate_instance_group_policy',
+ side_effect=ex):
+ self.assertRaises(
+ # _error_out_instance_on_exception should reraise the
+ # RescheduledException inside InstanceFaultRollback.
+ exception.RescheduledException, self.compute.prep_resize,
+ self.context, instance.image_meta, instance, instance.flavor,
+ request_spec, filter_properties={}, node=instance.node,
+ clean_shutdown=True, migration=migration, host_list=[])
+ # The instance.vm_state should remain unchanged
+ # (_error_out_instance_on_exception will set to ACTIVE by default).
+ self.assertEqual(vm_states.STOPPED, instance.vm_state)
+
def test_get_updated_nw_info_with_pci_mapping(self):
old_dev = objects.PciDevice(address='0000:04:00.2')
new_dev = objects.PciDevice(address='0000:05:00.3')
diff --git a/nova/tests/unit/db/test_db_api.py b/nova/tests/unit/db/test_db_api.py
index 608c3fd0c8..c3c04870ed 100644
--- a/nova/tests/unit/db/test_db_api.py
+++ b/nova/tests/unit/db/test_db_api.py
@@ -1957,6 +1957,14 @@ class InstanceTestCase(test.TestCase, ModelsObjectComparatorMixin):
sys_meta = utils.metadata_to_dict(inst['system_metadata'])
self.assertEqual(sys_meta, self.sample_data['system_metadata'])
+ def test_instance_get_with_meta(self):
+ inst_id = self.create_instance_with_args().id
+ inst = db.instance_get(self.ctxt, inst_id)
+ meta = utils.metadata_to_dict(inst['metadata'])
+ self.assertEqual(meta, self.sample_data['metadata'])
+ sys_meta = utils.metadata_to_dict(inst['system_metadata'])
+ self.assertEqual(sys_meta, self.sample_data['system_metadata'])
+
def test_instance_update(self):
instance = self.create_instance_with_args()
metadata = {'host': 'bar', 'key2': 'wuff'}
diff --git a/nova/tests/unit/image/test_glance.py b/nova/tests/unit/image/test_glance.py
index 865efc23a6..6d90051ea1 100644
--- a/nova/tests/unit/image/test_glance.py
+++ b/nova/tests/unit/image/test_glance.py
@@ -303,6 +303,11 @@ class TestExceptionTranslations(test.NoDBTestCase):
out_exc = glance._translate_image_exception('123', in_exc)
self.assertIsInstance(out_exc, exception.ImageNotFound)
+ def test_client_httpoverlimit_converts_to_imagequotaexceeded(self):
+ in_exc = glanceclient.exc.HTTPOverLimit('123')
+ out_exc = glance._translate_image_exception('123', in_exc)
+ self.assertIsInstance(out_exc, exception.ImageQuotaExceeded)
+
class TestGlanceSerializer(test.NoDBTestCase):
def test_serialize(self):
diff --git a/releasenotes/notes/bug-1821755-7bd03319e34b6b10.yaml b/releasenotes/notes/bug-1821755-7bd03319e34b6b10.yaml
new file mode 100644
index 0000000000..4c6135311b
--- /dev/null
+++ b/releasenotes/notes/bug-1821755-7bd03319e34b6b10.yaml
@@ -0,0 +1,11 @@
+---
+fixes:
+ - |
+ Improved detection of anti-affinity policy violation when performing live
+ and cold migrations. Most of the violations caused by race conditions due
+ to performing concurrent live or cold migrations should now be addressed
+ by extra checks in the compute service. Upon detection, cold migration
+ operations are automatically rescheduled, while live migrations have two
+ checks and will be rescheduled if detected by the first one, otherwise the
+ live migration will fail cleanly and revert the instance state back to its
+ previous value.
diff --git a/tox.ini b/tox.ini
index a6ee0d51fb..5bf028768b 100644
--- a/tox.ini
+++ b/tox.ini
@@ -61,7 +61,6 @@ commands =
bash -c "! find doc/ -type f -name *.json | xargs grep -U -n $'\r'"
# Check that all included JSON files are valid JSON
bash -c '! find doc/ -type f -name *.json | xargs -t -n1 python -m json.tool 2>&1 > /dev/null | grep -B1 -v ^python'
- bash tools/check-cherry-picks.sh
[testenv:fast8]
description =
@@ -70,6 +69,15 @@ envdir = {toxworkdir}/shared
commands =
bash tools/flake8wrap.sh -HEAD
+[testenv:validate-backport]
+description =
+ Determine whether a backport is ready to be merged by checking whether it has
+ already been merged to master or more recent stable branches.
+deps =
+skipsdist = true
+commands =
+ bash tools/check-cherry-picks.sh
+
[testenv:functional]
# TODO(melwitt): This can be removed when functional tests are gating with
# python 3.x