-rw-r--r-- | .zuul.yaml | 30
-rw-r--r-- | doc/source/admin/secure-live-migration-with-qemu-native-tls.rst | 11
-rw-r--r-- | nova/api/openstack/compute/aggregates.py | 41
-rw-r--r-- | nova/compute/manager.py | 110
-rw-r--r-- | nova/db/sqlalchemy/api.py | 25
-rw-r--r-- | nova/exception.py | 5
-rw-r--r-- | nova/image/glance.py | 2
-rw-r--r-- | nova/monkey_patch.py | 23
-rw-r--r-- | nova/network/neutronv2/api.py | 8
-rw-r--r-- | nova/tests/functional/db/test_archive.py | 15
-rwxr-xr-x | nova/tests/live_migration/hooks/ceph.sh | 1
-rwxr-xr-x | nova/tests/live_migration/hooks/run_tests.sh | 5
-rw-r--r-- | nova/tests/unit/api/openstack/compute/test_aggregates.py | 65
-rw-r--r-- | nova/tests/unit/compute/test_compute_mgr.py | 112
-rw-r--r-- | nova/tests/unit/db/test_db_api.py | 8
-rw-r--r-- | nova/tests/unit/image/test_glance.py | 5
-rw-r--r-- | releasenotes/notes/bug-1821755-7bd03319e34b6b10.yaml | 11
-rw-r--r-- | tox.ini | 10
18 files changed, 423 insertions, 64 deletions
diff --git a/.zuul.yaml b/.zuul.yaml index 971c406e18..8149f71bc1 100644 --- a/.zuul.yaml +++ b/.zuul.yaml @@ -105,6 +105,17 @@ timeout: 3600 - job: + name: nova-tox-validate-backport + parent: openstack-tox + description: | + Determine whether a backport is ready to be merged by checking whether it + has already been merged to master or more recent stable branches. + + Uses tox with the ``validate-backport`` environment. + vars: + tox_envlist: validate-backport + +- job: name: nova-cells-v1 parent: nova-dsvm-base run: playbooks/legacy/nova-cells-v1/run.yaml @@ -267,6 +278,8 @@ - nova-next - nova-tox-functional - nova-tox-functional-py35 + - nova-tox-validate-backport: + voting: false - tempest-full-py3: irrelevant-files: *dsvm-irrelevant-files - tempest-multinode-full: @@ -274,6 +287,14 @@ irrelevant-files: *dsvm-irrelevant-files - tempest-slow-py3: irrelevant-files: *dsvm-irrelevant-files + vars: + devstack_localrc: + # To workaround https://bugs.launchpad.net/neutron/+bug/1914037 + # tempest is pinned to 26.0.0 on stable/stein in devstack, so + # we cannot consume the tempest fix. + IPV6_PUBLIC_RANGE: 2001:db8:0:10::/64 + IPV6_PUBLIC_NETWORK_GATEWAY: 2001:db8:0:10::2 + IPV6_ROUTER_GW_IP: 2001:db8:0:10::1 - grenade-py3: irrelevant-files: *dsvm-irrelevant-files gate: @@ -283,10 +304,19 @@ - nova-tox-functional - nova-tox-functional-py35 - nova-next + - nova-tox-validate-backport - tempest-full-py3: irrelevant-files: *dsvm-irrelevant-files - tempest-slow-py3: irrelevant-files: *dsvm-irrelevant-files + vars: + devstack_localrc: + # To workaround https://bugs.launchpad.net/neutron/+bug/1914037 + # tempest is pinned to 26.0.0 on stable/stein in devstack, so + # we cannot consume the tempest fix. + IPV6_PUBLIC_RANGE: 2001:db8:0:10::/64 + IPV6_PUBLIC_NETWORK_GATEWAY: 2001:db8:0:10::2 + IPV6_ROUTER_GW_IP: 2001:db8:0:10::1 - grenade-py3: irrelevant-files: *dsvm-irrelevant-files experimental: diff --git a/doc/source/admin/secure-live-migration-with-qemu-native-tls.rst b/doc/source/admin/secure-live-migration-with-qemu-native-tls.rst index 012d78e93b..fb76f656af 100644 --- a/doc/source/admin/secure-live-migration-with-qemu-native-tls.rst +++ b/doc/source/admin/secure-live-migration-with-qemu-native-tls.rst @@ -120,10 +120,13 @@ Performing the migration (1) On all relevant compute nodes, enable the :oslo.config:option:`libvirt.live_migration_with_native_tls` - configuration attribute:: + configuration attribute and set the + :oslo.config:option:`libvirt.live_migration_scheme` + configuration attribute to tls:: [libvirt] live_migration_with_native_tls = true + live_migration_scheme = tls .. note:: Setting both @@ -131,6 +134,12 @@ Performing the migration :oslo.config:option:`libvirt.live_migration_tunnelled` at the same time is invalid (and disallowed). + .. note:: + Not setting + :oslo.config:option:`libvirt.live_migration_scheme` to ``tls`` + will result in libvirt using the unencrypted TCP connection + without displaying any error or a warning in the logs. 
+ And restart the ``nova-compute`` service:: $ systemctl restart openstack-nova-compute diff --git a/nova/api/openstack/compute/aggregates.py b/nova/api/openstack/compute/aggregates.py index e44b449e76..619fecfb37 100644 --- a/nova/api/openstack/compute/aggregates.py +++ b/nova/api/openstack/compute/aggregates.py @@ -28,6 +28,7 @@ from nova.compute import api as compute_api from nova import exception from nova.i18n import _ from nova.policies import aggregates as aggr_policies +from nova import utils def _get_context(req): @@ -84,11 +85,17 @@ class AggregateController(wsgi.Controller): return agg - @wsgi.expected_errors(404) + @wsgi.expected_errors((400, 404)) def show(self, req, id): """Shows the details of an aggregate, hosts and metadata included.""" context = _get_context(req) context.can(aggr_policies.POLICY_ROOT % 'show') + + try: + utils.validate_integer(id, 'id') + except exception.InvalidInput as e: + raise exc.HTTPBadRequest(explanation=e.format_message()) + try: aggregate = self.api.get_aggregate(context, id) except exception.AggregateNotFound as e: @@ -107,6 +114,11 @@ class AggregateController(wsgi.Controller): updates['name'] = common.normalize_name(updates['name']) try: + utils.validate_integer(id, 'id') + except exception.InvalidInput as e: + raise exc.HTTPBadRequest(explanation=e.format_message()) + + try: aggregate = self.api.update_aggregate(context, id, updates) except exception.AggregateNameExists as e: raise exc.HTTPConflict(explanation=e.format_message()) @@ -125,6 +137,12 @@ class AggregateController(wsgi.Controller): """Removes an aggregate by id.""" context = _get_context(req) context.can(aggr_policies.POLICY_ROOT % 'delete') + + try: + utils.validate_integer(id, 'id') + except exception.InvalidInput as e: + raise exc.HTTPBadRequest(explanation=e.format_message()) + try: self.api.delete_aggregate(context, id) except exception.AggregateNotFound as e: @@ -135,7 +153,7 @@ class AggregateController(wsgi.Controller): # NOTE(gmann): Returns 200 for backwards compatibility but should be 202 # for representing async API as this API just accepts the request and # request hypervisor driver to complete the same in async mode. - @wsgi.expected_errors((404, 409)) + @wsgi.expected_errors((400, 404, 409)) @wsgi.action('add_host') @validation.schema(aggregates.add_host) def _add_host(self, req, id, body): @@ -144,6 +162,12 @@ class AggregateController(wsgi.Controller): context = _get_context(req) context.can(aggr_policies.POLICY_ROOT % 'add_host') + + try: + utils.validate_integer(id, 'id') + except exception.InvalidInput as e: + raise exc.HTTPBadRequest(explanation=e.format_message()) + try: aggregate = self.api.add_host_to_aggregate(context, id, host) except (exception.AggregateNotFound, @@ -158,7 +182,7 @@ class AggregateController(wsgi.Controller): # NOTE(gmann): Returns 200 for backwards compatibility but should be 202 # for representing async API as this API just accepts the request and # request hypervisor driver to complete the same in async mode. 
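Every aggregates action in the hunks above (and the ones continuing below) now applies the same guard: reject a non-integer aggregate id with 400 before the request reaches the aggregates API. A minimal standalone sketch of that pattern; the helper name is illustrative, and the real code calls nova.utils.validate_integer and re-raises the resulting InvalidInput as HTTPBadRequest.

    from webob import exc


    def validate_aggregate_id(agg_id):
        # Aggregate ids are integer primary keys, so anything non-numeric
        # is a client error; rejecting it here keeps a bogus value from
        # being pushed down to the DB layer.
        try:
            return int(agg_id)
        except (TypeError, ValueError):
            raise exc.HTTPBadRequest(
                explanation="id must be an integer, got %r" % agg_id)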
- @wsgi.expected_errors((404, 409)) + @wsgi.expected_errors((400, 404, 409)) @wsgi.action('remove_host') @validation.schema(aggregates.remove_host) def _remove_host(self, req, id, body): @@ -167,6 +191,12 @@ class AggregateController(wsgi.Controller): context = _get_context(req) context.can(aggr_policies.POLICY_ROOT % 'remove_host') + + try: + utils.validate_integer(id, 'id') + except exception.InvalidInput as e: + raise exc.HTTPBadRequest(explanation=e.format_message()) + try: aggregate = self.api.remove_host_from_aggregate(context, id, host) except (exception.AggregateNotFound, exception.AggregateHostNotFound, @@ -188,6 +218,11 @@ class AggregateController(wsgi.Controller): context = _get_context(req) context.can(aggr_policies.POLICY_ROOT % 'set_metadata') + try: + utils.validate_integer(id, 'id') + except exception.InvalidInput as e: + raise exc.HTTPBadRequest(explanation=e.format_message()) + metadata = body["set_metadata"]["metadata"] try: aggregate = self.api.update_aggregate_metadata(context, diff --git a/nova/compute/manager.py b/nova/compute/manager.py index c7a0f37e92..1c29aa278f 100644 --- a/nova/compute/manager.py +++ b/nova/compute/manager.py @@ -1474,7 +1474,11 @@ class ComputeManager(manager.Manager): return [_decode(f) for f in injected_files] def _validate_instance_group_policy(self, context, instance, - scheduler_hints): + scheduler_hints=None): + + if CONF.workarounds.disable_group_policy_check_upcall: + return + # NOTE(russellb) Instance group policy is enforced by the scheduler. # However, there is a race condition with the enforcement of # the policy. Since more than one instance may be scheduled at the @@ -1483,29 +1487,63 @@ class ComputeManager(manager.Manager): # multiple instances with an affinity policy could end up on different # hosts. This is a validation step to make sure that starting the # instance here doesn't violate the policy. - group_hint = scheduler_hints.get('group') - if not group_hint: - return - - # The RequestSpec stores scheduler_hints as key=list pairs so we need - # to check the type on the value and pull the single entry out. The - # API request schema validates that the 'group' hint is a single value. - if isinstance(group_hint, list): - group_hint = group_hint[0] + if scheduler_hints is not None: + # only go through here if scheduler_hints is provided, even if it + # is empty. + group_hint = scheduler_hints.get('group') + if not group_hint: + return + else: + # The RequestSpec stores scheduler_hints as key=list pairs so + # we need to check the type on the value and pull the single + # entry out. The API request schema validates that + # the 'group' hint is a single value. 
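The manager.py hunk that begins here and continues below makes scheduler_hints optional so the same policy check can also run during migrations, where no hints are on the wire: with hints the group comes from the 'group' hint, without them it is resolved from the instance, and a missing group means there is nothing to enforce. A rough sketch of just that branch, assuming a Nova tree for the imports (the function name is illustrative, and the early return on the disable_group_policy_check_upcall workaround is omitted):

    from nova import exception
    from nova import objects


    def lookup_group(context, instance, scheduler_hints=None):
        # Boot path: the API validated a single 'group' hint, but the
        # RequestSpec stores hints as key=list pairs.
        if scheduler_hints is not None:
            group_hint = scheduler_hints.get('group')
            if not group_hint:
                return None
            if isinstance(group_hint, list):
                group_hint = group_hint[0]
            return objects.InstanceGroup.get_by_hint(context, group_hint)
        # Migration path: no hints are passed, so resolve the group from
        # the instance itself; instances outside any group are skipped.
        try:
            return objects.InstanceGroup.get_by_instance_uuid(
                context, instance.uuid)
        except exception.InstanceGroupNotFound:
            return None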
+ if isinstance(group_hint, list): + group_hint = group_hint[0] + + group = objects.InstanceGroup.get_by_hint(context, group_hint) + else: + # TODO(ganso): a call to DB can be saved by adding request_spec + # to rpcapi payload of live_migration, pre_live_migration and + # check_can_live_migrate_destination + try: + group = objects.InstanceGroup.get_by_instance_uuid( + context, instance.uuid) + except exception.InstanceGroupNotFound: + return - @utils.synchronized(group_hint) - def _do_validation(context, instance, group_hint): - group = objects.InstanceGroup.get_by_hint(context, group_hint) + @utils.synchronized(group['uuid']) + def _do_validation(context, instance, group): if group.policy and 'anti-affinity' == group.policy: + + # instances on host instances_uuids = objects.InstanceList.get_uuids_by_host( context, self.host) ins_on_host = set(instances_uuids) + + # instance param is just for logging, the nodename obtained is + # not actually related to the instance at all + nodename = self._get_nodename(instance) + + # instances being migrated to host + migrations = ( + objects.MigrationList.get_in_progress_by_host_and_node( + context, self.host, nodename)) + migration_vm_uuids = set([mig['instance_uuid'] + for mig in migrations]) + + total_instances = migration_vm_uuids | ins_on_host + + # refresh group to get updated members within locked block + group = objects.InstanceGroup.get_by_uuid(context, + group['uuid']) members = set(group.members) # Determine the set of instance group members on this host # which are not the instance in question. This is used to # determine how many other members from the same anti-affinity # group can be on this host. - members_on_host = ins_on_host & members - set([instance.uuid]) + members_on_host = (total_instances & members - + set([instance.uuid])) rules = group.rules if rules and 'max_server_per_host' in rules: max_server = rules['max_server_per_host'] @@ -1517,6 +1555,12 @@ class ComputeManager(manager.Manager): raise exception.RescheduledException( instance_uuid=instance.uuid, reason=msg) + + # NOTE(ganso): The check for affinity below does not work and it + # can easily be violated because the lock happens in different + # compute hosts. + # The only fix seems to be a DB lock to perform the check whenever + # setting the host field to an instance. elif group.policy and 'affinity' == group.policy: group_hosts = group.get_hosts(exclude=[instance.uuid]) if group_hosts and self.host not in group_hosts: @@ -1525,8 +1569,7 @@ class ComputeManager(manager.Manager): instance_uuid=instance.uuid, reason=msg) - if not CONF.workarounds.disable_group_policy_check_upcall: - _do_validation(context, instance, group_hint) + _do_validation(context, instance, group) def _log_original_error(self, exc_info, instance_uuid): LOG.error('Error: %s', exc_info[1], instance_uuid=instance_uuid, @@ -4646,10 +4689,24 @@ class ComputeManager(manager.Manager): with self._error_out_instance_on_exception( context, instance, instance_state=instance_state),\ errors_out_migration_ctxt(migration): + self._send_prep_resize_notifications( context, instance, fields.NotificationPhase.START, instance_type) try: + scheduler_hints = self._get_scheduler_hints(filter_properties, + request_spec) + # Error out if this host cannot accept the new instance due + # to anti-affinity. At this point the migration is already + # in-progress, so this is the definitive moment to abort due to + # the policy violation. Also, exploding here is covered by the + # cleanup methods in except block. 
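The anti-affinity branch above no longer counts only instances already on the host; it unions them with instances currently migrating in, refreshes the group inside the lock, and only then compares against max_server_per_host. Reduced to plain set arithmetic (a hypothetical standalone helper, not the actual method):

    def exceeds_anti_affinity(instance_uuid, group_members, on_host_uuids,
                              migrating_in_uuids, max_server_per_host=1):
        # Members already on the host or about to land there via an
        # in-progress migration all count against the limit; the instance
        # being validated is excluded from its own check.
        candidates = set(on_host_uuids) | set(migrating_in_uuids)
        members_on_host = (candidates & set(group_members)) - {instance_uuid}
        return len(members_on_host) >= int(max_server_per_host)


    # One group member on the host plus one migrating in violates the
    # default max_server_per_host=1 rule for a third member being placed.
    assert exceeds_anti_affinity('c', {'a', 'b', 'c'}, {'a'}, {'b'})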
+ try: + self._validate_instance_group_policy(context, instance, + scheduler_hints) + except exception.RescheduledException as e: + raise exception.InstanceFaultRollback(inner_exception=e) + self._prep_resize(context, image, instance, instance_type, filter_properties, node, migration, clean_shutdown) @@ -6505,6 +6562,20 @@ class ComputeManager(manager.Manager): if None, ignore disk usage checking :returns: a dict containing migration info """ + + # Error out if this host cannot accept the new instance due + # to anti-affinity. This check at this moment is not very accurate, as + # multiple requests may be happening concurrently and miss the lock, + # but when it works it provides a better user experience by failing + # earlier. Also, it should be safe to explode here, error becomes + # NoValidHost and instance status remains ACTIVE. + try: + self._validate_instance_group_policy(ctxt, instance) + except exception.RescheduledException as e: + msg = ("Failed to validate instance group policy " + "due to: {}".format(e)) + raise exception.MigrationPreCheckError(reason=msg) + src_compute_info = obj_base.obj_to_primitive( self._get_compute_info(ctxt, instance.host)) dst_compute_info = obj_base.obj_to_primitive( @@ -6567,6 +6638,13 @@ class ComputeManager(manager.Manager): """ LOG.debug('pre_live_migration data is %s', migrate_data) + # Error out if this host cannot accept the new instance due + # to anti-affinity. At this point the migration is already in-progress, + # so this is the definitive moment to abort due to the policy + # violation. Also, it should be safe to explode here. The instance + # status remains ACTIVE, migration status failed. + self._validate_instance_group_policy(context, instance) + migrate_data.old_vol_attachment_ids = {} bdms = objects.BlockDeviceMappingList.get_by_instance_uuid( context, instance.uuid) diff --git a/nova/db/sqlalchemy/api.py b/nova/db/sqlalchemy/api.py index a0535aa2a5..a4d9be0271 100644 --- a/nova/db/sqlalchemy/api.py +++ b/nova/db/sqlalchemy/api.py @@ -48,6 +48,7 @@ from sqlalchemy.orm import aliased from sqlalchemy.orm import contains_eager from sqlalchemy.orm import joinedload from sqlalchemy.orm import noload +from sqlalchemy.orm import subqueryload from sqlalchemy.orm import undefer from sqlalchemy.schema import Table from sqlalchemy import sql @@ -1936,13 +1937,27 @@ def _build_instance_get(context, columns_to_join=None): continue if 'extra.' in column: query = query.options(undefer(column)) + elif column in ['metadata', 'system_metadata']: + # NOTE(melwitt): We use subqueryload() instead of joinedload() for + # metadata and system_metadata because of the one-to-many + # relationship of the data. Directly joining these columns can + # result in a large number of additional rows being queried if an + # instance has a large number of (system_)metadata items, resulting + # in a large data transfer. Instead, the subqueryload() will + # perform additional queries to obtain metadata and system_metadata + # for the instance. + query = query.options(subqueryload(column)) else: query = query.options(joinedload(column)) # NOTE(alaski) Stop lazy loading of columns not needed. for col in ['metadata', 'system_metadata']: if col not in columns_to_join: query = query.options(noload(col)) - return query + # NOTE(melwitt): We need to use order_by(<unique column>) so that the + # additional queries emitted by subqueryload() include the same ordering as + # used by the parent query. 
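The DB API hunk above switches metadata and system_metadata loading from joinedload() to subqueryload() and, as the continuation below notes, adds an order_by() so the extra SELECTs stay aligned with the parent query. A toy SQLAlchemy 1.3-style model showing the same combination; this is not Nova's schema, just an assumed minimal one:

    from sqlalchemy import Column, ForeignKey, Integer, String, create_engine
    from sqlalchemy.ext.declarative import declarative_base
    from sqlalchemy.orm import relationship, sessionmaker, subqueryload

    Base = declarative_base()


    class Instance(Base):
        __tablename__ = 'instances'
        id = Column(Integer, primary_key=True)
        metadata_items = relationship('InstanceMetadata')


    class InstanceMetadata(Base):
        __tablename__ = 'instance_metadata'
        id = Column(Integer, primary_key=True)
        instance_id = Column(Integer, ForeignKey('instances.id'))
        key = Column(String(255))
        value = Column(String(255))


    engine = create_engine('sqlite://')
    Base.metadata.create_all(engine)
    session = sessionmaker(bind=engine)()

    # subqueryload() fetches the collection in a second SELECT instead of
    # multiplying parent rows in a JOIN, which matters when an instance has
    # many metadata items; order_by() on a unique column keeps that second
    # SELECT aligned with the parent query, per the SQLAlchemy docs.
    instances = (session.query(Instance)
                 .options(subqueryload(Instance.metadata_items))
                 .order_by(Instance.id)
                 .all())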
+ # https://docs.sqlalchemy.org/en/13/orm/loading_relationships.html#the-importance-of-ordering + return query.order_by(models.Instance.id) def _instances_fill_metadata(context, instances, manual_joins=None): @@ -5625,8 +5640,12 @@ def _archive_deleted_rows_for_table(tablename, max_rows): "%(tablename)s: %(error)s", {'tablename': tablename, 'error': six.text_type(ex)}) - if ((max_rows is None or rows_archived < max_rows) - and 'instance_uuid' in columns): + if ((max_rows is None or rows_archived < max_rows) and + # NOTE(melwitt): The pci_devices table uses the 'instance_uuid' + # column to track the allocated association of a PCI device and its + # records are not tied to the lifecycles of instance records. + (tablename != 'pci_devices' and + 'instance_uuid' in columns)): instances = models.BASE.metadata.tables['instances'] limit = max_rows - rows_archived if max_rows is not None else None extra = _archive_if_instance_deleted(table, shadow_table, instances, diff --git a/nova/exception.py b/nova/exception.py index 11d26f55fa..1679a74f80 100644 --- a/nova/exception.py +++ b/nova/exception.py @@ -613,6 +613,11 @@ class ImageBadRequest(Invalid): "%(response)s") +class ImageQuotaExceeded(NovaException): + msg_fmt = _("Quota exceeded or out of space for image %(image_id)s " + "in the image service.") + + class InstanceUnacceptable(Invalid): msg_fmt = _("Instance %(instance_id)s is unacceptable: %(reason)s") diff --git a/nova/image/glance.py b/nova/image/glance.py index 39970d0498..459699531a 100644 --- a/nova/image/glance.py +++ b/nova/image/glance.py @@ -954,6 +954,8 @@ def _translate_image_exception(image_id, exc_value): if isinstance(exc_value, glanceclient.exc.BadRequest): return exception.ImageBadRequest(image_id=image_id, response=six.text_type(exc_value)) + if isinstance(exc_value, glanceclient.exc.HTTPOverLimit): + return exception.ImageQuotaExceeded(image_id=image_id) return exc_value diff --git a/nova/monkey_patch.py b/nova/monkey_patch.py index 6d0cacd6c7..be2a70ab4b 100644 --- a/nova/monkey_patch.py +++ b/nova/monkey_patch.py @@ -22,6 +22,19 @@ import os def _monkey_patch(): + # See https://bugs.launchpad.net/nova/+bug/1164822 + # TODO(mdbooth): This feature was deprecated and removed in eventlet at + # some point but brought back in version 0.21.0, presumably because some + # users still required it to work round issues. However, there have been a + # number of greendns fixes in eventlet since then. Specifically, it looks + # as though the originally reported IPv6 issue may have been fixed in + # version 0.24.0. We should remove this when we can confirm that the + # original issue is fixed. + # NOTE(artom) eventlet processes environment variables at import-time. We + # therefore set this here, before importing eventlet, in order to correctly + # disable greendns. + os.environ['EVENTLET_NO_GREENDNS'] = 'yes' + # NOTE(mdbooth): Anything imported here will not be monkey patched. It is # important to take care not to import anything here which requires monkey # patching. @@ -39,16 +52,6 @@ def _monkey_patch(): problems = (set(['urllib3', 'oslo_context.context']) & set(sys.modules.keys())) - # See https://bugs.launchpad.net/nova/+bug/1164822 - # TODO(mdbooth): This feature was deprecated and removed in eventlet at - # some point but brought back in version 0.21.0, presumably because some - # users still required it to work round issues. However, there have been a - # number of greendns fixes in eventlet since then. 
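Two smaller changes in the blob above belong together: a new ImageQuotaExceeded exception and a translation in nova/image/glance.py so a glance over-limit response no longer leaks the raw client error. A stripped-down version of that translation, with a local stand-in for the Nova exception class:

    import glanceclient.exc


    class ImageQuotaExceeded(Exception):
        """Local stand-in for nova.exception.ImageQuotaExceeded."""


    def translate_image_exception(image_id, exc_value):
        # Glance signals image quota/space exhaustion with HTTPOverLimit;
        # mapping it to a dedicated exception lets callers report a quota
        # problem instead of an opaque client error.
        if isinstance(exc_value, glanceclient.exc.HTTPOverLimit):
            return ImageQuotaExceeded(
                "Quota exceeded or out of space for image %s in the image "
                "service." % image_id)
        return exc_value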
Specifically, it looks - # as though the originally reported IPv6 issue may have been fixed in - # version 0.24.0. We should remove this when we can confirm that the - # original issue is fixed. - os.environ['EVENTLET_NO_GREENDNS'] = 'yes' - if debugger.enabled(): # turn off thread patching to enable the remote debugger eventlet.monkey_patch(thread=False) diff --git a/nova/network/neutronv2/api.py b/nova/network/neutronv2/api.py index 8a60291910..2f4e39b1cb 100644 --- a/nova/network/neutronv2/api.py +++ b/nova/network/neutronv2/api.py @@ -828,9 +828,15 @@ class API(base_api.NetworkAPI): # TODO(arosen) Should optimize more to do direct query for security # group if len(security_groups) == 1 if len(security_groups): + # NOTE(slaweq): fields other than name and id aren't really needed + # so asking only about those fields will allow Neutron to not + # prepare list of rules for each found security group. That may + # speed processing of this request a lot in case when tenant has + # got many security groups + sg_fields = ['id', 'name'] search_opts = {'tenant_id': instance.project_id} user_security_groups = neutron.list_security_groups( - **search_opts).get('security_groups') + fields=sg_fields, **search_opts).get('security_groups') for security_group in security_groups: name_match = None diff --git a/nova/tests/functional/db/test_archive.py b/nova/tests/functional/db/test_archive.py index 432d814d91..a8b4df6f16 100644 --- a/nova/tests/functional/db/test_archive.py +++ b/nova/tests/functional/db/test_archive.py @@ -135,6 +135,19 @@ class TestDatabaseArchive(test_servers.ServersTestBase): # Verify we have some system_metadata since we'll check that later. self.assertTrue(len(instance.system_metadata), 'No system_metadata for instance: %s' % server_id) + # Create a pci_devices record to simulate an instance that had a PCI + # device allocated at the time it was deleted. There is a window of + # time between deletion of the instance record and freeing of the PCI + # device in nova-compute's _complete_deletion method during RT update. + db.pci_device_update(admin_context, 1, 'fake-address', + {'compute_node_id': 1, + 'address': 'fake-address', + 'vendor_id': 'fake', + 'product_id': 'fake', + 'dev_type': 'fake', + 'label': 'fake', + 'status': 'allocated', + 'instance_uuid': instance.uuid}) # Now try and archive the soft deleted records. 
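The monkey_patch.py hunk that finishes above exists because eventlet reads EVENTLET_NO_GREENDNS at import time, so exporting it after eventlet has already been imported (the old placement) has no effect. The ordering it relies on, in isolation:

    import os

    # Must be set before the first "import eventlet" anywhere in the
    # process; eventlet only consults the variable when it is imported.
    os.environ['EVENTLET_NO_GREENDNS'] = 'yes'

    import eventlet  # noqa: E402

    eventlet.monkey_patch()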
results, deleted_instance_uuids = db.archive_deleted_rows(max_rows=100) # verify system_metadata was dropped @@ -147,6 +160,8 @@ class TestDatabaseArchive(test_servers.ServersTestBase): # by the archive self.assertIn('instance_actions', results) self.assertIn('instance_actions_events', results) + # Verify that the pci_devices record has not been dropped + self.assertNotIn('pci_devices', results) def _get_table_counts(self): engine = sqlalchemy_api.get_engine() diff --git a/nova/tests/live_migration/hooks/ceph.sh b/nova/tests/live_migration/hooks/ceph.sh index ff94967748..2a2ec0202c 100755 --- a/nova/tests/live_migration/hooks/ceph.sh +++ b/nova/tests/live_migration/hooks/ceph.sh @@ -8,6 +8,7 @@ function prepare_ceph { configure_ceph #install ceph-common package on compute nodes $ANSIBLE subnodes --become -f 5 -i "$WORKSPACE/inventory" -m raw -a "executable=/bin/bash + export CEPH_RELEASE=nautilus source $BASE/new/devstack/functions source $BASE/new/devstack/functions-common git clone https://git.openstack.org/openstack/devstack-plugin-ceph /tmp/devstack-plugin-ceph diff --git a/nova/tests/live_migration/hooks/run_tests.sh b/nova/tests/live_migration/hooks/run_tests.sh index ff005520b8..0f7c5f0a3f 100755 --- a/nova/tests/live_migration/hooks/run_tests.sh +++ b/nova/tests/live_migration/hooks/run_tests.sh @@ -46,6 +46,11 @@ echo '2. NFS testing is skipped due to setup failures with Ubuntu 16.04' echo '3. test with Ceph for root + ephemeral disks' # Discover and set variables for the OS version so the devstack-plugin-ceph # scripts can find the correct repository to install the ceph packages. +# NOTE(lyarwood): Pin the CEPH_RELEASE to nautilus here as was the case +# prior to https://review.opendev.org/c/openstack/devstack-plugin-ceph/+/777232 +# landing in the branchless plugin, we also have to pin in ceph.sh when +# configuring ceph on a remote node via ansible. 
+export CEPH_RELEASE=nautilus GetOSVersion prepare_ceph GLANCE_API_CONF=${GLANCE_API_CONF:-/etc/glance/glance-api.conf} diff --git a/nova/tests/unit/api/openstack/compute/test_aggregates.py b/nova/tests/unit/api/openstack/compute/test_aggregates.py index eb185849a6..c14f562dc7 100644 --- a/nova/tests/unit/api/openstack/compute/test_aggregates.py +++ b/nova/tests/unit/api/openstack/compute/test_aggregates.py @@ -299,7 +299,7 @@ class AggregateTestCaseV21(test.NoDBTestCase): self.controller.show, self.user_req, "1") - def test_show_with_invalid_id(self): + def test_show_with_bad_aggregate(self): side_effect = exception.AggregateNotFound(aggregate_id='2') with mock.patch.object(self.controller.api, 'get_aggregate', side_effect=side_effect) as mock_get: @@ -307,6 +307,10 @@ class AggregateTestCaseV21(test.NoDBTestCase): self.req, "2") mock_get.assert_called_once_with(self.context, '2') + def test_show_with_invalid_id(self): + self.assertRaises(exc.HTTPBadRequest, self.controller.show, + self.req, 'foo') + def test_update(self): body = {"aggregate": {"name": "new_name", "availability_zone": "nova1"}} @@ -390,10 +394,11 @@ class AggregateTestCaseV21(test.NoDBTestCase): @mock.patch('nova.compute.api.AggregateAPI.update_aggregate') def test_update_with_none_availability_zone(self, mock_update_agg): - agg_id = uuidsentinel.aggregate + agg_id = 173 mock_update_agg.return_value = objects.Aggregate(self.context, name='test', - uuid=agg_id, + uuid=uuidsentinel.agg, + id=agg_id, hosts=[], metadata={}) body = {"aggregate": {"name": "test", @@ -414,6 +419,11 @@ class AggregateTestCaseV21(test.NoDBTestCase): mock_update.assert_called_once_with(self.context, '2', body["aggregate"]) + def test_update_with_invalid_id(self): + body = {"aggregate": {"name": "test_name"}} + self.assertRaises(exc.HTTPBadRequest, self.controller.update, + self.req, 'foo', body=body) + def test_update_with_duplicated_name(self): body = {"aggregate": {"name": "test_name"}} side_effect = exception.AggregateNameExists(aggregate_name="test_name") @@ -433,7 +443,7 @@ class AggregateTestCaseV21(test.NoDBTestCase): def test_update_with_invalid_action(self): with mock.patch.object(self.controller.api, "update_aggregate", side_effect=exception.InvalidAggregateAction( - action='invalid', aggregate_id='agg1', reason= "not empty")): + action='invalid', aggregate_id='1', reason= "not empty")): body = {"aggregate": {"availability_zone": "nova"}} self.assertRaises(exc.HTTPBadRequest, self.controller.update, self.req, "1", body=body) @@ -467,15 +477,20 @@ class AggregateTestCaseV21(test.NoDBTestCase): def test_add_host_with_bad_aggregate(self): side_effect = exception.AggregateNotFound( - aggregate_id="bogus_aggregate") + aggregate_id="2") with mock.patch.object(self.controller.api, 'add_host_to_aggregate', side_effect=side_effect) as mock_add: self.assertRaises(exc.HTTPNotFound, eval(self.add_host), - self.req, "bogus_aggregate", + self.req, "2", body={"add_host": {"host": "host1"}}) - mock_add.assert_called_once_with(self.context, "bogus_aggregate", + mock_add.assert_called_once_with(self.context, "2", "host1") + def test_add_host_with_invalid_id(self): + body = {"add_host": {"host": "host1"}} + self.assertRaises(exc.HTTPBadRequest, eval(self.add_host), + self.req, 'foo', body=body) + def test_add_host_with_bad_host(self): side_effect = exception.ComputeHostNotFound(host="bogus_host") with mock.patch.object(self.controller.api, 'add_host_to_aggregate', @@ -534,16 +549,21 @@ class AggregateTestCaseV21(test.NoDBTestCase): def 
test_remove_host_with_bad_aggregate(self): side_effect = exception.AggregateNotFound( - aggregate_id="bogus_aggregate") + aggregate_id="2") with mock.patch.object(self.controller.api, 'remove_host_from_aggregate', side_effect=side_effect) as mock_rem: self.assertRaises(exc.HTTPNotFound, eval(self.remove_host), - self.req, "bogus_aggregate", + self.req, "2", body={"remove_host": {"host": "host1"}}) - mock_rem.assert_called_once_with(self.context, "bogus_aggregate", + mock_rem.assert_called_once_with(self.context, "2", "host1") + def test_remove_host_with_invalid_id(self): + body = {"remove_host": {"host": "host1"}} + self.assertRaises(exc.HTTPBadRequest, eval(self.remove_host), + self.req, 'foo', body=body) + def test_remove_host_with_host_not_in_aggregate(self): side_effect = exception.AggregateHostNotFound(aggregate_id="1", host="host1") @@ -621,16 +641,21 @@ class AggregateTestCaseV21(test.NoDBTestCase): def test_set_metadata_with_bad_aggregate(self): body = {"set_metadata": {"metadata": {"foo": "bar"}}} - side_effect = exception.AggregateNotFound(aggregate_id="bad_aggregate") + side_effect = exception.AggregateNotFound(aggregate_id="2") with mock.patch.object(self.controller.api, 'update_aggregate_metadata', side_effect=side_effect) as mock_update: self.assertRaises(exc.HTTPNotFound, eval(self.set_metadata), - self.req, "bad_aggregate", body=body) - mock_update.assert_called_once_with(self.context, "bad_aggregate", + self.req, "2", body=body) + mock_update.assert_called_once_with(self.context, "2", body["set_metadata"]['metadata']) + def test_set_metadata_with_invalid_id(self): + body = {"set_metadata": {"metadata": {"foo": "bar"}}} + self.assertRaises(exc.HTTPBadRequest, eval(self.set_metadata), + self.req, 'foo', body=body) + def test_set_metadata_with_missing_metadata(self): body = {"asdf": {"foo": "bar"}} self.assertRaises(self.bad_request, eval(self.set_metadata), @@ -679,21 +704,25 @@ class AggregateTestCaseV21(test.NoDBTestCase): def test_delete_aggregate_with_bad_aggregate(self): side_effect = exception.AggregateNotFound( - aggregate_id="bogus_aggregate") + aggregate_id="2") with mock.patch.object(self.controller.api, 'delete_aggregate', side_effect=side_effect) as mock_del: self.assertRaises(exc.HTTPNotFound, self.controller.delete, - self.req, "bogus_aggregate") - mock_del.assert_called_once_with(self.context, "bogus_aggregate") + self.req, "2") + mock_del.assert_called_once_with(self.context, "2") + + def test_delete_with_invalid_id(self): + self.assertRaises(exc.HTTPBadRequest, self.controller.delete, + self.req, 'foo') def test_delete_aggregate_with_host(self): with mock.patch.object(self.controller.api, "delete_aggregate", side_effect=exception.InvalidAggregateAction( - action="delete", aggregate_id="agg1", + action="delete", aggregate_id="2", reason="not empty")): self.assertRaises(exc.HTTPBadRequest, self.controller.delete, - self.req, "agg1") + self.req, "2") def test_marshall_aggregate(self): # _marshall_aggregate() just basically turns the aggregate returned diff --git a/nova/tests/unit/compute/test_compute_mgr.py b/nova/tests/unit/compute/test_compute_mgr.py index 1c7b5de37a..ef048e1d68 100644 --- a/nova/tests/unit/compute/test_compute_mgr.py +++ b/nova/tests/unit/compute/test_compute_mgr.py @@ -2915,14 +2915,36 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase, self.context, 'compute_check_can_live_migrate_destination', CONF.host, instance.uuid) + @mock.patch('nova.objects.InstanceGroup.get_by_instance_uuid', mock.Mock( + 
side_effect=exception.InstanceGroupNotFound(group_uuid=''))) def test_check_can_live_migrate_destination_success(self): self._test_check_can_live_migrate_destination() + @mock.patch('nova.objects.InstanceGroup.get_by_instance_uuid', mock.Mock( + side_effect=exception.InstanceGroupNotFound(group_uuid=''))) def test_check_can_live_migrate_destination_fail(self): self.assertRaises( - test.TestingException, - self._test_check_can_live_migrate_destination, - do_raise=True) + test.TestingException, + self._test_check_can_live_migrate_destination, + do_raise=True) + + @mock.patch.object(compute_utils, 'add_instance_fault_from_exc') + def test_check_can_live_migrate_destination_fail_group_policy( + self, mock_fail_db): + + instance = fake_instance.fake_instance_obj( + self.context, host=self.compute.host, vm_state=vm_states.ACTIVE, + node='fake-node') + + ex = exception.RescheduledException( + instance_uuid=instance.uuid, reason="policy violated") + + with mock.patch.object(self.compute, '_validate_instance_group_policy', + side_effect=ex): + self.assertRaises( + exception.MigrationPreCheckError, + self.compute.check_can_live_migrate_destination, + self.context, instance, None, None) @mock.patch('nova.compute.manager.InstanceEvents._lock_name') def test_prepare_for_instance_event(self, lock_name_mock): @@ -6447,7 +6469,8 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase): def test_validate_policy_honors_workaround_disabled(self, mock_get): instance = objects.Instance(uuid=uuids.instance) hints = {'group': 'foo'} - mock_get.return_value = objects.InstanceGroup(policy=None) + mock_get.return_value = objects.InstanceGroup(policy=None, + uuid=uuids.group) self.compute._validate_instance_group_policy(self.context, instance, hints) mock_get.assert_called_once_with(self.context, 'foo') @@ -6473,10 +6496,14 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase): instance, hints) mock_get.assert_called_once_with(self.context, uuids.group_hint) + @mock.patch('nova.objects.InstanceGroup.get_by_uuid') @mock.patch('nova.objects.InstanceList.get_uuids_by_host') @mock.patch('nova.objects.InstanceGroup.get_by_hint') - def test_validate_instance_group_policy_with_rules(self, mock_get_by_hint, - mock_get_by_host): + @mock.patch.object(fake_driver.FakeDriver, 'get_available_nodes') + @mock.patch('nova.objects.MigrationList.get_in_progress_by_host_and_node') + def test_validate_instance_group_policy_with_rules( + self, migration_list, nodes, mock_get_by_hint, mock_get_by_host, + mock_get_by_uuid): # Create 2 instance in same host, inst2 created before inst1 instance = objects.Instance(uuid=uuids.inst1) hints = {'group': [uuids.group_hint]} @@ -6485,17 +6512,26 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase): mock_get_by_host.return_value = existing_insts # if group policy rules limit to 1, raise RescheduledException - mock_get_by_hint.return_value = objects.InstanceGroup( + group = objects.InstanceGroup( policy='anti-affinity', rules={'max_server_per_host': '1'}, - hosts=['host1'], members=members_uuids) + hosts=['host1'], members=members_uuids, + uuid=uuids.group) + mock_get_by_hint.return_value = group + mock_get_by_uuid.return_value = group + nodes.return_value = ['nodename'] + migration_list.return_value = [objects.Migration( + uuid=uuids.migration, instance_uuid=uuids.instance)] self.assertRaises(exception.RescheduledException, self.compute._validate_instance_group_policy, self.context, instance, hints) # if group policy rules limit change to 2, validate OK - 
mock_get_by_hint.return_value = objects.InstanceGroup( + group2 = objects.InstanceGroup( policy='anti-affinity', rules={'max_server_per_host': 2}, - hosts=['host1'], members=members_uuids) + hosts=['host1'], members=members_uuids, + uuid=uuids.group) + mock_get_by_hint.return_value = group2 + mock_get_by_uuid.return_value = group2 self.compute._validate_instance_group_policy(self.context, instance, hints) @@ -7966,6 +8002,8 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase, else: mock_log.warning.assert_not_called() + @mock.patch('nova.objects.InstanceGroup.get_by_instance_uuid', mock.Mock( + side_effect=exception.InstanceGroupNotFound(group_uuid=''))) def test_pre_live_migration_cinder_v3_api(self): # This tests that pre_live_migration with a bdm with an # attachment_id, will create a new attachment and update @@ -8043,6 +8081,8 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase, _test() + @mock.patch('nova.objects.InstanceGroup.get_by_instance_uuid', mock.Mock( + side_effect=exception.InstanceGroupNotFound(group_uuid=''))) def test_pre_live_migration_exception_cinder_v3_api(self): # The instance in this test has 2 attachments. The second attach_create # will throw an exception. This will test that the first attachment @@ -8112,6 +8152,8 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase, self.assertGreater(len(m.mock_calls), 0) _test() + @mock.patch('nova.objects.InstanceGroup.get_by_instance_uuid', mock.Mock( + side_effect=exception.InstanceGroupNotFound(group_uuid=''))) def test_pre_live_migration_exceptions_delete_attachments(self): # The instance in this test has 2 attachments. The call to # driver.pre_live_migration will raise an exception. This will test @@ -9297,7 +9339,7 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase, self.compute.prep_resize, self.context, mock.sentinel.image, instance, flavor, - mock.sentinel.request_spec, + objects.RequestSpec(), {}, 'node', False, migration, []) @@ -9375,6 +9417,54 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase, # (_error_out_instance_on_exception will set to ACTIVE by default). self.assertEqual(vm_states.STOPPED, instance.vm_state) + @mock.patch('nova.compute.utils.notify_usage_exists') + @mock.patch('nova.compute.manager.ComputeManager.' + '_notify_about_instance_usage') + @mock.patch('nova.compute.utils.notify_about_resize_prep_instance') + @mock.patch('nova.objects.Instance.save') + @mock.patch('nova.compute.manager.ComputeManager._revert_allocation') + @mock.patch('nova.compute.manager.ComputeManager.' 
+ '_reschedule_resize_or_reraise') + @mock.patch('nova.compute.utils.add_instance_fault_from_exc') + # this is almost copy-paste from test_prep_resize_fails_rollback + def test_prep_resize_fails_group_validation( + self, add_instance_fault_from_exc, _reschedule_resize_or_reraise, + _revert_allocation, mock_instance_save, + notify_about_resize_prep_instance, _notify_about_instance_usage, + notify_usage_exists): + """Tests that if _validate_instance_group_policy raises + InstanceFaultRollback, the instance.vm_state is reset properly in + _error_out_instance_on_exception + """ + instance = fake_instance.fake_instance_obj( + self.context, host=self.compute.host, vm_state=vm_states.STOPPED, + node='fake-node', expected_attrs=['system_metadata', 'flavor']) + migration = mock.MagicMock(spec='nova.objects.Migration') + request_spec = mock.MagicMock(spec='nova.objects.RequestSpec') + ex = exception.RescheduledException( + instance_uuid=instance.uuid, reason="policy violated") + ex2 = exception.InstanceFaultRollback( + inner_exception=ex) + + def fake_reschedule_resize_or_reraise(*args, **kwargs): + raise ex2 + + _reschedule_resize_or_reraise.side_effect = ( + fake_reschedule_resize_or_reraise) + + with mock.patch.object(self.compute, '_validate_instance_group_policy', + side_effect=ex): + self.assertRaises( + # _error_out_instance_on_exception should reraise the + # RescheduledException inside InstanceFaultRollback. + exception.RescheduledException, self.compute.prep_resize, + self.context, instance.image_meta, instance, instance.flavor, + request_spec, filter_properties={}, node=instance.node, + clean_shutdown=True, migration=migration, host_list=[]) + # The instance.vm_state should remain unchanged + # (_error_out_instance_on_exception will set to ACTIVE by default). 
+ self.assertEqual(vm_states.STOPPED, instance.vm_state) + def test_get_updated_nw_info_with_pci_mapping(self): old_dev = objects.PciDevice(address='0000:04:00.2') new_dev = objects.PciDevice(address='0000:05:00.3') diff --git a/nova/tests/unit/db/test_db_api.py b/nova/tests/unit/db/test_db_api.py index 608c3fd0c8..c3c04870ed 100644 --- a/nova/tests/unit/db/test_db_api.py +++ b/nova/tests/unit/db/test_db_api.py @@ -1957,6 +1957,14 @@ class InstanceTestCase(test.TestCase, ModelsObjectComparatorMixin): sys_meta = utils.metadata_to_dict(inst['system_metadata']) self.assertEqual(sys_meta, self.sample_data['system_metadata']) + def test_instance_get_with_meta(self): + inst_id = self.create_instance_with_args().id + inst = db.instance_get(self.ctxt, inst_id) + meta = utils.metadata_to_dict(inst['metadata']) + self.assertEqual(meta, self.sample_data['metadata']) + sys_meta = utils.metadata_to_dict(inst['system_metadata']) + self.assertEqual(sys_meta, self.sample_data['system_metadata']) + def test_instance_update(self): instance = self.create_instance_with_args() metadata = {'host': 'bar', 'key2': 'wuff'} diff --git a/nova/tests/unit/image/test_glance.py b/nova/tests/unit/image/test_glance.py index 865efc23a6..6d90051ea1 100644 --- a/nova/tests/unit/image/test_glance.py +++ b/nova/tests/unit/image/test_glance.py @@ -303,6 +303,11 @@ class TestExceptionTranslations(test.NoDBTestCase): out_exc = glance._translate_image_exception('123', in_exc) self.assertIsInstance(out_exc, exception.ImageNotFound) + def test_client_httpoverlimit_converts_to_imagequotaexceeded(self): + in_exc = glanceclient.exc.HTTPOverLimit('123') + out_exc = glance._translate_image_exception('123', in_exc) + self.assertIsInstance(out_exc, exception.ImageQuotaExceeded) + class TestGlanceSerializer(test.NoDBTestCase): def test_serialize(self): diff --git a/releasenotes/notes/bug-1821755-7bd03319e34b6b10.yaml b/releasenotes/notes/bug-1821755-7bd03319e34b6b10.yaml new file mode 100644 index 0000000000..4c6135311b --- /dev/null +++ b/releasenotes/notes/bug-1821755-7bd03319e34b6b10.yaml @@ -0,0 +1,11 @@ +--- +fixes: + - | + Improved detection of anti-affinity policy violation when performing live + and cold migrations. Most of the violations caused by race conditions due + to performing concurrent live or cold migrations should now be addressed + by extra checks in the compute service. Upon detection, cold migration + operations are automatically rescheduled, while live migrations have two + checks and will be rescheduled if detected by the first one, otherwise the + live migration will fail cleanly and revert the instance state back to its + previous value. @@ -61,7 +61,6 @@ commands = bash -c "! find doc/ -type f -name *.json | xargs grep -U -n $'\r'" # Check that all included JSON files are valid JSON bash -c '! find doc/ -type f -name *.json | xargs -t -n1 python -m json.tool 2>&1 > /dev/null | grep -B1 -v ^python' - bash tools/check-cherry-picks.sh [testenv:fast8] description = @@ -70,6 +69,15 @@ envdir = {toxworkdir}/shared commands = bash tools/flake8wrap.sh -HEAD +[testenv:validate-backport] +description = + Determine whether a backport is ready to be merged by checking whether it has + already been merged to master or more recent stable branches. +deps = +skipsdist = true +commands = + bash tools/check-cherry-picks.sh + [testenv:functional] # TODO(melwitt): This can be removed when functional tests are gating with # python 3.x |
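The new validate-backport tox environment mirrors the non-voting Zuul job added in .zuul.yaml: it runs tools/check-cherry-picks.sh, which is removed from the linting commands above, so it can presumably be run on its own with "tox -e validate-backport" before pushing a stable-branch backport.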