Diffstat (limited to 'nova/compute/resource_tracker.py')
-rw-r--r-- | nova/compute/resource_tracker.py | 283 |
1 files changed, 185 insertions, 98 deletions
diff --git a/nova/compute/resource_tracker.py b/nova/compute/resource_tracker.py
index 1c492bcb27..9ee6670c17 100644
--- a/nova/compute/resource_tracker.py
+++ b/nova/compute/resource_tracker.py
@@ -30,6 +30,7 @@ import retrying

 from nova.compute import claims
 from nova.compute import monitors
+from nova.compute import pci_placement_translator
 from nova.compute import provider_config
 from nova.compute import stats as compute_stats
 from nova.compute import task_states
@@ -48,6 +49,7 @@ from nova import rpc
 from nova.scheduler.client import report
 from nova import utils
 from nova.virt import hardware
+from nova.virt import node

 CONF = nova.conf.CONF

@@ -103,7 +105,7 @@ class ResourceTracker(object):
         monitor_handler = monitors.MonitorHandler(self)
         self.monitors = monitor_handler.monitors
         self.old_resources = collections.defaultdict(objects.ComputeNode)
-        self.reportclient = reportclient or report.SchedulerReportClient()
+        self.reportclient = reportclient or report.report_client_singleton()
         self.ram_allocation_ratio = CONF.ram_allocation_ratio
         self.cpu_allocation_ratio = CONF.cpu_allocation_ratio
         self.disk_allocation_ratio = CONF.disk_allocation_ratio
@@ -144,16 +146,20 @@ class ResourceTracker(object):
             during the instance build.
         """
         if self.disabled(nodename):
-            # instance_claim() was called before update_available_resource()
-            # (which ensures that a compute node exists for nodename). We
-            # shouldn't get here but in case we do, just set the instance's
-            # host and nodename attribute (probably incorrect) and return a
-            # NoopClaim.
-            # TODO(jaypipes): Remove all the disabled junk from the resource
-            # tracker. Servicegroup API-level active-checking belongs in the
-            # nova-compute manager.
-            self._set_instance_host_and_node(instance, nodename)
-            return claims.NopClaim()
+            # If we get here, it means we are trying to claim for an instance
+            # that was scheduled to a node that we do not have in our list,
+            # or is in some other way unmanageable by this node. This would
+            # mean that we are unable to account for resources, create
+            # allocations in placement, or do any of the other accounting
+            # necessary for this to work. In the past, this situation was
+            # effectively ignored silently, but in a world where we track
+            # resources with placement and instance assignment to compute nodes
+            # by service, we can no longer be leaky.
+            raise exception.ComputeResourcesUnavailable(
+                ('Attempt to claim resources for instance %(inst)s '
+                 'on unknown node %(node)s failed') % {
+                     'inst': instance.uuid,
+                     'node': nodename})

         # sanity checks:
         if instance.host:
@@ -278,9 +284,17 @@ class ResourceTracker(object):
             context, instance, new_flavor, nodename, move_type)

         if self.disabled(nodename):
-            # compute_driver doesn't support resource tracking, just
-            # generate the migration record and continue the resize:
-            return claims.NopClaim(migration=migration)
+            # This means we were asked to accept an incoming migration to a
+            # node that we do not own or track. We really should not get here,
+            # but if we do, we must refuse to continue with the migration
+            # process, since we cannot account for those resources, create
+            # allocations in placement, etc. This has been a silent resource
+            # leak in the past, but it must be a hard failure now.
+            raise exception.ComputeResourcesUnavailable(
+                ('Attempt to claim move resources for instance %(inst)s on '
+                 'unknown node %(node)s failed') % {
+                     'inst': instance.uuid,
+                     'node': nodename})

         cn = self.compute_nodes[nodename]

@@ -328,7 +342,12 @@ class ResourceTracker(object):
             migration_id=migration.id,
             old_numa_topology=instance.numa_topology,
             new_numa_topology=claim.claimed_numa_topology,
-            old_pci_devices=instance.pci_devices,
+            # NOTE(gibi): the _update_usage_from_migration call below appends
+            # the newly claimed pci devices to the instance.pci_devices list
+            # to keep the migration context independent we need to make a copy
+            # of that list here. We need a deep copy as we need to duplicate
+            # the instance.pci_devices.objects list
+            old_pci_devices=copy.deepcopy(instance.pci_devices),
             new_pci_devices=claimed_pci_devices,
             old_pci_requests=instance.pci_requests,
             new_pci_requests=new_pci_requests,
@@ -613,18 +632,11 @@ class ResourceTracker(object):
         :param prefix: Prefix to use when accessing migration context
             attributes. 'old_' or 'new_', with 'new_' being the default.
         """
-        # Remove usage for an instance that is tracked in migrations, such as
-        # on the dest node during revert resize.
-        if instance['uuid'] in self.tracked_migrations:
-            migration = self.tracked_migrations.pop(instance['uuid'])
+        if instance["uuid"] in self.tracked_migrations:
             if not flavor:
-                flavor = self._get_flavor(instance, prefix, migration)
-        # Remove usage for an instance that is not tracked in migrations (such
-        # as on the source node after a migration).
-        # NOTE(lbeliveau): On resize on the same node, the instance is
-        # included in both tracked_migrations and tracked_instances.
-        elif instance['uuid'] in self.tracked_instances:
-            self.tracked_instances.remove(instance['uuid'])
+                flavor = self._get_flavor(
+                    instance, prefix, self.tracked_migrations[instance["uuid"]]
+                )

         if flavor is not None:
             numa_topology = self._get_migration_context_resource(
@@ -640,6 +652,15 @@ class ResourceTracker(object):
             ctxt = context.elevated()
             self._update(ctxt, self.compute_nodes[nodename])

+        # Remove usage for an instance that is tracked in migrations, such as
+        # on the dest node during revert resize.
+        self.tracked_migrations.pop(instance['uuid'], None)
+        # Remove usage for an instance that is not tracked in migrations (such
+        # as on the source node after a migration).
+        # NOTE(lbeliveau): On resize on the same node, the instance is
+        # included in both tracked_migrations and tracked_instances.
+        self.tracked_instances.discard(instance['uuid'])
+
     @utils.synchronized(COMPUTE_RESOURCE_SEMAPHORE, fair=True)
     def update_usage(self, context, instance, nodename):
         """Update the resource usage and stats after a change in an
@@ -660,50 +681,6 @@ class ResourceTracker(object):
         return (nodename not in self.compute_nodes or
                 not self.driver.node_is_available(nodename))

-    def _check_for_nodes_rebalance(self, context, resources, nodename):
-        """Check if nodes rebalance has happened.
-
-        The ironic driver maintains a hash ring mapping bare metal nodes
-        to compute nodes. If a compute dies, the hash ring is rebuilt, and
-        some of its bare metal nodes (more precisely, those not in ACTIVE
-        state) are assigned to other computes.
-
-        This method checks for this condition and adjusts the database
-        accordingly.
-
-        :param context: security context
-        :param resources: initial values
-        :param nodename: node name
-        :returns: True if a suitable compute node record was found, else False
-        """
-        if not self.driver.rebalances_nodes:
-            return False
-
-        # Its possible ironic just did a node re-balance, so let's
-        # check if there is a compute node that already has the correct
-        # hypervisor_hostname. We can re-use that rather than create a
-        # new one and have to move existing placement allocations
-        cn_candidates = objects.ComputeNodeList.get_by_hypervisor(
-            context, nodename)
-
-        if len(cn_candidates) == 1:
-            cn = cn_candidates[0]
-            LOG.info("ComputeNode %(name)s moving from %(old)s to %(new)s",
-                     {"name": nodename, "old": cn.host, "new": self.host})
-            cn.host = self.host
-            self.compute_nodes[nodename] = cn
-            self._copy_resources(cn, resources)
-            self._setup_pci_tracker(context, cn, resources)
-            self._update(context, cn)
-            return True
-        elif len(cn_candidates) > 1:
-            LOG.error(
-                "Found more than one ComputeNode for nodename %s. "
-                "Please clean up the orphaned ComputeNode records in your DB.",
-                nodename)
-
-        return False
-
     def _init_compute_node(self, context, resources):
         """Initialize the compute node if it does not already exist.

@@ -721,6 +698,7 @@ class ResourceTracker(object):
             False otherwise
         """
         nodename = resources['hypervisor_hostname']
+        node_uuid = resources['uuid']

         # if there is already a compute node just use resources
         # to initialize
@@ -732,23 +710,43 @@ class ResourceTracker(object):

         # now try to get the compute node record from the
         # database. If we get one we use resources to initialize
-        cn = self._get_compute_node(context, nodename)
+
+        # We use read_deleted=True so that we will find and recover a deleted
+        # node object, if necessary.
+        with utils.temporary_mutation(context, read_deleted='yes'):
+            cn = self._get_compute_node(context, node_uuid)
+            if cn and cn.deleted:
+                # Undelete and save this right now so that everything below
+                # can continue without read_deleted=yes
+                LOG.info('Undeleting compute node %s', cn.uuid)
+                cn.deleted = False
+                cn.deleted_at = None
+                cn.save()
         if cn:
+            if cn.host != self.host:
+                LOG.info("ComputeNode %(name)s moving from %(old)s to %(new)s",
+                         {"name": nodename, "old": cn.host, "new": self.host})
+                cn.host = self.host
+                self._update(context, cn)
+
             self.compute_nodes[nodename] = cn
             self._copy_resources(cn, resources)
             self._setup_pci_tracker(context, cn, resources)
             return False

-        if self._check_for_nodes_rebalance(context, resources, nodename):
-            return False
-
         # there was no local copy and none in the database
         # so we need to create a new compute node. This needs
         # to be initialized with resource values.
         cn = objects.ComputeNode(context)
         cn.host = self.host
         self._copy_resources(cn, resources, initial=True)
-        cn.create()
+        try:
+            cn.create()
+        except exception.DuplicateRecord:
+            raise exception.InvalidConfiguration(
+                'Duplicate compute node record found for host %s node %s' % (
+                    cn.host, cn.hypervisor_hostname))
+
         # Only map the ComputeNode into compute_nodes if create() was OK
         # because if create() fails, on the next run through here nodename
         # would be in compute_nodes and we won't try to create again (because
@@ -881,6 +879,14 @@ class ResourceTracker(object):
         # contains a non-None value, even for non-Ironic nova-compute hosts. It
         # is this value that will be populated in the compute_nodes table.
         resources['host_ip'] = CONF.my_ip
+        if 'uuid' not in resources:
+            # NOTE(danms): Any driver that does not provide a uuid per
+            # node gets the locally-persistent compute_id. Only ironic
+            # should be setting the per-node uuid (and returning
+            # multiple nodes in general). If this is the first time we
+            # are creating a compute node on this host, we will
+            # generate and persist this uuid for the future.
+            resources['uuid'] = node.get_local_node_uuid()

         # We want the 'cpu_info' to be None from the POV of the
         # virt driver, but the DB requires it to be non-null so
@@ -985,8 +991,6 @@ class ResourceTracker(object):
             # notified when instances are deleted, we need remove all usages
             # from deleted instances.
             self.pci_tracker.clean_usage(instances, migrations)
-            dev_pools_obj = self.pci_tracker.stats.to_device_pools_obj()
-            cn.pci_device_pools = dev_pools_obj

         self._report_final_resource_view(nodename)

@@ -1008,14 +1012,13 @@ class ResourceTracker(object):
         if startup:
             self._check_resources(context)

-    def _get_compute_node(self, context, nodename):
+    def _get_compute_node(self, context, node_uuid):
         """Returns compute node for the host and nodename."""
         try:
-            return objects.ComputeNode.get_by_host_and_nodename(
-                context, self.host, nodename)
+            return objects.ComputeNode.get_by_uuid(context, node_uuid)
         except exception.NotFound:
             LOG.warning("No compute node record for %(host)s:%(node)s",
-                        {'host': self.host, 'node': nodename})
+                        {'host': self.host, 'node': node_uuid})

     def _report_hypervisor_resource_view(self, resources):
         """Log the hypervisor's view of free resources.
@@ -1126,6 +1129,28 @@ class ResourceTracker(object):
             LOG.error('Unable to find services table record for nova-compute '
                       'host %s', self.host)

+    def _should_expose_remote_managed_ports_trait(self,
+                                                   is_supported: bool):
+        """Determine whether COMPUTE_REMOTE_MANAGED_PORTS should be exposed.
+
+        Determines if the COMPUTE_REMOTE_MANAGED_PORTS trait needs to be
+        exposed based on the respective compute driver capability and
+        the presence of remote managed devices on a given host. Whether such
+        devices are present or not depends on the Whitelist configuration
+        (presence of a remote_managed tag association with some PCI devices)
+        and their physical presence (plugged in, enumerated by the OS).
+
+        The aim of having this check is to optimize host lookup by prefiltering
+        hosts that have compute driver support but no hardware. The check
+        does not consider free device count - just the presence of device
+        pools since device availability may change between a prefilter check
+        and a later check in PciPassthroughFilter.
+
+        :param bool is_supported: Is the trait supported by the compute driver
+        """
+        return (is_supported and
+                self.pci_tracker.pci_stats.has_remote_managed_device_pools())
+
     def _get_traits(self, context, nodename, provider_tree):
         """Synchronizes internal and external traits for the node provider.

@@ -1149,7 +1174,11 @@ class ResourceTracker(object):
         # traits that are missing, and remove any existing set traits
         # that are not currently supported.
         for trait, supported in self.driver.capabilities_as_traits().items():
-            if supported:
+            add_trait = supported
+            if trait == os_traits.COMPUTE_REMOTE_MANAGED_PORTS:
+                add_trait &= self._should_expose_remote_managed_ports_trait(
+                    supported)
+            if add_trait:
                 traits.add(trait)
             elif trait in traits:
                 traits.remove(trait)
@@ -1163,9 +1192,16 @@ class ResourceTracker(object):

         return list(traits)

-    @retrying.retry(stop_max_attempt_number=4,
-                    retry_on_exception=lambda e: isinstance(
-                        e, exception.ResourceProviderUpdateConflict))
+    @retrying.retry(
+        stop_max_attempt_number=4,
+        retry_on_exception=lambda e: isinstance(
+            e,
+            (
+                exception.ResourceProviderUpdateConflict,
+                exception.PlacementReshapeConflict,
+            ),
+        ),
+    )
     def _update_to_placement(self, context, compute_node, startup):
         """Send resource and inventory changes to placement."""
         # NOTE(jianghuaw): Some resources(e.g. VGPU) are not saved in the
@@ -1185,7 +1221,9 @@ class ResourceTracker(object):
             context, compute_node.uuid, name=compute_node.hypervisor_hostname)
         # Let the virt driver rearrange the provider tree and set/update
         # the inventory, traits, and aggregates throughout.
-        allocs = None
+        allocs = self.reportclient.get_allocations_for_provider_tree(
+            context, nodename)
+        driver_reshaped = False
         try:
             self.driver.update_provider_tree(prov_tree, nodename)
         except exception.ReshapeNeeded:
@@ -1196,10 +1234,9 @@ class ResourceTracker(object):
             LOG.info("Performing resource provider inventory and "
                      "allocation data migration during compute service "
                      "startup or fast-forward upgrade.")
-            allocs = self.reportclient.get_allocations_for_provider_tree(
-                context, nodename)
-            self.driver.update_provider_tree(prov_tree, nodename,
-                                             allocations=allocs)
+            self.driver.update_provider_tree(
+                prov_tree, nodename, allocations=allocs)
+            driver_reshaped = True

         # Inject driver capabilities traits into the provider
         # tree. We need to determine the traits that the virt
@@ -1220,25 +1257,77 @@ class ResourceTracker(object):
             context, nodename, provider_tree=prov_tree)
         prov_tree.update_traits(nodename, traits)

+        instances_under_same_host_resize = [
+            migration.instance_uuid
+            for migration in self.tracked_migrations.values()
+            if migration.is_same_host_resize
+        ]
+        # NOTE(gibi): Tracking PCI in placement is different from other
+        # resources.
+        #
+        # While driver.update_provider_tree is used to let the virt driver
+        # create any kind of placement model for a resource the PCI data
+        # modelling is done virt driver independently by the PCI tracker.
+        # So the placement reporting needs to be also done here in the resource
+        # tracker independently of the virt driver.
+        #
+        # Additionally, when PCI tracking in placement was introduced there was
+        # already PCI allocations in nova. So both the PCI inventories and
+        # allocations needs to be healed. Moreover, to support rolling upgrade
+        # the placement prefilter for PCI devices was not turned on by default
+        # at the first release of this feature. Therefore, there could be new
+        # PCI allocation without placement being involved until the prefilter
+        # is enabled. So we need to be ready to heal PCI allocations at
+        # every call not just at startup.
+        pci_reshaped = pci_placement_translator.update_provider_tree_for_pci(
+            prov_tree,
+            nodename,
+            self.pci_tracker,
+            allocs,
+            instances_under_same_host_resize,
+        )
+
         self.provider_tree = prov_tree

         # This merges in changes from the provider config files loaded in init
         self._merge_provider_configs(self.provider_configs, prov_tree)

-        # Flush any changes. If we processed ReshapeNeeded above, allocs is not
-        # None, and this will hit placement's POST /reshaper route.
-        self.reportclient.update_from_provider_tree(context, prov_tree,
-                                                    allocations=allocs)
+        try:
+            # Flush any changes. If we either processed ReshapeNeeded above or
+            # update_provider_tree_for_pci did reshape, then we need to pass
+            # allocs to update_from_provider_tree to hit placement's POST
+            # /reshaper route.
+            self.reportclient.update_from_provider_tree(
+                context,
+                prov_tree,
+                allocations=allocs if driver_reshaped or pci_reshaped else None
+            )
+        except exception.InventoryInUse as e:
+            # This means an inventory reconfiguration (e.g.: removing a parent
+            # PF and adding a VF under that parent) was not possible due to
+            # existing allocations. Translate the exception to prevent the
+            # compute service to start
+            raise exception.PlacementPciException(error=str(e))

     def _update(self, context, compute_node, startup=False):
         """Update partial stats locally and populate them to Scheduler."""
+
+        self._update_to_placement(context, compute_node, startup)
+
+        if self.pci_tracker:
+            # sync PCI device pool state stored in the compute node with
+            # the actual state from the PCI tracker as we commit changes in
+            # the DB and in the PCI tracker below
+            dev_pools_obj = self.pci_tracker.stats.to_device_pools_obj()
+            compute_node.pci_device_pools = dev_pools_obj
+
         # _resource_change will update self.old_resources if it detects changes
         # but we want to restore those if compute_node.save() fails.
         nodename = compute_node.hypervisor_hostname
         old_compute = self.old_resources[nodename]
         if self._resource_change(compute_node):
             # If the compute_node's resource changed, update to DB. Note that
-            # _update_to_placement below does not supersede the need to do this
+            # _update_to_placement above does not supersede the need to do this
             # because there are stats-related fields in the ComputeNode object
             # which could have changed and still need to be reported to the
             # scheduler filters/weighers (which could be out of tree as well).
@@ -1251,8 +1340,6 @@ class ResourceTracker(object):
                 with excutils.save_and_reraise_exception(logger=LOG):
                     self.old_resources[nodename] = old_compute

-        self._update_to_placement(context, compute_node, startup)
-
         if self.pci_tracker:
             self.pci_tracker.save(context)

@@ -1825,7 +1912,7 @@ class ResourceTracker(object):
                     raise ValueError(_(
                         "Provider config '%(source_file_name)s' attempts "
                         "to define a trait that is owned by the "
-                        "virt driver or specified via the placment api. "
+                        "virt driver or specified via the placement api. "
                         "Invalid traits '%(invalid)s' must be removed "
                         "from '%(source_file_name)s'.") % {
                             'source_file_name': source_file_name,
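
A note on the reportclient change near the top of the diff: report.report_client_singleton() replaces a per-ResourceTracker report.SchedulerReportClient(). The sketch below is illustrative only and is not the Nova implementation (which lives in nova/scheduler/client/report.py); it just shows the general shape of a lazily created, process-wide singleton accessor.

# Illustrative sketch only: a simplified stand-in for a module-level
# singleton accessor such as report.report_client_singleton().
import threading


class SchedulerReportClient:
    pass


_CLIENT = None
_CLIENT_LOCK = threading.Lock()


def report_client_singleton():
    """Return the process-wide report client, creating it on first use."""
    global _CLIENT
    if _CLIENT is None:
        with _CLIENT_LOCK:
            if _CLIENT is None:
                _CLIENT = SchedulerReportClient()
    return _CLIENT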
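
On the old_pci_devices=copy.deepcopy(instance.pci_devices) change in resize_claim: the NOTE(gibi) comment in that hunk explains that _update_usage_from_migration later appends newly claimed devices to instance.pci_devices. A minimal sketch, using a made-up FakeDevice class instead of Nova's PciDevice objects, of why a shared reference would leak those appends into the migration context's "old" list:

# Illustrative sketch only: deep copy keeps the "old" device list independent.
import copy


class FakeDevice:
    def __init__(self, address):
        self.address = address


instance_pci_devices = [FakeDevice('0000:81:00.1')]

old_by_reference = instance_pci_devices                  # shared list
old_by_deepcopy = copy.deepcopy(instance_pci_devices)    # independent objects

instance_pci_devices.append(FakeDevice('0000:81:00.2'))  # newly claimed device

assert len(old_by_reference) == 2   # the shared "old" view leaked the append
assert len(old_by_deepcopy) == 1    # the copied "old" view stays unchanged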
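
On tracked_instances.discard(instance['uuid']) in the _update_usage_from_instance cleanup: unlike remove(), discard() is a no-op when the element is absent, which is what makes the new unconditional cleanup safe. A tiny illustration (the uuid value is made up):

# Illustrative sketch only: set.discard() versus set.remove().
tracked_instances = {'11111111-2222-3333-4444-555555555555'}

tracked_instances.discard('unknown-uuid')    # absent element: silently ignored
try:
    tracked_instances.remove('unknown-uuid')
except KeyError:
    print('remove() raises when the element is absent')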
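
On utils.temporary_mutation(context, read_deleted='yes') in _init_compute_node: the helper temporarily flips an attribute on the request context so a soft-deleted ComputeNode record can be found and undeleted. The sketch below is a simplified stand-in for that pattern, not Nova's actual helper in nova/utils.py.

# Illustrative sketch only: temporarily set attributes, then restore them.
import contextlib


@contextlib.contextmanager
def temporary_mutation(obj, **kwargs):
    """Set the given attributes on obj for the duration of the block."""
    sentinel = object()
    saved = {name: getattr(obj, name, sentinel) for name in kwargs}
    try:
        for name, value in kwargs.items():
            setattr(obj, name, value)
        yield
    finally:
        for name, old in saved.items():
            if old is sentinel:
                delattr(obj, name)      # attribute did not exist before
            else:
                setattr(obj, name, old)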
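
On the expanded @retrying.retry decorator around _update_to_placement: retry_on_exception is called with each raised exception and decides whether another attempt is made, up to stop_max_attempt_number attempts in total. A self-contained sketch with a stand-in Conflict exception (assumes the retrying package is installed):

# Illustrative sketch only: retry on a specific exception type.
import retrying


class Conflict(Exception):
    pass


attempts = {'count': 0}


@retrying.retry(
    stop_max_attempt_number=4,
    retry_on_exception=lambda e: isinstance(e, Conflict),
)
def flaky_update():
    attempts['count'] += 1
    if attempts['count'] < 3:
        raise Conflict('provider generation changed underneath us')
    return 'updated'


print(flaky_update())     # succeeds on the third attempt
print(attempts['count'])  # 3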