diff options
Diffstat (limited to 'ironic/conductor')
-rw-r--r--  ironic/conductor/allocations.py  |  2
-rw-r--r--  ironic/conductor/base_manager.py | 17
-rw-r--r--  ironic/conductor/cleaning.py     | 10
-rw-r--r--  ironic/conductor/deployments.py  | 25
-rw-r--r--  ironic/conductor/manager.py      | 41
-rw-r--r--  ironic/conductor/rpcapi.py       |  3
-rw-r--r--  ironic/conductor/steps.py        | 65
-rw-r--r--  ironic/conductor/utils.py        | 34
8 files changed, 126 insertions, 71 deletions
diff --git a/ironic/conductor/allocations.py b/ironic/conductor/allocations.py index d1492bf38..d8fc38a5e 100644 --- a/ironic/conductor/allocations.py +++ b/ironic/conductor/allocations.py @@ -113,7 +113,7 @@ def _candidate_nodes(context, allocation): # UUIDs on the API level. filters['uuid_in'] = allocation.candidate_nodes if allocation.owner: - filters['owner'] = allocation.owner + filters['project'] = allocation.owner nodes = objects.Node.list(context, filters=filters) diff --git a/ironic/conductor/base_manager.py b/ironic/conductor/base_manager.py index d610d2163..cd63d875c 100644 --- a/ironic/conductor/base_manager.py +++ b/ironic/conductor/base_manager.py @@ -118,7 +118,8 @@ class BaseConductorManager(object): _check_enabled_interfaces() - # NOTE(deva): these calls may raise DriverLoadError or DriverNotFound + # NOTE(tenbrae): these calls may raise DriverLoadError or + # DriverNotFound # NOTE(vdrok): Instantiate network and storage interface factory on # startup so that all the interfaces are loaded at the very # beginning, and failures prevent the conductor from starting. @@ -296,6 +297,8 @@ class BaseConductorManager(object): return self._shutdown = True self._keepalive_evt.set() + # clear all locks held by this conductor before deregistering + self.dbapi.clear_node_reservations_for_conductor(self.host) if deregister: try: # Inform the cluster that this conductor is shutting down. @@ -476,15 +479,19 @@ class BaseConductorManager(object): node_iter = self.iter_nodes(filters=filters, sort_key=sort_key, sort_dir='asc') - + desired_maintenance = filters.get('maintenance') workers_count = 0 for node_uuid, driver, conductor_group in node_iter: try: with task_manager.acquire(context, node_uuid, purpose='node state check') as task: - if (task.node.maintenance - or task.node.provision_state - not in provision_state): + # Check maintenance value since it could have changed + # after the filtering was done. 
+ if (desired_maintenance is not None + and desired_maintenance != task.node.maintenance): + continue + + if task.node.provision_state not in provision_state: continue target_state = (None if not keep_target_state else diff --git a/ironic/conductor/cleaning.py b/ironic/conductor/cleaning.py index 351c38121..e02abdb36 100644 --- a/ironic/conductor/cleaning.py +++ b/ironic/conductor/cleaning.py @@ -21,6 +21,7 @@ from ironic.conductor import steps as conductor_steps from ironic.conductor import task_manager from ironic.conductor import utils from ironic.conf import CONF +from ironic.drivers import utils as driver_utils LOG = log.getLogger(__name__) @@ -182,6 +183,7 @@ def do_next_clean_step(task, step_index): {'node': node.uuid, 'exc': e, 'step': node.clean_step}) LOG.exception(msg) + driver_utils.collect_ramdisk_logs(task.node, label='cleaning') utils.cleaning_error_handler(task, msg) return @@ -206,6 +208,9 @@ def do_next_clean_step(task, step_index): LOG.info('Node %(node)s finished clean step %(step)s', {'node': node.uuid, 'step': step}) + if CONF.agent.deploy_logs_collect == 'always': + driver_utils.collect_ramdisk_logs(task.node, label='cleaning') + # Clear clean_step node.clean_step = None driver_internal_info = node.driver_internal_info @@ -213,12 +218,13 @@ def do_next_clean_step(task, step_index): driver_internal_info.pop('clean_step_index', None) driver_internal_info.pop('cleaning_reboot', None) driver_internal_info.pop('cleaning_polling', None) - driver_internal_info.pop('agent_secret_token', None) - driver_internal_info.pop('agent_secret_token_pregenerated', None) # Remove agent_url if not utils.fast_track_able(task): driver_internal_info.pop('agent_url', None) + driver_internal_info.pop('agent_secret_token', None) + driver_internal_info.pop('agent_secret_token_pregenerated', None) + node.driver_internal_info = driver_internal_info node.save() try: diff --git a/ironic/conductor/deployments.py b/ironic/conductor/deployments.py index 26df423aa..3bda75b23 
100644 --- a/ironic/conductor/deployments.py +++ b/ironic/conductor/deployments.py @@ -87,13 +87,13 @@ def start_deploy(task, manager, configdrive=None, event='deploy'): instance_info.pop('ramdisk', None) node.instance_info = instance_info - driver_internal_info = node.driver_internal_info # Infer the image type to make sure the deploy driver # validates only the necessary variables for different # image types. # NOTE(sirushtim): The iwdi variable can be None. It's up to # the deploy driver to validate this. iwdi = images.is_whole_disk_image(task.context, node.instance_info) + driver_internal_info = node.driver_internal_info driver_internal_info['is_whole_disk_image'] = iwdi node.driver_internal_info = driver_internal_info node.save() @@ -127,6 +127,7 @@ def start_deploy(task, manager, configdrive=None, event='deploy'): def do_node_deploy(task, conductor_id=None, configdrive=None): """Prepare the environment and deploy a node.""" node = task.node + utils.wipe_deploy_internal_info(node) utils.del_secret_token(node) try: if configdrive: @@ -185,9 +186,10 @@ def do_node_deploy(task, conductor_id=None, configdrive=None): traceback=True, clean_up=False) try: - # This gets the deploy steps and puts them in the node's - # driver_internal_info['deploy_steps']. - conductor_steps.set_node_deployment_steps(task) + # This gets the deploy steps (if any) and puts them in the node's + # driver_internal_info['deploy_steps']. In-band steps are skipped since + # we know that an agent is not running yet. + conductor_steps.set_node_deployment_steps(task, skip_missing=True) except exception.InstanceDeployFailure as e: with excutils.save_and_reraise_exception(): utils.deploying_error_handler( @@ -281,8 +283,8 @@ def do_next_deploy_step(task, step_index, conductor_id): # Check if the step is done or not. The step should return # states.DEPLOYWAIT if the step is still being executed, or # None if the step is done. - # NOTE(deva): Some drivers may return states.DEPLOYWAIT - # eg. 
if they are waiting for a callback + # NOTE(tenbrae): Some drivers may return states.DEPLOYWAIT + # eg. if they are waiting for a callback if result == states.DEPLOYWAIT: # Kill this worker, the async step will make an RPC call to # continue_node_deploy() to continue deploying @@ -306,16 +308,7 @@ def do_next_deploy_step(task, step_index, conductor_id): # Finished executing the steps. Clear deploy_step. node.deploy_step = None - driver_internal_info = node.driver_internal_info - driver_internal_info.pop('agent_secret_token', None) - driver_internal_info.pop('agent_secret_token_pregenerated', None) - driver_internal_info['deploy_steps'] = None - driver_internal_info.pop('deploy_step_index', None) - driver_internal_info.pop('deployment_reboot', None) - driver_internal_info.pop('deployment_polling', None) - # Remove the agent_url cached from the deployment. - driver_internal_info.pop('agent_url', None) - node.driver_internal_info = driver_internal_info + utils.wipe_deploy_internal_info(node) node.save() _start_console_in_deploy(task) diff --git a/ironic/conductor/manager.py b/ironic/conductor/manager.py index e6ce9ff2b..b63d5412e 100644 --- a/ironic/conductor/manager.py +++ b/ironic/conductor/manager.py @@ -872,6 +872,15 @@ class ConductorManager(base_manager.BaseConductorManager): save_required = False info = node.driver_internal_info + + # Agent is now running, we're ready to validate the remaining steps + if not info.get('steps_validated'): + conductor_steps.validate_deploy_templates(task) + conductor_steps.set_node_deployment_steps( + task, reset_current=False) + info['steps_validated'] = True + save_required = True + try: skip_current_step = info.pop('skip_current_deploy_step') except KeyError: @@ -1004,13 +1013,13 @@ class ConductorManager(base_manager.BaseConductorManager): node.last_error = _("Failed to tear down. 
Error: %s") % e task.process_event('error') else: - # NOTE(deva): When tear_down finishes, the deletion is done, + # NOTE(tenbrae): When tear_down finishes, the deletion is done, # cleaning will start next LOG.info('Successfully unprovisioned node %(node)s with ' 'instance %(instance)s.', {'node': node.uuid, 'instance': node.instance_uuid}) finally: - # NOTE(deva): there is no need to unset conductor_affinity + # NOTE(tenbrae): there is no need to unset conductor_affinity # because it is a reference to the most recent conductor which # deployed a node, and does not limit any future actions. # But we do need to clear the instance-related fields. @@ -1463,7 +1472,7 @@ class ConductorManager(base_manager.BaseConductorManager): with task_manager.acquire(context, node_uuid, purpose='power state sync', shared=True) as task: - # NOTE(deva): we should not acquire a lock on a node in + # NOTE(tenbrae): we should not acquire a lock on a node in # DEPLOYWAIT/CLEANWAIT, as this could cause # an error within a deploy ramdisk POSTing back # at the same time. @@ -1590,7 +1599,7 @@ class ConductorManager(base_manager.BaseConductorManager): @periodics.periodic( spacing=CONF.conductor.check_provision_state_interval, enabled=CONF.conductor.check_provision_state_interval > 0 - and CONF.conductor.deploy_callback_timeout != 0) + and CONF.conductor.deploy_callback_timeout > 0) def _check_deploy_timeouts(self, context): """Periodically checks whether a deploy RPC call has timed out. @@ -1598,8 +1607,6 @@ class ConductorManager(base_manager.BaseConductorManager): :param context: request context. """ - # FIXME(rloo): If the value is < 0, it will be enabled. That doesn't - # seem right. 
callback_timeout = CONF.conductor.deploy_callback_timeout filters = {'reserved': False, @@ -1812,7 +1819,7 @@ class ConductorManager(base_manager.BaseConductorManager): @periodics.periodic( spacing=CONF.conductor.check_provision_state_interval, enabled=CONF.conductor.check_provision_state_interval > 0 - and CONF.conductor.clean_callback_timeout != 0) + and CONF.conductor.clean_callback_timeout > 0) def _check_cleanwait_timeouts(self, context): """Periodically checks for nodes being cleaned. @@ -1821,8 +1828,6 @@ class ConductorManager(base_manager.BaseConductorManager): :param context: request context. """ - # FIXME(rloo): If the value is < 0, it will be enabled. That doesn't - # seem right. callback_timeout = CONF.conductor.clean_callback_timeout filters = {'reserved': False, @@ -1883,7 +1888,7 @@ class ConductorManager(base_manager.BaseConductorManager): try: with task_manager.acquire(context, node_uuid, purpose='node take over') as task: - # NOTE(deva): now that we have the lock, check again to + # NOTE(tenbrae): now that we have the lock, check again to # avoid racing with deletes and other state changes node = task.node if (node.maintenance @@ -1935,7 +1940,11 @@ class ConductorManager(base_manager.BaseConductorManager): iface.validate(task) if iface_name == 'deploy': utils.validate_instance_info_traits(task.node) - conductor_steps.validate_deploy_templates(task) + # NOTE(dtantsur): without the agent running we cannot + # have the complete list of steps, so skip ones that we + # don't know. 
+ conductor_steps.validate_deploy_templates( + task, skip_missing=True) result = True except (exception.InvalidParameterValue, exception.UnsupportedDriverExtension) as e: @@ -2233,7 +2242,7 @@ class ConductorManager(base_manager.BaseConductorManager): try: if enabled: task.driver.console.start_console(task) - # TODO(deva): We should be updating conductor_affinity here + # TODO(tenbrae): We should be updating conductor_affinity here # but there is no support for console sessions in # take_over() right now. else: @@ -2967,18 +2976,17 @@ class ConductorManager(base_manager.BaseConductorManager): @periodics.periodic( spacing=CONF.conductor.check_provision_state_interval, enabled=CONF.conductor.check_provision_state_interval > 0 - and CONF.conductor.inspect_wait_timeout != 0) + and CONF.conductor.inspect_wait_timeout > 0) def _check_inspect_wait_timeouts(self, context): """Periodically checks inspect_wait_timeout and fails upon reaching it. :param context: request context """ - # FIXME(rloo): If the value is < 0, it will be enabled. That doesn't - # seem right. callback_timeout = CONF.conductor.inspect_wait_timeout filters = {'reserved': False, + 'maintenance': False, 'provision_state': states.INSPECTWAIT, 'inspection_started_before': callback_timeout} sort_key = 'inspection_started_at' @@ -3073,6 +3081,7 @@ class ConductorManager(base_manager.BaseConductorManager): agent_version, in these cases assume agent v3.0.0 (the last release before sending agent_version was introduced). :param callback_url: URL to reach back to the ramdisk. + :param agent_token: randomly generated validation token. :raises: NoFreeConductorWorker if there are no conductors to process this heartbeat request. """ @@ -3111,7 +3120,7 @@ class ConductorManager(base_manager.BaseConductorManager): 'Invalid or missing agent token received.') else: LOG.warning('Out of date agent detected for node ' - '%(node)s. Agent version %(version) ' + '%(node)s. Agent version %(version)s ' 'reported. 
Support for this version is ' 'deprecated.', {'node': task.node.uuid, diff --git a/ironic/conductor/rpcapi.py b/ironic/conductor/rpcapi.py index f3a64824b..64447b61a 100644 --- a/ironic/conductor/rpcapi.py +++ b/ironic/conductor/rpcapi.py @@ -132,7 +132,7 @@ class ConductorAPI(object): serializer=serializer) use_groups = self.client.can_send_version('1.47') - # NOTE(deva): this is going to be buggy + # NOTE(tenbrae): this is going to be buggy self.ring_manager = hash_ring.HashRingManager(use_groups=use_groups) def get_conductor_for(self, node): @@ -905,6 +905,7 @@ class ConductorAPI(object): :param node_id: node ID or UUID. :param callback_url: URL to reach back to the ramdisk. :param topic: RPC topic. Defaults to self.topic. + :param agent_token: randomly generated validation token. :param agent_version: the version of the agent that is heartbeating """ new_kws = {} diff --git a/ironic/conductor/steps.py b/ironic/conductor/steps.py index e67f6e3bb..a6663db01 100644 --- a/ironic/conductor/steps.py +++ b/ironic/conductor/steps.py @@ -81,6 +81,17 @@ def _sorted_steps(steps, sort_step_key): return sorted(steps, key=sort_step_key, reverse=True) +def is_equivalent(step1, step2): + """Compare steps, ignoring their priority.""" + return (step1.get('interface') == step2.get('interface') + and step1.get('step') == step2.get('step')) + + +def find_step(steps, step): + """Find an identical step in the list of steps.""" + return next((x for x in steps if is_equivalent(x, step)), None) + + def _get_steps(task, interfaces, get_method, enabled=False, sort_step_key=None): """Get steps for task.node. @@ -227,7 +238,7 @@ def _get_steps_from_deployment_templates(task, templates): return steps -def _get_validated_steps_from_templates(task): +def _get_validated_steps_from_templates(task, skip_missing=False): """Return a list of validated deploy steps from deploy templates. 
Deployment template steps are those steps defined in deployment templates @@ -258,10 +269,11 @@ def _get_validated_steps_from_templates(task): 'deploy templates: %(templates)s. Errors: ') % {'templates': ','.join(t.name for t in templates)}) return _validate_user_deploy_steps(task, user_steps, - error_prefix=error_prefix) + error_prefix=error_prefix, + skip_missing=skip_missing) -def _get_all_deployment_steps(task): +def _get_all_deployment_steps(task, skip_missing=False): """Get deployment steps for task.node. Deployment steps from matching deployment templates are combined with those @@ -276,7 +288,8 @@ def _get_all_deployment_steps(task): # NOTE(mgoddard): although we've probably just validated the templates in # do_node_deploy, they may have changed in the DB since we last checked, so # validate again. - user_steps = _get_validated_steps_from_templates(task) + user_steps = _get_validated_steps_from_templates(task, + skip_missing=skip_missing) # Gather enabled deploy steps from drivers. driver_steps = _get_deployment_steps(task, enabled=True, sort=False) @@ -293,19 +306,22 @@ def _get_all_deployment_steps(task): return _sorted_steps(steps, _deploy_step_key) -def set_node_deployment_steps(task): +def set_node_deployment_steps(task, reset_current=True, skip_missing=False): """Set up the node with deployment step information for deploying. Get the deploy steps from the driver. + :param reset_current: Whether to reset the current step to the first one. :raises: InstanceDeployFailure if there was a problem getting the deployment steps. 
""" node = task.node driver_internal_info = node.driver_internal_info - driver_internal_info['deploy_steps'] = _get_all_deployment_steps(task) - node.deploy_step = {} - driver_internal_info['deploy_step_index'] = None + driver_internal_info['deploy_steps'] = _get_all_deployment_steps( + task, skip_missing=skip_missing) + if reset_current: + node.deploy_step = {} + driver_internal_info['deploy_step_index'] = None node.driver_internal_info = driver_internal_info node.save() @@ -450,7 +466,7 @@ def _validate_user_step(task, user_step, driver_step, step_type): def _validate_user_steps(task, user_steps, driver_steps, step_type, - error_prefix=None): + error_prefix=None, skip_missing=False): """Validate the user-specified steps. :param task: A TaskManager object @@ -506,23 +522,34 @@ def _validate_user_steps(task, user_steps, driver_steps, step_type, # Convert driver steps to a dict. driver_steps = {_step_id(s): s for s in driver_steps} + result = [] + for user_step in user_steps: # Check if this user-specified step isn't supported by the driver try: driver_step = driver_steps[_step_id(user_step)] except KeyError: - error = (_('node does not support this %(type)s step: %(step)s') - % {'type': step_type, 'step': user_step}) - errors.append(error) + if skip_missing: + LOG.debug('%(type)s step %(step)s is not currently known for ' + 'node %(node)s, delaying its validation until ' + 'in-band steps are loaded', + {'type': step_type.capitalize(), + 'step': user_step, 'node': task.node.uuid}) + else: + error = (_('node does not support this %(type)s step: ' + '%(step)s') + % {'type': step_type, 'step': user_step}) + errors.append(error) continue step_errors = _validate_user_step(task, user_step, driver_step, step_type) errors.extend(step_errors) + result.append(user_step) if step_type == 'deploy': # Deploy steps should be unique across all combined templates. 
- dup_errors = _validate_deploy_steps_unique(user_steps) + dup_errors = _validate_deploy_steps_unique(result) errors.extend(dup_errors) if errors: @@ -530,7 +557,7 @@ def _validate_user_steps(task, user_steps, driver_steps, step_type, err += '; '.join(errors) raise exception.InvalidParameterValue(err=err) - return user_steps + return result def _validate_user_clean_steps(task, user_steps): @@ -558,7 +585,8 @@ def _validate_user_clean_steps(task, user_steps): return _validate_user_steps(task, user_steps, driver_steps, 'clean') -def _validate_user_deploy_steps(task, user_steps, error_prefix=None): +def _validate_user_deploy_steps(task, user_steps, error_prefix=None, + skip_missing=False): """Validate the user-specified deploy steps. :param task: A TaskManager object @@ -585,10 +613,11 @@ def _validate_user_deploy_steps(task, user_steps, error_prefix=None): """ driver_steps = _get_deployment_steps(task, enabled=False, sort=False) return _validate_user_steps(task, user_steps, driver_steps, 'deploy', - error_prefix=error_prefix) + error_prefix=error_prefix, + skip_missing=skip_missing) -def validate_deploy_templates(task): +def validate_deploy_templates(task, skip_missing=False): """Validate the deploy templates for a node. :param task: A TaskManager object @@ -598,4 +627,4 @@ def validate_deploy_templates(task): steps from the driver. """ # Gather deploy steps from matching deploy templates and validate them. 
- _get_validated_steps_from_templates(task) + _get_validated_steps_from_templates(task, skip_missing=skip_missing) diff --git a/ironic/conductor/utils.py b/ironic/conductor/utils.py index 75b90486a..35e2c94fc 100644 --- a/ironic/conductor/utils.py +++ b/ironic/conductor/utils.py @@ -369,7 +369,7 @@ def provisioning_error_handler(e, node, provision_state, """ if isinstance(e, exception.NoFreeConductorWorker): - # NOTE(deva): there is no need to clear conductor_affinity + # NOTE(tenbrae): there is no need to clear conductor_affinity # because it isn't updated on a failed deploy node.provision_state = provision_state node.target_provision_state = target_provision_state @@ -444,6 +444,25 @@ def cleaning_error_handler(task, msg, tear_down_cleaning=True, task.process_event('fail', target_state=target_state) +def wipe_deploy_internal_info(node): + """Remove temporary deployment fields from driver_internal_info.""" + info = node.driver_internal_info + info.pop('agent_secret_token', None) + info.pop('agent_secret_token_pregenerated', None) + # Clear any leftover metadata about deployment. + info['deploy_steps'] = None + info.pop('agent_cached_deploy_steps', None) + info.pop('deploy_step_index', None) + info.pop('deployment_reboot', None) + info.pop('deployment_polling', None) + info.pop('skip_current_deploy_step', None) + info.pop('steps_validated', None) + # Remove agent_url since it will be re-asserted + # upon the next deployment attempt. + info.pop('agent_url', None) + node.driver_internal_info = info + + def deploying_error_handler(task, logmsg, errmsg=None, traceback=False, clean_up=True): """Put a failed node in DEPLOYFAIL. @@ -484,22 +503,13 @@ def deploying_error_handler(task, logmsg, errmsg=None, traceback=False, # Clear deploy step; we leave the list of deploy steps # in node.driver_internal_info for debugging purposes. node.deploy_step = {} - info = node.driver_internal_info - # Clear any leftover metadata about deployment. 
- info.pop('deploy_step_index', None) - info.pop('deployment_reboot', None) - info.pop('deployment_polling', None) - info.pop('skip_current_deploy_step', None) - # Remove agent_url since it will be re-asserted - # upon the next deployment attempt. - info.pop('agent_url', None) - node.driver_internal_info = info + wipe_deploy_internal_info(node) if cleanup_err: node.last_error = cleanup_err node.save() - # NOTE(deva): there is no need to clear conductor_affinity + # NOTE(tenbrae): there is no need to clear conductor_affinity task.process_event('fail') |