summaryrefslogtreecommitdiff
path: root/ironic/conductor
diff options
context:
space:
mode:
Diffstat (limited to 'ironic/conductor')
-rw-r--r--  ironic/conductor/allocations.py  |  2
-rw-r--r--  ironic/conductor/base_manager.py | 17
-rw-r--r--  ironic/conductor/cleaning.py     | 10
-rw-r--r--  ironic/conductor/deployments.py  | 25
-rw-r--r--  ironic/conductor/manager.py      | 41
-rw-r--r--  ironic/conductor/rpcapi.py       |  3
-rw-r--r--  ironic/conductor/steps.py        | 65
-rw-r--r--  ironic/conductor/utils.py        | 34
8 files changed, 126 insertions, 71 deletions
diff --git a/ironic/conductor/allocations.py b/ironic/conductor/allocations.py
index d1492bf38..d8fc38a5e 100644
--- a/ironic/conductor/allocations.py
+++ b/ironic/conductor/allocations.py
@@ -113,7 +113,7 @@ def _candidate_nodes(context, allocation):
# UUIDs on the API level.
filters['uuid_in'] = allocation.candidate_nodes
if allocation.owner:
- filters['owner'] = allocation.owner
+ filters['project'] = allocation.owner
nodes = objects.Node.list(context, filters=filters)
diff --git a/ironic/conductor/base_manager.py b/ironic/conductor/base_manager.py
index d610d2163..cd63d875c 100644
--- a/ironic/conductor/base_manager.py
+++ b/ironic/conductor/base_manager.py
@@ -118,7 +118,8 @@ class BaseConductorManager(object):
_check_enabled_interfaces()
- # NOTE(deva): these calls may raise DriverLoadError or DriverNotFound
+ # NOTE(tenbrae): these calls may raise DriverLoadError or
+ # DriverNotFound
# NOTE(vdrok): Instantiate network and storage interface factory on
# startup so that all the interfaces are loaded at the very
# beginning, and failures prevent the conductor from starting.
@@ -296,6 +297,8 @@ class BaseConductorManager(object):
return
self._shutdown = True
self._keepalive_evt.set()
+ # clear all locks held by this conductor before deregistering
+ self.dbapi.clear_node_reservations_for_conductor(self.host)
if deregister:
try:
# Inform the cluster that this conductor is shutting down.
@@ -476,15 +479,19 @@ class BaseConductorManager(object):
node_iter = self.iter_nodes(filters=filters,
sort_key=sort_key,
sort_dir='asc')
-
+ desired_maintenance = filters.get('maintenance')
workers_count = 0
for node_uuid, driver, conductor_group in node_iter:
try:
with task_manager.acquire(context, node_uuid,
purpose='node state check') as task:
- if (task.node.maintenance
- or task.node.provision_state
- not in provision_state):
+ # Check maintenance value since it could have changed
+ # after the filtering was done.
+ if (desired_maintenance is not None
+ and desired_maintenance != task.node.maintenance):
+ continue
+
+ if task.node.provision_state not in provision_state:
continue
target_state = (None if not keep_target_state else
diff --git a/ironic/conductor/cleaning.py b/ironic/conductor/cleaning.py
index 351c38121..e02abdb36 100644
--- a/ironic/conductor/cleaning.py
+++ b/ironic/conductor/cleaning.py
@@ -21,6 +21,7 @@ from ironic.conductor import steps as conductor_steps
from ironic.conductor import task_manager
from ironic.conductor import utils
from ironic.conf import CONF
+from ironic.drivers import utils as driver_utils
LOG = log.getLogger(__name__)
@@ -182,6 +183,7 @@ def do_next_clean_step(task, step_index):
{'node': node.uuid, 'exc': e,
'step': node.clean_step})
LOG.exception(msg)
+ driver_utils.collect_ramdisk_logs(task.node, label='cleaning')
utils.cleaning_error_handler(task, msg)
return
@@ -206,6 +208,9 @@ def do_next_clean_step(task, step_index):
LOG.info('Node %(node)s finished clean step %(step)s',
{'node': node.uuid, 'step': step})
+ if CONF.agent.deploy_logs_collect == 'always':
+ driver_utils.collect_ramdisk_logs(task.node, label='cleaning')
+
# Clear clean_step
node.clean_step = None
driver_internal_info = node.driver_internal_info
@@ -213,12 +218,13 @@ def do_next_clean_step(task, step_index):
driver_internal_info.pop('clean_step_index', None)
driver_internal_info.pop('cleaning_reboot', None)
driver_internal_info.pop('cleaning_polling', None)
- driver_internal_info.pop('agent_secret_token', None)
- driver_internal_info.pop('agent_secret_token_pregenerated', None)
# Remove agent_url
if not utils.fast_track_able(task):
driver_internal_info.pop('agent_url', None)
+ driver_internal_info.pop('agent_secret_token', None)
+ driver_internal_info.pop('agent_secret_token_pregenerated', None)
+
node.driver_internal_info = driver_internal_info
node.save()
try:
diff --git a/ironic/conductor/deployments.py b/ironic/conductor/deployments.py
index 26df423aa..3bda75b23 100644
--- a/ironic/conductor/deployments.py
+++ b/ironic/conductor/deployments.py
@@ -87,13 +87,13 @@ def start_deploy(task, manager, configdrive=None, event='deploy'):
instance_info.pop('ramdisk', None)
node.instance_info = instance_info
- driver_internal_info = node.driver_internal_info
# Infer the image type to make sure the deploy driver
# validates only the necessary variables for different
# image types.
# NOTE(sirushtim): The iwdi variable can be None. It's up to
# the deploy driver to validate this.
iwdi = images.is_whole_disk_image(task.context, node.instance_info)
+ driver_internal_info = node.driver_internal_info
driver_internal_info['is_whole_disk_image'] = iwdi
node.driver_internal_info = driver_internal_info
node.save()
@@ -127,6 +127,7 @@ def start_deploy(task, manager, configdrive=None, event='deploy'):
def do_node_deploy(task, conductor_id=None, configdrive=None):
"""Prepare the environment and deploy a node."""
node = task.node
+ utils.wipe_deploy_internal_info(node)
utils.del_secret_token(node)
try:
if configdrive:
@@ -185,9 +186,10 @@ def do_node_deploy(task, conductor_id=None, configdrive=None):
traceback=True, clean_up=False)
try:
- # This gets the deploy steps and puts them in the node's
- # driver_internal_info['deploy_steps'].
- conductor_steps.set_node_deployment_steps(task)
+ # This gets the deploy steps (if any) and puts them in the node's
+ # driver_internal_info['deploy_steps']. In-band steps are skipped since
+ # we know that an agent is not running yet.
+ conductor_steps.set_node_deployment_steps(task, skip_missing=True)
except exception.InstanceDeployFailure as e:
with excutils.save_and_reraise_exception():
utils.deploying_error_handler(
@@ -281,8 +283,8 @@ def do_next_deploy_step(task, step_index, conductor_id):
# Check if the step is done or not. The step should return
# states.DEPLOYWAIT if the step is still being executed, or
# None if the step is done.
- # NOTE(deva): Some drivers may return states.DEPLOYWAIT
- # eg. if they are waiting for a callback
+ # NOTE(tenbrae): Some drivers may return states.DEPLOYWAIT
+ # eg. if they are waiting for a callback
if result == states.DEPLOYWAIT:
# Kill this worker, the async step will make an RPC call to
# continue_node_deploy() to continue deploying
@@ -306,16 +308,7 @@ def do_next_deploy_step(task, step_index, conductor_id):
# Finished executing the steps. Clear deploy_step.
node.deploy_step = None
- driver_internal_info = node.driver_internal_info
- driver_internal_info.pop('agent_secret_token', None)
- driver_internal_info.pop('agent_secret_token_pregenerated', None)
- driver_internal_info['deploy_steps'] = None
- driver_internal_info.pop('deploy_step_index', None)
- driver_internal_info.pop('deployment_reboot', None)
- driver_internal_info.pop('deployment_polling', None)
- # Remove the agent_url cached from the deployment.
- driver_internal_info.pop('agent_url', None)
- node.driver_internal_info = driver_internal_info
+ utils.wipe_deploy_internal_info(node)
node.save()
_start_console_in_deploy(task)
diff --git a/ironic/conductor/manager.py b/ironic/conductor/manager.py
index e6ce9ff2b..b63d5412e 100644
--- a/ironic/conductor/manager.py
+++ b/ironic/conductor/manager.py
@@ -872,6 +872,15 @@ class ConductorManager(base_manager.BaseConductorManager):
save_required = False
info = node.driver_internal_info
+
+ # Agent is now running, we're ready to validate the remaining steps
+ if not info.get('steps_validated'):
+ conductor_steps.validate_deploy_templates(task)
+ conductor_steps.set_node_deployment_steps(
+ task, reset_current=False)
+ info['steps_validated'] = True
+ save_required = True
+
try:
skip_current_step = info.pop('skip_current_deploy_step')
except KeyError:
@@ -1004,13 +1013,13 @@ class ConductorManager(base_manager.BaseConductorManager):
node.last_error = _("Failed to tear down. Error: %s") % e
task.process_event('error')
else:
- # NOTE(deva): When tear_down finishes, the deletion is done,
+ # NOTE(tenbrae): When tear_down finishes, the deletion is done,
# cleaning will start next
LOG.info('Successfully unprovisioned node %(node)s with '
'instance %(instance)s.',
{'node': node.uuid, 'instance': node.instance_uuid})
finally:
- # NOTE(deva): there is no need to unset conductor_affinity
+ # NOTE(tenbrae): there is no need to unset conductor_affinity
# because it is a reference to the most recent conductor which
# deployed a node, and does not limit any future actions.
# But we do need to clear the instance-related fields.
@@ -1463,7 +1472,7 @@ class ConductorManager(base_manager.BaseConductorManager):
with task_manager.acquire(context, node_uuid,
purpose='power state sync',
shared=True) as task:
- # NOTE(deva): we should not acquire a lock on a node in
+ # NOTE(tenbrae): we should not acquire a lock on a node in
# DEPLOYWAIT/CLEANWAIT, as this could cause
# an error within a deploy ramdisk POSTing back
# at the same time.
@@ -1590,7 +1599,7 @@ class ConductorManager(base_manager.BaseConductorManager):
@periodics.periodic(
spacing=CONF.conductor.check_provision_state_interval,
enabled=CONF.conductor.check_provision_state_interval > 0
- and CONF.conductor.deploy_callback_timeout != 0)
+ and CONF.conductor.deploy_callback_timeout > 0)
def _check_deploy_timeouts(self, context):
"""Periodically checks whether a deploy RPC call has timed out.
@@ -1598,8 +1607,6 @@ class ConductorManager(base_manager.BaseConductorManager):
:param context: request context.
"""
- # FIXME(rloo): If the value is < 0, it will be enabled. That doesn't
- # seem right.
callback_timeout = CONF.conductor.deploy_callback_timeout
filters = {'reserved': False,
@@ -1812,7 +1819,7 @@ class ConductorManager(base_manager.BaseConductorManager):
@periodics.periodic(
spacing=CONF.conductor.check_provision_state_interval,
enabled=CONF.conductor.check_provision_state_interval > 0
- and CONF.conductor.clean_callback_timeout != 0)
+ and CONF.conductor.clean_callback_timeout > 0)
def _check_cleanwait_timeouts(self, context):
"""Periodically checks for nodes being cleaned.
@@ -1821,8 +1828,6 @@ class ConductorManager(base_manager.BaseConductorManager):
:param context: request context.
"""
- # FIXME(rloo): If the value is < 0, it will be enabled. That doesn't
- # seem right.
callback_timeout = CONF.conductor.clean_callback_timeout
filters = {'reserved': False,
@@ -1883,7 +1888,7 @@ class ConductorManager(base_manager.BaseConductorManager):
try:
with task_manager.acquire(context, node_uuid,
purpose='node take over') as task:
- # NOTE(deva): now that we have the lock, check again to
+ # NOTE(tenbrae): now that we have the lock, check again to
# avoid racing with deletes and other state changes
node = task.node
if (node.maintenance
@@ -1935,7 +1940,11 @@ class ConductorManager(base_manager.BaseConductorManager):
iface.validate(task)
if iface_name == 'deploy':
utils.validate_instance_info_traits(task.node)
- conductor_steps.validate_deploy_templates(task)
+ # NOTE(dtantsur): without the agent running we cannot
+ # have the complete list of steps, so skip ones that we
+ # don't know.
+ conductor_steps.validate_deploy_templates(
+ task, skip_missing=True)
result = True
except (exception.InvalidParameterValue,
exception.UnsupportedDriverExtension) as e:
@@ -2233,7 +2242,7 @@ class ConductorManager(base_manager.BaseConductorManager):
try:
if enabled:
task.driver.console.start_console(task)
- # TODO(deva): We should be updating conductor_affinity here
+ # TODO(tenbrae): We should be updating conductor_affinity here
# but there is no support for console sessions in
# take_over() right now.
else:
@@ -2967,18 +2976,17 @@ class ConductorManager(base_manager.BaseConductorManager):
@periodics.periodic(
spacing=CONF.conductor.check_provision_state_interval,
enabled=CONF.conductor.check_provision_state_interval > 0
- and CONF.conductor.inspect_wait_timeout != 0)
+ and CONF.conductor.inspect_wait_timeout > 0)
def _check_inspect_wait_timeouts(self, context):
"""Periodically checks inspect_wait_timeout and fails upon reaching it.
:param context: request context
"""
- # FIXME(rloo): If the value is < 0, it will be enabled. That doesn't
- # seem right.
callback_timeout = CONF.conductor.inspect_wait_timeout
filters = {'reserved': False,
+ 'maintenance': False,
'provision_state': states.INSPECTWAIT,
'inspection_started_before': callback_timeout}
sort_key = 'inspection_started_at'
@@ -3073,6 +3081,7 @@ class ConductorManager(base_manager.BaseConductorManager):
agent_version, in these cases assume agent v3.0.0 (the last release
before sending agent_version was introduced).
:param callback_url: URL to reach back to the ramdisk.
+ :param agent_token: randomly generated validation token.
:raises: NoFreeConductorWorker if there are no conductors to process
this heartbeat request.
"""
@@ -3111,7 +3120,7 @@ class ConductorManager(base_manager.BaseConductorManager):
'Invalid or missing agent token received.')
else:
LOG.warning('Out of date agent detected for node '
- '%(node)s. Agent version %(version) '
+ '%(node)s. Agent version %(version)s '
'reported. Support for this version is '
'deprecated.',
{'node': task.node.uuid,
diff --git a/ironic/conductor/rpcapi.py b/ironic/conductor/rpcapi.py
index f3a64824b..64447b61a 100644
--- a/ironic/conductor/rpcapi.py
+++ b/ironic/conductor/rpcapi.py
@@ -132,7 +132,7 @@ class ConductorAPI(object):
serializer=serializer)
use_groups = self.client.can_send_version('1.47')
- # NOTE(deva): this is going to be buggy
+ # NOTE(tenbrae): this is going to be buggy
self.ring_manager = hash_ring.HashRingManager(use_groups=use_groups)
def get_conductor_for(self, node):
@@ -905,6 +905,7 @@ class ConductorAPI(object):
:param node_id: node ID or UUID.
:param callback_url: URL to reach back to the ramdisk.
:param topic: RPC topic. Defaults to self.topic.
+ :param agent_token: randomly generated validation token.
:param agent_version: the version of the agent that is heartbeating
"""
new_kws = {}
diff --git a/ironic/conductor/steps.py b/ironic/conductor/steps.py
index e67f6e3bb..a6663db01 100644
--- a/ironic/conductor/steps.py
+++ b/ironic/conductor/steps.py
@@ -81,6 +81,17 @@ def _sorted_steps(steps, sort_step_key):
return sorted(steps, key=sort_step_key, reverse=True)
+def is_equivalent(step1, step2):
+ """Compare steps, ignoring their priority."""
+ return (step1.get('interface') == step2.get('interface')
+ and step1.get('step') == step2.get('step'))
+
+
+def find_step(steps, step):
+ """Find an identical step in the list of steps."""
+ return next((x for x in steps if is_equivalent(x, step)), None)
+
+
def _get_steps(task, interfaces, get_method, enabled=False,
sort_step_key=None):
"""Get steps for task.node.
@@ -227,7 +238,7 @@ def _get_steps_from_deployment_templates(task, templates):
return steps
-def _get_validated_steps_from_templates(task):
+def _get_validated_steps_from_templates(task, skip_missing=False):
"""Return a list of validated deploy steps from deploy templates.
Deployment template steps are those steps defined in deployment templates
@@ -258,10 +269,11 @@ def _get_validated_steps_from_templates(task):
'deploy templates: %(templates)s. Errors: ') %
{'templates': ','.join(t.name for t in templates)})
return _validate_user_deploy_steps(task, user_steps,
- error_prefix=error_prefix)
+ error_prefix=error_prefix,
+ skip_missing=skip_missing)
-def _get_all_deployment_steps(task):
+def _get_all_deployment_steps(task, skip_missing=False):
"""Get deployment steps for task.node.
Deployment steps from matching deployment templates are combined with those
@@ -276,7 +288,8 @@ def _get_all_deployment_steps(task):
# NOTE(mgoddard): although we've probably just validated the templates in
# do_node_deploy, they may have changed in the DB since we last checked, so
# validate again.
- user_steps = _get_validated_steps_from_templates(task)
+ user_steps = _get_validated_steps_from_templates(task,
+ skip_missing=skip_missing)
# Gather enabled deploy steps from drivers.
driver_steps = _get_deployment_steps(task, enabled=True, sort=False)
@@ -293,19 +306,22 @@ def _get_all_deployment_steps(task):
return _sorted_steps(steps, _deploy_step_key)
-def set_node_deployment_steps(task):
+def set_node_deployment_steps(task, reset_current=True, skip_missing=False):
"""Set up the node with deployment step information for deploying.
Get the deploy steps from the driver.
+ :param reset_current: Whether to reset the current step to the first one.
:raises: InstanceDeployFailure if there was a problem getting the
deployment steps.
"""
node = task.node
driver_internal_info = node.driver_internal_info
- driver_internal_info['deploy_steps'] = _get_all_deployment_steps(task)
- node.deploy_step = {}
- driver_internal_info['deploy_step_index'] = None
+ driver_internal_info['deploy_steps'] = _get_all_deployment_steps(
+ task, skip_missing=skip_missing)
+ if reset_current:
+ node.deploy_step = {}
+ driver_internal_info['deploy_step_index'] = None
node.driver_internal_info = driver_internal_info
node.save()
@@ -450,7 +466,7 @@ def _validate_user_step(task, user_step, driver_step, step_type):
def _validate_user_steps(task, user_steps, driver_steps, step_type,
- error_prefix=None):
+ error_prefix=None, skip_missing=False):
"""Validate the user-specified steps.
:param task: A TaskManager object
@@ -506,23 +522,34 @@ def _validate_user_steps(task, user_steps, driver_steps, step_type,
# Convert driver steps to a dict.
driver_steps = {_step_id(s): s for s in driver_steps}
+ result = []
+
for user_step in user_steps:
# Check if this user-specified step isn't supported by the driver
try:
driver_step = driver_steps[_step_id(user_step)]
except KeyError:
- error = (_('node does not support this %(type)s step: %(step)s')
- % {'type': step_type, 'step': user_step})
- errors.append(error)
+ if skip_missing:
+ LOG.debug('%(type)s step %(step)s is not currently known for '
+ 'node %(node)s, delaying its validation until '
+ 'in-band steps are loaded',
+ {'type': step_type.capitalize(),
+ 'step': user_step, 'node': task.node.uuid})
+ else:
+ error = (_('node does not support this %(type)s step: '
+ '%(step)s')
+ % {'type': step_type, 'step': user_step})
+ errors.append(error)
continue
step_errors = _validate_user_step(task, user_step, driver_step,
step_type)
errors.extend(step_errors)
+ result.append(user_step)
if step_type == 'deploy':
# Deploy steps should be unique across all combined templates.
- dup_errors = _validate_deploy_steps_unique(user_steps)
+ dup_errors = _validate_deploy_steps_unique(result)
errors.extend(dup_errors)
if errors:
@@ -530,7 +557,7 @@ def _validate_user_steps(task, user_steps, driver_steps, step_type,
err += '; '.join(errors)
raise exception.InvalidParameterValue(err=err)
- return user_steps
+ return result
def _validate_user_clean_steps(task, user_steps):
@@ -558,7 +585,8 @@ def _validate_user_clean_steps(task, user_steps):
return _validate_user_steps(task, user_steps, driver_steps, 'clean')
-def _validate_user_deploy_steps(task, user_steps, error_prefix=None):
+def _validate_user_deploy_steps(task, user_steps, error_prefix=None,
+ skip_missing=False):
"""Validate the user-specified deploy steps.
:param task: A TaskManager object
@@ -585,10 +613,11 @@ def _validate_user_deploy_steps(task, user_steps, error_prefix=None):
"""
driver_steps = _get_deployment_steps(task, enabled=False, sort=False)
return _validate_user_steps(task, user_steps, driver_steps, 'deploy',
- error_prefix=error_prefix)
+ error_prefix=error_prefix,
+ skip_missing=skip_missing)
-def validate_deploy_templates(task):
+def validate_deploy_templates(task, skip_missing=False):
"""Validate the deploy templates for a node.
:param task: A TaskManager object
@@ -598,4 +627,4 @@ def validate_deploy_templates(task):
steps from the driver.
"""
# Gather deploy steps from matching deploy templates and validate them.
- _get_validated_steps_from_templates(task)
+ _get_validated_steps_from_templates(task, skip_missing=skip_missing)
diff --git a/ironic/conductor/utils.py b/ironic/conductor/utils.py
index 75b90486a..35e2c94fc 100644
--- a/ironic/conductor/utils.py
+++ b/ironic/conductor/utils.py
@@ -369,7 +369,7 @@ def provisioning_error_handler(e, node, provision_state,
"""
if isinstance(e, exception.NoFreeConductorWorker):
- # NOTE(deva): there is no need to clear conductor_affinity
+ # NOTE(tenbrae): there is no need to clear conductor_affinity
# because it isn't updated on a failed deploy
node.provision_state = provision_state
node.target_provision_state = target_provision_state
@@ -444,6 +444,25 @@ def cleaning_error_handler(task, msg, tear_down_cleaning=True,
task.process_event('fail', target_state=target_state)
+def wipe_deploy_internal_info(node):
+ """Remove temporary deployment fields from driver_internal_info."""
+ info = node.driver_internal_info
+ info.pop('agent_secret_token', None)
+ info.pop('agent_secret_token_pregenerated', None)
+ # Clear any leftover metadata about deployment.
+ info['deploy_steps'] = None
+ info.pop('agent_cached_deploy_steps', None)
+ info.pop('deploy_step_index', None)
+ info.pop('deployment_reboot', None)
+ info.pop('deployment_polling', None)
+ info.pop('skip_current_deploy_step', None)
+ info.pop('steps_validated', None)
+ # Remove agent_url since it will be re-asserted
+ # upon the next deployment attempt.
+ info.pop('agent_url', None)
+ node.driver_internal_info = info
+
+
def deploying_error_handler(task, logmsg, errmsg=None, traceback=False,
clean_up=True):
"""Put a failed node in DEPLOYFAIL.
@@ -484,22 +503,13 @@ def deploying_error_handler(task, logmsg, errmsg=None, traceback=False,
# Clear deploy step; we leave the list of deploy steps
# in node.driver_internal_info for debugging purposes.
node.deploy_step = {}
- info = node.driver_internal_info
- # Clear any leftover metadata about deployment.
- info.pop('deploy_step_index', None)
- info.pop('deployment_reboot', None)
- info.pop('deployment_polling', None)
- info.pop('skip_current_deploy_step', None)
- # Remove agent_url since it will be re-asserted
- # upon the next deployment attempt.
- info.pop('agent_url', None)
- node.driver_internal_info = info
+ wipe_deploy_internal_info(node)
if cleanup_err:
node.last_error = cleanup_err
node.save()
- # NOTE(deva): there is no need to clear conductor_affinity
+ # NOTE(tenbrae): there is no need to clear conductor_affinity
task.process_event('fail')