1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
|
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
"""Functionality related to cleaning."""
from oslo_log import log
from ironic.common import exception
from ironic.common.i18n import _
from ironic.common import states
from ironic.conductor import steps as conductor_steps
from ironic.conductor import task_manager
from ironic.conductor import utils
from ironic.conf import CONF
from ironic.drivers import utils as driver_utils
LOG = log.getLogger(__name__)
@task_manager.require_exclusive_lock
def do_node_clean(task, clean_steps=None):
"""Internal RPC method to perform cleaning of a node.
:param task: a TaskManager instance with an exclusive lock on its node
:param clean_steps: For a manual clean, the list of clean steps to
perform. Is None For automated cleaning (default).
For more information, see the clean_steps parameter
of :func:`ConductorManager.do_node_clean`.
"""
node = task.node
manual_clean = clean_steps is not None
clean_type = 'manual' if manual_clean else 'automated'
LOG.debug('Starting %(type)s cleaning for node %(node)s',
{'type': clean_type, 'node': node.uuid})
if not manual_clean and utils.skip_automated_cleaning(node):
# Skip cleaning, move to AVAILABLE.
node.clean_step = None
node.save()
task.process_event('done')
LOG.info('Automated cleaning is disabled, node %s has been '
'successfully moved to AVAILABLE state.', node.uuid)
return
# NOTE(dtantsur): this is only reachable during automated cleaning,
# for manual cleaning we verify maintenance mode earlier on.
if (not CONF.conductor.allow_provisioning_in_maintenance
and node.maintenance):
msg = _('Cleaning a node in maintenance mode is not allowed')
return utils.cleaning_error_handler(task, msg,
tear_down_cleaning=False)
try:
# NOTE(ghe): Valid power and network values are needed to perform
# a cleaning.
task.driver.power.validate(task)
task.driver.network.validate(task)
except exception.InvalidParameterValue as e:
msg = (_('Validation failed. Cannot clean node %(node)s. '
'Error: %(msg)s') %
{'node': node.uuid, 'msg': e})
return utils.cleaning_error_handler(task, msg)
if manual_clean:
info = node.driver_internal_info
info['clean_steps'] = clean_steps
node.driver_internal_info = info
node.save()
# Retrieve BIOS config settings for this node
utils.node_cache_bios_settings(task, node)
# Allow the deploy driver to set up the ramdisk again (necessary for
# IPA cleaning)
try:
prepare_result = task.driver.deploy.prepare_cleaning(task)
except Exception as e:
msg = (_('Failed to prepare node %(node)s for cleaning: %(e)s')
% {'node': node.uuid, 'e': e})
LOG.exception(msg)
return utils.cleaning_error_handler(task, msg)
if prepare_result == states.CLEANWAIT:
# Prepare is asynchronous, the deploy driver will need to
# set node.driver_internal_info['clean_steps'] and
# node.clean_step and then make an RPC call to
# continue_node_clean to start cleaning.
# For manual cleaning, the target provision state is MANAGEABLE,
# whereas for automated cleaning, it is AVAILABLE (the default).
target_state = states.MANAGEABLE if manual_clean else None
task.process_event('wait', target_state=target_state)
return
try:
conductor_steps.set_node_cleaning_steps(task)
except (exception.InvalidParameterValue,
exception.NodeCleaningFailure) as e:
msg = (_('Cannot clean node %(node)s. Error: %(msg)s')
% {'node': node.uuid, 'msg': e})
return utils.cleaning_error_handler(task, msg)
steps = node.driver_internal_info.get('clean_steps', [])
step_index = 0 if steps else None
do_next_clean_step(task, step_index)
@utils.fail_on_error(utils.cleaning_error_handler,
_("Unexpected error when processing next clean step"))
@task_manager.require_exclusive_lock
def do_next_clean_step(task, step_index):
"""Do cleaning, starting from the specified clean step.
:param task: a TaskManager instance with an exclusive lock
:param step_index: The first clean step in the list to execute. This
is the index (from 0) into the list of clean steps in the node's
driver_internal_info['clean_steps']. Is None if there are no steps
to execute.
"""
node = task.node
# For manual cleaning, the target provision state is MANAGEABLE,
# whereas for automated cleaning, it is AVAILABLE.
manual_clean = node.target_provision_state == states.MANAGEABLE
if step_index is None:
steps = []
else:
steps = node.driver_internal_info['clean_steps'][step_index:]
LOG.info('Executing %(state)s on node %(node)s, remaining steps: '
'%(steps)s', {'node': node.uuid, 'steps': steps,
'state': node.provision_state})
# Execute each step until we hit an async step or run out of steps
for ind, step in enumerate(steps):
# Save which step we're about to start so we can restart
# if necessary
node.clean_step = step
driver_internal_info = node.driver_internal_info
driver_internal_info['clean_step_index'] = step_index + ind
node.driver_internal_info = driver_internal_info
node.save()
interface = getattr(task.driver, step.get('interface'))
LOG.info('Executing %(step)s on node %(node)s',
{'step': step, 'node': node.uuid})
try:
result = interface.execute_clean_step(task, step)
except Exception as e:
if isinstance(e, exception.AgentConnectionFailed):
if task.node.driver_internal_info.get('cleaning_reboot'):
LOG.info('Agent is not yet running on node %(node)s '
'after cleaning reboot, waiting for agent to '
'come up to run next clean step %(step)s.',
{'node': node.uuid, 'step': step})
driver_internal_info['skip_current_clean_step'] = False
node.driver_internal_info = driver_internal_info
target_state = (states.MANAGEABLE if manual_clean
else None)
task.process_event('wait', target_state=target_state)
return
if isinstance(e, exception.AgentInProgress):
LOG.info('Conductor attempted to process clean step for '
'node %(node)s. Agent indicated it is presently '
'executing a command. Error: %(error)s',
{'node': task.node.uuid,
'error': e})
driver_internal_info['skip_current_clean_step'] = False
node.driver_internal_info = driver_internal_info
target_state = states.MANAGEABLE if manual_clean else None
task.process_event('wait', target_state=target_state)
return
msg = (_('Node %(node)s failed step %(step)s: '
'%(exc)s') %
{'node': node.uuid, 'exc': e,
'step': node.clean_step})
LOG.exception(msg)
driver_utils.collect_ramdisk_logs(task.node, label='cleaning')
utils.cleaning_error_handler(task, msg)
return
# Check if the step is done or not. The step should return
# states.CLEANWAIT if the step is still being executed, or
# None if the step is done.
if result == states.CLEANWAIT:
# Kill this worker, the async step will make an RPC call to
# continue_node_clean to continue cleaning
LOG.info('Clean step %(step)s on node %(node)s being '
'executed asynchronously, waiting for driver.',
{'node': node.uuid, 'step': step})
target_state = states.MANAGEABLE if manual_clean else None
task.process_event('wait', target_state=target_state)
return
elif result is not None:
msg = (_('While executing step %(step)s on node '
'%(node)s, step returned invalid value: %(val)s')
% {'step': step, 'node': node.uuid, 'val': result})
LOG.error(msg)
return utils.cleaning_error_handler(task, msg)
LOG.info('Node %(node)s finished clean step %(step)s',
{'node': node.uuid, 'step': step})
if CONF.agent.deploy_logs_collect == 'always':
driver_utils.collect_ramdisk_logs(task.node, label='cleaning')
# Clear clean_step
node.clean_step = None
utils.wipe_cleaning_internal_info(task)
node.save()
try:
task.driver.deploy.tear_down_cleaning(task)
except Exception as e:
msg = (_('Failed to tear down from cleaning for node %(node)s, '
'reason: %(err)s')
% {'node': node.uuid, 'err': e})
LOG.exception(msg)
return utils.cleaning_error_handler(task, msg,
tear_down_cleaning=False)
LOG.info('Node %s cleaning complete', node.uuid)
event = 'manage' if manual_clean or node.retired else 'done'
# NOTE(rloo): No need to specify target prov. state; we're done
task.process_event(event)
@task_manager.require_exclusive_lock
def do_node_clean_abort(task, step_name=None):
"""Internal method to abort an ongoing operation.
:param task: a TaskManager instance with an exclusive lock
:param step_name: The name of the clean step.
"""
node = task.node
try:
task.driver.deploy.tear_down_cleaning(task)
except Exception as e:
LOG.exception('Failed to tear down cleaning for node %(node)s '
'after aborting the operation. Error: %(err)s',
{'node': node.uuid, 'err': e})
error_msg = _('Failed to tear down cleaning after aborting '
'the operation')
utils.cleaning_error_handler(task, error_msg,
tear_down_cleaning=False,
set_fail_state=False)
return
info_message = _('Clean operation aborted for node %s') % node.uuid
last_error = _('By request, the clean operation was aborted')
if step_name:
msg = _(' after the completion of step "%s"') % step_name
last_error += msg
info_message += msg
node.last_error = last_error
node.clean_step = None
utils.wipe_cleaning_internal_info(task)
node.save()
LOG.info(info_message)
@utils.fail_on_error(utils.cleaning_error_handler,
_("Unexpected error when processing next clean step"))
@task_manager.require_exclusive_lock
def continue_node_clean(task):
"""Continue cleaning after finishing an async clean step.
This function calculates which step has to run next and passes control
into do_next_clean_step.
:param task: a TaskManager instance with an exclusive lock
"""
node = task.node
next_step_index = utils.update_next_step_index(task, 'clean')
# If this isn't the final clean step in the cleaning operation
# and it is flagged to abort after the clean step that just
# finished, we abort the cleaning operation.
if node.clean_step.get('abort_after'):
step_name = node.clean_step['step']
if next_step_index is not None:
LOG.debug('The cleaning operation for node %(node)s was '
'marked to be aborted after step "%(step)s '
'completed. Aborting now that it has completed.',
{'node': task.node.uuid, 'step': step_name})
if node.target_provision_state == states.MANAGEABLE:
target_state = states.MANAGEABLE
else:
target_state = None
task.process_event('fail', target_state=target_state)
do_node_clean_abort(task, step_name)
return
LOG.debug('The cleaning operation for node %(node)s was '
'marked to be aborted after step "%(step)s" '
'completed. However, since there are no more '
'clean steps after this, the abort is not going '
'to be done.', {'node': node.uuid,
'step': step_name})
do_next_clean_step(task, next_step_index)
|