diff options
author | Julia Kreger <juliaashleykreger@gmail.com> | 2021-08-02 16:07:46 -0700 |
---|---|---|
committer | Julia Kreger <juliaashleykreger@gmail.com> | 2021-09-10 14:47:27 -0700 |
commit | d17749249cbe8507c39eb213e5e97aa1fb543a55 (patch) | |
tree | b1ae17fd632e65e9710566f3ab1dffe6335be46d /ironic/conductor/manager.py | |
parent | fbaad948d870ffd18995f5494016798c8d3c9206 (diff) | |
download | ironic-d17749249cbe8507c39eb213e5e97aa1fb543a55.tar.gz |
Record node history and manage events in db
* Adds periodic task to purge node_history entries based upon
provided configuration.
* Adds recording of node history entries for errors in the
core conductor code.
* Also changes the rescue abort behavior so that the notice is no
  longer recorded as an error, since treating it as one would likely
  mislead any process or service that evaluates the node's
  last_error field.
* Makes use of a semi-free form event_type field to help
provide some additional context into what is going on and
why. For example, if deployments are repeatedly failing,
then perhaps it is a configuration issue, as opposed to
a general failure. If a conductor has no resources, then
the failure, in theory, would point back to the conductor
itself.
Story: 2002980
Task: 42960
Change-Id: Ibfa8ac4878cacd98a43dd4424f6d53021ad91166
Diffstat (limited to 'ironic/conductor/manager.py')
-rw-r--r-- | ironic/conductor/manager.py | 128 |
1 file changed, 111 insertions, 17 deletions
diff --git a/ironic/conductor/manager.py b/ironic/conductor/manager.py index 1abb75fe1..7fe4c5959 100644 --- a/ironic/conductor/manager.py +++ b/ironic/conductor/manager.py @@ -712,7 +712,11 @@ class ConductorManager(base_manager.BaseConductorManager): def handle_failure(e, errmsg, log_func=LOG.error): utils.remove_node_rescue_password(node, save=False) - node.last_error = errmsg % e + error = errmsg % e + utils.node_history_record(task.node, event=error, + event_type=states.RESCUE, + error=True, + user=task.context.user_id) task.process_event('fail') log_func('Error while performing rescue operation for node ' '%(node)s with instance %(instance)s: %(err)s', @@ -801,7 +805,11 @@ class ConductorManager(base_manager.BaseConductorManager): node = task.node def handle_failure(e, errmsg, log_func=LOG.error): - node.last_error = errmsg % e + error = errmsg % e + utils.node_history_record(task.node, event=error, + event_type=states.RESCUE, + error=True, + user=task.context.user_id) task.process_event('fail') log_func('Error while performing unrescue operation for node ' '%(node)s with instance %(instance)s: %(err)s', @@ -845,7 +853,9 @@ class ConductorManager(base_manager.BaseConductorManager): error_msg = _('Failed to clean up rescue after aborting ' 'the operation') node.refresh() - node.last_error = error_msg + utils.node_history_record(node, event=error_msg, + event_type=states.RESCUE, error=True, + user=task.context.user_id) node.maintenance = True node.maintenance_reason = error_msg node.fault = faults.RESCUE_ABORT_FAILURE @@ -853,10 +863,15 @@ class ConductorManager(base_manager.BaseConductorManager): return info_message = _('Rescue operation aborted for node %s.') % node.uuid - last_error = _('By request, the rescue operation was aborted.') + # NOTE(TheJulia): This "error" is not an actual error, the operation + # has been aborted and the node returned to normal operation. 
+ error = _('By request, the rescue operation was aborted.') + utils.node_history_record(task.node, event=error, + event_type=states.RESCUE, + error=False, + user=task.context.user_id) node.refresh() utils.remove_agent_url(node) - node.last_error = last_error node.save() LOG.info(info_message) @@ -1053,7 +1068,11 @@ class ConductorManager(base_manager.BaseConductorManager): with excutils.save_and_reraise_exception(): LOG.exception('Error in tear_down of node %(node)s: %(err)s', {'node': node.uuid, 'err': e}) - node.last_error = _("Failed to tear down. Error: %s") % e + error = _("Failed to tear down. Error: %s") % e + utils.node_history_record(task.node, event=error, + event_type=states.UNPROVISION, + error=True, + user=task.context.user_id) task.process_event('fail') else: # NOTE(tenbrae): When tear_down finishes, the deletion is done, @@ -1339,10 +1358,18 @@ class ConductorManager(base_manager.BaseConductorManager): with excutils.save_and_reraise_exception(): LOG.exception('Error in aborting the inspection of ' 'node %(node)s', {'node': node.uuid}) - node.last_error = _('Failed to abort inspection. ' - 'Error: %s') % e + error = _('Failed to abort inspection. 
' + 'Error: %s') % e + utils.node_history_record(task.node, event=error, + event_type=states.INTROSPECTION, + error=True, + user=task.context.user_id) node.save() - node.last_error = _('Inspection was aborted by request.') + error = _('Inspection was aborted by request.') + utils.node_history_record(task.node, event=error, + event_type=states.INTROSPECTION, + error=True, + user=task.context.user_id) utils.wipe_token_and_url(task) task.process_event('abort') LOG.info('Successfully aborted inspection of node %(node)s', @@ -1659,9 +1686,14 @@ class ConductorManager(base_manager.BaseConductorManager): if not task.node.maintenance and task.node.target_power_state: old_state = task.node.target_power_state task.node.target_power_state = None - task.node.last_error = _('Pending power operation was ' - 'aborted due to conductor take ' - 'over') + error = _('Pending power operation was ' + 'aborted due to conductor take ' + 'over') + utils.node_history_record(task.node, event=error, + event_type=states.TAKEOVER, + error=True, + user=task.context.user_id) + task.node.save() LOG.warning('Aborted pending power operation %(op)s ' 'on node %(node)s due to conductor take over', @@ -1725,7 +1757,10 @@ class ConductorManager(base_manager.BaseConductorManager): LOG.error(msg) # Wipe power state from being preserved as it is likely invalid. node.power_state = states.NOSTATE - node.last_error = msg + utils.node_history_record(task.node, event=msg, + event_type=states.ADOPTION, + error=True, + user=task.context.user_id) task.process_event('fail') @METRICS.timer('ConductorManager._do_takeover') @@ -1764,7 +1799,10 @@ class ConductorManager(base_manager.BaseConductorManager): LOG.error(msg) # If taking over console failed, set node's console_enabled # back to False and set node's last error. 
- task.node.last_error = msg + utils.node_history_record(task.node, event=msg, + event_type=states.TAKEOVER, + error=True, + user=task.context.user_id) task.node.console_enabled = False console_error = True else: @@ -2231,7 +2269,10 @@ class ConductorManager(base_manager.BaseConductorManager): 'Reason: %(error)s') % {'op': op, 'node': node.uuid, 'error': e}) - node.last_error = msg + utils.node_history_record(task.node, event=msg, + event_type=states.CONSOLE, + error=True, + user=task.context.user_id) LOG.error(msg) node.save() notify_utils.emit_console_notification( @@ -3495,6 +3536,55 @@ class ConductorManager(base_manager.BaseConductorManager): task.node.save() return task.node + @METRICS.timer('ConductorManager.manage_node_history') + @periodics.periodic( + spacing=CONF.conductor.node_history_cleanup_interval, + enabled=( + CONF.conductor.node_history_cleanup_batch_count > 0 + and CONF.conductor.node_history_max_entries != 0 + ) + ) + def manage_node_history(self, context): + try: + self._manage_node_history(context) + except Exception as e: + LOG.error('Encountered error while cleaning node ' + 'history records: %s', e) + + def _manage_node_history(self, context): + """Periodic task to keep the node history tidy.""" + max_batch = CONF.conductor.node_history_cleanup_batch_count + # NOTE(TheJulia): Asks the db for the list. Presently just gets + # the node id and the count. If we incorporate by date constraint + # or handling, then it will need to be something like the method + # needs to identify the explicit ID values to delete, and then + # the deletion process needs to erase in logical chunks. + entries_to_clean = self.dbapi.query_node_history_records_for_purge( + conductor_id=self.conductor.id) + count = 0 + for node_id in entries_to_clean: + if count < max_batch: + # If we have not hit our total limit, proceed + if entries_to_clean[node_id]: + # if we have work to do on this node, proceed. 
+ self.dbapi.bulk_delete_node_history_records( + entries_to_clean[node_id]) + else: + LOG.warning('While cleaning up node history records, ' + 'we reached the maximum number of records ' + 'permitted in a single batch. If this error ' + 'is repeated, consider tuning node history ' + 'configuration options to be more aggressive ' + 'by increasing frequency and lowering the ' + 'number of entries to be deleted to not ' + 'negatively impact performance.') + break + count = count + len(entries_to_clean[node_id]) + # Yield to other threads, since we also don't want to be + # looping tightly deleting rows as that will negatively + # impact DB access if done in excess. + eventlet.sleep(0) + @METRICS.timer('get_vendor_passthru_metadata') def get_vendor_passthru_metadata(route_dict): @@ -3534,7 +3624,9 @@ def handle_sync_power_state_max_retries_exceeded(task, actual_power_state, old_power_state = node.power_state node.power_state = actual_power_state - node.last_error = msg + utils.node_history_record(task.node, event=msg, + event_type=states.MONITORING, + error=True) node.maintenance = True node.maintenance_reason = msg node.fault = faults.POWER_FAILURE @@ -3688,7 +3780,9 @@ def _do_inspect_hardware(task): node = task.node def handle_failure(e, log_func=LOG.error): - node.last_error = e + utils.node_history_record(task.node, event=e, + event_type=states.INTROSPECTION, + error=True, user=task.context.user_id) task.process_event('fail') log_func("Failed to inspect node %(node)s: %(err)s", {'node': node.uuid, 'err': e}) |