diff options
author | Julia Kreger <juliaashleykreger@gmail.com> | 2021-08-02 16:07:46 -0700 |
---|---|---|
committer | Julia Kreger <juliaashleykreger@gmail.com> | 2021-09-10 14:47:27 -0700 |
commit | d17749249cbe8507c39eb213e5e97aa1fb543a55 (patch) | |
tree | b1ae17fd632e65e9710566f3ab1dffe6335be46d /ironic/conductor/manager.py | |
parent | fbaad948d870ffd18995f5494016798c8d3c9206 (diff) | |
download | ironic-d17749249cbe8507c39eb213e5e97aa1fb543a55.tar.gz |
Record node history and manage events in db
* Adds periodic task to purge node_history entries based upon
provided configuration.
* Adds recording of node history entries for errors in the
core conductor code.
* Also changes the rescue abort behavior so that the notice is no
  longer recorded as an error, since treating it as one would likely
  mislead any process or service that evaluates the node's
  last_error field.
* Makes use of a semi-free form event_type field to help
provide some additional context into what is going on and
why. For example, if deployments are repeatedly failing,
then perhaps it is a configuration issue, as opposed to
a general failure. If a conductor has no resources, then
the failure, in theory, would point back to the conductor
itself.
Story: 2002980
Task: 42960
Change-Id: Ibfa8ac4878cacd98a43dd4424f6d53021ad91166
Diffstat (limited to 'ironic/conductor/manager.py')
-rw-r--r-- | ironic/conductor/manager.py | 128 |
1 file changed, 111 insertions, 17 deletions
diff --git a/ironic/conductor/manager.py b/ironic/conductor/manager.py index 1abb75fe1..7fe4c5959 100644 --- a/ironic/conductor/manager.py +++ b/ironic/conductor/manager.py @@ -712,7 +712,11 @@ class ConductorManager(base_manager.BaseConductorManager): def handle_failure(e, errmsg, log_func=LOG.error): utils.remove_node_rescue_password(node, save=False) - node.last_error = errmsg % e + error = errmsg % e + utils.node_history_record(task.node, event=error, + event_type=states.RESCUE, + error=True, + user=task.context.user_id) task.process_event('fail') log_func('Error while performing rescue operation for node ' '%(node)s with instance %(instance)s: %(err)s', @@ -801,7 +805,11 @@ class ConductorManager(base_manager.BaseConductorManager): node = task.node def handle_failure(e, errmsg, log_func=LOG.error): - node.last_error = errmsg % e + error = errmsg % e + utils.node_history_record(task.node, event=error, + event_type=states.RESCUE, + error=True, + user=task.context.user_id) task.process_event('fail') log_func('Error while performing unrescue operation for node ' '%(node)s with instance %(instance)s: %(err)s', @@ -845,7 +853,9 @@ class ConductorManager(base_manager.BaseConductorManager): error_msg = _('Failed to clean up rescue after aborting ' 'the operation') node.refresh() - node.last_error = error_msg + utils.node_history_record(node, event=error_msg, + event_type=states.RESCUE, error=True, + user=task.context.user_id) node.maintenance = True node.maintenance_reason = error_msg node.fault = faults.RESCUE_ABORT_FAILURE @@ -853,10 +863,15 @@ class ConductorManager(base_manager.BaseConductorManager): return info_message = _('Rescue operation aborted for node %s.') % node.uuid - last_error = _('By request, the rescue operation was aborted.') + # NOTE(TheJulia): This "error" is not an actual error, the operation + # has been aborted and the node returned to normal operation. 
+ error = _('By request, the rescue operation was aborted.') + utils.node_history_record(task.node, event=error, + event_type=states.RESCUE, + error=False, + user=task.context.user_id) node.refresh() utils.remove_agent_url(node) - node.last_error = last_error node.save() LOG.info(info_message) @@ -1053,7 +1068,11 @@ class ConductorManager(base_manager.BaseConductorManager): with excutils.save_and_reraise_exception(): LOG.exception('Error in tear_down of node %(node)s: %(err)s', {'node': node.uuid, 'err': e}) - node.last_error = _("Failed to tear down. Error: %s") % e + error = _("Failed to tear down. Error: %s") % e + utils.node_history_record(task.node, event=error, + event_type=states.UNPROVISION, + error=True, + user=task.context.user_id) task.process_event('fail') else: # NOTE(tenbrae): When tear_down finishes, the deletion is done, @@ -1339,10 +1358,18 @@ class ConductorManager(base_manager.BaseConductorManager): with excutils.save_and_reraise_exception(): LOG.exception('Error in aborting the inspection of ' 'node %(node)s', {'node': node.uuid}) - node.last_error = _('Failed to abort inspection. ' - 'Error: %s') % e + error = _('Failed to abort inspection. 
' + 'Error: %s') % e + utils.node_history_record(task.node, event=error, + event_type=states.INTROSPECTION, + error=True, + user=task.context.user_id) node.save() - node.last_error = _('Inspection was aborted by request.') + error = _('Inspection was aborted by request.') + utils.node_history_record(task.node, event=error, + event_type=states.INTROSPECTION, + error=True, + user=task.context.user_id) utils.wipe_token_and_url(task) task.process_event('abort') LOG.info('Successfully aborted inspection of node %(node)s', @@ -1659,9 +1686,14 @@ class ConductorManager(base_manager.BaseConductorManager): if not task.node.maintenance and task.node.target_power_state: old_state = task.node.target_power_state task.node.target_power_state = None - task.node.last_error = _('Pending power operation was ' - 'aborted due to conductor take ' - 'over') + error = _('Pending power operation was ' + 'aborted due to conductor take ' + 'over') + utils.node_history_record(task.node, event=error, + event_type=states.TAKEOVER, + error=True, + user=task.context.user_id) + task.node.save() LOG.warning('Aborted pending power operation %(op)s ' 'on node %(node)s due to conductor take over', @@ -1725,7 +1757,10 @@ class ConductorManager(base_manager.BaseConductorManager): LOG.error(msg) # Wipe power state from being preserved as it is likely invalid. node.power_state = states.NOSTATE - node.last_error = msg + utils.node_history_record(task.node, event=msg, + event_type=states.ADOPTION, + error=True, + user=task.context.user_id) task.process_event('fail') @METRICS.timer('ConductorManager._do_takeover') @@ -1764,7 +1799,10 @@ class ConductorManager(base_manager.BaseConductorManager): LOG.error(msg) # If taking over console failed, set node's console_enabled # back to False and set node's last error. 
- task.node.last_error = msg + utils.node_history_record(task.node, event=msg, + event_type=states.TAKEOVER, + error=True, + user=task.context.user_id) task.node.console_enabled = False console_error = True else: @@ -2231,7 +2269,10 @@ class ConductorManager(base_manager.BaseConductorManager): 'Reason: %(error)s') % {'op': op, 'node': node.uuid, 'error': e}) - node.last_error = msg + utils.node_history_record(task.node, event=msg, + event_type=states.CONSOLE, + error=True, + user=task.context.user_id) LOG.error(msg) node.save() notify_utils.emit_console_notification( @@ -3495,6 +3536,55 @@ class ConductorManager(base_manager.BaseConductorManager): task.node.save() return task.node + @METRICS.timer('ConductorManager.manage_node_history') + @periodics.periodic( + spacing=CONF.conductor.node_history_cleanup_interval, + enabled=( + CONF.conductor.node_history_cleanup_batch_count > 0 + and CONF.conductor.node_history_max_entries != 0 + ) + ) + def manage_node_history(self, context): + try: + self._manage_node_history(context) + except Exception as e: + LOG.error('Encountered error while cleaning node ' + 'history records: %s', e) + + def _manage_node_history(self, context): + """Periodic task to keep the node history tidy.""" + max_batch = CONF.conductor.node_history_cleanup_batch_count + # NOTE(TheJulia): Asks the db for the list. Presently just gets + # the node id and the count. If we incorporate by date constraint + # or handling, then it will need to be something like the method + # needs to identify the explicit ID values to delete, and then + # the deletion process needs to erase in logical chunks. + entries_to_clean = self.dbapi.query_node_history_records_for_purge( + conductor_id=self.conductor.id) + count = 0 + for node_id in entries_to_clean: + if count < max_batch: + # If we have not hit our total limit, proceed + if entries_to_clean[node_id]: + # if we have work to do on this node, proceed. 
+ self.dbapi.bulk_delete_node_history_records( + entries_to_clean[node_id]) + else: + LOG.warning('While cleaning up node history records, ' + 'we reached the maximum number of records ' + 'permitted in a single batch. If this error ' + 'is repeated, consider tuning node history ' + 'configuration options to be more aggressive ' + 'by increasing frequency and lowering the ' + 'number of entries to be deleted to not ' + 'negatively impact performance.') + break + count = count + len(entries_to_clean[node_id]) + # Yield to other threads, since we also don't want to be + # looping tightly deleting rows as that will negatively + # impact DB access if done in excess. + eventlet.sleep(0) + @METRICS.timer('get_vendor_passthru_metadata') def get_vendor_passthru_metadata(route_dict): @@ -3534,7 +3624,9 @@ def handle_sync_power_state_max_retries_exceeded(task, actual_power_state, old_power_state = node.power_state node.power_state = actual_power_state - node.last_error = msg + utils.node_history_record(task.node, event=msg, + event_type=states.MONITORING, + error=True) node.maintenance = True node.maintenance_reason = msg node.fault = faults.POWER_FAILURE @@ -3688,7 +3780,9 @@ def _do_inspect_hardware(task): node = task.node def handle_failure(e, log_func=LOG.error): - node.last_error = e + utils.node_history_record(task.node, event=e, + event_type=states.INTROSPECTION, + error=True, user=task.context.user_id) task.process_event('fail') log_func("Failed to inspect node %(node)s: %(err)s", {'node': node.uuid, 'err': e}) |