From d6de22e31c3223a2c46f175e71d3dd3a53611842 Mon Sep 17 00:00:00 2001 From: Chris Patterson Date: Wed, 19 Apr 2023 11:33:28 -0400 Subject: azure/errors: introduce reportable errors (#2129) When provisioning failures occur on Azure, a generic description is used in the report and ultimately returned to the user. To improve the user experience, report details of the failure in a manner that is parsable, readable and succinct. The current approach is to use csv with a custom delimiter ("|") and quote character ("'"). This format may change in the future. Gracefully handle reportable errors thrown while crawling metadata and treat other exceptions as ReportableErrorUnhandledException. Future work will introduce more reportable errors to handle the expected failure cases. Signed-off-by: Chris Patterson --- cloudinit/sources/DataSourceAzure.py | 28 +++++++---- cloudinit/sources/azure/errors.py | 93 ++++++++++++++++++++++++++++++++++++ cloudinit/sources/helpers/azure.py | 15 +++--- 3 files changed, 118 insertions(+), 18 deletions(-) create mode 100644 cloudinit/sources/azure/errors.py (limited to 'cloudinit') diff --git a/cloudinit/sources/DataSourceAzure.py b/cloudinit/sources/DataSourceAzure.py index aeec6a92..83dbdce1 100644 --- a/cloudinit/sources/DataSourceAzure.py +++ b/cloudinit/sources/DataSourceAzure.py @@ -26,7 +26,7 @@ from cloudinit.net.dhcp import ( ) from cloudinit.net.ephemeral import EphemeralDHCPv4 from cloudinit.reporting import events -from cloudinit.sources.azure import identity, imds +from cloudinit.sources.azure import errors, identity, imds from cloudinit.sources.helpers import netlink from cloudinit.sources.helpers.azure import ( DEFAULT_WIRESERVER_ENDPOINT, @@ -727,11 +727,12 @@ class DataSourceAzure(sources.DataSource): msg="Crawl of metadata service", func=self.crawl_metadata, ) - except Exception as e: - report_diagnostic_event( - "Could not crawl Azure metadata: %s" % e, logger_func=LOG.error - ) - self._report_failure() + except
errors.ReportableError as error: + self._report_failure(error) + return False + except Exception as error: + reportable_error = errors.ReportableErrorUnhandledException(error) + self._report_failure(reportable_error) return False finally: self._teardown_ephemeral_networking() @@ -1170,12 +1171,17 @@ class DataSourceAzure(sources.DataSource): return reprovision_data @azure_ds_telemetry_reporter - def _report_failure(self) -> bool: + def _report_failure(self, error: errors.ReportableError) -> bool: """Tells the Azure fabric that provisioning has failed. @param description: A description of the error encountered. @return: The success status of sending the failure signal. """ + report_diagnostic_event( + f"Azure datasource failure occurred: {error.as_description()}", + logger_func=LOG.error, + ) + if self._is_ephemeral_networking_up(): try: report_diagnostic_event( @@ -1183,7 +1189,9 @@ class DataSourceAzure(sources.DataSource): "to report failure to Azure", logger_func=LOG.debug, ) - report_failure_to_fabric(endpoint=self._wireserver_endpoint) + report_failure_to_fabric( + endpoint=self._wireserver_endpoint, error=error + ) return True except Exception as e: report_diagnostic_event( @@ -1203,7 +1211,9 @@ class DataSourceAzure(sources.DataSource): except NoDHCPLeaseError: # Reporting failure will fail, but it will emit telemetry. pass - report_failure_to_fabric(endpoint=self._wireserver_endpoint) + report_failure_to_fabric( + endpoint=self._wireserver_endpoint, error=error + ) return True except Exception as e: report_diagnostic_event( diff --git a/cloudinit/sources/azure/errors.py b/cloudinit/sources/azure/errors.py new file mode 100644 index 00000000..0dd426a7 --- /dev/null +++ b/cloudinit/sources/azure/errors.py @@ -0,0 +1,93 @@ +# Copyright (C) 2022 Microsoft Corporation. +# +# This file is part of cloud-init. See LICENSE file for license information. 
+ +import base64 +import csv +import logging +import traceback +from datetime import datetime +from io import StringIO +from typing import Any, Dict, Optional + +from cloudinit import version +from cloudinit.sources.azure import identity + +LOG = logging.getLogger(__name__) + + +class ReportableError(Exception): + def __init__( + self, + reason: str, + *, + supporting_data: Optional[Dict[str, Any]] = None, + ) -> None: + self.agent = f"Cloud-Init/{version.version_string()}" + self.documentation_url = "https://aka.ms/linuxprovisioningerror" + self.reason = reason + + if supporting_data: + self.supporting_data = supporting_data + else: + self.supporting_data = {} + + self.timestamp = datetime.utcnow() + + try: + self.vm_id = identity.query_vm_id() + except Exception as id_error: + self.vm_id = f"failed to read vm id: {id_error!r}" + + def as_description( + self, *, delimiter: str = "|", quotechar: str = "'" + ) -> str: + data = [ + f"reason={self.reason}", + f"agent={self.agent}", + ] + data += [f"{k}={v}" for k, v in self.supporting_data.items()] + data += [ + f"vm_id={self.vm_id}", + f"timestamp={self.timestamp.isoformat()}", + f"documentation_url={self.documentation_url}", + ] + + with StringIO() as io: + csv.writer( + io, + delimiter=delimiter, + quotechar=quotechar, + quoting=csv.QUOTE_MINIMAL, + ).writerow(data) + + # strip trailing \r\n + csv_data = io.getvalue().rstrip() + + return f"PROVISIONING_ERROR: {csv_data}" + + def __eq__(self, other) -> bool: + return ( + isinstance(other, ReportableError) + and self.timestamp == other.timestamp + and self.reason == other.reason + and self.supporting_data == other.supporting_data + ) + + def __repr__(self) -> str: + return self.as_description() + + +class ReportableErrorUnhandledException(ReportableError): + def __init__(self, exception: Exception) -> None: + super().__init__("unhandled exception") + + trace = "".join( + traceback.format_exception( + type(exception), exception, exception.__traceback__ + ) + ) + 
trace_base64 = base64.b64encode(trace.encode("utf-8")) + + self.supporting_data["exception"] = repr(exception) + self.supporting_data["traceback_base64"] = trace_base64 diff --git a/cloudinit/sources/helpers/azure.py b/cloudinit/sources/helpers/azure.py index c0ffd760..2413d6b0 100644 --- a/cloudinit/sources/helpers/azure.py +++ b/cloudinit/sources/helpers/azure.py @@ -12,7 +12,7 @@ from contextlib import contextmanager from datetime import datetime from errno import ENOENT from time import sleep, time -from typing import Callable, List, Optional, TypeVar, Union +from typing import TYPE_CHECKING, Callable, List, Optional, TypeVar, Union from xml.etree import ElementTree from xml.sax.saxutils import escape @@ -20,6 +20,9 @@ from cloudinit import distros, subp, temp_utils, url_helper, util, version from cloudinit.reporting import events from cloudinit.settings import CFG_BUILTIN +if TYPE_CHECKING: + from cloudinit.sources.azure import errors + LOG = logging.getLogger(__name__) # Default Wireserver endpoint (if not found in DHCP option 245). @@ -43,12 +46,6 @@ azure_ds_reporter = events.ReportEventStack( reporting_enabled=True, ) -DEFAULT_REPORT_FAILURE_USER_VISIBLE_MESSAGE = ( - "The VM encountered an error during deployment. " - "Please visit https://aka.ms/linuxprovisioningerror " - "for more information on remediation." -) - T = TypeVar("T") @@ -1024,9 +1021,9 @@ def get_metadata_from_fabric( @azure_ds_telemetry_reporter -def report_failure_to_fabric(endpoint: str): +def report_failure_to_fabric(endpoint: str, error: "errors.ReportableError"): shim = WALinuxAgentShim(endpoint=endpoint) - description = DEFAULT_REPORT_FAILURE_USER_VISIBLE_MESSAGE + description = error.as_description() try: shim.register_with_azure_and_report_failure(description=description) finally: -- cgit v1.2.1