diff options
Diffstat (limited to 'cloudinit')
-rw-r--r-- | cloudinit/sources/DataSourceAzure.py | 28 | ||||
-rw-r--r-- | cloudinit/sources/azure/errors.py | 93 | ||||
-rw-r--r-- | cloudinit/sources/helpers/azure.py | 15 |
3 files changed, 118 insertions, 18 deletions
diff --git a/cloudinit/sources/DataSourceAzure.py b/cloudinit/sources/DataSourceAzure.py index aeec6a92..83dbdce1 100644 --- a/cloudinit/sources/DataSourceAzure.py +++ b/cloudinit/sources/DataSourceAzure.py @@ -26,7 +26,7 @@ from cloudinit.net.dhcp import ( ) from cloudinit.net.ephemeral import EphemeralDHCPv4 from cloudinit.reporting import events -from cloudinit.sources.azure import identity, imds +from cloudinit.sources.azure import errors, identity, imds from cloudinit.sources.helpers import netlink from cloudinit.sources.helpers.azure import ( DEFAULT_WIRESERVER_ENDPOINT, @@ -727,11 +727,12 @@ class DataSourceAzure(sources.DataSource): msg="Crawl of metadata service", func=self.crawl_metadata, ) - except Exception as e: - report_diagnostic_event( - "Could not crawl Azure metadata: %s" % e, logger_func=LOG.error - ) - self._report_failure() + except errors.ReportableError as error: + self._report_failure(error) + return False + except Exception as error: + reportable_error = errors.ReportableErrorUnhandledException(error) + self._report_failure(reportable_error) return False finally: self._teardown_ephemeral_networking() @@ -1170,12 +1171,17 @@ class DataSourceAzure(sources.DataSource): return reprovision_data @azure_ds_telemetry_reporter - def _report_failure(self) -> bool: + def _report_failure(self, error: errors.ReportableError) -> bool: """Tells the Azure fabric that provisioning has failed. @param description: A description of the error encountered. @return: The success status of sending the failure signal. """ + report_diagnostic_event( + f"Azure datasource failure occurred: {error.as_description()}", + logger_func=LOG.error, + ) + if self._is_ephemeral_networking_up(): try: report_diagnostic_event( @@ -1183,7 +1189,9 @@ class DataSourceAzure(sources.DataSource): "to report failure to Azure", logger_func=LOG.debug, ) - report_failure_to_fabric(endpoint=self._wireserver_endpoint) + report_failure_to_fabric( + endpoint=self._wireserver_endpoint, error=error + ) return True except Exception as e: report_diagnostic_event( @@ -1203,7 +1211,9 @@ class DataSourceAzure(sources.DataSource): except NoDHCPLeaseError: # Reporting failure will fail, but it will emit telemetry. pass - report_failure_to_fabric(endpoint=self._wireserver_endpoint) + report_failure_to_fabric( + endpoint=self._wireserver_endpoint, error=error + ) return True except Exception as e: report_diagnostic_event( diff --git a/cloudinit/sources/azure/errors.py b/cloudinit/sources/azure/errors.py new file mode 100644 index 00000000..0dd426a7 --- /dev/null +++ b/cloudinit/sources/azure/errors.py @@ -0,0 +1,93 @@ +# Copyright (C) 2022 Microsoft Corporation. +# +# This file is part of cloud-init. See LICENSE file for license information. + +import base64 +import csv +import logging +import traceback +from datetime import datetime +from io import StringIO +from typing import Any, Dict, Optional + +from cloudinit import version +from cloudinit.sources.azure import identity + +LOG = logging.getLogger(__name__) + + +class ReportableError(Exception): + def __init__( + self, + reason: str, + *, + supporting_data: Optional[Dict[str, Any]] = None, + ) -> None: + self.agent = f"Cloud-Init/{version.version_string()}" + self.documentation_url = "https://aka.ms/linuxprovisioningerror" + self.reason = reason + + if supporting_data: + self.supporting_data = supporting_data + else: + self.supporting_data = {} + + self.timestamp = datetime.utcnow() + + try: + self.vm_id = identity.query_vm_id() + except Exception as id_error: + self.vm_id = f"failed to read vm id: {id_error!r}" + + def as_description( + self, *, delimiter: str = "|", quotechar: str = "'" + ) -> str: + data = [ + f"reason={self.reason}", + f"agent={self.agent}", + ] + data += [f"{k}={v}" for k, v in self.supporting_data.items()] + data += [ + f"vm_id={self.vm_id}", + f"timestamp={self.timestamp.isoformat()}", + f"documentation_url={self.documentation_url}", + ] + + with StringIO() as io: + csv.writer( + io, + delimiter=delimiter, + quotechar=quotechar, + quoting=csv.QUOTE_MINIMAL, + ).writerow(data) + + # strip trailing \r\n + csv_data = io.getvalue().rstrip() + + return f"PROVISIONING_ERROR: {csv_data}" + + def __eq__(self, other) -> bool: + return ( + isinstance(other, ReportableError) + and self.timestamp == other.timestamp + and self.reason == other.reason + and self.supporting_data == other.supporting_data + ) + + def __repr__(self) -> str: + return self.as_description() + + +class ReportableErrorUnhandledException(ReportableError): + def __init__(self, exception: Exception) -> None: + super().__init__("unhandled exception") + + trace = "".join( + traceback.format_exception( + type(exception), exception, exception.__traceback__ + ) + ) + trace_base64 = base64.b64encode(trace.encode("utf-8")) + + self.supporting_data["exception"] = repr(exception) + self.supporting_data["traceback_base64"] = trace_base64 diff --git a/cloudinit/sources/helpers/azure.py b/cloudinit/sources/helpers/azure.py index c0ffd760..2413d6b0 100644 --- a/cloudinit/sources/helpers/azure.py +++ b/cloudinit/sources/helpers/azure.py @@ -12,7 +12,7 @@ from contextlib import contextmanager from datetime import datetime from errno import ENOENT from time import sleep, time -from typing import Callable, List, Optional, TypeVar, Union +from typing import TYPE_CHECKING, Callable, List, Optional, TypeVar, Union from xml.etree import ElementTree from xml.sax.saxutils import escape @@ -20,6 +20,9 @@ from cloudinit import distros, subp, temp_utils, url_helper, util, version from cloudinit.reporting import events from cloudinit.settings import CFG_BUILTIN +if TYPE_CHECKING: + from cloudinit.sources.azure import errors + LOG = logging.getLogger(__name__) # Default Wireserver endpoint (if not found in DHCP option 245). @@ -43,12 +46,6 @@ azure_ds_reporter = events.ReportEventStack( reporting_enabled=True, ) -DEFAULT_REPORT_FAILURE_USER_VISIBLE_MESSAGE = ( - "The VM encountered an error during deployment. " - "Please visit https://aka.ms/linuxprovisioningerror " - "for more information on remediation." -) - T = TypeVar("T") @@ -1024,9 +1021,9 @@ def get_metadata_from_fabric( @azure_ds_telemetry_reporter -def report_failure_to_fabric(endpoint: str): +def report_failure_to_fabric(endpoint: str, error: "errors.ReportableError"): shim = WALinuxAgentShim(endpoint=endpoint) - description = DEFAULT_REPORT_FAILURE_USER_VISIBLE_MESSAGE + description = error.as_description() try: shim.register_with_azure_and_report_failure(description=description) finally: |