summaryrefslogtreecommitdiff
path: root/cloudinit
diff options
context:
space:
mode:
authorChris Patterson <cpatterson@microsoft.com>2023-04-19 11:33:28 -0400
committerGitHub <noreply@github.com>2023-04-19 17:33:28 +0200
commitd6de22e31c3223a2c46f175e71d3dd3a53611842 (patch)
tree26ec41737c3d7dc5af6e4a262b89a7743d924f7c /cloudinit
parent3ee384680e0a615834c1cb386be88c94f004b9b5 (diff)
downloadcloud-init-git-d6de22e31c3223a2c46f175e71d3dd3a53611842.tar.gz
azure/errors: introduce reportable errors (#2129)
When provisioning failures occur on Azure, a generic description is used in the report and ultimately returned to the user. To improve the user experience, report details of the failure in a manner that is parsable, readable and succinct. The current approach is to use csv with a custom delimiter ("|") and quote character ("'"). This format may change in the future. Gracefully handle reportable errors thrown while crawling metadata and treat other exceptions as ReportableErrorUnhandledException. Future work will introduce more reportable errors to handle the expected failure cases. Signed-off-by: Chris Patterson <cpatterson@microsoft.com>
Diffstat (limited to 'cloudinit')
-rw-r--r--cloudinit/sources/DataSourceAzure.py28
-rw-r--r--cloudinit/sources/azure/errors.py93
-rw-r--r--cloudinit/sources/helpers/azure.py15
3 files changed, 118 insertions, 18 deletions
diff --git a/cloudinit/sources/DataSourceAzure.py b/cloudinit/sources/DataSourceAzure.py
index aeec6a92..83dbdce1 100644
--- a/cloudinit/sources/DataSourceAzure.py
+++ b/cloudinit/sources/DataSourceAzure.py
@@ -26,7 +26,7 @@ from cloudinit.net.dhcp import (
)
from cloudinit.net.ephemeral import EphemeralDHCPv4
from cloudinit.reporting import events
-from cloudinit.sources.azure import identity, imds
+from cloudinit.sources.azure import errors, identity, imds
from cloudinit.sources.helpers import netlink
from cloudinit.sources.helpers.azure import (
DEFAULT_WIRESERVER_ENDPOINT,
@@ -727,11 +727,12 @@ class DataSourceAzure(sources.DataSource):
msg="Crawl of metadata service",
func=self.crawl_metadata,
)
- except Exception as e:
- report_diagnostic_event(
- "Could not crawl Azure metadata: %s" % e, logger_func=LOG.error
- )
- self._report_failure()
+ except errors.ReportableError as error:
+ self._report_failure(error)
+ return False
+ except Exception as error:
+ reportable_error = errors.ReportableErrorUnhandledException(error)
+ self._report_failure(reportable_error)
return False
finally:
self._teardown_ephemeral_networking()
@@ -1170,12 +1171,17 @@ class DataSourceAzure(sources.DataSource):
return reprovision_data
@azure_ds_telemetry_reporter
- def _report_failure(self) -> bool:
+ def _report_failure(self, error: errors.ReportableError) -> bool:
"""Tells the Azure fabric that provisioning has failed.
@param description: A description of the error encountered.
@return: The success status of sending the failure signal.
"""
+ report_diagnostic_event(
+ f"Azure datasource failure occurred: {error.as_description()}",
+ logger_func=LOG.error,
+ )
+
if self._is_ephemeral_networking_up():
try:
report_diagnostic_event(
@@ -1183,7 +1189,9 @@ class DataSourceAzure(sources.DataSource):
"to report failure to Azure",
logger_func=LOG.debug,
)
- report_failure_to_fabric(endpoint=self._wireserver_endpoint)
+ report_failure_to_fabric(
+ endpoint=self._wireserver_endpoint, error=error
+ )
return True
except Exception as e:
report_diagnostic_event(
@@ -1203,7 +1211,9 @@ class DataSourceAzure(sources.DataSource):
except NoDHCPLeaseError:
# Reporting failure will fail, but it will emit telemetry.
pass
- report_failure_to_fabric(endpoint=self._wireserver_endpoint)
+ report_failure_to_fabric(
+ endpoint=self._wireserver_endpoint, error=error
+ )
return True
except Exception as e:
report_diagnostic_event(
diff --git a/cloudinit/sources/azure/errors.py b/cloudinit/sources/azure/errors.py
new file mode 100644
index 00000000..0dd426a7
--- /dev/null
+++ b/cloudinit/sources/azure/errors.py
@@ -0,0 +1,93 @@
+# Copyright (C) 2022 Microsoft Corporation.
+#
+# This file is part of cloud-init. See LICENSE file for license information.
+
+import base64
+import csv
+import logging
+import traceback
+from datetime import datetime
+from io import StringIO
+from typing import Any, Dict, Optional
+
+from cloudinit import version
+from cloudinit.sources.azure import identity
+
+LOG = logging.getLogger(__name__)
+
+
+class ReportableError(Exception):
+    """Provisioning failure that can be rendered as a parsable report line.
+
+    Each instance records the failure reason, the reporting agent string,
+    a documentation URL, optional supporting key/value data, a creation
+    timestamp and the VM id (or a note explaining why it could not be
+    read).  as_description() renders these fields as a single delimited
+    line.
+    """
+
+    def __init__(
+        self,
+        reason: str,
+        *,
+        supporting_data: Optional[Dict[str, Any]] = None,
+    ) -> None:
+        """Capture the failure reason and the context needed to report it.
+
+        @param reason: Short description of what failed.
+        @param supporting_data: Optional extra key/value pairs emitted
+            between the agent and vm_id fields of the description.
+        """
+        # Populate Exception.args so str(error) yields the reason rather
+        # than an empty string.
+        super().__init__(reason)
+
+        self.agent = f"Cloud-Init/{version.version_string()}"
+        self.documentation_url = "https://aka.ms/linuxprovisioningerror"
+        self.reason = reason
+
+        # Copy so that later additions to self.supporting_data never
+        # mutate the dictionary owned by the caller.
+        self.supporting_data: Dict[str, Any] = dict(supporting_data or {})
+
+        # Naive datetime expressed in UTC.
+        self.timestamp = datetime.utcnow()
+
+        # Building the error must not fail just because the VM id lookup
+        # did; record the lookup failure in its place.
+        try:
+            self.vm_id = identity.query_vm_id()
+        except Exception as id_error:
+            self.vm_id = f"failed to read vm id: {id_error!r}"
+
+    def as_description(
+        self, *, delimiter: str = "|", quotechar: str = "'"
+    ) -> str:
+        """Render the error as one "PROVISIONING_ERROR: ..." line.
+
+        Fields are written as key=value entries in a single csv row:
+        reason, agent, each supporting_data item in insertion order,
+        vm_id, timestamp (ISO 8601) and documentation_url.
+
+        @param delimiter: Character separating the fields.
+        @param quotechar: Character used to quote fields that need it.
+        @return: The description string.
+        """
+        data = [
+            f"reason={self.reason}",
+            f"agent={self.agent}",
+        ]
+        data += [f"{k}={v}" for k, v in self.supporting_data.items()]
+        data += [
+            f"vm_id={self.vm_id}",
+            f"timestamp={self.timestamp.isoformat()}",
+            f"documentation_url={self.documentation_url}",
+        ]
+
+        with StringIO() as buffer:
+            csv.writer(
+                buffer,
+                delimiter=delimiter,
+                quotechar=quotechar,
+                quoting=csv.QUOTE_MINIMAL,
+            ).writerow(data)
+
+            # strip trailing \r\n
+            csv_data = buffer.getvalue().rstrip()
+
+        return f"PROVISIONING_ERROR: {csv_data}"
+
+    def __eq__(self, other) -> bool:
+        """Compare by timestamp, reason and supporting data."""
+        return (
+            isinstance(other, ReportableError)
+            and self.timestamp == other.timestamp
+            and self.reason == other.reason
+            and self.supporting_data == other.supporting_data
+        )
+
+    def __hash__(self) -> int:
+        """Hash on the hashable fields that __eq__ compares.
+
+        Defining __eq__ alone sets __hash__ to None, which would make
+        instances unhashable.  supporting_data is a dict and is left out;
+        equal errors still hash equal.
+        """
+        return hash((self.timestamp, self.reason))
+
+    def __repr__(self) -> str:
+        """Return the same text as as_description() with its defaults."""
+        return self.as_description()
+
+
+class ReportableErrorUnhandledException(ReportableError):
+    """Reportable error wrapping an exception that was not anticipated."""
+
+    def __init__(self, exception: Exception) -> None:
+        """Record the exception's repr and its base64-encoded traceback.
+
+        @param exception: The unhandled exception to report.
+        """
+        super().__init__("unhandled exception")
+
+        trace = "".join(
+            traceback.format_exception(
+                type(exception), exception, exception.__traceback__
+            )
+        )
+        # b64encode() returns bytes; decode so the description carries the
+        # plain base64 text instead of a "b'...'" bytes repr, whose quotes
+        # collide with the default csv quote character.
+        trace_base64 = base64.b64encode(trace.encode("utf-8")).decode("utf-8")
+
+        self.supporting_data["exception"] = repr(exception)
+        self.supporting_data["traceback_base64"] = trace_base64
diff --git a/cloudinit/sources/helpers/azure.py b/cloudinit/sources/helpers/azure.py
index c0ffd760..2413d6b0 100644
--- a/cloudinit/sources/helpers/azure.py
+++ b/cloudinit/sources/helpers/azure.py
@@ -12,7 +12,7 @@ from contextlib import contextmanager
from datetime import datetime
from errno import ENOENT
from time import sleep, time
-from typing import Callable, List, Optional, TypeVar, Union
+from typing import TYPE_CHECKING, Callable, List, Optional, TypeVar, Union
from xml.etree import ElementTree
from xml.sax.saxutils import escape
@@ -20,6 +20,9 @@ from cloudinit import distros, subp, temp_utils, url_helper, util, version
from cloudinit.reporting import events
from cloudinit.settings import CFG_BUILTIN
+if TYPE_CHECKING:
+ from cloudinit.sources.azure import errors
+
LOG = logging.getLogger(__name__)
# Default Wireserver endpoint (if not found in DHCP option 245).
@@ -43,12 +46,6 @@ azure_ds_reporter = events.ReportEventStack(
reporting_enabled=True,
)
-DEFAULT_REPORT_FAILURE_USER_VISIBLE_MESSAGE = (
- "The VM encountered an error during deployment. "
- "Please visit https://aka.ms/linuxprovisioningerror "
- "for more information on remediation."
-)
-
T = TypeVar("T")
@@ -1024,9 +1021,9 @@ def get_metadata_from_fabric(
@azure_ds_telemetry_reporter
-def report_failure_to_fabric(endpoint: str):
+def report_failure_to_fabric(endpoint: str, error: "errors.ReportableError"):
shim = WALinuxAgentShim(endpoint=endpoint)
- description = DEFAULT_REPORT_FAILURE_USER_VISIBLE_MESSAGE
+ description = error.as_description()
try:
shim.register_with_azure_and_report_failure(description=description)
finally: