summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorChris Patterson <cpatterson@microsoft.com>2023-04-19 11:33:28 -0400
committerGitHub <noreply@github.com>2023-04-19 17:33:28 +0200
commitd6de22e31c3223a2c46f175e71d3dd3a53611842 (patch)
tree26ec41737c3d7dc5af6e4a262b89a7743d924f7c
parent3ee384680e0a615834c1cb386be88c94f004b9b5 (diff)
downloadcloud-init-git-d6de22e31c3223a2c46f175e71d3dd3a53611842.tar.gz
azure/errors: introduce reportable errors (#2129)
When provisioning failures occur an Azure, a generic description is used in the report and ultimately returned to the user. To improve the user experience, report details of the failure in a manner that is parsable, readable and succinct. The current approach is to use csv with a custom delimiter ("|") and quote character ("'"). This format may change in the future. Gracefully handle reportable errors thrown while crawling metadata and treat other exceptions as ReportableErrorUnhandledException. Future work will introduce more reportable errors to handle the expected failure cases. Signed-off-by: Chris Patterson <cpatterson@microsoft.com>
-rw-r--r--cloudinit/sources/DataSourceAzure.py28
-rw-r--r--cloudinit/sources/azure/errors.py93
-rw-r--r--cloudinit/sources/helpers/azure.py15
-rw-r--r--tests/unittests/sources/azure/test_errors.py137
-rw-r--r--tests/unittests/sources/test_azure.py30
-rw-r--r--tests/unittests/sources/test_azure_helper.py28
6 files changed, 287 insertions, 44 deletions
diff --git a/cloudinit/sources/DataSourceAzure.py b/cloudinit/sources/DataSourceAzure.py
index aeec6a92..83dbdce1 100644
--- a/cloudinit/sources/DataSourceAzure.py
+++ b/cloudinit/sources/DataSourceAzure.py
@@ -26,7 +26,7 @@ from cloudinit.net.dhcp import (
)
from cloudinit.net.ephemeral import EphemeralDHCPv4
from cloudinit.reporting import events
-from cloudinit.sources.azure import identity, imds
+from cloudinit.sources.azure import errors, identity, imds
from cloudinit.sources.helpers import netlink
from cloudinit.sources.helpers.azure import (
DEFAULT_WIRESERVER_ENDPOINT,
@@ -727,11 +727,12 @@ class DataSourceAzure(sources.DataSource):
msg="Crawl of metadata service",
func=self.crawl_metadata,
)
- except Exception as e:
- report_diagnostic_event(
- "Could not crawl Azure metadata: %s" % e, logger_func=LOG.error
- )
- self._report_failure()
+ except errors.ReportableError as error:
+ self._report_failure(error)
+ return False
+ except Exception as error:
+ reportable_error = errors.ReportableErrorUnhandledException(error)
+ self._report_failure(reportable_error)
return False
finally:
self._teardown_ephemeral_networking()
@@ -1170,12 +1171,17 @@ class DataSourceAzure(sources.DataSource):
return reprovision_data
@azure_ds_telemetry_reporter
- def _report_failure(self) -> bool:
+ def _report_failure(self, error: errors.ReportableError) -> bool:
"""Tells the Azure fabric that provisioning has failed.
@param description: A description of the error encountered.
@return: The success status of sending the failure signal.
"""
+ report_diagnostic_event(
+ f"Azure datasource failure occurred: {error.as_description()}",
+ logger_func=LOG.error,
+ )
+
if self._is_ephemeral_networking_up():
try:
report_diagnostic_event(
@@ -1183,7 +1189,9 @@ class DataSourceAzure(sources.DataSource):
"to report failure to Azure",
logger_func=LOG.debug,
)
- report_failure_to_fabric(endpoint=self._wireserver_endpoint)
+ report_failure_to_fabric(
+ endpoint=self._wireserver_endpoint, error=error
+ )
return True
except Exception as e:
report_diagnostic_event(
@@ -1203,7 +1211,9 @@ class DataSourceAzure(sources.DataSource):
except NoDHCPLeaseError:
# Reporting failure will fail, but it will emit telemetry.
pass
- report_failure_to_fabric(endpoint=self._wireserver_endpoint)
+ report_failure_to_fabric(
+ endpoint=self._wireserver_endpoint, error=error
+ )
return True
except Exception as e:
report_diagnostic_event(
diff --git a/cloudinit/sources/azure/errors.py b/cloudinit/sources/azure/errors.py
new file mode 100644
index 00000000..0dd426a7
--- /dev/null
+++ b/cloudinit/sources/azure/errors.py
@@ -0,0 +1,93 @@
+# Copyright (C) 2022 Microsoft Corporation.
+#
+# This file is part of cloud-init. See LICENSE file for license information.
+
+import base64
+import csv
+import logging
+import traceback
+from datetime import datetime
+from io import StringIO
+from typing import Any, Dict, Optional
+
+from cloudinit import version
+from cloudinit.sources.azure import identity
+
+LOG = logging.getLogger(__name__)
+
+
+class ReportableError(Exception):
+ def __init__(
+ self,
+ reason: str,
+ *,
+ supporting_data: Optional[Dict[str, Any]] = None,
+ ) -> None:
+ self.agent = f"Cloud-Init/{version.version_string()}"
+ self.documentation_url = "https://aka.ms/linuxprovisioningerror"
+ self.reason = reason
+
+ if supporting_data:
+ self.supporting_data = supporting_data
+ else:
+ self.supporting_data = {}
+
+ self.timestamp = datetime.utcnow()
+
+ try:
+ self.vm_id = identity.query_vm_id()
+ except Exception as id_error:
+ self.vm_id = f"failed to read vm id: {id_error!r}"
+
+ def as_description(
+ self, *, delimiter: str = "|", quotechar: str = "'"
+ ) -> str:
+ data = [
+ f"reason={self.reason}",
+ f"agent={self.agent}",
+ ]
+ data += [f"{k}={v}" for k, v in self.supporting_data.items()]
+ data += [
+ f"vm_id={self.vm_id}",
+ f"timestamp={self.timestamp.isoformat()}",
+ f"documentation_url={self.documentation_url}",
+ ]
+
+ with StringIO() as io:
+ csv.writer(
+ io,
+ delimiter=delimiter,
+ quotechar=quotechar,
+ quoting=csv.QUOTE_MINIMAL,
+ ).writerow(data)
+
+ # strip trailing \r\n
+ csv_data = io.getvalue().rstrip()
+
+ return f"PROVISIONING_ERROR: {csv_data}"
+
+ def __eq__(self, other) -> bool:
+ return (
+ isinstance(other, ReportableError)
+ and self.timestamp == other.timestamp
+ and self.reason == other.reason
+ and self.supporting_data == other.supporting_data
+ )
+
+ def __repr__(self) -> str:
+ return self.as_description()
+
+
+class ReportableErrorUnhandledException(ReportableError):
+ def __init__(self, exception: Exception) -> None:
+ super().__init__("unhandled exception")
+
+ trace = "".join(
+ traceback.format_exception(
+ type(exception), exception, exception.__traceback__
+ )
+ )
+ trace_base64 = base64.b64encode(trace.encode("utf-8"))
+
+ self.supporting_data["exception"] = repr(exception)
+ self.supporting_data["traceback_base64"] = trace_base64
diff --git a/cloudinit/sources/helpers/azure.py b/cloudinit/sources/helpers/azure.py
index c0ffd760..2413d6b0 100644
--- a/cloudinit/sources/helpers/azure.py
+++ b/cloudinit/sources/helpers/azure.py
@@ -12,7 +12,7 @@ from contextlib import contextmanager
from datetime import datetime
from errno import ENOENT
from time import sleep, time
-from typing import Callable, List, Optional, TypeVar, Union
+from typing import TYPE_CHECKING, Callable, List, Optional, TypeVar, Union
from xml.etree import ElementTree
from xml.sax.saxutils import escape
@@ -20,6 +20,9 @@ from cloudinit import distros, subp, temp_utils, url_helper, util, version
from cloudinit.reporting import events
from cloudinit.settings import CFG_BUILTIN
+if TYPE_CHECKING:
+ from cloudinit.sources.azure import errors
+
LOG = logging.getLogger(__name__)
# Default Wireserver endpoint (if not found in DHCP option 245).
@@ -43,12 +46,6 @@ azure_ds_reporter = events.ReportEventStack(
reporting_enabled=True,
)
-DEFAULT_REPORT_FAILURE_USER_VISIBLE_MESSAGE = (
- "The VM encountered an error during deployment. "
- "Please visit https://aka.ms/linuxprovisioningerror "
- "for more information on remediation."
-)
-
T = TypeVar("T")
@@ -1024,9 +1021,9 @@ def get_metadata_from_fabric(
@azure_ds_telemetry_reporter
-def report_failure_to_fabric(endpoint: str):
+def report_failure_to_fabric(endpoint: str, error: "errors.ReportableError"):
shim = WALinuxAgentShim(endpoint=endpoint)
- description = DEFAULT_REPORT_FAILURE_USER_VISIBLE_MESSAGE
+ description = error.as_description()
try:
shim.register_with_azure_and_report_failure(description=description)
finally:
diff --git a/tests/unittests/sources/azure/test_errors.py b/tests/unittests/sources/azure/test_errors.py
new file mode 100644
index 00000000..eb80dd17
--- /dev/null
+++ b/tests/unittests/sources/azure/test_errors.py
@@ -0,0 +1,137 @@
+# This file is part of cloud-init. See LICENSE file for license information.
+
+import base64
+import datetime
+from unittest import mock
+
+import pytest
+
+from cloudinit import version
+from cloudinit.sources.azure import errors
+
+
+@pytest.fixture()
+def agent_string():
+ yield f"agent=Cloud-Init/{version.version_string()}"
+
+
+@pytest.fixture()
+def fake_utcnow():
+ timestamp = datetime.datetime.utcnow()
+ with mock.patch.object(errors, "datetime", autospec=True) as m:
+ m.utcnow.return_value = timestamp
+ yield timestamp
+
+
+@pytest.fixture()
+def fake_vm_id():
+ vm_id = "fake-vm-id"
+ with mock.patch.object(errors.identity, "query_vm_id", autospec=True) as m:
+ m.return_value = vm_id
+ yield vm_id
+
+
+def quote_csv_value(value: str) -> str:
+ """Match quoting behavior, if needed for given string."""
+ if any([x in value for x in ("\n", "\r", "'")]):
+ value = value.replace("'", "''")
+ value = f"'{value}'"
+
+ return value
+
+
+@pytest.mark.parametrize("reason", ["foo", "foo bar", "foo'bar"])
+@pytest.mark.parametrize(
+ "supporting_data",
+ [
+ {},
+ {
+ "foo": "bar",
+ },
+ {
+ "foo": "bar",
+ "count": 4,
+ },
+ {
+ "csvcheck": "",
+ },
+ {
+ "csvcheck": "trailingspace ",
+ },
+ {
+ "csvcheck": "\r\n",
+ },
+ {
+ "csvcheck": "\n",
+ },
+ {
+ "csvcheck": "\t",
+ },
+ {
+ "csvcheck": "x\nx",
+ },
+ {
+ "csvcheck": "x\rx",
+ },
+ {
+ "csvcheck": '"',
+ },
+ {
+ "csvcheck": '""',
+ },
+ {
+ "csvcheck": "'",
+ },
+ {
+ "csvcheck": "''",
+ },
+ {
+ "csvcheck": "xx'xx'xx",
+ },
+ {
+ "csvcheck": ",'|~!@#$%^&*()[]\\{}|;':\",./<>?x\nnew\r\nline",
+ },
+ ],
+)
+def test_reportable_errors(
+ fake_utcnow,
+ fake_vm_id,
+ reason,
+ supporting_data,
+):
+ error = errors.ReportableError(
+ reason=reason,
+ supporting_data=supporting_data,
+ )
+
+ data = [
+ "PROVISIONING_ERROR: " + quote_csv_value(f"reason={reason}"),
+ f"agent=Cloud-Init/{version.version_string()}",
+ ]
+ data += [quote_csv_value(f"{k}={v}") for k, v in supporting_data.items()]
+ data += [
+ f"vm_id={fake_vm_id}",
+ f"timestamp={fake_utcnow.isoformat()}",
+ "documentation_url=https://aka.ms/linuxprovisioningerror",
+ ]
+
+ assert error.as_description() == "|".join(data)
+
+
+def test_unhandled_exception():
+ source_error = None
+ try:
+ raise ValueError("my value error")
+ except Exception as exception:
+ source_error = exception
+
+ error = errors.ReportableErrorUnhandledException(source_error)
+ trace = base64.b64decode(error.supporting_data["traceback_base64"]).decode(
+ "utf-8"
+ )
+
+ quoted_value = quote_csv_value(f"exception={source_error!r}")
+ assert f"|{quoted_value}|" in error.as_description()
+ assert trace.startswith("Traceback")
+ assert "raise ValueError" in trace
+ assert trace.endswith("ValueError: my value error\n")
diff --git a/tests/unittests/sources/test_azure.py b/tests/unittests/sources/test_azure.py
index 3c36c9c6..a8588ea9 100644
--- a/tests/unittests/sources/test_azure.py
+++ b/tests/unittests/sources/test_azure.py
@@ -16,7 +16,7 @@ from cloudinit.net import dhcp
from cloudinit.sources import UNSET
from cloudinit.sources import DataSourceAzure as dsaz
from cloudinit.sources import InvalidMetaDataException
-from cloudinit.sources.azure import identity
+from cloudinit.sources.azure import errors, identity
from cloudinit.sources.helpers import netlink
from cloudinit.util import (
MountFailedError,
@@ -1154,7 +1154,7 @@ scbus-1 on xpt0 bus 0
m_crawl_metadata.side_effect = Exception
dsrc.get_data()
self.assertEqual(1, m_crawl_metadata.call_count)
- m_report_failure.assert_called_once_with()
+ m_report_failure.assert_called_once_with(mock.ANY)
def test_crawl_metadata_exc_should_log_could_not_crawl_msg(self):
data = {}
@@ -1164,7 +1164,7 @@ scbus-1 on xpt0 bus 0
dsrc.get_data()
self.assertEqual(1, m_crawl_metadata.call_count)
self.assertIn(
- "Could not crawl Azure metadata", self.logs.getvalue()
+ "Azure datasource failure occurred:", self.logs.getvalue()
)
def test_basic_seed_dir(self):
@@ -1775,7 +1775,8 @@ scbus-1 on xpt0 bus 0
# mock crawl metadata failure to cause report failure
m_crawl_metadata.side_effect = Exception
- self.assertTrue(dsrc._report_failure())
+ error = errors.ReportableError(reason="foo")
+ self.assertTrue(dsrc._report_failure(error))
self.assertEqual(1, self.m_report_failure_to_fabric.call_count)
def test_dsaz_report_failure_returns_false_and_does_not_propagate_exc(
@@ -1805,7 +1806,9 @@ scbus-1 on xpt0 bus 0
# 1. Using cached ephemeral dhcp context to report failure to Azure
# 2. Using new ephemeral dhcp to report failure to Azure
self.m_report_failure_to_fabric.side_effect = Exception
- self.assertFalse(dsrc._report_failure())
+
+ error = errors.ReportableError(reason="foo")
+ self.assertFalse(dsrc._report_failure(error))
self.assertEqual(2, self.m_report_failure_to_fabric.call_count)
def test_dsaz_report_failure(self):
@@ -1814,9 +1817,10 @@ scbus-1 on xpt0 bus 0
with mock.patch.object(dsrc, "crawl_metadata") as m_crawl_metadata:
m_crawl_metadata.side_effect = Exception
- self.assertTrue(dsrc._report_failure())
+ error = errors.ReportableError(reason="foo")
+ self.assertTrue(dsrc._report_failure(error))
self.m_report_failure_to_fabric.assert_called_once_with(
- endpoint="168.63.129.16"
+ endpoint="168.63.129.16", error=error
)
def test_dsaz_report_failure_uses_cached_ephemeral_dhcp_ctx_lease(self):
@@ -1830,11 +1834,12 @@ scbus-1 on xpt0 bus 0
# mock crawl metadata failure to cause report failure
m_crawl_metadata.side_effect = Exception
- self.assertTrue(dsrc._report_failure())
+ error = errors.ReportableError(reason="foo")
+ self.assertTrue(dsrc._report_failure(error))
# ensure called with cached ephemeral dhcp lease option 245
self.m_report_failure_to_fabric.assert_called_once_with(
- endpoint="test-ep"
+ endpoint="test-ep", error=error
)
def test_dsaz_report_failure_no_net_uses_new_ephemeral_dhcp_lease(self):
@@ -1851,12 +1856,13 @@ scbus-1 on xpt0 bus 0
}
self.m_dhcp.return_value.obtain_lease.return_value = test_lease
- self.assertTrue(dsrc._report_failure())
+ error = errors.ReportableError(reason="foo")
+ self.assertTrue(dsrc._report_failure(error))
# ensure called with the newly discovered
# ephemeral dhcp lease option 245
self.m_report_failure_to_fabric.assert_called_once_with(
- endpoint="1.2.3.4"
+ endpoint="1.2.3.4", error=error
)
def test_exception_fetching_fabric_data_doesnt_propagate(self):
@@ -4052,7 +4058,7 @@ class TestProvisioning:
with mock.patch.object(self.azure_ds, "_report_failure") as m_report:
self.azure_ds._get_data()
- assert m_report.mock_calls == [mock.call()]
+ assert m_report.mock_calls == [mock.call(mock.ANY)]
assert self.mock_wrapping_setup_ephemeral_networking.mock_calls == [
mock.call(timeout_minutes=20),
diff --git a/tests/unittests/sources/test_azure_helper.py b/tests/unittests/sources/test_azure_helper.py
index ac2126e8..a08011a6 100644
--- a/tests/unittests/sources/test_azure_helper.py
+++ b/tests/unittests/sources/test_azure_helper.py
@@ -11,6 +11,7 @@ import pytest
import requests
from cloudinit import url_helper
+from cloudinit.sources.azure import errors
from cloudinit.sources.helpers import azure as azure_helper
from cloudinit.sources.helpers.azure import WALinuxAgentShim as wa_shim
from cloudinit.util import load_file
@@ -106,15 +107,6 @@ MOCKPATH = "cloudinit.sources.helpers.azure."
@pytest.fixture
-def mock_dmi_read_dmi_data():
- with mock.patch(
- MOCKPATH + "dmi.read_dmi_data",
- autospec=True,
- ) as m:
- yield m
-
-
-@pytest.fixture
def mock_readurl():
with mock.patch(MOCKPATH + "url_helper.readurl", autospec=True) as m:
yield m
@@ -1358,7 +1350,10 @@ class TestGetMetadataGoalStateXMLAndReportFailureToFabric(CiTestCase):
)
def test_success_calls_clean_up(self):
- azure_helper.report_failure_to_fabric(endpoint="test_endpoint")
+ error = errors.ReportableError(reason="test")
+ azure_helper.report_failure_to_fabric(
+ endpoint="test_endpoint", error=error
+ )
self.assertEqual(1, self.m_shim.return_value.clean_up.call_count)
def test_failure_in_shim_report_failure_propagates_exc_and_calls_clean_up(
@@ -1371,24 +1366,29 @@ class TestGetMetadataGoalStateXMLAndReportFailureToFabric(CiTestCase):
SentinelException,
azure_helper.report_failure_to_fabric,
"test_endpoint",
+ errors.ReportableError(reason="test"),
)
self.assertEqual(1, self.m_shim.return_value.clean_up.call_count)
def test_report_failure_to_fabric_calls_shim_report_failure(
self,
):
- azure_helper.report_failure_to_fabric(endpoint="test_endpoint")
+ error = errors.ReportableError(reason="test")
+
+ azure_helper.report_failure_to_fabric(
+ endpoint="test_endpoint",
+ error=error,
+ )
# default err message description should be shown to the user
# if an empty description is passed in
self.m_shim.return_value.register_with_azure_and_report_failure.assert_called_once_with( # noqa: E501
- description=(
- azure_helper.DEFAULT_REPORT_FAILURE_USER_VISIBLE_MESSAGE
- )
+ description=error.as_description(),
)
def test_instantiates_shim_with_kwargs(self):
azure_helper.report_failure_to_fabric(
endpoint="test_endpoint",
+ error=errors.ReportableError(reason="test"),
)
self.m_shim.assert_called_once_with(
endpoint="test_endpoint",