diff options
author | Chris Patterson <cpatterson@microsoft.com> | 2023-04-19 11:33:28 -0400 |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-04-19 17:33:28 +0200 |
commit | d6de22e31c3223a2c46f175e71d3dd3a53611842 (patch) | |
tree | 26ec41737c3d7dc5af6e4a262b89a7743d924f7c | |
parent | 3ee384680e0a615834c1cb386be88c94f004b9b5 (diff) | |
download | cloud-init-git-d6de22e31c3223a2c46f175e71d3dd3a53611842.tar.gz |
azure/errors: introduce reportable errors (#2129)
When provisioning failures occur an Azure, a generic description is
used in the report and ultimately returned to the user. To improve
the user experience, report details of the failure in a manner that is
parsable, readable and succinct. The current approach is to use csv
with a custom delimiter ("|") and quote character ("'"). This format
may change in the future.
Gracefully handle reportable errors thrown while crawling metadata and
treat other exceptions as ReportableErrorUnhandledException. Future
work will introduce more reportable errors to handle the expected
failure cases.
Signed-off-by: Chris Patterson <cpatterson@microsoft.com>
-rw-r--r-- | cloudinit/sources/DataSourceAzure.py | 28 | ||||
-rw-r--r-- | cloudinit/sources/azure/errors.py | 93 | ||||
-rw-r--r-- | cloudinit/sources/helpers/azure.py | 15 | ||||
-rw-r--r-- | tests/unittests/sources/azure/test_errors.py | 137 | ||||
-rw-r--r-- | tests/unittests/sources/test_azure.py | 30 | ||||
-rw-r--r-- | tests/unittests/sources/test_azure_helper.py | 28 |
6 files changed, 287 insertions, 44 deletions
diff --git a/cloudinit/sources/DataSourceAzure.py b/cloudinit/sources/DataSourceAzure.py index aeec6a92..83dbdce1 100644 --- a/cloudinit/sources/DataSourceAzure.py +++ b/cloudinit/sources/DataSourceAzure.py @@ -26,7 +26,7 @@ from cloudinit.net.dhcp import ( ) from cloudinit.net.ephemeral import EphemeralDHCPv4 from cloudinit.reporting import events -from cloudinit.sources.azure import identity, imds +from cloudinit.sources.azure import errors, identity, imds from cloudinit.sources.helpers import netlink from cloudinit.sources.helpers.azure import ( DEFAULT_WIRESERVER_ENDPOINT, @@ -727,11 +727,12 @@ class DataSourceAzure(sources.DataSource): msg="Crawl of metadata service", func=self.crawl_metadata, ) - except Exception as e: - report_diagnostic_event( - "Could not crawl Azure metadata: %s" % e, logger_func=LOG.error - ) - self._report_failure() + except errors.ReportableError as error: + self._report_failure(error) + return False + except Exception as error: + reportable_error = errors.ReportableErrorUnhandledException(error) + self._report_failure(reportable_error) return False finally: self._teardown_ephemeral_networking() @@ -1170,12 +1171,17 @@ class DataSourceAzure(sources.DataSource): return reprovision_data @azure_ds_telemetry_reporter - def _report_failure(self) -> bool: + def _report_failure(self, error: errors.ReportableError) -> bool: """Tells the Azure fabric that provisioning has failed. @param description: A description of the error encountered. @return: The success status of sending the failure signal. """ + report_diagnostic_event( + f"Azure datasource failure occurred: {error.as_description()}", + logger_func=LOG.error, + ) + if self._is_ephemeral_networking_up(): try: report_diagnostic_event( @@ -1183,7 +1189,9 @@ class DataSourceAzure(sources.DataSource): "to report failure to Azure", logger_func=LOG.debug, ) - report_failure_to_fabric(endpoint=self._wireserver_endpoint) + report_failure_to_fabric( + endpoint=self._wireserver_endpoint, error=error + ) return True except Exception as e: report_diagnostic_event( @@ -1203,7 +1211,9 @@ class DataSourceAzure(sources.DataSource): except NoDHCPLeaseError: # Reporting failure will fail, but it will emit telemetry. pass - report_failure_to_fabric(endpoint=self._wireserver_endpoint) + report_failure_to_fabric( + endpoint=self._wireserver_endpoint, error=error + ) return True except Exception as e: report_diagnostic_event( diff --git a/cloudinit/sources/azure/errors.py b/cloudinit/sources/azure/errors.py new file mode 100644 index 00000000..0dd426a7 --- /dev/null +++ b/cloudinit/sources/azure/errors.py @@ -0,0 +1,93 @@ +# Copyright (C) 2022 Microsoft Corporation. +# +# This file is part of cloud-init. See LICENSE file for license information. + +import base64 +import csv +import logging +import traceback +from datetime import datetime +from io import StringIO +from typing import Any, Dict, Optional + +from cloudinit import version +from cloudinit.sources.azure import identity + +LOG = logging.getLogger(__name__) + + +class ReportableError(Exception): + def __init__( + self, + reason: str, + *, + supporting_data: Optional[Dict[str, Any]] = None, + ) -> None: + self.agent = f"Cloud-Init/{version.version_string()}" + self.documentation_url = "https://aka.ms/linuxprovisioningerror" + self.reason = reason + + if supporting_data: + self.supporting_data = supporting_data + else: + self.supporting_data = {} + + self.timestamp = datetime.utcnow() + + try: + self.vm_id = identity.query_vm_id() + except Exception as id_error: + self.vm_id = f"failed to read vm id: {id_error!r}" + + def as_description( + self, *, delimiter: str = "|", quotechar: str = "'" + ) -> str: + data = [ + f"reason={self.reason}", + f"agent={self.agent}", + ] + data += [f"{k}={v}" for k, v in self.supporting_data.items()] + data += [ + f"vm_id={self.vm_id}", + f"timestamp={self.timestamp.isoformat()}", + f"documentation_url={self.documentation_url}", + ] + + with StringIO() as io: + csv.writer( + io, + delimiter=delimiter, + quotechar=quotechar, + quoting=csv.QUOTE_MINIMAL, + ).writerow(data) + + # strip trailing \r\n + csv_data = io.getvalue().rstrip() + + return f"PROVISIONING_ERROR: {csv_data}" + + def __eq__(self, other) -> bool: + return ( + isinstance(other, ReportableError) + and self.timestamp == other.timestamp + and self.reason == other.reason + and self.supporting_data == other.supporting_data + ) + + def __repr__(self) -> str: + return self.as_description() + + +class ReportableErrorUnhandledException(ReportableError): + def __init__(self, exception: Exception) -> None: + super().__init__("unhandled exception") + + trace = "".join( + traceback.format_exception( + type(exception), exception, exception.__traceback__ + ) + ) + trace_base64 = base64.b64encode(trace.encode("utf-8")) + + self.supporting_data["exception"] = repr(exception) + self.supporting_data["traceback_base64"] = trace_base64 diff --git a/cloudinit/sources/helpers/azure.py b/cloudinit/sources/helpers/azure.py index c0ffd760..2413d6b0 100644 --- a/cloudinit/sources/helpers/azure.py +++ b/cloudinit/sources/helpers/azure.py @@ -12,7 +12,7 @@ from contextlib import contextmanager from datetime import datetime from errno import ENOENT from time import sleep, time -from typing import Callable, List, Optional, TypeVar, Union +from typing import TYPE_CHECKING, Callable, List, Optional, TypeVar, Union from xml.etree import ElementTree from xml.sax.saxutils import escape @@ -20,6 +20,9 @@ from cloudinit import distros, subp, temp_utils, url_helper, util, version from cloudinit.reporting import events from cloudinit.settings import CFG_BUILTIN +if TYPE_CHECKING: + from cloudinit.sources.azure import errors + LOG = logging.getLogger(__name__) # Default Wireserver endpoint (if not found in DHCP option 245). @@ -43,12 +46,6 @@ azure_ds_reporter = events.ReportEventStack( reporting_enabled=True, ) -DEFAULT_REPORT_FAILURE_USER_VISIBLE_MESSAGE = ( - "The VM encountered an error during deployment. " - "Please visit https://aka.ms/linuxprovisioningerror " - "for more information on remediation." -) - T = TypeVar("T") @@ -1024,9 +1021,9 @@ def get_metadata_from_fabric( @azure_ds_telemetry_reporter -def report_failure_to_fabric(endpoint: str): +def report_failure_to_fabric(endpoint: str, error: "errors.ReportableError"): shim = WALinuxAgentShim(endpoint=endpoint) - description = DEFAULT_REPORT_FAILURE_USER_VISIBLE_MESSAGE + description = error.as_description() try: shim.register_with_azure_and_report_failure(description=description) finally: diff --git a/tests/unittests/sources/azure/test_errors.py b/tests/unittests/sources/azure/test_errors.py new file mode 100644 index 00000000..eb80dd17 --- /dev/null +++ b/tests/unittests/sources/azure/test_errors.py @@ -0,0 +1,137 @@ +# This file is part of cloud-init. See LICENSE file for license information. + +import base64 +import datetime +from unittest import mock + +import pytest + +from cloudinit import version +from cloudinit.sources.azure import errors + + +@pytest.fixture() +def agent_string(): + yield f"agent=Cloud-Init/{version.version_string()}" + + +@pytest.fixture() +def fake_utcnow(): + timestamp = datetime.datetime.utcnow() + with mock.patch.object(errors, "datetime", autospec=True) as m: + m.utcnow.return_value = timestamp + yield timestamp + + +@pytest.fixture() +def fake_vm_id(): + vm_id = "fake-vm-id" + with mock.patch.object(errors.identity, "query_vm_id", autospec=True) as m: + m.return_value = vm_id + yield vm_id + + +def quote_csv_value(value: str) -> str: + """Match quoting behavior, if needed for given string.""" + if any([x in value for x in ("\n", "\r", "'")]): + value = value.replace("'", "''") + value = f"'{value}'" + + return value + + +@pytest.mark.parametrize("reason", ["foo", "foo bar", "foo'bar"]) +@pytest.mark.parametrize( + "supporting_data", + [ + {}, + { + "foo": "bar", + }, + { + "foo": "bar", + "count": 4, + }, + { + "csvcheck": "", + }, + { + "csvcheck": "trailingspace ", + }, + { + "csvcheck": "\r\n", + }, + { + "csvcheck": "\n", + }, + { + "csvcheck": "\t", + }, + { + "csvcheck": "x\nx", + }, + { + "csvcheck": "x\rx", + }, + { + "csvcheck": '"', + }, + { + "csvcheck": '""', + }, + { + "csvcheck": "'", + }, + { + "csvcheck": "''", + }, + { + "csvcheck": "xx'xx'xx", + }, + { + "csvcheck": ",'|~!@#$%^&*()[]\\{}|;':\",./<>?x\nnew\r\nline", + }, + ], +) +def test_reportable_errors( + fake_utcnow, + fake_vm_id, + reason, + supporting_data, +): + error = errors.ReportableError( + reason=reason, + supporting_data=supporting_data, + ) + + data = [ + "PROVISIONING_ERROR: " + quote_csv_value(f"reason={reason}"), + f"agent=Cloud-Init/{version.version_string()}", + ] + data += [quote_csv_value(f"{k}={v}") for k, v in supporting_data.items()] + data += [ + f"vm_id={fake_vm_id}", + f"timestamp={fake_utcnow.isoformat()}", + "documentation_url=https://aka.ms/linuxprovisioningerror", + ] + + assert error.as_description() == "|".join(data) + + +def test_unhandled_exception(): + source_error = None + try: + raise ValueError("my value error") + except Exception as exception: + source_error = exception + + error = errors.ReportableErrorUnhandledException(source_error) + trace = base64.b64decode(error.supporting_data["traceback_base64"]).decode( + "utf-8" + ) + + quoted_value = quote_csv_value(f"exception={source_error!r}") + assert f"|{quoted_value}|" in error.as_description() + assert trace.startswith("Traceback") + assert "raise ValueError" in trace + assert trace.endswith("ValueError: my value error\n") diff --git a/tests/unittests/sources/test_azure.py b/tests/unittests/sources/test_azure.py index 3c36c9c6..a8588ea9 100644 --- a/tests/unittests/sources/test_azure.py +++ b/tests/unittests/sources/test_azure.py @@ -16,7 +16,7 @@ from cloudinit.net import dhcp from cloudinit.sources import UNSET from cloudinit.sources import DataSourceAzure as dsaz from cloudinit.sources import InvalidMetaDataException -from cloudinit.sources.azure import identity +from cloudinit.sources.azure import errors, identity from cloudinit.sources.helpers import netlink from cloudinit.util import ( MountFailedError, @@ -1154,7 +1154,7 @@ scbus-1 on xpt0 bus 0 m_crawl_metadata.side_effect = Exception dsrc.get_data() self.assertEqual(1, m_crawl_metadata.call_count) - m_report_failure.assert_called_once_with() + m_report_failure.assert_called_once_with(mock.ANY) def test_crawl_metadata_exc_should_log_could_not_crawl_msg(self): data = {} @@ -1164,7 +1164,7 @@ scbus-1 on xpt0 bus 0 dsrc.get_data() self.assertEqual(1, m_crawl_metadata.call_count) self.assertIn( - "Could not crawl Azure metadata", self.logs.getvalue() + "Azure datasource failure occurred:", self.logs.getvalue() ) def test_basic_seed_dir(self): @@ -1775,7 +1775,8 @@ scbus-1 on xpt0 bus 0 # mock crawl metadata failure to cause report failure m_crawl_metadata.side_effect = Exception - self.assertTrue(dsrc._report_failure()) + error = errors.ReportableError(reason="foo") + self.assertTrue(dsrc._report_failure(error)) self.assertEqual(1, self.m_report_failure_to_fabric.call_count) def test_dsaz_report_failure_returns_false_and_does_not_propagate_exc( @@ -1805,7 +1806,9 @@ scbus-1 on xpt0 bus 0 # 1. Using cached ephemeral dhcp context to report failure to Azure # 2. Using new ephemeral dhcp to report failure to Azure self.m_report_failure_to_fabric.side_effect = Exception - self.assertFalse(dsrc._report_failure()) + + error = errors.ReportableError(reason="foo") + self.assertFalse(dsrc._report_failure(error)) self.assertEqual(2, self.m_report_failure_to_fabric.call_count) def test_dsaz_report_failure(self): @@ -1814,9 +1817,10 @@ scbus-1 on xpt0 bus 0 with mock.patch.object(dsrc, "crawl_metadata") as m_crawl_metadata: m_crawl_metadata.side_effect = Exception - self.assertTrue(dsrc._report_failure()) + error = errors.ReportableError(reason="foo") + self.assertTrue(dsrc._report_failure(error)) self.m_report_failure_to_fabric.assert_called_once_with( - endpoint="168.63.129.16" + endpoint="168.63.129.16", error=error ) def test_dsaz_report_failure_uses_cached_ephemeral_dhcp_ctx_lease(self): @@ -1830,11 +1834,12 @@ scbus-1 on xpt0 bus 0 # mock crawl metadata failure to cause report failure m_crawl_metadata.side_effect = Exception - self.assertTrue(dsrc._report_failure()) + error = errors.ReportableError(reason="foo") + self.assertTrue(dsrc._report_failure(error)) # ensure called with cached ephemeral dhcp lease option 245 self.m_report_failure_to_fabric.assert_called_once_with( - endpoint="test-ep" + endpoint="test-ep", error=error ) def test_dsaz_report_failure_no_net_uses_new_ephemeral_dhcp_lease(self): @@ -1851,12 +1856,13 @@ scbus-1 on xpt0 bus 0 } self.m_dhcp.return_value.obtain_lease.return_value = test_lease - self.assertTrue(dsrc._report_failure()) + error = errors.ReportableError(reason="foo") + self.assertTrue(dsrc._report_failure(error)) # ensure called with the newly discovered # ephemeral dhcp lease option 245 self.m_report_failure_to_fabric.assert_called_once_with( - endpoint="1.2.3.4" + endpoint="1.2.3.4", error=error ) def test_exception_fetching_fabric_data_doesnt_propagate(self): @@ -4052,7 +4058,7 @@ class TestProvisioning: with mock.patch.object(self.azure_ds, "_report_failure") as m_report: self.azure_ds._get_data() - assert m_report.mock_calls == [mock.call()] + assert m_report.mock_calls == [mock.call(mock.ANY)] assert self.mock_wrapping_setup_ephemeral_networking.mock_calls == [ mock.call(timeout_minutes=20), diff --git a/tests/unittests/sources/test_azure_helper.py b/tests/unittests/sources/test_azure_helper.py index ac2126e8..a08011a6 100644 --- a/tests/unittests/sources/test_azure_helper.py +++ b/tests/unittests/sources/test_azure_helper.py @@ -11,6 +11,7 @@ import pytest import requests from cloudinit import url_helper +from cloudinit.sources.azure import errors from cloudinit.sources.helpers import azure as azure_helper from cloudinit.sources.helpers.azure import WALinuxAgentShim as wa_shim from cloudinit.util import load_file @@ -106,15 +107,6 @@ MOCKPATH = "cloudinit.sources.helpers.azure." @pytest.fixture -def mock_dmi_read_dmi_data(): - with mock.patch( - MOCKPATH + "dmi.read_dmi_data", - autospec=True, - ) as m: - yield m - - -@pytest.fixture def mock_readurl(): with mock.patch(MOCKPATH + "url_helper.readurl", autospec=True) as m: yield m @@ -1358,7 +1350,10 @@ class TestGetMetadataGoalStateXMLAndReportFailureToFabric(CiTestCase): ) def test_success_calls_clean_up(self): - azure_helper.report_failure_to_fabric(endpoint="test_endpoint") + error = errors.ReportableError(reason="test") + azure_helper.report_failure_to_fabric( + endpoint="test_endpoint", error=error + ) self.assertEqual(1, self.m_shim.return_value.clean_up.call_count) def test_failure_in_shim_report_failure_propagates_exc_and_calls_clean_up( @@ -1371,24 +1366,29 @@ class TestGetMetadataGoalStateXMLAndReportFailureToFabric(CiTestCase): SentinelException, azure_helper.report_failure_to_fabric, "test_endpoint", + errors.ReportableError(reason="test"), ) self.assertEqual(1, self.m_shim.return_value.clean_up.call_count) def test_report_failure_to_fabric_calls_shim_report_failure( self, ): - azure_helper.report_failure_to_fabric(endpoint="test_endpoint") + error = errors.ReportableError(reason="test") + + azure_helper.report_failure_to_fabric( + endpoint="test_endpoint", + error=error, + ) # default err message description should be shown to the user # if an empty description is passed in self.m_shim.return_value.register_with_azure_and_report_failure.assert_called_once_with( # noqa: E501 - description=( - azure_helper.DEFAULT_REPORT_FAILURE_USER_VISIBLE_MESSAGE - ) + description=error.as_description(), ) def test_instantiates_shim_with_kwargs(self): azure_helper.report_failure_to_fabric( endpoint="test_endpoint", + error=errors.ReportableError(reason="test"), ) self.m_shim.assert_called_once_with( endpoint="test_endpoint", |