diff options
-rw-r--r-- | cloudinit/sources/DataSourceAzure.py | 28 | ||||
-rw-r--r-- | cloudinit/sources/azure/errors.py | 93 | ||||
-rw-r--r-- | cloudinit/sources/helpers/azure.py | 15 | ||||
-rw-r--r-- | tests/unittests/sources/azure/test_errors.py | 137 | ||||
-rw-r--r-- | tests/unittests/sources/test_azure.py | 30 | ||||
-rw-r--r-- | tests/unittests/sources/test_azure_helper.py | 28 |
6 files changed, 287 insertions, 44 deletions
diff --git a/cloudinit/sources/DataSourceAzure.py b/cloudinit/sources/DataSourceAzure.py index aeec6a92..83dbdce1 100644 --- a/cloudinit/sources/DataSourceAzure.py +++ b/cloudinit/sources/DataSourceAzure.py @@ -26,7 +26,7 @@ from cloudinit.net.dhcp import ( ) from cloudinit.net.ephemeral import EphemeralDHCPv4 from cloudinit.reporting import events -from cloudinit.sources.azure import identity, imds +from cloudinit.sources.azure import errors, identity, imds from cloudinit.sources.helpers import netlink from cloudinit.sources.helpers.azure import ( DEFAULT_WIRESERVER_ENDPOINT, @@ -727,11 +727,12 @@ class DataSourceAzure(sources.DataSource): msg="Crawl of metadata service", func=self.crawl_metadata, ) - except Exception as e: - report_diagnostic_event( - "Could not crawl Azure metadata: %s" % e, logger_func=LOG.error - ) - self._report_failure() + except errors.ReportableError as error: + self._report_failure(error) + return False + except Exception as error: + reportable_error = errors.ReportableErrorUnhandledException(error) + self._report_failure(reportable_error) return False finally: self._teardown_ephemeral_networking() @@ -1170,12 +1171,17 @@ class DataSourceAzure(sources.DataSource): return reprovision_data @azure_ds_telemetry_reporter - def _report_failure(self) -> bool: + def _report_failure(self, error: errors.ReportableError) -> bool: """Tells the Azure fabric that provisioning has failed. @param description: A description of the error encountered. @return: The success status of sending the failure signal. """ + report_diagnostic_event( + f"Azure datasource failure occurred: {error.as_description()}", + logger_func=LOG.error, + ) + if self._is_ephemeral_networking_up(): try: report_diagnostic_event( @@ -1183,7 +1189,9 @@ class DataSourceAzure(sources.DataSource): "to report failure to Azure", logger_func=LOG.debug, ) - report_failure_to_fabric(endpoint=self._wireserver_endpoint) + report_failure_to_fabric( + endpoint=self._wireserver_endpoint, error=error + ) return True except Exception as e: report_diagnostic_event( @@ -1203,7 +1211,9 @@ class DataSourceAzure(sources.DataSource): except NoDHCPLeaseError: # Reporting failure will fail, but it will emit telemetry. pass - report_failure_to_fabric(endpoint=self._wireserver_endpoint) + report_failure_to_fabric( + endpoint=self._wireserver_endpoint, error=error + ) return True except Exception as e: report_diagnostic_event( diff --git a/cloudinit/sources/azure/errors.py b/cloudinit/sources/azure/errors.py new file mode 100644 index 00000000..0dd426a7 --- /dev/null +++ b/cloudinit/sources/azure/errors.py @@ -0,0 +1,93 @@ +# Copyright (C) 2022 Microsoft Corporation. +# +# This file is part of cloud-init. See LICENSE file for license information. + +import base64 +import csv +import logging +import traceback +from datetime import datetime +from io import StringIO +from typing import Any, Dict, Optional + +from cloudinit import version +from cloudinit.sources.azure import identity + +LOG = logging.getLogger(__name__) + + +class ReportableError(Exception): + def __init__( + self, + reason: str, + *, + supporting_data: Optional[Dict[str, Any]] = None, + ) -> None: + self.agent = f"Cloud-Init/{version.version_string()}" + self.documentation_url = "https://aka.ms/linuxprovisioningerror" + self.reason = reason + + if supporting_data: + self.supporting_data = supporting_data + else: + self.supporting_data = {} + + self.timestamp = datetime.utcnow() + + try: + self.vm_id = identity.query_vm_id() + except Exception as id_error: + self.vm_id = f"failed to read vm id: {id_error!r}" + + def as_description( + self, *, delimiter: str = "|", quotechar: str = "'" + ) -> str: + data = [ + f"reason={self.reason}", + f"agent={self.agent}", + ] + data += [f"{k}={v}" for k, v in self.supporting_data.items()] + data += [ + f"vm_id={self.vm_id}", + f"timestamp={self.timestamp.isoformat()}", + f"documentation_url={self.documentation_url}", + ] + + with StringIO() as io: + csv.writer( + io, + delimiter=delimiter, + quotechar=quotechar, + quoting=csv.QUOTE_MINIMAL, + ).writerow(data) + + # strip trailing \r\n + csv_data = io.getvalue().rstrip() + + return f"PROVISIONING_ERROR: {csv_data}" + + def __eq__(self, other) -> bool: + return ( + isinstance(other, ReportableError) + and self.timestamp == other.timestamp + and self.reason == other.reason + and self.supporting_data == other.supporting_data + ) + + def __repr__(self) -> str: + return self.as_description() + + +class ReportableErrorUnhandledException(ReportableError): + def __init__(self, exception: Exception) -> None: + super().__init__("unhandled exception") + + trace = "".join( + traceback.format_exception( + type(exception), exception, exception.__traceback__ + ) + ) + trace_base64 = base64.b64encode(trace.encode("utf-8")) + + self.supporting_data["exception"] = repr(exception) + self.supporting_data["traceback_base64"] = trace_base64 diff --git a/cloudinit/sources/helpers/azure.py b/cloudinit/sources/helpers/azure.py index c0ffd760..2413d6b0 100644 --- a/cloudinit/sources/helpers/azure.py +++ b/cloudinit/sources/helpers/azure.py @@ -12,7 +12,7 @@ from contextlib import contextmanager from datetime import datetime from errno import ENOENT from time import sleep, time -from typing import Callable, List, Optional, TypeVar, Union +from typing import TYPE_CHECKING, Callable, List, Optional, TypeVar, Union from xml.etree import ElementTree from xml.sax.saxutils import escape @@ -20,6 +20,9 @@ from cloudinit import distros, subp, temp_utils, url_helper, util, version from cloudinit.reporting import events from cloudinit.settings import CFG_BUILTIN +if TYPE_CHECKING: + from cloudinit.sources.azure import errors + LOG = logging.getLogger(__name__) # Default Wireserver endpoint (if not found in DHCP option 245). @@ -43,12 +46,6 @@ azure_ds_reporter = events.ReportEventStack( reporting_enabled=True, ) -DEFAULT_REPORT_FAILURE_USER_VISIBLE_MESSAGE = ( - "The VM encountered an error during deployment. " - "Please visit https://aka.ms/linuxprovisioningerror " - "for more information on remediation." -) - T = TypeVar("T") @@ -1024,9 +1021,9 @@ def get_metadata_from_fabric( @azure_ds_telemetry_reporter -def report_failure_to_fabric(endpoint: str): +def report_failure_to_fabric(endpoint: str, error: "errors.ReportableError"): shim = WALinuxAgentShim(endpoint=endpoint) - description = DEFAULT_REPORT_FAILURE_USER_VISIBLE_MESSAGE + description = error.as_description() try: shim.register_with_azure_and_report_failure(description=description) finally: diff --git a/tests/unittests/sources/azure/test_errors.py b/tests/unittests/sources/azure/test_errors.py new file mode 100644 index 00000000..eb80dd17 --- /dev/null +++ b/tests/unittests/sources/azure/test_errors.py @@ -0,0 +1,137 @@ +# This file is part of cloud-init. See LICENSE file for license information. + +import base64 +import datetime +from unittest import mock + +import pytest + +from cloudinit import version +from cloudinit.sources.azure import errors + + +@pytest.fixture() +def agent_string(): + yield f"agent=Cloud-Init/{version.version_string()}" + + +@pytest.fixture() +def fake_utcnow(): + timestamp = datetime.datetime.utcnow() + with mock.patch.object(errors, "datetime", autospec=True) as m: + m.utcnow.return_value = timestamp + yield timestamp + + +@pytest.fixture() +def fake_vm_id(): + vm_id = "fake-vm-id" + with mock.patch.object(errors.identity, "query_vm_id", autospec=True) as m: + m.return_value = vm_id + yield vm_id + + +def quote_csv_value(value: str) -> str: + """Match quoting behavior, if needed for given string.""" + if any([x in value for x in ("\n", "\r", "'")]): + value = value.replace("'", "''") + value = f"'{value}'" + + return value + + +@pytest.mark.parametrize("reason", ["foo", "foo bar", "foo'bar"]) +@pytest.mark.parametrize( + "supporting_data", + [ + {}, + { + "foo": "bar", + }, + { + "foo": "bar", + "count": 4, + }, + { + "csvcheck": "", + }, + { + "csvcheck": "trailingspace ", + }, + { + "csvcheck": "\r\n", + }, + { + "csvcheck": "\n", + }, + { + "csvcheck": "\t", + }, + { + "csvcheck": "x\nx", + }, + { + "csvcheck": "x\rx", + }, + { + "csvcheck": '"', + }, + { + "csvcheck": '""', + }, + { + "csvcheck": "'", + }, + { + "csvcheck": "''", + }, + { + "csvcheck": "xx'xx'xx", + }, + { + "csvcheck": ",'|~!@#$%^&*()[]\\{}|;':\",./<>?x\nnew\r\nline", + }, + ], +) +def test_reportable_errors( + fake_utcnow, + fake_vm_id, + reason, + supporting_data, +): + error = errors.ReportableError( + reason=reason, + supporting_data=supporting_data, + ) + + data = [ + "PROVISIONING_ERROR: " + quote_csv_value(f"reason={reason}"), + f"agent=Cloud-Init/{version.version_string()}", + ] + data += [quote_csv_value(f"{k}={v}") for k, v in supporting_data.items()] + data += [ + f"vm_id={fake_vm_id}", + f"timestamp={fake_utcnow.isoformat()}", + "documentation_url=https://aka.ms/linuxprovisioningerror", + ] + + assert error.as_description() == "|".join(data) + + +def test_unhandled_exception(): + source_error = None + try: + raise ValueError("my value error") + except Exception as exception: + source_error = exception + + error = errors.ReportableErrorUnhandledException(source_error) + trace = base64.b64decode(error.supporting_data["traceback_base64"]).decode( + "utf-8" + ) + + quoted_value = quote_csv_value(f"exception={source_error!r}") + assert f"|{quoted_value}|" in error.as_description() + assert trace.startswith("Traceback") + assert "raise ValueError" in trace + assert trace.endswith("ValueError: my value error\n") diff --git a/tests/unittests/sources/test_azure.py b/tests/unittests/sources/test_azure.py index 3c36c9c6..a8588ea9 100644 --- a/tests/unittests/sources/test_azure.py +++ b/tests/unittests/sources/test_azure.py @@ -16,7 +16,7 @@ from cloudinit.net import dhcp from cloudinit.sources import UNSET from cloudinit.sources import DataSourceAzure as dsaz from cloudinit.sources import InvalidMetaDataException -from cloudinit.sources.azure import identity +from cloudinit.sources.azure import errors, identity from cloudinit.sources.helpers import netlink from cloudinit.util import ( MountFailedError, @@ -1154,7 +1154,7 @@ scbus-1 on xpt0 bus 0 m_crawl_metadata.side_effect = Exception dsrc.get_data() self.assertEqual(1, m_crawl_metadata.call_count) - m_report_failure.assert_called_once_with() + m_report_failure.assert_called_once_with(mock.ANY) def test_crawl_metadata_exc_should_log_could_not_crawl_msg(self): data = {} @@ -1164,7 +1164,7 @@ scbus-1 on xpt0 bus 0 dsrc.get_data() self.assertEqual(1, m_crawl_metadata.call_count) self.assertIn( - "Could not crawl Azure metadata", self.logs.getvalue() + "Azure datasource failure occurred:", self.logs.getvalue() ) def test_basic_seed_dir(self): @@ -1775,7 +1775,8 @@ scbus-1 on xpt0 bus 0 # mock crawl metadata failure to cause report failure m_crawl_metadata.side_effect = Exception - self.assertTrue(dsrc._report_failure()) + error = errors.ReportableError(reason="foo") + self.assertTrue(dsrc._report_failure(error)) self.assertEqual(1, self.m_report_failure_to_fabric.call_count) def test_dsaz_report_failure_returns_false_and_does_not_propagate_exc( @@ -1805,7 +1806,9 @@ scbus-1 on xpt0 bus 0 # 1. Using cached ephemeral dhcp context to report failure to Azure # 2. Using new ephemeral dhcp to report failure to Azure self.m_report_failure_to_fabric.side_effect = Exception - self.assertFalse(dsrc._report_failure()) + + error = errors.ReportableError(reason="foo") + self.assertFalse(dsrc._report_failure(error)) self.assertEqual(2, self.m_report_failure_to_fabric.call_count) def test_dsaz_report_failure(self): @@ -1814,9 +1817,10 @@ scbus-1 on xpt0 bus 0 with mock.patch.object(dsrc, "crawl_metadata") as m_crawl_metadata: m_crawl_metadata.side_effect = Exception - self.assertTrue(dsrc._report_failure()) + error = errors.ReportableError(reason="foo") + self.assertTrue(dsrc._report_failure(error)) self.m_report_failure_to_fabric.assert_called_once_with( - endpoint="168.63.129.16" + endpoint="168.63.129.16", error=error ) def test_dsaz_report_failure_uses_cached_ephemeral_dhcp_ctx_lease(self): @@ -1830,11 +1834,12 @@ scbus-1 on xpt0 bus 0 # mock crawl metadata failure to cause report failure m_crawl_metadata.side_effect = Exception - self.assertTrue(dsrc._report_failure()) + error = errors.ReportableError(reason="foo") + self.assertTrue(dsrc._report_failure(error)) # ensure called with cached ephemeral dhcp lease option 245 self.m_report_failure_to_fabric.assert_called_once_with( - endpoint="test-ep" + endpoint="test-ep", error=error ) def test_dsaz_report_failure_no_net_uses_new_ephemeral_dhcp_lease(self): @@ -1851,12 +1856,13 @@ scbus-1 on xpt0 bus 0 } self.m_dhcp.return_value.obtain_lease.return_value = test_lease - self.assertTrue(dsrc._report_failure()) + error = errors.ReportableError(reason="foo") + self.assertTrue(dsrc._report_failure(error)) # ensure called with the newly discovered # ephemeral dhcp lease option 245 self.m_report_failure_to_fabric.assert_called_once_with( - endpoint="1.2.3.4" + endpoint="1.2.3.4", error=error ) def test_exception_fetching_fabric_data_doesnt_propagate(self): @@ -4052,7 +4058,7 @@ class TestProvisioning: with mock.patch.object(self.azure_ds, "_report_failure") as m_report: self.azure_ds._get_data() - assert m_report.mock_calls == [mock.call()] + assert m_report.mock_calls == [mock.call(mock.ANY)] assert self.mock_wrapping_setup_ephemeral_networking.mock_calls == [ mock.call(timeout_minutes=20), diff --git a/tests/unittests/sources/test_azure_helper.py b/tests/unittests/sources/test_azure_helper.py index ac2126e8..a08011a6 100644 --- a/tests/unittests/sources/test_azure_helper.py +++ b/tests/unittests/sources/test_azure_helper.py @@ -11,6 +11,7 @@ import pytest import requests from cloudinit import url_helper +from cloudinit.sources.azure import errors from cloudinit.sources.helpers import azure as azure_helper from cloudinit.sources.helpers.azure import WALinuxAgentShim as wa_shim from cloudinit.util import load_file @@ -106,15 +107,6 @@ MOCKPATH = "cloudinit.sources.helpers.azure." @pytest.fixture -def mock_dmi_read_dmi_data(): - with mock.patch( - MOCKPATH + "dmi.read_dmi_data", - autospec=True, - ) as m: - yield m - - -@pytest.fixture def mock_readurl(): with mock.patch(MOCKPATH + "url_helper.readurl", autospec=True) as m: yield m @@ -1358,7 +1350,10 @@ class TestGetMetadataGoalStateXMLAndReportFailureToFabric(CiTestCase): ) def test_success_calls_clean_up(self): - azure_helper.report_failure_to_fabric(endpoint="test_endpoint") + error = errors.ReportableError(reason="test") + azure_helper.report_failure_to_fabric( + endpoint="test_endpoint", error=error + ) self.assertEqual(1, self.m_shim.return_value.clean_up.call_count) def test_failure_in_shim_report_failure_propagates_exc_and_calls_clean_up( @@ -1371,24 +1366,29 @@ class TestGetMetadataGoalStateXMLAndReportFailureToFabric(CiTestCase): SentinelException, azure_helper.report_failure_to_fabric, "test_endpoint", + errors.ReportableError(reason="test"), ) self.assertEqual(1, self.m_shim.return_value.clean_up.call_count) def test_report_failure_to_fabric_calls_shim_report_failure( self, ): - azure_helper.report_failure_to_fabric(endpoint="test_endpoint") + error = errors.ReportableError(reason="test") + + azure_helper.report_failure_to_fabric( + endpoint="test_endpoint", + error=error, + ) # default err message description should be shown to the user # if an empty description is passed in self.m_shim.return_value.register_with_azure_and_report_failure.assert_called_once_with( # noqa: E501 - description=( - azure_helper.DEFAULT_REPORT_FAILURE_USER_VISIBLE_MESSAGE - ) + description=error.as_description(), ) def test_instantiates_shim_with_kwargs(self): azure_helper.report_failure_to_fabric( endpoint="test_endpoint", + error=errors.ReportableError(reason="test"), ) self.m_shim.assert_called_once_with( endpoint="test_endpoint", |