summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--cloudinit/sources/DataSourceAzure.py28
-rw-r--r--cloudinit/sources/azure/errors.py93
-rw-r--r--cloudinit/sources/helpers/azure.py15
-rw-r--r--tests/unittests/sources/azure/test_errors.py137
-rw-r--r--tests/unittests/sources/test_azure.py30
-rw-r--r--tests/unittests/sources/test_azure_helper.py28
6 files changed, 287 insertions, 44 deletions
diff --git a/cloudinit/sources/DataSourceAzure.py b/cloudinit/sources/DataSourceAzure.py
index aeec6a92..83dbdce1 100644
--- a/cloudinit/sources/DataSourceAzure.py
+++ b/cloudinit/sources/DataSourceAzure.py
@@ -26,7 +26,7 @@ from cloudinit.net.dhcp import (
)
from cloudinit.net.ephemeral import EphemeralDHCPv4
from cloudinit.reporting import events
-from cloudinit.sources.azure import identity, imds
+from cloudinit.sources.azure import errors, identity, imds
from cloudinit.sources.helpers import netlink
from cloudinit.sources.helpers.azure import (
DEFAULT_WIRESERVER_ENDPOINT,
@@ -727,11 +727,12 @@ class DataSourceAzure(sources.DataSource):
msg="Crawl of metadata service",
func=self.crawl_metadata,
)
- except Exception as e:
- report_diagnostic_event(
- "Could not crawl Azure metadata: %s" % e, logger_func=LOG.error
- )
- self._report_failure()
+ except errors.ReportableError as error:
+ self._report_failure(error)
+ return False
+ except Exception as error:
+ reportable_error = errors.ReportableErrorUnhandledException(error)
+ self._report_failure(reportable_error)
return False
finally:
self._teardown_ephemeral_networking()
@@ -1170,12 +1171,17 @@ class DataSourceAzure(sources.DataSource):
return reprovision_data
@azure_ds_telemetry_reporter
- def _report_failure(self) -> bool:
+ def _report_failure(self, error: errors.ReportableError) -> bool:
"""Tells the Azure fabric that provisioning has failed.
@param description: A description of the error encountered.
@return: The success status of sending the failure signal.
"""
+ report_diagnostic_event(
+ f"Azure datasource failure occurred: {error.as_description()}",
+ logger_func=LOG.error,
+ )
+
if self._is_ephemeral_networking_up():
try:
report_diagnostic_event(
@@ -1183,7 +1189,9 @@ class DataSourceAzure(sources.DataSource):
"to report failure to Azure",
logger_func=LOG.debug,
)
- report_failure_to_fabric(endpoint=self._wireserver_endpoint)
+ report_failure_to_fabric(
+ endpoint=self._wireserver_endpoint, error=error
+ )
return True
except Exception as e:
report_diagnostic_event(
@@ -1203,7 +1211,9 @@ class DataSourceAzure(sources.DataSource):
except NoDHCPLeaseError:
# Reporting failure will fail, but it will emit telemetry.
pass
- report_failure_to_fabric(endpoint=self._wireserver_endpoint)
+ report_failure_to_fabric(
+ endpoint=self._wireserver_endpoint, error=error
+ )
return True
except Exception as e:
report_diagnostic_event(
diff --git a/cloudinit/sources/azure/errors.py b/cloudinit/sources/azure/errors.py
new file mode 100644
index 00000000..0dd426a7
--- /dev/null
+++ b/cloudinit/sources/azure/errors.py
@@ -0,0 +1,93 @@
+# Copyright (C) 2022 Microsoft Corporation.
+#
+# This file is part of cloud-init. See LICENSE file for license information.
+
+import base64
+import csv
+import logging
+import traceback
+from datetime import datetime
+from io import StringIO
+from typing import Any, Dict, Optional
+
+from cloudinit import version
+from cloudinit.sources.azure import identity
+
+LOG = logging.getLogger(__name__)
+
+
+class ReportableError(Exception):
+ def __init__(
+ self,
+ reason: str,
+ *,
+ supporting_data: Optional[Dict[str, Any]] = None,
+ ) -> None:
+ self.agent = f"Cloud-Init/{version.version_string()}"
+ self.documentation_url = "https://aka.ms/linuxprovisioningerror"
+ self.reason = reason
+
+ if supporting_data:
+ self.supporting_data = supporting_data
+ else:
+ self.supporting_data = {}
+
+ self.timestamp = datetime.utcnow()
+
+ try:
+ self.vm_id = identity.query_vm_id()
+ except Exception as id_error:
+ self.vm_id = f"failed to read vm id: {id_error!r}"
+
+ def as_description(
+ self, *, delimiter: str = "|", quotechar: str = "'"
+ ) -> str:
+ data = [
+ f"reason={self.reason}",
+ f"agent={self.agent}",
+ ]
+ data += [f"{k}={v}" for k, v in self.supporting_data.items()]
+ data += [
+ f"vm_id={self.vm_id}",
+ f"timestamp={self.timestamp.isoformat()}",
+ f"documentation_url={self.documentation_url}",
+ ]
+
+ with StringIO() as io:
+ csv.writer(
+ io,
+ delimiter=delimiter,
+ quotechar=quotechar,
+ quoting=csv.QUOTE_MINIMAL,
+ ).writerow(data)
+
+ # strip trailing \r\n
+ csv_data = io.getvalue().rstrip()
+
+ return f"PROVISIONING_ERROR: {csv_data}"
+
+ def __eq__(self, other) -> bool:
+ return (
+ isinstance(other, ReportableError)
+ and self.timestamp == other.timestamp
+ and self.reason == other.reason
+ and self.supporting_data == other.supporting_data
+ )
+
+ def __repr__(self) -> str:
+ return self.as_description()
+
+
+class ReportableErrorUnhandledException(ReportableError):
+ def __init__(self, exception: Exception) -> None:
+ super().__init__("unhandled exception")
+
+ trace = "".join(
+ traceback.format_exception(
+ type(exception), exception, exception.__traceback__
+ )
+ )
+ trace_base64 = base64.b64encode(trace.encode("utf-8"))
+
+ self.supporting_data["exception"] = repr(exception)
+ self.supporting_data["traceback_base64"] = trace_base64
diff --git a/cloudinit/sources/helpers/azure.py b/cloudinit/sources/helpers/azure.py
index c0ffd760..2413d6b0 100644
--- a/cloudinit/sources/helpers/azure.py
+++ b/cloudinit/sources/helpers/azure.py
@@ -12,7 +12,7 @@ from contextlib import contextmanager
from datetime import datetime
from errno import ENOENT
from time import sleep, time
-from typing import Callable, List, Optional, TypeVar, Union
+from typing import TYPE_CHECKING, Callable, List, Optional, TypeVar, Union
from xml.etree import ElementTree
from xml.sax.saxutils import escape
@@ -20,6 +20,9 @@ from cloudinit import distros, subp, temp_utils, url_helper, util, version
from cloudinit.reporting import events
from cloudinit.settings import CFG_BUILTIN
+if TYPE_CHECKING:
+ from cloudinit.sources.azure import errors
+
LOG = logging.getLogger(__name__)
# Default Wireserver endpoint (if not found in DHCP option 245).
@@ -43,12 +46,6 @@ azure_ds_reporter = events.ReportEventStack(
reporting_enabled=True,
)
-DEFAULT_REPORT_FAILURE_USER_VISIBLE_MESSAGE = (
- "The VM encountered an error during deployment. "
- "Please visit https://aka.ms/linuxprovisioningerror "
- "for more information on remediation."
-)
-
T = TypeVar("T")
@@ -1024,9 +1021,9 @@ def get_metadata_from_fabric(
@azure_ds_telemetry_reporter
-def report_failure_to_fabric(endpoint: str):
+def report_failure_to_fabric(endpoint: str, error: "errors.ReportableError"):
shim = WALinuxAgentShim(endpoint=endpoint)
- description = DEFAULT_REPORT_FAILURE_USER_VISIBLE_MESSAGE
+ description = error.as_description()
try:
shim.register_with_azure_and_report_failure(description=description)
finally:
diff --git a/tests/unittests/sources/azure/test_errors.py b/tests/unittests/sources/azure/test_errors.py
new file mode 100644
index 00000000..eb80dd17
--- /dev/null
+++ b/tests/unittests/sources/azure/test_errors.py
@@ -0,0 +1,137 @@
+# This file is part of cloud-init. See LICENSE file for license information.
+
+import base64
+import datetime
+from unittest import mock
+
+import pytest
+
+from cloudinit import version
+from cloudinit.sources.azure import errors
+
+
+@pytest.fixture()
+def agent_string():
+ yield f"agent=Cloud-Init/{version.version_string()}"
+
+
+@pytest.fixture()
+def fake_utcnow():
+ timestamp = datetime.datetime.utcnow()
+ with mock.patch.object(errors, "datetime", autospec=True) as m:
+ m.utcnow.return_value = timestamp
+ yield timestamp
+
+
+@pytest.fixture()
+def fake_vm_id():
+ vm_id = "fake-vm-id"
+ with mock.patch.object(errors.identity, "query_vm_id", autospec=True) as m:
+ m.return_value = vm_id
+ yield vm_id
+
+
+def quote_csv_value(value: str) -> str:
+ """Match quoting behavior, if needed for given string."""
+ if any([x in value for x in ("\n", "\r", "'")]):
+ value = value.replace("'", "''")
+ value = f"'{value}'"
+
+ return value
+
+
+@pytest.mark.parametrize("reason", ["foo", "foo bar", "foo'bar"])
+@pytest.mark.parametrize(
+ "supporting_data",
+ [
+ {},
+ {
+ "foo": "bar",
+ },
+ {
+ "foo": "bar",
+ "count": 4,
+ },
+ {
+ "csvcheck": "",
+ },
+ {
+ "csvcheck": "trailingspace ",
+ },
+ {
+ "csvcheck": "\r\n",
+ },
+ {
+ "csvcheck": "\n",
+ },
+ {
+ "csvcheck": "\t",
+ },
+ {
+ "csvcheck": "x\nx",
+ },
+ {
+ "csvcheck": "x\rx",
+ },
+ {
+ "csvcheck": '"',
+ },
+ {
+ "csvcheck": '""',
+ },
+ {
+ "csvcheck": "'",
+ },
+ {
+ "csvcheck": "''",
+ },
+ {
+ "csvcheck": "xx'xx'xx",
+ },
+ {
+ "csvcheck": ",'|~!@#$%^&*()[]\\{}|;':\",./<>?x\nnew\r\nline",
+ },
+ ],
+)
+def test_reportable_errors(
+ fake_utcnow,
+ fake_vm_id,
+ reason,
+ supporting_data,
+):
+ error = errors.ReportableError(
+ reason=reason,
+ supporting_data=supporting_data,
+ )
+
+ data = [
+ "PROVISIONING_ERROR: " + quote_csv_value(f"reason={reason}"),
+ f"agent=Cloud-Init/{version.version_string()}",
+ ]
+ data += [quote_csv_value(f"{k}={v}") for k, v in supporting_data.items()]
+ data += [
+ f"vm_id={fake_vm_id}",
+ f"timestamp={fake_utcnow.isoformat()}",
+ "documentation_url=https://aka.ms/linuxprovisioningerror",
+ ]
+
+ assert error.as_description() == "|".join(data)
+
+
+def test_unhandled_exception():
+ source_error = None
+ try:
+ raise ValueError("my value error")
+ except Exception as exception:
+ source_error = exception
+
+ error = errors.ReportableErrorUnhandledException(source_error)
+ trace = base64.b64decode(error.supporting_data["traceback_base64"]).decode(
+ "utf-8"
+ )
+
+ quoted_value = quote_csv_value(f"exception={source_error!r}")
+ assert f"|{quoted_value}|" in error.as_description()
+ assert trace.startswith("Traceback")
+ assert "raise ValueError" in trace
+ assert trace.endswith("ValueError: my value error\n")
diff --git a/tests/unittests/sources/test_azure.py b/tests/unittests/sources/test_azure.py
index 3c36c9c6..a8588ea9 100644
--- a/tests/unittests/sources/test_azure.py
+++ b/tests/unittests/sources/test_azure.py
@@ -16,7 +16,7 @@ from cloudinit.net import dhcp
from cloudinit.sources import UNSET
from cloudinit.sources import DataSourceAzure as dsaz
from cloudinit.sources import InvalidMetaDataException
-from cloudinit.sources.azure import identity
+from cloudinit.sources.azure import errors, identity
from cloudinit.sources.helpers import netlink
from cloudinit.util import (
MountFailedError,
@@ -1154,7 +1154,7 @@ scbus-1 on xpt0 bus 0
m_crawl_metadata.side_effect = Exception
dsrc.get_data()
self.assertEqual(1, m_crawl_metadata.call_count)
- m_report_failure.assert_called_once_with()
+ m_report_failure.assert_called_once_with(mock.ANY)
def test_crawl_metadata_exc_should_log_could_not_crawl_msg(self):
data = {}
@@ -1164,7 +1164,7 @@ scbus-1 on xpt0 bus 0
dsrc.get_data()
self.assertEqual(1, m_crawl_metadata.call_count)
self.assertIn(
- "Could not crawl Azure metadata", self.logs.getvalue()
+ "Azure datasource failure occurred:", self.logs.getvalue()
)
def test_basic_seed_dir(self):
@@ -1775,7 +1775,8 @@ scbus-1 on xpt0 bus 0
# mock crawl metadata failure to cause report failure
m_crawl_metadata.side_effect = Exception
- self.assertTrue(dsrc._report_failure())
+ error = errors.ReportableError(reason="foo")
+ self.assertTrue(dsrc._report_failure(error))
self.assertEqual(1, self.m_report_failure_to_fabric.call_count)
def test_dsaz_report_failure_returns_false_and_does_not_propagate_exc(
@@ -1805,7 +1806,9 @@ scbus-1 on xpt0 bus 0
# 1. Using cached ephemeral dhcp context to report failure to Azure
# 2. Using new ephemeral dhcp to report failure to Azure
self.m_report_failure_to_fabric.side_effect = Exception
- self.assertFalse(dsrc._report_failure())
+
+ error = errors.ReportableError(reason="foo")
+ self.assertFalse(dsrc._report_failure(error))
self.assertEqual(2, self.m_report_failure_to_fabric.call_count)
def test_dsaz_report_failure(self):
@@ -1814,9 +1817,10 @@ scbus-1 on xpt0 bus 0
with mock.patch.object(dsrc, "crawl_metadata") as m_crawl_metadata:
m_crawl_metadata.side_effect = Exception
- self.assertTrue(dsrc._report_failure())
+ error = errors.ReportableError(reason="foo")
+ self.assertTrue(dsrc._report_failure(error))
self.m_report_failure_to_fabric.assert_called_once_with(
- endpoint="168.63.129.16"
+ endpoint="168.63.129.16", error=error
)
def test_dsaz_report_failure_uses_cached_ephemeral_dhcp_ctx_lease(self):
@@ -1830,11 +1834,12 @@ scbus-1 on xpt0 bus 0
# mock crawl metadata failure to cause report failure
m_crawl_metadata.side_effect = Exception
- self.assertTrue(dsrc._report_failure())
+ error = errors.ReportableError(reason="foo")
+ self.assertTrue(dsrc._report_failure(error))
# ensure called with cached ephemeral dhcp lease option 245
self.m_report_failure_to_fabric.assert_called_once_with(
- endpoint="test-ep"
+ endpoint="test-ep", error=error
)
def test_dsaz_report_failure_no_net_uses_new_ephemeral_dhcp_lease(self):
@@ -1851,12 +1856,13 @@ scbus-1 on xpt0 bus 0
}
self.m_dhcp.return_value.obtain_lease.return_value = test_lease
- self.assertTrue(dsrc._report_failure())
+ error = errors.ReportableError(reason="foo")
+ self.assertTrue(dsrc._report_failure(error))
# ensure called with the newly discovered
# ephemeral dhcp lease option 245
self.m_report_failure_to_fabric.assert_called_once_with(
- endpoint="1.2.3.4"
+ endpoint="1.2.3.4", error=error
)
def test_exception_fetching_fabric_data_doesnt_propagate(self):
@@ -4052,7 +4058,7 @@ class TestProvisioning:
with mock.patch.object(self.azure_ds, "_report_failure") as m_report:
self.azure_ds._get_data()
- assert m_report.mock_calls == [mock.call()]
+ assert m_report.mock_calls == [mock.call(mock.ANY)]
assert self.mock_wrapping_setup_ephemeral_networking.mock_calls == [
mock.call(timeout_minutes=20),
diff --git a/tests/unittests/sources/test_azure_helper.py b/tests/unittests/sources/test_azure_helper.py
index ac2126e8..a08011a6 100644
--- a/tests/unittests/sources/test_azure_helper.py
+++ b/tests/unittests/sources/test_azure_helper.py
@@ -11,6 +11,7 @@ import pytest
import requests
from cloudinit import url_helper
+from cloudinit.sources.azure import errors
from cloudinit.sources.helpers import azure as azure_helper
from cloudinit.sources.helpers.azure import WALinuxAgentShim as wa_shim
from cloudinit.util import load_file
@@ -106,15 +107,6 @@ MOCKPATH = "cloudinit.sources.helpers.azure."
@pytest.fixture
-def mock_dmi_read_dmi_data():
- with mock.patch(
- MOCKPATH + "dmi.read_dmi_data",
- autospec=True,
- ) as m:
- yield m
-
-
-@pytest.fixture
def mock_readurl():
with mock.patch(MOCKPATH + "url_helper.readurl", autospec=True) as m:
yield m
@@ -1358,7 +1350,10 @@ class TestGetMetadataGoalStateXMLAndReportFailureToFabric(CiTestCase):
)
def test_success_calls_clean_up(self):
- azure_helper.report_failure_to_fabric(endpoint="test_endpoint")
+ error = errors.ReportableError(reason="test")
+ azure_helper.report_failure_to_fabric(
+ endpoint="test_endpoint", error=error
+ )
self.assertEqual(1, self.m_shim.return_value.clean_up.call_count)
def test_failure_in_shim_report_failure_propagates_exc_and_calls_clean_up(
@@ -1371,24 +1366,29 @@ class TestGetMetadataGoalStateXMLAndReportFailureToFabric(CiTestCase):
SentinelException,
azure_helper.report_failure_to_fabric,
"test_endpoint",
+ errors.ReportableError(reason="test"),
)
self.assertEqual(1, self.m_shim.return_value.clean_up.call_count)
def test_report_failure_to_fabric_calls_shim_report_failure(
self,
):
- azure_helper.report_failure_to_fabric(endpoint="test_endpoint")
+ error = errors.ReportableError(reason="test")
+
+ azure_helper.report_failure_to_fabric(
+ endpoint="test_endpoint",
+ error=error,
+ )
# default err message description should be shown to the user
# if an empty description is passed in
self.m_shim.return_value.register_with_azure_and_report_failure.assert_called_once_with( # noqa: E501
- description=(
- azure_helper.DEFAULT_REPORT_FAILURE_USER_VISIBLE_MESSAGE
- )
+ description=error.as_description(),
)
def test_instantiates_shim_with_kwargs(self):
azure_helper.report_failure_to_fabric(
endpoint="test_endpoint",
+ error=errors.ReportableError(reason="test"),
)
self.m_shim.assert_called_once_with(
endpoint="test_endpoint",