From 9f8450368c2ae713de0a2308f5d3cb73de5b39f2 Mon Sep 17 00:00:00 2001 From: Chris Patterson Date: Fri, 12 May 2023 03:21:14 -0700 Subject: azure/errors: introduce reportable errors for imds (#3647) Always report failure to host, but report failure to fabric only outside of _check_if_nic_is_primary() which is expected to fail if nic is not primary. Add two types of reportable errors for IMDS metadata: - add ReportableErrorImdsUrlError() for url errors. - add ReportableErrorImdsMetadataParsingException() for parsing errors. Tweak ReportableError repr to be a bit friendlier. Signed-off-by: Chris Patterson --- cloudinit/sources/DataSourceAzure.py | 35 ++++++++++----- cloudinit/sources/azure/errors.py | 41 +++++++++++++++++- tests/unittests/sources/azure/test_errors.py | 65 ++++++++++++++++++++++++++++ tests/unittests/sources/test_azure.py | 65 +++++++++++++++++++++++++++- 4 files changed, 194 insertions(+), 12 deletions(-) diff --git a/cloudinit/sources/DataSourceAzure.py b/cloudinit/sources/DataSourceAzure.py index b8087406..dc8b2a21 100644 --- a/cloudinit/sources/DataSourceAzure.py +++ b/cloudinit/sources/DataSourceAzure.py @@ -543,7 +543,7 @@ class DataSourceAzure(sources.DataSource): imds_md = {} if self._is_ephemeral_networking_up(): - imds_md = self.get_metadata_from_imds() + imds_md = self.get_metadata_from_imds(report_failure=True) if not imds_md and ovf_source is None: msg = "No OVF or IMDS available" @@ -575,7 +575,7 @@ class DataSourceAzure(sources.DataSource): md, userdata_raw, cfg, files = self._reprovision() # fetch metadata again as it has changed after reprovisioning - imds_md = self.get_metadata_from_imds() + imds_md = self.get_metadata_from_imds(report_failure=True) # Report errors if IMDS network configuration is missing data. self.validate_imds_network_metadata(imds_md=imds_md) @@ -667,18 +667,33 @@ class DataSourceAzure(sources.DataSource): return crawled_data @azure_ds_telemetry_reporter - def get_metadata_from_imds(self) -> Dict: - retry_deadline = time() + 300 + def get_metadata_from_imds(self, report_failure: bool) -> Dict: + start_time = time() + retry_deadline = start_time + 300 + error_string: Optional[str] = None + error_report: Optional[errors.ReportableError] = None try: return imds.fetch_metadata_with_api_fallback( retry_deadline=retry_deadline ) - except (UrlError, ValueError) as error: - report_diagnostic_event( - "Ignoring IMDS metadata due to: %s" % error, - logger_func=LOG.warning, + except UrlError as error: + error_string = str(error) + duration = time() - start_time + error_report = errors.ReportableErrorImdsUrlError( + exception=error, duration=duration ) - return {} + except ValueError as error: + error_string = str(error) + error_report = errors.ReportableErrorImdsMetadataParsingException( + exception=error + ) + + self._report_failure(error_report, host_only=not report_failure) + report_diagnostic_event( + "Ignoring IMDS metadata due to: %s" % error_string, + logger_func=LOG.warning, + ) + return {} def clear_cached_attrs(self, attr_defaults=()): """Reset any cached class attributes to defaults.""" @@ -976,7 +991,7 @@ class DataSourceAzure(sources.DataSource): # Primary nic detection will be optimized in the future. The fact that # primary nic is being attached first helps here. Otherwise each nic # could add several seconds of delay. - imds_md = self.get_metadata_from_imds() + imds_md = self.get_metadata_from_imds(report_failure=False) if imds_md: # Only primary NIC will get a response from IMDS. LOG.info("%s is the primary nic", ifname) diff --git a/cloudinit/sources/azure/errors.py b/cloudinit/sources/azure/errors.py index ca902a03..966725b0 100644 --- a/cloudinit/sources/azure/errors.py +++ b/cloudinit/sources/azure/errors.py @@ -10,8 +10,11 @@ from datetime import datetime from io import StringIO from typing import Any, Dict, List, Optional +import requests + from cloudinit import version from cloudinit.sources.azure import identity +from cloudinit.url_helper import UrlError LOG = logging.getLogger(__name__) @@ -81,7 +84,12 @@ class ReportableError(Exception): ) def __repr__(self) -> str: - return self.as_encoded_report() + return ( + f"{self.__class__.__name__}(" + f"reason={self.reason}, " + f"timestamp={self.timestamp}, " + f"supporting_data={self.supporting_data})" + ) class ReportableErrorDhcpInterfaceNotFound(ReportableError): @@ -99,6 +107,37 @@ class ReportableErrorDhcpLease(ReportableError): self.supporting_data["interface"] = interface +class ReportableErrorImdsUrlError(ReportableError): + def __init__(self, *, exception: UrlError, duration: float) -> None: + # ConnectTimeout sub-classes ConnectError so order is important. + if isinstance(exception.cause, requests.ConnectTimeout): + reason = "connection timeout querying IMDS" + elif isinstance(exception.cause, requests.ConnectionError): + reason = "connection error querying IMDS" + elif isinstance(exception.cause, requests.ReadTimeout): + reason = "read timeout querying IMDS" + elif exception.code: + reason = "http error querying IMDS" + else: + reason = "unexpected error querying IMDS" + + super().__init__(reason) + + if exception.code: + self.supporting_data["http_code"] = exception.code + + self.supporting_data["duration"] = duration + self.supporting_data["exception"] = repr(exception) + self.supporting_data["url"] = exception.url + + +class ReportableErrorImdsMetadataParsingException(ReportableError): + def __init__(self, *, exception: ValueError) -> None: + super().__init__("error parsing IMDS metadata") + + self.supporting_data["exception"] = repr(exception) + + class ReportableErrorUnhandledException(ReportableError): def __init__(self, exception: Exception) -> None: super().__init__("unhandled exception") diff --git a/tests/unittests/sources/azure/test_errors.py b/tests/unittests/sources/azure/test_errors.py index e9d3e039..cf3e0e71 100644 --- a/tests/unittests/sources/azure/test_errors.py +++ b/tests/unittests/sources/azure/test_errors.py @@ -5,9 +5,11 @@ import datetime from unittest import mock import pytest +import requests from cloudinit import version from cloudinit.sources.azure import errors +from cloudinit.url_helper import UrlError @pytest.fixture() @@ -134,6 +136,69 @@ def test_dhcp_interface_not_found(): assert error.supporting_data["duration"] == 5.6 +@pytest.mark.parametrize( + "exception,reason", + [ + ( + UrlError( + requests.ConnectionError(), + ), + "connection error querying IMDS", + ), + ( + UrlError( + requests.ConnectTimeout(), + ), + "connection timeout querying IMDS", + ), + ( + UrlError( + requests.ReadTimeout(), + ), + "read timeout querying IMDS", + ), + ( + UrlError( + Exception(), + code=500, + ), + "http error querying IMDS", + ), + ( + UrlError( + requests.HTTPError(), + code=None, + ), + "unexpected error querying IMDS", + ), + ], +) +def test_imds_url_error(exception, reason): + duration = 123.4 + fake_url = "fake://url" + + exception.url = fake_url + error = errors.ReportableErrorImdsUrlError( + exception=exception, duration=duration + ) + + assert error.reason == reason + assert error.supporting_data["duration"] == duration + assert error.supporting_data["exception"] == repr(exception) + assert error.supporting_data["url"] == fake_url + + +def test_imds_metadata_parsing_exception(): + exception = ValueError("foobar") + + error = errors.ReportableErrorImdsMetadataParsingException( + exception=exception + ) + + assert error.reason == "error parsing IMDS metadata" + assert error.supporting_data["exception"] == repr(exception) + + def test_unhandled_exception(): source_error = None try: diff --git a/tests/unittests/sources/test_azure.py b/tests/unittests/sources/test_azure.py index 59ac6459..ed605ee7 100644 --- a/tests/unittests/sources/test_azure.py +++ b/tests/unittests/sources/test_azure.py @@ -17,7 +17,7 @@ from cloudinit.reporting.handlers import HyperVKvpReportingHandler from cloudinit.sources import UNSET from cloudinit.sources import DataSourceAzure as dsaz from cloudinit.sources import InvalidMetaDataException -from cloudinit.sources.azure import errors, identity +from cloudinit.sources.azure import errors, identity, imds from cloudinit.sources.helpers import netlink from cloudinit.util import ( MountFailedError, @@ -135,6 +135,16 @@ def mock_ephemeral_dhcp_v4(): yield m +@pytest.fixture +def mock_imds_fetch_metadata_with_api_fallback(): + with mock.patch.object( + imds, + "fetch_metadata_with_api_fallback", + autospec=True, + ) as m: + yield m + + @pytest.fixture def mock_kvp_report_failure_to_host(): with mock.patch( @@ -4244,6 +4254,59 @@ class TestProvisioning: assert len(self.mock_kvp_report_success_to_host.mock_calls) == 1 +class TestGetMetadataFromImds: + @pytest.mark.parametrize("report_failure", [False, True]) + @pytest.mark.parametrize( + "exception,reported_error_type", + [ + ( + url_helper.UrlError(requests.ConnectionError()), + errors.ReportableErrorImdsUrlError, + ), + ( + ValueError("bad data"), + errors.ReportableErrorImdsMetadataParsingException, + ), + ], + ) + def test_errors( + self, + azure_ds, + exception, + mock_azure_report_failure_to_fabric, + mock_imds_fetch_metadata_with_api_fallback, + mock_kvp_report_failure_to_host, + monkeypatch, + report_failure, + reported_error_type, + ): + monkeypatch.setattr( + azure_ds, "_is_ephemeral_networking_up", lambda: True + ) + mock_imds_fetch_metadata_with_api_fallback.side_effect = exception + + assert ( + azure_ds.get_metadata_from_imds(report_failure=report_failure) + == {} + ) + assert mock_imds_fetch_metadata_with_api_fallback.mock_calls == [ + mock.call(retry_deadline=mock.ANY) + ] + + reported_error = mock_kvp_report_failure_to_host.call_args[0][0] + assert isinstance(reported_error, reported_error_type) + assert reported_error.supporting_data["exception"] == repr(exception) + assert mock_kvp_report_failure_to_host.mock_calls == [ + mock.call(reported_error) + ] + if report_failure: + assert mock_azure_report_failure_to_fabric.mock_calls == [ + mock.call(endpoint=mock.ANY, error=reported_error) + ] + else: + assert mock_azure_report_failure_to_fabric.mock_calls == [] + + class TestReportFailure: @pytest.mark.parametrize("kvp_enabled", [False, True]) def report_host_only_kvp_enabled( -- cgit v1.2.1