From 9f8450368c2ae713de0a2308f5d3cb73de5b39f2 Mon Sep 17 00:00:00 2001 From: Chris Patterson Date: Fri, 12 May 2023 03:21:14 -0700 Subject: azure/errors: introduce reportable errors for imds (#3647) Always report failure to host, but report failure to fabric only outside of _check_if_nic_is_primary() which is expected to fail if nic is not primary. Add two types of reportable errors for IMDS metadata: - add ReportableErrorImdsUrlError() for url errors. - add ReportableErrorImdsMetadataParsingException() for parsing errors. Tweak ReportableError repr to be a bit friendlier. Signed-off-by: Chris Patterson --- cloudinit/sources/DataSourceAzure.py | 35 +++++++++++++++++++++--------- cloudinit/sources/azure/errors.py | 41 +++++++++++++++++++++++++++++++++++- 2 files changed, 65 insertions(+), 11 deletions(-) (limited to 'cloudinit') diff --git a/cloudinit/sources/DataSourceAzure.py b/cloudinit/sources/DataSourceAzure.py index b8087406..dc8b2a21 100644 --- a/cloudinit/sources/DataSourceAzure.py +++ b/cloudinit/sources/DataSourceAzure.py @@ -543,7 +543,7 @@ class DataSourceAzure(sources.DataSource): imds_md = {} if self._is_ephemeral_networking_up(): - imds_md = self.get_metadata_from_imds() + imds_md = self.get_metadata_from_imds(report_failure=True) if not imds_md and ovf_source is None: msg = "No OVF or IMDS available" @@ -575,7 +575,7 @@ class DataSourceAzure(sources.DataSource): md, userdata_raw, cfg, files = self._reprovision() # fetch metadata again as it has changed after reprovisioning - imds_md = self.get_metadata_from_imds() + imds_md = self.get_metadata_from_imds(report_failure=True) # Report errors if IMDS network configuration is missing data. 
self.validate_imds_network_metadata(imds_md=imds_md) @@ -667,18 +667,33 @@ class DataSourceAzure(sources.DataSource): return crawled_data @azure_ds_telemetry_reporter - def get_metadata_from_imds(self) -> Dict: - retry_deadline = time() + 300 + def get_metadata_from_imds(self, report_failure: bool) -> Dict: + start_time = time() + retry_deadline = start_time + 300 + error_string: Optional[str] = None + error_report: Optional[errors.ReportableError] = None try: return imds.fetch_metadata_with_api_fallback( retry_deadline=retry_deadline ) - except (UrlError, ValueError) as error: - report_diagnostic_event( - "Ignoring IMDS metadata due to: %s" % error, - logger_func=LOG.warning, + except UrlError as error: + error_string = str(error) + duration = time() - start_time + error_report = errors.ReportableErrorImdsUrlError( + exception=error, duration=duration ) - return {} + except ValueError as error: + error_string = str(error) + error_report = errors.ReportableErrorImdsMetadataParsingException( + exception=error + ) + + self._report_failure(error_report, host_only=not report_failure) + report_diagnostic_event( + "Ignoring IMDS metadata due to: %s" % error_string, + logger_func=LOG.warning, + ) + return {} def clear_cached_attrs(self, attr_defaults=()): """Reset any cached class attributes to defaults.""" @@ -976,7 +991,7 @@ class DataSourceAzure(sources.DataSource): # Primary nic detection will be optimized in the future. The fact that # primary nic is being attached first helps here. Otherwise each nic # could add several seconds of delay. - imds_md = self.get_metadata_from_imds() + imds_md = self.get_metadata_from_imds(report_failure=False) if imds_md: # Only primary NIC will get a response from IMDS. 
LOG.info("%s is the primary nic", ifname) diff --git a/cloudinit/sources/azure/errors.py b/cloudinit/sources/azure/errors.py index ca902a03..966725b0 100644 --- a/cloudinit/sources/azure/errors.py +++ b/cloudinit/sources/azure/errors.py @@ -10,8 +10,11 @@ from datetime import datetime from io import StringIO from typing import Any, Dict, List, Optional +import requests + from cloudinit import version from cloudinit.sources.azure import identity +from cloudinit.url_helper import UrlError LOG = logging.getLogger(__name__) @@ -81,7 +84,12 @@ class ReportableError(Exception): ) def __repr__(self) -> str: - return self.as_encoded_report() + return ( + f"{self.__class__.__name__}(" + f"reason={self.reason}, " + f"timestamp={self.timestamp}, " + f"supporting_data={self.supporting_data})" + ) class ReportableErrorDhcpInterfaceNotFound(ReportableError): @@ -99,6 +107,37 @@ class ReportableErrorDhcpLease(ReportableError): self.supporting_data["interface"] = interface +class ReportableErrorImdsUrlError(ReportableError): + def __init__(self, *, exception: UrlError, duration: float) -> None: + # ConnectTimeout sub-classes ConnectionError so order is important. 
+ if isinstance(exception.cause, requests.ConnectTimeout): + reason = "connection timeout querying IMDS" + elif isinstance(exception.cause, requests.ConnectionError): + reason = "connection error querying IMDS" + elif isinstance(exception.cause, requests.ReadTimeout): + reason = "read timeout querying IMDS" + elif exception.code: + reason = "http error querying IMDS" + else: + reason = "unexpected error querying IMDS" + + super().__init__(reason) + + if exception.code: + self.supporting_data["http_code"] = exception.code + + self.supporting_data["duration"] = duration + self.supporting_data["exception"] = repr(exception) + self.supporting_data["url"] = exception.url + + +class ReportableErrorImdsMetadataParsingException(ReportableError): + def __init__(self, *, exception: ValueError) -> None: + super().__init__("error parsing IMDS metadata") + + self.supporting_data["exception"] = repr(exception) + + class ReportableErrorUnhandledException(ReportableError): def __init__(self, exception: Exception) -> None: super().__init__("unhandled exception") -- cgit v1.2.1