summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorChris Patterson <cpatterson@microsoft.com>2023-05-11 11:43:13 -0700
committerGitHub <noreply@github.com>2023-05-11 13:43:13 -0500
commit481bf4d3c3460fe398c74c950eb02f8562e1c20a (patch)
treede8487d377980ba69c62e40afb36265968485c5f
parent0c4d53f2b756c8ecc3c17e3403ba66ebb5ce1560 (diff)
downloadcloud-init-git-481bf4d3c3460fe398c74c950eb02f8562e1c20a.tar.gz
azure/errors: add host reporting for dhcp errors (#2167)
- Add host_only flag to _report_failure() to allow caller to only report the failure to host. This is for cases where we don't want _report_failure() to attempt DHCP or we expect that we may recover from the reported error (there is no issue reporting multiple times to host, whereas fabric reports will immediately fail the VM provisioning). - Add ReportableErrorDhcpLease() to report lease failures. - Add ReportableErrorDhcpInterfaceNotFound() to report errors where the DHCP interface hasn't been found yet. - Add TestReportFailure class with new test coverage. Will migrate other _report_failure() tests in the future as they currently depend on TestAzureDataSource/CiTestCase. Future work will add the interface name to supporting data, but as that information is not available with iface=None, another PR will explicitly add a call to net.find_fallback_nic() to specify it. Signed-off-by: Chris Patterson <cpatterson@microsoft.com>
-rw-r--r--cloudinit/sources/DataSourceAzure.py36
-rw-r--r--cloudinit/sources/azure/errors.py15
-rw-r--r--tests/unittests/sources/azure/test_errors.py15
-rw-r--r--tests/unittests/sources/test_azure.py72
4 files changed, 120 insertions, 18 deletions
diff --git a/cloudinit/sources/DataSourceAzure.py b/cloudinit/sources/DataSourceAzure.py
index 77b53aaa..b8087406 100644
--- a/cloudinit/sources/DataSourceAzure.py
+++ b/cloudinit/sources/DataSourceAzure.py
@@ -371,7 +371,8 @@ class DataSourceAzure(sources.DataSource):
)
lease = None
- timeout = timeout_minutes * 60 + time()
+ start_time = time()
+ deadline = start_time + timeout_minutes * 60
with events.ReportEventStack(
name="obtain-dhcp-lease",
description="obtain dhcp lease",
@@ -385,6 +386,12 @@ class DataSourceAzure(sources.DataSource):
report_diagnostic_event(
"Interface not found for DHCP", logger_func=LOG.warning
)
+ self._report_failure(
+ errors.ReportableErrorDhcpInterfaceNotFound(
+ duration=time() - start_time
+ ),
+ host_only=True,
+ )
except NoDHCPLeaseMissingDhclientError:
# No dhclient, no point in retrying.
report_diagnostic_event(
@@ -398,6 +405,12 @@ class DataSourceAzure(sources.DataSource):
"Failed to obtain DHCP lease (iface=%s)" % iface,
logger_func=LOG.error,
)
+ self._report_failure(
+ errors.ReportableErrorDhcpLease(
+ duration=time() - start_time, interface=iface
+ ),
+ host_only=True,
+ )
except subp.ProcessExecutionError as error:
# udevadm settle, ip link set dev eth0 up, etc.
report_diagnostic_event(
@@ -412,8 +425,8 @@ class DataSourceAzure(sources.DataSource):
logger_func=LOG.error,
)
- # Sleep before retrying, otherwise break if we're past timeout.
- if lease is None and time() + retry_sleep < timeout:
+ # Sleep before retrying, otherwise break if past deadline.
+ if lease is None and time() + retry_sleep < deadline:
sleep(retry_sleep)
else:
break
@@ -1154,17 +1167,26 @@ class DataSourceAzure(sources.DataSource):
return reprovision_data
@azure_ds_telemetry_reporter
- def _report_failure(self, error: errors.ReportableError) -> bool:
- """Tells the Azure fabric that provisioning has failed.
+ def _report_failure(
+ self, error: errors.ReportableError, host_only: bool = False
+ ) -> bool:
+ """Report failure to Azure host and fabric.
+
+ For errors that may be recoverable (e.g. DHCP), host_only provides a
+ mechanism to report the failure that can be updated later with success.
+ DHCP will not be attempted if host_only=True and networking is down.
- @param description: A description of the error encountered.
+ @param error: Error to report.
+ @param host_only: Only report to host (error may be recoverable).
@return: The success status of sending the failure signal.
"""
report_diagnostic_event(
f"Azure datasource failure occurred: {error.as_encoded_report()}",
logger_func=LOG.error,
)
- kvp.report_failure_to_host(error)
+ reported = kvp.report_failure_to_host(error)
+ if host_only:
+ return reported
if self._is_ephemeral_networking_up():
try:
diff --git a/cloudinit/sources/azure/errors.py b/cloudinit/sources/azure/errors.py
index 5c4ad7db..ca902a03 100644
--- a/cloudinit/sources/azure/errors.py
+++ b/cloudinit/sources/azure/errors.py
@@ -84,6 +84,21 @@ class ReportableError(Exception):
return self.as_encoded_report()
+class ReportableErrorDhcpInterfaceNotFound(ReportableError):
+ def __init__(self, duration: float) -> None:
+ super().__init__("failure to find DHCP interface")
+
+ self.supporting_data["duration"] = duration
+
+
+class ReportableErrorDhcpLease(ReportableError):
+ def __init__(self, duration: float, interface: Optional[str]) -> None:
+ super().__init__("failure to obtain DHCP lease")
+
+ self.supporting_data["duration"] = duration
+ self.supporting_data["interface"] = interface
+
+
class ReportableErrorUnhandledException(ReportableError):
def __init__(self, exception: Exception) -> None:
super().__init__("unhandled exception")
diff --git a/tests/unittests/sources/azure/test_errors.py b/tests/unittests/sources/azure/test_errors.py
index d2213613..e9d3e039 100644
--- a/tests/unittests/sources/azure/test_errors.py
+++ b/tests/unittests/sources/azure/test_errors.py
@@ -119,6 +119,21 @@ def test_reportable_errors(
assert error.as_encoded_report() == "|".join(data)
+def test_dhcp_lease():
+ error = errors.ReportableErrorDhcpLease(duration=5.6, interface="foo")
+
+ assert error.reason == "failure to obtain DHCP lease"
+ assert error.supporting_data["duration"] == 5.6
+ assert error.supporting_data["interface"] == "foo"
+
+
+def test_dhcp_interface_not_found():
+ error = errors.ReportableErrorDhcpInterfaceNotFound(duration=5.6)
+
+ assert error.reason == "failure to find DHCP interface"
+ assert error.supporting_data["duration"] == 5.6
+
+
def test_unhandled_exception():
source_error = None
try:
diff --git a/tests/unittests/sources/test_azure.py b/tests/unittests/sources/test_azure.py
index 677c250d..59ac6459 100644
--- a/tests/unittests/sources/test_azure.py
+++ b/tests/unittests/sources/test_azure.py
@@ -3337,6 +3337,7 @@ class TestEphemeralNetworking:
self,
azure_ds,
mock_ephemeral_dhcp_v4,
+ mock_kvp_report_failure_to_host,
mock_sleep,
):
lease = {
@@ -3362,6 +3363,12 @@ class TestEphemeralNetworking:
assert azure_ds._wireserver_endpoint == "168.63.129.16"
assert azure_ds._ephemeral_dhcp_ctx.iface == "fakeEth0"
+ error_reasons = [
+ c[0][0].reason
+ for c in mock_kvp_report_failure_to_host.call_args_list
+ ]
+ assert error_reasons == ["failure to find DHCP interface"]
+
def test_retry_process_error(
self,
azure_ds,
@@ -3403,14 +3410,20 @@ class TestEphemeralNetworking:
]
@pytest.mark.parametrize(
- "error_class", [dhcp.NoDHCPLeaseInterfaceError, dhcp.NoDHCPLeaseError]
+ "error_class,error_reason",
+ [
+ (dhcp.NoDHCPLeaseInterfaceError, "failure to find DHCP interface"),
+ (dhcp.NoDHCPLeaseError, "failure to obtain DHCP lease"),
+ ],
)
def test_retry_sleeps(
self,
azure_ds,
mock_ephemeral_dhcp_v4,
+ mock_kvp_report_failure_to_host,
mock_sleep,
error_class,
+ error_reason,
):
lease = {
"interface": "fakeEth0",
@@ -3436,30 +3449,41 @@ class TestEphemeralNetworking:
assert azure_ds._wireserver_endpoint == "168.63.129.16"
assert azure_ds._ephemeral_dhcp_ctx.iface == "fakeEth0"
+ error_reasons = [
+ c[0][0].reason
+ for c in mock_kvp_report_failure_to_host.call_args_list
+ ]
+ assert error_reasons == [error_reason] * 10
+
@pytest.mark.parametrize(
- "error_class", [dhcp.NoDHCPLeaseInterfaceError, dhcp.NoDHCPLeaseError]
+ "error_class,error_reason",
+ [
+ (dhcp.NoDHCPLeaseInterfaceError, "failure to find DHCP interface"),
+ (dhcp.NoDHCPLeaseError, "failure to obtain DHCP lease"),
+ ],
)
def test_retry_times_out(
self,
azure_ds,
mock_ephemeral_dhcp_v4,
+ mock_kvp_report_failure_to_host,
mock_sleep,
mock_time,
error_class,
+ error_reason,
):
mock_time.side_effect = [
0.0, # start
- 60.1, # first
- 120.1, # third
- 180.1, # timeout
+ 60.1, # duration check for host error report
+ 60.11, # loop check
+ 120.1, # duration check for host error report
+ 120.11, # loop check
+ 180.1, # duration check for host error report
+ 180.11, # loop check timeout
]
mock_ephemeral_dhcp_v4.return_value.obtain_lease.side_effect = [
error_class()
- ] * 10 + [
- {
- "interface": "fakeEth0",
- }
- ]
+ ] * 3
with pytest.raises(dhcp.NoDHCPLeaseError):
azure_ds._setup_ephemeral_networking(timeout_minutes=3)
@@ -3472,6 +3496,12 @@ class TestEphemeralNetworking:
assert azure_ds._wireserver_endpoint == "168.63.129.16"
assert azure_ds._ephemeral_dhcp_ctx is None
+ error_reasons = [
+ c[0][0].reason
+ for c in mock_kvp_report_failure_to_host.call_args_list
+ ]
+ assert error_reasons == [error_reason] * 3
+
class TestInstanceId:
def test_metadata(self, azure_ds, mock_dmi_read_dmi_data):
@@ -4138,7 +4168,7 @@ class TestProvisioning:
assert self.mock_netlink.mock_calls == []
# Verify reports via KVP.
- assert len(self.mock_kvp_report_failure_to_host.mock_calls) == 1
+ assert len(self.mock_kvp_report_failure_to_host.mock_calls) == 2
assert len(self.mock_kvp_report_success_to_host.mock_calls) == 0
@pytest.mark.parametrize(
@@ -4214,6 +4244,26 @@ class TestProvisioning:
assert len(self.mock_kvp_report_success_to_host.mock_calls) == 1
+class TestReportFailure:
+ @pytest.mark.parametrize("kvp_enabled", [False, True])
+ def report_host_only_kvp_enabled(
+ self,
+ azure_ds,
+ kvp_enabled,
+ mock_azure_report_failure_to_fabric,
+ mock_kvp_report_failure_to_host,
+ mock_kvp_report_success_to_host,
+ ):
+ mock_kvp_report_failure_to_host.return_value = kvp_enabled
+ error = errors.ReportableError(reason="foo")
+
+ assert azure_ds._report_failure(error, host_only=True) == kvp_enabled
+
+ assert mock_kvp_report_failure_to_host.mock_calls == [mock.call(error)]
+ assert mock_kvp_report_success_to_host.mock_calls == []
+ assert mock_azure_report_failure_to_fabric.mock_calls == []
+
+
class TestValidateIMDSMetadata:
@pytest.mark.parametrize(
"mac,expected",