summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: aswinrajamannar <39812128+aswinrajamannar@users.noreply.github.com> 2021-08-10 12:28:00 -0700
committer: GitHub <noreply@github.com> 2021-08-10 14:28:00 -0500
commit: d3271217e2745fb0e3405bd093b61c39fe0708a7 (patch)
tree: 8d140da5c47638db090305bddf558bfe3e918591
parent: c62cb3af59abc464380011c106b31879181e7c45 (diff)
download: cloud-init-git-d3271217e2745fb0e3405bd093b61c39fe0708a7.tar.gz
Azure: Limit polling network metadata on connection errors (#961)
-rwxr-xr-x cloudinit/sources/DataSourceAzure.py 27
-rw-r--r-- tests/unittests/test_datasource/test_azure.py 10
2 files changed, 25 insertions, 12 deletions
diff --git a/cloudinit/sources/DataSourceAzure.py b/cloudinit/sources/DataSourceAzure.py
index 01e2c959..6df9934b 100755
--- a/cloudinit/sources/DataSourceAzure.py
+++ b/cloudinit/sources/DataSourceAzure.py
@@ -972,7 +972,7 @@ class DataSourceAzure(sources.DataSource):
imds_md = None
metadata_poll_count = 0
metadata_logging_threshold = 1
- metadata_timeout_count = 0
+ expected_errors_count = 0
# For now, only a VM's primary NIC can contact IMDS and WireServer. If
# DHCP fails for a NIC, we have no mechanism to determine if the NIC is
@@ -998,13 +998,16 @@ class DataSourceAzure(sources.DataSource):
raise
# Retry polling network metadata for a limited duration only when the
- # calls fail due to timeout. This is because the platform drops packets
- # going towards IMDS when it is not a primary nic. If the calls fail
- # due to other issues like 410, 503 etc, then it means we are primary
- # but IMDS service is unavailable at the moment. Retry indefinitely in
- # those cases since we cannot move on without the network metadata.
+ # calls fail due to network unreachable error or timeout.
+ # This is because the platform drops packets going towards IMDS
+ # when it is not a primary nic. If the calls fail due to other issues
+ # like 410, 503 etc, then it means we are primary but IMDS service
+ # is unavailable at the moment. Retry indefinitely in those cases
+ # since we cannot move on without the network metadata. In the future,
+ # all this will not be necessary, as a new dhcp option would tell
+ # whether the nic is primary or not.
def network_metadata_exc_cb(msg, exc):
- nonlocal metadata_timeout_count, metadata_poll_count
+ nonlocal expected_errors_count, metadata_poll_count
nonlocal metadata_logging_threshold
metadata_poll_count = metadata_poll_count + 1
@@ -1024,9 +1027,13 @@ class DataSourceAzure(sources.DataSource):
(msg, exc.cause, exc.code),
logger_func=LOG.error)
- if exc.cause and isinstance(exc.cause, requests.Timeout):
- metadata_timeout_count = metadata_timeout_count + 1
- return (metadata_timeout_count <= 10)
+ # Retry up to a certain limit for both timeout and network
+ # unreachable errors.
+ if exc.cause and isinstance(
+ exc.cause, (requests.Timeout, requests.ConnectionError)
+ ):
+ expected_errors_count = expected_errors_count + 1
+ return (expected_errors_count <= 10)
return True
# Primary nic detection will be optimized in the future. The fact that
diff --git a/tests/unittests/test_datasource/test_azure.py b/tests/unittests/test_datasource/test_azure.py
index 3bf8fdb2..63eaf384 100644
--- a/tests/unittests/test_datasource/test_azure.py
+++ b/tests/unittests/test_datasource/test_azure.py
@@ -2825,7 +2825,8 @@ class TestPreprovisioningHotAttachNics(CiTestCase):
@mock.patch(MOCKPATH + 'EphemeralDHCPv4')
def test_check_if_nic_is_primary_retries_on_failures(
self, m_dhcpv4, m_imds):
- """Retry polling for network metadata on all failures except timeout"""
+ """Retry polling for network metadata on all failures except timeout
+ and network unreachable errors"""
dsa = dsaz.DataSourceAzure({}, distro=None, paths=self.paths)
lease = {
'interface': 'eth9', 'fixed-address': '192.168.2.9',
@@ -2854,8 +2855,13 @@ class TestPreprovisioningHotAttachNics(CiTestCase):
error = url_helper.UrlError(cause=cause, code=410)
eth0Retries.append(exc_cb("No goal state.", error))
else:
- cause = requests.Timeout('Fake connection timeout')
for _ in range(0, 10):
+ # We are expected to retry for a certain period for both
+ # timeout errors and network unreachable errors.
+ if _ < 5:
+ cause = requests.Timeout('Fake connection timeout')
+ else:
+ cause = requests.ConnectionError('Network Unreachable')
error = url_helper.UrlError(cause=cause)
eth1Retries.append(exc_cb("Connection timeout", error))
# Should stop retrying after 10 retries