From 4f3fe82305b709721a9725b25cbf0ca51b770fc6 Mon Sep 17 00:00:00 2001 From: Chris Patterson Date: Wed, 19 Apr 2023 21:54:04 -0400 Subject: azure/imds: retry fetching metadata up to 300 seconds (#2121) Instead of a fixed number of retries, allow up to 5 minutes to fetch metadata from IMDS. The current approach allows for up to 11 attempts depending on the path. Given the timeout setting, this can vary from ~11 seconds up to ~32 seconds depending on whether or not read/connection timeouts are encountered. Delaying boot on the rare occasion that IMDS is delayed is better than ignoring the metadata as it ensures the VM is configured as expected. This is a very conservative timeout and may be reduced in the future. Signed-off-by: Chris Patterson --- cloudinit/sources/DataSourceAzure.py | 9 ++- cloudinit/sources/azure/imds.py | 54 +++++++++---- tests/unittests/sources/azure/test_imds.py | 121 ++++++++++++++++++++--------- tests/unittests/sources/test_azure.py | 70 +++++++++-------- 4 files changed, 171 insertions(+), 83 deletions(-) diff --git a/cloudinit/sources/DataSourceAzure.py b/cloudinit/sources/DataSourceAzure.py index a2b052b7..d0af2089 100644 --- a/cloudinit/sources/DataSourceAzure.py +++ b/cloudinit/sources/DataSourceAzure.py @@ -673,9 +673,12 @@ class DataSourceAzure(sources.DataSource): return crawled_data @azure_ds_telemetry_reporter - def get_metadata_from_imds(self, retries: int = 10) -> Dict: + def get_metadata_from_imds(self) -> Dict: + retry_deadline = time() + 300 try: - return imds.fetch_metadata_with_api_fallback(retries=retries) + return imds.fetch_metadata_with_api_fallback( + retry_deadline=retry_deadline + ) except (UrlError, ValueError) as error: report_diagnostic_event( "Ignoring IMDS metadata due to: %s" % error, @@ -981,7 +984,7 @@ class DataSourceAzure(sources.DataSource): # Primary nic detection will be optimized in the future. The fact that # primary nic is being attached first helps here. Otherwise each nic # could add several seconds of delay. - imds_md = self.get_metadata_from_imds(retries=300) + imds_md = self.get_metadata_from_imds() if imds_md: # Only primary NIC will get a response from IMDS. LOG.info("%s is the primary nic", ifname) diff --git a/cloudinit/sources/azure/imds.py b/cloudinit/sources/azure/imds.py index 1f5cf008..f0b6911c 100644 --- a/cloudinit/sources/azure/imds.py +++ b/cloudinit/sources/azure/imds.py @@ -2,7 +2,8 @@ # # This file is part of cloud-init. See LICENSE file for license information. -from typing import Dict +from time import time +from typing import Dict, Optional import requests @@ -17,21 +18,31 @@ IMDS_URL = "http://169.254.169.254/metadata" class ReadUrlRetryHandler: + """Manager for readurl retry behavior using exception_callback(). + + :param logging_backoff: Backoff to limit logging. + :param max_connection_errors: Number of connection errors to retry on. + :param retry_codes: Set of http codes to retry on. + :param retry_deadline: Optional time()-based deadline to retry until. + """ + def __init__( self, *, + logging_backoff: float = 1.0, + max_connection_errors: int = 10, retry_codes=( 404, # not found (yet) 410, # gone / unavailable (yet) 429, # rate-limited/throttled 500, # server error ), - max_connection_errors: int = 10, - logging_backoff: float = 1.0, + retry_deadline: Optional[float] = None, ) -> None: self.logging_backoff = logging_backoff self.max_connection_errors = max_connection_errors self.retry_codes = retry_codes + self.retry_deadline = retry_deadline self._logging_threshold = 1.0 self._request_count = 0 @@ -46,7 +57,10 @@ class ReadUrlRetryHandler: return False log = True - retry = True + if self.retry_deadline is not None and time() >= self.retry_deadline: + retry = False + else: + retry = True # Check for connection errors which may occur early boot, but # are otherwise indicative that we are not connecting with the @@ -76,22 +90,30 @@ class ReadUrlRetryHandler: def _fetch_url( - url: str, *, log_response: bool = True, retries: int = 10, timeout: int = 2 + url: str, + *, + retry_deadline: float, + log_response: bool = True, + timeout: int = 2, ) -> bytes: """Fetch URL from IMDS. + :param url: url to fetch. + :param log_response: log responses in readurl(). + :param retry_deadline: time()-based deadline to retry until. + :param timeout: Read/connection timeout in seconds for readurl(). + :raises UrlError: on error fetching metadata. """ - handler = ReadUrlRetryHandler() + handler = ReadUrlRetryHandler(retry_deadline=retry_deadline) try: response = readurl( url, exception_cb=handler.exception_callback, headers={"Metadata": "true"}, - infinite=False, + infinite=True, log_req_resp=log_response, - retries=retries, timeout=timeout, ) except UrlError as error: @@ -106,14 +128,17 @@ def _fetch_url( def _fetch_metadata( url: str, - retries: int = 10, + retry_deadline: float, ) -> Dict: """Fetch IMDS metadata. + :param url: url to fetch. + :param retry_deadline: time()-based deadline to retry until. + :raises UrlError: on error fetching metadata. :raises ValueError: on error parsing metadata. """ - metadata = _fetch_url(url, retries=retries) + metadata = _fetch_url(url, retry_deadline=retry_deadline) try: return util.load_json(metadata) @@ -125,15 +150,17 @@ def _fetch_metadata( raise -def fetch_metadata_with_api_fallback(retries: int = 10) -> Dict: +def fetch_metadata_with_api_fallback(retry_deadline: float) -> Dict: """Fetch extended metadata, falling back to non-extended as required. + :param retry_deadline: time()-based deadline to retry until. + :raises UrlError: on error fetching metadata. :raises ValueError: on error parsing metadata. """ try: url = IMDS_URL + "/instance?api-version=2021-08-01&extended=true" - return _fetch_metadata(url, retries=retries) + return _fetch_metadata(url, retry_deadline=retry_deadline) except UrlError as error: if error.code == 400: report_diagnostic_event( @@ -141,7 +168,7 @@ def fetch_metadata_with_api_fallback(retries: int = 10) -> Dict: logger_func=LOG.warning, ) url = IMDS_URL + "/instance?api-version=2019-06-01" - return _fetch_metadata(url, retries=retries) + return _fetch_metadata(url, retry_deadline=retry_deadline) raise @@ -159,6 +186,7 @@ def fetch_reprovision_data() -> bytes: 404, 410, ), + retry_deadline=None, ) response = readurl( url, diff --git a/tests/unittests/sources/azure/test_imds.py b/tests/unittests/sources/azure/test_imds.py index 9a8aad88..d8fdd563 100644 --- a/tests/unittests/sources/azure/test_imds.py +++ b/tests/unittests/sources/azure/test_imds.py @@ -36,6 +36,22 @@ def mock_requests_session_request(): yield m +@pytest.fixture(autouse=True) +def mock_time(): + with mock.patch.object(imds, "time", autospec=True) as m: + m.time_current = 0.0 + m.time_increment = 1.0 + + def fake_time(): + nonlocal m + current = m.time_current + m.time_current += m.time_increment + return current + + m.side_effect = fake_time + yield m + + @pytest.fixture def mock_url_helper_time_sleep(): with mock.patch("cloudinit.url_helper.time.sleep", autospec=True) as m: @@ -60,13 +76,14 @@ class TestFetchMetadataWithApiFallback: "http://169.254.169.254/metadata/instance?api-version=2019-06-01" ) headers = {"Metadata": "true"} - retries = 10 timeout = 2 + @pytest.mark.parametrize("retry_deadline", [0.0, 1.0, 60.0]) def test_basic( self, caplog, mock_requests_session_request, + retry_deadline, wrapped_readurl, ): fake_md = {"foo": {"bar": []}} @@ -74,7 +91,9 @@ class TestFetchMetadataWithApiFallback: mock.Mock(content=json.dumps(fake_md)), ] - md = imds.fetch_metadata_with_api_fallback() + md = imds.fetch_metadata_with_api_fallback( + retry_deadline=retry_deadline + ) assert md == fake_md assert wrapped_readurl.mock_calls == [ @@ -82,9 +101,8 @@ class TestFetchMetadataWithApiFallback: self.default_url, timeout=self.timeout, headers=self.headers, - retries=self.retries, exception_cb=mock.ANY, - infinite=False, + infinite=True, log_req_resp=True, ) ] @@ -92,7 +110,7 @@ class TestFetchMetadataWithApiFallback: ( "cloudinit.url_helper", logging.DEBUG, - StringMatch(r"\[0/11\] open.*"), + StringMatch(r"\[0/infinite\] open.*"), ), ( "cloudinit.url_helper", @@ -101,10 +119,12 @@ class TestFetchMetadataWithApiFallback: ), ] + @pytest.mark.parametrize("retry_deadline", [0.0, 1.0, 60.0]) def test_basic_fallback( self, caplog, mock_requests_session_request, + retry_deadline, wrapped_readurl, ): fake_md = {"foo": {"bar": []}} @@ -113,7 +133,9 @@ class TestFetchMetadataWithApiFallback: mock.Mock(content=json.dumps(fake_md)), ] - md = imds.fetch_metadata_with_api_fallback() + md = imds.fetch_metadata_with_api_fallback( + retry_deadline=retry_deadline + ) assert md == fake_md assert wrapped_readurl.mock_calls == [ @@ -121,18 +143,16 @@ class TestFetchMetadataWithApiFallback: self.default_url, timeout=self.timeout, headers=self.headers, - retries=self.retries, exception_cb=mock.ANY, - infinite=False, + infinite=True, log_req_resp=True, ), mock.call( self.fallback_url, timeout=self.timeout, headers=self.headers, - retries=self.retries, exception_cb=mock.ANY, - infinite=False, + infinite=True, log_req_resp=True, ), ] @@ -141,7 +161,7 @@ class TestFetchMetadataWithApiFallback: ( "cloudinit.url_helper", logging.DEBUG, - StringMatch(r"\[0/11\] open.*"), + StringMatch(r"\[0/infinite\] open.*"), ), ( LOG_PATH, @@ -156,7 +176,7 @@ class TestFetchMetadataWithApiFallback: ( "cloudinit.url_helper", logging.DEBUG, - StringMatch(r"\[0/11\] open.*"), + StringMatch(r"\[0/infinite\] open.*"), ), ( "cloudinit.url_helper", @@ -176,9 +196,12 @@ class TestFetchMetadataWithApiFallback: requests.Timeout("Fake connection timeout"), ], ) + @pytest.mark.parametrize("max_attempts,retry_deadline", [(2, 1.0)]) def test_will_retry_errors( self, caplog, + max_attempts, + retry_deadline, mock_requests_session_request, mock_url_helper_time_sleep, error, @@ -189,16 +212,18 @@ class TestFetchMetadataWithApiFallback: mock.Mock(content=json.dumps(fake_md)), ] - md = imds.fetch_metadata_with_api_fallback() + md = imds.fetch_metadata_with_api_fallback( + retry_deadline=retry_deadline + ) assert md == fake_md - assert len(mock_requests_session_request.mock_calls) == 2 + assert len(mock_requests_session_request.mock_calls) == max_attempts assert mock_url_helper_time_sleep.mock_calls == [mock.call(1)] assert caplog.record_tuples == [ ( "cloudinit.url_helper", logging.DEBUG, - StringMatch(r"\[0/11\] open.*"), + StringMatch(r"\[0/infinite\] open.*"), ), ( LOG_PATH, @@ -216,7 +241,7 @@ class TestFetchMetadataWithApiFallback: ( "cloudinit.url_helper", logging.DEBUG, - StringMatch(r"\[1/11\] open.*"), + StringMatch(r"\[1/infinite\] open.*"), ), ( "cloudinit.url_helper", @@ -225,9 +250,11 @@ class TestFetchMetadataWithApiFallback: ), ] + @pytest.mark.parametrize("retry_deadline", [3.0, 30.0]) def test_will_retry_errors_on_fallback( self, caplog, + retry_deadline, mock_requests_session_request, mock_url_helper_time_sleep, ): @@ -238,17 +265,20 @@ class TestFetchMetadataWithApiFallback: fake_http_error_for_code(429), mock.Mock(content=json.dumps(fake_md)), ] + max_attempts = len(mock_requests_session_request.side_effect) - md = imds.fetch_metadata_with_api_fallback() + md = imds.fetch_metadata_with_api_fallback( + retry_deadline=retry_deadline + ) assert md == fake_md - assert len(mock_requests_session_request.mock_calls) == 3 + assert len(mock_requests_session_request.mock_calls) == max_attempts assert mock_url_helper_time_sleep.mock_calls == [mock.call(1)] assert caplog.record_tuples == [ ( "cloudinit.url_helper", logging.DEBUG, - StringMatch(r"\[0/11\] open.*"), + StringMatch(r"\[0/infinite\] open.*"), ), ( LOG_PATH, @@ -271,7 +301,7 @@ class TestFetchMetadataWithApiFallback: ( "cloudinit.url_helper", logging.DEBUG, - StringMatch(r"\[0/11\] open.*"), + StringMatch(r"\[0/infinite\] open.*"), ), ( LOG_PATH, @@ -289,7 +319,7 @@ class TestFetchMetadataWithApiFallback: ( "cloudinit.url_helper", logging.DEBUG, - StringMatch(r"\[1/11\] open.*"), + StringMatch(r"\[1/infinite\] open.*"), ), ( "cloudinit.url_helper", @@ -309,25 +339,37 @@ class TestFetchMetadataWithApiFallback: requests.Timeout("Fake connection timeout"), ], ) + @pytest.mark.parametrize( + "max_attempts,retry_deadline", [(1, 0.0), (2, 1.0), (301, 300.0)] + ) def test_retry_until_failure( self, + error, + max_attempts, + retry_deadline, caplog, mock_requests_session_request, mock_url_helper_time_sleep, - error, ): - mock_requests_session_request.side_effect = [error] * (11) + mock_requests_session_request.side_effect = error with pytest.raises(UrlError) as exc_info: - imds.fetch_metadata_with_api_fallback() + imds.fetch_metadata_with_api_fallback( + retry_deadline=retry_deadline + ) assert exc_info.value.cause == error - assert len(mock_requests_session_request.mock_calls) == ( - self.retries + 1 + + # Connection errors max out at 11 attempts. + max_attempts = ( + 11 + if isinstance(error, requests.ConnectionError) + and max_attempts > 11 + else max_attempts ) - assert ( - mock_url_helper_time_sleep.mock_calls - == [mock.call(1)] * self.retries + assert len(mock_requests_session_request.mock_calls) == (max_attempts) + assert mock_url_helper_time_sleep.mock_calls == [mock.call(1)] * ( + max_attempts - 1 ) logs = [x for x in caplog.record_tuples if x[0] == LOG_PATH] @@ -340,7 +382,7 @@ class TestFetchMetadataWithApiFallback: f".*{error!s}.*" ), ) - for i in range(1, 12) + for i in range(1, max_attempts + 1) ] + [ ( LOG_PATH, @@ -356,12 +398,14 @@ class TestFetchMetadataWithApiFallback: fake_http_error_for_code(501), ], ) + @pytest.mark.parametrize("retry_deadline", [0.0, 1.0, 60.0]) def test_will_not_retry_errors( self, + error, + retry_deadline, caplog, mock_requests_session_request, mock_url_helper_time_sleep, - error, ): fake_md = {"foo": {"bar": []}} mock_requests_session_request.side_effect = [ @@ -370,7 +414,9 @@ class TestFetchMetadataWithApiFallback: ] with pytest.raises(UrlError) as exc_info: - imds.fetch_metadata_with_api_fallback() + imds.fetch_metadata_with_api_fallback( + retry_deadline=retry_deadline + ) assert exc_info.value.cause == error assert len(mock_requests_session_request.mock_calls) == 1 @@ -380,7 +426,7 @@ class TestFetchMetadataWithApiFallback: ( "cloudinit.url_helper", logging.DEBUG, - StringMatch(r"\[0/11\] open.*"), + StringMatch(r"\[0/infinite\] open.*"), ), ( LOG_PATH, @@ -397,8 +443,10 @@ class TestFetchMetadataWithApiFallback: ), ] + @pytest.mark.parametrize("retry_deadline", [0.0, 1.0, 60.0]) def test_non_json_repsonse( self, + retry_deadline, caplog, mock_requests_session_request, wrapped_readurl, @@ -408,16 +456,17 @@ class TestFetchMetadataWithApiFallback: ] with pytest.raises(ValueError): - imds.fetch_metadata_with_api_fallback() + imds.fetch_metadata_with_api_fallback( + retry_deadline=retry_deadline + ) assert wrapped_readurl.mock_calls == [ mock.call( self.default_url, timeout=self.timeout, headers=self.headers, - retries=self.retries, exception_cb=mock.ANY, - infinite=False, + infinite=True, log_req_resp=True, ), ] diff --git a/tests/unittests/sources/test_azure.py b/tests/unittests/sources/test_azure.py index 1c43aa66..b3f7b1bc 100644 --- a/tests/unittests/sources/test_azure.py +++ b/tests/unittests/sources/test_azure.py @@ -2805,8 +2805,10 @@ class TestPreprovisioningHotAttachNics(CiTestCase): @mock.patch("cloudinit.url_helper.time.sleep", autospec=True) @mock.patch("requests.Session.request", autospec=True) @mock.patch(MOCKPATH + "EphemeralDHCPv4", autospec=True) + @mock.patch(MOCKPATH + "time", autospec=True) + @mock.patch("cloudinit.sources.azure.imds.time", autospec=True) def test_check_if_nic_is_primary_retries_on_failures( - self, m_dhcpv4, m_request, m_sleep + self, m_imds_time, m_time, m_dhcpv4, m_request, m_sleep ): """Retry polling for network metadata on all failures except timeout and network unreachable errors""" @@ -2822,27 +2824,45 @@ class TestPreprovisioningHotAttachNics(CiTestCase): } m_req = mock.Mock(content=json.dumps({"not": "empty"})) - m_request.side_effect = ( + errors = ( [requests.Timeout("Fake connection timeout")] * 5 + [requests.ConnectionError("Fake Network Unreachable")] * 5 - + 290 * [fake_http_error_for_code(410)] + + [fake_http_error_for_code(410)] * 5 + [m_req] ) + m_request.side_effect = errors m_dhcpv4.return_value.lease = lease + fake_time = 0.0 + fake_time_increment = 5.0 + + def fake_timer(): + nonlocal fake_time, fake_time_increment + fake_time += fake_time_increment + return fake_time + + m_time.side_effect = fake_timer + m_imds_time.side_effect = fake_timer is_primary = dsa._check_if_nic_is_primary("eth0") self.assertEqual(True, is_primary) - assert len(m_request.mock_calls) == 301 + assert len(m_request.mock_calls) == len(errors) # Re-run tests to verify max http failures. + attempts = 3 + fake_time = 0.0 + fake_time_increment = 300 / attempts + + m_time.side_effect = fake_timer + m_imds_time.side_effect = fake_timer m_request.reset_mock() - m_request.side_effect = 305 * [fake_http_error_for_code(410)] + errors = 100 * [fake_http_error_for_code(410)] + m_request.side_effect = errors dsa = dsaz.DataSourceAzure({}, distro=distro, paths=self.paths) is_primary = dsa._check_if_nic_is_primary("eth1") self.assertEqual(False, is_primary) - assert len(m_request.mock_calls) == 301 + assert len(m_request.mock_calls) == attempts # Re-run tests to verify max connection error retries. m_request.reset_mock() @@ -2854,7 +2874,7 @@ class TestPreprovisioningHotAttachNics(CiTestCase): is_primary = dsa._check_if_nic_is_primary("eth1") self.assertEqual(False, is_primary) - assert len(m_request.mock_calls) == 11 + assert len(m_request.mock_calls) == attempts @mock.patch("cloudinit.distros.networking.LinuxNetworking.try_set_link_up") def test_wait_for_link_up_returns_if_already_up(self, m_is_link_up): @@ -3531,9 +3551,8 @@ class TestProvisioning: "api-version=2021-08-01&extended=true", timeout=2, headers={"Metadata": "true"}, - retries=10, exception_cb=mock.ANY, - infinite=False, + infinite=True, log_req_resp=True, ), ] @@ -3601,9 +3620,8 @@ class TestProvisioning: "api-version=2021-08-01&extended=true", exception_cb=mock.ANY, headers={"Metadata": "true"}, - infinite=False, + infinite=True, log_req_resp=True, - retries=10, timeout=2, ), mock.call( @@ -3620,9 +3638,8 @@ class TestProvisioning: "api-version=2021-08-01&extended=true", exception_cb=mock.ANY, headers={"Metadata": "true"}, - infinite=False, + infinite=True, log_req_resp=True, - retries=10, timeout=2, ), ] @@ -3712,9 +3729,8 @@ class TestProvisioning: "api-version=2021-08-01&extended=true", exception_cb=mock.ANY, headers={"Metadata": "true"}, - infinite=False, + infinite=True, log_req_resp=True, - retries=10, timeout=2, ), mock.call( @@ -3722,9 +3738,8 @@ class TestProvisioning: "api-version=2021-08-01&extended=true", exception_cb=mock.ANY, headers={"Metadata": "true"}, - infinite=False, + infinite=True, log_req_resp=True, - retries=300, timeout=2, ), mock.call( @@ -3741,9 +3756,8 @@ class TestProvisioning: "api-version=2021-08-01&extended=true", exception_cb=mock.ANY, headers={"Metadata": "true"}, - infinite=False, + infinite=True, log_req_resp=True, - retries=10, timeout=2, ), ] @@ -3871,9 +3885,8 @@ class TestProvisioning: "api-version=2021-08-01&extended=true", exception_cb=mock.ANY, headers={"Metadata": "true"}, - infinite=False, + infinite=True, log_req_resp=True, - retries=10, timeout=2, ), mock.call( @@ -3881,9 +3894,8 @@ class TestProvisioning: "api-version=2021-08-01&extended=true", exception_cb=mock.ANY, headers={"Metadata": "true"}, - infinite=False, + infinite=True, log_req_resp=True, - retries=300, timeout=2, ), mock.call( @@ -3900,9 +3912,8 @@ class TestProvisioning: "api-version=2021-08-01&extended=true", exception_cb=mock.ANY, headers={"Metadata": "true"}, - infinite=False, + infinite=True, log_req_resp=True, - retries=10, timeout=2, ), ] @@ -3988,9 +3999,8 @@ class TestProvisioning: "api-version=2021-08-01&extended=true", exception_cb=mock.ANY, headers={"Metadata": "true"}, - infinite=False, + infinite=True, log_req_resp=True, - retries=10, timeout=2, ), mock.call( @@ -4007,9 +4017,8 @@ class TestProvisioning: "api-version=2021-08-01&extended=true", exception_cb=mock.ANY, headers={"Metadata": "true"}, - infinite=False, + infinite=True, log_req_resp=True, - retries=10, timeout=2, ), ] @@ -4108,9 +4117,8 @@ class TestProvisioning: "api-version=2021-08-01&extended=true", exception_cb=mock.ANY, headers={"Metadata": "true"}, - infinite=False, + infinite=True, log_req_resp=True, - retries=10, timeout=2, ), ] -- cgit v1.2.1