summary | refs | log | tree | commit | diff
path: root/cloudinit
diff options
context:
space:
mode:
author: Chris Patterson <cpatterson@microsoft.com> 2023-03-29 15:26:39 -0400
committer: GitHub <noreply@github.com> 2023-03-29 14:26:39 -0500
commit: 4fbf5317d7f0a7b33a14f94f85667c962a3c879e (patch)
tree: 5977937da17c19ae49d728ca276faebea83dd001 /cloudinit
parent: d6ac22e1a8a4a81bcb28b137f33b5afc6ba81389 (diff)
download: cloud-init-git-4fbf5317d7f0a7b33a14f94f85667c962a3c879e.tar.gz
sources/azure: move pps handling out of _poll_imds() (#2075)
Pull out remaining PPS handling bits from _poll_imds() and add two explicit methods for the overloaded path:

- _wait_for_pps_running_reuse() for running PPS logic.
- _wait_for_pps_unknown_reuse() for unknown and recovery PPS logic.

For consistency:

- Rename _wait_for_all_nics_ready() -> _wait_for_pps_savable_reuse().
- Move reporting ready logic into _wait_for_pps_os_disk_shutdown().

Drop several impacted tests as coverage already exists in TestProvisioning, and update the rest to handle the +/- 1 DHCP attempt due to varying assumptions around PPS state and DHCP.
Diffstat (limited to 'cloudinit')
-rw-r--r-- cloudinit/sources/DataSourceAzure.py 168
1 files changed, 83 insertions, 85 deletions
diff --git a/cloudinit/sources/DataSourceAzure.py b/cloudinit/sources/DataSourceAzure.py
index 807c02c7..927e8cf0 100644
--- a/cloudinit/sources/DataSourceAzure.py
+++ b/cloudinit/sources/DataSourceAzure.py
@@ -9,6 +9,7 @@ import crypt
import os
import os.path
import re
+import socket
import xml.etree.ElementTree as ET
from enum import Enum
from pathlib import Path
@@ -571,11 +572,14 @@ class DataSourceAzure(sources.DataSource):
report_diagnostic_event(msg, logger_func=LOG.error)
raise sources.InvalidMetaDataException(msg)
- if pps_type == PPSType.SAVABLE:
- self._wait_for_all_nics_ready()
+ if pps_type == PPSType.RUNNING:
+ self._wait_for_pps_running_reuse()
+ elif pps_type == PPSType.SAVABLE:
+ self._wait_for_pps_savable_reuse()
elif pps_type == PPSType.OS_DISK:
- self._report_ready_for_pps(create_marker=False)
self._wait_for_pps_os_disk_shutdown()
+ else:
+ self._wait_for_pps_unknown_reuse()
md, userdata_raw, cfg, files = self._reprovision()
# fetch metadata again as it has changed after reprovisioning
@@ -974,15 +978,6 @@ class DataSourceAzure(sources.DataSource):
self._create_report_ready_marker()
@azure_ds_telemetry_reporter
- def _wait_for_pps_os_disk_shutdown(self):
- report_diagnostic_event(
- "Waiting for host to shutdown VM...",
- logger_func=LOG.info,
- )
- sleep(31536000)
- raise BrokenAzureDataSource("Shutdown failure for PPS disk.")
-
- @azure_ds_telemetry_reporter
def _check_if_nic_is_primary(self, ifname: str) -> bool:
"""Check if a given interface is the primary nic or not."""
# For now, only a VM's primary NIC can contact IMDS and WireServer. If
@@ -1060,16 +1055,71 @@ class DataSourceAzure(sources.DataSource):
report_diagnostic_event(str(error), logger_func=LOG.error)
@azure_ds_telemetry_reporter
- def _wait_for_all_nics_ready(self):
- """Wait for nic(s) to be hot-attached. There may be multiple nics
- depending on the customer request.
- But only primary nic would be able to communicate with wireserver
- and IMDS. So we detect and save the primary nic to be used later.
- """
+ def _create_bound_netlink_socket(self) -> socket.socket:
+ try:
+ return netlink.create_bound_netlink_socket()
+ except netlink.NetlinkCreateSocketError as error:
+ report_diagnostic_event(
+ f"Failed to create netlink socket: {error}",
+ logger_func=LOG.error,
+ )
+ raise
+
+ @azure_ds_telemetry_reporter
+ def _wait_for_pps_os_disk_shutdown(self):
+ """Report ready and wait for host to initiate shutdown."""
+ self._report_ready_for_pps(create_marker=False)
+
+ report_diagnostic_event(
+ "Waiting for host to shutdown VM...",
+ logger_func=LOG.info,
+ )
+ sleep(31536000)
+ raise BrokenAzureDataSource("Shutdown failure for PPS disk.")
+
+ @azure_ds_telemetry_reporter
+ def _wait_for_pps_running_reuse(self) -> None:
+ """Report ready and wait for nic link to switch upon re-use."""
+ nl_sock = self._create_bound_netlink_socket()
+
+ try:
+ if (
+ self._ephemeral_dhcp_ctx is None
+ or self._ephemeral_dhcp_ctx.iface is None
+ ):
+ raise RuntimeError("missing ephemeral context")
+
+ iface = self._ephemeral_dhcp_ctx.iface
+ self._report_ready_for_pps()
+
+ LOG.debug(
+ "Wait for vnetswitch to happen on %s",
+ iface,
+ )
+ with events.ReportEventStack(
+ name="wait-for-media-disconnect-connect",
+ description="wait for vnet switch",
+ parent=azure_ds_reporter,
+ ):
+ try:
+ netlink.wait_for_media_disconnect_connect(nl_sock, iface)
+ except AssertionError as e:
+ report_diagnostic_event(
+ "Error while waiting for vnet switch: %s" % e,
+ logger_func=LOG.error,
+ )
+ finally:
+ nl_sock.close()
+
+ # Teardown source PPS network configuration.
+ self._teardown_ephemeral_networking()
+
+ @azure_ds_telemetry_reporter
+ def _wait_for_pps_savable_reuse(self):
+ """Report ready and wait for nic(s) to be hot-attached upon re-use."""
+ nl_sock = self._create_bound_netlink_socket()
- nl_sock = None
try:
- nl_sock = netlink.create_bound_netlink_socket()
self._report_ready_for_pps(expect_url_error=True)
try:
self._teardown_ephemeral_networking()
@@ -1083,76 +1133,25 @@ class DataSourceAzure(sources.DataSource):
self._wait_for_nic_detach(nl_sock)
self._wait_for_hot_attached_primary_nic(nl_sock)
- except netlink.NetlinkCreateSocketError as e:
- report_diagnostic_event(str(e), logger_func=LOG.warning)
- raise
finally:
- if nl_sock:
- nl_sock.close()
+ nl_sock.close()
@azure_ds_telemetry_reporter
- def _poll_imds(self):
- """Poll IMDS for the new provisioning data until we get a valid
- response. Then return the returned JSON object."""
- nl_sock = None
- report_ready = bool(
- not os.path.isfile(self._reported_ready_marker_file)
- )
- dhcp_attempts = 0
-
- if report_ready:
- try:
- if (
- self._ephemeral_dhcp_ctx is None
- or self._ephemeral_dhcp_ctx.iface is None
- ):
- raise RuntimeError("Missing ephemeral context")
- iface = self._ephemeral_dhcp_ctx.iface
+ def _wait_for_pps_unknown_reuse(self):
+ """Report ready if needed for unknown/recovery PPS."""
+ if os.path.isfile(self._reported_ready_marker_file):
+ # Already reported ready, nothing to do.
+ return
- nl_sock = netlink.create_bound_netlink_socket()
- self._report_ready_for_pps()
+ self._report_ready_for_pps()
- LOG.debug(
- "Wait for vnetswitch to happen on %s",
- iface,
- )
- with events.ReportEventStack(
- name="wait-for-media-disconnect-connect",
- description="wait for vnet switch",
- parent=azure_ds_reporter,
- ):
- try:
- netlink.wait_for_media_disconnect_connect(
- nl_sock, iface
- )
- except AssertionError as e:
- report_diagnostic_event(
- "Error while waiting for vnet switch: %s" % e,
- logger_func=LOG.error,
- )
- except netlink.NetlinkCreateSocketError as e:
- report_diagnostic_event(
- "Failed to create bound netlink socket: %s" % e,
- logger_func=LOG.warning,
- )
- raise sources.InvalidMetaDataException(
- "Failed to report ready while in provisioning pool."
- ) from e
- except NoDHCPLeaseError as e:
- report_diagnostic_event(
- "DHCP failed while in provisioning pool",
- logger_func=LOG.warning,
- )
- raise sources.InvalidMetaDataException(
- "Failed to report ready while in provisioning pool."
- ) from e
- finally:
- if nl_sock:
- nl_sock.close()
-
- # Teardown old network configuration.
- self._teardown_ephemeral_networking()
+ # Teardown source PPS network configuration.
+ self._teardown_ephemeral_networking()
+ @azure_ds_telemetry_reporter
+ def _poll_imds(self) -> bytes:
+ """Poll IMDs for reprovisiondata XML document data."""
+ dhcp_attempts = 0
reprovision_data = None
while not reprovision_data:
if not self._is_ephemeral_networking_up():
@@ -1177,7 +1176,6 @@ class DataSourceAzure(sources.DataSource):
"attempted dhcp %d times after reuse" % dhcp_attempts,
logger_func=LOG.debug,
)
-
return reprovision_data
@azure_ds_telemetry_reporter