summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorChad Smith <chad.smith@canonical.com>2017-11-30 13:09:30 -0700
committerScott Moser <smoser@brickies.net>2017-11-30 21:51:14 -0500
commit7acc9e68fafbbd7c56587aebe752ba6ba8c8a3db (patch)
tree521cc4ebca1fb7b66f6e1dba97e09b4861bb72d7
parent88368f9851b29dddb5a12e4b21868cbdef906c5c (diff)
downloadcloud-init-git-7acc9e68fafbbd7c56587aebe752ba6ba8c8a3db.tar.gz
ec2: Fix sandboxed dhclient background process cleanup.
There is a race condition where our sandboxed dhclient properly writes a lease file but has not yet written a pid file. If the sandbox temporary directory is torn down before the dhclient subprocess writes a pidfile DataSourceEc2Local gets a traceback and the instance will fallback to DataSourceEc2 in the init-network stage. This wastes boot cycles we'd rather not spend. Fix handling of sandboxed dhclient to wait for both pidfile and leasefile before proceding. If either file doesn't show in 5 seconds, log a warning and return empty lease results {}. LP: #1735331
-rw-r--r--cloudinit/net/dhcp.py44
-rw-r--r--cloudinit/net/tests/test_dhcp.py66
-rw-r--r--cloudinit/sources/DataSourceAzure.py29
-rw-r--r--cloudinit/util.py22
-rw-r--r--tests/unittests/test_datasource/test_azure.py5
5 files changed, 118 insertions, 48 deletions
diff --git a/cloudinit/net/dhcp.py b/cloudinit/net/dhcp.py
index d8624d82..875a4609 100644
--- a/cloudinit/net/dhcp.py
+++ b/cloudinit/net/dhcp.py
@@ -36,22 +36,23 @@ def maybe_perform_dhcp_discovery(nic=None):
skip dhcp_discovery and return an empty dict.
@param nic: Name of the network interface we want to run dhclient on.
- @return: A dict of dhcp options from the dhclient discovery if run,
- otherwise an empty dict is returned.
+ @return: A list of dicts representing dhcp options for each lease obtained
+ from the dhclient discovery if run, otherwise an empty list is
+ returned.
"""
if nic is None:
nic = find_fallback_nic()
if nic is None:
LOG.debug('Skip dhcp_discovery: Unable to find fallback nic.')
- return {}
+ return []
elif nic not in get_devicelist():
LOG.debug(
'Skip dhcp_discovery: nic %s not found in get_devicelist.', nic)
- return {}
+ return []
dhclient_path = util.which('dhclient')
if not dhclient_path:
LOG.debug('Skip dhclient configuration: No dhclient command found.')
- return {}
+ return []
with temp_utils.tempdir(prefix='cloud-init-dhcp-', needs_exe=True) as tdir:
# Use /var/tmp because /run/cloud-init/tmp is mounted noexec
return dhcp_discovery(dhclient_path, nic, tdir)
@@ -60,8 +61,8 @@ def maybe_perform_dhcp_discovery(nic=None):
def parse_dhcp_lease_file(lease_file):
"""Parse the given dhcp lease file for the most recent lease.
- Return a dict of dhcp options as key value pairs for the most recent lease
- block.
+ Return a list of dicts of dhcp options. Each dict contains key value pairs
+ a specific lease in order from oldest to newest.
@raises: InvalidDHCPLeaseFileError on empty of unparseable leasefile
content.
@@ -96,8 +97,8 @@ def dhcp_discovery(dhclient_cmd_path, interface, cleandir):
@param cleandir: The directory from which to run dhclient as well as store
dhcp leases.
- @return: A dict of dhcp options parsed from the dhcp.leases file or empty
- dict.
+ @return: A list of dicts of representing the dhcp leases parsed from the
+ dhcp.leases file or empty list.
"""
LOG.debug('Performing a dhcp discovery on %s', interface)
@@ -119,13 +120,26 @@ def dhcp_discovery(dhclient_cmd_path, interface, cleandir):
cmd = [sandbox_dhclient_cmd, '-1', '-v', '-lf', lease_file,
'-pf', pid_file, interface, '-sf', '/bin/true']
util.subp(cmd, capture=True)
- pid = None
+
+ # dhclient doesn't write a pid file until after it forks when it gets a
+ # proper lease response. Since cleandir is a temp directory that gets
+ # removed, we need to wait for that pidfile creation before the
+ # cleandir is removed, otherwise we get FileNotFound errors.
+ missing = util.wait_for_files(
+ [pid_file, lease_file], maxwait=5, naplen=0.01)
+ if missing:
+ LOG.warning("dhclient did not produce expected files: %s",
+ ', '.join(os.path.basename(f) for f in missing))
+ return []
+ pid_content = util.load_file(pid_file).strip()
try:
- pid = int(util.load_file(pid_file).strip())
- return parse_dhcp_lease_file(lease_file)
- finally:
- if pid:
- os.kill(pid, signal.SIGKILL)
+ pid = int(pid_content)
+ except ValueError:
+ LOG.debug(
+ "pid file contains non-integer content '%s'", pid_content)
+ else:
+ os.kill(pid, signal.SIGKILL)
+ return parse_dhcp_lease_file(lease_file)
def networkd_parse_lease(content):
diff --git a/cloudinit/net/tests/test_dhcp.py b/cloudinit/net/tests/test_dhcp.py
index 3d8e15c0..db25b6f2 100644
--- a/cloudinit/net/tests/test_dhcp.py
+++ b/cloudinit/net/tests/test_dhcp.py
@@ -1,6 +1,5 @@
# This file is part of cloud-init. See LICENSE file for license information.
-import mock
import os
import signal
from textwrap import dedent
@@ -9,7 +8,8 @@ from cloudinit.net.dhcp import (
InvalidDHCPLeaseFileError, maybe_perform_dhcp_discovery,
parse_dhcp_lease_file, dhcp_discovery, networkd_load_leases)
from cloudinit.util import ensure_file, write_file
-from cloudinit.tests.helpers import CiTestCase, wrap_and_call, populate_dir
+from cloudinit.tests.helpers import (
+ CiTestCase, mock, populate_dir, wrap_and_call)
class TestParseDHCPLeasesFile(CiTestCase):
@@ -69,14 +69,14 @@ class TestDHCPDiscoveryClean(CiTestCase):
def test_no_fallback_nic_found(self, m_fallback_nic):
"""Log and do nothing when nic is absent and no fallback is found."""
m_fallback_nic.return_value = None # No fallback nic found
- self.assertEqual({}, maybe_perform_dhcp_discovery())
+ self.assertEqual([], maybe_perform_dhcp_discovery())
self.assertIn(
'Skip dhcp_discovery: Unable to find fallback nic.',
self.logs.getvalue())
def test_provided_nic_does_not_exist(self):
"""When the provided nic doesn't exist, log a message and no-op."""
- self.assertEqual({}, maybe_perform_dhcp_discovery('idontexist'))
+ self.assertEqual([], maybe_perform_dhcp_discovery('idontexist'))
self.assertIn(
'Skip dhcp_discovery: nic idontexist not found in get_devicelist.',
self.logs.getvalue())
@@ -87,7 +87,7 @@ class TestDHCPDiscoveryClean(CiTestCase):
"""When dhclient doesn't exist in the OS, log the issue and no-op."""
m_fallback.return_value = 'eth9'
m_which.return_value = None # dhclient isn't found
- self.assertEqual({}, maybe_perform_dhcp_discovery())
+ self.assertEqual([], maybe_perform_dhcp_discovery())
self.assertIn(
'Skip dhclient configuration: No dhclient command found.',
self.logs.getvalue())
@@ -117,6 +117,62 @@ class TestDHCPDiscoveryClean(CiTestCase):
@mock.patch('cloudinit.net.dhcp.os.kill')
@mock.patch('cloudinit.net.dhcp.util.subp')
+ def test_dhcp_discovery_run_in_sandbox_warns_invalid_pid(self, m_subp,
+ m_kill):
+ """dhcp_discovery logs a warning when pidfile contains invalid content.
+
+ Lease processing still occurs and no proc kill is attempted.
+ """
+ tmpdir = self.tmp_dir()
+ dhclient_script = os.path.join(tmpdir, 'dhclient.orig')
+ script_content = '#!/bin/bash\necho fake-dhclient'
+ write_file(dhclient_script, script_content, mode=0o755)
+ write_file(self.tmp_path('dhclient.pid', tmpdir), '') # Empty pid ''
+ lease_content = dedent("""
+ lease {
+ interface "eth9";
+ fixed-address 192.168.2.74;
+ option subnet-mask 255.255.255.0;
+ option routers 192.168.2.1;
+ }
+ """)
+ write_file(self.tmp_path('dhcp.leases', tmpdir), lease_content)
+
+ self.assertItemsEqual(
+ [{'interface': 'eth9', 'fixed-address': '192.168.2.74',
+ 'subnet-mask': '255.255.255.0', 'routers': '192.168.2.1'}],
+ dhcp_discovery(dhclient_script, 'eth9', tmpdir))
+ self.assertIn(
+ "pid file contains non-integer content ''", self.logs.getvalue())
+ m_kill.assert_not_called()
+
+ @mock.patch('cloudinit.net.dhcp.os.kill')
+ @mock.patch('cloudinit.net.dhcp.util.wait_for_files')
+ @mock.patch('cloudinit.net.dhcp.util.subp')
+ def test_dhcp_discovery_run_in_sandbox_waits_on_lease_and_pid(self,
+ m_subp,
+ m_wait,
+ m_kill):
+ """dhcp_discovery waits for the presence of pidfile and dhcp.leases."""
+ tmpdir = self.tmp_dir()
+ dhclient_script = os.path.join(tmpdir, 'dhclient.orig')
+ script_content = '#!/bin/bash\necho fake-dhclient'
+ write_file(dhclient_script, script_content, mode=0o755)
+ # Don't create pid or leases file
+ pidfile = self.tmp_path('dhclient.pid', tmpdir)
+ leasefile = self.tmp_path('dhcp.leases', tmpdir)
+ m_wait.return_value = [pidfile] # Return the missing pidfile wait for
+ self.assertEqual([], dhcp_discovery(dhclient_script, 'eth9', tmpdir))
+ self.assertEqual(
+ mock.call([pidfile, leasefile], maxwait=5, naplen=0.01),
+ m_wait.call_args_list[0])
+ self.assertIn(
+ 'WARNING: dhclient did not produce expected files: dhclient.pid',
+ self.logs.getvalue())
+ m_kill.assert_not_called()
+
+ @mock.patch('cloudinit.net.dhcp.os.kill')
+ @mock.patch('cloudinit.net.dhcp.util.subp')
def test_dhcp_discovery_run_in_sandbox(self, m_subp, m_kill):
"""dhcp_discovery brings up the interface and runs dhclient.
diff --git a/cloudinit/sources/DataSourceAzure.py b/cloudinit/sources/DataSourceAzure.py
index 8c3492d9..14367e9c 100644
--- a/cloudinit/sources/DataSourceAzure.py
+++ b/cloudinit/sources/DataSourceAzure.py
@@ -11,7 +11,6 @@ from functools import partial
import os
import os.path
import re
-import time
from xml.dom import minidom
import xml.etree.ElementTree as ET
@@ -321,7 +320,7 @@ class DataSourceAzure(sources.DataSource):
# https://bugs.launchpad.net/cloud-init/+bug/1717611
missing = util.log_time(logfunc=LOG.debug,
msg="waiting for SSH public key files",
- func=wait_for_files,
+ func=util.wait_for_files,
args=(fp_files, 900))
if len(missing):
@@ -556,8 +555,8 @@ def address_ephemeral_resize(devpath=RESOURCE_DISK_PATH, maxwait=120,
is_new_instance=False):
# wait for ephemeral disk to come up
naplen = .2
- missing = wait_for_files([devpath], maxwait=maxwait, naplen=naplen,
- log_pre="Azure ephemeral disk: ")
+ missing = util.wait_for_files([devpath], maxwait=maxwait, naplen=naplen,
+ log_pre="Azure ephemeral disk: ")
if missing:
LOG.warning("ephemeral device '%s' did not appear after %d seconds.",
@@ -639,28 +638,6 @@ def pubkeys_from_crt_files(flist):
return pubkeys
-def wait_for_files(flist, maxwait, naplen=.5, log_pre=""):
- need = set(flist)
- waited = 0
- while True:
- need -= set([f for f in need if os.path.exists(f)])
- if len(need) == 0:
- LOG.debug("%sAll files appeared after %s seconds: %s",
- log_pre, waited, flist)
- return []
- if waited == 0:
- LOG.info("%sWaiting up to %s seconds for the following files: %s",
- log_pre, maxwait, flist)
- if waited + naplen > maxwait:
- break
- time.sleep(naplen)
- waited += naplen
-
- LOG.warning("%sStill missing files after %s seconds: %s",
- log_pre, maxwait, need)
- return need
-
-
def write_files(datadir, files, dirmode=None):
def _redact_password(cnt, fname):
diff --git a/cloudinit/util.py b/cloudinit/util.py
index e1290aa8..6c014ba5 100644
--- a/cloudinit/util.py
+++ b/cloudinit/util.py
@@ -2541,4 +2541,26 @@ def load_shell_content(content, add_empty=False, empty_val=None):
return data
+def wait_for_files(flist, maxwait, naplen=.5, log_pre=""):
+ need = set(flist)
+ waited = 0
+ while True:
+ need -= set([f for f in need if os.path.exists(f)])
+ if len(need) == 0:
+ LOG.debug("%sAll files appeared after %s seconds: %s",
+ log_pre, waited, flist)
+ return []
+ if waited == 0:
+ LOG.debug("%sWaiting up to %s seconds for the following files: %s",
+ log_pre, maxwait, flist)
+ if waited + naplen > maxwait:
+ break
+ time.sleep(naplen)
+ waited += naplen
+
+ LOG.debug("%sStill missing files after %s seconds: %s",
+ log_pre, maxwait, need)
+ return need
+
+
# vi: ts=4 expandtab
diff --git a/tests/unittests/test_datasource/test_azure.py b/tests/unittests/test_datasource/test_azure.py
index 0a117771..7cb1812a 100644
--- a/tests/unittests/test_datasource/test_azure.py
+++ b/tests/unittests/test_datasource/test_azure.py
@@ -171,7 +171,6 @@ scbus-1 on xpt0 bus 0
self.apply_patches([
(dsaz, 'list_possible_azure_ds_devs', dsdevs),
(dsaz, 'invoke_agent', _invoke_agent),
- (dsaz, 'wait_for_files', _wait_for_files),
(dsaz, 'pubkeys_from_crt_files', _pubkeys_from_crt_files),
(dsaz, 'perform_hostname_bounce', mock.MagicMock()),
(dsaz, 'get_hostname', mock.MagicMock()),
@@ -179,6 +178,8 @@ scbus-1 on xpt0 bus 0
(dsaz, 'get_metadata_from_fabric', self.get_metadata_from_fabric),
(dsaz.util, 'read_dmi_data', mock.MagicMock(
side_effect=_dmi_mocks)),
+ (dsaz.util, 'wait_for_files', mock.MagicMock(
+ side_effect=_wait_for_files)),
])
dsrc = dsaz.DataSourceAzure(
@@ -647,7 +648,7 @@ class TestAzureBounce(TestCase):
self.patches.enter_context(
mock.patch.object(dsaz, 'invoke_agent'))
self.patches.enter_context(
- mock.patch.object(dsaz, 'wait_for_files'))
+ mock.patch.object(dsaz.util, 'wait_for_files'))
self.patches.enter_context(
mock.patch.object(dsaz, 'list_possible_azure_ds_devs',
mock.MagicMock(return_value=[])))