summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorZuul <zuul@review.opendev.org>2021-02-12 18:11:57 +0000
committerGerrit Code Review <review@openstack.org>2021-02-12 18:11:57 +0000
commit52ff615c987936a441dd9dee1fcbc3931045033c (patch)
treeb2f074bebd7c5974109d9439809935102838dfb8
parent766d8f11b4101570829788ed11b2ab3fef85e3f8 (diff)
parentd9913370de4046f8818cfbf47fd775a50275d0a1 (diff)
downloadironic-52ff615c987936a441dd9dee1fcbc3931045033c.tar.gz
Merge "Guard conductor from consuming all of the ram"
-rw-r--r--devstack/lib/ironic3
-rw-r--r--doc/source/admin/troubleshooting.rst43
-rw-r--r--ironic/common/exception.py5
-rw-r--r--ironic/common/images.py2
-rw-r--r--ironic/common/utils.py60
-rw-r--r--ironic/conf/default.py26
-rw-r--r--ironic/drivers/modules/iscsi_deploy.py14
-rw-r--r--ironic/tests/unit/common/test_utils.py45
-rw-r--r--ironic/tests/unit/drivers/modules/test_image_cache.py26
-rw-r--r--ironic/tests/unit/drivers/modules/test_iscsi_deploy.py20
-rw-r--r--releasenotes/notes/limit-memory-consumption-c7949a49853ba83d.yaml23
11 files changed, 266 insertions, 1 deletions
diff --git a/devstack/lib/ironic b/devstack/lib/ironic
index 69f26b8fd..3eaea3d88 100644
--- a/devstack/lib/ironic
+++ b/devstack/lib/ironic
@@ -1415,6 +1415,9 @@ function configure_ironic {
iniset $IRONIC_CONF_FILE DEFAULT use_syslog $SYSLOG
# NOTE(vsaienko) with multinode each conductor should have its own host.
iniset $IRONIC_CONF_FILE DEFAULT host $LOCAL_HOSTNAME
+ # NOTE(TheJulia) Set a minimum amount of memory that is more in-line with
+ # OpenStack CI and the images deployed.
+ iniset $IRONIC_CONF_FILE DEFAULT minimum_required_memory 256
# Retrieve deployment logs
iniset $IRONIC_CONF_FILE agent deploy_logs_collect $IRONIC_DEPLOY_LOGS_COLLECT
iniset $IRONIC_CONF_FILE agent deploy_logs_storage_backend $IRONIC_DEPLOY_LOGS_STORAGE_BACKEND
diff --git a/doc/source/admin/troubleshooting.rst b/doc/source/admin/troubleshooting.rst
index 19542bb62..acbcdea76 100644
--- a/doc/source/admin/troubleshooting.rst
+++ b/doc/source/admin/troubleshooting.rst
@@ -829,3 +829,46 @@ commonly where it is encountered.:
Once you have updated the saved interfaces, you should be able to safely
return the ``ironic.conf`` configuration change in changing what interfaces
are enabled by the conductor.
+
+I'm getting Out of Memory errors
+================================
+
+This issue, also known as the "the OOMKiller got my conductor" case,
+is where your OS system memory reaches a point where the operating
+system engages measures to shed active memory consumption in order
+to prevent a complete failure of the machine. Unfortunately this
+can cause unpredictable behavior.
+
+How did I get here?
+-------------------
+
+One of the major consumers of memory in a host running an ironic-conductor is
+transformation of disk images using the ``qemu-img`` tool. This tool, because
+the disk images it works with are both compressed and out of linear block
+order, requires a considerable amount of memory to efficently re-assemble
+and write-out a disk to a device, or to simply convert the format such as
+to a ``raw`` image.
+
+By default, ironic's configuration limits this conversion to 1 GB of RAM
+for the process, but each conversion does cause additional buffer memory
+to be used, which increases overall system memory pressure. Generally
+memory pressure alone from buffers will not cause an out of memory condition,
+but the multiple conversions or deployments running at the same time
+CAN cause extreme memory pressure and risk the system running out of memory.
+
+How do I resolve this?
+----------------------
+
+This can be addressed a few different ways:
+
+ * Use raw images, however these images can be substantially larger
+ and require more data to be transmitted "over the wire".
+ * Add more physical memory.
+ * Add swap space.
+ * Reduce concurrency, possibly via another conductor or changing the
+ nova-compute.conf ``max_concurrent_builds`` parameter.
+ * Or finally, adjust the ``[DEFAULT]minimum_required_memory`` parameter
+ in your ironic.conf file. The default should be considered a "default
+ of last resort" and you may need to reserve additional memory. You may
+ also wish to adjust the ``[DEFAULT]minimum_memory_wait_retries`` and
+ ``[DEFAULT]minimum_memory_wait_time`` parameters.
diff --git a/ironic/common/exception.py b/ironic/common/exception.py
index 77ba9f5f4..4d1eb582c 100644
--- a/ironic/common/exception.py
+++ b/ironic/common/exception.py
@@ -808,3 +808,8 @@ class UnknownAttribute(ClientSideError):
class AgentInProgress(IronicException):
_msg_fmt = _('Node %(node)s command "%(command)s" failed. Agent is '
'presently executing a command. Error %(error)s')
+
+
+class InsufficentMemory(IronicException):
+ _msg_fmt = _("Available memory at %(free)s, Insufficent as %(required)s "
+ "is required to proceed at this time.")
diff --git a/ironic/common/images.py b/ironic/common/images.py
index d614539cb..73b0418d9 100644
--- a/ironic/common/images.py
+++ b/ironic/common/images.py
@@ -402,6 +402,8 @@ def image_to_raw(image_href, path, path_tmp):
if fmt != "raw":
staged = "%s.converted" % path
+
+ utils.is_memory_insufficent(raise_if_fail=True)
LOG.debug("%(image)s was %(format)s, converting to raw",
{'image': image_href, 'format': fmt})
with fileutils.remove_path_on_error(staged):
diff --git a/ironic/common/utils.py b/ironic/common/utils.py
index 529e02477..f4d0f0eb5 100644
--- a/ironic/common/utils.py
+++ b/ironic/common/utils.py
@@ -27,6 +27,7 @@ import os
import re
import shutil
import tempfile
+import time
import jinja2
from oslo_concurrency import processutils
@@ -35,6 +36,7 @@ from oslo_serialization import jsonutils
from oslo_utils import fileutils
from oslo_utils import netutils
from oslo_utils import timeutils
+import psutil
import pytz
from ironic.common import exception
@@ -586,3 +588,61 @@ def file_mime_type(path):
"""Gets a mime type of the given file."""
return execute('file', '--brief', '--mime-type', path,
use_standard_locale=True)[0].strip()
+
+
+def _get_mb_ram_available():
+ # NOTE(TheJulia): The .available value is the memory that can be given
+ # to a process without this process beginning to swap itself.
+ return psutil.virtual_memory().available / 1024 / 1024
+
+
+def is_memory_insufficent(raise_if_fail=False):
+ """Checks available system memory and holds the deployment process.
+
+ Evaluates the current system memory available, meaning can be
+ allocated to a process by the kernel upon allocation request,
+ and delays the execution until memory has been freed,
+ or until it has timed out.
+
+ This method will issue a sleep, if the amount of available memory is
+ insufficent. This is configured using the
+ ``[DEFAULT]minimum_memory_wait_time`` and the
+ ``[DEFAULT]minimum_memory_wait_retries``.
+
+ :param raise_if_fail: Default False, but if set to true an
+ InsufficentMemory exception is raised
+ upon insufficent memory.
+ :returns: True if the check has timed out. Otherwise None is returned.
+ :raises: InsufficentMemory if the raise_if_fail parameter is set to
+ True.
+ """
+ required_memory = CONF.minimum_required_memory
+ loop_count = 0
+
+ while _get_mb_ram_available() < required_memory:
+ log_values = {
+ 'available': _get_mb_ram_available(),
+ 'required': required_memory,
+ }
+ if CONF.minimum_memory_warning_only:
+ LOG.warning('Memory is at %(available)s MiB, required is '
+ '%(required)s. Ironic is in warning-only mode '
+ 'which can be changed by altering the '
+ '[DEFAULT]minimum_memory_warning_only',
+ log_values)
+ return False
+ if loop_count >= CONF.minimum_memory_wait_retries:
+ LOG.error('Memory is at %(available)s MiB, required is '
+ '%(required)s. Notifying caller that we have '
+ 'exceeded retries.',
+ log_values)
+ if raise_if_fail:
+ raise exception.InsufficentMemory(
+ free=_get_mb_ram_available(),
+ required=required_memory)
+ return True
+ LOG.warning('Memory is at %(available)s MiB, required is '
+ '%(required)s, waiting.', log_values)
+ # Sleep so interpreter can switch threads.
+ time.sleep(CONF.minimum_memory_wait_time)
+ loop_count = loop_count + 1
diff --git a/ironic/conf/default.py b/ironic/conf/default.py
index f51defd20..f5becc370 100644
--- a/ironic/conf/default.py
+++ b/ironic/conf/default.py
@@ -359,6 +359,32 @@ service_opts = [
('json-rpc', _('use JSON RPC transport'))],
help=_('Which RPC transport implementation to use between '
'conductor and API services')),
+ cfg.BoolOpt('minimum_memory_warning_only',
+ mutable=True,
+ default=True,
+ help=_('Setting to govern if Ironic should only warn instead '
+ 'of attempting to hold back the request in order to '
+ 'prevent the exhaustion of system memory.')),
+ cfg.IntOpt('minimum_required_memory',
+ mutable=True,
+ default=1024,
+ help=_('Minimum memory in MiB for the system to have '
+ 'available prior to starting a memory intensive '
+ 'process on the conductor.')),
+ cfg.IntOpt('minimum_memory_wait_time',
+ mutable=True,
+ default=15,
+ help=_('Seconds to wait between retries for free memory '
+ 'before launching the process. This, combined with '
+ '``memory_wait_retries`` allows the conductor to '
+ 'determine how long we should attempt to directly '
+ 'retry.')),
+ cfg.IntOpt('minimum_memory_wait_retries',
+ mutable=True,
+ default=6,
+ help=_('Number of retries to hold onto the worker before '
+ 'failing or returning the thread to the pool if '
+ 'the conductor can automatically retry.')),
]
utils_opts = [
diff --git a/ironic/drivers/modules/iscsi_deploy.py b/ironic/drivers/modules/iscsi_deploy.py
index 7e3687306..e76c4bfba 100644
--- a/ironic/drivers/modules/iscsi_deploy.py
+++ b/ironic/drivers/modules/iscsi_deploy.py
@@ -699,7 +699,19 @@ class ISCSIDeploy(agent_base.AgentDeployMixin, agent_base.AgentBaseMixin,
node = task.node
LOG.debug('Continuing the deployment on node %s', node.uuid)
-
+ if utils.is_memory_insufficent():
+ # Insufficent memory, but we can just let the agent heartbeat
+ # again in order to initiate deployment when the situation has
+ # changed.
+ LOG.warning('Insufficent memory to write image for node '
+ '%(node)s. Skipping until next heartbeat.',
+ {'node': node.uuid})
+ info = node.driver_internal_info
+ info['skip_current_deploy_step'] = False
+ node.driver_internal_info = info
+ node.last_error = "Deploy delayed due to insufficent memory"
+ node.save()
+ return states.DEPLOYWAIT
uuid_dict_returned = do_agent_iscsi_deploy(task, self._client)
utils.set_node_nested_field(node, 'driver_internal_info',
'deployment_uuids', uuid_dict_returned)
diff --git a/ironic/tests/unit/common/test_utils.py b/ironic/tests/unit/common/test_utils.py
index df60b88ca..e39a8f7db 100644
--- a/ironic/tests/unit/common/test_utils.py
+++ b/ironic/tests/unit/common/test_utils.py
@@ -19,12 +19,14 @@ import os
import os.path
import shutil
import tempfile
+import time
from unittest import mock
import jinja2
from oslo_concurrency import processutils
from oslo_config import cfg
from oslo_utils import netutils
+import psutil
from ironic.common import exception
from ironic.common import utils
@@ -447,6 +449,49 @@ class TempFilesTestCase(base.TestCase):
utils._check_dir_free_space, "/fake/path")
mock_stat.assert_called_once_with("/fake/path")
+ @mock.patch.object(time, 'sleep', autospec=True)
+ @mock.patch.object(psutil, 'virtual_memory', autospec=True)
+ def test_is_memory_insufficent(self, mock_vm_check, mock_sleep):
+ self.config(minimum_memory_warning_only=False)
+
+ class vm_check(object):
+ available = 1000000000
+
+ mock_vm_check.return_value = vm_check
+ self.assertTrue(utils.is_memory_insufficent())
+ self.assertEqual(14, mock_vm_check.call_count)
+
+ @mock.patch.object(time, 'sleep', autospec=True)
+ @mock.patch.object(psutil, 'virtual_memory', autospec=True)
+ def test_is_memory_insufficent_good(self, mock_vm_check,
+ mock_sleep):
+ self.config(minimum_memory_warning_only=False)
+
+ class vm_check(object):
+ available = 3276700000
+
+ mock_vm_check.return_value = vm_check
+ self.assertFalse(utils.is_memory_insufficent())
+ self.assertEqual(1, mock_vm_check.call_count)
+
+ @mock.patch.object(time, 'sleep', autospec=True)
+ @mock.patch.object(psutil, 'virtual_memory', autospec=True)
+ def test_is_memory_insufficent_recovers(self, mock_vm_check,
+ mock_sleep):
+
+ class vm_check_bad(object):
+ available = 1023000000
+
+ class vm_check_good(object):
+ available = 3276700000
+
+ self.config(minimum_memory_warning_only=False)
+ mock_vm_check.side_effect = iter([vm_check_bad,
+ vm_check_bad,
+ vm_check_good])
+ self.assertFalse(utils.is_memory_insufficent())
+ self.assertEqual(3, mock_vm_check.call_count)
+
class GetUpdatedCapabilitiesTestCase(base.TestCase):
diff --git a/ironic/tests/unit/drivers/modules/test_image_cache.py b/ironic/tests/unit/drivers/modules/test_image_cache.py
index 0711fedec..994d3a214 100644
--- a/ironic/tests/unit/drivers/modules/test_image_cache.py
+++ b/ironic/tests/unit/drivers/modules/test_image_cache.py
@@ -62,6 +62,24 @@ class TestImageCacheFetch(base.TestCase):
None, self.uuid, self.dest_path, True)
self.assertFalse(mock_clean_up.called)
+ @mock.patch.object(image_cache, '_fetch', autospec=True)
+ @mock.patch.object(image_cache.ImageCache, 'clean_up', autospec=True)
+ @mock.patch.object(image_cache.ImageCache, '_download_image',
+ autospec=True)
+ def test_fetch_image_no_master_dir_memory_low(self,
+ mock_download,
+ mock_clean_up,
+ mock_fetch):
+ mock_fetch.side_effect = exception.InsufficentMemory
+ self.cache.master_dir = None
+ self.assertRaises(exception.InsufficentMemory,
+ self.cache.fetch_image,
+ self.uuid, self.dest_path)
+ self.assertFalse(mock_download.called)
+ mock_fetch.assert_called_once_with(
+ None, self.uuid, self.dest_path, True)
+ self.assertFalse(mock_clean_up.called)
+
@mock.patch.object(image_cache.ImageCache, 'clean_up', autospec=True)
@mock.patch.object(image_cache.ImageCache, '_download_image',
autospec=True)
@@ -220,6 +238,14 @@ class TestImageCacheFetch(base.TestCase):
self.assertEqual(2, mock_link.call_count)
self.assertTrue(mock_log.error.called)
+ @mock.patch.object(image_cache, '_fetch', autospec=True)
+ def test__download_image_raises_memory_guard(self, mock_fetch):
+ mock_fetch.side_effect = exception.InsufficentMemory
+ self.assertRaises(exception.InsufficentMemory,
+ self.cache._download_image,
+ self.uuid, self.master_path,
+ self.dest_path)
+
@mock.patch.object(os, 'unlink', autospec=True)
class TestUpdateImages(base.TestCase):
diff --git a/ironic/tests/unit/drivers/modules/test_iscsi_deploy.py b/ironic/tests/unit/drivers/modules/test_iscsi_deploy.py
index 03c4b0d01..fc763f8b0 100644
--- a/ironic/tests/unit/drivers/modules/test_iscsi_deploy.py
+++ b/ironic/tests/unit/drivers/modules/test_iscsi_deploy.py
@@ -611,6 +611,9 @@ class ISCSIDeployTestCase(db_base.DbTestCase):
)
self.node.driver_internal_info['agent_url'] = 'http://1.2.3.4:1234'
dhcp_factory.DHCPFactory._dhcp_provider = None
+ # Memory checks shoudn't apply here as they will lead to
+ # false positives for unit testing in CI.
+ self.config(minimum_memory_warning_only=True)
def test_get_properties(self):
with task_manager.acquire(self.context, self.node.uuid,
@@ -1081,6 +1084,23 @@ class ISCSIDeployTestCase(db_base.DbTestCase):
set_boot_device_mock.assert_called_once_with(
mock.ANY, task, device=boot_devices.DISK, persistent=True)
+ @mock.patch.object(iscsi_deploy, 'do_agent_iscsi_deploy', autospec=True)
+ @mock.patch.object(utils, 'is_memory_insufficent', autospec=True)
+ def test_write_image_out_of_memory(self, mock_memory_check,
+ mock_do_iscsi_deploy):
+ mock_memory_check.return_value = True
+ self.node.provision_state = states.DEPLOYWAIT
+ self.node.target_provision_state = states.ACTIVE
+ self.node.save()
+ with task_manager.acquire(self.context, self.node.uuid) as task:
+ value = task.driver.deploy.write_image(task)
+ self.assertEqual(states.DEPLOYWAIT, value)
+ self.assertEqual(states.DEPLOYWAIT, task.node.provision_state)
+ self.assertIn('skip_current_deploy_step',
+ task.node.driver_internal_info)
+ self.assertTrue(mock_memory_check.called)
+ self.assertFalse(mock_do_iscsi_deploy.called)
+
@mock.patch.object(manager_utils, 'restore_power_state_if_needed',
autospec=True)
@mock.patch.object(manager_utils, 'power_on_node_if_needed', autospec=True)
diff --git a/releasenotes/notes/limit-memory-consumption-c7949a49853ba83d.yaml b/releasenotes/notes/limit-memory-consumption-c7949a49853ba83d.yaml
new file mode 100644
index 000000000..5de6036ec
--- /dev/null
+++ b/releasenotes/notes/limit-memory-consumption-c7949a49853ba83d.yaml
@@ -0,0 +1,23 @@
+---
+features:
+ - |
+ The ``ironic-conductor`` process now has a concept of an internal
+ memory limit. The intent of this is to prevent the conductor from running
+ the host out of memory when a large number of deployments have been
+ requested.
+
+ These settings can be tuned using
+ ``[DEFAULT]minimum_required_memory``,
+ ``[DEFAULT]mimimum_memory_wait_time``,
+ ``[DEFAULT]minimum_memory_wait_retries``, and
+ ``[DEFAULT]minimum_memory_warning_only``.
+
+ Where possible, Ironic will attempt to wait out the time window, thus
+ consuming the conductor worker thread which will resume if the memory
+ becomes available. This will effectively rate limit concurrency.
+
+ If raw image conversions with-in the conductor is required, and a
+ situation exists where insufficent memory exists and it cannot be waited,
+ the deployment operation will fail. For the ``iscsi`` deployment
+ interface, which is the other location in ironic that may consume large
+ amounts of memory, the conductor will wait until the next agent heartbeat.