summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorZuul <zuul@review.opendev.org>2023-02-27 17:24:15 +0000
committerGerrit Code Review <review@openstack.org>2023-02-27 17:24:15 +0000
commitcf43fa4e3c133714ac0825faa452cec9294c6ae9 (patch)
tree74b52c318f53bfd61dc5085263dfbfcb492ee694
parenteb03345006a04677d674aedc84c1af6b5fd29ed6 (diff)
parent82b8ec7a392516ec8fd292df8788784da7b319ad (diff)
downloadironic-cf43fa4e3c133714ac0825faa452cec9294c6ae9.tar.gz
Merge "Get conductor metric data"
-rw-r--r--doc/source/admin/metrics.rst34
-rw-r--r--ironic/conductor/manager.py80
-rw-r--r--ironic/conductor/periodics.py17
-rw-r--r--ironic/conf/__init__.py2
-rw-r--r--ironic/conf/conductor.py35
-rw-r--r--ironic/conf/opts.py1
-rw-r--r--ironic/conf/sensor_data.py89
-rw-r--r--ironic/tests/unit/conductor/test_manager.py112
-rw-r--r--releasenotes/notes/conductor-metric-collector-support-1b8b8c71f9f59da4.yaml39
-rw-r--r--requirements.txt2
10 files changed, 348 insertions, 63 deletions
diff --git a/doc/source/admin/metrics.rst b/doc/source/admin/metrics.rst
index f435a50c5..733c6569b 100644
--- a/doc/source/admin/metrics.rst
+++ b/doc/source/admin/metrics.rst
@@ -17,8 +17,11 @@ These performance measurements, herein referred to as "metrics", can be
emitted from the Bare Metal service, including ironic-api, ironic-conductor,
and ironic-python-agent. By default, none of the services will emit metrics.
-Configuring the Bare Metal Service to Enable Metrics
-====================================================
+It is important to stress that statsd is not the only backend supported for
+metrics collection and transmission. This is covered later on in our documentation.
+
+Configuring the Bare Metal Service to Enable Metrics with Statsd
+================================================================
Enabling metrics in ironic-api and ironic-conductor
---------------------------------------------------
@@ -62,6 +65,30 @@ in the ironic configuration file as well::
agent_statsd_host = 198.51.100.2
agent_statsd_port = 8125
+.. Note::
+ Use of a different metrics backend with the agent is not presently
+ supported.
+
+Transmission to the Message Bus Notifier
+========================================
+
+Regardless of whether you're using Ceilometer,
+`ironic-prometheus-exporter <https://docs.openstack.org/ironic-prometheus-exporter/latest/>`_,
+or some scripting you wrote to consume the message bus notifications,
+metrics data can be sent to the message bus notifier from the timer methods
+*and* additional gauge counters by utilizing the ``[metrics]backend``
+configuration option and setting it to ``collector``. When this is the case,
+information is cached locally and periodically sent along with the general sensor
+data update to the messaging notifier, which can be consumed off of the message bus,
+or via a notifier plugin (such as is done with ironic-prometheus-exporter).
+
+.. NOTE::
+ Transmission of timer data only works for the Conductor or ``single-process``
+ Ironic service model. A separate webserver process presently does not have
+ the capability of triggering the call to retrieve and transmit the data.
+
+.. NOTE::
+ This functionality requires ironic-lib version 5.4.0 to be installed.
Types of Metrics Emitted
========================
@@ -79,6 +106,9 @@ additional load before enabling metrics. To see which metrics have changed names
or have been removed between releases, refer to the `ironic release notes
<https://docs.openstack.org/releasenotes/ironic/>`_.
+Additional conductor metrics in the form of counts will also be generated in
+limited locations where pertinent to the activity of the conductor.
+
.. note::
With the default statsd configuration, each timing metric may create
additional metrics due to how statsd handles timing metrics. For more
diff --git a/ironic/conductor/manager.py b/ironic/conductor/manager.py
index ad45d2d74..8de34b76a 100644
--- a/ironic/conductor/manager.py
+++ b/ironic/conductor/manager.py
@@ -98,6 +98,8 @@ class ConductorManager(base_manager.BaseConductorManager):
def __init__(self, host, topic):
super(ConductorManager, self).__init__(host, topic)
+ # NOTE(TheJulia): This is less a metric-able count, but a means to
+ # sort out nodes and prioritise a subset (of non-responding nodes).
self.power_state_sync_count = collections.defaultdict(int)
@METRICS.timer('ConductorManager._clean_up_caches')
@@ -1433,6 +1435,11 @@ class ConductorManager(base_manager.BaseConductorManager):
finally:
waiters.wait_for_all(futures)
+ # report a count of the nodes
+ METRICS.send_gauge(
+ 'ConductorManager.PowerSyncNodesCount',
+ len(nodes))
+
def _sync_power_state_nodes_task(self, context, nodes):
"""Invokes power state sync on nodes from synchronized queue.
@@ -1451,6 +1458,7 @@ class ConductorManager(base_manager.BaseConductorManager):
can do here to avoid failing a brand new deploy to a node that
we've locked here, though.
"""
+
# FIXME(comstud): Since our initial state checks are outside
# of the lock (to try to avoid the lock), some checks are
# repeated after grabbing the lock so we can unlock quickly.
@@ -1497,6 +1505,12 @@ class ConductorManager(base_manager.BaseConductorManager):
LOG.info("During sync_power_state, node %(node)s was not "
"found and presumed deleted by another process.",
{'node': node_uuid})
+ # TODO(TheJulia): The chance exists that we orphan a node
+ # in power_state_sync_count, albeit it is not much data,
+ # it could eventually cause the memory footprint to grow
+ # on an exceptionally large ironic deployment. We should
+ # make sure we clean it up at some point, but overall given
+ # minimal impact, it is definitely low hanging fruit.
except exception.NodeLocked:
LOG.info("During sync_power_state, node %(node)s was "
"already locked by another process. Skip.",
@@ -1513,6 +1527,7 @@ class ConductorManager(base_manager.BaseConductorManager):
# regular power state checking, maintenance is still a required
# condition.
filters={'maintenance': True, 'fault': faults.POWER_FAILURE},
+ node_count_metric_name='ConductorManager.PowerSyncRecoveryNodeCount',
)
def _power_failure_recovery(self, task, context):
"""Periodic task to check power states for nodes in maintenance.
@@ -1855,6 +1870,7 @@ class ConductorManager(base_manager.BaseConductorManager):
predicate=lambda n, m: n.conductor_affinity != m.conductor.id,
limit=lambda: CONF.conductor.periodic_max_workers,
shared_task=False,
+ node_count_metric_name='ConductorManager.SyncLocalStateNodeCount',
)
def _sync_local_state(self, task, context):
"""Perform any actions necessary to sync local state.
@@ -2640,14 +2656,63 @@ class ConductorManager(base_manager.BaseConductorManager):
# Yield on every iteration
eventlet.sleep(0)
+ def _sensors_conductor(self, context):
+ """Called to collect and send metrics "sensors" for the conductor."""
+ # populate the message which will be sent to ceilometer
+ # or other data consumer
+ message = {'message_id': uuidutils.generate_uuid(),
+ 'timestamp': datetime.datetime.utcnow(),
+ 'hostname': self.host}
+
+ try:
+ ev_type = 'ironic.metrics'
+ message['event_type'] = ev_type + '.update'
+ sensors_data = METRICS.get_metrics_data()
+ except AttributeError:
+ # TODO(TheJulia): Remove this at some point, but right now
+ # don't inherently break on version mismatches when people
+ # disregard requirements.
+ LOG.warning(
+ 'get_sensors_data has been configured to collect '
+ 'conductor metrics, however the installed ironic-lib '
+ 'library lacks the functionality. Please update '
+ 'ironic-lib to a minimum of version 5.4.0.')
+ except Exception as e:
+ LOG.exception(
+ "An unknown error occured while attempting to collect "
+ "sensor data from within the conductor. Error: %(error)s",
+ {'error': e})
+ else:
+ message['payload'] = (
+ self._filter_out_unsupported_types(sensors_data))
+ if message['payload']:
+ self.sensors_notifier.info(
+ context, ev_type, message)
+
@METRICS.timer('ConductorManager._send_sensor_data')
- @periodics.periodic(spacing=CONF.conductor.send_sensor_data_interval,
- enabled=CONF.conductor.send_sensor_data)
+ @periodics.periodic(spacing=CONF.sensor_data.interval,
+ enabled=CONF.sensor_data.send_sensor_data)
def _send_sensor_data(self, context):
"""Periodically collects and transmits sensor data notifications."""
+ if CONF.sensor_data.enable_for_conductor:
+ if CONF.sensor_data.workers == 1:
+ # Directly call the sensors_conductor when only one
+ # worker is permitted, so we collect data serially
+ # instead.
+ self._sensors_conductor(context)
+ else:
+ # Also, do not apply the general threshold limit to
+ # the self collection of "sensor" data from the conductor,
+ # as we're not launching external processes, we're just reading
+ # from an internal data structure, if we can.
+ self._spawn_worker(self._sensors_conductor, context)
+ if not CONF.sensor_data.enable_for_nodes:
+ # NOTE(TheJulia): If node sensor data is not required, then
+ # skip the rest of this method.
+ return
filters = {}
- if not CONF.conductor.send_sensor_data_for_undeployed_nodes:
+ if not CONF.sensor_data.enable_for_undeployed_nodes:
filters['provision_state'] = states.ACTIVE
nodes = queue.Queue()
@@ -2655,7 +2720,7 @@ class ConductorManager(base_manager.BaseConductorManager):
filters=filters):
nodes.put_nowait(node_info)
- number_of_threads = min(CONF.conductor.send_sensor_data_workers,
+ number_of_threads = min(CONF.sensor_data.workers,
nodes.qsize())
futures = []
for thread_number in range(number_of_threads):
@@ -2671,7 +2736,7 @@ class ConductorManager(base_manager.BaseConductorManager):
break
done, not_done = waiters.wait_for_all(
- futures, timeout=CONF.conductor.send_sensor_data_wait_timeout)
+ futures, timeout=CONF.sensor_data.wait_timeout)
if not_done:
LOG.warning("%d workers for send sensors data did not complete",
len(not_done))
@@ -2680,13 +2745,14 @@ class ConductorManager(base_manager.BaseConductorManager):
"""Filters out sensor data types that aren't specified in the config.
Removes sensor data types that aren't specified in
- CONF.conductor.send_sensor_data_types.
+ CONF.sensor_data.data_types.
:param sensors_data: dict containing sensor types and the associated
data
:returns: dict with unsupported sensor types removed
"""
- allowed = set(x.lower() for x in CONF.conductor.send_sensor_data_types)
+ allowed = set(x.lower() for x in
+ CONF.sensor_data.data_types)
if 'all' in allowed:
return sensors_data
diff --git a/ironic/conductor/periodics.py b/ironic/conductor/periodics.py
index 70bc7bc93..b9c8f8844 100644
--- a/ironic/conductor/periodics.py
+++ b/ironic/conductor/periodics.py
@@ -18,6 +18,7 @@ import inspect
import eventlet
from futurist import periodics
+from ironic_lib import metrics_utils
from oslo_log import log
from ironic.common import exception
@@ -29,6 +30,9 @@ from ironic.drivers import base as driver_base
LOG = log.getLogger(__name__)
+METRICS = metrics_utils.get_metrics_logger(__name__)
+
+
def periodic(spacing, enabled=True, **kwargs):
"""A decorator to define a periodic task.
@@ -46,7 +50,7 @@ class Stop(Exception):
def node_periodic(purpose, spacing, enabled=True, filters=None,
predicate=None, predicate_extra_fields=(), limit=None,
- shared_task=True):
+ shared_task=True, node_count_metric_name=None):
"""A decorator to define a periodic task to act on nodes.
Defines a periodic task that fetches the list of nodes mapped to the
@@ -84,6 +88,9 @@ def node_periodic(purpose, spacing, enabled=True, filters=None,
iteration to determine the limit.
:param shared_task: if ``True``, the task will have a shared lock. It is
recommended to start with a shared lock and upgrade it only if needed.
+ :param node_count_metric_name: A string value to identify a metric
+ representing the count of matching nodes to be recorded upon the
+ completion of the periodic.
"""
node_type = collections.namedtuple(
'Node',
@@ -116,10 +123,11 @@ def node_periodic(purpose, spacing, enabled=True, filters=None,
else:
local_limit = limit
assert local_limit is None or local_limit > 0
-
+ node_count = 0
nodes = manager.iter_nodes(filters=filters,
fields=predicate_extra_fields)
for (node_uuid, *other) in nodes:
+ node_count += 1
if predicate is not None:
node = node_type(node_uuid, *other)
if accepts_manager:
@@ -158,6 +166,11 @@ def node_periodic(purpose, spacing, enabled=True, filters=None,
local_limit -= 1
if not local_limit:
return
+ if node_count_metric_name:
+ # Send post-run metrics.
+ METRICS.send_gauge(
+ node_count_metric_name,
+ node_count)
return wrapper
diff --git a/ironic/conf/__init__.py b/ironic/conf/__init__.py
index c1a893181..41201346f 100644
--- a/ironic/conf/__init__.py
+++ b/ironic/conf/__init__.py
@@ -44,6 +44,7 @@ from ironic.conf import neutron
from ironic.conf import nova
from ironic.conf import pxe
from ironic.conf import redfish
+from ironic.conf import sensor_data
from ironic.conf import service_catalog
from ironic.conf import snmp
from ironic.conf import swift
@@ -80,6 +81,7 @@ neutron.register_opts(CONF)
nova.register_opts(CONF)
pxe.register_opts(CONF)
redfish.register_opts(CONF)
+sensor_data.register_opts(CONF)
service_catalog.register_opts(CONF)
snmp.register_opts(CONF)
swift.register_opts(CONF)
diff --git a/ironic/conf/conductor.py b/ironic/conf/conductor.py
index 2161b9434..653e30f56 100644
--- a/ironic/conf/conductor.py
+++ b/ironic/conf/conductor.py
@@ -97,41 +97,6 @@ opts = [
cfg.IntOpt('node_locked_retry_interval',
default=1,
help=_('Seconds to sleep between node lock attempts.')),
- cfg.BoolOpt('send_sensor_data',
- default=False,
- help=_('Enable sending sensor data message via the '
- 'notification bus')),
- cfg.IntOpt('send_sensor_data_interval',
- default=600,
- min=1,
- help=_('Seconds between conductor sending sensor data message '
- 'to ceilometer via the notification bus.')),
- cfg.IntOpt('send_sensor_data_workers',
- default=4, min=1,
- help=_('The maximum number of workers that can be started '
- 'simultaneously for send data from sensors periodic '
- 'task.')),
- cfg.IntOpt('send_sensor_data_wait_timeout',
- default=300,
- help=_('The time in seconds to wait for send sensors data '
- 'periodic task to be finished before allowing periodic '
- 'call to happen again. Should be less than '
- 'send_sensor_data_interval value.')),
- cfg.ListOpt('send_sensor_data_types',
- default=['ALL'],
- help=_('List of comma separated meter types which need to be'
- ' sent to Ceilometer. The default value, "ALL", is a '
- 'special value meaning send all the sensor data.')),
- cfg.BoolOpt('send_sensor_data_for_undeployed_nodes',
- default=False,
- help=_('The default for sensor data collection is to only '
- 'collect data for machines that are deployed, however '
- 'operators may desire to know if there are failures '
- 'in hardware that is not presently in use. '
- 'When set to true, the conductor will collect sensor '
- 'information from all nodes when sensor data '
- 'collection is enabled via the send_sensor_data '
- 'setting.')),
cfg.IntOpt('sync_local_state_interval',
default=180,
help=_('When conductors join or leave the cluster, existing '
diff --git a/ironic/conf/opts.py b/ironic/conf/opts.py
index 846949893..a7ebcfb30 100644
--- a/ironic/conf/opts.py
+++ b/ironic/conf/opts.py
@@ -43,6 +43,7 @@ _opts = [
('nova', ironic.conf.nova.list_opts()),
('pxe', ironic.conf.pxe.opts),
('redfish', ironic.conf.redfish.opts),
+ ('sensor_data', ironic.conf.sensor_data.opts),
('service_catalog', ironic.conf.service_catalog.list_opts()),
('snmp', ironic.conf.snmp.opts),
('swift', ironic.conf.swift.list_opts()),
diff --git a/ironic/conf/sensor_data.py b/ironic/conf/sensor_data.py
new file mode 100644
index 000000000..8527113a6
--- /dev/null
+++ b/ironic/conf/sensor_data.py
@@ -0,0 +1,89 @@
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+from oslo_config import cfg
+
+from ironic.common.i18n import _
+
+opts = [
+ cfg.BoolOpt('send_sensor_data',
+ default=False,
+ deprecated_group='conductor',
+ deprecated_name='send_sensor_data',
+ help=_('Enable sending sensor data message via the '
+ 'notification bus.')),
+ cfg.IntOpt('interval',
+ default=600,
+ min=1,
+ deprecated_group='conductor',
+ deprecated_name='send_sensor_data_interval',
+ help=_('Seconds between conductor sending sensor data message '
+ 'via the notification bus. This was originally for '
+ 'consumption via ceilometer, but the data may also '
+ 'be consumed via a plugin like '
+ 'ironic-prometheus-exporter or any other message bus '
+ 'data collector.')),
+ cfg.IntOpt('workers',
+ default=4, min=1,
+ deprecated_group='conductor',
+ deprecated_name='send_sensor_data_workers',
+ help=_('The maximum number of workers that can be started '
+ 'simultaneously for send data from sensors periodic '
+ 'task.')),
+ cfg.IntOpt('wait_timeout',
+ default=300,
+ deprecated_group='conductor',
+ deprecated_name='send_sensor_data_wait_timeout',
+ help=_('The time in seconds to wait for send sensors data '
+ 'periodic task to be finished before allowing periodic '
+ 'call to happen again. Should be less than '
+ 'send_sensor_data_interval value.')),
+ cfg.ListOpt('data_types',
+ default=['ALL'],
+ deprecated_group='conductor',
+ deprecated_name='send_sensor_data_types',
+ help=_('List of comma separated meter types which need to be '
+ 'sent to Ceilometer. The default value, "ALL", is a '
+ 'special value meaning send all the sensor data. '
+ 'This setting only applies to baremetal sensor data '
+ 'being processed through the conductor.')),
+ cfg.BoolOpt('enable_for_undeployed_nodes',
+ default=False,
+ deprecated_group='conductor',
+ deprecated_name='send_sensor_data_for_undeployed_nodes',
+ help=_('The default for sensor data collection is to only '
+ 'collect data for machines that are deployed, however '
+ 'operators may desire to know if there are failures '
+ 'in hardware that is not presently in use. '
+ 'When set to true, the conductor will collect sensor '
+ 'information from all nodes when sensor data '
+ 'collection is enabled via the send_sensor_data '
+ 'setting.')),
+ cfg.BoolOpt('enable_for_conductor',
+ default=True,
+ help=_('If to include sensor metric data for the Conductor '
+ 'process itself in the message payload for sensor '
+ 'data which allows operators to gather instance '
+ 'counts of actions and states to better manage '
+ 'the deployment.')),
+ cfg.BoolOpt('enable_for_nodes',
+ default=True,
+ help=_('If to transmit any sensor data for any nodes under '
+ 'this conductor\'s management. This option superceeds '
+ 'the ``send_sensor_data_for_undeployed_nodes`` '
+ 'setting.')),
+]
+
+
+def register_opts(conf):
+ conf.register_opts(opts, group='sensor_data')
diff --git a/ironic/tests/unit/conductor/test_manager.py b/ironic/tests/unit/conductor/test_manager.py
index ded80718d..027418ba9 100644
--- a/ironic/tests/unit/conductor/test_manager.py
+++ b/ironic/tests/unit/conductor/test_manager.py
@@ -26,6 +26,7 @@ from unittest import mock
import eventlet
from futurist import waiters
+from ironic_lib import metrics as ironic_metrics
from oslo_config import cfg
import oslo_messaging as messaging
from oslo_utils import uuidutils
@@ -4273,7 +4274,8 @@ class SensorsTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
def test__filter_out_unsupported_types_all(self):
self._start_service()
- CONF.set_override('send_sensor_data_types', ['All'], group='conductor')
+ CONF.set_override('data_types', ['All'],
+ group='sensor_data')
fake_sensors_data = {"t1": {'f1': 'v1'}, "t2": {'f1': 'v1'}}
actual_result = (
self.service._filter_out_unsupported_types(fake_sensors_data))
@@ -4282,7 +4284,8 @@ class SensorsTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
def test__filter_out_unsupported_types_part(self):
self._start_service()
- CONF.set_override('send_sensor_data_types', ['t1'], group='conductor')
+ CONF.set_override('data_types', ['t1'],
+ group='sensor_data')
fake_sensors_data = {"t1": {'f1': 'v1'}, "t2": {'f1': 'v1'}}
actual_result = (
self.service._filter_out_unsupported_types(fake_sensors_data))
@@ -4291,7 +4294,8 @@ class SensorsTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
def test__filter_out_unsupported_types_non(self):
self._start_service()
- CONF.set_override('send_sensor_data_types', ['t3'], group='conductor')
+ CONF.set_override('data_types', ['t3'],
+ group='sensor_data')
fake_sensors_data = {"t1": {'f1': 'v1'}, "t2": {'f1': 'v1'}}
actual_result = (
self.service._filter_out_unsupported_types(fake_sensors_data))
@@ -4305,7 +4309,8 @@ class SensorsTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
for i in range(5):
nodes.put_nowait(('fake_uuid-%d' % i, 'fake-hardware', '', None))
self._start_service()
- CONF.set_override('send_sensor_data', True, group='conductor')
+ CONF.set_override('send_sensor_data', True,
+ group='sensor_data')
task = acquire_mock.return_value.__enter__.return_value
task.node.maintenance = False
@@ -4334,7 +4339,8 @@ class SensorsTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
nodes.put_nowait(('fake_uuid', 'fake-hardware', '', None))
self._start_service()
self.service._shutdown = True
- CONF.set_override('send_sensor_data', True, group='conductor')
+ CONF.set_override('send_sensor_data', True,
+ group='sensor_data')
self.service._sensors_nodes_task(self.context, nodes)
acquire_mock.return_value.__enter__.assert_not_called()
@@ -4343,7 +4349,8 @@ class SensorsTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
nodes = queue.Queue()
nodes.put_nowait(('fake_uuid', 'fake-hardware', '', None))
- CONF.set_override('send_sensor_data', True, group='conductor')
+ CONF.set_override('send_sensor_data', True,
+ group='sensor_data')
self._start_service()
@@ -4361,7 +4368,7 @@ class SensorsTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
nodes = queue.Queue()
nodes.put_nowait(('fake_uuid', 'fake-hardware', '', None))
self._start_service()
- CONF.set_override('send_sensor_data', True, group='conductor')
+ CONF.set_override('send_sensor_data', True, group='sensor_data')
task = acquire_mock.return_value.__enter__.return_value
task.node.maintenance = True
@@ -4384,10 +4391,10 @@ class SensorsTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
mock_spawn):
self._start_service()
- CONF.set_override('send_sensor_data', True, group='conductor')
+ CONF.set_override('send_sensor_data', True, group='sensor_data')
# NOTE(galyna): do not wait for threads to be finished in unittests
- CONF.set_override('send_sensor_data_wait_timeout', 0,
- group='conductor')
+ CONF.set_override('wait_timeout', 0,
+ group='sensor_data')
_mapped_to_this_conductor_mock.return_value = True
get_nodeinfo_list_mock.return_value = [('fake_uuid', 'fake', None)]
self.service._send_sensor_data(self.context)
@@ -4395,6 +4402,37 @@ class SensorsTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
self.service._sensors_nodes_task,
self.context, mock.ANY)
+ @mock.patch.object(queue, 'Queue', autospec=True)
+ @mock.patch.object(manager.ConductorManager, '_sensors_conductor',
+ autospec=True)
+ @mock.patch.object(manager.ConductorManager, '_spawn_worker',
+ autospec=True)
+ @mock.patch.object(manager.ConductorManager, '_mapped_to_this_conductor',
+ autospec=True)
+ @mock.patch.object(dbapi.IMPL, 'get_nodeinfo_list', autospec=True)
+ def test___send_sensor_data_disabled(
+ self, get_nodeinfo_list_mock,
+ _mapped_to_this_conductor_mock,
+ mock_spawn, mock_sensors_conductor,
+ mock_queue):
+ self._start_service()
+
+ CONF.set_override('send_sensor_data', True, group='sensor_data')
+ CONF.set_override('enable_for_nodes', False,
+ group='sensor_data')
+ CONF.set_override('enable_for_conductor', False,
+ group='sensor_data')
+ # NOTE(galyna): do not wait for threads to be finished in unittests
+ CONF.set_override('wait_timeout', 0,
+ group='sensor_data')
+ _mapped_to_this_conductor_mock.return_value = True
+ get_nodeinfo_list_mock.return_value = [('fake_uuid', 'fake', None)]
+ self.service._send_sensor_data(self.context)
+ mock_sensors_conductor.assert_not_called()
+ # NOTE(TheJulia): Can't use the spawn worker since it records other,
+ # unrelated calls. So, queue works well here.
+ mock_queue.assert_not_called()
+
@mock.patch('ironic.conductor.manager.ConductorManager._spawn_worker',
autospec=True)
@mock.patch.object(manager.ConductorManager, '_mapped_to_this_conductor',
@@ -4407,24 +4445,66 @@ class SensorsTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
mock_spawn.reset_mock()
number_of_workers = 8
- CONF.set_override('send_sensor_data', True, group='conductor')
- CONF.set_override('send_sensor_data_workers', number_of_workers,
- group='conductor')
+ CONF.set_override('send_sensor_data', True, group='sensor_data')
+ CONF.set_override('workers', number_of_workers,
+ group='sensor_data')
# NOTE(galyna): do not wait for threads to be finished in unittests
- CONF.set_override('send_sensor_data_wait_timeout', 0,
- group='conductor')
+ CONF.set_override('wait_timeout', 0,
+ group='sensor_data')
_mapped_to_this_conductor_mock.return_value = True
get_nodeinfo_list_mock.return_value = [('fake_uuid', 'fake',
None)] * 20
self.service._send_sensor_data(self.context)
- self.assertEqual(number_of_workers,
+ self.assertEqual(number_of_workers + 1,
mock_spawn.call_count)
# TODO(TheJulia): At some point, we should add a test to validate that
# a modified filter to return all nodes actually works, although
# the way the sensor tests are written, the list is all mocked.
+ @mock.patch('ironic.conductor.manager.ConductorManager._spawn_worker',
+ autospec=True)
+ @mock.patch.object(manager.ConductorManager, '_mapped_to_this_conductor',
+ autospec=True)
+ @mock.patch.object(dbapi.IMPL, 'get_nodeinfo_list', autospec=True)
+ def test___send_sensor_data_one_worker(
+ self, get_nodeinfo_list_mock, _mapped_to_this_conductor_mock,
+ mock_spawn):
+ self._start_service()
+ mock_spawn.reset_mock()
+
+ number_of_workers = 1
+ CONF.set_override('send_sensor_data', True, group='sensor_data')
+ CONF.set_override('workers', number_of_workers,
+ group='sensor_data')
+ # NOTE(galyna): do not wait for threads to be finished in unittests
+ CONF.set_override('wait_timeout', 0,
+ group='sensor_data')
+
+ _mapped_to_this_conductor_mock.return_value = True
+ get_nodeinfo_list_mock.return_value = [('fake_uuid', 'fake',
+ None)] * 20
+ self.service._send_sensor_data(self.context)
+ self.assertEqual(number_of_workers,
+ mock_spawn.call_count)
+
+ @mock.patch.object(messaging.Notifier, 'info', autospec=True)
+ @mock.patch.object(ironic_metrics.MetricLogger,
+ 'get_metrics_data', autospec=True)
+ def test__sensors_conductor(self, mock_get_metrics, mock_notifier):
+ metric = {'metric': 'data'}
+ mock_get_metrics.return_value = metric
+ self._start_service()
+ self.service._sensors_conductor(self.context)
+ self.assertEqual(mock_notifier.call_count, 1)
+ self.assertEqual('ironic.metrics', mock_notifier.call_args.args[2])
+ metrics_dict = mock_notifier.call_args.args[3]
+ self.assertEqual(metrics_dict.get('event_type'),
+ 'ironic.metrics.update')
+ self.assertDictEqual(metrics_dict.get('payload'),
+ metric)
+
@mgr_utils.mock_record_keepalive
class BootDeviceTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
diff --git a/releasenotes/notes/conductor-metric-collector-support-1b8b8c71f9f59da4.yaml b/releasenotes/notes/conductor-metric-collector-support-1b8b8c71f9f59da4.yaml
new file mode 100644
index 000000000..dfa3b0f89
--- /dev/null
+++ b/releasenotes/notes/conductor-metric-collector-support-1b8b8c71f9f59da4.yaml
@@ -0,0 +1,39 @@
+---
+features:
+ - |
+ Adds the ability for Ironic to send conductor process metrics
+ for monitoring. This requires the use of a new ``[metrics]backend``
+ option value of ``collector``. This data was previously only available
+ through the use of statsd. This requires ``ironic-lib`` version ``5.4.0``
+ or newer. This capability can be disabled using the
+ ``[sensor_data]enable_for_conductor`` option if set to False.
+ - |
+ Adds a ``[sensor_data]enable_for_nodes`` configuration option
+ to allow operators to disable sending node metric data via the
+ message bus notifier.
+ - |
+ Adds a new gauge metric ``ConductorManager.PowerSyncNodesCount``
+ which tracks the nodes considered for power state synchronization.
+ - Adds a new gauge metric ``ConductorManager.PowerSyncRecoveryNodeCount``
+ which represents the number of nodes which are being evaluated for power
+ state recovery checking.
+ - Adds a new gauge metric ``ConductorManager.SyncLocalStateNodeCount``
+ which represents the number of nodes being tracked locally by the
+ conductor.
+issues:
+ - Sensor data notifications to the message bus, such as using the
+ ``[metrics]backend`` configuration option of ``collector`` on a dedicated
+ API service process or instance, are not presently supported. This
+ functionality requires a periodic task to trigger the transmission
+ of metrics messages to the message bus notifier.
+deprecations:
+ - The setting values starting with ``send_sensor`` in the ``[conductor]``
+ configuration group have been deprecated and moved to a ``[sensor_data]``
+ configuration group. The names have been updated to shorter, operator
+ friendly names.
+upgrades:
+ - Settings starting with ``send_sensor_data`` in the ``[conductor]``
+ configuration group have been moved to a ``[sensor_data]`` configuration
+ group and have been renamed to have shorter value names. If configuration
+ values are not updated, the ``oslo.config`` library will emit a warning
+ in the logs.
diff --git a/requirements.txt b/requirements.txt
index 0c73e632e..2f4813baa 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -14,7 +14,7 @@ WebOb>=1.7.1 # MIT
python-cinderclient!=4.0.0,>=3.3.0 # Apache-2.0
python-glanceclient>=2.8.0 # Apache-2.0
keystoneauth1>=4.2.0 # Apache-2.0
-ironic-lib>=4.6.1 # Apache-2.0
+ironic-lib>=5.4.0 # Apache-2.0
python-swiftclient>=3.2.0 # Apache-2.0
pytz>=2013.6 # MIT
stevedore>=1.29.0 # Apache-2.0