-rw-r--r--  doc/source/admin/monitoring.rst                                | 26
-rw-r--r--  releasenotes/notes/resource-usage-stats-bfcd6765ef4a9c86.yaml  | 14
-rw-r--r--  tests/base.py                                                  |  3
-rw-r--r--  tests/unit/test_scheduler.py                                   | 68
-rw-r--r--  zuul/manager/__init__.py                                       |  3
-rw-r--r--  zuul/model.py                                                  |  1
-rw-r--r--  zuul/nodepool.py                                               | 91
7 files changed, 202 insertions(+), 4 deletions(-)
diff --git a/doc/source/admin/monitoring.rst b/doc/source/admin/monitoring.rst
index a51d17584..bb61381fd 100644
--- a/doc/source/admin/monitoring.rst
+++ b/doc/source/admin/monitoring.rst
@@ -288,6 +288,32 @@ These metrics are emitted by the Zuul :ref:`scheduler`:
Persistently high values indicate more testing node resources
would be helpful.
+ .. stat:: resources
+
+ Holds metrics about resource usage by tenant or project, provided
+ that nodepool reports the resources of nodes.
+
+ .. stat:: tenant
+
+ Holds resource usage metrics by tenant.
+
+ .. stat:: <tenant>.<resource>
+ :type: counter, gauge
+
+ Counter with the summed usage by the tenant in <resource> seconds
+ and gauge with the resources currently in use by the tenant.
+
+ .. stat:: project
+
+ Holds resource usage metrics by project.
+
+ .. stat:: <project>.<resource>
+ :type: counter, gauge
+
+ Counter with the summed usage by the project in <resource> seconds
+ and gauge with the resources currently in use by the project.
+
+
.. stat:: zuul.mergers
Holds metrics related to Zuul mergers.
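As a rough illustration of how the two metric types documented above relate (a sketch only, not Zuul code; it assumes the python `statsd` package and a local statsd daemon, and the tenant name "tenant-one" from the test suite below):

    import statsd

    client = statsd.StatsClient('localhost', 8125)

    # While a build holds 2 cores, the gauge reflects the current usage ...
    client.gauge('zuul.nodepool.resources.tenant.tenant-one.cores', 2)

    # ... and when the nodes are returned after a 300 second build, the
    # counter is incremented by resource * duration (600 core-seconds).
    client.incr('zuul.nodepool.resources.tenant.tenant-one.cores', 2 * 300)

Zuul emits these through its own statsd wrapper (see zuul/nodepool.py below); the sketch only shows the resulting key/value traffic.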
diff --git a/releasenotes/notes/resource-usage-stats-bfcd6765ef4a9c86.yaml b/releasenotes/notes/resource-usage-stats-bfcd6765ef4a9c86.yaml
new file mode 100644
index 000000000..a11697ada
--- /dev/null
+++ b/releasenotes/notes/resource-usage-stats-bfcd6765ef4a9c86.yaml
@@ -0,0 +1,14 @@
+---
+features:
+ - |
+ Zuul now reports resource usage statistics if they are provided by nodepool.
+
+ The following statistics are emitted:
+
+ - `zuul.nodepool.resources.tenant.{tenant}.{resource}`:
+ Gauge with the resources currently in use by the tenant, and counter
+ with the summed usage by the tenant in resource seconds (e.g. CPU seconds).
+
+ - `zuul.nodepool.resources.project.{project}.{resource}`:
+ Gauge with the resources currently in use by the project, and counter
+ with the summed usage by the project in resource seconds (e.g. CPU seconds).
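For reference, a small sketch that prints the concrete keys exercised by the test below; the one assumption is that the statsd layer normalizes dots in the hostname part of the canonical project name to underscores, which is what the test expects (review.example.com becomes review_example_com):

    tenant = 'tenant-one'
    project = 'review.example.com/org/project'.replace('.', '_')
    for resource in ('cores', 'ram', 'instances'):
        print('zuul.nodepool.resources.tenant.%s.%s' % (tenant, resource))
        print('zuul.nodepool.resources.project.%s.%s' % (project, resource))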
diff --git a/tests/base.py b/tests/base.py
index cd9bb01bf..f1c90e980 100644
--- a/tests/base.py
+++ b/tests/base.py
@@ -1846,6 +1846,7 @@ class FakeNodepool(object):
self.fail_requests = set()
self.remote_ansible = False
self.attributes = None
+ self.resources = None
def stop(self):
self._running = False
@@ -1951,6 +1952,8 @@ class FakeNodepool(object):
host_keys=host_keys,
executor='fake-nodepool',
hold_expiration=None)
+ if self.resources:
+ data['resources'] = self.resources
if self.remote_ansible:
data['connection_type'] = 'ssh'
if 'fakeuser' in node_type:
diff --git a/tests/unit/test_scheduler.py b/tests/unit/test_scheduler.py
index 85fc28e3f..91939609d 100644
--- a/tests/unit/test_scheduler.py
+++ b/tests/unit/test_scheduler.py
@@ -5317,6 +5317,72 @@ For CI problems and help debugging, contact ci@example.org"""
self.assertIn('project-test1 : SKIPPED', A.messages[1])
self.assertIn('project-test2 : SKIPPED', A.messages[1])
+ def test_nodepool_resources(self):
+ "Test that resources are reported"
+
+ self.executor_server.hold_jobs_in_build = True
+ self.fake_nodepool.resources = {
+ 'cores': 2,
+ 'ram': 1024,
+ 'instances': 1,
+ }
+ A = self.fake_gerrit.addFakeChange('org/project', 'master', 'A')
+ A.addApproval('Code-Review', 2)
+ self.fake_gerrit.addEvent(A.addApproval('Approved', 1))
+ self.waitUntilSettled()
+
+ self.executor_server.release('project-merge')
+ self.waitUntilSettled()
+
+ # Check that resource usage gauges are reported
+ self.assertHistory([
+ dict(name='project-merge', result='SUCCESS', changes='1,1'),
+ ])
+ self.assertReportedStat(
+ 'zuul.nodepool.resources.tenant.tenant-one.cores',
+ value='2', kind='g')
+ self.assertReportedStat(
+ 'zuul.nodepool.resources.tenant.tenant-one.ram',
+ value='1024', kind='g')
+ self.assertReportedStat(
+ 'zuul.nodepool.resources.tenant.tenant-one.instances',
+ value='1', kind='g')
+ self.assertReportedStat(
+ 'zuul.nodepool.resources.project.review_example_com/org/project.'
+ 'cores', value='2', kind='g')
+ self.assertReportedStat(
+ 'zuul.nodepool.resources.project.review_example_com/org/project.'
+ 'ram', value='1024', kind='g')
+ self.assertReportedStat(
+ 'zuul.nodepool.resources.project.review_example_com/org/project.'
+ 'instances', value='1', kind='g')
+
+ # Check that resource usage counters are reported
+ self.assertReportedStat(
+ 'zuul.nodepool.resources.tenant.tenant-one.cores',
+ kind='c')
+ self.assertReportedStat(
+ 'zuul.nodepool.resources.tenant.tenant-one.ram',
+ kind='c')
+ self.assertReportedStat(
+ 'zuul.nodepool.resources.tenant.tenant-one.instances',
+ kind='c')
+ self.assertReportedStat(
+ 'zuul.nodepool.resources.project.review_example_com/org/project.'
+ 'cores', kind='c')
+ self.assertReportedStat(
+ 'zuul.nodepool.resources.project.review_example_com/org/project.'
+ 'ram', kind='c')
+ self.assertReportedStat(
+ 'zuul.nodepool.resources.project.review_example_com/org/project.'
+ 'instances', kind='c')
+
+ self.executor_server.hold_jobs_in_build = False
+ self.executor_server.release()
+ self.waitUntilSettled()
+ self.assertEqual(A.data['status'], 'MERGED')
+ self.assertEqual(A.reported, 2)
+
def test_nodepool_pipeline_priority(self):
"Test that nodes are requested at the correct pipeline priority"
@@ -6520,7 +6586,7 @@ class TestSemaphore(ZuulTestCase):
# Simulate a single zk error in useNodeSet
orig_useNodeSet = self.nodepool.useNodeSet
- def broken_use_nodeset(nodeset):
+ def broken_use_nodeset(nodeset, build_set=None):
# restore original useNodeSet
self.nodepool.useNodeSet = orig_useNodeSet
raise NoNodeError()
diff --git a/zuul/manager/__init__.py b/zuul/manager/__init__.py
index 0f2d7da06..3cd863ec0 100644
--- a/zuul/manager/__init__.py
+++ b/zuul/manager/__init__.py
@@ -403,7 +403,8 @@ class PipelineManager(object):
self.log.debug("Found job %s for change %s" % (job, item.change))
try:
nodeset = item.current_build_set.getJobNodeSet(job.name)
- self.sched.nodepool.useNodeSet(nodeset)
+ self.sched.nodepool.useNodeSet(
+ nodeset, build_set=item.current_build_set)
self.sched.executor.execute(
job, item, self.pipeline,
build_set.dependent_changes,
diff --git a/zuul/model.py b/zuul/model.py
index 432104b39..4e55aaf46 100644
--- a/zuul/model.py
+++ b/zuul/model.py
@@ -547,6 +547,7 @@ class Node(ConfigObject):
self.region = None
self.username = None
self.hold_expiration = None
+ self.resources = None
@property
def state(self):
diff --git a/zuul/nodepool.py b/zuul/nodepool.py
index 005482f01..ba22c8ddc 100644
--- a/zuul/nodepool.py
+++ b/zuul/nodepool.py
@@ -12,17 +12,30 @@
import logging
+from collections import defaultdict
from zuul import model
from zuul.lib.logutil import get_annotated_logger
from zuul.zk import LockException
+def add_resources(target, source):
+ for key, value in source.items():
+ target[key] += value
+
+
+def subtract_resources(target, source):
+ for key, value in source.items():
+ target[key] -= value
+
+
class Nodepool(object):
log = logging.getLogger('zuul.nodepool')
def __init__(self, scheduler):
self.requests = {}
self.sched = scheduler
+ self.current_resources_by_tenant = {}
+ self.current_resources_by_project = {}
def emitStats(self, request):
# Implements the following :
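The two helpers above merge per-node resource dictionaries into a running total; note that the target must be a defaultdict(int) (or already contain the keys), which is how the code below uses them. A quick sketch:

    from collections import defaultdict

    total = defaultdict(int)
    add_resources(total, {'cores': 2, 'ram': 1024, 'instances': 1})
    add_resources(total, {'cores': 4, 'ram': 2048, 'instances': 1})
    # total == {'cores': 6, 'ram': 3072, 'instances': 2}
    subtract_resources(total, {'cores': 2, 'ram': 1024, 'instances': 1})
    # total == {'cores': 4, 'ram': 2048, 'instances': 1}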
@@ -60,6 +73,37 @@ class Nodepool(object):
pipe.gauge('zuul.nodepool.current_requests', len(self.requests))
pipe.send()
+ def emitStatsResources(self):
+ if not self.sched.statsd:
+ return
+ statsd = self.sched.statsd
+
+ for tenant, resources in self.current_resources_by_tenant.items():
+ for resource, value in resources.items():
+ key = 'zuul.nodepool.resources.tenant.' \
+ '{tenant}.{resource}'
+ statsd.gauge(key, value, tenant=tenant, resource=resource)
+ for project, resources in self.current_resources_by_project.items():
+ for resource, value in resources.items():
+ key = 'zuul.nodepool.resources.project.' \
+ '{project}.{resource}'
+ statsd.gauge(key, value, project=project, resource=resource)
+
+ def emitStatsResourceCounters(self, tenant, project, resources, duration):
+ if not self.sched.statsd:
+ return
+ statsd = self.sched.statsd
+
+ for resource, value in resources.items():
+ key = 'zuul.nodepool.resources.tenant.{tenant}.{resource}'
+ statsd.incr(key, value * duration,
+ tenant=tenant, resource=resource)
+ for resource, value in resources.items():
+ key = 'zuul.nodepool.resources.project.' \
+ '{project}.{resource}'
+ statsd.incr(key, value * duration,
+ project=project, resource=resource)
+
def requestNodes(self, build_set, job, relative_priority, event=None):
log = get_annotated_logger(self.log, event)
# Create a copy of the nodeset to represent the actual nodes
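Note that the {tenant}/{project}/{resource} placeholders in the keys above are filled in from the keyword arguments passed to Zuul's statsd wrapper; with a plain statsd client the equivalent would be to format the key first (sketch):

    key = 'zuul.nodepool.resources.tenant.{tenant}.{resource}'.format(
        tenant='tenant-one', resource='cores')
    # key == 'zuul.nodepool.resources.tenant.tenant-one.cores'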
@@ -157,22 +201,47 @@ class Nodepool(object):
self.log.debug("Removing autohold for %s", autohold_key)
del self.sched.autohold_requests[autohold_key]
- def useNodeSet(self, nodeset):
+ def useNodeSet(self, nodeset, build_set=None):
self.log.info("Setting nodeset %s in use" % (nodeset,))
+ resources = defaultdict(int)
for node in nodeset.getNodes():
if node.lock is None:
raise Exception("Node %s is not locked" % (node,))
node.state = model.STATE_IN_USE
self.sched.zk.storeNode(node)
+ if node.resources:
+ add_resources(resources, node.resources)
+ if build_set and resources:
+ # We have a build set, and thus also a tenant and a project, so
+ # we can emit project-specific resource usage stats.
+ tenant_name = build_set.item.layout.tenant.name
+ project_name = build_set.item.change.project.canonical_name
+
+ self.current_resources_by_tenant.setdefault(
+ tenant_name, defaultdict(int))
+ self.current_resources_by_project.setdefault(
+ project_name, defaultdict(int))
+
+ add_resources(self.current_resources_by_tenant[tenant_name],
+ resources)
+ add_resources(self.current_resources_by_project[project_name],
+ resources)
+ self.emitStatsResources()
def returnNodeSet(self, nodeset, build=None):
self.log.info("Returning nodeset %s" % (nodeset,))
+ resources = defaultdict(int)
+ duration = None
+ project = None
+ tenant = None
+ if build:
+ project = build.build_set.item.change.project
+ tenant = build.build_set.item.pipeline.tenant.name
if (build and build.start_time and build.end_time and
build.build_set and build.build_set.item and
build.build_set.item.change and
build.build_set.item.change.project):
duration = build.end_time - build.start_time
- project = build.build_set.item.change.project
self.log.info("Nodeset %s with %s nodes was in use "
"for %s seconds for build %s for project %s",
nodeset, len(nodeset.nodes), duration, build,
@@ -183,6 +252,8 @@ class Nodepool(object):
else:
try:
if node.state == model.STATE_IN_USE:
+ if node.resources:
+ add_resources(resources, node.resources)
node.state = model.STATE_USED
self.sched.zk.storeNode(node)
except Exception:
@@ -190,6 +261,22 @@ class Nodepool(object):
"while unlocking:" % (node,))
self._unlockNodes(nodeset.getNodes())
+ # When returning a nodeset, update the gauges if we have a build.
+ # Additionally, calculate resource*duration and increment the
+ # tenant- and project-specific counters. That way we have both the
+ # current usage and cumulative counters suitable for accounting.
+ if tenant and project and resources:
+ project_name = project.canonical_name
+ subtract_resources(
+ self.current_resources_by_tenant[tenant], resources)
+ subtract_resources(
+ self.current_resources_by_project[project_name], resources)
+ self.emitStatsResources()
+
+ if duration:
+ self.emitStatsResourceCounters(
+ tenant, project_name, resources, duration)
+
def unlockNodeSet(self, nodeset):
self._unlockNodes(nodeset.getNodes())
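To make the accounting concrete, a worked example using the resource values from the test above (a sketch; it assumes a single build whose duration, end_time - start_time, is 300 seconds):

    resources = {'cores': 2, 'ram': 1024, 'instances': 1}
    duration = 300  # seconds

    for resource, value in resources.items():
        # each counter is incremented by value * duration
        print('%s += %d' % (resource, value * duration))
    # cores     += 600    (2 * 300 core-seconds)
    # ram       += 307200 (1024 * 300 ram-seconds)
    # instances += 300    (1 * 300 instance-seconds)

The matching gauges are decremented via subtract_resources() at the same time, so the gauges always show what is currently in use while the counters accumulate total consumption.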