diff options
-rw-r--r-- | doc/source/admin/monitoring.rst | 26 | ||||
-rw-r--r-- | releasenotes/notes/resource-usage-stats-bfcd6765ef4a9c86.yaml | 14 | ||||
-rw-r--r-- | tests/base.py | 3 | ||||
-rw-r--r-- | tests/unit/test_scheduler.py | 68 | ||||
-rw-r--r-- | zuul/manager/__init__.py | 3 | ||||
-rw-r--r-- | zuul/model.py | 1 | ||||
-rw-r--r-- | zuul/nodepool.py | 91 |
7 files changed, 202 insertions, 4 deletions
diff --git a/doc/source/admin/monitoring.rst b/doc/source/admin/monitoring.rst index a51d17584..bb61381fd 100644 --- a/doc/source/admin/monitoring.rst +++ b/doc/source/admin/monitoring.rst @@ -288,6 +288,32 @@ These metrics are emitted by the Zuul :ref:`scheduler`: Persistently high values indicate more testing node resources would be helpful. + .. stat:: resources + + Holds metrics about resource usage by tenant or project if resources + of nodes are reported by nodepool. + + .. stat:: tenant + + Holds resource usage metrics by tenant. + + .. stat:: <tenant>.<resource> + :type: counter, gauge + + Counter with the summed usage by tenant as <resource> seconds and + gauge with the currently used resources by tenant. + + .. stat:: project + + Holds resource usage metrics by project. + + .. stat:: <project>.<resource> + :type: counter, gauge + + Counter with the summed usage by project as <resource> seconds and + gauge with the currently used resources by project. + + .. stat:: zuul.mergers Holds metrics related to Zuul mergers. diff --git a/releasenotes/notes/resource-usage-stats-bfcd6765ef4a9c86.yaml b/releasenotes/notes/resource-usage-stats-bfcd6765ef4a9c86.yaml new file mode 100644 index 000000000..a11697ada --- /dev/null +++ b/releasenotes/notes/resource-usage-stats-bfcd6765ef4a9c86.yaml @@ -0,0 +1,14 @@ +--- +features: + - | + Zuul now reports resource usage statistics if they are provided by nodepool. + + The following statistics are emitted: + + - `zuul.nodepool.resources.tenant.{tenant}.{resource}`: + Gauge with the currently used resources by tenant and counter with the + summed usage by tenant. e.g. cpu seconds + + - `zuul.nodepool.resources.project.{project}.{resource}`: + Gauge with the currently used resources by project and counter with the + summed usage by project. e.g. 
cpu seconds diff --git a/tests/base.py b/tests/base.py index cd9bb01bf..f1c90e980 100644 --- a/tests/base.py +++ b/tests/base.py @@ -1846,6 +1846,7 @@ class FakeNodepool(object): self.fail_requests = set() self.remote_ansible = False self.attributes = None + self.resources = None def stop(self): self._running = False @@ -1951,6 +1952,8 @@ class FakeNodepool(object): host_keys=host_keys, executor='fake-nodepool', hold_expiration=None) + if self.resources: + data['resources'] = self.resources if self.remote_ansible: data['connection_type'] = 'ssh' if 'fakeuser' in node_type: diff --git a/tests/unit/test_scheduler.py b/tests/unit/test_scheduler.py index 85fc28e3f..91939609d 100644 --- a/tests/unit/test_scheduler.py +++ b/tests/unit/test_scheduler.py @@ -5317,6 +5317,72 @@ For CI problems and help debugging, contact ci@example.org""" self.assertIn('project-test1 : SKIPPED', A.messages[1]) self.assertIn('project-test2 : SKIPPED', A.messages[1]) + def test_nodepool_resources(self): + "Test that resources are reported" + + self.executor_server.hold_jobs_in_build = True + self.fake_nodepool.resources = { + 'cores': 2, + 'ram': 1024, + 'instances': 1, + } + A = self.fake_gerrit.addFakeChange('org/project', 'master', 'A') + A.addApproval('Code-Review', 2) + self.fake_gerrit.addEvent(A.addApproval('Approved', 1)) + self.waitUntilSettled() + + self.executor_server.release('project-merge') + self.waitUntilSettled() + + # Check that resource usage gauges are reported + self.assertHistory([ + dict(name='project-merge', result='SUCCESS', changes='1,1'), + ]) + self.assertReportedStat( + 'zuul.nodepool.resources.tenant.tenant-one.cores', + value='2', kind='g') + self.assertReportedStat( + 'zuul.nodepool.resources.tenant.tenant-one.ram', + value='1024', kind='g') + self.assertReportedStat( + 'zuul.nodepool.resources.tenant.tenant-one.instances', + value='1', kind='g') + self.assertReportedStat( + 'zuul.nodepool.resources.project.review_example_com/org/project.' 
+ 'cores', value='2', kind='g') + self.assertReportedStat( + 'zuul.nodepool.resources.project.review_example_com/org/project.' + 'ram', value='1024', kind='g') + self.assertReportedStat( + 'zuul.nodepool.resources.project.review_example_com/org/project.' + 'instances', value='1', kind='g') + + # Check that resource usage counters are reported + self.assertReportedStat( + 'zuul.nodepool.resources.tenant.tenant-one.cores', + kind='c') + self.assertReportedStat( + 'zuul.nodepool.resources.tenant.tenant-one.ram', + kind='c') + self.assertReportedStat( + 'zuul.nodepool.resources.tenant.tenant-one.instances', + kind='c') + self.assertReportedStat( + 'zuul.nodepool.resources.project.review_example_com/org/project.' + 'cores', kind='c') + self.assertReportedStat( + 'zuul.nodepool.resources.project.review_example_com/org/project.' + 'ram', kind='c') + self.assertReportedStat( + 'zuul.nodepool.resources.project.review_example_com/org/project.' + 'instances', kind='c') + + self.executor_server.hold_jobs_in_build = False + self.executor_server.release() + self.waitUntilSettled() + self.assertEqual(A.data['status'], 'MERGED') + self.assertEqual(A.reported, 2) + def test_nodepool_pipeline_priority(self): "Test that nodes are requested at the correct pipeline priority" @@ -6520,7 +6586,7 @@ class TestSemaphore(ZuulTestCase): # Simulate a single zk error in useNodeSet orig_useNodeSet = self.nodepool.useNodeSet - def broken_use_nodeset(nodeset): + def broken_use_nodeset(nodeset, build_set=None): # restore original useNodeSet self.nodepool.useNodeSet = orig_useNodeSet raise NoNodeError() diff --git a/zuul/manager/__init__.py b/zuul/manager/__init__.py index 0f2d7da06..3cd863ec0 100644 --- a/zuul/manager/__init__.py +++ b/zuul/manager/__init__.py @@ -403,7 +403,8 @@ class PipelineManager(object): self.log.debug("Found job %s for change %s" % (job, item.change)) try: nodeset = item.current_build_set.getJobNodeSet(job.name) - self.sched.nodepool.useNodeSet(nodeset) + 
self.sched.nodepool.useNodeSet( + nodeset, build_set=item.current_build_set) self.sched.executor.execute( job, item, self.pipeline, build_set.dependent_changes, diff --git a/zuul/model.py b/zuul/model.py index 432104b39..4e55aaf46 100644 --- a/zuul/model.py +++ b/zuul/model.py @@ -547,6 +547,7 @@ class Node(ConfigObject): self.region = None self.username = None self.hold_expiration = None + self.resources = None @property def state(self): diff --git a/zuul/nodepool.py b/zuul/nodepool.py index 005482f01..ba22c8ddc 100644 --- a/zuul/nodepool.py +++ b/zuul/nodepool.py @@ -12,17 +12,30 @@ import logging +from collections import defaultdict from zuul import model from zuul.lib.logutil import get_annotated_logger from zuul.zk import LockException +def add_resources(target, source): + for key, value in source.items(): + target[key] += value + + +def subtract_resources(target, source): + for key, value in source.items(): + target[key] -= value + + class Nodepool(object): log = logging.getLogger('zuul.nodepool') def __init__(self, scheduler): self.requests = {} self.sched = scheduler + self.current_resources_by_tenant = {} + self.current_resources_by_project = {} def emitStats(self, request): # Implements the following : @@ -60,6 +73,37 @@ class Nodepool(object): pipe.gauge('zuul.nodepool.current_requests', len(self.requests)) pipe.send() + def emitStatsResources(self): + if not self.sched.statsd: + return + statsd = self.sched.statsd + + for tenant, resources in self.current_resources_by_tenant.items(): + for resource, value in resources.items(): + key = 'zuul.nodepool.resources.tenant.' \ + '{tenant}.{resource}' + statsd.gauge(key, value, tenant=tenant, resource=resource) + for project, resources in self.current_resources_by_project.items(): + for resource, value in resources.items(): + key = 'zuul.nodepool.resources.project.' 
\ + '{project}.{resource}' + statsd.gauge(key, value, project=project, resource=resource) + + def emitStatsResourceCounters(self, tenant, project, resources, duration): + if not self.sched.statsd: + return + statsd = self.sched.statsd + + for resource, value in resources.items(): + key = 'zuul.nodepool.resources.tenant.{tenant}.{resource}' + statsd.incr(key, value * duration, + tenant=tenant, resource=resource) + for resource, value in resources.items(): + key = 'zuul.nodepool.resources.project.' \ + '{project}.{resource}' + statsd.incr(key, value * duration, + project=project, resource=resource) + def requestNodes(self, build_set, job, relative_priority, event=None): log = get_annotated_logger(self.log, event) # Create a copy of the nodeset to represent the actual nodes @@ -157,22 +201,47 @@ class Nodepool(object): self.log.debug("Removing autohold for %s", autohold_key) del self.sched.autohold_requests[autohold_key] - def useNodeSet(self, nodeset): + def useNodeSet(self, nodeset, build_set=None): self.log.info("Setting nodeset %s in use" % (nodeset,)) + resources = defaultdict(int) for node in nodeset.getNodes(): if node.lock is None: raise Exception("Node %s is not locked" % (node,)) node.state = model.STATE_IN_USE self.sched.zk.storeNode(node) + if node.resources: + add_resources(resources, node.resources) + if build_set and resources: + # we have a buildset and thus also tenant and project so we + # can emit project specific resource usage stats + tenant_name = build_set.item.layout.tenant.name + project_name = build_set.item.change.project.canonical_name + + self.current_resources_by_tenant.setdefault( + tenant_name, defaultdict(int)) + self.current_resources_by_project.setdefault( + project_name, defaultdict(int)) + + add_resources(self.current_resources_by_tenant[tenant_name], + resources) + add_resources(self.current_resources_by_project[project_name], + resources) + self.emitStatsResources() def returnNodeSet(self, nodeset, build=None): 
self.log.info("Returning nodeset %s" % (nodeset,)) + resources = defaultdict(int) + duration = None + project = None + tenant = None + if build: + project = build.build_set.item.change.project + tenant = build.build_set.item.pipeline.tenant.name if (build and build.start_time and build.end_time and build.build_set and build.build_set.item and build.build_set.item.change and build.build_set.item.change.project): duration = build.end_time - build.start_time - project = build.build_set.item.change.project self.log.info("Nodeset %s with %s nodes was in use " "for %s seconds for build %s for project %s", nodeset, len(nodeset.nodes), duration, build, @@ -183,6 +252,8 @@ class Nodepool(object): else: try: if node.state == model.STATE_IN_USE: + if node.resources: + add_resources(resources, node.resources) node.state = model.STATE_USED self.sched.zk.storeNode(node) except Exception: @@ -190,6 +261,22 @@ class Nodepool(object): "while unlocking:" % (node,)) self._unlockNodes(nodeset.getNodes()) + # When returning a nodeset we need to update the gauges if we have a + # build. Further we calculate resource*duration and increment their + # tenant or project specific counters. With that we have both the + # current value and also counters to be able to perform accounting. + if tenant and project and resources: + project_name = project.canonical_name + subtract_resources( + self.current_resources_by_tenant[tenant], resources) + subtract_resources( + self.current_resources_by_project[project_name], resources) + self.emitStatsResources() + + if duration: + self.emitStatsResourceCounters( + tenant, project_name, resources, duration) + def unlockNodeSet(self, nodeset): self._unlockNodes(nodeset.getNodes()) |