author     James E. Blair <jim@acmegating.com>    2022-02-18 20:23:23 -0800
committer  James E. Blair <jim@acmegating.com>    2022-02-20 16:55:34 -0800
commit     c522bfa46077f0e030f03e0cd4585d5c100ed241 (patch)
tree       117a8e09a740a62fa2f154473227c3c3825a4979 /zuul/zk
parent     a1bfee1f9a37f490dbd923175f77a0f62b151c01 (diff)
download   zuul-c522bfa46077f0e030f03e0cd4585d5c100ed241.tar.gz
Add pipeline timing metrics
This adds several metrics for different phases of processing an item
in a pipeline:
* How long we wait for a response from mergers
* How long it takes to get or compute a layout
* How long it takes to freeze jobs
* How long we wait for node requests to complete
* How long we wait for an executor to start running a job
after the request
And finally, the total amount of time from the original event until the
first job starts. We already report that at the tenant level; this change
duplicates it as a pipeline-specific metric.
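For illustration, this is roughly what emitting one of these phase timers
looks like with the statsd Python client; the metric key and variable
names below are only examples, not the actual keys added by this change:

    import time
    import statsd

    client = statsd.StatsClient('localhost', 8125)

    event_time = time.time()        # when the triggering event arrived
    # ... pipeline processing, job freezing, node request, etc. ...
    first_job_start = time.time()   # when the first job began running

    # statsd timing values are milliseconds; keeping them as floats
    # preserves sub-millisecond precision.
    client.timing('zuul.tenant.example.pipeline.check.event_job_time',
                  (first_job_start - event_time) * 1000)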
Several of these would also make sense as job metrics, but since they
are mainly intended to diagnose Zuul system performance rather than
individual jobs, emitting them per job would waste storage space due to
the extremely high cardinality.
Additionally, two other timing metrics are added: the cumulative time
spent reading and writing ZKObject data to ZK during pipeline
processing. These can help determine whether more effort should be
spent optimizing ZK data transfer.
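The pattern (implemented below for RawShardIO and ZKObject) is simply to
wrap each ZooKeeper call in a monotonic timer and add the elapsed time to
a running total on the context; a simplified sketch:

    import time

    class Context:
        # Simplified stand-in for ZKContext; only the counters are shown.
        def __init__(self):
            self.cumulative_read_time = 0.0
            self.cumulative_write_time = 0.0

    def timed_get(context, client, path):
        # Time only the ZK round trip, not decompression or deserialization.
        start = time.perf_counter()
        data, zstat = client.get(path)
        context.cumulative_read_time += time.perf_counter() - start
        return data, zstat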
In preparing this change, I noticed that the Python statsd client emits floating
point values for timing. It's not clear whether this strictly matches
the statsd spec, but since it does emit values with that precision,
I have removed several int() casts in order to maintain the precision
through to the statsd client.
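For example (assuming the common statsd Python client), truncating would
lose sub-millisecond resolution:

    import statsd

    client = statsd.StatsClient('localhost', 8125)

    elapsed_ms = 12.345                                   # float milliseconds
    client.timing('zuul.example.phase', elapsed_ms)       # precision preserved
    client.timing('zuul.example.phase', int(elapsed_ms))  # truncates to 12 ms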
I also noticed a place where we were writing a monotonic timestamp
value in a JSON-serialized string to ZK. I do not believe this value is
currently being used, so there is no further error to correct; however,
we should not use time.monotonic() for values that are serialized, since
the reference clock will be different on different systems.
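To illustrate: time.monotonic() counts from an arbitrary per-system
reference point (often boot), so a serialized value is meaningless on any
other host, while time.time() is anchored to the epoch:

    import json
    import time

    # Meaningless once it leaves this machine; the reference point
    # differs per host (and may reset on reboot).
    bad = json.dumps({"timestamp": time.monotonic()})

    # Wall-clock epoch seconds; comparable across systems.
    good = json.dumps({"timestamp": time.time()})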
Several new attributes are added to the QueueItem and Build classes,
but in a backwards-compatible way, so no model API schema upgrade is
needed. The code sites where they are used protect against the null
values which will occur in a mixed-version cluster (the components will
simply not emit these stats in those cases).
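Roughly, the guard looks like the following (the attribute and metric
names here are hypothetical): the stat is only emitted when both ends of
the interval were actually recorded by new-enough components:

    def report_node_request_time(statsd_client, item):
        requested = getattr(item, 'nodeset_requested_time', None)
        fulfilled = getattr(item, 'nodeset_fulfilled_time', None)
        if requested is None or fulfilled is None:
            # Written by an older component; just skip the stat.
            return
        statsd_client.timing('zuul.pipeline.example.node_request_time',
                             (fulfilled - requested) * 1000)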
Change-Id: Iaacbef7fa2ed93bfc398a118c5e8cfbc0a67b846
Diffstat (limited to 'zuul/zk')

-rw-r--r--   zuul/zk/event_queues.py |  2
-rw-r--r--   zuul/zk/sharding.py     | 20
-rw-r--r--   zuul/zk/zkobject.py     | 12

3 files changed, 32 insertions(+), 2 deletions(-)
diff --git a/zuul/zk/event_queues.py b/zuul/zk/event_queues.py
index fdc99442f..e35e05ecb 100644
--- a/zuul/zk/event_queues.py
+++ b/zuul/zk/event_queues.py
@@ -590,7 +590,7 @@ class ManagementEventQueue(ZooKeeperEventQueue):
             return
         result_data = {"traceback": event.traceback,
-                       "timestamp": time.monotonic()}
+                       "timestamp": time.time()}
         try:
             self.kazoo_client.set(
                 event.result_ref,
diff --git a/zuul/zk/sharding.py b/zuul/zk/sharding.py
index a1278a8a2..5ca1158d5 100644
--- a/zuul/zk/sharding.py
+++ b/zuul/zk/sharding.py
@@ -14,6 +14,7 @@
 import io
 from contextlib import suppress
+import time
 import zlib

 from kazoo.exceptions import NoNodeError
@@ -30,6 +31,8 @@ class RawShardIO(io.RawIOBase):
         self.shard_base = path
         self.compressed_bytes_read = 0
         self.compressed_bytes_written = 0
+        self.cumulative_read_time = 0.0
+        self.cumulative_write_time = 0.0

     def readable(self):
         return True
@@ -46,12 +49,17 @@ class RawShardIO(io.RawIOBase):
     @property
     def _shards(self):
         try:
-            return self.client.get_children(self.shard_base)
+            start = time.perf_counter()
+            ret = self.client.get_children(self.shard_base)
+            self.cumulative_read_time += time.perf_counter() - start
+            return ret
         except NoNodeError:
             return []

     def _getData(self, path):
+        start = time.perf_counter()
         data, _ = self.client.get(path)
+        self.cumulative_read_time += time.perf_counter() - start
         self.compressed_bytes_read += len(data)
         return zlib.decompress(data)
@@ -69,12 +77,14 @@ class RawShardIO(io.RawIOBase):
         shard_bytes = zlib.compress(shard_bytes)
         if not (len(shard_bytes) < NODE_BYTE_SIZE_LIMIT):
             raise RuntimeError("Shard too large")
+        start = time.perf_counter()
         self.client.create(
             "{}/".format(self.shard_base),
             shard_bytes,
             sequence=True,
             makepath=True,
         )
+        self.cumulative_write_time += time.perf_counter() - start
         self.compressed_bytes_written += len(shard_bytes)
         return min(byte_count, NODE_BYTE_SIZE_LIMIT)
@@ -88,6 +98,10 @@ class BufferedShardWriter(io.BufferedWriter):
     def compressed_bytes_written(self):
         return self.__raw.compressed_bytes_written

+    @property
+    def cumulative_write_time(self):
+        return self.__raw.cumulative_write_time
+

 class BufferedShardReader(io.BufferedReader):
     def __init__(self, client, path):
@@ -97,3 +111,7 @@ class BufferedShardReader(io.BufferedReader):
     @property
     def compressed_bytes_read(self):
         return self.__raw.compressed_bytes_read
+
+    @property
+    def cumulative_read_time(self):
+        return self.__raw.cumulative_read_time
diff --git a/zuul/zk/zkobject.py b/zuul/zk/zkobject.py
index 45eb2b8e2..e837bf3ea 100644
--- a/zuul/zk/zkobject.py
+++ b/zuul/zk/zkobject.py
@@ -31,6 +31,8 @@ class ZKContext:
         self.lock = lock
         self.stop_event = stop_event
         self.log = log
+        self.cumulative_read_time = 0.0
+        self.cumulative_write_time = 0.0

     def sessionIsValid(self):
         return ((not self.lock or self.lock.is_still_valid()) and
@@ -237,7 +239,10 @@ class ZKObject:
         path = self.getPath()
         while context.sessionIsValid():
             try:
+                start = time.perf_counter()
                 compressed_data, zstat = context.client.get(path)
+                context.cumulative_read_time += time.perf_counter() - start
+
                 self._set(_zkobject_hash=None)
                 try:
                     data = zlib.decompress(compressed_data)
@@ -278,6 +283,7 @@ class ZKObject:
         while context.sessionIsValid():
             try:
                 compressed_data = zlib.compress(data)
+                start = time.perf_counter()
                 if create:
                     real_path, zstat = context.client.create(
                         path, compressed_data, makepath=True,
@@ -285,6 +291,7 @@ class ZKObject:
                 else:
                     zstat = context.client.set(path, compressed_data,
                                                version=self._zstat.version)
+                context.cumulative_write_time += time.perf_counter() - start
                 self._set(_zstat=zstat,
                           _zkobject_hash=hash(data),
                           _zkobject_compressed_size=len(compressed_data),
@@ -336,6 +343,8 @@ class ShardedZKObject(ZKObject):
                         context.client, path) as stream:
                     data = stream.read()
                     compressed_size = stream.compressed_bytes_read
+                    context.cumulative_read_time += \
+                        stream.cumulative_read_time
                 if not data and context.client.exists(path) is None:
                     raise NoNodeError
                 self._set(**self.deserialize(data, context))
@@ -382,6 +391,9 @@ class ShardedZKObject(ZKObject):
                     stream.write(data)
                     stream.flush()
                     compressed_size = stream.compressed_bytes_written
+                    context.cumulative_write_time += \
+                        stream.cumulative_write_time
+
                 self._set(_zkobject_hash=hash(data),
                           _zkobject_compressed_size=compressed_size,
                           _zkobject_uncompressed_size=len(data),
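After a processing pass, the totals accumulated on the context can be
flushed to statsd by the caller; a sketch only, since that reporting code
is outside this directory (the statsd client handle and metric keys below
are assumptions):

    def report_zk_timing(statsd_client, pipeline_key, context):
        if statsd_client is None:
            return
        # Convert cumulative seconds to the milliseconds statsd expects.
        statsd_client.timing('%s.read_time' % pipeline_key,
                             context.cumulative_read_time * 1000)
        statsd_client.timing('%s.write_time' % pipeline_key,
                             context.cumulative_write_time * 1000)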