author     James E. Blair <jim@acmegating.com>  2022-02-18 20:23:23 -0800
committer  James E. Blair <jim@acmegating.com>  2022-02-20 16:55:34 -0800
commit     c522bfa46077f0e030f03e0cd4585d5c100ed241 (patch)
tree       117a8e09a740a62fa2f154473227c3c3825a4979 /zuul/zk
parent     a1bfee1f9a37f490dbd923175f77a0f62b151c01 (diff)
Add pipeline timing metrics
This adds several metrics for different phases of processing an item
in a pipeline:

* How long we wait for a response from mergers
* How long it takes to get or compute a layout
* How long it takes to freeze jobs
* How long we wait for node requests to complete
* How long we wait for an executor to start running a job after the
  request

And finally, the total amount of time from the original event until
the first job starts. We already report that at the tenant level;
this change duplicates it as a pipeline-specific metric.

Several of these would also make sense as job metrics, but since they
are mainly intended to diagnose Zuul system performance rather than
individual jobs, recording them per job would waste storage space due
to the extremely high cardinality.

Additionally, two other timing metrics are added: the cumulative time
spent reading and writing ZKObject data to ZK during pipeline
processing. These can help determine whether more effort should be
spent optimizing ZK data transfer.

In preparing this change, I noticed that the Python statsd library
emits floating-point values for timing. It is not clear whether this
strictly matches the statsd spec, but since the library emits values
with that precision, I have removed several int() casts in order to
maintain the precision through to the statsd client.

I also noticed a place where we were writing a monotonic timestamp
value in a JSON-serialized string to ZK. I do not believe this value
is currently being used, so there is no further error to correct;
however, we should not use time.monotonic() for values that are
serialized, since the reference clock will be different on different
systems.

Several new attributes are added to the QueueItem and Build classes,
but in a backwards-compatible way, so no model API schema upgrade is
needed. The code sites where they are used protect against the null
values which will occur in a mixed-version cluster (the components
will simply not emit these stats in those cases).

Change-Id: Iaacbef7fa2ed93bfc398a118c5e8cfbc0a67b846
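All of the cumulative ZK timing metrics below follow the same pattern:
bracket each ZooKeeper round trip with time.perf_counter() and add the
elapsed interval to a running counter. A minimal sketch of that
pattern, using hypothetical names (TimedReads, do_get) rather than
Zuul's actual classes:

    import time

    class TimedReads:
        """Accumulates time spent in ZooKeeper read calls."""

        def __init__(self, client):
            self.client = client            # a connected kazoo KazooClient
            self.cumulative_read_time = 0.0

        def do_get(self, path):
            # perf_counter() is monotonic and high-resolution, which
            # makes it well suited to measuring short intervals on a
            # single host (it must never be serialized across hosts).
            start = time.perf_counter()
            data, zstat = self.client.get(path)
            self.cumulative_read_time += time.perf_counter() - start
            return data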
Diffstat (limited to 'zuul/zk')
-rw-r--r--  zuul/zk/event_queues.py   2
-rw-r--r--  zuul/zk/sharding.py      20
-rw-r--r--  zuul/zk/zkobject.py      12
3 files changed, 32 insertions(+), 2 deletions(-)
diff --git a/zuul/zk/event_queues.py b/zuul/zk/event_queues.py
index fdc99442f..e35e05ecb 100644
--- a/zuul/zk/event_queues.py
+++ b/zuul/zk/event_queues.py
@@ -590,7 +590,7 @@ class ManagementEventQueue(ZooKeeperEventQueue):
return
result_data = {"traceback": event.traceback,
- "timestamp": time.monotonic()}
+ "timestamp": time.time()}
try:
self.kazoo_client.set(
event.result_ref,
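The hunk above swaps time.monotonic() for time.time() because a
monotonic clock's reference point is arbitrary and per-system, so a
serialized monotonic value is meaningless when read on another host. A
short illustration (the numeric values shown are hypothetical):

    import time

    t_mono = time.monotonic()  # e.g. 84321.5: seconds since some
                               # unspecified per-system reference point
    t_wall = time.time()       # e.g. 1645400123.7: seconds since the
                               # Unix epoch, comparable across hosts

    # Only the wall-clock value is safe to serialize for other systems:
    result_data = {"timestamp": t_wall}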
diff --git a/zuul/zk/sharding.py b/zuul/zk/sharding.py
index a1278a8a2..5ca1158d5 100644
--- a/zuul/zk/sharding.py
+++ b/zuul/zk/sharding.py
@@ -14,6 +14,7 @@
import io
from contextlib import suppress
+import time
import zlib
from kazoo.exceptions import NoNodeError
@@ -30,6 +31,8 @@ class RawShardIO(io.RawIOBase):
self.shard_base = path
self.compressed_bytes_read = 0
self.compressed_bytes_written = 0
+ self.cumulative_read_time = 0.0
+ self.cumulative_write_time = 0.0
def readable(self):
return True
@@ -46,12 +49,17 @@ class RawShardIO(io.RawIOBase):
@property
def _shards(self):
try:
- return self.client.get_children(self.shard_base)
+ start = time.perf_counter()
+ ret = self.client.get_children(self.shard_base)
+ self.cumulative_read_time += time.perf_counter() - start
+ return ret
except NoNodeError:
return []
def _getData(self, path):
+ start = time.perf_counter()
data, _ = self.client.get(path)
+ self.cumulative_read_time += time.perf_counter() - start
self.compressed_bytes_read += len(data)
return zlib.decompress(data)
@@ -69,12 +77,14 @@ class RawShardIO(io.RawIOBase):
shard_bytes = zlib.compress(shard_bytes)
if not (len(shard_bytes) < NODE_BYTE_SIZE_LIMIT):
raise RuntimeError("Shard too large")
+ start = time.perf_counter()
self.client.create(
"{}/".format(self.shard_base),
shard_bytes,
sequence=True,
makepath=True,
)
+ self.cumulative_write_time += time.perf_counter() - start
self.compressed_bytes_written += len(shard_bytes)
return min(byte_count, NODE_BYTE_SIZE_LIMIT)
@@ -88,6 +98,10 @@ class BufferedShardWriter(io.BufferedWriter):
def compressed_bytes_written(self):
return self.__raw.compressed_bytes_written
+ @property
+ def cumulative_write_time(self):
+ return self.__raw.cumulative_write_time
+
class BufferedShardReader(io.BufferedReader):
def __init__(self, client, path):
@@ -97,3 +111,7 @@ class BufferedShardReader(io.BufferedReader):
@property
def compressed_bytes_read(self):
return self.__raw.compressed_bytes_read
+
+ @property
+ def cumulative_read_time(self):
+ return self.__raw.cumulative_read_time
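The new properties on BufferedShardWriter and BufferedShardReader
expose the raw stream's totals so callers can fold shard I/O time into
their own counters. A hypothetical caller, assuming a reachable
ZooKeeper at localhost:2181 and an existing sharded node at the path
shown (both are illustrative):

    from kazoo.client import KazooClient
    from zuul.zk.sharding import BufferedShardReader

    client = KazooClient(hosts="localhost:2181")  # assumed endpoint
    client.start()

    with BufferedShardReader(client, "/zuul/example/sharded/node") as stream:
        data = stream.read()
        # Seconds spent inside ZooKeeper calls while reading shards:
        read_time = stream.cumulative_read_time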
diff --git a/zuul/zk/zkobject.py b/zuul/zk/zkobject.py
index 45eb2b8e2..e837bf3ea 100644
--- a/zuul/zk/zkobject.py
+++ b/zuul/zk/zkobject.py
@@ -31,6 +31,8 @@ class ZKContext:
self.lock = lock
self.stop_event = stop_event
self.log = log
+ self.cumulative_read_time = 0.0
+ self.cumulative_write_time = 0.0
def sessionIsValid(self):
return ((not self.lock or self.lock.is_still_valid()) and
@@ -237,7 +239,10 @@ class ZKObject:
path = self.getPath()
while context.sessionIsValid():
try:
+ start = time.perf_counter()
compressed_data, zstat = context.client.get(path)
+ context.cumulative_read_time += time.perf_counter() - start
+
self._set(_zkobject_hash=None)
try:
data = zlib.decompress(compressed_data)
@@ -278,6 +283,7 @@ class ZKObject:
while context.sessionIsValid():
try:
compressed_data = zlib.compress(data)
+ start = time.perf_counter()
if create:
real_path, zstat = context.client.create(
path, compressed_data, makepath=True,
@@ -285,6 +291,7 @@ class ZKObject:
else:
zstat = context.client.set(path, compressed_data,
version=self._zstat.version)
+ context.cumulative_write_time += time.perf_counter() - start
self._set(_zstat=zstat,
_zkobject_hash=hash(data),
_zkobject_compressed_size=len(compressed_data),
@@ -336,6 +343,8 @@ class ShardedZKObject(ZKObject):
context.client, path) as stream:
data = stream.read()
compressed_size = stream.compressed_bytes_read
+ context.cumulative_read_time += \
+ stream.cumulative_read_time
if not data and context.client.exists(path) is None:
raise NoNodeError
self._set(**self.deserialize(data, context))
@@ -382,6 +391,9 @@ class ShardedZKObject(ZKObject):
stream.write(data)
stream.flush()
compressed_size = stream.compressed_bytes_written
+ context.cumulative_write_time += \
+ stream.cumulative_write_time
+
self._set(_zkobject_hash=hash(data),
_zkobject_compressed_size=compressed_size,
_zkobject_uncompressed_size=len(data),
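The per-context totals accumulated above are what pipeline processing
can later emit as the cumulative read/write timing metrics described in
the commit message. One way that might look (a sketch only; the metric
keys and the statsd handle are illustrative, not taken from this
change):

    def report_zk_timing(statsd, context, key):
        # statsd timers are conventionally milliseconds; the context
        # accumulates seconds, so convert. Fractional values are kept,
        # matching the float precision discussed in the commit message.
        statsd.timing(f"{key}.read_objects",
                      context.cumulative_read_time * 1000)
        statsd.timing(f"{key}.write_objects",
                      context.cumulative_write_time * 1000)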