From 0c76985faf10b39e08e01c51683d76105862de36 Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Mon, 29 Nov 2021 14:12:56 -0800 Subject: Don't delete pipeline summary objects if they have a syntax error With most ZKObjects, we delete them from ZK if we are unable to deserialize the JSON data. The pipeline manager will likely re-create them if able in that case. But the pipeline summary object is a unique case. We read from it without obtaining a lock, so it's possible (likely even) that a scheduler is in the middle of writing it out (it's sharded, so it can be multiple znodes) when a zuul-web reads it. In that case it was our intention to ignore the error and use the previous data. However, since the zkobject base class automatically deletes the object on error, this could result in deleting the summary from ZK as it's being written. In that case we might continue using cached data (or have no data if we didn't happen to have read from it already) for an extended period of time until the pipeline updates again (and that update could have the same problem). To avoid this, add a class variable to indicate that the pipeline summary object should not delete corrupt data. We will assume that it is in the process of being written (and even if it is legitimately corrupt, the resolution is the same regardless: wait for the scheduler to write it again on the next pipeline pass, which it always does). Change-Id: I6da8e7e01e0a31bf30520fdf9829b2a2f0559c11 --- zuul/zk/zkobject.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'zuul/zk/zkobject.py') diff --git a/zuul/zk/zkobject.py b/zuul/zk/zkobject.py index 7e09108be..b7cd6901a 100644 --- a/zuul/zk/zkobject.py +++ b/zuul/zk/zkobject.py @@ -255,6 +255,10 @@ class ShardedZKObject(ZKObject): # If the node exists when we create we normally error, unless this # is set, in which case we proceed and truncate.
truncate_on_create = False + # Normally we delete nodes which have syntax errors. The + # pipeline summary is read without a write lock, so such errors + # are expected there; set this to False to keep such nodes. + delete_on_error = True def _load(self, context, path=None): if path is None: @@ -282,7 +286,8 @@ class ShardedZKObject(ZKObject): # ourself here so we know what object triggered it. context.log.error( "Exception loading ZKObject %s", self) - self.delete(context) + if self.delete_on_error: + self.delete(context) raise InvalidObjectError from exc raise Exception("ZooKeeper session or lock not valid") -- cgit v1.2.1