summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author James E. Blair <jim@acmegating.com> 2023-02-27 13:16:13 -0800
committer James E. Blair <jim@acmegating.com> 2023-02-28 07:11:41 -0800
commit 7a8882c642d631247f2339ac67bb3916933d754e (patch)
tree 8396b776feee82a2a4dea00f12759e4fcf4d5cf0
parent 7254a75cd8ec9a34d9eb10e51a4b93de2e4defd8 (diff)
download zuul-7a8882c642d631247f2339ac67bb3916933d754e.tar.gz
Set layout state event ltime in delete-pipeline-state
The delete-pipeline-state command updates the layout state in order to force schedulers to update their local layout (essentially perform a local-only reconfiguration). In doing so, it sets the last event ltime to -1. This is reasonable for initializing a new system, but in an existing system, when an event arrives at the tenant trigger event queue, it is assigned the last reconfiguration event ltime seen by that trigger event queue. Later, when a scheduler processes such a trigger event after the delete-pipeline-state command has run, it will refuse to handle the event since the event arrived much later than the scheduler's local layout state. This must then be corrected manually by the operator by forcing a tenant reconfiguration. This means that the system essentially suffers the delay of two sequential reconfigurations before it can proceed.

To correct this, set the last event ltime for the layout state to the ltime of the layout state itself. This means that once a scheduler has updated its local layout, it can proceed with processing old events.

Change-Id: I66e798adbbdd55ff1beb1ecee39c7f5a5351fc4b
-rw-r--r-- tests/unit/test_client.py 138
-rwxr-xr-x zuul/cmd/client.py 9
2 files changed, 79 insertions, 68 deletions
diff --git a/tests/unit/test_client.py b/tests/unit/test_client.py
index b51639952..1f2b3d220 100644
--- a/tests/unit/test_client.py
+++ b/tests/unit/test_client.py
@@ -27,10 +27,11 @@ import jwt
import testtools
from zuul.zk import ZooKeeperClient
+from zuul.zk.locks import pipeline_lock
from zuul.cmd.client import parse_cutoff
from tests.base import BaseTestCase, ZuulTestCase
-from tests.base import FIXTURE_DIR
+from tests.base import FIXTURE_DIR, iterate_timeout
from kazoo.exceptions import NoNodeError
@@ -362,82 +363,93 @@ class TestOnlineZKOperations(ZuulTestCase):
def assertSQLState(self):
pass
- def test_delete_pipeline_check(self):
- self.executor_server.hold_jobs_in_build = True
- A = self.fake_gerrit.addFakeChange('org/project', 'master', 'A')
- self.fake_gerrit.addEvent(A.getPatchsetCreatedEvent(1))
- self.waitUntilSettled()
-
- config_file = os.path.join(self.test_root, 'zuul.conf')
- with open(config_file, 'w') as f:
- self.config.write(f)
-
- # Make sure the pipeline exists
- self.getZKTree('/zuul/tenant/tenant-one/pipeline/check/item')
- p = subprocess.Popen(
- [os.path.join(sys.prefix, 'bin/zuul-admin'),
- '-c', config_file,
- 'delete-pipeline-state',
- 'tenant-one', 'check',
- ],
- stdout=subprocess.PIPE)
- out, _ = p.communicate()
- self.log.debug(out.decode('utf8'))
- # Make sure it's deleted
- with testtools.ExpectedException(NoNodeError):
- self.getZKTree('/zuul/tenant/tenant-one/pipeline/check/item')
-
- self.executor_server.hold_jobs_in_build = False
- self.executor_server.release()
- B = self.fake_gerrit.addFakeChange('org/project', 'master', 'B')
- self.fake_gerrit.addEvent(B.getPatchsetCreatedEvent(1))
+ def _test_delete_pipeline(self, pipeline):
+ sched = self.scheds.first.sched
+ # Force a reconfiguration due to a config change (so that the
+ # tenant trigger event queue gets a minimum timestamp set)
+ file_dict = {'zuul.yaml': ''}
+ M = self.fake_gerrit.addFakeChange('org/project', 'master', 'A',
+ files=file_dict)
+ M.setMerged()
+ self.fake_gerrit.addEvent(M.getChangeMergedEvent())
self.waitUntilSettled()
- self.assertHistory([
- dict(name='project-merge', result='SUCCESS', changes='1,1'),
- dict(name='project-merge', result='SUCCESS', changes='2,1'),
- dict(name='project-test1', result='SUCCESS', changes='2,1'),
- dict(name='project-test2', result='SUCCESS', changes='2,1'),
- ], ordered=False)
- def test_delete_pipeline_gate(self):
self.executor_server.hold_jobs_in_build = True
A = self.fake_gerrit.addFakeChange('org/project', 'master', 'A')
- A.addApproval('Code-Review', 2)
- self.fake_gerrit.addEvent(A.addApproval('Approved', 1))
+ if pipeline == 'check':
+ self.fake_gerrit.addEvent(A.getPatchsetCreatedEvent(1))
+ else:
+ A.addApproval('Code-Review', 2)
+ self.fake_gerrit.addEvent(A.addApproval('Approved', 1))
self.waitUntilSettled()
- config_file = os.path.join(self.test_root, 'zuul.conf')
- with open(config_file, 'w') as f:
- self.config.write(f)
-
- # Make sure the pipeline exists
- self.getZKTree('/zuul/tenant/tenant-one/pipeline/gate/item')
- p = subprocess.Popen(
- [os.path.join(sys.prefix, 'bin/zuul-admin'),
- '-c', config_file,
- 'delete-pipeline-state',
- 'tenant-one', 'gate',
- ],
- stdout=subprocess.PIPE)
- out, _ = p.communicate()
- self.log.debug(out.decode('utf8'))
- # Make sure it's deleted
- with testtools.ExpectedException(NoNodeError):
- self.getZKTree('/zuul/tenant/tenant-one/pipeline/gate/item')
+ # Lock the pipeline under test so we don't process the event
+ # we're about to submit (it should go into the pipeline trigger
+ # event queue and stay there while we delete the pipeline state).
+ # This way we verify that events which arrived before the
+ # deletion still work.
+ with pipeline_lock(self.zk_client, 'tenant-one', pipeline):
+ self.log.debug('Got pipeline lock')
+ # Add a new event while our old last reconfigure time is
+ # in place.
+ B = self.fake_gerrit.addFakeChange('org/project', 'master', 'B')
+ if pipeline == 'check':
+ self.fake_gerrit.addEvent(B.getPatchsetCreatedEvent(1))
+ else:
+ B.addApproval('Code-Review', 2)
+ self.fake_gerrit.addEvent(B.addApproval('Approved', 1))
+
+ # Wait until it appears in the pipeline trigger event queue
+ self.log.debug('Waiting for event')
+ for x in iterate_timeout(30, 'trigger event queue has events'):
+ if sched.pipeline_trigger_events[
+ 'tenant-one'][pipeline].hasEvents():
+ break
+ self.log.debug('Got event')
+ # It's not necessary to grab the run lock here, but if we
+ # don't the scheduler will busy-wait, so let's do it to
+ # keep things tidy.
+ with sched.run_handler_lock:
+ self.log.debug('Got run lock')
+ config_file = os.path.join(self.test_root, 'zuul.conf')
+ with open(config_file, 'w') as f:
+ self.config.write(f)
+
+ # Make sure the pipeline exists
+ self.getZKTree(
+ f'/zuul/tenant/tenant-one/pipeline/{pipeline}/item')
+ self.log.debug('Deleting pipeline state')
+ p = subprocess.Popen(
+ [os.path.join(sys.prefix, 'bin/zuul-admin'),
+ '-c', config_file,
+ 'delete-pipeline-state',
+ 'tenant-one', pipeline,
+ ],
+ stdout=subprocess.PIPE)
+ # Delete the pipeline state
+ out, _ = p.communicate()
+ self.log.debug(out.decode('utf8'))
+ # Make sure it's deleted
+ with testtools.ExpectedException(NoNodeError):
+ self.getZKTree(
+ f'/zuul/tenant/tenant-one/pipeline/{pipeline}/item')
self.executor_server.hold_jobs_in_build = False
self.executor_server.release()
- B = self.fake_gerrit.addFakeChange('org/project', 'master', 'B')
- B.addApproval('Code-Review', 2)
- self.fake_gerrit.addEvent(B.addApproval('Approved', 1))
self.waitUntilSettled()
self.assertHistory([
- dict(name='project-merge', result='SUCCESS', changes='1,1'),
dict(name='project-merge', result='SUCCESS', changes='2,1'),
- dict(name='project-test1', result='SUCCESS', changes='2,1'),
- dict(name='project-test2', result='SUCCESS', changes='2,1'),
+ dict(name='project-merge', result='SUCCESS', changes='3,1'),
+ dict(name='project-test1', result='SUCCESS', changes='3,1'),
+ dict(name='project-test2', result='SUCCESS', changes='3,1'),
], ordered=False)
+ def test_delete_pipeline_check(self):
+ self._test_delete_pipeline('check')
+
+ def test_delete_pipeline_gate(self):
+ self._test_delete_pipeline('gate')
+
class TestDBPruneParse(BaseTestCase):
def test_db_prune_parse(self):
diff --git a/zuul/cmd/client.py b/zuul/cmd/client.py
index 031b10a1e..1a3738b85 100755
--- a/zuul/cmd/client.py
+++ b/zuul/cmd/client.py
@@ -1032,22 +1032,21 @@ class Client(zuul.cmd.ZuulApp):
with tenant_write_lock(zk_client, args.tenant) as lock:
path = f'/zuul/tenant/{safe_tenant}/pipeline/{safe_pipeline}'
layout_uuid = None
- zk_client.client.delete(
- f'/zuul/tenant/{safe_tenant}/pipeline/{safe_pipeline}',
- recursive=True)
+ zk_client.client.delete(path, recursive=True)
with ZKContext(zk_client, lock, None, self.log) as context:
ps = PipelineState.new(context, _path=path,
layout_uuid=layout_uuid)
+ ltime = ps._zstat.last_modified_transaction_id
# Force everyone to make a new layout for this tenant in
# order to rebuild the shared change queues.
layout_state = LayoutState(
tenant_name=args.tenant,
hostname='admin command',
last_reconfigured=int(time.time()),
- last_reconfigure_event_ltime=-1,
+ last_reconfigure_event_ltime=ltime,
uuid=uuid4().hex,
branch_cache_min_ltimes={},
- ltime=ps._zstat.last_modified_transaction_id,
+ ltime=ltime,
)
tenant_layout_state = LayoutStateStore(zk_client, lambda: None)
tenant_layout_state[args.tenant] = layout_state