37 files changed, 1235 insertions, 144 deletions
diff --git a/doc/source/developer/index.rst b/doc/source/developer/index.rst index 52266a175..b45c75640 100644 --- a/doc/source/developer/index.rst +++ b/doc/source/developer/index.rst @@ -14,6 +14,7 @@ Zuul, though advanced users may find it interesting. drivers triggers testing + metrics docs ansible javascript diff --git a/doc/source/developer/metrics.rst b/doc/source/developer/metrics.rst new file mode 100644 index 000000000..913a591ba --- /dev/null +++ b/doc/source/developer/metrics.rst @@ -0,0 +1,74 @@ +:title: Metrics + +Metrics +======= + +Event Overview +-------------- + +The following table illustrates the event and pipeline processing +sequence as it relates to some of the metrics described in +:ref:`statsd`. This is intended as general guidance only and is not +an exhaustive list. + ++----------------------------------------+------+------+------+--------------------------------------+ +| Event | Metrics | Attribute | ++========================================+======+======+======+======================================+ +| Event generated by source | | | | event.timestamp | ++----------------------------------------+------+ + +--------------------------------------+ +| Enqueued into driver queue | | | | | ++----------------------------------------+------+ + +--------------------------------------+ +| Enqueued into tenant trigger queue | | | | event.arrived_at_scheduler_timestamp | ++----------------------------------------+ + [8] + +--------------------------------------+ +| Forwarded to matching pipelines | [1] | | | | ++----------------------------------------+ + + +--------------------------------------+ +| Changes enqueued ahead | | | | | ++----------------------------------------+ + + +--------------------------------------+ +| Change enqueued | | | | item.enqueue_time | ++----------------------------------------+------+------+ +--------------------------------------+ +| Changes enqueued behind | | | | | ++----------------------------------------+------+------+ +--------------------------------------+ +| Set item configuration | | | | build_set.configured_time | ++----------------------------------------+------+------+ +--------------------------------------+ +| Request files changed (if needed) | | | | | ++----------------------------------------+ +------+ +--------------------------------------+ +| Request merge | [2] | | | | ++----------------------------------------+ +------+ +--------------------------------------+ +| Wait for merge (and files if needed) | | | [9] | | ++----------------------------------------+------+------+ +--------------------------------------+ +| Generate dynamic layout (if needed) | [3] | | | | ++----------------------------------------+------+------+ +--------------------------------------+ +| Freeze job graph | [4] | | | | ++----------------------------------------+------+------+ +--------------------------------------+ +| Request global repo state (if needed) | | | | build_set.repo_state_request_time | ++----------------------------------------+ [5] +------+ +--------------------------------------+ +| Wait for global repo state (if needed) | | | | | ++----------------------------------------+------+------+ +--------------------------------------+ +| Deduplicate jobs | | | | | ++----------------------------------------+------+------+ +--------------------------------------+ +| Acquire semaphore (non-resources-first)| | | | | ++----------------------------------------+------+------+ +--------------------------------------+ +| Request nodes | | | 
| request.created_time | ++----------------------------------------+ [6] +------+ +--------------------------------------+ | Wait for nodes | | | | | ++----------------------------------------+------+------+ +--------------------------------------+ | Acquire semaphore (resources-first) | | | | | ++----------------------------------------+------+------+ +--------------------------------------+ | Enqueue build request | | | | build.execute_time | ++----------------------------------------+ [7] +------+ +--------------------------------------+ | Executor starts job | | | | build.start_time | ++----------------------------------------+------+------+------+--------------------------------------+ + +====== ============================= +Metric Name +====== ============================= +1 event_enqueue_processing_time +2 merge_request_time +3 layout_generation_time +4 job_freeze_time +5 repo_state_time +6 node_request_time +7 job_wait_time +8 event_enqueue_time +9 event_job_time +====== ============================= diff --git a/doc/source/developer/specs/index.rst b/doc/source/developer/specs/index.rst index d96df0c26..78c11bbb8 100644 --- a/doc/source/developer/specs/index.rst +++ b/doc/source/developer/specs/index.rst @@ -23,3 +23,4 @@ documentation instead. enhanced-regional-executors tenant-resource-quota community-matrix + tracing diff --git a/doc/source/developer/specs/tracing.rst b/doc/source/developer/specs/tracing.rst new file mode 100644 index 000000000..5e0e9e4d1 --- /dev/null +++ b/doc/source/developer/specs/tracing.rst @@ -0,0 +1,323 @@ +Tracing +======= + +.. warning:: This is not authoritative documentation. These features + are not currently available in Zuul. They may change significantly + before final implementation, or may never be fully completed. + +It can be difficult for a user to understand what steps were involved +between a trigger event (such as a patchset upload or recheck comment) +and a buildset report. If it took an unusually long time it can be +difficult to determine why. At present, an operator would need to +examine logs to determine what steps were involved and the sources of +any potential delays. Even experienced operators and developers can +take quite some time to first collect and then analyze logs to answer +these questions. + +Sometimes these answers may point to routine system operation (such as +a delay caused by many gate resets, or preparing a large number of +repositories). Other times they may point to deficiencies in the +system (insufficient mergers) or bugs in the code. + +Being able to visualize the activities of a Zuul system can help +operators (and potentially users) triage and diagnose issues more +quickly and accurately. Even if examining logs is ultimately required +in order to fully diagnose an issue, being able to narrow down the +scope using analysis tools can greatly simplify the process. + +Proposed Solution +----------------- + +Implementing distributed tracing in Zuul can help improve the +observability of the system and aid operators and potentially users in +understanding the sequence of events. + +By exporting information about the processing Zuul performs using the +OpenTelemetry API, information about Zuul operations can be collected +in any of several tools for analysis. + +OpenTelemetry is an Open Source protocol for exchanging observability +data, an SDK implementing that protocol, as well as an implementation +of a collector for distributing information to multiple backends.
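As a point of reference for the exporter mentioned above, the following is a minimal, illustrative sketch (not part of this change) of how a Python service such as a Zuul scheduler might configure the OpenTelemetry SDK to export spans over OTLP/gRPC to a collector like Jaeger; the endpoint, service name, and tracer name are assumptions for illustration only::

    # Illustrative sketch only -- not Zuul code.  Configures the
    # OpenTelemetry Python SDK to export spans via OTLP/gRPC.
    from opentelemetry import trace
    from opentelemetry.sdk.resources import Resource
    from opentelemetry.sdk.trace import TracerProvider
    from opentelemetry.sdk.trace.export import BatchSpanProcessor
    from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import (
        OTLPSpanExporter)

    provider = TracerProvider(
        resource=Resource.create({"service.name": "zuul-scheduler"}))
    # "collector:4317" is a placeholder endpoint for a Jaeger or other
    # OTLP-capable collector.
    provider.add_span_processor(
        BatchSpanProcessor(OTLPSpanExporter(endpoint="collector:4317",
                                            insecure=True)))
    trace.set_tracer_provider(provider)
    tracer = trace.get_tracer("zuul")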
+ +It supports three kinds of observability data: `traces`, `metrics`, +and `logs`. Since Zuul already has support for metrics and logs, this +specification proposes that we use only the support in OpenTelemetry +for `traces`. + +Usage Scenarios +~~~~~~~~~~~~~~~ + +Usage of OpenTelemetry should be entirely optional and supplementary +for any Zuul deployment. Log messages alone should continue to be +sufficient to analyze any potential problem. + +Should a deployer wish to use OpenTelemetry tracing data, a very +simple deployment for smaller sites may be constructed by running only +Jaeger. Jaeger is a service that can receive, store, and display +tracing information. The project distributes an all-in-one container +image which can store data in local filesystem storage. + +https://www.jaegertracing.io/ + +Larger sites may wish to run multiple collectors and feed data to +larger, distributed storage backends (such as Cassandra, +Elasticsearch, etc). + +Suitability to Zuul +~~~~~~~~~~~~~~~~~~~ + +OpenTelemetry tracing, at a high level, is designed to record +information about events, their timing, and their relation to other +events. At first this seems like a natural fit for Zuul, which reacts +to events, processes events, and generates more events. However, +OpenTelemetry's bias toward small and simple web applications is +evident throughout its documentation and the SDK implementation. + + Traces give us the big picture of what happens when a request is + made by user or an application. + +Zuul is not driven by user or application requests, and a system +designed to record several millisecond-long events which make up the +internal response to a user request of a web app is not necessarily +the obvious right choice for recording sequences and combinations of +events which frequently take hours (and sometimes days) to play out +across multiple systems. + +Fortunately, the concepts and protocol implementation of OpenTelemetry +are sufficiently well-designed for the general case to be able to +accommodate a system like Zuul, even if the SDK makes incompatible +assumptions that make integration difficult. There are some +challenges to implementation, but because the concepts appear to be +well matched, we should proceed with using the OpenTelemetry protocol +and SDK. + +Spans +~~~~~ + +The key tracing concepts in OpenTelemetry are `traces` and `spans`. +From a data model perspective, the unit of data storage is a `span`. +A trace itself is really just a unique ID that is common to multiple +spans. + +Spans can relate to other spans as either children or links. A trace +is generally considered to have a single 'root' span, and within the +time period represented by that span, it may have any number of child +spans (which may further have their own child spans). + +OpenTelemetry anticipates that a span on one system may spawn a child +span on another system and includes facilities for transferring enough +information about the parent span to a child system that the child +system alone can emit traces for its span and any children that it +spawns in turn. + +For a concrete example in Zuul, we might have a Zuul scheduler start a +span for a buildset, and then a merger might emit a child span for +performing the initial merge, and an executor might emit a child span +for executing a build. + +Spans can relate to other spans (including spans in other traces), so +sequences of events can be chained together without necessitating that +they all be part of the same span or trace.
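To make the span relationships just described concrete, here is a hedged sketch using the OpenTelemetry Python SDK: a parent span with nested child spans, plus a separate span that merely links to it. The span names ("BuildSet", "Merge", "Build", "QueueItem") are illustrative stand-ins rather than Zuul's actual instrumentation, and in Zuul the child spans would normally be emitted on other hosts::

    # Illustrative sketch of parent/child spans and a cross-trace link.
    from opentelemetry import trace

    tracer = trace.get_tracer("zuul")

    with tracer.start_as_current_span("BuildSet") as buildset_span:
        # In Zuul these would run on a merger and an executor; they are
        # nested locally here only for brevity.
        with tracer.start_as_current_span("Merge"):
            pass  # e.g. speculative merge work
        with tracer.start_as_current_span("Build"):
            pass  # e.g. job execution

    # A later span can reference the buildset span without being its child.
    link = trace.Link(buildset_span.get_span_context())
    with tracer.start_as_current_span("QueueItem", links=[link]):
        pass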
+ +Because Zuul processes series of events which may stretch for long +periods of time, we should specify what events and actions should +correspond to spans and traces. Spans can have arbitrary metadata +associated with them, so we will be able to search by event or job +IDs. + +The following sections describe traces and their child spans. + +Event Ingestion ++++++++++++++++ + +A trace will begin when Zuul receives an event and end when that event +has been enqueued into scheduler queues (or discarded). A driver +completing processing of an event is a definitive point in time so it +is easy to know when to close the root span for that event's trace +(whereas if we kept the trace open to include scheduler processing, we +would need to know when the last trigger event spawned by the +connection event was complete). + +This may include processing in internal queues by a given driver, and +these processing steps/queues should appear as their own child spans. +The spans should include event IDs (and potentially other information +about the event such as change or pull request numbers) as metadata. + +Tenant Event Processing ++++++++++++++++++++++++ + +A trace will begin when a scheduler begins processing a tenant event +and end when it has forwarded the event to all pipelines within a +tenant. It will link to the event ingestion trace as a follow-on +span. + +Queue Item +++++++++++ + +A trace will begin when an item is enqueued and end when it is +dequeued. This will be quite a long trace (hours or days). It is +expected to be the primary benefit of this telemetry effort as it will +show the entire lifetime of a queue item. It will link to the tenant +event processing trace as a follow-on span. + +Within the root span, there will be a span for each buildset (so that +if a gate reset happens and a new buildset is created, users will see +a series of buildset spans). Within a buildset, there will be spans +for all of the major processing steps, such as merge operations, +layout calculation, freezing the job graph, and freezing jobs. Each +build will also merit a span (retried builds will get their own spans +as well), and within a job span, there will be child spans for git +repo prep, job setup, individual playbooks, and cleanup. + +SDK Challenges +~~~~~~~~~~~~~~ + +As a high-level concept, the idea of spans for each of these +operations makes sense. In practice, the SDK makes implementation +challenging. + +The OpenTelemetry SDK makes no provision for beginning a span on one +system and ending it on another, so the fact that one Zuul scheduler +might start a buildset span while another ends it is problematic. + +Fortunately, the OpenTelemetry API only reports spans when they end, +not when they start. This means that we don't need to coordinate a +"start" API call on one scheduler with an "end" API call on another. +We can simply emit the trace with its root span at the end. However, +any child spans emitted during that time need to know the trace ID +they should use, which means that we at least need to store a trace ID +and start timestamp on our starting scheduler for use by any child +spans as well as the "end span" API call. + +The SDK does not support creating a span with a specific trace ID or +start timestamp (most timestamps are automatic), but it has +well-defined interfaces for spans and we can subclass the +implementation to allow us to specify trace IDs and timestamps.
With +this approach, we can "virtually" start a span on one host, store its +information in ZooKeeper with whatever long-lived object it is +associated with (such as a QueueItem) and then make it concrete on +another host when we end it. + +Alternatives +++++++++++++ + +This section describes some alternative ideas for dealing with the +SDK's mismatch with Zuul concepts as well as why they weren't +selected. + +* Multiple root spans with the same trace ID + + Jaeger handles this relatively well, and the timeline view appears + as expected (multiple events with whitespace between them). The + graph view in Jaeger may have some trouble displaying this. + + It is not clear that OpenTelemetry anticipates having multiple + "root" spans, so it may be best to avoid this in order to avoid + potential problems with other tools. + +* Child spans without a parent + + If we emit spans that specify a parent which does not exist, Jaeger + will display these traces but show a warning that the parent is + invalid. This may occur naturally while the system is operating + (builds complete while a buildset is running), but should be + eventually corrected once an item is dequeued. In case of a serious + error, we may never close a parent span, which would cause this to + persist. We should accept that this may happen, but try to avoid it + happening intentionally. + +Links +~~~~~ + +Links between spans are fairly primitive in Jaeger. While the +OpenTelemetry API includes attributes for links (so that when we link +a queue item to an event, we could specify that it was a forwarded +event), Jaeger does not store or render them. Instead, we are only +left with a reference to a ``< span in another trace >`` with a +reference type of ``FOLLOWS_FROM``. Clicking on that link will +immediately navigate to the other trace where metadata about the trace +will be visible, but before clicking on it, users will have little +idea of what awaits on the other side. + +For this reason, we should use span links sparingly so that when they +are encountered, users are likely to intuit what they are for and are +not overwhelmed by multiple indistinguishable links. + +Events and Exceptions +~~~~~~~~~~~~~~~~~~~~~ + +OpenTelemetry allows events to be added to spans. Events have their +own timestamp and attributes. These can be used to add additional +context to spans (representing single points in time rather than +events with duration that should be child spans). Examples might +include receiving a request to cancel a job or dequeue an item. + +Events should not be used as an alternative to logs, nor should all +log messages be copied as events. Events should be used sparingly to +avoid overwhelming the tracing storage with data and the user with +information. + +Exceptions may also be included in spans. This happens automatically +and by default when using the context managers supplied by the SDK. +Because many spans in Zuul will be unable to use the SDK context +managers and any exception information would need to be explicitly +handled and stored in ZooKeeper, we will disable inclusion of +exception information in spans. This will provide a more consistent +experience (so that users don't see the absence of an exception in +tracing information to indicate the absence of an error in logs) and +reduce the cost of supporting traces (extra storage in ZooKeeper and +in the telemetry storage). + +If we decide that exception information is worth including in the +future, this decision will be easy to revisit and reverse. 
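As a hedged illustration of the two mechanisms just described, the sketch below attaches a point-in-time event to a span and opts out of automatic exception recording; the span name, event name, and attribute key are assumptions, not the attributes Zuul will ultimately emit::

    # Sketch only: span events and disabled exception recording.
    import time

    from opentelemetry import trace

    tracer = trace.get_tracer("zuul")

    with tracer.start_as_current_span(
            "QueueItem",
            record_exception=False,          # do not attach exception details
            set_status_on_exception=False) as span:
        # Events carry their own timestamp (nanoseconds) and attributes.
        span.add_event("dequeue requested",
                       attributes={"zuul.event_id": "abc123"},
                       timestamp=time.time_ns())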
+ +Sensitive Information +~~~~~~~~~~~~~~~~~~~~~ + +No sensitive information (secrets, passwords, job variables, etc) +should be included in tracing output. All output should be suitable +for an audience of Zuul users (that is, if someone has access to the +Zuul dashboard, then tracing data should not have any more sensitive +information than they already have access to). For public-facing Zuul +systems (such as OpenDev), the information should be suitable for +public use. + +Protobuf and gRPC +~~~~~~~~~~~~~~~~~ + +The most efficient and straightforward method of transmitting data +from Zuul to a collector (including Jaeger) is using OTLP with gRPC +(OpenTelemetry Protocol + gRPC Remote Procedure Calls). Because +Protobuf applications include automatically generated code, we may +encounter the occasional version inconsistency. We may need to +navigate package requirements more than normal due to this (especially +if we have multiple packages that depend on protobuf). + +For a contemporary example, the OpenTelemetry project is in the +process of pinning to an older version of protobuf: + +https://github.com/open-telemetry/opentelemetry-python/issues/2717 + +There is an HTTP+JSON exporter as well, so in the case that something +goes very wrong with protobuf+gRPC, that may be available as a fallback. + +Work Items +---------- + +* Add OpenTelemetry SDK and support for configuring an exporter to + zuul.conf +* Implement SDK subclasses to support opening and closing spans on + different hosts +* Instrument event processing in each driver +* Instrument event processing in scheduler +* Instrument queue items and related spans +* Document a simple Jaeger setup as a quickstart add-on (similar to + authz) +* Optional: work with OpenDev to run a public Jaeger server for + OpenDev + +The last item is not required for this specification (and not our +choice as Zuul developers to make) but it would be nice if there were +one available so that all Zuul users and developers have a reference +implementation available for community collaboration. diff --git a/doc/source/monitoring.rst b/doc/source/monitoring.rst index 0c2cb4351..1cb61ee01 100644 --- a/doc/source/monitoring.rst +++ b/doc/source/monitoring.rst @@ -110,7 +110,27 @@ These metrics are emitted by the Zuul :ref:`scheduler`: operation(s). This will always include a request to a Zuul merger to speculatively merge the change, but it may also include a second request submitted in parallel to identify - the files altered by the change. + the files altered by the change. Includes + :stat:`zuul.tenant.<tenant>.pipeline.<pipeline>.merger_merge_op_time` + and + :stat:`zuul.tenant.<tenant>.pipeline.<pipeline>.merger_files_changes_op_time`. + + .. stat:: merger_merge_op_time + :type: timer + + The amount of time the merger spent performing a merge + operation. This does not include any of the round-trip time + from the scheduler to the merger, or any other merge + operations. + + .. stat:: merger_files_changes_op_time + :type: timer + + The amount of time the merger spent performing a files-changes + operation to detect changed files (this is sometimes + performed if the source does not provide this information). + This does not include any of the round-trip time from the + scheduler to the merger, or any other merge operations. .. 
stat:: layout_generation_time :type: timer @@ -128,7 +148,17 @@ These metrics are emitted by the Zuul :ref:`scheduler`: The amount of time waiting for a secondary Zuul merger operation to collect additional information about the repo - state of required projects. + state of required projects. Includes + :stat:`zuul.tenant.<tenant>.pipeline.<pipeline>.merger_repo_state_op_time`. + + .. stat:: merger_repo_state_op_time + :type: timer + + The amount of time the merger spent performing a repo state + operation to collect additional information about the repo + state of required projects. This does not include any of the + round-trip time from the scheduler to the merger, or any + other merge operations. .. stat:: node_request_time :type: timer diff --git a/playbooks/zuul-stream/fixtures/test-stream.yaml b/playbooks/zuul-stream/fixtures/test-stream.yaml index 05b75daf9..0326ae54e 100644 --- a/playbooks/zuul-stream/fixtures/test-stream.yaml +++ b/playbooks/zuul-stream/fixtures/test-stream.yaml @@ -1,3 +1,16 @@ +- name: Start zuul stream daemon + hosts: node + tasks: + + # NOTE : when new_console is set, this playbook runs with + # ZUUL_CONSOLE_PORT=19887 so that we test with the zuul_console + # from the Zuul checkout, and not the one started by test + # infrastructure. + - name: Start zuul_console on non-default port + zuul_console: + port: 19887 + when: new_console | default(false) + - name: Run some commands to show that logging works hosts: node tasks: @@ -51,3 +64,15 @@ - name: Print binary data command: echo -e '\x80abc' + + - name: Find any console log files + find: + paths: /tmp + patterns: 'console-*.log' + register: _tmp_files + + # We check this list in zuul-stream/functional.yaml to make sure + # we're cleaning up console log files. + - name: Dump tmp files + debug: + var: _tmp_files diff --git a/playbooks/zuul-stream/functional.yaml b/playbooks/zuul-stream/functional.yaml index ee1643fbc..7ae4704a9 100644 --- a/playbooks/zuul-stream/functional.yaml +++ b/playbooks/zuul-stream/functional.yaml @@ -7,71 +7,104 @@ # the python version of the platform is changed. python_path: "/usr/local/lib/python3.10/dist-packages" - - name: Run ansible that should succeed + - name: Run ansible that should succeed against testing console command: > /usr/lib/zuul/ansible/{{ zuul_ansible_version }}/bin/ansible-playbook + -e "new_console=true" src/opendev.org/zuul/zuul/playbooks/zuul-stream/fixtures/test-stream.yaml environment: + # Setup by test-stream.yaml so we start a new zuul_console + # from this checkout. + ZUUL_CONSOLE_PORT: 19887 ZUUL_JOB_LOG_CONFIG: "{{ ansible_user_dir}}/logging.json" ZUUL_JOBDIR: "{{ ansible_user_dir}}" PYTHONPATH: "{{ python_path }}" + register: _success_output - - name: Run ansible playbook that should fail + - name: Save raw output to file + copy: + content: '{{ _success_output.stdout }}' + dest: 'console-job-output-success-19887.txt' + + - name: Save output + shell: | + mv job-output.txt job-output-success-19887.txt + mv job-output.json job-output-success-19887.json + + - name: Check protocol version + assert: + that: + - "'[node1] Reports streaming version: 1' in _success_output.stdout" + + # Streamer puts out a line like + # [node1] Starting to log 916b2084-4bbb-80e5-248e-000000000016-1-node1 for task TASK: Print binary data + # One of the tasks in job-output shows find: results; + # the console file for this task should not be there. 
+ - name: Validate temporary files removed + shell: | + for f in $(grep 'Starting to log' console-job-output-success-19887.txt | awk '{print $5}'); do + echo "Checking ${f}" + if grep -q '"path": "/tmp/console-'${f}'.log"' job-output-success-19887.txt; then + echo "*** /tmp/${f}.log still exists" + exit 1 + fi + done + + # NOTE(ianw) 2022-07 : we deliberatly have this second step to run + # against the console setup by the infrastructure executor in the + # job pre playbooks as a backwards compatability sanity check. + - name: Run ansible that should succeed against extant console command: > /usr/lib/zuul/ansible/{{ zuul_ansible_version }}/bin/ansible-playbook - src/opendev.org/zuul/zuul/playbooks/zuul-stream/fixtures/test-stream-failure.yaml - register: failed_results - failed_when: "failed_results.rc != 2" + -e "new_console=false" + src/opendev.org/zuul/zuul/playbooks/zuul-stream/fixtures/test-stream.yaml environment: ZUUL_JOB_LOG_CONFIG: "{{ ansible_user_dir}}/logging.json" ZUUL_JOBDIR: "{{ ansible_user_dir}}" PYTHONPATH: "{{ python_path }}" + register: _success_output - - name: Validate output - setupvar - shell: | - egrep "^.*\| node1 \|\s+\"setupvar\": {" job-output.txt - egrep "^.*\| node2 \|\s+\"setupvar\": {" job-output.txt + - name: Save raw output to file + copy: + content: '{{ _success_output.stdout }}' + dest: 'console-job-output-success-19885.txt' - - name: Validate output - shell task + - name: Save output shell: | - egrep "^.*\| node1 \| 1: lo:" job-output.txt - egrep "^.*\| node2 \| 1: lo:" job-output.txt + mv job-output.txt job-output-success-19885.txt + mv job-output.json job-output-success-19885.json - - name: Validate output - loop with items - shell: | - egrep "^.+\| node1 \| ok: Item: item1" job-output.txt - egrep "^.+\| node1 \| ok: Item: item2" job-output.txt - egrep "^.+\| node1 \| ok: Item: item3" job-output.txt - egrep "^.+\| node2 \| ok: Item: item1" job-output.txt - egrep "^.+\| node2 \| ok: Item: item2" job-output.txt - egrep "^.+\| node2 \| ok: Item: item3" job-output.txt + - name: Validate outputs + include_tasks: validate.yaml + loop: + - job-output-success-19887.txt + - job-output-success-19885.txt - - name: Validate output - loop with complex items - shell: | - egrep "^.+\| node1 \| ok: Item: Runtime" job-output.txt - egrep "^.+\| node2 \| ok: Item: Runtime" job-output.txt + # failure case - - name: Validate output - failed shell task - shell: | - egrep "^.+\| node1 \| Exception: Test module failure exception task" job-output.txt - egrep "^.+\| node2 \| Exception: Test module failure exception task" job-output.txt + - name: Run ansible playbook that should fail + command: > + /usr/lib/zuul/ansible/{{ zuul_ansible_version }}/bin/ansible-playbook + src/opendev.org/zuul/zuul/playbooks/zuul-stream/fixtures/test-stream-failure.yaml + register: failed_results + failed_when: "failed_results.rc != 2" + environment: + ZUUL_CONSOLE_PORT: 19887 + ZUUL_JOB_LOG_CONFIG: "{{ ansible_user_dir}}/logging.json" + ZUUL_JOBDIR: "{{ ansible_user_dir}}" + PYTHONPATH: "{{ python_path }}" - - name: Validate output - item loop with exception + - name: Save output shell: | - egrep "^.+\| node1 \| Exception: Test module failure exception loop" job-output.txt - egrep "^.+\| node2 \| Exception: Test module failure exception loop" job-output.txt + mv job-output.txt job-output-failure.txt + mv job-output.json job-output-failure.json - name: Validate output - failure shell task with exception shell: | - egrep "^.+\| node1 \| Exception: Test module failure exception fail-task" 
job-output.txt - egrep "^.+\| node2 \| Exception: Test module failure exception fail-task" job-output.txt + egrep "^.+\| node1 \| Exception: Test module failure exception fail-task" job-output-failure.txt + egrep "^.+\| node2 \| Exception: Test module failure exception fail-task" job-output-failure.txt - name: Validate output - failure item loop with exception shell: | - egrep "^.+\| node1 \| Exception: Test module failure exception fail-loop" job-output.txt - egrep "^.+\| node2 \| Exception: Test module failure exception fail-loop" job-output.txt - - - name: Validate output - binary data - shell: | - egrep "^.*\| node1 \| \\\\x80abc" job-output.txt - egrep "^.*\| node2 \| \\\\x80abc" job-output.txt + egrep "^.+\| node1 \| Exception: Test module failure exception fail-loop" job-output-failure.txt + egrep "^.+\| node2 \| Exception: Test module failure exception fail-loop" job-output-failure.txt diff --git a/playbooks/zuul-stream/post.yaml b/playbooks/zuul-stream/post.yaml index 2c717a82e..4beb8d1f9 100644 --- a/playbooks/zuul-stream/post.yaml +++ b/playbooks/zuul-stream/post.yaml @@ -10,9 +10,6 @@ state: directory delegate_to: localhost - - name: Rename job-output.txt - command: mv job-output.txt stream-job-output.txt - - name: Fetch files synchronize: src: "{{ ansible_user_dir }}/{{ item }}" @@ -21,5 +18,10 @@ with_items: - logging.json - ansible.cfg - - stream-job-output.txt - - job-output.json + - console-job-output-success-19887.txt + - job-output-success-19887.txt + - job-output-success-19887.json + - job-output-success-19885.txt + - job-output-success-19885.json + - job-output-failure.txt + - job-output-failure.json diff --git a/playbooks/zuul-stream/validate.yaml b/playbooks/zuul-stream/validate.yaml new file mode 100644 index 000000000..73ccd873a --- /dev/null +++ b/playbooks/zuul-stream/validate.yaml @@ -0,0 +1,38 @@ +- name: Validate output - setupvar + shell: | + egrep "^.*\| node1 \|\s+\"setupvar\": {" {{ item }} + egrep "^.*\| node2 \|\s+\"setupvar\": {" {{ item }} + +- name: Validate output - shell task + shell: | + egrep "^.*\| node1 \| 1: lo:" {{ item }} + egrep "^.*\| node2 \| 1: lo:" {{ item }} + +- name: Validate output - loop with items + shell: | + egrep "^.+\| node1 \| ok: Item: item1" {{ item }} + egrep "^.+\| node1 \| ok: Item: item2" {{ item }} + egrep "^.+\| node1 \| ok: Item: item3" {{ item }} + egrep "^.+\| node2 \| ok: Item: item1" {{ item }} + egrep "^.+\| node2 \| ok: Item: item2" {{ item }} + egrep "^.+\| node2 \| ok: Item: item3" {{ item }} + +- name: Validate output - loop with complex items + shell: | + egrep "^.+\| node1 \| ok: Item: Runtime" {{ item }} + egrep "^.+\| node2 \| ok: Item: Runtime" {{ item }} + +- name: Validate output - failed shell task + shell: | + egrep "^.+\| node1 \| Exception: Test module failure exception task" {{ item }} + egrep "^.+\| node2 \| Exception: Test module failure exception task" {{ item }} + +- name: Validate output - item loop with exception + shell: | + egrep "^.+\| node1 \| Exception: Test module failure exception loop" {{ item }} + egrep "^.+\| node2 \| Exception: Test module failure exception loop" {{ item }} + +- name: Validate output - binary data + shell: | + egrep "^.*\| node1 \| \\\\x80abc" {{ item }} + egrep "^.*\| node2 \| \\\\x80abc" {{ item }} diff --git a/tests/base.py b/tests/base.py index 5a85ea0d7..cebcf2e1f 100644 --- a/tests/base.py +++ b/tests/base.py @@ -3091,6 +3091,8 @@ class FakeBuild(object): self.paused = False self.aborted = False self.requeue = False + self.should_fail = False + 
self.should_retry = False self.created = time.time() self.changes = None items = self.parameters['zuul']['items'] @@ -3162,6 +3164,8 @@ class FakeBuild(object): return result def shouldFail(self): + if self.should_fail: + return True changes = self.executor_server.fail_tests.get(self.name, []) for change in changes: if self.hasChanges(change): @@ -3169,6 +3173,8 @@ class FakeBuild(object): return False def shouldRetry(self): + if self.should_retry: + return True entries = self.executor_server.retry_tests.get(self.name, []) for entry in entries: if self.hasChanges(entry['change']): @@ -3662,7 +3668,7 @@ class FakeSMTP(object): class FakeNodepool(object): REQUEST_ROOT = '/nodepool/requests' NODE_ROOT = '/nodepool/nodes' - LAUNCHER_ROOT = '/nodepool/launchers' + COMPONENT_ROOT = '/nodepool/components' log = logging.getLogger("zuul.test.FakeNodepool") @@ -3726,10 +3732,11 @@ class FakeNodepool(object): self.fulfillRequest(req) def registerLauncher(self, labels=["label1"], id="FakeLauncher"): - path = os.path.join(self.LAUNCHER_ROOT, id) + path = os.path.join(self.COMPONENT_ROOT, 'pool', id) data = {'id': id, 'supported_labels': labels} self.client.create( - path, json.dumps(data).encode('utf8'), makepath=True) + path, json.dumps(data).encode('utf8'), + ephemeral=True, makepath=True, sequence=True) def getNodeRequests(self): try: diff --git a/tests/fixtures/layouts/job-dedup-false.yaml b/tests/fixtures/layouts/job-dedup-false.yaml index 2c0e6ee2e..9254f0b41 100644 --- a/tests/fixtures/layouts/job-dedup-false.yaml +++ b/tests/fixtures/layouts/job-dedup-false.yaml @@ -39,6 +39,7 @@ - job: name: common-job deduplicate: false + pre-run: playbooks/pre.yaml required-projects: - org/project1 - org/project2 diff --git a/tests/fixtures/layouts/job-dedup-noop.yaml b/tests/fixtures/layouts/job-dedup-noop.yaml new file mode 100644 index 000000000..9383fd8b6 --- /dev/null +++ b/tests/fixtures/layouts/job-dedup-noop.yaml @@ -0,0 +1,55 @@ +- queue: + name: integrated + allow-circular-dependencies: true + +- pipeline: + name: gate + manager: dependent + success-message: Build succeeded (gate). 
+ require: + gerrit: + approval: + - Approved: 1 + trigger: + gerrit: + - event: comment-added + approval: + - Approved: 1 + success: + gerrit: + Verified: 2 + submit: true + failure: + gerrit: + Verified: -2 + start: + gerrit: + Verified: 0 + precedence: high + +- job: + name: base + parent: null + pre-run: playbooks/pre.yaml + run: playbooks/run.yaml + nodeset: + nodes: + - label: debian + name: controller + +- job: + name: common-job + required-projects: + - org/project1 + +- job: + name: project1-job + +- project: + name: org/project1 + queue: integrated + gate: + jobs: + - noop + - common-job + - project1-job diff --git a/tests/remote/test_remote_zuul_stream.py b/tests/remote/test_remote_zuul_stream.py index 1f6b7fff7..1c705127e 100644 --- a/tests/remote/test_remote_zuul_stream.py +++ b/tests/remote/test_remote_zuul_stream.py @@ -29,7 +29,7 @@ class FunctionalZuulStreamMixIn: self.log_console_port = 19000 + int( self.ansible_core_version.split('.')[1]) self.executor_server.log_console_port = self.log_console_port - self.wait_timeout = 120 + self.wait_timeout = 180 self.fake_nodepool.remote_ansible = True ansible_remote = os.environ.get('ZUUL_REMOTE_IPV4') diff --git a/tests/unit/test_circular_dependencies.py b/tests/unit/test_circular_dependencies.py index ac5ad13f5..2223008d5 100644 --- a/tests/unit/test_circular_dependencies.py +++ b/tests/unit/test_circular_dependencies.py @@ -1703,6 +1703,151 @@ class TestGerritCircularDependencies(ZuulTestCase): ], ordered=False) self.assertEqual(len(self.fake_nodepool.history), 3) + @simple_layout('layouts/job-dedup-false.yaml') + def test_job_deduplication_false_failed_job(self): + # Test that if we are *not* deduplicating jobs, we don't + # duplicate the result on two different builds. + # The way we check that is to retry the common-job between two + # items, but only once, and only on one item. The other item + # should be unaffected. + self.executor_server.hold_jobs_in_build = True + A = self.fake_gerrit.addFakeChange('org/project1', 'master', 'A') + B = self.fake_gerrit.addFakeChange('org/project2', 'master', 'B') + + # A <-> B + A.data["commitMessage"] = "{}\n\nDepends-On: {}\n".format( + A.subject, B.data["url"] + ) + B.data["commitMessage"] = "{}\n\nDepends-On: {}\n".format( + B.subject, A.data["url"] + ) + + A.addApproval('Code-Review', 2) + B.addApproval('Code-Review', 2) + + self.fake_gerrit.addEvent(A.addApproval('Approved', 1)) + self.fake_gerrit.addEvent(B.addApproval('Approved', 1)) + + # If we don't make sure these jobs finish first, then one of + # the items may complete before the other and cause Zuul to + # abort the project*-job on the other item (with a "bundle + # failed to merge" error). + self.waitUntilSettled() + for build in self.builds: + if build.name == 'common-job' and build.project == 'org/project1': + break + else: + raise Exception("Unable to find build") + build.should_retry = True + + # Store a reference to the queue items so we can inspect their + # internal attributes later to double check the retry build + # count is correct. 
+ tenant = self.scheds.first.sched.abide.tenants.get('tenant-one') + pipeline = tenant.layout.pipelines['gate'] + items = pipeline.getAllItems() + self.assertEqual(len(items), 2) + + self.executor_server.release('project1-job') + self.executor_server.release('project2-job') + self.waitUntilSettled() + self.executor_server.hold_jobs_in_build = False + self.executor_server.release() + self.waitUntilSettled() + + self.assertEqual(A.data['status'], 'MERGED') + self.assertEqual(B.data['status'], 'MERGED') + self.assertHistory([ + dict(name="project2-job", result="SUCCESS", changes="2,1 1,1"), + dict(name="project1-job", result="SUCCESS", changes="2,1 1,1"), + dict(name="common-job", result=None, changes="2,1 1,1"), + dict(name="common-job", result="SUCCESS", changes="2,1 1,1"), + dict(name="common-job", result="SUCCESS", changes="2,1 1,1"), + ], ordered=False) + self.assertEqual(len(self.fake_nodepool.history), 5) + self.assertEqual(items[0].change.project.name, 'org/project2') + self.assertEqual(len(items[0].current_build_set.retry_builds), 0) + self.assertEqual(items[1].change.project.name, 'org/project1') + self.assertEqual(len(items[1].current_build_set.retry_builds), 1) + + @simple_layout('layouts/job-dedup-auto-shared.yaml') + def test_job_deduplication_multi_scheduler(self): + # Test that a second scheduler can correctly refresh + # deduplicated builds + self.executor_server.hold_jobs_in_build = True + A = self.fake_gerrit.addFakeChange('org/project1', 'master', 'A') + B = self.fake_gerrit.addFakeChange('org/project1', 'master', 'B') + + # A <-> B + A.data["commitMessage"] = "{}\n\nDepends-On: {}\n".format( + A.subject, B.data["url"] + ) + B.data["commitMessage"] = "{}\n\nDepends-On: {}\n".format( + B.subject, A.data["url"] + ) + + A.addApproval('Code-Review', 2) + B.addApproval('Code-Review', 2) + + self.fake_gerrit.addEvent(A.addApproval('Approved', 1)) + self.fake_gerrit.addEvent(B.addApproval('Approved', 1)) + + self.waitUntilSettled() + + app = self.createScheduler() + app.start() + self.assertEqual(len(self.scheds), 2) + + # Hold the lock on the first scheduler so that only the second + # will act. 
+ with self.scheds.first.sched.run_handler_lock: + self.executor_server.hold_jobs_in_build = False + self.executor_server.release() + self.waitUntilSettled(matcher=[app]) + + self.assertEqual(A.data['status'], 'MERGED') + self.assertEqual(B.data['status'], 'MERGED') + self.assertHistory([ + dict(name="project1-job", result="SUCCESS", changes="2,1 1,1"), + dict(name="common-job", result="SUCCESS", changes="2,1 1,1"), + ], ordered=False) + + @simple_layout('layouts/job-dedup-noop.yaml') + def test_job_deduplication_noop(self): + # Test that we don't deduplicate noop (there's no good reason + # to do so) + A = self.fake_gerrit.addFakeChange('org/project1', 'master', 'A') + B = self.fake_gerrit.addFakeChange('org/project1', 'master', 'B') + + # A <-> B + A.data["commitMessage"] = "{}\n\nDepends-On: {}\n".format( + A.subject, B.data["url"] + ) + B.data["commitMessage"] = "{}\n\nDepends-On: {}\n".format( + B.subject, A.data["url"] + ) + + A.addApproval('Code-Review', 2) + B.addApproval('Code-Review', 2) + + self.fake_gerrit.addEvent(A.addApproval('Approved', 1)) + self.fake_gerrit.addEvent(B.addApproval('Approved', 1)) + + self.waitUntilSettled() + + self.assertEqual(A.data['status'], 'MERGED') + self.assertEqual(B.data['status'], 'MERGED') + self.assertHistory([ + dict(name="project1-job", result="SUCCESS", changes="2,1 1,1"), + dict(name="common-job", result="SUCCESS", changes="2,1 1,1"), + ], ordered=False) + # It's tricky to get info about a noop build, but the jobs in + # the report have the build UUID, so we make sure it's + # different. + a_noop = [l for l in A.messages[-1].split('\n') if 'noop' in l][0] + b_noop = [l for l in B.messages[-1].split('\n') if 'noop' in l][0] + self.assertNotEqual(a_noop, b_noop) + @simple_layout('layouts/job-dedup-retry.yaml') def test_job_deduplication_retry(self): A = self.fake_gerrit.addFakeChange('org/project1', 'master', 'A') diff --git a/tests/unit/test_github_driver.py b/tests/unit/test_github_driver.py index f9f3371fa..49dae0ccb 100644 --- a/tests/unit/test_github_driver.py +++ b/tests/unit/test_github_driver.py @@ -1384,7 +1384,7 @@ class TestGithubDriver(ZuulTestCase): # now check if the merge was done via rebase merges = [report for report in self.fake_github.github_data.reports if report[2] == 'merge'] - assert(len(merges) == 1 and merges[0][3] == 'squash') + assert (len(merges) == 1 and merges[0][3] == 'squash') @simple_layout('layouts/basic-github.yaml', driver='github') def test_invalid_event(self): diff --git a/tests/unit/test_scheduler.py b/tests/unit/test_scheduler.py index eb5ee826f..66c508fea 100644 --- a/tests/unit/test_scheduler.py +++ b/tests/unit/test_scheduler.py @@ -226,6 +226,18 @@ class TestSchedulerZoneFallback(ZuulTestCase): def test_jobs_executed(self): "Test that jobs are executed and a change is merged per zone" self.hold_jobs_in_queue = True + + # Validate that the reported executor stats are correct. Since + # the executor accepts zoned and unzoned job it should be counted + # in both metrics. 
+ self.assertReportedStat( + 'zuul.executors.online', value='1', kind='g') + self.assertReportedStat( + 'zuul.executors.unzoned.online', value='1', kind='g') + self.assertReportedStat( + 'zuul.executors.zone.test-provider_vpn.online', + value='1', kind='g') + A = self.fake_gerrit.addFakeChange('org/project', 'master', 'A') A.addApproval('Code-Review', 2) self.fake_gerrit.addEvent(A.addApproval('Approved', 1)) @@ -452,6 +464,7 @@ class TestScheduler(ZuulTestCase): 'zuul.tenant.tenant-one.reconfiguration_time', 'zuul.tenant.tenant-one.pipeline.gate.event_enqueue_time', 'zuul.tenant.tenant-one.pipeline.gate.merge_request_time', + 'zuul.tenant.tenant-one.pipeline.gate.merger_merge_op_time', 'zuul.tenant.tenant-one.pipeline.gate.job_freeze_time', 'zuul.tenant.tenant-one.pipeline.gate.node_request_time', 'zuul.tenant.tenant-one.pipeline.gate.job_wait_time', @@ -5731,8 +5744,7 @@ For CI problems and help debugging, contact ci@example.org""" self.assertEqual(A.reported, 2) self.assertTrue(re.search('project-merge .* NODE_FAILURE', A.messages[1])) - self.assertTrue(re.search('project-test1 .* SKIPPED', A.messages[1])) - self.assertTrue(re.search('project-test2 .* SKIPPED', A.messages[1])) + self.assertTrue('Skipped 2 jobs' in A.messages[1]) def test_nodepool_resources(self): "Test that resources are reported" @@ -6854,7 +6866,7 @@ class TestDependencyGraph(ZuulTestCase): self.assertHistory([ dict(name='build', result='FAILURE', changes='1,1'), ], ordered=False) - self.assertIn('SKIPPED', A.messages[0]) + self.assertIn('Skipped 1 job', A.messages[0]) class TestDuplicatePipeline(ZuulTestCase): diff --git a/tests/unit/test_sos.py b/tests/unit/test_sos.py index 7d95f2548..f371a8064 100644 --- a/tests/unit/test_sos.py +++ b/tests/unit/test_sos.py @@ -55,6 +55,79 @@ class TestScaleOutScheduler(ZuulTestCase): dict(name='project-test2', result='SUCCESS', changes='1,1'), ], ordered=False) + def test_pipeline_cache_clear(self): + # Test that the pipeline cache on a second scheduler isn't + # holding old change objects. + + # Hold jobs in build + sched1 = self.scheds.first + self.executor_server.hold_jobs_in_build = True + + # We need a pair of changes in order to populate the pipeline + # change cache (a single change doesn't activate the cache, + # it's for dependencies). + A = self.fake_gerrit.addFakeChange('org/project', 'master', 'A') + A.addApproval('Code-Review', 2) + B = self.fake_gerrit.addFakeChange('org/project', 'master', 'B') + B.addApproval('Code-Review', 2) + B.addApproval('Approved', 1) + B.setDependsOn(A, 1) + + # Fail a job + self.executor_server.failJob('project-test1', A) + + # Enqueue into gate with scheduler 1 + self.fake_gerrit.addEvent(A.addApproval('Approved', 1)) + self.waitUntilSettled() + + # Start scheduler 2 + sched2 = self.createScheduler() + sched2.start() + self.assertEqual(len(self.scheds), 2) + + # Pause scheduler 1 + with sched1.sched.run_handler_lock: + # Release jobs + self.executor_server.hold_jobs_in_build = False + self.executor_server.release() + # Wait for scheduler 2 to dequeue + self.waitUntilSettled(matcher=[sched2]) + # Unpause scheduler 1 + self.assertEqual(A.data['status'], 'NEW') + self.assertEqual(B.data['status'], 'NEW') + + # Clear zk change cache + self.fake_gerrit._change_cache.prune([], max_age=0) + + # At this point, scheduler 1 should have a bogus change entry + # in the pipeline cache because scheduler 2 performed the + # dequeue so scheduler 1 never cleaned up its cache. 
+ + self.executor_server.fail_tests.clear() + self.executor_server.hold_jobs_in_build = True + # Pause scheduler 1 + with sched1.sched.run_handler_lock: + # Enqueue into gate with scheduler 2 + self.fake_gerrit.addEvent(A.addApproval('Approved', 1)) + self.waitUntilSettled(matcher=[sched2]) + + # Pause scheduler 2 + with sched2.sched.run_handler_lock: + # Make sure that scheduler 1 does some pipeline runs which + # reconstitute state from ZK. This gives it the + # opportunity to use old cache data if we don't clear it. + + # Release job1 + self.executor_server.release() + self.waitUntilSettled(matcher=[sched1]) + # Release job2 + self.executor_server.hold_jobs_in_build = False + self.executor_server.release() + # Wait for scheduler 1 to merge change + self.waitUntilSettled(matcher=[sched1]) + self.assertEqual(A.data['status'], 'MERGED') + self.assertEqual(B.data['status'], 'MERGED') + @simple_layout('layouts/multi-scheduler-status.yaml') def test_multi_scheduler_status(self): self.hold_merge_jobs_in_queue = True diff --git a/tests/unit/test_v3.py b/tests/unit/test_v3.py index 9fd8ea1d3..a89bb3007 100644 --- a/tests/unit/test_v3.py +++ b/tests/unit/test_v3.py @@ -4930,7 +4930,7 @@ class TestDataReturn(AnsibleZuulTestCase): self.assertIn( '- data-return https://zuul.example.com/', A.messages[-1]) - self.assertTrue(re.search('child .* SKIPPED', A.messages[-1])) + self.assertTrue('Skipped 1 job' in A.messages[-1]) self.assertIn('Build succeeded', A.messages[-1]) def test_data_return_invalid_child_job(self): @@ -4943,7 +4943,7 @@ class TestDataReturn(AnsibleZuulTestCase): self.assertIn( '- data-return-invalid-child-job https://zuul.example.com', A.messages[-1]) - self.assertTrue(re.search('data-return .* SKIPPED', A.messages[-1])) + self.assertTrue('Skipped 1 job' in A.messages[-1]) self.assertIn('Build succeeded', A.messages[-1]) def test_data_return_skip_all_child_jobs(self): @@ -4957,8 +4957,7 @@ class TestDataReturn(AnsibleZuulTestCase): self.assertIn( '- data-return-skip-all https://zuul.example.com/', A.messages[-1]) - self.assertTrue(re.search('child .* SKIPPED', A.messages[-1])) - self.assertTrue(re.search('data-return .* SKIPPED', A.messages[-1])) + self.assertTrue('Skipped 2 jobs' in A.messages[-1]) self.assertIn('Build succeeded', A.messages[-1]) def test_data_return_skip_all_child_jobs_with_soft_dependencies(self): @@ -4972,8 +4971,7 @@ class TestDataReturn(AnsibleZuulTestCase): ]) self.assertIn('- data-return-cd https://zuul.example.com/', A.messages[-1]) - self.assertTrue(re.search('data-return-a .* SKIPPED', A.messages[-1])) - self.assertTrue(re.search('data-return-b .* SKIPPED', A.messages[-1])) + self.assertTrue('Skipped 2 jobs' in A.messages[-1]) self.assertIn('Build succeeded', A.messages[-1]) def test_several_zuul_return(self): @@ -4987,7 +4985,7 @@ class TestDataReturn(AnsibleZuulTestCase): self.assertIn( '- several-zuul-return-child https://zuul.example.com/', A.messages[-1]) - self.assertTrue(re.search('data-return .* SKIPPED', A.messages[-1])) + self.assertTrue('Skipped 1 job' in A.messages[-1]) self.assertIn('Build succeeded', A.messages[-1]) def test_data_return_skip_retry(self): @@ -6905,7 +6903,7 @@ class TestJobPause(AnsibleZuulTestCase): dict(name='compile', result='SUCCESS', changes='1,1'), ]) - self.assertTrue(re.search('test .* SKIPPED', A.messages[0])) + self.assertTrue('Skipped 1 job' in A.messages[0]) def test_job_pause_pre_skipped_child(self): """ @@ -6953,7 +6951,7 @@ class TestJobPause(AnsibleZuulTestCase): dict(name='compile', result='SUCCESS', 
changes='1,1'), ]) - self.assertTrue(re.search('test .* SKIPPED', A.messages[0])) + self.assertTrue('Skipped 1 job' in A.messages[0]) def test_job_pause_skipped_child_retry(self): """ @@ -7822,7 +7820,7 @@ class TestProvidesRequiresMysql(ZuulTestCase): dict(name='image-builder', result='FAILURE', changes='1,1'), dict(name='hold', result='SUCCESS', changes='1,1'), ], ordered=False) - self.assertTrue(re.search('image-user .* SKIPPED', A.messages[0])) + self.assertTrue('Skipped 1 job' in A.messages[0]) B = self.fake_gerrit.addFakeChange('org/project1', 'master', 'B') B.data['commitMessage'] = '%s\n\nDepends-On: %s\n' % ( diff --git a/web/public/openapi.yaml b/web/public/openapi.yaml index 312a1907e..b101c66e0 100644 --- a/web/public/openapi.yaml +++ b/web/public/openapi.yaml @@ -474,11 +474,9 @@ components: description: The event id type: string item_ahead: - description: The list of events ahead - items: - type: string - type: array - item_behind: + description: The event ahead + type: string + items_behind: description: The list of events behind items: type: string diff --git a/web/src/containers/status/ChangePanel.jsx b/web/src/containers/status/ChangePanel.jsx index 33fc4687c..dd4fc27e5 100644 --- a/web/src/containers/status/ChangePanel.jsx +++ b/web/src/containers/status/ChangePanel.jsx @@ -18,6 +18,7 @@ import { connect } from 'react-redux' import { Link } from 'react-router-dom' import * as moment from 'moment' import 'moment-duration-format' +import { Button } from '@patternfly/react-core' class ChangePanel extends React.Component { @@ -30,9 +31,11 @@ class ChangePanel extends React.Component { constructor () { super() this.state = { - expanded: false + expanded: false, + showSkipped: false, } this.onClick = this.onClick.bind(this) + this.toggleSkippedJobs = this.toggleSkippedJobs.bind(this) this.clicked = false } @@ -120,12 +123,13 @@ class ChangePanel extends React.Component { } renderProgressBar (change) { - let jobPercent = (100 / change.jobs.length).toFixed(2) + const interesting_jobs = change.jobs.filter(j => this.jobStrResult(j) !== 'skipped') + let jobPercent = (100 / interesting_jobs.length).toFixed(2) return ( <div className='progress zuul-change-total-result'> {change.jobs.map((job, idx) => { let result = this.jobStrResult(job) - if (['queued', 'waiting'].includes(result)) { + if (['queued', 'waiting', 'skipped'].includes(result)) { return '' } let className = '' @@ -144,7 +148,6 @@ class ChangePanel extends React.Component { className = ' progress-bar-warning' break case 'paused': - case 'skipped': className = ' progress-bar-info' break default: @@ -302,15 +305,39 @@ class ChangePanel extends React.Component { </span>) } + toggleSkippedJobs (e) { + // Skip middle mouse button + if (e.button === 1) { + return + } + this.setState({ showSkipped: !this.state.showSkipped }) + } + renderJobList (jobs, times) { + const [buttonText, interestingJobs] = this.state.showSkipped ? 
+ ['Hide', jobs] : + ['Show', jobs.filter(j => this.jobStrResult(j) !== 'skipped')] + const skippedJobCount = jobs.length - interestingJobs.length + return ( - <ul className='list-group zuul-patchset-body'> - {jobs.map((job, idx) => ( - <li key={idx} className='list-group-item zuul-change-job'> - {this.renderJob(job, times.jobs[job.name])} - </li> - ))} - </ul>) + <> + <ul className='list-group zuul-patchset-body'> + {interestingJobs.map((job, idx) => ( + <li key={idx} className='list-group-item zuul-change-job'> + {this.renderJob(job, times.jobs[job.name])} + </li> + ))} + {(this.state.showSkipped || skippedJobCount) ? ( + <li key='last' className='list-group-item zuul-change-job'> + <Button variant="link" className='zuul-skipped-jobs-button' + onClick={this.toggleSkippedJobs}> + {buttonText} {skippedJobCount ? skippedJobCount : ''} skipped job{skippedJobCount === 1 ? '' : 's'} + </Button> + </li> + ) : ''} + </ul> + </> + ) } calculateTimes (change) { diff --git a/web/src/containers/status/ChangePanel.test.jsx b/web/src/containers/status/ChangePanel.test.jsx index 5a4be8602..cd27edc73 100644 --- a/web/src/containers/status/ChangePanel.test.jsx +++ b/web/src/containers/status/ChangePanel.test.jsx @@ -16,6 +16,7 @@ import React from 'react' import { Link, BrowserRouter as Router } from 'react-router-dom' import { Provider } from 'react-redux' import { create } from 'react-test-renderer' +import { Button } from '@patternfly/react-core' import { setTenantAction } from '../../actions/tenant' import configureStore from '../../store' @@ -45,6 +46,8 @@ it('change panel render multi tenant links', () => { const jobLink = application.root.findByType(Link) expect(jobLink.props.to).toEqual( '/t/tenant-one/stream/42') + const skipButton = application.root.findAllByType(Button) + expect(skipButton === undefined) }) it('change panel render white-label tenant links', () => { @@ -60,4 +63,29 @@ it('change panel render white-label tenant links', () => { const jobLink = application.root.findByType(Link) expect(jobLink.props.to).toEqual( '/stream/42') + const skipButton = application.root.findAllByType(Button) + expect(skipButton === undefined) +}) + +it('change panel skip jobs', () => { + const fakeChange = { + project: 'org-project', + jobs: [{ + name: 'job-name', + url: 'stream/42', + result: 'skipped' + }] + } + + const store = configureStore() + store.dispatch(setTenantAction('tenant-one', true)) + const application = create( + <Provider store={store}> + <Router> + <ChangePanel change={fakeChange} globalExpanded={true} /> + </Router> + </Provider> + ) + const skipButton = application.root.findByType(Button) + expect(skipButton.props.children.includes('skipped job')) }) diff --git a/web/src/index.css b/web/src/index.css index e8c67a372..eddbca673 100644 --- a/web/src/index.css +++ b/web/src/index.css @@ -189,6 +189,11 @@ a.refresh { font-size: small; } +.zuul-skipped-jobs-button { + font-size: small; + padding: 0; +} + .zuul-non-voting-desc { font-size: smaller; } diff --git a/zuul/ansible/base/callback/zuul_stream.py b/zuul/ansible/base/callback/zuul_stream.py index 720261cb2..f31983ed6 100644 --- a/zuul/ansible/base/callback/zuul_stream.py +++ b/zuul/ansible/base/callback/zuul_stream.py @@ -48,6 +48,7 @@ from zuul.ansible import paths from zuul.ansible import logconfig LOG_STREAM_PORT = int(os.environ.get("ZUUL_CONSOLE_PORT", 19885)) +LOG_STREAM_VERSION = 0 def zuul_filter_result(result): @@ -103,6 +104,7 @@ class CallbackModule(default.CallbackModule): self._items_done = False self._deferred_result = 
None self._playbook_name = None + self._zuul_console_version = 0 def configure_logger(self): # ansible appends timestamp, user and pid to the log lines emitted @@ -129,9 +131,7 @@ class CallbackModule(default.CallbackModule): else: self._display.display(msg) - def _read_log(self, host, ip, port, log_id, task_name, hosts): - self._log("[%s] Starting to log %s for task %s" - % (host, log_id, task_name), job=False, executor=True) + def _read_log_connect(self, host, ip, port): logger_retries = 0 while True: try: @@ -141,6 +141,7 @@ class CallbackModule(default.CallbackModule): # logs continously. Without this we can easily trip the 5 # second timeout. s.settimeout(None) + return s except socket.timeout: self._log( "Timeout exception waiting for the logger. " @@ -151,7 +152,7 @@ class CallbackModule(default.CallbackModule): "Timeout exception waiting for the logger. " "Please check connectivity to [%s:%s]" % (ip, port)) - return + return None except Exception: if logger_retries % 10 == 0: self._log("[%s] Waiting on logger" % host, @@ -159,31 +160,77 @@ class CallbackModule(default.CallbackModule): logger_retries += 1 time.sleep(0.1) continue - msg = "%s\n" % log_id - s.send(msg.encode("utf-8")) - buff = s.recv(4096) - buffering = True - while buffering: - if b'\n' in buff: - (line, buff) = buff.split(b'\n', 1) - # We can potentially get binary data here. In order to - # being able to handle that use the backslashreplace - # error handling method. This decodes unknown utf-8 - # code points to escape sequences which exactly represent - # the correct data without throwing a decoding exception. - done = self._log_streamline( - host, line.decode("utf-8", "backslashreplace")) - if done: - return + + def _read_log(self, host, ip, port, log_id, task_name, hosts): + self._log("[%s] Starting to log %s for task %s" + % (host, log_id, task_name), job=False, executor=True) + + s = self._read_log_connect(host, ip, port) + if s is None: + # Can't connect; _read_log_connect() already logged an + # error for us, just bail + return + + # Find out what version we are running against + s.send(f'v:{LOG_STREAM_VERSION}\n'.encode('utf-8')) + buff = s.recv(1024).decode('utf-8').strip() + + # NOTE(ianw) 2022-07-21 : zuul_console from < 6.3.0 do not + # understand this protocol. They will assume the send + # above is a log request and send back the not found + # message in a loop. So to handle this we disconnect and + # reconnect. When we decide to remove this, we can remove + # anything in the "version 0" path. + if buff == '[Zuul] Log not found': + s.close() + s = self._read_log_connect(host, ip, port) + if s is None: + return + else: + self._zuul_console_version = int(buff) + self._log('[%s] Reports streaming version: %d' % + (host, self._zuul_console_version), + job=False, executor=True) + + if self._zuul_console_version >= 1: + msg = 's:%s\n' % log_id + else: + msg = '%s\n' % log_id + + s.send(msg.encode("utf-8")) + buff = s.recv(4096) + buffering = True + while buffering: + if b'\n' in buff: + (line, buff) = buff.split(b'\n', 1) + # We can potentially get binary data here. In order to + # being able to handle that use the backslashreplace + # error handling method. This decodes unknown utf-8 + # code points to escape sequences which exactly represent + # the correct data without throwing a decoding exception. 
+ done = self._log_streamline( + host, line.decode("utf-8", "backslashreplace")) + if done: + if self._zuul_console_version > 0: + try: + # reestablish connection and tell console to + # clean up + s = self._read_log_connect(host, ip, port) + s.send(f'f:{log_id}\n'.encode('utf-8')) + s.close() + except Exception: + # Don't worry if this fails + pass + return + else: + more = s.recv(4096) + if not more: + buffering = False else: - more = s.recv(4096) - if not more: - buffering = False - else: - buff += more - if buff: - self._log_streamline( - host, buff.decode("utf-8", "backslashreplace")) + buff += more + if buff: + self._log_streamline( + host, buff.decode("utf-8", "backslashreplace")) def _log_streamline(self, host, line): if "[Zuul] Task exit code" in line: diff --git a/zuul/ansible/base/library/zuul_console.py b/zuul/ansible/base/library/zuul_console.py index 9dffbbc3a..aa999cac1 100755 --- a/zuul/ansible/base/library/zuul_console.py +++ b/zuul/ansible/base/library/zuul_console.py @@ -24,6 +24,14 @@ import subprocess import threading import time +# This is the version we report to the zuul_stream callback. It is +# expected that this (zuul_console) process can be long-lived, so if +# there are updates this ensures a later streaming callback can still +# talk to us. +ZUUL_CONSOLE_PROTO_VERSION = 1 +# This is the template for the file name of the log-file written out +# by the command.py override command in the executor's Ansible +# install. LOG_STREAM_FILE = '/tmp/console-{log_uuid}.log' LOG_STREAM_PORT = 19885 @@ -162,15 +170,49 @@ class Server(object): ret = buffer.decode('utf-8') x = ret.find('\n') if x > 0: - return ret[:x] + return ret[:x].strip() except UnicodeDecodeError: pass - def handleOneConnection(self, conn): - log_uuid = self.get_command(conn) + def _clean_uuid(self, log_uuid): # use path split to make use the input isn't trying to be clever # and construct some path like /tmp/console-/../../something - log_uuid = os.path.split(log_uuid.rstrip())[-1] + return os.path.split(log_uuid)[-1] + + def handleOneConnection(self, conn): + # V1 protocol + # ----------- + # v:<ver> get version number, <ver> is remote version + # s:<uuid> send logs for <uuid> + # f:<uuid> finalise/cleanup <uuid> + while True: + command = self.get_command(conn) + if command.startswith('v:'): + # NOTE(ianw) : remote sends its version. We currently + # don't have anything to do with this value, so ignore + # for now. + cmd = '%s\n' % (ZUUL_CONSOLE_PROTO_VERSION) + conn.send(cmd.encode('utf-8')) + continue + elif command.startswith('f:'): + log_uuid = self._clean_uuid(command[2:]) + try: + os.unlink(self.path.format(log_uuid=log_uuid)) + except Exception: + # something might have cleaned /tmp + pass + continue + elif command.startswith('s:'): + log_uuid = self._clean_uuid(command[2:]) + break + else: + # NOTE(ianw): 2022-07-21 In releases < 6.3.0 the streaming + # side would just send a raw uuid and nothing else; so by + # default assume that is what is coming in here. We can + # remove this fallback when we decide it is no longer + # necessary. 
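Taken together, a current streamer negotiates the protocol with a v: line, requests the log with s:<uuid> (falling back to a bare UUID when an old console answers the handshake with the not-found message), and after the task finishes reconnects with f:<uuid> so the console can remove its spool file. A simplified client-side sketch of that exchange; the address, UUID handling and end-of-stream detection are illustrative only:

    # Simplified sketch of the v1 zuul_console exchange as seen from the
    # streaming side; the real callback watches for the
    # "[Zuul] Task exit code" sentinel rather than a closed socket.
    import socket

    LOG_STREAM_VERSION = 0

    def stream_console(ip, port, log_uuid):
        s = socket.create_connection((ip, port))
        # 1. Version handshake.  A console older than 6.3.0 treats this
        #    as a log request and answers with the not-found message, so
        #    reconnect and fall back to the bare-UUID request.
        s.sendall(f'v:{LOG_STREAM_VERSION}\n'.encode('utf-8'))
        reply = s.recv(1024).decode('utf-8').strip()
        if reply == '[Zuul] Log not found':
            s.close()
            s = socket.create_connection((ip, port))
            console_version = 0
            s.sendall(f'{log_uuid}\n'.encode('utf-8'))
        else:
            console_version = int(reply)
            # 2. Ask for the log stream.
            s.sendall(f's:{log_uuid}\n'.encode('utf-8'))
        # 3. Read the stream until the remote side stops sending.
        while True:
            chunk = s.recv(4096)
            if not chunk:
                break
        s.close()
        # 4. Against a v1 console, reconnect and ask it to clean up the
        #    spool file for this task.
        if console_version >= 1:
            f = socket.create_connection((ip, port))
            f.sendall(f'f:{log_uuid}\n'.encode('utf-8'))
            f.close()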
+ log_uuid = self._clean_uuid(command) + break # FIXME: this won't notice disconnects until it tries to send console = None diff --git a/zuul/driver/gerrit/gerritconnection.py b/zuul/driver/gerrit/gerritconnection.py index 6aea4388b..1ec334915 100644 --- a/zuul/driver/gerrit/gerritconnection.py +++ b/zuul/driver/gerrit/gerritconnection.py @@ -1470,7 +1470,7 @@ class GerritConnection(ZKChangeCacheMixin, ZKBranchCacheMixin, BaseConnection): # for large projects like nova alldata = [] chunk, more_changes = _query_chunk(query, event) - while(chunk): + while chunk: alldata.extend(chunk) if more_changes is None: # continue sortKey based (before Gerrit 2.9) diff --git a/zuul/driver/github/githubreporter.py b/zuul/driver/github/githubreporter.py index de62f2565..73d3a6f13 100644 --- a/zuul/driver/github/githubreporter.py +++ b/zuul/driver/github/githubreporter.py @@ -135,9 +135,12 @@ class GithubReporter(BaseReporter): def _formatItemReportJobs(self, item): # Return the list of jobs portion of the report ret = '' - jobs_fields = self._getItemReportJobsFields(item) + jobs_fields, skipped = self._getItemReportJobsFields(item) for job_fields in jobs_fields: ret += self._formatJobResult(job_fields) + if skipped: + jobtext = 'job' if skipped == 1 else 'jobs' + ret += 'Skipped %i %s\n' % (skipped, jobtext) return ret def addPullComment(self, item, comment=None): diff --git a/zuul/driver/pagure/pagurereporter.py b/zuul/driver/pagure/pagurereporter.py index 0bfdbc9b8..b38035752 100644 --- a/zuul/driver/pagure/pagurereporter.py +++ b/zuul/driver/pagure/pagurereporter.py @@ -67,9 +67,12 @@ class PagureReporter(BaseReporter): def _formatItemReportJobs(self, item): # Return the list of jobs portion of the report ret = '' - jobs_fields = self._getItemReportJobsFields(item) + jobs_fields, skipped = self._getItemReportJobsFields(item) for job_fields in jobs_fields: ret += '- [%s](%s) : %s%s%s%s\n' % job_fields[:6] + if skipped: + jobtext = 'job' if skipped == 1 else 'jobs' + ret += 'Skipped %i %s\n' % (skipped, jobtext) return ret def addPullComment(self, item, comment=None): diff --git a/zuul/driver/smtp/smtpreporter.py b/zuul/driver/smtp/smtpreporter.py index 4815026ab..a5d8938c1 100644 --- a/zuul/driver/smtp/smtpreporter.py +++ b/zuul/driver/smtp/smtpreporter.py @@ -42,7 +42,7 @@ class SMTPReporter(BaseReporter): if 'subject' in self.config: subject = self.config['subject'].format( - change=item.change) + change=item.change, pipeline=item.pipeline.getSafeAttributes()) else: subject = "Report for change {change} against {ref}".format( change=item.change, ref=item.change.ref) diff --git a/zuul/executor/server.py b/zuul/executor/server.py index 2165a797b..e00612e9e 100644 --- a/zuul/executor/server.py +++ b/zuul/executor/server.py @@ -3157,6 +3157,10 @@ class ExecutorServer(BaseMergeServer): self.allow_unzoned = get_default(self.config, 'executor', 'allow_unzoned', False) + # If this executor has no zone configured it is implicitly unzoned + if self.zone is None: + self.allow_unzoned = True + # Those attributes won't change, so it's enough to set them once on the # component info. 
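Splitting the UUID handling into _clean_uuid() keeps the path-traversal guard in one place, whichever of the s:, f: or legacy bare forms the identifier arrived in. A small illustration of what the os.path.split() guard buys, using the LOG_STREAM_FILE template from above:

    # Illustration of the os.path.split() guard used by _clean_uuid().
    import os

    LOG_STREAM_FILE = '/tmp/console-{log_uuid}.log'

    def clean_uuid(log_uuid):
        # Keep only the last path component so a crafted identifier such
        # as "../../etc/passwd" cannot point outside the spool directory.
        return os.path.split(log_uuid)[-1]

    print(LOG_STREAM_FILE.format(log_uuid=clean_uuid('../../etc/passwd')))
    # -> /tmp/console-passwd.log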
self.component_info.zone = self.zone @@ -3735,7 +3739,7 @@ class ExecutorServer(BaseMergeServer): sensor.reportStats(self.statsd, base_key) def finishJob(self, unique): - del(self.job_workers[unique]) + del self.job_workers[unique] self.log.debug( "Finishing Job: %s, queue(%d): %s", unique, diff --git a/zuul/manager/__init__.py b/zuul/manager/__init__.py index 642aededd..a66f5ad22 100644 --- a/zuul/manager/__init__.py +++ b/zuul/manager/__init__.py @@ -235,6 +235,9 @@ class PipelineManager(metaclass=ABCMeta): resolved_changes.append(change) return resolved_changes + def clearCache(self): + self._change_cache.clear() + def _maintainCache(self): active_layout_uuids = set() referenced_change_keys = set() @@ -1768,9 +1771,12 @@ class PipelineManager(metaclass=ABCMeta): build_in_items = [item] if item.bundle: for other_item in item.bundle.items: - if other_item not in build_in_items: - if other_item.current_build_set.getBuild(build.job.name): - build_in_items.append(other_item) + if other_item in build_in_items: + continue + other_build = other_item.current_build_set.getBuild( + build.job.name) + if other_build is not None and other_build is build: + build_in_items.append(other_item) for item in build_in_items: # We don't care about some actions below if this build # isn't in the current buildset, so determine that before @@ -1817,18 +1823,27 @@ class PipelineManager(metaclass=ABCMeta): # We're the second of the files/merger pair, report the stat self.reportPipelineTiming('merge_request_time', build_set.configured_time) + if event.elapsed_time: + self.reportPipelineTiming('merger_files_changes_op_time', + event.elapsed_time, elapsed=True) def onMergeCompleted(self, event, build_set): if build_set.merge_state == build_set.COMPLETE: self._onGlobalRepoStateCompleted(event, build_set) self.reportPipelineTiming('repo_state_time', build_set.repo_state_request_time) + if event.elapsed_time: + self.reportPipelineTiming('merger_repo_state_op_time', + event.elapsed_time, elapsed=True) else: self._onMergeCompleted(event, build_set) if build_set.files_state == build_set.COMPLETE: # We're the second of the files/merger pair, report the stat self.reportPipelineTiming('merge_request_time', build_set.configured_time) + if event.elapsed_time: + self.reportPipelineTiming('merger_merge_op_time', + event.elapsed_time, elapsed=True) def _onMergeCompleted(self, event, build_set): @@ -2120,7 +2135,7 @@ class PipelineManager(metaclass=ABCMeta): except Exception: self.log.exception("Exception reporting pipeline stats") - def reportPipelineTiming(self, key, start, end=None): + def reportPipelineTiming(self, key, start, end=None, elapsed=False): if not self.sched.statsd: return if not start: @@ -2130,5 +2145,8 @@ class PipelineManager(metaclass=ABCMeta): pipeline = self.pipeline tenant = pipeline.tenant stats_key = f'zuul.tenant.{tenant.name}.pipeline.{pipeline.name}' - dt = (end - start) * 1000 + if elapsed: + dt = start + else: + dt = (end - start) * 1000 self.sched.statsd.timing(f'{stats_key}.{key}', dt) diff --git a/zuul/merger/client.py b/zuul/merger/client.py index 362644b98..29fa39aaf 100644 --- a/zuul/merger/client.py +++ b/zuul/merger/client.py @@ -159,7 +159,9 @@ class MergeClient(object): "via result event for %s", merge_request) if merge_request.job_type == MergeRequest.FILES_CHANGES: event = FilesChangesCompletedEvent( - merge_request.build_set_uuid, files=None + merge_request.build_set_uuid, + files=None, + elapsed_time=None, ) else: event = MergeCompletedEvent( @@ -172,6 +174,7 @@ class 
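reportPipelineTiming() can now be handed a pre-computed duration: the new merger_*_op_time statistics arrive on the result event as an elapsed value rather than as a start timestamp. A hedged sketch of the two reporting modes; the statsd client and the key prefix are stand-ins for the real scheduler wiring:

    # Hedged sketch of the two reportPipelineTiming() modes; FakeStatsd
    # and the key prefix are placeholders, not the real implementation.
    import time

    class FakeStatsd:
        def timing(self, key, value):
            print('%s -> %s' % (key, value))

    statsd = FakeStatsd()

    def report_pipeline_timing(key, start, end=None, elapsed=False):
        if not start:
            return
        if end is None:
            end = time.time()
        # elapsed=True means "start" is already a duration from the
        # merger; otherwise derive the duration from the timestamps.
        dt = start if elapsed else (end - start) * 1000
        statsd.timing('zuul.tenant.example.pipeline.check.%s' % key, dt)

    # Timestamp-based stat (e.g. merge_request_time):
    report_pipeline_timing('merge_request_time', time.time() - 2.5)
    # Pre-computed stat from a merger result event:
    report_pipeline_timing('merger_merge_op_time', 1.2, elapsed=True)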
MergeClient(object): repo_state=None, item_in_branches=None, errors=None, + elapsed_time=None, ) try: self.result_events[merge_request.tenant_name][ diff --git a/zuul/merger/server.py b/zuul/merger/server.py index 91597714f..fe5b938a1 100644 --- a/zuul/merger/server.py +++ b/zuul/merger/server.py @@ -256,6 +256,7 @@ class BaseMergeServer(metaclass=ABCMeta): def executeMergeJob(self, merge_request, params): result = None + start = time.monotonic() if merge_request.job_type == MergeRequest.MERGE: result = self.merge(merge_request, params) elif merge_request.job_type == MergeRequest.CAT: @@ -264,6 +265,8 @@ class BaseMergeServer(metaclass=ABCMeta): result = self.refstate(merge_request, params) elif merge_request.job_type == MergeRequest.FILES_CHANGES: result = self.fileschanges(merge_request, params) + end = time.monotonic() + result['elapsed_time'] = end - start return result def cat(self, merge_request, args): @@ -376,6 +379,7 @@ class BaseMergeServer(metaclass=ABCMeta): item_in_branches = result.get("item_in_branches", []) files = result.get("files", {}) errors = result.get("errors", []) + elapsed_time = result.get("elapsed_time") log.info( "Merge %s complete, merged: %s, updated: %s, commit: %s, " @@ -407,7 +411,9 @@ class BaseMergeServer(metaclass=ABCMeta): ) if merge_request.job_type == MergeRequest.FILES_CHANGES: event = FilesChangesCompletedEvent( - merge_request.build_set_uuid, files + merge_request.build_set_uuid, + files, + elapsed_time, ) else: event = MergeCompletedEvent( @@ -420,6 +426,7 @@ class BaseMergeServer(metaclass=ABCMeta): repo_state, item_in_branches, errors, + elapsed_time, ) def put_complete_event(log, merge_request, event): diff --git a/zuul/model.py b/zuul/model.py index a07d5a640..963332826 100644 --- a/zuul/model.py +++ b/zuul/model.py @@ -694,6 +694,11 @@ class PipelineState(zkobject.ZKObject): return json.dumps(data, sort_keys=True).encode("utf8") def deserialize(self, raw, context): + # We may have old change objects in the pipeline cache, so + # make sure they are the same objects we would get from the + # source change cache. 
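On the merger side the duration is taken with time.monotonic() around the actual operation and attached to the result dictionary, from where it is copied into the MergeCompletedEvent or FilesChangesCompletedEvent. A minimal sketch with a fake operation standing in for the real git work:

    # Minimal sketch of how executeMergeJob() measures an operation and
    # attaches the duration to its result; do_merge() is a stand-in.
    import time

    def do_merge():
        time.sleep(0.01)            # stand-in for the real git work
        return {'merged': True}

    start = time.monotonic()
    result = do_merge()
    result['elapsed_time'] = time.monotonic() - start
    # result['elapsed_time'] later becomes the event's elapsed_time.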
+ self.pipeline.manager.clearCache() + data = super().deserialize(raw, context) existing_queues = { q.getPath(): q for q in self.queues + self.old_queues @@ -742,8 +747,56 @@ class PipelineState(zkobject.ZKObject): "queues": queues, "old_queues": old_queues, }) + if context.build_references: + self._fixBuildReferences(data, context) + context.build_references = False return data + def _fixBuildReferences(self, data, context): + # Reconcile duplicate builds; if we find any BuildReference + # objects, look up the actual builds and replace + log = context.log + build_map = {} + to_replace_dicts = [] + to_replace_lists = [] + for queue in data['queues'] + data['old_queues']: + for item in queue.queue: + buildset = item.current_build_set + for build_job, build in buildset.builds.items(): + if isinstance(build, BuildReference): + to_replace_dicts.append((item, + buildset.builds, + build_job, + build._path)) + else: + build_map[build.getPath()] = build + for job_name, build_list in buildset.retry_builds.items(): + for build in build_list: + if isinstance(build, BuildReference): + to_replace_lists.append((item, + build_list, + build, + build._path)) + else: + build_map[build.getPath()] = build + for (item, build_dict, build_job, build_path) in to_replace_dicts: + orig_build = build_map.get(build_path) + if orig_build: + build_dict[build_job] = orig_build + else: + log.warning("Unable to find deduplicated build %s for %s", + build_path, item) + del build_dict[build_job] + for (item, build_list, build, build_path) in to_replace_lists: + idx = build_list.index(build) + orig_build = build_map.get(build_path) + if orig_build: + build_list[idx] = build_map[build_path] + else: + log.warning("Unable to find deduplicated build %s for %s", + build_path, item) + del build_list[idx] + def _getKnownItems(self): items = [] for queue in (*self.old_queues, *self.queues): @@ -3424,6 +3477,11 @@ class BuildRequest(JobRequest): ) +class BuildReference: + def __init__(self, _path): + self._path = _path + + class Build(zkobject.ZKObject): """A Build is an instance of a single execution of a Job. @@ -3852,6 +3910,13 @@ class BuildSet(zkobject.ZKObject): } return json.dumps(data, sort_keys=True).encode("utf8") + def _isMyBuild(self, build_path): + parts = build_path.split('/') + buildset_uuid = parts[-5] + if buildset_uuid == self.uuid: + return True + return False + def deserialize(self, raw, context): data = super().deserialize(raw, context) # Set our UUID so that getPath() returns the correct path for @@ -3944,8 +4009,12 @@ class BuildSet(zkobject.ZKObject): if not build.result: build.refresh(context) else: - build = Build.fromZK( - context, build_path, job=job, build_set=self) + if not self._isMyBuild(build_path): + build = BuildReference(build_path) + context.build_references = True + else: + build = Build.fromZK( + context, build_path, job=job, build_set=self) builds[job_name] = build for retry_path in data["retry_builds"].get(job_name, []): @@ -3954,8 +4023,12 @@ class BuildSet(zkobject.ZKObject): # Retry builds never change. pass else: - retry_build = Build.fromZK( - context, retry_path, job=job, build_set=self) + if not self._isMyBuild(retry_path): + retry_build = BuildReference(retry_path) + context.build_references = True + else: + retry_build = Build.fromZK( + context, retry_path, job=job, build_set=self) retry_builds[job_name].append(retry_build) data.update({ @@ -6180,12 +6253,13 @@ class MergeCompletedEvent(ResultEvent): :arg dict repo_state: The starting repo state before the merge. 
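Because deduplicated builds live under another buildset's ZooKeeper path, deserializing a buildset that does not own a build now records a BuildReference placeholder, and _fixBuildReferences() later swaps each placeholder for the real Build found elsewhere in the pipeline state. A simplified sketch of that two-pass resolution; the real code also walks retry-build lists and logs builds it cannot resolve:

    # Simplified sketch of the BuildReference fix-up: first index every
    # real build by its ZooKeeper path, then replace the placeholders.
    class BuildReference:
        def __init__(self, path):
            self._path = path

    class Build:
        def __init__(self, path):
            self.path = path

    def fix_build_references(buildsets):
        build_map = {}
        placeholders = []
        for builds in buildsets:                 # builds: {job_name: obj}
            for job_name, build in builds.items():
                if isinstance(build, BuildReference):
                    placeholders.append((builds, job_name, build._path))
                else:
                    build_map[build.path] = build
        for builds, job_name, path in placeholders:
            real = build_map.get(path)
            if real is not None:
                builds[job_name] = real
            else:
                del builds[job_name]             # deduplicated build is gone

    shared = Build('/zuul/builds/abc')
    owner = {'unit-tests': shared}
    duplicate = {'unit-tests': BuildReference('/zuul/builds/abc')}
    fix_build_references([owner, duplicate])
    assert duplicate['unit-tests'] is shared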
:arg list item_in_branches: A list of branches in which the final commit in the merge list appears (changes without refs). - :arg list errors: A list of error message strings + :arg list errors: A list of error message strings. + :arg float elapsed_time: Elapsed time of merge op in seconds. """ def __init__(self, request_uuid, build_set_uuid, merged, updated, commit, files, repo_state, item_in_branches, - errors): + errors, elapsed_time): self.request_uuid = request_uuid self.build_set_uuid = build_set_uuid self.merged = merged @@ -6195,6 +6269,7 @@ class MergeCompletedEvent(ResultEvent): self.repo_state = repo_state or {} self.item_in_branches = item_in_branches or [] self.errors = errors or [] + self.elapsed_time = elapsed_time def __repr__(self): return ('<MergeCompletedEvent job: %s buildset: %s merged: %s ' @@ -6214,6 +6289,7 @@ class MergeCompletedEvent(ResultEvent): "repo_state": dict(self.repo_state), "item_in_branches": list(self.item_in_branches), "errors": list(self.errors), + "elapsed_time": self.elapsed_time, } @classmethod @@ -6228,6 +6304,7 @@ class MergeCompletedEvent(ResultEvent): dict(data.get("repo_state", {})), list(data.get("item_in_branches", [])), list(data.get("errors", [])), + data.get("elapsed_time"), ) @@ -6236,16 +6313,19 @@ class FilesChangesCompletedEvent(ResultEvent): :arg BuildSet build_set: The build_set which is ready. :arg list files: List of files changed. + :arg float elapsed_time: Elapsed time of merge op in seconds. """ - def __init__(self, build_set_uuid, files): + def __init__(self, build_set_uuid, files, elapsed_time): self.build_set_uuid = build_set_uuid self.files = files or [] + self.elapsed_time = elapsed_time def toDict(self): return { "build_set_uuid": self.build_set_uuid, "files": list(self.files), + "elapsed_time": self.elapsed_time, } @classmethod @@ -6253,6 +6333,7 @@ class FilesChangesCompletedEvent(ResultEvent): return cls( data.get("build_set_uuid"), list(data.get("files", [])), + data.get("elapsed_time"), ) @@ -7078,6 +7159,7 @@ class Layout(object): noop = Job('noop') noop.description = 'A job that will always succeed, no operation.' 
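The completion events carry elapsed_time through ZooKeeper as part of their JSON form, and fromDict() reads it with .get(), so events written before the upgrade simply come back with elapsed_time of None. A small round-trip sketch showing only a subset of the real FilesChangesCompletedEvent fields:

    # Round-trip sketch for the new elapsed_time field; only a subset of
    # the real event fields is shown here.
    import json

    class FilesChangesCompletedEvent:
        def __init__(self, build_set_uuid, files, elapsed_time):
            self.build_set_uuid = build_set_uuid
            self.files = files or []
            self.elapsed_time = elapsed_time

        def toDict(self):
            return {'build_set_uuid': self.build_set_uuid,
                    'files': list(self.files),
                    'elapsed_time': self.elapsed_time}

        @classmethod
        def fromDict(cls, data):
            return cls(data.get('build_set_uuid'),
                       list(data.get('files', [])),
                       data.get('elapsed_time'))

    old = json.dumps({'build_set_uuid': 'abc', 'files': []})  # pre-upgrade form
    event = FilesChangesCompletedEvent.fromDict(json.loads(old))
    assert event.elapsed_time is None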
noop.parent = noop.BASE_JOB_MARKER + noop.deduplicate = False noop.run = (PlaybookContext(None, 'noop.yaml', [], []),) self.jobs = {'noop': [noop]} self.nodesets = {} diff --git a/zuul/reporter/__init__.py b/zuul/reporter/__init__.py index 9b8f2c11c..5723316a3 100644 --- a/zuul/reporter/__init__.py +++ b/zuul/reporter/__init__.py @@ -257,9 +257,13 @@ class BaseReporter(object, metaclass=abc.ABCMeta): # Extract the report elements from an item config = self.connection.sched.config jobs_fields = [] + skipped = 0 for job in item.getJobs(): build = item.current_build_set.getBuild(job.name) (result, url) = item.formatJobResult(job) + if result == 'SKIPPED': + skipped += 1 + continue if not job.voting: voting = ' (non-voting)' else: @@ -300,12 +304,15 @@ class BaseReporter(object, metaclass=abc.ABCMeta): success_message = job.success_message jobs_fields.append( (name, url, result, error, elapsed, voting, success_message)) - return jobs_fields + return jobs_fields, skipped def _formatItemReportJobs(self, item): # Return the list of jobs portion of the report ret = '' - jobs_fields = self._getItemReportJobsFields(item) + jobs_fields, skipped = self._getItemReportJobsFields(item) for job_fields in jobs_fields: ret += '- %s%s : %s%s%s%s\n' % job_fields[:6] + if skipped: + jobtext = 'job' if skipped == 1 else 'jobs' + ret += 'Skipped %i %s\n' % (skipped, jobtext) return ret diff --git a/zuul/scheduler.py b/zuul/scheduler.py index 272235757..dfc922cf1 100644 --- a/zuul/scheduler.py +++ b/zuul/scheduler.py @@ -438,12 +438,12 @@ class Scheduler(threading.Thread): mergers_online = 0 for executor_component in self.component_registry.all("executor"): - if executor_component.allow_unzoned or not executor_component.zone: + if executor_component.allow_unzoned: if executor_component.state == BaseComponent.RUNNING: executors_unzoned_online += 1 if executor_component.accepting_work: executors_unzoned_accepting += 1 - else: + if executor_component.zone: zone_stats = zoned_executor_stats.setdefault( executor_component.zone, executor_stats_default.copy()) diff --git a/zuul/zk/nodepool.py b/zuul/zk/nodepool.py index 109053f11..1be4dc2b5 100644 --- a/zuul/zk/nodepool.py +++ b/zuul/zk/nodepool.py @@ -12,6 +12,7 @@ import json import logging +import os import time from enum import Enum from typing import Optional, List @@ -43,7 +44,7 @@ class ZooKeeperNodepool(ZooKeeperBase): Class implementing Nodepool related ZooKeeper interface. """ NODES_ROOT = "/nodepool/nodes" - LAUNCHER_ROOT = "/nodepool/launchers" + COMPONENT_ROOT = "/nodepool/components" REQUEST_ROOT = '/nodepool/requests' REQUEST_LOCK_ROOT = "/nodepool/requests-lock" HOLD_REQUEST_ROOT = '/zuul/hold-requests' @@ -95,9 +96,6 @@ class ZooKeeperNodepool(ZooKeeperBase): self._node_tree.close() self._node_tree = None - def _launcherPath(self, launcher): - return "%s/%s" % (self.LAUNCHER_ROOT, launcher) - def _nodePath(self, node): return "%s/%s" % (self.NODES_ROOT, node) @@ -113,15 +111,15 @@ class ZooKeeperNodepool(ZooKeeperBase): :returns: A list of Launcher objects, or empty list if none are found. 
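With _getItemReportJobsFields() now returning a (fields, skipped) pair, each driver reporter prints a single summary line instead of listing every skipped job. A sketch of how a reporter consumes the new return value, with the per-job field tuple abbreviated to three elements:

    # Sketch of a reporter consuming the (jobs_fields, skipped) pair;
    # the per-job tuple is abbreviated to (name, url, result).
    def format_item_report_jobs(jobs_fields, skipped):
        ret = ''
        for name, url, result in jobs_fields:
            ret += '- %s %s : %s\n' % (name, url, result)
        if skipped:
            jobtext = 'job' if skipped == 1 else 'jobs'
            ret += 'Skipped %i %s\n' % (skipped, jobtext)
        return ret

    print(format_item_report_jobs(
        [('unit-tests', 'https://logs.example.com/1', 'SUCCESS')], skipped=2))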
""" + root_path = os.path.join(self.COMPONENT_ROOT, 'pool') try: - launcher_ids = self.kazoo_client\ - .get_children(self.LAUNCHER_ROOT) + pools = self.kazoo_client.get_children(root_path) except NoNodeError: return [] objs = [] - for launcher in launcher_ids: - path = self._launcherPath(launcher) + for pool in pools: + path = os.path.join(root_path, pool) try: data, _ = self.kazoo_client.get(path) except NoNodeError: diff --git a/zuul/zk/zkobject.py b/zuul/zk/zkobject.py index aa32b8b9b..8de3f34ba 100644 --- a/zuul/zk/zkobject.py +++ b/zuul/zk/zkobject.py @@ -43,6 +43,7 @@ class ZKContext: self.cumulative_write_znodes = 0 self.cumulative_read_bytes = 0 self.cumulative_write_bytes = 0 + self.build_references = False def sessionIsValid(self): return ((not self.lock or self.lock.is_still_valid()) and |