-rw-r--r--  doc/source/developer/index.rst  1
-rw-r--r--  doc/source/developer/metrics.rst  74
-rw-r--r--  doc/source/developer/specs/index.rst  1
-rw-r--r--  doc/source/developer/specs/tracing.rst  323
-rw-r--r--  doc/source/monitoring.rst  34
-rw-r--r--  playbooks/zuul-stream/fixtures/test-stream.yaml  25
-rw-r--r--  playbooks/zuul-stream/functional.yaml  113
-rw-r--r--  playbooks/zuul-stream/post.yaml  12
-rw-r--r--  playbooks/zuul-stream/validate.yaml  38
-rw-r--r--  tests/base.py  13
-rw-r--r--  tests/fixtures/layouts/job-dedup-false.yaml  1
-rw-r--r--  tests/fixtures/layouts/job-dedup-noop.yaml  55
-rw-r--r--  tests/remote/test_remote_zuul_stream.py  2
-rw-r--r--  tests/unit/test_circular_dependencies.py  145
-rw-r--r--  tests/unit/test_github_driver.py  2
-rw-r--r--  tests/unit/test_scheduler.py  18
-rw-r--r--  tests/unit/test_sos.py  73
-rw-r--r--  tests/unit/test_v3.py  18
-rw-r--r--  web/public/openapi.yaml  8
-rw-r--r--  web/src/containers/status/ChangePanel.jsx  49
-rw-r--r--  web/src/containers/status/ChangePanel.test.jsx  28
-rw-r--r--  web/src/index.css  5
-rw-r--r--  zuul/ansible/base/callback/zuul_stream.py  103
-rwxr-xr-x  zuul/ansible/base/library/zuul_console.py  50
-rw-r--r--  zuul/driver/gerrit/gerritconnection.py  2
-rw-r--r--  zuul/driver/github/githubreporter.py  5
-rw-r--r--  zuul/driver/pagure/pagurereporter.py  5
-rw-r--r--  zuul/driver/smtp/smtpreporter.py  2
-rw-r--r--  zuul/executor/server.py  6
-rw-r--r--  zuul/manager/__init__.py  28
-rw-r--r--  zuul/merger/client.py  5
-rw-r--r--  zuul/merger/server.py  9
-rw-r--r--  zuul/model.py  96
-rw-r--r--  zuul/reporter/__init__.py  11
-rw-r--r--  zuul/scheduler.py  4
-rw-r--r--  zuul/zk/nodepool.py  14
-rw-r--r--  zuul/zk/zkobject.py  1
37 files changed, 1235 insertions(+), 144 deletions(-)
diff --git a/doc/source/developer/index.rst b/doc/source/developer/index.rst
index 52266a175..b45c75640 100644
--- a/doc/source/developer/index.rst
+++ b/doc/source/developer/index.rst
@@ -14,6 +14,7 @@ Zuul, though advanced users may find it interesting.
drivers
triggers
testing
+ metrics
docs
ansible
javascript
diff --git a/doc/source/developer/metrics.rst b/doc/source/developer/metrics.rst
new file mode 100644
index 000000000..913a591ba
--- /dev/null
+++ b/doc/source/developer/metrics.rst
@@ -0,0 +1,74 @@
+:title: Metrics
+
+Metrics
+=======
+
+Event Overview
+--------------
+
+The following table illustrates the event and pipeline processing
+sequence as it relates to some of the metrics described in
+:ref:`statsd`. This is intended as general guidance only and is not
+an exhaustive list.
+
++----------------------------------------+------+------+------+--------------------------------------+
+| Event | Metrics | Attribute |
++========================================+======+======+======+======================================+
+| Event generated by source | | | | event.timestamp |
++----------------------------------------+------+ + +--------------------------------------+
+| Enqueued into driver queue | | | | |
++----------------------------------------+------+ + +--------------------------------------+
+| Enqueued into tenant trigger queue | | | | event.arrived_at_scheduler_timestamp |
++----------------------------------------+ + [8] + +--------------------------------------+
+| Forwarded to matching pipelines | [1] | | | |
++----------------------------------------+ + + +--------------------------------------+
+| Changes enqueued ahead | | | | |
++----------------------------------------+ + + +--------------------------------------+
+| Change enqueued | | | | item.enqueue_time |
++----------------------------------------+------+------+ +--------------------------------------+
+| Changes enqueued behind | | | | |
++----------------------------------------+------+------+ +--------------------------------------+
+| Set item configuration | | | | build_set.configured_time |
++----------------------------------------+------+------+ +--------------------------------------+
+| Request files changed (if needed) | | | | |
++----------------------------------------+ +------+ +--------------------------------------+
+| Request merge | [2] | | | |
++----------------------------------------+ +------+ +--------------------------------------+
+| Wait for merge (and files if needed) | | | [9] | |
++----------------------------------------+------+------+ +--------------------------------------+
+| Generate dynamic layout (if needed) | [3] | | | |
++----------------------------------------+------+------+ +--------------------------------------+
+| Freeze job graph | [4] | | | |
++----------------------------------------+------+------+ +--------------------------------------+
+| Request global repo state (if needed) | | | | build_set.repo_state_request_time |
++----------------------------------------+ [5] +------+ +--------------------------------------+
+| Wait for global repo state (if needed) | | | | |
++----------------------------------------+------+------+ +--------------------------------------+
+| Deduplicate jobs | | | | |
++----------------------------------------+------+------+ +--------------------------------------+
+| Acquire semaphore (non-resources-first)| | | | |
++----------------------------------------+------+------+ +--------------------------------------+
+| Request nodes | | | | request.created_time |
++----------------------------------------+ [6] +------+ +--------------------------------------+
+| Wait for nodes | | | | |
++----------------------------------------+------+------+ +--------------------------------------+
+| Acquire semaphore (resources-first) | | | | |
++----------------------------------------+------+------+ +--------------------------------------+
+| Enqueue build request | | | | build.execute_time |
++----------------------------------------+ [7] +------+ +--------------------------------------+
+| Executor starts job | | | | build.start_time |
++----------------------------------------+------+------+------+--------------------------------------+
+
+====== =============================
+Metric Name
+====== =============================
+1      event_enqueue_processing_time
+2      merge_request_time
+3      layout_generation_time
+4      job_freeze_time
+5      repo_state_time
+6      node_request_time
+7      job_wait_time
+8      event_enqueue_time
+9      event_job_time
+====== =============================
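+
+As a rough illustration of how these pieces fit together, the sketch
+below shows how metric [8] (``event_enqueue_time``) could be derived
+from the attributes in the table and reported via statsd.  The helper
+function and client setup here are hypothetical, not the actual
+scheduler code; real reporting happens inside the scheduler's
+pipeline processing.
+
+.. code-block:: python
+
+   import statsd
+
+   client = statsd.StatsClient('localhost', 8125)
+
+   def report_event_enqueue_time(event, item, tenant, pipeline):
+       # Metric [8] covers event generation at the source through the
+       # change being enqueued; statsd timers are in milliseconds.
+       dt_ms = (item.enqueue_time - event.timestamp) * 1000
+       client.timing('zuul.tenant.%s.pipeline.%s.event_enqueue_time'
+                     % (tenant, pipeline), dt_ms)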
diff --git a/doc/source/developer/specs/index.rst b/doc/source/developer/specs/index.rst
index d96df0c26..78c11bbb8 100644
--- a/doc/source/developer/specs/index.rst
+++ b/doc/source/developer/specs/index.rst
@@ -23,3 +23,4 @@ documentation instead.
enhanced-regional-executors
tenant-resource-quota
community-matrix
+ tracing
diff --git a/doc/source/developer/specs/tracing.rst b/doc/source/developer/specs/tracing.rst
new file mode 100644
index 000000000..5e0e9e4d1
--- /dev/null
+++ b/doc/source/developer/specs/tracing.rst
@@ -0,0 +1,323 @@
+Tracing
+=======
+
+.. warning:: This is not authoritative documentation. These features
+ are not currently available in Zuul. They may change significantly
+ before final implementation, or may never be fully completed.
+
+It can be difficult for a user to understand what steps were involved
+between a trigger event (such as a patchset upload or recheck comment)
+and a buildset report. If it took an unusually long time it can be
+difficult to determine why. At present, an operator would need to
+examine logs to determine what steps were involved and the sources of
+any potential delays. Even experienced operators and developers can
+take quite some time to first collect and then analyze logs to answer
+these questions.
+
+Sometimes these answers may point to routine system operation (such as
+a delay caused by many gate resets, or preparing a large number of
+repositories). Other times they may point to deficiencies in the
+system (insufficient mergers) or bugs in the code.
+
+Being able to visualize the activities of a Zuul system can help
+operators (and potentially users) triage and diagnose issues more
+quickly and accurately. Even if examining logs is ultimately required
+in order to fully diagnose an issue, being able to narrow down the
+scope using analysis tools can greatly simplify the process.
+
+Proposed Solution
+-----------------
+
+Implementing distributed tracing in Zuul can help improve the
+observability of the system and aid operators and potentially users in
+understanding the sequence of events.
+
+By exporting information about the processing Zuul performs using the
+OpenTelemetry API, data about Zuul operations can be collected in any
+of several tools for analysis.
+
+OpenTelemetry is an open-source protocol for exchanging observability
+data, an SDK implementing that protocol, and an implementation of a
+collector for distributing information to multiple backends.
+
+It supports three kinds of observability data: `traces`, `metrics`,
+and `logs`. Since Zuul already has support for metrics and logs, this
+specification proposes that we use only the support in OpenTelemetry
+for `traces`.
+
+Usage Scenarios
+~~~~~~~~~~~~~~~
+
+Usage of OpenTelemetry should be entirely optional and supplementary
+for any Zuul deployment. Log messages alone should continue to be
+sufficient to analyze any potential problem.
+
+Should a deployer wish to use OpenTelemetry tracing data, a very
+simple deployment for smaller sites may be constructed by running only
+Jaeger. Jaeger is a service that can receive, store, and display
+tracing information. The project distributes an all-in-one container
+image which can store data in local filesystem storage.
+
+https://www.jaegertracing.io/
+
+Larger sites may wish to run multiple collectors and feed data to
+larger, distributed storage backends (such as Cassandra,
+Elasticsearch, etc).
+
+Suitability to Zuul
+~~~~~~~~~~~~~~~~~~~
+
+OpenTelemetry tracing, at a high level, is designed to record
+information about events, their timing, and their relation to other
+events. At first this seems like a natural fit for Zuul, which reacts
+to events, processes events, and generates more events. However,
+OpenTelemetry's bias toward small and simple web applications is
+evident throughout its documentation and the SDK implementation.
+
+ Traces give us the big picture of what happens when a request is
+ made by user or an application.
+
+Zuul is not driven by user or application requests, and a system
+designed to record several millisecond-long events which make up the
+internal response to a user request of a web app is not necessarily
+the obvious right choice for recording sequences and combinations of
+events which frequently take hours (and sometimes days) to play out
+across multiple systems.
+
+Fortunately, the concepts and protocol implementation of OpenTelemetry
+are sufficiently well-designed for the general case to be able to
+accommodate a system like Zuul, even if the SDK makes incompatible
+assumptions that make integration difficult. There are some
+challenges to implementation, but because the concepts appear to be
+well matched, we should proceed with using the OpenTelemetry protocol
+and SDK.
+
+Spans
+~~~~~
+
+The key tracing concepts in OpenTelemetry are `traces` and `spans`.
+From a data model perspective, the unit of data storage is a `span`.
+A trace itself is really just a unique ID that is common to multiple
+spans.
+
+Spans can relate to other spans as either children or links. A trace
+is generally considered to have a single 'root' span, and within the
+time period represented by that span, it may have any number of child
+spans (which may further have their own child spans).
+
+OpenTelemetry anticipates that a span on one system may spawn a child
+span on another system and includes facilities for transferring enough
+information about the parent span to a child system that the child
+system alone can emit traces for its span and any children that it
+spawns in turn.
+
+For a concrete example in Zuul, we might have a Zuul scheduler start a
+span for a buildset, and then a merger might emit a child span for
+performing the initial merge, and an executor might emit a child span
+for executing a build.
+
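+Assuming a configured ``TracerProvider``, a minimal sketch of that
+hand-off using the OpenTelemetry Python API might look like the
+following (the span names and the idea of shipping the carrier dict
+with the merge request are illustrative only, not settled design):
+
+.. code-block:: python
+
+   from opentelemetry import trace
+   from opentelemetry.trace.propagation.tracecontext import (
+       TraceContextTextMapPropagator,
+   )
+
+   tracer = trace.get_tracer('zuul')
+   propagator = TraceContextTextMapPropagator()
+
+   # Scheduler: start the buildset span and serialize its context.
+   with tracer.start_as_current_span('BuildSet'):
+       carrier = {}
+       propagator.inject(carrier)
+       # ... attach the carrier to the merge request in ZooKeeper ...
+
+   # Merger: restore the context and emit a child span under it.
+   ctx = propagator.extract(carrier)
+   with tracer.start_as_current_span('Merge', context=ctx):
+       pass  # perform the merge
+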
+Spans can relate to other spans (including spans in other traces), so
+sequences of events can be chained together without necessitating that
+they all be part of the same span or trace.
+
+Because Zuul processes series of events which may stretch for long
+periods of time, we should specify what events and actions should
+correspond to spans and traces. Spans can have arbitrary metadata
+associated with them, so we will be able to search by event or job
+IDs.
+
+The following sections describe traces and their child spans.
+
+Event Ingestion
++++++++++++++++
+
+A trace will begin when Zuul receives an event and end when that event
+has been enqueued into scheduler queues (or discarded). A driver
+completing processing of an event is a definitive point in time so it
+is easy to know when to close the root span for that event's trace
+(whereas if we kept the trace open to include scheduler processing, we
+would need to know when the last trigger event spawned by the
+connection event was complete).
+
+This may include processing in internal queues by a given driver, and
+these processing steps/queues should appear as their own child spans.
+The spans should include event IDs (and potentially other information
+about the event such as change or pull request numbers) as metadata.
+
+Tenant Event Processing
++++++++++++++++++++++++
+
+A trace will begin when a scheduler begins processing a tenant event
+and end when it has forwarded the event to all pipelines within a
+tenant. It will link to the event ingestion trace as a follow-on
+span.
+
+Queue Item
+++++++++++
+
+A trace will begin when an item is enqueued and end when it is
+dequeued. This will be quite a long trace (hours or days). It is
+expected to be the primary benefit of this telemetry effort as it will
+show the entire lifetime of a queue item. It will link to the tenant
+event processing trace as a follow-on span.
+
+Within the root span, there will be a span for each buildset (so that
+if a gate reset happens and a new buildset is created, users will see
+a series of buildset spans). Within a buildset, there will be spans
+for all of the major processing steps, such as merge operations,
+layout calculation, freezing the job graph, and freezing jobs. Each
+build will also merit a span (retried builds will get their own spans
+as well), and within a build span, there will be child spans for git
+repo prep, job setup, individual playbooks, and cleanup.
+
+SDK Challenges
+~~~~~~~~~~~~~~
+
+As a high-level concept, the idea of spans for each of these
+operations makes sense. In practice, the SDK makes implementation
+challenging.
+
+The OpenTelemetry SDK makes no provision for beginning a span on one
+system and ending it on another, so the fact that one Zuul scheduler
+might start a buildset span while another ends it is problematic.
+
+Fortunately, the OpenTelemetry API only reports spans when they end,
+not when they start. This means that we don't need to coordinate a
+"start" API call on one scheduler with an "end" API call on another.
+We can simply emit the trace with its root span at the end. However,
+any child spans emitted during that time need to know the trace ID
+they should use, which means that we at least need to store a trace ID
+and start timestamp on our starting scheduler for use by any child
+spans as well as the "end span" API call.
+
+The SDK does not support creating a span with a specific trace ID or
+start timestamp (most timestamps are automatic), but it has
+well-defined interfaces for spans and we can subclass the
+implementation to allow us to specify trace IDs and timestamps. With
+this approach, we can "virtually" start a span on one host, store its
+information in ZooKeeper with whatever long-lived object it is
+associated with (such as a QueueItem) and then make it concrete on
+another host when we end it.
+
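+A sketch of how that "virtual" start might look follows.  The helper
+names are hypothetical and the ZooKeeper plumbing is elided; only
+``NonRecordingSpan``, ``SpanContext``, and ``set_span_in_context``
+are real OpenTelemetry interfaces here:
+
+.. code-block:: python
+
+   import time
+
+   from opentelemetry import trace
+   from opentelemetry.trace import (
+       NonRecordingSpan, SpanContext, TraceFlags,
+   )
+
+   def save_span_info(span):
+       # Scheduler A: store this with the QueueItem in ZooKeeper.
+       ctx = span.get_span_context()
+       return {'trace_id': ctx.trace_id,
+               'span_id': ctx.span_id,
+               'start_time': time.time_ns()}
+
+   def child_context(span_info):
+       # Any scheduler: parent new child spans to the stored (still
+       # open) span without holding a live Span object for it.
+       parent = NonRecordingSpan(SpanContext(
+           trace_id=span_info['trace_id'],
+           span_id=span_info['span_id'],
+           is_remote=True,
+           trace_flags=TraceFlags(TraceFlags.SAMPLED)))
+       return trace.set_span_in_context(parent)
+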
+Alternatives
+++++++++++++
+
+This section describes some alternative ideas for dealing with the
+SDK's mismatch with Zuul concepts as well as why they weren't
+selected.
+
+* Multiple root spans with the same trace ID
+
+ Jaeger handles this relatively well, and the timeline view appears
+ as expected (multiple events with whitespace between them). The
+ graph view in Jaeger may have some trouble displaying this.
+
+ It is not clear that OpenTelemetry anticipates having multiple
+ "root" spans, so it may be best to avoid this in order to avoid
+ potential problems with other tools.
+
+* Child spans without a parent
+
+ If we emit spans that specify a parent which does not exist, Jaeger
+ will display these traces but show a warning that the parent is
+ invalid. This may occur naturally while the system is operating
+ (builds complete while a buildset is running), but should be
+ eventually corrected once an item is dequeued. In case of a serious
+ error, we may never close a parent span, which would cause this to
+ persist. We should accept that this may happen, but try to avoid it
+ happening intentionally.
+
+Links
+~~~~~
+
+Links between spans are fairly primitive in Jaeger. While the
+OpenTelemetry API includes attributes for links (so that when we link
+a queue item to an event, we could specify that it was a forwarded
+event), Jaeger does not store or render them. Instead, we are only
+left with a reference to a ``< span in another trace >`` with a
+reference type of ``FOLLOWS_FROM``. Clicking on that link will
+immediately navigate to the other trace where metadata about the trace
+will be visible, but before clicking on it, users will have little
+idea of what awaits on the other side.
+
+For this reason, we should use span links sparingly so that when they
+are encountered, users are likely to intuit what they are for and are
+not overwhelmed by multiple indistinguishable links.
+
+Events and Exceptions
+~~~~~~~~~~~~~~~~~~~~~
+
+OpenTelemetry allows events to be added to spans. Events have their
+own timestamp and attributes. These can be used to add additional
+context to spans (representing single points in time rather than
+events with duration that should be child spans). Examples might
+include receiving a request to cancel a job or dequeue an item.
+
+Events should not be used as an alternative to logs, nor should all
+log messages be copied as events. Events should be used sparingly to
+avoid overwhelming the tracing storage with data and the user with
+information.
+
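+For example (a sketch, not actual Zuul code; ``event_id`` is an
+assumed variable), a dequeue request could be recorded as a
+point-in-time event on the current span:
+
+.. code-block:: python
+
+   from opentelemetry import trace
+
+   span = trace.get_current_span()
+   span.add_event('dequeue-requested',
+                  attributes={'zuul.event_id': event_id})
+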
+Exceptions may also be included in spans. This happens automatically
+and by default when using the context managers supplied by the SDK.
+Because many spans in Zuul will be unable to use the SDK context
+managers and any exception information would need to be explicitly
+handled and stored in ZooKeeper, we will disable inclusion of
+exception information in spans. This will provide a more consistent
+experience (so that users don't take the absence of an exception in
+tracing information to indicate the absence of an error in logs) and
+reduce the cost of supporting traces (extra storage in ZooKeeper and
+in the telemetry storage).
+
+If we decide that exception information is worth including in the
+future, this decision will be easy to revisit and reverse.
+
+Sensitive Information
+~~~~~~~~~~~~~~~~~~~~~
+
+No sensitive information (secrets, passwords, job variables, etc)
+should be included in tracing output. All output should be suitable
+for an audience of Zuul users (that is, if someone has access to the
+Zuul dashboard, then tracing data should not have any more sensitive
+information than they already have access to). For public-facing Zuul
+systems (such as OpenDev), the information should be suitable for
+public use.
+
+Protobuf and gRPC
+~~~~~~~~~~~~~~~~~
+
+The most efficient and straightforward method of transmitting data
+from Zuul to a collector (including Jaeger) is using OTLP with gRPC
+(OpenTelemetry Protocol + gRPC Remote Procedure Calls). Because
+Protobuf applications include automatically generated code, we may
+encounter the occasional version inconsistency. We may need to
+navigate package requirements more than normal due to this (especially
+if we have multiple packages that depend on protobuf).
+
+For a contemporary example, the OpenTelemetry project is in the
+process of pinning to an older version of protobuf:
+
+https://github.com/open-telemetry/opentelemetry-python/issues/2717
+
+There is an HTTP+JSON exporter as well, so in the case that something
+goes very wrong with protobuf+gRPC, that may be available as a fallback.
+
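+For illustration, a minimal exporter setup with the standard
+``opentelemetry-sdk`` and ``opentelemetry-exporter-otlp`` packages
+might look like this (the endpoint shown is Jaeger's default OTLP
+gRPC port and is an assumption of this sketch, not a decided
+configuration):
+
+.. code-block:: python
+
+   from opentelemetry import trace
+   from opentelemetry.sdk.trace import TracerProvider
+   from opentelemetry.sdk.trace.export import BatchSpanProcessor
+   from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import (
+       OTLPSpanExporter,
+   )
+
+   provider = TracerProvider()
+   provider.add_span_processor(BatchSpanProcessor(
+       OTLPSpanExporter(endpoint='jaeger:4317', insecure=True)))
+   trace.set_tracer_provider(provider)
+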
+Work Items
+----------
+
+* Add OpenTelemetry SDK and support for configuring an exporter to
+ zuul.conf
+* Implement SDK subclasses to support opening and closing spans on
+ different hosts
+* Instrument event processing in each driver
+* Instrument event processing in scheduler
+* Instrument queue items and related spans
+* Document a simple Jaeger setup as a quickstart add-on (similar to
+ authz)
+* Optional: work with OpenDev to run a public Jaeger server for
+ OpenDev
+
+The last item is not required for this specification (and not our
+choice as Zuul developers to make), but it would be nice if one were
+available so that all Zuul users and developers have a reference
+implementation for community collaboration.
diff --git a/doc/source/monitoring.rst b/doc/source/monitoring.rst
index 0c2cb4351..1cb61ee01 100644
--- a/doc/source/monitoring.rst
+++ b/doc/source/monitoring.rst
@@ -110,7 +110,27 @@ These metrics are emitted by the Zuul :ref:`scheduler`:
operation(s). This will always include a request to a Zuul
merger to speculatively merge the change, but it may also
include a second request submitted in parallel to identify
- the files altered by the change.
+ the files altered by the change. Includes
+ :stat:`zuul.tenant.<tenant>.pipeline.<pipeline>.merger_merge_op_time`
+ and
+ :stat:`zuul.tenant.<tenant>.pipeline.<pipeline>.merger_files_changes_op_time`.
+
+ .. stat:: merger_merge_op_time
+ :type: timer
+
+ The amount of time the merger spent performing a merge
+ operation. This does not include any of the round-trip time
+ from the scheduler to the merger, or any other merge
+ operations.
+
+ .. stat:: merger_files_changes_op_time
+ :type: timer
+
+ The amount of time the merger spent performing a files-changes
+ operation to detect changed files (this is sometimes
+ performed if the source does not provide this information).
+ This does not include any of the round-trip time from the
+ scheduler to the merger, or any other merge operations.
.. stat:: layout_generation_time
:type: timer
@@ -128,7 +148,17 @@ These metrics are emitted by the Zuul :ref:`scheduler`:
The amount of time waiting for a secondary Zuul merger
operation to collect additional information about the repo
- state of required projects.
+ state of required projects. Includes
+ :stat:`zuul.tenant.<tenant>.pipeline.<pipeline>.merger_repo_state_op_time`.
+
+ .. stat:: merger_repo_state_op_time
+ :type: timer
+
+ The amount of time the merger spent performing a repo state
+ operation to collect additional information about the repo
+ state of required projects. This does not include any of the
+ round-trip time from the scheduler to the merger, or any
+ other merge operations.
.. stat:: node_request_time
:type: timer
diff --git a/playbooks/zuul-stream/fixtures/test-stream.yaml b/playbooks/zuul-stream/fixtures/test-stream.yaml
index 05b75daf9..0326ae54e 100644
--- a/playbooks/zuul-stream/fixtures/test-stream.yaml
+++ b/playbooks/zuul-stream/fixtures/test-stream.yaml
@@ -1,3 +1,16 @@
+- name: Start zuul stream daemon
+ hosts: node
+ tasks:
+
+  # NOTE: when new_console is set, this playbook runs with
+ # ZUUL_CONSOLE_PORT=19887 so that we test with the zuul_console
+ # from the Zuul checkout, and not the one started by test
+ # infrastructure.
+ - name: Start zuul_console on non-default port
+ zuul_console:
+ port: 19887
+ when: new_console | default(false)
+
- name: Run some commands to show that logging works
hosts: node
tasks:
@@ -51,3 +64,15 @@
- name: Print binary data
command: echo -e '\x80abc'
+
+ - name: Find any console log files
+ find:
+ paths: /tmp
+ patterns: 'console-*.log'
+ register: _tmp_files
+
+ # We check this list in zuul-stream/functional.yaml to make sure
+ # we're cleaning up console log files.
+ - name: Dump tmp files
+ debug:
+ var: _tmp_files
diff --git a/playbooks/zuul-stream/functional.yaml b/playbooks/zuul-stream/functional.yaml
index ee1643fbc..7ae4704a9 100644
--- a/playbooks/zuul-stream/functional.yaml
+++ b/playbooks/zuul-stream/functional.yaml
@@ -7,71 +7,104 @@
# the python version of the platform is changed.
python_path: "/usr/local/lib/python3.10/dist-packages"
- - name: Run ansible that should succeed
+ - name: Run ansible that should succeed against testing console
command: >
/usr/lib/zuul/ansible/{{ zuul_ansible_version }}/bin/ansible-playbook
+ -e "new_console=true"
src/opendev.org/zuul/zuul/playbooks/zuul-stream/fixtures/test-stream.yaml
environment:
+ # Setup by test-stream.yaml so we start a new zuul_console
+ # from this checkout.
+ ZUUL_CONSOLE_PORT: 19887
ZUUL_JOB_LOG_CONFIG: "{{ ansible_user_dir}}/logging.json"
ZUUL_JOBDIR: "{{ ansible_user_dir}}"
PYTHONPATH: "{{ python_path }}"
+ register: _success_output
- - name: Run ansible playbook that should fail
+ - name: Save raw output to file
+ copy:
+ content: '{{ _success_output.stdout }}'
+ dest: 'console-job-output-success-19887.txt'
+
+ - name: Save output
+ shell: |
+ mv job-output.txt job-output-success-19887.txt
+ mv job-output.json job-output-success-19887.json
+
+ - name: Check protocol version
+ assert:
+ that:
+ - "'[node1] Reports streaming version: 1' in _success_output.stdout"
+
+ # Streamer puts out a line like
+ # [node1] Starting to log 916b2084-4bbb-80e5-248e-000000000016-1-node1 for task TASK: Print binary data
+  # One of the tasks in the job output dumps the "find" results;
+  # the console file for each logged task should not appear there.
+ - name: Validate temporary files removed
+ shell: |
+ for f in $(grep 'Starting to log' console-job-output-success-19887.txt | awk '{print $5}'); do
+ echo "Checking ${f}"
+ if grep -q '"path": "/tmp/console-'${f}'.log"' job-output-success-19887.txt; then
+          echo "*** /tmp/console-${f}.log still exists"
+ exit 1
+ fi
+ done
+
+  # NOTE(ianw) 2022-07: we deliberately have this second step run
+  # against the console set up by the infrastructure executor in the
+  # job pre playbooks as a backwards-compatibility sanity check.
+ - name: Run ansible that should succeed against extant console
command: >
/usr/lib/zuul/ansible/{{ zuul_ansible_version }}/bin/ansible-playbook
- src/opendev.org/zuul/zuul/playbooks/zuul-stream/fixtures/test-stream-failure.yaml
- register: failed_results
- failed_when: "failed_results.rc != 2"
+ -e "new_console=false"
+ src/opendev.org/zuul/zuul/playbooks/zuul-stream/fixtures/test-stream.yaml
environment:
ZUUL_JOB_LOG_CONFIG: "{{ ansible_user_dir}}/logging.json"
ZUUL_JOBDIR: "{{ ansible_user_dir}}"
PYTHONPATH: "{{ python_path }}"
+ register: _success_output
- - name: Validate output - setupvar
- shell: |
- egrep "^.*\| node1 \|\s+\"setupvar\": {" job-output.txt
- egrep "^.*\| node2 \|\s+\"setupvar\": {" job-output.txt
+ - name: Save raw output to file
+ copy:
+ content: '{{ _success_output.stdout }}'
+ dest: 'console-job-output-success-19885.txt'
- - name: Validate output - shell task
+ - name: Save output
shell: |
- egrep "^.*\| node1 \| 1: lo:" job-output.txt
- egrep "^.*\| node2 \| 1: lo:" job-output.txt
+ mv job-output.txt job-output-success-19885.txt
+ mv job-output.json job-output-success-19885.json
- - name: Validate output - loop with items
- shell: |
- egrep "^.+\| node1 \| ok: Item: item1" job-output.txt
- egrep "^.+\| node1 \| ok: Item: item2" job-output.txt
- egrep "^.+\| node1 \| ok: Item: item3" job-output.txt
- egrep "^.+\| node2 \| ok: Item: item1" job-output.txt
- egrep "^.+\| node2 \| ok: Item: item2" job-output.txt
- egrep "^.+\| node2 \| ok: Item: item3" job-output.txt
+ - name: Validate outputs
+ include_tasks: validate.yaml
+ loop:
+ - job-output-success-19887.txt
+ - job-output-success-19885.txt
- - name: Validate output - loop with complex items
- shell: |
- egrep "^.+\| node1 \| ok: Item: Runtime" job-output.txt
- egrep "^.+\| node2 \| ok: Item: Runtime" job-output.txt
+ # failure case
- - name: Validate output - failed shell task
- shell: |
- egrep "^.+\| node1 \| Exception: Test module failure exception task" job-output.txt
- egrep "^.+\| node2 \| Exception: Test module failure exception task" job-output.txt
+ - name: Run ansible playbook that should fail
+ command: >
+ /usr/lib/zuul/ansible/{{ zuul_ansible_version }}/bin/ansible-playbook
+ src/opendev.org/zuul/zuul/playbooks/zuul-stream/fixtures/test-stream-failure.yaml
+ register: failed_results
+ failed_when: "failed_results.rc != 2"
+ environment:
+ ZUUL_CONSOLE_PORT: 19887
+ ZUUL_JOB_LOG_CONFIG: "{{ ansible_user_dir}}/logging.json"
+ ZUUL_JOBDIR: "{{ ansible_user_dir}}"
+ PYTHONPATH: "{{ python_path }}"
- - name: Validate output - item loop with exception
+ - name: Save output
shell: |
- egrep "^.+\| node1 \| Exception: Test module failure exception loop" job-output.txt
- egrep "^.+\| node2 \| Exception: Test module failure exception loop" job-output.txt
+ mv job-output.txt job-output-failure.txt
+ mv job-output.json job-output-failure.json
- name: Validate output - failure shell task with exception
shell: |
- egrep "^.+\| node1 \| Exception: Test module failure exception fail-task" job-output.txt
- egrep "^.+\| node2 \| Exception: Test module failure exception fail-task" job-output.txt
+ egrep "^.+\| node1 \| Exception: Test module failure exception fail-task" job-output-failure.txt
+ egrep "^.+\| node2 \| Exception: Test module failure exception fail-task" job-output-failure.txt
- name: Validate output - failure item loop with exception
shell: |
- egrep "^.+\| node1 \| Exception: Test module failure exception fail-loop" job-output.txt
- egrep "^.+\| node2 \| Exception: Test module failure exception fail-loop" job-output.txt
-
- - name: Validate output - binary data
- shell: |
- egrep "^.*\| node1 \| \\\\x80abc" job-output.txt
- egrep "^.*\| node2 \| \\\\x80abc" job-output.txt
+ egrep "^.+\| node1 \| Exception: Test module failure exception fail-loop" job-output-failure.txt
+ egrep "^.+\| node2 \| Exception: Test module failure exception fail-loop" job-output-failure.txt
diff --git a/playbooks/zuul-stream/post.yaml b/playbooks/zuul-stream/post.yaml
index 2c717a82e..4beb8d1f9 100644
--- a/playbooks/zuul-stream/post.yaml
+++ b/playbooks/zuul-stream/post.yaml
@@ -10,9 +10,6 @@
state: directory
delegate_to: localhost
- - name: Rename job-output.txt
- command: mv job-output.txt stream-job-output.txt
-
- name: Fetch files
synchronize:
src: "{{ ansible_user_dir }}/{{ item }}"
@@ -21,5 +18,10 @@
with_items:
- logging.json
- ansible.cfg
- - stream-job-output.txt
- - job-output.json
+ - console-job-output-success-19887.txt
+ - job-output-success-19887.txt
+ - job-output-success-19887.json
+ - job-output-success-19885.txt
+ - job-output-success-19885.json
+ - job-output-failure.txt
+ - job-output-failure.json
diff --git a/playbooks/zuul-stream/validate.yaml b/playbooks/zuul-stream/validate.yaml
new file mode 100644
index 000000000..73ccd873a
--- /dev/null
+++ b/playbooks/zuul-stream/validate.yaml
@@ -0,0 +1,38 @@
+- name: Validate output - setupvar
+ shell: |
+ egrep "^.*\| node1 \|\s+\"setupvar\": {" {{ item }}
+ egrep "^.*\| node2 \|\s+\"setupvar\": {" {{ item }}
+
+- name: Validate output - shell task
+ shell: |
+ egrep "^.*\| node1 \| 1: lo:" {{ item }}
+ egrep "^.*\| node2 \| 1: lo:" {{ item }}
+
+- name: Validate output - loop with items
+ shell: |
+ egrep "^.+\| node1 \| ok: Item: item1" {{ item }}
+ egrep "^.+\| node1 \| ok: Item: item2" {{ item }}
+ egrep "^.+\| node1 \| ok: Item: item3" {{ item }}
+ egrep "^.+\| node2 \| ok: Item: item1" {{ item }}
+ egrep "^.+\| node2 \| ok: Item: item2" {{ item }}
+ egrep "^.+\| node2 \| ok: Item: item3" {{ item }}
+
+- name: Validate output - loop with complex items
+ shell: |
+ egrep "^.+\| node1 \| ok: Item: Runtime" {{ item }}
+ egrep "^.+\| node2 \| ok: Item: Runtime" {{ item }}
+
+- name: Validate output - failed shell task
+ shell: |
+ egrep "^.+\| node1 \| Exception: Test module failure exception task" {{ item }}
+ egrep "^.+\| node2 \| Exception: Test module failure exception task" {{ item }}
+
+- name: Validate output - item loop with exception
+ shell: |
+ egrep "^.+\| node1 \| Exception: Test module failure exception loop" {{ item }}
+ egrep "^.+\| node2 \| Exception: Test module failure exception loop" {{ item }}
+
+- name: Validate output - binary data
+ shell: |
+ egrep "^.*\| node1 \| \\\\x80abc" {{ item }}
+ egrep "^.*\| node2 \| \\\\x80abc" {{ item }}
diff --git a/tests/base.py b/tests/base.py
index 5a85ea0d7..cebcf2e1f 100644
--- a/tests/base.py
+++ b/tests/base.py
@@ -3091,6 +3091,8 @@ class FakeBuild(object):
self.paused = False
self.aborted = False
self.requeue = False
+ self.should_fail = False
+ self.should_retry = False
self.created = time.time()
self.changes = None
items = self.parameters['zuul']['items']
@@ -3162,6 +3164,8 @@ class FakeBuild(object):
return result
def shouldFail(self):
+ if self.should_fail:
+ return True
changes = self.executor_server.fail_tests.get(self.name, [])
for change in changes:
if self.hasChanges(change):
@@ -3169,6 +3173,8 @@ class FakeBuild(object):
return False
def shouldRetry(self):
+ if self.should_retry:
+ return True
entries = self.executor_server.retry_tests.get(self.name, [])
for entry in entries:
if self.hasChanges(entry['change']):
@@ -3662,7 +3668,7 @@ class FakeSMTP(object):
class FakeNodepool(object):
REQUEST_ROOT = '/nodepool/requests'
NODE_ROOT = '/nodepool/nodes'
- LAUNCHER_ROOT = '/nodepool/launchers'
+ COMPONENT_ROOT = '/nodepool/components'
log = logging.getLogger("zuul.test.FakeNodepool")
@@ -3726,10 +3732,11 @@ class FakeNodepool(object):
self.fulfillRequest(req)
def registerLauncher(self, labels=["label1"], id="FakeLauncher"):
- path = os.path.join(self.LAUNCHER_ROOT, id)
+ path = os.path.join(self.COMPONENT_ROOT, 'pool', id)
data = {'id': id, 'supported_labels': labels}
self.client.create(
- path, json.dumps(data).encode('utf8'), makepath=True)
+ path, json.dumps(data).encode('utf8'),
+ ephemeral=True, makepath=True, sequence=True)
def getNodeRequests(self):
try:
diff --git a/tests/fixtures/layouts/job-dedup-false.yaml b/tests/fixtures/layouts/job-dedup-false.yaml
index 2c0e6ee2e..9254f0b41 100644
--- a/tests/fixtures/layouts/job-dedup-false.yaml
+++ b/tests/fixtures/layouts/job-dedup-false.yaml
@@ -39,6 +39,7 @@
- job:
name: common-job
deduplicate: false
+ pre-run: playbooks/pre.yaml
required-projects:
- org/project1
- org/project2
diff --git a/tests/fixtures/layouts/job-dedup-noop.yaml b/tests/fixtures/layouts/job-dedup-noop.yaml
new file mode 100644
index 000000000..9383fd8b6
--- /dev/null
+++ b/tests/fixtures/layouts/job-dedup-noop.yaml
@@ -0,0 +1,55 @@
+- queue:
+ name: integrated
+ allow-circular-dependencies: true
+
+- pipeline:
+ name: gate
+ manager: dependent
+ success-message: Build succeeded (gate).
+ require:
+ gerrit:
+ approval:
+ - Approved: 1
+ trigger:
+ gerrit:
+ - event: comment-added
+ approval:
+ - Approved: 1
+ success:
+ gerrit:
+ Verified: 2
+ submit: true
+ failure:
+ gerrit:
+ Verified: -2
+ start:
+ gerrit:
+ Verified: 0
+ precedence: high
+
+- job:
+ name: base
+ parent: null
+ pre-run: playbooks/pre.yaml
+ run: playbooks/run.yaml
+ nodeset:
+ nodes:
+ - label: debian
+ name: controller
+
+- job:
+ name: common-job
+ required-projects:
+ - org/project1
+
+- job:
+ name: project1-job
+
+- project:
+ name: org/project1
+ queue: integrated
+ gate:
+ jobs:
+ - noop
+ - common-job
+ - project1-job
diff --git a/tests/remote/test_remote_zuul_stream.py b/tests/remote/test_remote_zuul_stream.py
index 1f6b7fff7..1c705127e 100644
--- a/tests/remote/test_remote_zuul_stream.py
+++ b/tests/remote/test_remote_zuul_stream.py
@@ -29,7 +29,7 @@ class FunctionalZuulStreamMixIn:
self.log_console_port = 19000 + int(
self.ansible_core_version.split('.')[1])
self.executor_server.log_console_port = self.log_console_port
- self.wait_timeout = 120
+ self.wait_timeout = 180
self.fake_nodepool.remote_ansible = True
ansible_remote = os.environ.get('ZUUL_REMOTE_IPV4')
diff --git a/tests/unit/test_circular_dependencies.py b/tests/unit/test_circular_dependencies.py
index ac5ad13f5..2223008d5 100644
--- a/tests/unit/test_circular_dependencies.py
+++ b/tests/unit/test_circular_dependencies.py
@@ -1703,6 +1703,151 @@ class TestGerritCircularDependencies(ZuulTestCase):
], ordered=False)
self.assertEqual(len(self.fake_nodepool.history), 3)
+ @simple_layout('layouts/job-dedup-false.yaml')
+ def test_job_deduplication_false_failed_job(self):
+ # Test that if we are *not* deduplicating jobs, we don't
+ # duplicate the result on two different builds.
+ # The way we check that is to retry the common-job between two
+ # items, but only once, and only on one item. The other item
+ # should be unaffected.
+ self.executor_server.hold_jobs_in_build = True
+ A = self.fake_gerrit.addFakeChange('org/project1', 'master', 'A')
+ B = self.fake_gerrit.addFakeChange('org/project2', 'master', 'B')
+
+ # A <-> B
+ A.data["commitMessage"] = "{}\n\nDepends-On: {}\n".format(
+ A.subject, B.data["url"]
+ )
+ B.data["commitMessage"] = "{}\n\nDepends-On: {}\n".format(
+ B.subject, A.data["url"]
+ )
+
+ A.addApproval('Code-Review', 2)
+ B.addApproval('Code-Review', 2)
+
+ self.fake_gerrit.addEvent(A.addApproval('Approved', 1))
+ self.fake_gerrit.addEvent(B.addApproval('Approved', 1))
+
+ # If we don't make sure these jobs finish first, then one of
+ # the items may complete before the other and cause Zuul to
+ # abort the project*-job on the other item (with a "bundle
+ # failed to merge" error).
+ self.waitUntilSettled()
+ for build in self.builds:
+ if build.name == 'common-job' and build.project == 'org/project1':
+ break
+ else:
+ raise Exception("Unable to find build")
+ build.should_retry = True
+
+ # Store a reference to the queue items so we can inspect their
+ # internal attributes later to double check the retry build
+ # count is correct.
+ tenant = self.scheds.first.sched.abide.tenants.get('tenant-one')
+ pipeline = tenant.layout.pipelines['gate']
+ items = pipeline.getAllItems()
+ self.assertEqual(len(items), 2)
+
+ self.executor_server.release('project1-job')
+ self.executor_server.release('project2-job')
+ self.waitUntilSettled()
+ self.executor_server.hold_jobs_in_build = False
+ self.executor_server.release()
+ self.waitUntilSettled()
+
+ self.assertEqual(A.data['status'], 'MERGED')
+ self.assertEqual(B.data['status'], 'MERGED')
+ self.assertHistory([
+ dict(name="project2-job", result="SUCCESS", changes="2,1 1,1"),
+ dict(name="project1-job", result="SUCCESS", changes="2,1 1,1"),
+ dict(name="common-job", result=None, changes="2,1 1,1"),
+ dict(name="common-job", result="SUCCESS", changes="2,1 1,1"),
+ dict(name="common-job", result="SUCCESS", changes="2,1 1,1"),
+ ], ordered=False)
+ self.assertEqual(len(self.fake_nodepool.history), 5)
+ self.assertEqual(items[0].change.project.name, 'org/project2')
+ self.assertEqual(len(items[0].current_build_set.retry_builds), 0)
+ self.assertEqual(items[1].change.project.name, 'org/project1')
+ self.assertEqual(len(items[1].current_build_set.retry_builds), 1)
+
+ @simple_layout('layouts/job-dedup-auto-shared.yaml')
+ def test_job_deduplication_multi_scheduler(self):
+ # Test that a second scheduler can correctly refresh
+ # deduplicated builds
+ self.executor_server.hold_jobs_in_build = True
+ A = self.fake_gerrit.addFakeChange('org/project1', 'master', 'A')
+ B = self.fake_gerrit.addFakeChange('org/project1', 'master', 'B')
+
+ # A <-> B
+ A.data["commitMessage"] = "{}\n\nDepends-On: {}\n".format(
+ A.subject, B.data["url"]
+ )
+ B.data["commitMessage"] = "{}\n\nDepends-On: {}\n".format(
+ B.subject, A.data["url"]
+ )
+
+ A.addApproval('Code-Review', 2)
+ B.addApproval('Code-Review', 2)
+
+ self.fake_gerrit.addEvent(A.addApproval('Approved', 1))
+ self.fake_gerrit.addEvent(B.addApproval('Approved', 1))
+
+ self.waitUntilSettled()
+
+ app = self.createScheduler()
+ app.start()
+ self.assertEqual(len(self.scheds), 2)
+
+ # Hold the lock on the first scheduler so that only the second
+ # will act.
+ with self.scheds.first.sched.run_handler_lock:
+ self.executor_server.hold_jobs_in_build = False
+ self.executor_server.release()
+ self.waitUntilSettled(matcher=[app])
+
+ self.assertEqual(A.data['status'], 'MERGED')
+ self.assertEqual(B.data['status'], 'MERGED')
+ self.assertHistory([
+ dict(name="project1-job", result="SUCCESS", changes="2,1 1,1"),
+ dict(name="common-job", result="SUCCESS", changes="2,1 1,1"),
+ ], ordered=False)
+
+ @simple_layout('layouts/job-dedup-noop.yaml')
+ def test_job_deduplication_noop(self):
+ # Test that we don't deduplicate noop (there's no good reason
+ # to do so)
+ A = self.fake_gerrit.addFakeChange('org/project1', 'master', 'A')
+ B = self.fake_gerrit.addFakeChange('org/project1', 'master', 'B')
+
+ # A <-> B
+ A.data["commitMessage"] = "{}\n\nDepends-On: {}\n".format(
+ A.subject, B.data["url"]
+ )
+ B.data["commitMessage"] = "{}\n\nDepends-On: {}\n".format(
+ B.subject, A.data["url"]
+ )
+
+ A.addApproval('Code-Review', 2)
+ B.addApproval('Code-Review', 2)
+
+ self.fake_gerrit.addEvent(A.addApproval('Approved', 1))
+ self.fake_gerrit.addEvent(B.addApproval('Approved', 1))
+
+ self.waitUntilSettled()
+
+ self.assertEqual(A.data['status'], 'MERGED')
+ self.assertEqual(B.data['status'], 'MERGED')
+ self.assertHistory([
+ dict(name="project1-job", result="SUCCESS", changes="2,1 1,1"),
+ dict(name="common-job", result="SUCCESS", changes="2,1 1,1"),
+ ], ordered=False)
+ # It's tricky to get info about a noop build, but the jobs in
+ # the report have the build UUID, so we make sure it's
+ # different.
+ a_noop = [l for l in A.messages[-1].split('\n') if 'noop' in l][0]
+ b_noop = [l for l in B.messages[-1].split('\n') if 'noop' in l][0]
+ self.assertNotEqual(a_noop, b_noop)
+
@simple_layout('layouts/job-dedup-retry.yaml')
def test_job_deduplication_retry(self):
A = self.fake_gerrit.addFakeChange('org/project1', 'master', 'A')
diff --git a/tests/unit/test_github_driver.py b/tests/unit/test_github_driver.py
index f9f3371fa..49dae0ccb 100644
--- a/tests/unit/test_github_driver.py
+++ b/tests/unit/test_github_driver.py
@@ -1384,7 +1384,7 @@ class TestGithubDriver(ZuulTestCase):
# now check if the merge was done via rebase
merges = [report for report in self.fake_github.github_data.reports
if report[2] == 'merge']
- assert(len(merges) == 1 and merges[0][3] == 'squash')
+ assert (len(merges) == 1 and merges[0][3] == 'squash')
@simple_layout('layouts/basic-github.yaml', driver='github')
def test_invalid_event(self):
diff --git a/tests/unit/test_scheduler.py b/tests/unit/test_scheduler.py
index eb5ee826f..66c508fea 100644
--- a/tests/unit/test_scheduler.py
+++ b/tests/unit/test_scheduler.py
@@ -226,6 +226,18 @@ class TestSchedulerZoneFallback(ZuulTestCase):
def test_jobs_executed(self):
"Test that jobs are executed and a change is merged per zone"
self.hold_jobs_in_queue = True
+
+ # Validate that the reported executor stats are correct. Since
+        # the executor accepts zoned and unzoned jobs it should be counted
+ # in both metrics.
+ self.assertReportedStat(
+ 'zuul.executors.online', value='1', kind='g')
+ self.assertReportedStat(
+ 'zuul.executors.unzoned.online', value='1', kind='g')
+ self.assertReportedStat(
+ 'zuul.executors.zone.test-provider_vpn.online',
+ value='1', kind='g')
+
A = self.fake_gerrit.addFakeChange('org/project', 'master', 'A')
A.addApproval('Code-Review', 2)
self.fake_gerrit.addEvent(A.addApproval('Approved', 1))
@@ -452,6 +464,7 @@ class TestScheduler(ZuulTestCase):
'zuul.tenant.tenant-one.reconfiguration_time',
'zuul.tenant.tenant-one.pipeline.gate.event_enqueue_time',
'zuul.tenant.tenant-one.pipeline.gate.merge_request_time',
+ 'zuul.tenant.tenant-one.pipeline.gate.merger_merge_op_time',
'zuul.tenant.tenant-one.pipeline.gate.job_freeze_time',
'zuul.tenant.tenant-one.pipeline.gate.node_request_time',
'zuul.tenant.tenant-one.pipeline.gate.job_wait_time',
@@ -5731,8 +5744,7 @@ For CI problems and help debugging, contact ci@example.org"""
self.assertEqual(A.reported, 2)
self.assertTrue(re.search('project-merge .* NODE_FAILURE',
A.messages[1]))
- self.assertTrue(re.search('project-test1 .* SKIPPED', A.messages[1]))
- self.assertTrue(re.search('project-test2 .* SKIPPED', A.messages[1]))
+        self.assertIn('Skipped 2 jobs', A.messages[1])
def test_nodepool_resources(self):
"Test that resources are reported"
@@ -6854,7 +6866,7 @@ class TestDependencyGraph(ZuulTestCase):
self.assertHistory([
dict(name='build', result='FAILURE', changes='1,1'),
], ordered=False)
- self.assertIn('SKIPPED', A.messages[0])
+ self.assertIn('Skipped 1 job', A.messages[0])
class TestDuplicatePipeline(ZuulTestCase):
diff --git a/tests/unit/test_sos.py b/tests/unit/test_sos.py
index 7d95f2548..f371a8064 100644
--- a/tests/unit/test_sos.py
+++ b/tests/unit/test_sos.py
@@ -55,6 +55,79 @@ class TestScaleOutScheduler(ZuulTestCase):
dict(name='project-test2', result='SUCCESS', changes='1,1'),
], ordered=False)
+ def test_pipeline_cache_clear(self):
+ # Test that the pipeline cache on a second scheduler isn't
+ # holding old change objects.
+
+ # Hold jobs in build
+ sched1 = self.scheds.first
+ self.executor_server.hold_jobs_in_build = True
+
+ # We need a pair of changes in order to populate the pipeline
+ # change cache (a single change doesn't activate the cache,
+ # it's for dependencies).
+ A = self.fake_gerrit.addFakeChange('org/project', 'master', 'A')
+ A.addApproval('Code-Review', 2)
+ B = self.fake_gerrit.addFakeChange('org/project', 'master', 'B')
+ B.addApproval('Code-Review', 2)
+ B.addApproval('Approved', 1)
+ B.setDependsOn(A, 1)
+
+ # Fail a job
+ self.executor_server.failJob('project-test1', A)
+
+ # Enqueue into gate with scheduler 1
+ self.fake_gerrit.addEvent(A.addApproval('Approved', 1))
+ self.waitUntilSettled()
+
+ # Start scheduler 2
+ sched2 = self.createScheduler()
+ sched2.start()
+ self.assertEqual(len(self.scheds), 2)
+
+ # Pause scheduler 1
+ with sched1.sched.run_handler_lock:
+ # Release jobs
+ self.executor_server.hold_jobs_in_build = False
+ self.executor_server.release()
+ # Wait for scheduler 2 to dequeue
+ self.waitUntilSettled(matcher=[sched2])
+ # Unpause scheduler 1
+ self.assertEqual(A.data['status'], 'NEW')
+ self.assertEqual(B.data['status'], 'NEW')
+
+ # Clear zk change cache
+ self.fake_gerrit._change_cache.prune([], max_age=0)
+
+ # At this point, scheduler 1 should have a bogus change entry
+ # in the pipeline cache because scheduler 2 performed the
+ # dequeue so scheduler 1 never cleaned up its cache.
+
+ self.executor_server.fail_tests.clear()
+ self.executor_server.hold_jobs_in_build = True
+ # Pause scheduler 1
+ with sched1.sched.run_handler_lock:
+ # Enqueue into gate with scheduler 2
+ self.fake_gerrit.addEvent(A.addApproval('Approved', 1))
+ self.waitUntilSettled(matcher=[sched2])
+
+ # Pause scheduler 2
+ with sched2.sched.run_handler_lock:
+ # Make sure that scheduler 1 does some pipeline runs which
+ # reconstitute state from ZK. This gives it the
+ # opportunity to use old cache data if we don't clear it.
+
+ # Release job1
+ self.executor_server.release()
+ self.waitUntilSettled(matcher=[sched1])
+ # Release job2
+ self.executor_server.hold_jobs_in_build = False
+ self.executor_server.release()
+ # Wait for scheduler 1 to merge change
+ self.waitUntilSettled(matcher=[sched1])
+ self.assertEqual(A.data['status'], 'MERGED')
+ self.assertEqual(B.data['status'], 'MERGED')
+
@simple_layout('layouts/multi-scheduler-status.yaml')
def test_multi_scheduler_status(self):
self.hold_merge_jobs_in_queue = True
diff --git a/tests/unit/test_v3.py b/tests/unit/test_v3.py
index 9fd8ea1d3..a89bb3007 100644
--- a/tests/unit/test_v3.py
+++ b/tests/unit/test_v3.py
@@ -4930,7 +4930,7 @@ class TestDataReturn(AnsibleZuulTestCase):
self.assertIn(
'- data-return https://zuul.example.com/',
A.messages[-1])
- self.assertTrue(re.search('child .* SKIPPED', A.messages[-1]))
+        self.assertIn('Skipped 1 job', A.messages[-1])
self.assertIn('Build succeeded', A.messages[-1])
def test_data_return_invalid_child_job(self):
@@ -4943,7 +4943,7 @@ class TestDataReturn(AnsibleZuulTestCase):
self.assertIn(
'- data-return-invalid-child-job https://zuul.example.com',
A.messages[-1])
- self.assertTrue(re.search('data-return .* SKIPPED', A.messages[-1]))
+        self.assertIn('Skipped 1 job', A.messages[-1])
self.assertIn('Build succeeded', A.messages[-1])
def test_data_return_skip_all_child_jobs(self):
@@ -4957,8 +4957,7 @@ class TestDataReturn(AnsibleZuulTestCase):
self.assertIn(
'- data-return-skip-all https://zuul.example.com/',
A.messages[-1])
- self.assertTrue(re.search('child .* SKIPPED', A.messages[-1]))
- self.assertTrue(re.search('data-return .* SKIPPED', A.messages[-1]))
+        self.assertIn('Skipped 2 jobs', A.messages[-1])
self.assertIn('Build succeeded', A.messages[-1])
def test_data_return_skip_all_child_jobs_with_soft_dependencies(self):
@@ -4972,8 +4971,7 @@ class TestDataReturn(AnsibleZuulTestCase):
])
self.assertIn('- data-return-cd https://zuul.example.com/',
A.messages[-1])
- self.assertTrue(re.search('data-return-a .* SKIPPED', A.messages[-1]))
- self.assertTrue(re.search('data-return-b .* SKIPPED', A.messages[-1]))
+        self.assertIn('Skipped 2 jobs', A.messages[-1])
self.assertIn('Build succeeded', A.messages[-1])
def test_several_zuul_return(self):
@@ -4987,7 +4985,7 @@ class TestDataReturn(AnsibleZuulTestCase):
self.assertIn(
'- several-zuul-return-child https://zuul.example.com/',
A.messages[-1])
- self.assertTrue(re.search('data-return .* SKIPPED', A.messages[-1]))
+        self.assertIn('Skipped 1 job', A.messages[-1])
self.assertIn('Build succeeded', A.messages[-1])
def test_data_return_skip_retry(self):
@@ -6905,7 +6903,7 @@ class TestJobPause(AnsibleZuulTestCase):
dict(name='compile', result='SUCCESS', changes='1,1'),
])
- self.assertTrue(re.search('test .* SKIPPED', A.messages[0]))
+        self.assertIn('Skipped 1 job', A.messages[0])
def test_job_pause_pre_skipped_child(self):
"""
@@ -6953,7 +6951,7 @@ class TestJobPause(AnsibleZuulTestCase):
dict(name='compile', result='SUCCESS', changes='1,1'),
])
- self.assertTrue(re.search('test .* SKIPPED', A.messages[0]))
+        self.assertIn('Skipped 1 job', A.messages[0])
def test_job_pause_skipped_child_retry(self):
"""
@@ -7822,7 +7820,7 @@ class TestProvidesRequiresMysql(ZuulTestCase):
dict(name='image-builder', result='FAILURE', changes='1,1'),
dict(name='hold', result='SUCCESS', changes='1,1'),
], ordered=False)
- self.assertTrue(re.search('image-user .* SKIPPED', A.messages[0]))
+        self.assertIn('Skipped 1 job', A.messages[0])
B = self.fake_gerrit.addFakeChange('org/project1', 'master', 'B')
B.data['commitMessage'] = '%s\n\nDepends-On: %s\n' % (
diff --git a/web/public/openapi.yaml b/web/public/openapi.yaml
index 312a1907e..b101c66e0 100644
--- a/web/public/openapi.yaml
+++ b/web/public/openapi.yaml
@@ -474,11 +474,9 @@ components:
description: The event id
type: string
item_ahead:
- description: The list of events ahead
- items:
- type: string
- type: array
- item_behind:
+ description: The event ahead
+ type: string
+ items_behind:
description: The list of events behind
items:
type: string
diff --git a/web/src/containers/status/ChangePanel.jsx b/web/src/containers/status/ChangePanel.jsx
index 33fc4687c..dd4fc27e5 100644
--- a/web/src/containers/status/ChangePanel.jsx
+++ b/web/src/containers/status/ChangePanel.jsx
@@ -18,6 +18,7 @@ import { connect } from 'react-redux'
import { Link } from 'react-router-dom'
import * as moment from 'moment'
import 'moment-duration-format'
+import { Button } from '@patternfly/react-core'
class ChangePanel extends React.Component {
@@ -30,9 +31,11 @@ class ChangePanel extends React.Component {
constructor () {
super()
this.state = {
- expanded: false
+ expanded: false,
+ showSkipped: false,
}
this.onClick = this.onClick.bind(this)
+ this.toggleSkippedJobs = this.toggleSkippedJobs.bind(this)
this.clicked = false
}
@@ -120,12 +123,13 @@ class ChangePanel extends React.Component {
}
renderProgressBar (change) {
- let jobPercent = (100 / change.jobs.length).toFixed(2)
+    const interestingJobs = change.jobs.filter(j => this.jobStrResult(j) !== 'skipped')
+    let jobPercent = (100 / interestingJobs.length).toFixed(2)
return (
<div className='progress zuul-change-total-result'>
{change.jobs.map((job, idx) => {
let result = this.jobStrResult(job)
- if (['queued', 'waiting'].includes(result)) {
+ if (['queued', 'waiting', 'skipped'].includes(result)) {
return ''
}
let className = ''
@@ -144,7 +148,6 @@ class ChangePanel extends React.Component {
className = ' progress-bar-warning'
break
case 'paused':
- case 'skipped':
className = ' progress-bar-info'
break
default:
@@ -302,15 +305,39 @@ class ChangePanel extends React.Component {
</span>)
}
+ toggleSkippedJobs (e) {
+ // Skip middle mouse button
+ if (e.button === 1) {
+ return
+ }
+ this.setState({ showSkipped: !this.state.showSkipped })
+ }
+
renderJobList (jobs, times) {
+ const [buttonText, interestingJobs] = this.state.showSkipped ?
+ ['Hide', jobs] :
+ ['Show', jobs.filter(j => this.jobStrResult(j) !== 'skipped')]
+ const skippedJobCount = jobs.length - interestingJobs.length
+
return (
- <ul className='list-group zuul-patchset-body'>
- {jobs.map((job, idx) => (
- <li key={idx} className='list-group-item zuul-change-job'>
- {this.renderJob(job, times.jobs[job.name])}
- </li>
- ))}
- </ul>)
+ <>
+ <ul className='list-group zuul-patchset-body'>
+ {interestingJobs.map((job, idx) => (
+ <li key={idx} className='list-group-item zuul-change-job'>
+ {this.renderJob(job, times.jobs[job.name])}
+ </li>
+ ))}
+ {(this.state.showSkipped || skippedJobCount) ? (
+ <li key='last' className='list-group-item zuul-change-job'>
+ <Button variant="link" className='zuul-skipped-jobs-button'
+ onClick={this.toggleSkippedJobs}>
+ {buttonText} {skippedJobCount ? skippedJobCount : ''} skipped job{skippedJobCount === 1 ? '' : 's'}
+ </Button>
+ </li>
+ ) : ''}
+ </ul>
+ </>
+ )
}
calculateTimes (change) {
diff --git a/web/src/containers/status/ChangePanel.test.jsx b/web/src/containers/status/ChangePanel.test.jsx
index 5a4be8602..cd27edc73 100644
--- a/web/src/containers/status/ChangePanel.test.jsx
+++ b/web/src/containers/status/ChangePanel.test.jsx
@@ -16,6 +16,7 @@ import React from 'react'
import { Link, BrowserRouter as Router } from 'react-router-dom'
import { Provider } from 'react-redux'
import { create } from 'react-test-renderer'
+import { Button } from '@patternfly/react-core'
import { setTenantAction } from '../../actions/tenant'
import configureStore from '../../store'
@@ -45,6 +46,8 @@ it('change panel render multi tenant links', () => {
const jobLink = application.root.findByType(Link)
expect(jobLink.props.to).toEqual(
'/t/tenant-one/stream/42')
+  const skipButton = application.root.findAllByType(Button)
+  expect(skipButton).toHaveLength(0)
})
it('change panel render white-label tenant links', () => {
@@ -60,4 +63,29 @@ it('change panel render white-label tenant links', () => {
const jobLink = application.root.findByType(Link)
expect(jobLink.props.to).toEqual(
'/stream/42')
+  const skipButton = application.root.findAllByType(Button)
+  expect(skipButton).toHaveLength(0)
+})
+
+it('change panel skip jobs', () => {
+ const fakeChange = {
+ project: 'org-project',
+ jobs: [{
+ name: 'job-name',
+ url: 'stream/42',
+ result: 'skipped'
+ }]
+ }
+
+ const store = configureStore()
+ store.dispatch(setTenantAction('tenant-one', true))
+ const application = create(
+ <Provider store={store}>
+ <Router>
+ <ChangePanel change={fakeChange} globalExpanded={true} />
+ </Router>
+ </Provider>
+ )
+  const skipButton = application.root.findByType(Button)
+  expect(skipButton.props.children.join('')).toContain('skipped job')
})
diff --git a/web/src/index.css b/web/src/index.css
index e8c67a372..eddbca673 100644
--- a/web/src/index.css
+++ b/web/src/index.css
@@ -189,6 +189,11 @@ a.refresh {
font-size: small;
}
+.zuul-skipped-jobs-button {
+ font-size: small;
+ padding: 0;
+}
+
.zuul-non-voting-desc {
font-size: smaller;
}
diff --git a/zuul/ansible/base/callback/zuul_stream.py b/zuul/ansible/base/callback/zuul_stream.py
index 720261cb2..f31983ed6 100644
--- a/zuul/ansible/base/callback/zuul_stream.py
+++ b/zuul/ansible/base/callback/zuul_stream.py
@@ -48,6 +48,7 @@ from zuul.ansible import paths
from zuul.ansible import logconfig
LOG_STREAM_PORT = int(os.environ.get("ZUUL_CONSOLE_PORT", 19885))
+LOG_STREAM_VERSION = 0
def zuul_filter_result(result):
@@ -103,6 +104,7 @@ class CallbackModule(default.CallbackModule):
self._items_done = False
self._deferred_result = None
self._playbook_name = None
+ self._zuul_console_version = 0
def configure_logger(self):
# ansible appends timestamp, user and pid to the log lines emitted
@@ -129,9 +131,7 @@ class CallbackModule(default.CallbackModule):
else:
self._display.display(msg)
- def _read_log(self, host, ip, port, log_id, task_name, hosts):
- self._log("[%s] Starting to log %s for task %s"
- % (host, log_id, task_name), job=False, executor=True)
+ def _read_log_connect(self, host, ip, port):
logger_retries = 0
while True:
try:
@@ -141,6 +141,7 @@ class CallbackModule(default.CallbackModule):
# logs continuously. Without this we can easily trip the 5
# second timeout.
s.settimeout(None)
+ return s
except socket.timeout:
self._log(
"Timeout exception waiting for the logger. "
@@ -151,7 +152,7 @@ class CallbackModule(default.CallbackModule):
"Timeout exception waiting for the logger. "
"Please check connectivity to [%s:%s]"
% (ip, port))
- return
+ return None
except Exception:
if logger_retries % 10 == 0:
self._log("[%s] Waiting on logger" % host,
@@ -159,31 +160,77 @@ class CallbackModule(default.CallbackModule):
logger_retries += 1
time.sleep(0.1)
continue
- msg = "%s\n" % log_id
- s.send(msg.encode("utf-8"))
- buff = s.recv(4096)
- buffering = True
- while buffering:
- if b'\n' in buff:
- (line, buff) = buff.split(b'\n', 1)
- # We can potentially get binary data here. In order to
- # being able to handle that use the backslashreplace
- # error handling method. This decodes unknown utf-8
- # code points to escape sequences which exactly represent
- # the correct data without throwing a decoding exception.
- done = self._log_streamline(
- host, line.decode("utf-8", "backslashreplace"))
- if done:
- return
+
+ def _read_log(self, host, ip, port, log_id, task_name, hosts):
+ self._log("[%s] Starting to log %s for task %s"
+ % (host, log_id, task_name), job=False, executor=True)
+
+ s = self._read_log_connect(host, ip, port)
+ if s is None:
+ # Can't connect; _read_log_connect() already logged an
+ # error for us, just bail
+ return
+
+ # Find out what version we are running against
+ s.send(f'v:{LOG_STREAM_VERSION}\n'.encode('utf-8'))
+ buff = s.recv(1024).decode('utf-8').strip()
+
+ # NOTE(ianw) 2022-07-21 : zuul_console instances from < 6.3.0
+ # do not understand this protocol. They will assume the send
+ # above is a log request and send back the not-found
+ # message in a loop. So to handle this we disconnect and
+ # reconnect. When we decide to remove this, we can remove
+ # anything in the "version 0" path.
+ if buff == '[Zuul] Log not found':
+ s.close()
+ s = self._read_log_connect(host, ip, port)
+ if s is None:
+ return
+ else:
+ self._zuul_console_version = int(buff)
+ self._log('[%s] Reports streaming version: %d' %
+ (host, self._zuul_console_version),
+ job=False, executor=True)
+
+ if self._zuul_console_version >= 1:
+ msg = 's:%s\n' % log_id
+ else:
+ msg = '%s\n' % log_id
+
+ s.send(msg.encode("utf-8"))
+ buff = s.recv(4096)
+ buffering = True
+ while buffering:
+ if b'\n' in buff:
+ (line, buff) = buff.split(b'\n', 1)
+ # We can potentially get binary data here. In order to
+ # be able to handle that, use the backslashreplace
+ # error handling method. This decodes unknown utf-8
+ # code points to escape sequences which exactly represent
+ # the correct data without throwing a decoding exception.
+ done = self._log_streamline(
+ host, line.decode("utf-8", "backslashreplace"))
+ if done:
+ if self._zuul_console_version > 0:
+ try:
+ # reestablish connection and tell console to
+ # clean up
+ s = self._read_log_connect(host, ip, port)
+ s.send(f'f:{log_id}\n'.encode('utf-8'))
+ s.close()
+ except Exception:
+ # Don't worry if this fails
+ pass
+ return
+ else:
+ more = s.recv(4096)
+ if not more:
+ buffering = False
else:
- more = s.recv(4096)
- if not more:
- buffering = False
- else:
- buff += more
- if buff:
- self._log_streamline(
- host, buff.decode("utf-8", "backslashreplace"))
+ buff += more
+ if buff:
+ self._log_streamline(
+ host, buff.decode("utf-8", "backslashreplace"))
def _log_streamline(self, host, line):
if "[Zuul] Task exit code" in line:
diff --git a/zuul/ansible/base/library/zuul_console.py b/zuul/ansible/base/library/zuul_console.py
index 9dffbbc3a..aa999cac1 100755
--- a/zuul/ansible/base/library/zuul_console.py
+++ b/zuul/ansible/base/library/zuul_console.py
@@ -24,6 +24,14 @@ import subprocess
import threading
import time
+# This is the version we report to the zuul_stream callback. It is
+# expected that this (zuul_console) process can be long-lived, so if
+# there are updates this ensures a later streaming callback can still
+# talk to us.
+ZUUL_CONSOLE_PROTO_VERSION = 1
+# This is the template for the file name of the log-file written out
+# by the command.py override command in the executor's Ansible
+# install.
LOG_STREAM_FILE = '/tmp/console-{log_uuid}.log'
LOG_STREAM_PORT = 19885
@@ -162,15 +170,49 @@ class Server(object):
ret = buffer.decode('utf-8')
x = ret.find('\n')
if x > 0:
- return ret[:x]
+ return ret[:x].strip()
except UnicodeDecodeError:
pass
- def handleOneConnection(self, conn):
- log_uuid = self.get_command(conn)
+ def _clean_uuid(self, log_uuid):
# use path split to make sure the input isn't trying to be clever
# and construct some path like /tmp/console-/../../something
- log_uuid = os.path.split(log_uuid.rstrip())[-1]
+ return os.path.split(log_uuid)[-1]
+
+ def handleOneConnection(self, conn):
+ # V1 protocol
+ # -----------
+ # v:<ver> get version number, <ver> is remote version
+ # s:<uuid> send logs for <uuid>
+ # f:<uuid> finalise/cleanup <uuid>
+ while True:
+ command = self.get_command(conn)
+ if command.startswith('v:'):
+ # NOTE(ianw) : remote sends its version. We currently
+ # don't have anything to do with this value, so ignore
+ # for now.
+ cmd = '%s\n' % (ZUUL_CONSOLE_PROTO_VERSION)
+ conn.send(cmd.encode('utf-8'))
+ continue
+ elif command.startswith('f:'):
+ log_uuid = self._clean_uuid(command[2:])
+ try:
+ os.unlink(self.path.format(log_uuid=log_uuid))
+ except Exception:
+ # something might have cleaned /tmp
+ pass
+ continue
+ elif command.startswith('s:'):
+ log_uuid = self._clean_uuid(command[2:])
+ break
+ else:
+ # NOTE(ianw): 2022-07-21 In releases < 6.3.0 the streaming
+ # side would just send a raw uuid and nothing else; so by
+ # default assume that is what is coming in here. We can
+ # remove this fallback when we decide it is no longer
+ # necessary.
+ log_uuid = self._clean_uuid(command)
+ break
# FIXME: this won't notice disconnects until it tries to send
console = None
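The dispatch in handleOneConnection() reduces to a small parser; this self-contained sketch (parse_command() is a hypothetical name) mirrors the three V1 commands plus the bare-uuid fallback for pre-6.3.0 streamers:

    import os

    def parse_command(command):
        # Returns (action, argument); mirrors the branches above.
        def clean(uuid):
            # As in _clean_uuid(): use path split so the input can't
            # smuggle in separators like '../../something'.
            return os.path.split(uuid)[-1]

        if command.startswith('v:'):
            return 'version', command[2:]
        if command.startswith('f:'):
            return 'finalize', clean(command[2:])
        if command.startswith('s:'):
            return 'stream', clean(command[2:])
        # Pre-6.3.0 clients send a raw uuid and nothing else.
        return 'stream', clean(command)

    assert parse_command('s:abc123') == ('stream', 'abc123')
    assert parse_command('f:../../etc') == ('finalize', 'etc')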
diff --git a/zuul/driver/gerrit/gerritconnection.py b/zuul/driver/gerrit/gerritconnection.py
index 6aea4388b..1ec334915 100644
--- a/zuul/driver/gerrit/gerritconnection.py
+++ b/zuul/driver/gerrit/gerritconnection.py
@@ -1470,7 +1470,7 @@ class GerritConnection(ZKChangeCacheMixin, ZKBranchCacheMixin, BaseConnection):
# for large projects like nova
alldata = []
chunk, more_changes = _query_chunk(query, event)
- while(chunk):
+ while chunk:
alldata.extend(chunk)
if more_changes is None:
# continue sortKey based (before Gerrit 2.9)
diff --git a/zuul/driver/github/githubreporter.py b/zuul/driver/github/githubreporter.py
index de62f2565..73d3a6f13 100644
--- a/zuul/driver/github/githubreporter.py
+++ b/zuul/driver/github/githubreporter.py
@@ -135,9 +135,12 @@ class GithubReporter(BaseReporter):
def _formatItemReportJobs(self, item):
# Return the list of jobs portion of the report
ret = ''
- jobs_fields = self._getItemReportJobsFields(item)
+ jobs_fields, skipped = self._getItemReportJobsFields(item)
for job_fields in jobs_fields:
ret += self._formatJobResult(job_fields)
+ if skipped:
+ jobtext = 'job' if skipped == 1 else 'jobs'
+ ret += 'Skipped %i %s\n' % (skipped, jobtext)
return ret
def addPullComment(self, item, comment=None):
diff --git a/zuul/driver/pagure/pagurereporter.py b/zuul/driver/pagure/pagurereporter.py
index 0bfdbc9b8..b38035752 100644
--- a/zuul/driver/pagure/pagurereporter.py
+++ b/zuul/driver/pagure/pagurereporter.py
@@ -67,9 +67,12 @@ class PagureReporter(BaseReporter):
def _formatItemReportJobs(self, item):
# Return the list of jobs portion of the report
ret = ''
- jobs_fields = self._getItemReportJobsFields(item)
+ jobs_fields, skipped = self._getItemReportJobsFields(item)
for job_fields in jobs_fields:
ret += '- [%s](%s) : %s%s%s%s\n' % job_fields[:6]
+ if skipped:
+ jobtext = 'job' if skipped == 1 else 'jobs'
+ ret += 'Skipped %i %s\n' % (skipped, jobtext)
return ret
def addPullComment(self, item, comment=None):
diff --git a/zuul/driver/smtp/smtpreporter.py b/zuul/driver/smtp/smtpreporter.py
index 4815026ab..a5d8938c1 100644
--- a/zuul/driver/smtp/smtpreporter.py
+++ b/zuul/driver/smtp/smtpreporter.py
@@ -42,7 +42,7 @@ class SMTPReporter(BaseReporter):
if 'subject' in self.config:
subject = self.config['subject'].format(
- change=item.change)
+ change=item.change, pipeline=item.pipeline.getSafeAttributes())
else:
subject = "Report for change {change} against {ref}".format(
change=item.change, ref=item.change.ref)
diff --git a/zuul/executor/server.py b/zuul/executor/server.py
index 2165a797b..e00612e9e 100644
--- a/zuul/executor/server.py
+++ b/zuul/executor/server.py
@@ -3157,6 +3157,10 @@ class ExecutorServer(BaseMergeServer):
self.allow_unzoned = get_default(self.config, 'executor',
'allow_unzoned', False)
+ # If this executor has no zone configured it is implicitly unzoned
+ if self.zone is None:
+ self.allow_unzoned = True
+
# Those attributes won't change, so it's enough to set them once on the
# component info.
self.component_info.zone = self.zone
@@ -3735,7 +3739,7 @@ class ExecutorServer(BaseMergeServer):
sensor.reportStats(self.statsd, base_key)
def finishJob(self, unique):
- del(self.job_workers[unique])
+ del self.job_workers[unique]
self.log.debug(
"Finishing Job: %s, queue(%d): %s",
unique,
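The implicit-unzoned rule added above can be stated as a one-line predicate; a sketch with a hypothetical helper name:

    def effective_allow_unzoned(zone, allow_unzoned):
        # An executor with no zone configured is implicitly unzoned;
        # otherwise the configured allow_unzoned setting applies.
        return True if zone is None else allow_unzoned

    assert effective_allow_unzoned(None, False) is True
    assert effective_allow_unzoned('zone-a', False) is False
    assert effective_allow_unzoned('zone-a', True) is True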
diff --git a/zuul/manager/__init__.py b/zuul/manager/__init__.py
index 642aededd..a66f5ad22 100644
--- a/zuul/manager/__init__.py
+++ b/zuul/manager/__init__.py
@@ -235,6 +235,9 @@ class PipelineManager(metaclass=ABCMeta):
resolved_changes.append(change)
return resolved_changes
+ def clearCache(self):
+ self._change_cache.clear()
+
def _maintainCache(self):
active_layout_uuids = set()
referenced_change_keys = set()
@@ -1768,9 +1771,12 @@ class PipelineManager(metaclass=ABCMeta):
build_in_items = [item]
if item.bundle:
for other_item in item.bundle.items:
- if other_item not in build_in_items:
- if other_item.current_build_set.getBuild(build.job.name):
- build_in_items.append(other_item)
+ if other_item in build_in_items:
+ continue
+ other_build = other_item.current_build_set.getBuild(
+ build.job.name)
+ if other_build is not None and other_build is build:
+ build_in_items.append(other_item)
for item in build_in_items:
# We don't care about some actions below if this build
# isn't in the current buildset, so determine that before
@@ -1817,18 +1823,27 @@ class PipelineManager(metaclass=ABCMeta):
# We're the second of the files/merger pair, report the stat
self.reportPipelineTiming('merge_request_time',
build_set.configured_time)
+ if event.elapsed_time:
+ self.reportPipelineTiming('merger_files_changes_op_time',
+ event.elapsed_time, elapsed=True)
def onMergeCompleted(self, event, build_set):
if build_set.merge_state == build_set.COMPLETE:
self._onGlobalRepoStateCompleted(event, build_set)
self.reportPipelineTiming('repo_state_time',
build_set.repo_state_request_time)
+ if event.elapsed_time:
+ self.reportPipelineTiming('merger_repo_state_op_time',
+ event.elapsed_time, elapsed=True)
else:
self._onMergeCompleted(event, build_set)
if build_set.files_state == build_set.COMPLETE:
# We're the second of the files/merger pair, report the stat
self.reportPipelineTiming('merge_request_time',
build_set.configured_time)
+ if event.elapsed_time:
+ self.reportPipelineTiming('merger_merge_op_time',
+ event.elapsed_time, elapsed=True)
def _onMergeCompleted(self, event, build_set):
@@ -2120,7 +2135,7 @@ class PipelineManager(metaclass=ABCMeta):
except Exception:
self.log.exception("Exception reporting pipeline stats")
- def reportPipelineTiming(self, key, start, end=None):
+ def reportPipelineTiming(self, key, start, end=None, elapsed=False):
if not self.sched.statsd:
return
if not start:
@@ -2130,5 +2145,8 @@ class PipelineManager(metaclass=ABCMeta):
pipeline = self.pipeline
tenant = pipeline.tenant
stats_key = f'zuul.tenant.{tenant.name}.pipeline.{pipeline.name}'
- dt = (end - start) * 1000
+ if elapsed:
+ dt = start
+ else:
+ dt = (end - start) * 1000
self.sched.statsd.timing(f'{stats_key}.{key}', dt)
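reportPipelineTiming() now accepts either a start timestamp or a precomputed duration; condensed into a free function (hypothetical name, with the statsd client stubbed as a callable):

    import time

    def pipeline_timing(timing, key, start, end=None, elapsed=False):
        # Sketch of the two modes above: elapsed=True forwards the
        # caller's duration unchanged; otherwise derive milliseconds
        # from a start (and optional end) timestamp.
        if not start:
            return
        if elapsed:
            dt = start
        else:
            end = end or time.time()
            dt = (end - start) * 1000
        timing(key, dt)

    pipeline_timing(print, 'merge_request_time', time.time() - 0.25)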
diff --git a/zuul/merger/client.py b/zuul/merger/client.py
index 362644b98..29fa39aaf 100644
--- a/zuul/merger/client.py
+++ b/zuul/merger/client.py
@@ -159,7 +159,9 @@ class MergeClient(object):
"via result event for %s", merge_request)
if merge_request.job_type == MergeRequest.FILES_CHANGES:
event = FilesChangesCompletedEvent(
- merge_request.build_set_uuid, files=None
+ merge_request.build_set_uuid,
+ files=None,
+ elapsed_time=None,
)
else:
event = MergeCompletedEvent(
@@ -172,6 +174,7 @@ class MergeClient(object):
repo_state=None,
item_in_branches=None,
errors=None,
+ elapsed_time=None,
)
try:
self.result_events[merge_request.tenant_name][
diff --git a/zuul/merger/server.py b/zuul/merger/server.py
index 91597714f..fe5b938a1 100644
--- a/zuul/merger/server.py
+++ b/zuul/merger/server.py
@@ -256,6 +256,7 @@ class BaseMergeServer(metaclass=ABCMeta):
def executeMergeJob(self, merge_request, params):
result = None
+ start = time.monotonic()
if merge_request.job_type == MergeRequest.MERGE:
result = self.merge(merge_request, params)
elif merge_request.job_type == MergeRequest.CAT:
@@ -264,6 +265,8 @@ class BaseMergeServer(metaclass=ABCMeta):
result = self.refstate(merge_request, params)
elif merge_request.job_type == MergeRequest.FILES_CHANGES:
result = self.fileschanges(merge_request, params)
+ end = time.monotonic()
+ result['elapsed_time'] = end - start
return result
def cat(self, merge_request, args):
@@ -376,6 +379,7 @@ class BaseMergeServer(metaclass=ABCMeta):
item_in_branches = result.get("item_in_branches", [])
files = result.get("files", {})
errors = result.get("errors", [])
+ elapsed_time = result.get("elapsed_time")
log.info(
"Merge %s complete, merged: %s, updated: %s, commit: %s, "
@@ -407,7 +411,9 @@ class BaseMergeServer(metaclass=ABCMeta):
)
if merge_request.job_type == MergeRequest.FILES_CHANGES:
event = FilesChangesCompletedEvent(
- merge_request.build_set_uuid, files
+ merge_request.build_set_uuid,
+ files,
+ elapsed_time,
)
else:
event = MergeCompletedEvent(
@@ -420,6 +426,7 @@ class BaseMergeServer(metaclass=ABCMeta):
repo_state,
item_in_branches,
errors,
+ elapsed_time,
)
def put_complete_event(log, merge_request, event):
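Condensed, the measurement in executeMergeJob() is a monotonic-clock wrapper around the dispatch; a sketch with a hypothetical handler table:

    import time

    def execute_with_timing(handlers, job_type, params):
        # Time the whole operation with a monotonic clock (immune to
        # wall-clock jumps) and attach the duration in seconds, as the
        # elapsed_time result field above does.
        start = time.monotonic()
        result = handlers[job_type](params)
        result['elapsed_time'] = time.monotonic() - start
        return result

    handlers = {'merge': lambda params: {'merged': True}}
    print(execute_with_timing(handlers, 'merge', {}))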
diff --git a/zuul/model.py b/zuul/model.py
index a07d5a640..963332826 100644
--- a/zuul/model.py
+++ b/zuul/model.py
@@ -694,6 +694,11 @@ class PipelineState(zkobject.ZKObject):
return json.dumps(data, sort_keys=True).encode("utf8")
def deserialize(self, raw, context):
+ # We may have old change objects in the pipeline cache, so
+ # make sure they are the same objects we would get from the
+ # source change cache.
+ self.pipeline.manager.clearCache()
+
data = super().deserialize(raw, context)
existing_queues = {
q.getPath(): q for q in self.queues + self.old_queues
@@ -742,8 +747,56 @@ class PipelineState(zkobject.ZKObject):
"queues": queues,
"old_queues": old_queues,
})
+ if context.build_references:
+ self._fixBuildReferences(data, context)
+ context.build_references = False
return data
+ def _fixBuildReferences(self, data, context):
+ # Reconcile duplicate builds; if we find any BuildReference
+ # objects, look up the actual builds and replace
+ log = context.log
+ build_map = {}
+ to_replace_dicts = []
+ to_replace_lists = []
+ for queue in data['queues'] + data['old_queues']:
+ for item in queue.queue:
+ buildset = item.current_build_set
+ for build_job, build in buildset.builds.items():
+ if isinstance(build, BuildReference):
+ to_replace_dicts.append((item,
+ buildset.builds,
+ build_job,
+ build._path))
+ else:
+ build_map[build.getPath()] = build
+ for job_name, build_list in buildset.retry_builds.items():
+ for build in build_list:
+ if isinstance(build, BuildReference):
+ to_replace_lists.append((item,
+ build_list,
+ build,
+ build._path))
+ else:
+ build_map[build.getPath()] = build
+ for (item, build_dict, build_job, build_path) in to_replace_dicts:
+ orig_build = build_map.get(build_path)
+ if orig_build:
+ build_dict[build_job] = orig_build
+ else:
+ log.warning("Unable to find deduplicated build %s for %s",
+ build_path, item)
+ del build_dict[build_job]
+ for (item, build_list, build, build_path) in to_replace_lists:
+ idx = build_list.index(build)
+ orig_build = build_map.get(build_path)
+ if orig_build:
+ build_list[idx] = build_map[build_path]
+ else:
+ log.warning("Unable to find deduplicated build %s for %s",
+ build_path, item)
+ del build_list[idx]
+
def _getKnownItems(self):
items = []
for queue in (*self.old_queues, *self.queues):
@@ -3424,6 +3477,11 @@ class BuildRequest(JobRequest):
)
+class BuildReference:
+ def __init__(self, _path):
+ self._path = _path
+
+
class Build(zkobject.ZKObject):
"""A Build is an instance of a single execution of a Job.
@@ -3852,6 +3910,13 @@ class BuildSet(zkobject.ZKObject):
}
return json.dumps(data, sort_keys=True).encode("utf8")
+ def _isMyBuild(self, build_path):
+ parts = build_path.split('/')
+ buildset_uuid = parts[-5]
+ if buildset_uuid == self.uuid:
+ return True
+ return False
+
def deserialize(self, raw, context):
data = super().deserialize(raw, context)
# Set our UUID so that getPath() returns the correct path for
@@ -3944,8 +4009,12 @@ class BuildSet(zkobject.ZKObject):
if not build.result:
build.refresh(context)
else:
- build = Build.fromZK(
- context, build_path, job=job, build_set=self)
+ if not self._isMyBuild(build_path):
+ build = BuildReference(build_path)
+ context.build_references = True
+ else:
+ build = Build.fromZK(
+ context, build_path, job=job, build_set=self)
builds[job_name] = build
for retry_path in data["retry_builds"].get(job_name, []):
@@ -3954,8 +4023,12 @@ class BuildSet(zkobject.ZKObject):
# Retry builds never change.
pass
else:
- retry_build = Build.fromZK(
- context, retry_path, job=job, build_set=self)
+ if not self._isMyBuild(retry_path):
+ retry_build = BuildReference(retry_path)
+ context.build_references = True
+ else:
+ retry_build = Build.fromZK(
+ context, retry_path, job=job, build_set=self)
retry_builds[job_name].append(retry_build)
data.update({
@@ -6180,12 +6253,13 @@ class MergeCompletedEvent(ResultEvent):
:arg dict repo_state: The starting repo state before the merge.
:arg list item_in_branches: A list of branches in which the final
commit in the merge list appears (changes without refs).
- :arg list errors: A list of error message strings
+ :arg list errors: A list of error message strings.
+ :arg float elapsed_time: Elapsed time of merge op in seconds.
"""
def __init__(self, request_uuid, build_set_uuid, merged, updated,
commit, files, repo_state, item_in_branches,
- errors):
+ errors, elapsed_time):
self.request_uuid = request_uuid
self.build_set_uuid = build_set_uuid
self.merged = merged
@@ -6195,6 +6269,7 @@ class MergeCompletedEvent(ResultEvent):
self.repo_state = repo_state or {}
self.item_in_branches = item_in_branches or []
self.errors = errors or []
+ self.elapsed_time = elapsed_time
def __repr__(self):
return ('<MergeCompletedEvent job: %s buildset: %s merged: %s '
@@ -6214,6 +6289,7 @@ class MergeCompletedEvent(ResultEvent):
"repo_state": dict(self.repo_state),
"item_in_branches": list(self.item_in_branches),
"errors": list(self.errors),
+ "elapsed_time": self.elapsed_time,
}
@classmethod
@@ -6228,6 +6304,7 @@ class MergeCompletedEvent(ResultEvent):
dict(data.get("repo_state", {})),
list(data.get("item_in_branches", [])),
list(data.get("errors", [])),
+ data.get("elapsed_time"),
)
@@ -6236,16 +6313,19 @@ class FilesChangesCompletedEvent(ResultEvent):
:arg BuildSet build_set: The build_set which is ready.
:arg list files: List of files changed.
+ :arg float elapsed_time: Elapsed time of merge op in seconds.
"""
- def __init__(self, build_set_uuid, files):
+ def __init__(self, build_set_uuid, files, elapsed_time):
self.build_set_uuid = build_set_uuid
self.files = files or []
+ self.elapsed_time = elapsed_time
def toDict(self):
return {
"build_set_uuid": self.build_set_uuid,
"files": list(self.files),
+ "elapsed_time": self.elapsed_time,
}
@classmethod
@@ -6253,6 +6333,7 @@ class FilesChangesCompletedEvent(ResultEvent):
return cls(
data.get("build_set_uuid"),
list(data.get("files", [])),
+ data.get("elapsed_time"),
)
@@ -7078,6 +7159,7 @@ class Layout(object):
noop = Job('noop')
noop.description = 'A job that will always succeed, no operation.'
noop.parent = noop.BASE_JOB_MARKER
+ noop.deduplicate = False
noop.run = (PlaybookContext(None, 'noop.yaml', [], []),)
self.jobs = {'noop': [noop]}
self.nodesets = {}
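The BuildReference repair above is an instance of a general two-pass pattern: deserialize placeholders for objects owned elsewhere, then swap each placeholder for the canonical instance once all real objects are known. A toy, self-contained sketch (classes and names are illustrative, not Zuul's model):

    class Ref:
        def __init__(self, path):
            self.path = path

    class Build:
        def __init__(self, path):
            self.path = path

    def fix_references(builds):
        # builds: job name -> real Build (with .path) or Ref placeholder.
        real = {b.path: b for b in builds.values() if not isinstance(b, Ref)}
        fixed = {}
        for job, b in builds.items():
            if isinstance(b, Ref):
                b = real.get(b.path)
                if b is None:
                    continue  # dangling reference: drop it, as above
            fixed[job] = b
        return fixed

    b = Build('/zuul/builds/1234')
    out = fix_references({'tox': b, 'linters': Ref('/zuul/builds/1234')})
    assert out['linters'] is b

In the real _fixBuildReferences() the lookup map is built across every item in every queue, so a reference held by one item resolves to the deduplicated build owned by another item in the same bundle.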
diff --git a/zuul/reporter/__init__.py b/zuul/reporter/__init__.py
index 9b8f2c11c..5723316a3 100644
--- a/zuul/reporter/__init__.py
+++ b/zuul/reporter/__init__.py
@@ -257,9 +257,13 @@ class BaseReporter(object, metaclass=abc.ABCMeta):
# Extract the report elements from an item
config = self.connection.sched.config
jobs_fields = []
+ skipped = 0
for job in item.getJobs():
build = item.current_build_set.getBuild(job.name)
(result, url) = item.formatJobResult(job)
+ if result == 'SKIPPED':
+ skipped += 1
+ continue
if not job.voting:
voting = ' (non-voting)'
else:
@@ -300,12 +304,15 @@ class BaseReporter(object, metaclass=abc.ABCMeta):
success_message = job.success_message
jobs_fields.append(
(name, url, result, error, elapsed, voting, success_message))
- return jobs_fields
+ return jobs_fields, skipped
def _formatItemReportJobs(self, item):
# Return the list of jobs portion of the report
ret = ''
- jobs_fields = self._getItemReportJobsFields(item)
+ jobs_fields, skipped = self._getItemReportJobsFields(item)
for job_fields in jobs_fields:
ret += '- %s%s : %s%s%s%s\n' % job_fields[:6]
+ if skipped:
+ jobtext = 'job' if skipped == 1 else 'jobs'
+ ret += 'Skipped %i %s\n' % (skipped, jobtext)
return ret
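Each reporter above appends the same trailer; the pluralization rule reduces to a tiny helper (hypothetical name):

    def skipped_summary(skipped):
        # Mirrors the 'Skipped N job(s)' trailer appended above.
        if not skipped:
            return ''
        jobtext = 'job' if skipped == 1 else 'jobs'
        return 'Skipped %i %s\n' % (skipped, jobtext)

    assert skipped_summary(0) == ''
    assert skipped_summary(1) == 'Skipped 1 job\n'
    assert skipped_summary(3) == 'Skipped 3 jobs\n'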
diff --git a/zuul/scheduler.py b/zuul/scheduler.py
index 272235757..dfc922cf1 100644
--- a/zuul/scheduler.py
+++ b/zuul/scheduler.py
@@ -438,12 +438,12 @@ class Scheduler(threading.Thread):
mergers_online = 0
for executor_component in self.component_registry.all("executor"):
- if executor_component.allow_unzoned or not executor_component.zone:
+ if executor_component.allow_unzoned:
if executor_component.state == BaseComponent.RUNNING:
executors_unzoned_online += 1
if executor_component.accepting_work:
executors_unzoned_accepting += 1
- else:
+ if executor_component.zone:
zone_stats = zoned_executor_stats.setdefault(
executor_component.zone,
executor_stats_default.copy())
diff --git a/zuul/zk/nodepool.py b/zuul/zk/nodepool.py
index 109053f11..1be4dc2b5 100644
--- a/zuul/zk/nodepool.py
+++ b/zuul/zk/nodepool.py
@@ -12,6 +12,7 @@
import json
import logging
+import os
import time
from enum import Enum
from typing import Optional, List
@@ -43,7 +44,7 @@ class ZooKeeperNodepool(ZooKeeperBase):
Class implementing Nodepool related ZooKeeper interface.
"""
NODES_ROOT = "/nodepool/nodes"
- LAUNCHER_ROOT = "/nodepool/launchers"
+ COMPONENT_ROOT = "/nodepool/components"
REQUEST_ROOT = '/nodepool/requests'
REQUEST_LOCK_ROOT = "/nodepool/requests-lock"
HOLD_REQUEST_ROOT = '/zuul/hold-requests'
@@ -95,9 +96,6 @@ class ZooKeeperNodepool(ZooKeeperBase):
self._node_tree.close()
self._node_tree = None
- def _launcherPath(self, launcher):
- return "%s/%s" % (self.LAUNCHER_ROOT, launcher)
-
def _nodePath(self, node):
return "%s/%s" % (self.NODES_ROOT, node)
@@ -113,15 +111,15 @@ class ZooKeeperNodepool(ZooKeeperBase):
:returns: A list of Launcher objects, or empty list if none are found.
"""
+ root_path = os.path.join(self.COMPONENT_ROOT, 'pool')
try:
- launcher_ids = self.kazoo_client\
- .get_children(self.LAUNCHER_ROOT)
+ pools = self.kazoo_client.get_children(root_path)
except NoNodeError:
return []
objs = []
- for launcher in launcher_ids:
- path = self._launcherPath(launcher)
+ for pool in pools:
+ path = os.path.join(root_path, pool)
try:
data, _ = self.kazoo_client.get(path)
except NoNodeError:
diff --git a/zuul/zk/zkobject.py b/zuul/zk/zkobject.py
index aa32b8b9b..8de3f34ba 100644
--- a/zuul/zk/zkobject.py
+++ b/zuul/zk/zkobject.py
@@ -43,6 +43,7 @@ class ZKContext:
self.cumulative_write_znodes = 0
self.cumulative_read_bytes = 0
self.cumulative_write_bytes = 0
+ self.build_references = False
def sessionIsValid(self):
return ((not self.lock or self.lock.is_still_valid()) and