37 files changed, 1235 insertions, 144 deletions
diff --git a/doc/source/developer/index.rst b/doc/source/developer/index.rst index 52266a175..b45c75640 100644 --- a/doc/source/developer/index.rst +++ b/doc/source/developer/index.rst @@ -14,6 +14,7 @@ Zuul, though advanced users may find it interesting. drivers triggers testing + metrics docs ansible javascript diff --git a/doc/source/developer/metrics.rst b/doc/source/developer/metrics.rst new file mode 100644 index 000000000..913a591ba --- /dev/null +++ b/doc/source/developer/metrics.rst @@ -0,0 +1,74 @@ +:title: Metrics + +Metrics +======= + +Event Overview +-------------- + +The following table illustrates the event and pipeline processing +sequence as it relates to some of the metrics described in +:ref:`statsd`. This is intended as general guidance only and is not +an exhaustive list. + ++----------------------------------------+------+------+------+--------------------------------------+ +| Event | Metrics | Attribute | ++========================================+======+======+======+======================================+ +| Event generated by source | | | | event.timestamp | ++----------------------------------------+------+ + +--------------------------------------+ +| Enqueued into driver queue | | | | | ++----------------------------------------+------+ + +--------------------------------------+ +| Enqueued into tenant trigger queue | | | | event.arrived_at_scheduler_timestamp | ++----------------------------------------+ + [8] + +--------------------------------------+ +| Forwarded to matching pipelines | [1] | | | | ++----------------------------------------+ + + +--------------------------------------+ +| Changes enqueued ahead | | | | | ++----------------------------------------+ + + +--------------------------------------+ +| Change enqueued | | | | item.enqueue_time | ++----------------------------------------+------+------+ +--------------------------------------+ +| Changes enqueued behind | | | | | ++----------------------------------------+------+------+ +--------------------------------------+ +| Set item configuration | | | | build_set.configured_time | ++----------------------------------------+------+------+ +--------------------------------------+ +| Request files changed (if needed) | | | | | ++----------------------------------------+ +------+ +--------------------------------------+ +| Request merge | [2] | | | | ++----------------------------------------+ +------+ +--------------------------------------+ +| Wait for merge (and files if needed) | | | [9] | | ++----------------------------------------+------+------+ +--------------------------------------+ +| Generate dynamic layout (if needed) | [3] | | | | ++----------------------------------------+------+------+ +--------------------------------------+ +| Freeze job graph | [4] | | | | ++----------------------------------------+------+------+ +--------------------------------------+ +| Request global repo state (if needed) | | | | build_set.repo_state_request_time | ++----------------------------------------+ [5] +------+ +--------------------------------------+ +| Wait for global repo state (if needed) | | | | | ++----------------------------------------+------+------+ +--------------------------------------+ +| Deduplicate jobs | | | | | ++----------------------------------------+------+------+ +--------------------------------------+ +| Acquire semaphore (non-resources-first)| | | | | ++----------------------------------------+------+------+ +--------------------------------------+ +| Request nodes | | | 
| request.created_time | ++----------------------------------------+ [6] +------+ +--------------------------------------+ | Wait for nodes | | | | | ++----------------------------------------+------+------+ +--------------------------------------+ | Acquire semaphore (resources-first) | | | | | ++----------------------------------------+------+------+ +--------------------------------------+ | Enqueue build request | | | | build.execute_time | ++----------------------------------------+ [7] +------+ +--------------------------------------+ | Executor starts job | | | | build.start_time | ++----------------------------------------+------+------+------+--------------------------------------+ + +====== ============================= +Metric Name +====== ============================= +1 event_enqueue_processing_time +2 merge_request_time +3 layout_generation_time +4 job_freeze_time +5 repo_state_time +6 node_request_time +7 job_wait_time +8 event_enqueue_time +9 event_job_time +====== ============================= diff --git a/doc/source/developer/specs/index.rst b/doc/source/developer/specs/index.rst index d96df0c26..78c11bbb8 100644 --- a/doc/source/developer/specs/index.rst +++ b/doc/source/developer/specs/index.rst @@ -23,3 +23,4 @@ documentation instead. enhanced-regional-executors tenant-resource-quota community-matrix + tracing diff --git a/doc/source/developer/specs/tracing.rst b/doc/source/developer/specs/tracing.rst new file mode 100644 index 000000000..5e0e9e4d1 --- /dev/null +++ b/doc/source/developer/specs/tracing.rst @@ -0,0 +1,323 @@ +Tracing +======= + +.. warning:: This is not authoritative documentation. These features + are not currently available in Zuul. They may change significantly + before final implementation, or may never be fully completed. + +It can be difficult for a user to understand what steps were involved +between a trigger event (such as a patchset upload or recheck comment) +and a buildset report. If it took an unusually long time it can be +difficult to determine why. At present, an operator would need to +examine logs to determine what steps were involved and the sources of +any potential delays. Even experienced operators and developers can +take quite some time to first collect and then analyze logs to answer +these questions. + +Sometimes these answers may point to routine system operation (such as +a delay caused by many gate resets, or preparing a large number of +repositories). Other times they may point to deficiencies in the +system (insufficient mergers) or bugs in the code. + +Being able to visualize the activities of a Zuul system can help +operators (and potentially users) triage and diagnose issues more +quickly and accurately. Even if examining logs is ultimately required +in order to fully diagnose an issue, being able to narrow down the +scope using analysis tools can greatly simplify the process. + +Proposed Solution +----------------- + +Implementing distributed tracing in Zuul can help improve the +observability of the system and aid operators and potentially users in +understanding the sequence of events. + +By exporting information about the processing Zuul performs using the +OpenTelemetry API, information about Zuul operations can be collected +in any of several tools for analysis. + +OpenTelemetry is an Open Source protocol for exchanging observability +data, an SDK implementing that protocol, as well as an implementation +of a collector for distributing information to multiple backends.
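As a point of reference for the exporter mentioned above, the following is a minimal, illustrative sketch (not part of this change) of how a Python service such as a Zuul scheduler might configure the OpenTelemetry SDK to export spans over OTLP/gRPC to a collector like Jaeger; the endpoint, service name, and tracer name are assumptions for illustration only::

    # Illustrative sketch only -- not Zuul code.  Configures the
    # OpenTelemetry Python SDK to export spans via OTLP/gRPC.
    from opentelemetry import trace
    from opentelemetry.sdk.resources import Resource
    from opentelemetry.sdk.trace import TracerProvider
    from opentelemetry.sdk.trace.export import BatchSpanProcessor
    from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import (
        OTLPSpanExporter)

    provider = TracerProvider(
        resource=Resource.create({"service.name": "zuul-scheduler"}))
    # "collector:4317" is a placeholder endpoint for a Jaeger or other
    # OTLP-capable collector.
    provider.add_span_processor(
        BatchSpanProcessor(OTLPSpanExporter(endpoint="collector:4317",
                                            insecure=True)))
    trace.set_tracer_provider(provider)
    tracer = trace.get_tracer("zuul")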
+ +It supports three kinds of observability data: `traces`, `metrics`, +and `logs`. Since Zuul already has support for metrics and logs, this +specification proposes that we use only the support in OpenTelemetry +for `traces`. + +Usage Scenarios +~~~~~~~~~~~~~~~ + +Usage of OpenTelemetry should be entirely optional and supplementary +for any Zuul deployment. Log messages alone should continue to be +sufficient to analyze any potential problem. + +Should a deployer wish to use OpenTelemetry tracing data, a very +simple deployment for smaller sites may be constructed by running only +Jaeger. Jaeger is a service that can receive, store, and display +tracing information. The project distributes an all-in-one container +image which can store data in local filesystem storage. + +https://www.jaegertracing.io/ + +Larger sites may wish to run multiple collectors and feed data to +larger, distributed storage backends (such as Cassandra, +Elasticsearch, etc). + +Suitability to Zuul +~~~~~~~~~~~~~~~~~~~ + +OpenTelemetry tracing, at a high level, is designed to record +information about events, their timing, and their relation to other +events. At first this seems like a natural fit for Zuul, which reacts +to events, processes events, and generates more events. However, +OpenTelemetry's bias toward small and simple web applications is +evident throughout its documentation and the SDK implementation. + + Traces give us the big picture of what happens when a request is + made by user or an application. + +Zuul is not driven by user or application requests, and a system +designed to record several millisecond-long events which make up the +internal response to a user request of a web app is not necessarily +the obvious right choice for recording sequences and combinations of +events which frequently take hours (and sometimes days) to play out +across multiple systems. + +Fortunately, the concepts and protocol implementation of OpenTelemetry +are sufficiently well-designed for the general case to be able to +accommodate a system like Zuul, even if the SDK makes incompatible +assumptions that make integration difficult. There are some +challenges to implementation, but because the concepts appear to be +well matched, we should proceed with using the OpenTelemetry protocol +and SDK. + +Spans +~~~~~ + +The key tracing concepts in OpenTelemetry are `traces` and `spans`. +From a data model perspective, the unit of data storage is a `span`. +A trace itself is really just a unique ID that is common to multiple +spans. + +Spans can relate to other spans as either children or links. A trace +is generally considered to have a single 'root' span, and within the +time period represented by that span, it may have any number of child +spans (which may further have their own child spans). + +OpenTelemetry anticipates that a span on one system may spawn a child +span on another system and includes facilities for transferring enough +information about the parent span to a child system that the child +system alone can emit traces for its span and any children that it +spawns in turn. + +For a concrete example in Zuul, we might have a Zuul scheduler start a +span for a buildset, and then a merger might emit a child span for +performing the initial merge, and an executor might emit a child span +for executing a build. + +Spans can relate to other spans (including spans in other traces), so +sequences of events can be chained together without necessitating that +they all be part of the same span or trace.
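To make the span relationships just described concrete, here is a hedged sketch using the OpenTelemetry Python SDK: a parent span with nested child spans, plus a separate span that merely links to it. The span names ("BuildSet", "Merge", "Build", "QueueItem") are illustrative stand-ins rather than Zuul's actual instrumentation, and in Zuul the child spans would normally be emitted on other hosts::

    # Illustrative sketch of parent/child spans and a cross-trace link.
    from opentelemetry import trace

    tracer = trace.get_tracer("zuul")

    with tracer.start_as_current_span("BuildSet") as buildset_span:
        # In Zuul these would run on a merger and an executor; they are
        # nested locally here only for brevity.
        with tracer.start_as_current_span("Merge"):
            pass  # e.g. speculative merge work
        with tracer.start_as_current_span("Build"):
            pass  # e.g. job execution

    # A later span can reference the buildset span without being its child.
    link = trace.Link(buildset_span.get_span_context())
    with tracer.start_as_current_span("QueueItem", links=[link]):
        pass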
+ +Because Zuul processes series of events which may stretch for long +periods of time, we should specify what events and actions should +correspond to spans and traces. Spans can have arbitrary metadata +associated with them, so we will be able to search by event or job +IDs. + +The following sections describe traces and their child spans. + +Event Ingestion ++++++++++++++++ + +A trace will begin when Zuul receives an event and end when that event +has been enqueued into scheduler queues (or discarded). A driver +completing processing of an event is a definitive point in time so it +is easy to know when to close the root span for that event's trace +(whereas if we kept the trace open to include scheduler processing, we +would need to know when the last trigger event spawned by the +connection event was complete). + +This may include processing in internal queues by a given driver, and +these processing steps/queues should appear as their own child spans. +The spans should include event IDs (and potentially other information +about the event such as change or pull request numbers) as metadata. + +Tenant Event Processing ++++++++++++++++++++++++ + +A trace will begin when a scheduler begins processing a tenant event +and end when it has forwarded the event to all pipelines within a +tenant. It will link to the event ingestion trace as a follow-on +span. + +Queue Item +++++++++++ + +A trace will begin when an item is enqueued and end when it is +dequeued. This will be quite a long trace (hours or days). It is +expected to be the primary benefit of this telemetry effort as it will +show the entire lifetime of a queue item. It will link to the tenant +event processing trace as a follow-on span. + +Within the root span, there will be a span for each buildset (so that +if a gate reset happens and a new buildset is created, users will see +a series of buildset spans). Within a buildset, there will be spans +for all of the major processing steps, such as merge operations, +layout calculation, freezing the job graph, and freezing jobs. Each +build will also merit a span (retried builds will get their own spans +as well), and within a job span, there will be child spans for git +repo prep, job setup, individual playbooks, and cleanup. + +SDK Challenges +~~~~~~~~~~~~~~ + +As a high-level concept, the idea of spans for each of these +operations makes sense. In practice, the SDK makes implementation +challenging. + +The OpenTelemetry SDK makes no provision for beginning a span on one +system and ending it on another, so the fact that one Zuul scheduler +might start a buildset span while another ends it is problematic. + +Fortunately, the OpenTelemetry API only reports spans when they end, +not when they start. This means that we don't need to coordinate a +"start" API call on one scheduler with an "end" API call on another. +We can simply emit the trace with its root span at the end. However, +any child spans emitted during that time need to know the trace ID +they should use, which means that we at least need to store a trace ID +and start timestamp on our starting scheduler for use by any child +spans as well as the "end span" API call. + +The SDK does not support creating a span with a specific trace ID or +start timestamp (most timestamps are automatic), but it has +well-defined interfaces for spans and we can subclass the +implementation to allow us to specify trace IDs and timestamps.
With +this approach, we can "virtually" start a span on one host, store its +information in ZooKeeper with whatever long-lived object it is +associated with (such as a QueueItem) and then make it concrete on +another host when we end it. + +Alternatives +++++++++++++ + +This section describes some alternative ideas for dealing with the +SDK's mismatch with Zuul concepts as well as why they weren't +selected. + +* Multiple root spans with the same trace ID + + Jaeger handles this relatively well, and the timeline view appears + as expected (multiple events with whitespace between them). The + graph view in Jaeger may have some trouble displaying this. + + It is not clear that OpenTelemetry anticipates having multiple + "root" spans, so it may be best to avoid this in order to avoid + potential problems with other tools. + +* Child spans without a parent + + If we emit spans that specify a parent which does not exist, Jaeger + will display these traces but show a warning that the parent is + invalid. This may occur naturally while the system is operating + (builds complete while a buildset is running), but should be + eventually corrected once an item is dequeued. In case of a serious + error, we may never close a parent span, which would cause this to + persist. We should accept that this may happen, but try to avoid it + happening intentionally. + +Links +~~~~~ + +Links between spans are fairly primitive in Jaeger. While the +OpenTelemetry API includes attributes for links (so that when we link +a queue item to an event, we could specify that it was a forwarded +event), Jaeger does not store or render them. Instead, we are only +left with a reference to a ``< span in another trace >`` with a +reference type of ``FOLLOWS_FROM``. Clicking on that link will +immediately navigate to the other trace where metadata about the trace +will be visible, but before clicking on it, users will have little +idea of what awaits on the other side. + +For this reason, we should use span links sparingly so that when they +are encountered, users are likely to intuit what they are for and are +not overwhelmed by multiple indistinguishable links. + +Events and Exceptions +~~~~~~~~~~~~~~~~~~~~~ + +OpenTelemetry allows events to be added to spans. Events have their +own timestamp and attributes. These can be used to add additional +context to spans (representing single points in time rather than +events with duration that should be child spans). Examples might +include receiving a request to cancel a job or dequeue an item. + +Events should not be used as an alternative to logs, nor should all +log messages be copied as events. Events should be used sparingly to +avoid overwhelming the tracing storage with data and the user with +information. + +Exceptions may also be included in spans. This happens automatically +and by default when using the context managers supplied by the SDK. +Because many spans in Zuul will be unable to use the SDK context +managers and any exception information would need to be explicitly +handled and stored in ZooKeeper, we will disable inclusion of +exception information in spans. This will provide a more consistent +experience (so that users don't see the absence of an exception in +tracing information to indicate the absence of an error in logs) and +reduce the cost of supporting traces (extra storage in ZooKeeper and +in the telemetry storage). + +If we decide that exception information is worth including in the +future, this decision will be easy to revisit and reverse. 
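As a hedged illustration of the two mechanisms just described, the sketch below attaches a point-in-time event to a span and opts out of automatic exception recording; the span name, event name, and attribute key are assumptions, not the attributes Zuul will ultimately emit::

    # Sketch only: span events and disabled exception recording.
    import time

    from opentelemetry import trace

    tracer = trace.get_tracer("zuul")

    with tracer.start_as_current_span(
            "QueueItem",
            record_exception=False,          # do not attach exception details
            set_status_on_exception=False) as span:
        # Events carry their own timestamp (nanoseconds) and attributes.
        span.add_event("dequeue requested",
                       attributes={"zuul.event_id": "abc123"},
                       timestamp=time.time_ns())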
+ +Sensitive Information +~~~~~~~~~~~~~~~~~~~~~ + +No sensitive information (secrets, passwords, job variables, etc) +should be included in tracing output. All output should be suitable +for an audience of Zuul users (that is, if someone has access to the +Zuul dashboard, then tracing data should not have any more sensitive +information than they already have access to). For public-facing Zuul +systems (such as OpenDev), the information should be suitable for +public use. + +Protobuf and gRPC +~~~~~~~~~~~~~~~~~ + +The most efficient and straightforward method of transmitting data +from Zuul to a collector (including Jaeger) is using OTLP with gRPC +(OpenTelemetry Protocol + gRPC Remote Procedure Calls). Because +Protobuf applications include automatically generated code, we may +encounter the occasional version inconsistency. We may need to +navigate package requirements more than normal due to this (especially +if we have multiple packages that depend on protobuf). + +For a contemporary example, the OpenTelemetry project is in the +process of pinning to an older version of protobuf: + +https://github.com/open-telemetry/opentelemetry-python/issues/2717 + +There is an HTTP+JSON exporter as well, so in the case that something +goes very wrong with protobuf+gRPC, that may be available as a fallback. + +Work Items +---------- + +* Add OpenTelemetry SDK and support for configuring an exporter to + zuul.conf +* Implement SDK subclasses to support opening and closing spans on + different hosts +* Instrument event processing in each driver +* Instrument event processing in scheduler +* Instrument queue items and related spans +* Document a simple Jaeger setup as a quickstart add-on (similar to + authz) +* Optional: work with OpenDev to run a public Jaeger server for + OpenDev + +The last item is not required for this specification (and not our +choice as Zuul developers to make) but it would be nice if there were +one available so that all Zuul users and developers have a reference +implementation available for community collaboration. diff --git a/doc/source/monitoring.rst b/doc/source/monitoring.rst index 0c2cb4351..1cb61ee01 100644 --- a/doc/source/monitoring.rst +++ b/doc/source/monitoring.rst @@ -110,7 +110,27 @@ These metrics are emitted by the Zuul :ref:`scheduler`: operation(s). This will always include a request to a Zuul merger to speculatively merge the change, but it may also include a second request submitted in parallel to identify - the files altered by the change. + the files altered by the change. Includes + :stat:`zuul.tenant.<tenant>.pipeline.<pipeline>.merger_merge_op_time` + and + :stat:`zuul.tenant.<tenant>.pipeline.<pipeline>.merger_files_changes_op_time`. + + .. stat:: merger_merge_op_time + :type: timer + + The amount of time the merger spent performing a merge + operation. This does not include any of the round-trip time + from the scheduler to the merger, or any other merge + operations. + + .. stat:: merger_files_changes_op_time + :type: timer + + The amount of time the merger spent performing a files-changes + operation to detect changed files (this is sometimes + performed if the source does not provide this information). + This does not include any of the round-trip time from the + scheduler to the merger, or any other merge operations. .. 
stat:: layout_generation_time :type: timer @@ -128,7 +148,17 @@ These metrics are emitted by the Zuul :ref:`scheduler`: The amount of time waiting for a secondary Zuul merger operation to collect additional information about the repo - state of required projects. + state of required projects. Includes + :stat:`zuul.tenant.<tenant>.pipeline.<pipeline>.merger_repo_state_op_time`. + + .. stat:: merger_repo_state_op_time + :type: timer + + The amount of time the merger spent performing a repo state + operation to collect additional information about the repo + state of required projects. This does not include any of the + round-trip time from the scheduler to the merger, or any + other merge operations. .. stat:: node_request_time :type: timer diff --git a/playbooks/zuul-stream/fixtures/test-stream.yaml b/playbooks/zuul-stream/fixtures/test-stream.yaml index 05b75daf9..0326ae54e 100644 --- a/playbooks/zuul-stream/fixtures/test-stream.yaml +++ b/playbooks/zuul-stream/fixtures/test-stream.yaml @@ -1,3 +1,16 @@ +- name: Start zuul stream daemon + hosts: node + tasks: + + # NOTE : when new_console is set, this playbook runs with + # ZUUL_CONSOLE_PORT=19887 so that we test with the zuul_console + # from the Zuul checkout, and not the one started by test + # infrastructure. + - name: Start zuul_console on non-default port + zuul_console: + port: 19887 + when: new_console | default(false) + - name: Run some commands to show that logging works hosts: node tasks: @@ -51,3 +64,15 @@ - name: Print binary data command: echo -e '\x80abc' + + - name: Find any console log files + find: + paths: /tmp + patterns: 'console-*.log' + register: _tmp_files + + # We check this list in zuul-stream/functional.yaml to make sure + # we're cleaning up console log files. + - name: Dump tmp files + debug: + var: _tmp_files diff --git a/playbooks/zuul-stream/functional.yaml b/playbooks/zuul-stream/functional.yaml index ee1643fbc..7ae4704a9 100644 --- a/playbooks/zuul-stream/functional.yaml +++ b/playbooks/zuul-stream/functional.yaml @@ -7,71 +7,104 @@ # the python version of the platform is changed. python_path: "/usr/local/lib/python3.10/dist-packages" - - name: Run ansible that should succeed + - name: Run ansible that should succeed against testing console command: > /usr/lib/zuul/ansible/{{ zuul_ansible_version }}/bin/ansible-playbook + -e "new_console=true" src/opendev.org/zuul/zuul/playbooks/zuul-stream/fixtures/test-stream.yaml environment: + # Setup by test-stream.yaml so we start a new zuul_console + # from this checkout. + ZUUL_CONSOLE_PORT: 19887 ZUUL_JOB_LOG_CONFIG: "{{ ansible_user_dir}}/logging.json" ZUUL_JOBDIR: "{{ ansible_user_dir}}" PYTHONPATH: "{{ python_path }}" + register: _success_output - - name: Run ansible playbook that should fail + - name: Save raw output to file + copy: + content: '{{ _success_output.stdout }}' + dest: 'console-job-output-success-19887.txt' + + - name: Save output + shell: | + mv job-output.txt job-output-success-19887.txt + mv job-output.json job-output-success-19887.json + + - name: Check protocol version + assert: + that: + - "'[node1] Reports streaming version: 1' in _success_output.stdout" + + # Streamer puts out a line like + # [node1] Starting to log 916b2084-4bbb-80e5-248e-000000000016-1-node1 for task TASK: Print binary data + # One of the tasks in job-output shows find: results; + # the console file for this task should not be there. 
+ - name: Validate temporary files removed + shell: | + for f in $(grep 'Starting to log' console-job-output-success-19887.txt | awk '{print $5}'); do + echo "Checking ${f}" + if grep -q '"path": "/tmp/console-'${f}'.log"' job-output-success-19887.txt; then + echo "*** /tmp/${f}.log still exists" + exit 1 + fi + done + + # NOTE(ianw) 2022-07 : we deliberatly have this second step to run + # against the console setup by the infrastructure executor in the + # job pre playbooks as a backwards compatability sanity check. + - name: Run ansible that should succeed against extant console command: > /usr/lib/zuul/ansible/{{ zuul_ansible_version }}/bin/ansible-playbook - src/opendev.org/zuul/zuul/playbooks/zuul-stream/fixtures/test-stream-failure.yaml - register: failed_results - failed_when: "failed_results.rc != 2" + -e "new_console=false" + src/opendev.org/zuul/zuul/playbooks/zuul-stream/fixtures/test-stream.yaml environment: ZUUL_JOB_LOG_CONFIG: "{{ ansible_user_dir}}/logging.json" ZUUL_JOBDIR: "{{ ansible_user_dir}}" PYTHONPATH: "{{ python_path }}" + register: _success_output - - name: Validate output - setupvar - shell: | - egrep "^.*\| node1 \|\s+\"setupvar\": {" job-output.txt - egrep "^.*\| node2 \|\s+\"setupvar\": {" job-output.txt + - name: Save raw output to file + copy: + content: '{{ _success_output.stdout }}' + dest: 'console-job-output-success-19885.txt' - - name: Validate output - shell task + - name: Save output shell: | - egrep "^.*\| node1 \| 1: lo:" job-output.txt - egrep "^.*\| node2 \| 1: lo:" job-output.txt + mv job-output.txt job-output-success-19885.txt + mv job-output.json job-output-success-19885.json - - name: Validate output - loop with items - shell: | - egrep "^.+\| node1 \| ok: Item: item1" job-output.txt - egrep "^.+\| node1 \| ok: Item: item2" job-output.txt - egrep "^.+\| node1 \| ok: Item: item3" job-output.txt - egrep "^.+\| node2 \| ok: Item: item1" job-output.txt - egrep "^.+\| node2 \| ok: Item: item2" job-output.txt - egrep "^.+\| node2 \| ok: Item: item3" job-output.txt + - name: Validate outputs + include_tasks: validate.yaml + loop: + - job-output-success-19887.txt + - job-output-success-19885.txt - - name: Validate output - loop with complex items - shell: | - egrep "^.+\| node1 \| ok: Item: Runtime" job-output.txt - egrep "^.+\| node2 \| ok: Item: Runtime" job-output.txt + # failure case - - name: Validate output - failed shell task - shell: | - egrep "^.+\| node1 \| Exception: Test module failure exception task" job-output.txt - egrep "^.+\| node2 \| Exception: Test module failure exception task" job-output.txt + - name: Run ansible playbook that should fail + command: > + /usr/lib/zuul/ansible/{{ zuul_ansible_version }}/bin/ansible-playbook + src/opendev.org/zuul/zuul/playbooks/zuul-stream/fixtures/test-stream-failure.yaml + register: failed_results + failed_when: "failed_results.rc != 2" + environment: + ZUUL_CONSOLE_PORT: 19887 + ZUUL_JOB_LOG_CONFIG: "{{ ansible_user_dir}}/logging.json" + ZUUL_JOBDIR: "{{ ansible_user_dir}}" + PYTHONPATH: "{{ python_path }}" - - name: Validate output - item loop with exception + - name: Save output shell: | - egrep "^.+\| node1 \| Exception: Test module failure exception loop" job-output.txt - egrep "^.+\| node2 \| Exception: Test module failure exception loop" job-output.txt + mv job-output.txt job-output-failure.txt + mv job-output.json job-output-failure.json - name: Validate output - failure shell task with exception shell: | - egrep "^.+\| node1 \| Exception: Test module failure exception fail-task" 
job-output.txt - egrep "^.+\| node2 \| Exception: Test module failure exception fail-task" job-output.txt + egrep "^.+\| node1 \| Exception: Test module failure exception fail-task" job-output-failure.txt + egrep "^.+\| node2 \| Exception: Test module failure exception fail-task" job-output-failure.txt - name: Validate output - failure item loop with exception shell: | - egrep "^.+\| node1 \| Exception: Test module failure exception fail-loop" job-output.txt - egrep "^.+\| node2 \| Exception: Test module failure exception fail-loop" job-output.txt - - - name: Validate output - binary data - shell: | - egrep "^.*\| node1 \| \\\\x80abc" job-output.txt - egrep "^.*\| node2 \| \\\\x80abc" job-output.txt + egrep "^.+\| node1 \| Exception: Test module failure exception fail-loop" job-output-failure.txt + egrep "^.+\| node2 \| Exception: Test module failure exception fail-loop" job-output-failure.txt diff --git a/playbooks/zuul-stream/post.yaml b/playbooks/zuul-stream/post.yaml index 2c717a82e..4beb8d1f9 100644 --- a/playbooks/zuul-stream/post.yaml +++ b/playbooks/zuul-stream/post.yaml @@ -10,9 +10,6 @@ state: directory delegate_to: localhost - - name: Rename job-output.txt - command: mv job-output.txt stream-job-output.txt - - name: Fetch files synchronize: src: "{{ ansible_user_dir }}/{{ item }}" @@ -21,5 +18,10 @@ with_items: - logging.json - ansible.cfg - - stream-job-output.txt - - job-output.json + - console-job-output-success-19887.txt + - job-output-success-19887.txt + - job-output-success-19887.json + - job-output-success-19885.txt + - job-output-success-19885.json + - job-output-failure.txt + - job-output-failure.json diff --git a/playbooks/zuul-stream/validate.yaml b/playbooks/zuul-stream/validate.yaml new file mode 100644 index 000000000..73ccd873a --- /dev/null +++ b/playbooks/zuul-stream/validate.yaml @@ -0,0 +1,38 @@ +- name: Validate output - setupvar + shell: | + egrep "^.*\| node1 \|\s+\"setupvar\": {" {{ item }} + egrep "^.*\| node2 \|\s+\"setupvar\": {" {{ item }} + +- name: Validate output - shell task + shell: | + egrep "^.*\| node1 \| 1: lo:" {{ item }} + egrep "^.*\| node2 \| 1: lo:" {{ item }} + +- name: Validate output - loop with items + shell: | + egrep "^.+\| node1 \| ok: Item: item1" {{ item }} + egrep "^.+\| node1 \| ok: Item: item2" {{ item }} + egrep "^.+\| node1 \| ok: Item: item3" {{ item }} + egrep "^.+\| node2 \| ok: Item: item1" {{ item }} + egrep "^.+\| node2 \| ok: Item: item2" {{ item }} + egrep "^.+\| node2 \| ok: Item: item3" {{ item }} + +- name: Validate output - loop with complex items + shell: | + egrep "^.+\| node1 \| ok: Item: Runtime" {{ item }} + egrep "^.+\| node2 \| ok: Item: Runtime" {{ item }} + +- name: Validate output - failed shell task + shell: | + egrep "^.+\| node1 \| Exception: Test module failure exception task" {{ item }} + egrep "^.+\| node2 \| Exception: Test module failure exception task" {{ item }} + +- name: Validate output - item loop with exception + shell: | + egrep "^.+\| node1 \| Exception: Test module failure exception loop" {{ item }} + egrep "^.+\| node2 \| Exception: Test module failure exception loop" {{ item }} + +- name: Validate output - binary data + shell: | + egrep "^.*\| node1 \| \\\\x80abc" {{ item }} + egrep "^.*\| node2 \| \\\\x80abc" {{ item }} diff --git a/tests/base.py b/tests/base.py index 5a85ea0d7..cebcf2e1f 100644 --- a/tests/base.py +++ b/tests/base.py @@ -3091,6 +3091,8 @@ class FakeBuild(object): self.paused = False self.aborted = False self.requeue = False + self.should_fail = False + 
self.should_retry = False self.created = time.time() self.changes = None items = self.parameters['zuul']['items'] @@ -3162,6 +3164,8 @@ class FakeBuild(object): return result def shouldFail(self): + if self.should_fail: + return True changes = self.executor_server.fail_tests.get(self.name, []) for change in changes: if self.hasChanges(change): @@ -3169,6 +3173,8 @@ class FakeBuild(object): return False def shouldRetry(self): + if self.should_retry: + return True entries = self.executor_server.retry_tests.get(self.name, []) for entry in entries: if self.hasChanges(entry['change']): @@ -3662,7 +3668,7 @@ class FakeSMTP(object): class FakeNodepool(object): REQUEST_ROOT = '/nodepool/requests' NODE_ROOT = '/nodepool/nodes' - LAUNCHER_ROOT = '/nodepool/launchers' + COMPONENT_ROOT = '/nodepool/components' log = logging.getLogger("zuul.test.FakeNodepool") @@ -3726,10 +3732,11 @@ class FakeNodepool(object): self.fulfillRequest(req) def registerLauncher(self, labels=["label1"], id="FakeLauncher"): - path = os.path.join(self.LAUNCHER_ROOT, id) + path = os.path.join(self.COMPONENT_ROOT, 'pool', id) data = {'id': id, 'supported_labels': labels} self.client.create( - path, json.dumps(data).encode('utf8'), makepath=True) + path, json.dumps(data).encode('utf8'), + ephemeral=True, makepath=True, sequence=True) def getNodeRequests(self): try: diff --git a/tests/fixtures/layouts/job-dedup-false.yaml b/tests/fixtures/layouts/job-dedup-false.yaml index 2c0e6ee2e..9254f0b41 100644 --- a/tests/fixtures/layouts/job-dedup-false.yaml +++ b/tests/fixtures/layouts/job-dedup-false.yaml @@ -39,6 +39,7 @@ - job: name: common-job deduplicate: false + pre-run: playbooks/pre.yaml required-projects: - org/project1 - org/project2 diff --git a/tests/fixtures/layouts/job-dedup-noop.yaml b/tests/fixtures/layouts/job-dedup-noop.yaml new file mode 100644 index 000000000..9383fd8b6 --- /dev/null +++ b/tests/fixtures/layouts/job-dedup-noop.yaml @@ -0,0 +1,55 @@ +- queue: + name: integrated + allow-circular-dependencies: true + +- pipeline: + name: gate + manager: dependent + success-message: Build succeeded (gate). 
+ require: + gerrit: + approval: + - Approved: 1 + trigger: + gerrit: + - event: comment-added + approval: + - Approved: 1 + success: + gerrit: + Verified: 2 + submit: true + failure: + gerrit: + Verified: -2 + start: + gerrit: + Verified: 0 + precedence: high + +- job: + name: base + parent: null + pre-run: playbooks/pre.yaml + run: playbooks/run.yaml + nodeset: + nodes: + - label: debian + name: controller + +- job: + name: common-job + required-projects: + - org/project1 + +- job: + name: project1-job + +- project: + name: org/project1 + queue: integrated + gate: + jobs: + - noop + - common-job + - project1-job diff --git a/tests/remote/test_remote_zuul_stream.py b/tests/remote/test_remote_zuul_stream.py index 1f6b7fff7..1c705127e 100644 --- a/tests/remote/test_remote_zuul_stream.py +++ b/tests/remote/test_remote_zuul_stream.py @@ -29,7 +29,7 @@ class FunctionalZuulStreamMixIn: self.log_console_port = 19000 + int( self.ansible_core_version.split('.')[1]) self.executor_server.log_console_port = self.log_console_port - self.wait_timeout = 120 + self.wait_timeout = 180 self.fake_nodepool.remote_ansible = True ansible_remote = os.environ.get('ZUUL_REMOTE_IPV4') diff --git a/tests/unit/test_circular_dependencies.py b/tests/unit/test_circular_dependencies.py index ac5ad13f5..2223008d5 100644 --- a/tests/unit/test_circular_dependencies.py +++ b/tests/unit/test_circular_dependencies.py @@ -1703,6 +1703,151 @@ class TestGerritCircularDependencies(ZuulTestCase): ], ordered=False) self.assertEqual(len(self.fake_nodepool.history), 3) + @simple_layout('layouts/job-dedup-false.yaml') + def test_job_deduplication_false_failed_job(self): + # Test that if we are *not* deduplicating jobs, we don't + # duplicate the result on two different builds. + # The way we check that is to retry the common-job between two + # items, but only once, and only on one item. The other item + # should be unaffected. + self.executor_server.hold_jobs_in_build = True + A = self.fake_gerrit.addFakeChange('org/project1', 'master', 'A') + B = self.fake_gerrit.addFakeChange('org/project2', 'master', 'B') + + # A <-> B + A.data["commitMessage"] = "{}\n\nDepends-On: {}\n".format( + A.subject, B.data["url"] + ) + B.data["commitMessage"] = "{}\n\nDepends-On: {}\n".format( + B.subject, A.data["url"] + ) + + A.addApproval('Code-Review', 2) + B.addApproval('Code-Review', 2) + + self.fake_gerrit.addEvent(A.addApproval('Approved', 1)) + self.fake_gerrit.addEvent(B.addApproval('Approved', 1)) + + # If we don't make sure these jobs finish first, then one of + # the items may complete before the other and cause Zuul to + # abort the project*-job on the other item (with a "bundle + # failed to merge" error). + self.waitUntilSettled() + for build in self.builds: + if build.name == 'common-job' and build.project == 'org/project1': + break + else: + raise Exception("Unable to find build") + build.should_retry = True + + # Store a reference to the queue items so we can inspect their + # internal attributes later to double check the retry build + # count is correct. 
+ tenant = self.scheds.first.sched.abide.tenants.get('tenant-one') + pipeline = tenant.layout.pipelines['gate'] + items = pipeline.getAllItems() + self.assertEqual(len(items), 2) + + self.executor_server.release('project1-job') + self.executor_server.release('project2-job') + self.waitUntilSettled() + self.executor_server.hold_jobs_in_build = False + self.executor_server.release() + self.waitUntilSettled() + + self.assertEqual(A.data['status'], 'MERGED') + self.assertEqual(B.data['status'], 'MERGED') + self.assertHistory([ + dict(name="project2-job", result="SUCCESS", changes="2,1 1,1"), + dict(name="project1-job", result="SUCCESS", changes="2,1 1,1"), + dict(name="common-job", result=None, changes="2,1 1,1"), + dict(name="common-job", result="SUCCESS", changes="2,1 1,1"), + dict(name="common-job", result="SUCCESS", changes="2,1 1,1"), + ], ordered=False) + self.assertEqual(len(self.fake_nodepool.history), 5) + self.assertEqual(items[0].change.project.name, 'org/project2') + self.assertEqual(len(items[0].current_build_set.retry_builds), 0) + self.assertEqual(items[1].change.project.name, 'org/project1') + self.assertEqual(len(items[1].current_build_set.retry_builds), 1) + + @simple_layout('layouts/job-dedup-auto-shared.yaml') + def test_job_deduplication_multi_scheduler(self): + # Test that a second scheduler can correctly refresh + # deduplicated builds + self.executor_server.hold_jobs_in_build = True + A = self.fake_gerrit.addFakeChange('org/project1', 'master', 'A') + B = self.fake_gerrit.addFakeChange('org/project1', 'master', 'B') + + # A <-> B + A.data["commitMessage"] = "{}\n\nDepends-On: {}\n".format( + A.subject, B.data["url"] + ) + B.data["commitMessage"] = "{}\n\nDepends-On: {}\n".format( + B.subject, A.data["url"] + ) + + A.addApproval('Code-Review', 2) + B.addApproval('Code-Review', 2) + + self.fake_gerrit.addEvent(A.addApproval('Approved', 1)) + self.fake_gerrit.addEvent(B.addApproval('Approved', 1)) + + self.waitUntilSettled() + + app = self.createScheduler() + app.start() + self.assertEqual(len(self.scheds), 2) + + # Hold the lock on the first scheduler so that only the second + # will act. 
+ with self.scheds.first.sched.run_handler_lock: + self.executor_server.hold_jobs_in_build = False + self.executor_server.release() + self.waitUntilSettled(matcher=[app]) + + self.assertEqual(A.data['status'], 'MERGED') + self.assertEqual(B.data['status'], 'MERGED') + self.assertHistory([ + dict(name="project1-job", result="SUCCESS", changes="2,1 1,1"), + dict(name="common-job", result="SUCCESS", changes="2,1 1,1"), + ], ordered=False) + + @simple_layout('layouts/job-dedup-noop.yaml') + def test_job_deduplication_noop(self): + # Test that we don't deduplicate noop (there's no good reason + # to do so) + A = self.fake_gerrit.addFakeChange('org/project1', 'master', 'A') + B = self.fake_gerrit.addFakeChange('org/project1', 'master', 'B') + + # A <-> B + A.data["commitMessage"] = "{}\n\nDepends-On: {}\n".format( + A.subject, B.data["url"] + ) + B.data["commitMessage"] = "{}\n\nDepends-On: {}\n".format( + B.subject, A.data["url"] + ) + + A.addApproval('Code-Review', 2) + B.addApproval('Code-Review', 2) + + self.fake_gerrit.addEvent(A.addApproval('Approved', 1)) + self.fake_gerrit.addEvent(B.addApproval('Approved', 1)) + + self.waitUntilSettled() + + self.assertEqual(A.data['status'], 'MERGED') + self.assertEqual(B.data['status'], 'MERGED') + self.assertHistory([ + dict(name="project1-job", result="SUCCESS", changes="2,1 1,1"), + dict(name="common-job", result="SUCCESS", changes="2,1 1,1"), + ], ordered=False) + # It's tricky to get info about a noop build, but the jobs in + # the report have the build UUID, so we make sure it's + # different. + a_noop = [l for l in A.messages[-1].split('\n') if 'noop' in l][0] + b_noop = [l for l in B.messages[-1].split('\n') if 'noop' in l][0] + self.assertNotEqual(a_noop, b_noop) + @simple_layout('layouts/job-dedup-retry.yaml') def test_job_deduplication_retry(self): A = self.fake_gerrit.addFakeChange('org/project1', 'master', 'A') diff --git a/tests/unit/test_github_driver.py b/tests/unit/test_github_driver.py index f9f3371fa..49dae0ccb 100644 --- a/tests/unit/test_github_driver.py +++ b/tests/unit/test_github_driver.py @@ -1384,7 +1384,7 @@ class TestGithubDriver(ZuulTestCase): # now check if the merge was done via rebase merges = [report for report in self.fake_github.github_data.reports if report[2] == 'merge'] - assert(len(merges) == 1 and merges[0][3] == 'squash') + assert (len(merges) == 1 and merges[0][3] == 'squash') @simple_layout('layouts/basic-github.yaml', driver='github') def test_invalid_event(self): diff --git a/tests/unit/test_scheduler.py b/tests/unit/test_scheduler.py index eb5ee826f..66c508fea 100644 --- a/tests/unit/test_scheduler.py +++ b/tests/unit/test_scheduler.py @@ -226,6 +226,18 @@ class TestSchedulerZoneFallback(ZuulTestCase): def test_jobs_executed(self): "Test that jobs are executed and a change is merged per zone" self.hold_jobs_in_queue = True + + # Validate that the reported executor stats are correct. Since + # the executor accepts zoned and unzoned job it should be counted + # in both metrics. 
+ self.assertReportedStat( + 'zuul.executors.online', value='1', kind='g') + self.assertReportedStat( + 'zuul.executors.unzoned.online', value='1', kind='g') + self.assertReportedStat( + 'zuul.executors.zone.test-provider_vpn.online', + value='1', kind='g') + A = self.fake_gerrit.addFakeChange('org/project', 'master', 'A') A.addApproval('Code-Review', 2) self.fake_gerrit.addEvent(A.addApproval('Approved', 1)) @@ -452,6 +464,7 @@ class TestScheduler(ZuulTestCase): 'zuul.tenant.tenant-one.reconfiguration_time', 'zuul.tenant.tenant-one.pipeline.gate.event_enqueue_time', 'zuul.tenant.tenant-one.pipeline.gate.merge_request_time', + 'zuul.tenant.tenant-one.pipeline.gate.merger_merge_op_time', 'zuul.tenant.tenant-one.pipeline.gate.job_freeze_time', 'zuul.tenant.tenant-one.pipeline.gate.node_request_time', 'zuul.tenant.tenant-one.pipeline.gate.job_wait_time', @@ -5731,8 +5744,7 @@ For CI problems and help debugging, contact ci@example.org""" self.assertEqual(A.reported, 2) self.assertTrue(re.search('project-merge .* NODE_FAILURE', A.messages[1])) - self.assertTrue(re.search('project-test1 .* SKIPPED', A.messages[1])) - self.assertTrue(re.search('project-test2 .* SKIPPED', A.messages[1])) + self.assertTrue('Skipped 2 jobs' in A.messages[1]) def test_nodepool_resources(self): "Test that resources are reported" @@ -6854,7 +6866,7 @@ class TestDependencyGraph(ZuulTestCase): self.assertHistory([ dict(name='build', result='FAILURE', changes='1,1'), ], ordered=False) - self.assertIn('SKIPPED', A.messages[0]) + self.assertIn('Skipped 1 job', A.messages[0]) class TestDuplicatePipeline(ZuulTestCase): diff --git a/tests/unit/test_sos.py b/tests/unit/test_sos.py index 7d95f2548..f371a8064 100644 --- a/tests/unit/test_sos.py +++ b/tests/unit/test_sos.py @@ -55,6 +55,79 @@ class TestScaleOutScheduler(ZuulTestCase): dict(name='project-test2', result='SUCCESS', changes='1,1'), ], ordered=False) + def test_pipeline_cache_clear(self): + # Test that the pipeline cache on a second scheduler isn't + # holding old change objects. + + # Hold jobs in build + sched1 = self.scheds.first + self.executor_server.hold_jobs_in_build = True + + # We need a pair of changes in order to populate the pipeline + # change cache (a single change doesn't activate the cache, + # it's for dependencies). + A = self.fake_gerrit.addFakeChange('org/project', 'master', 'A') + A.addApproval('Code-Review', 2) + B = self.fake_gerrit.addFakeChange('org/project', 'master', 'B') + B.addApproval('Code-Review', 2) + B.addApproval('Approved', 1) + B.setDependsOn(A, 1) + + # Fail a job + self.executor_server.failJob('project-test1', A) + + # Enqueue into gate with scheduler 1 + self.fake_gerrit.addEvent(A.addApproval('Approved', 1)) + self.waitUntilSettled() + + # Start scheduler 2 + sched2 = self.createScheduler() + sched2.start() + self.assertEqual(len(self.scheds), 2) + + # Pause scheduler 1 + with sched1.sched.run_handler_lock: + # Release jobs + self.executor_server.hold_jobs_in_build = False + self.executor_server.release() + # Wait for scheduler 2 to dequeue + self.waitUntilSettled(matcher=[sched2]) + # Unpause scheduler 1 + self.assertEqual(A.data['status'], 'NEW') + self.assertEqual(B.data['status'], 'NEW') + + # Clear zk change cache + self.fake_gerrit._change_cache.prune([], max_age=0) + + # At this point, scheduler 1 should have a bogus change entry + # in the pipeline cache because scheduler 2 performed the + # dequeue so scheduler 1 never cleaned up its cache. 
+ + self.executor_server.fail_tests.clear() + self.executor_server.hold_jobs_in_build = True + # Pause scheduler 1 + with sched1.sched.run_handler_lock: + # Enqueue into gate with scheduler 2 + self.fake_gerrit.addEvent(A.addApproval('Approved', 1)) + self.waitUntilSettled(matcher=[sched2]) + + # Pause scheduler 2 + with sched2.sched.run_handler_lock: + # Make sure that scheduler 1 does some pipeline runs which + # reconstitute state from ZK. This gives it the + # opportunity to use old cache data if we don't clear it. + + # Release job1 + self.executor_server.release() + self.waitUntilSettled(matcher=[sched1]) + # Release job2 + self.executor_server.hold_jobs_in_build = False + self.executor_server.release() + # Wait for scheduler 1 to merge change + self.waitUntilSettled(matcher=[sched1]) + self.assertEqual(A.data['status'], 'MERGED') + self.assertEqual(B.data['status'], 'MERGED') + @simple_layout('layouts/multi-scheduler-status.yaml') def test_multi_scheduler_status(self): self.hold_merge_jobs_in_queue = True diff --git a/tests/unit/test_v3.py b/tests/unit/test_v3.py index 9fd8ea1d3..a89bb3007 100644 --- a/tests/unit/test_v3.py +++ b/tests/unit/test_v3.py @@ -4930,7 +4930,7 @@ class TestDataReturn(AnsibleZuulTestCase): self.assertIn( '- data-return https://zuul.example.com/', A.messages[-1]) - self.assertTrue(re.search('child .* SKIPPED', A.messages[-1])) + self.assertTrue('Skipped 1 job' in A.messages[-1]) self.assertIn('Build succeeded', A.messages[-1]) def test_data_return_invalid_child_job(self): @@ -4943,7 +4943,7 @@ class TestDataReturn(AnsibleZuulTestCase): self.assertIn( '- data-return-invalid-child-job https://zuul.example.com', A.messages[-1]) - self.assertTrue(re.search('data-return .* SKIPPED', A.messages[-1])) + self.assertTrue('Skipped 1 job' in A.messages[-1]) self.assertIn('Build succeeded', A.messages[-1]) def test_data_return_skip_all_child_jobs(self): @@ -4957,8 +4957,7 @@ class TestDataReturn(AnsibleZuulTestCase): self.assertIn( '- data-return-skip-all https://zuul.example.com/', A.messages[-1]) - self.assertTrue(re.search('child .* SKIPPED', A.messages[-1])) - self.assertTrue(re.search('data-return .* SKIPPED', A.messages[-1])) + self.assertTrue('Skipped 2 jobs' in A.messages[-1]) self.assertIn('Build succeeded', A.messages[-1]) def test_data_return_skip_all_child_jobs_with_soft_dependencies(self): @@ -4972,8 +4971,7 @@ class TestDataReturn(AnsibleZuulTestCase): ]) self.assertIn('- data-return-cd https://zuul.example.com/', A.messages[-1]) - self.assertTrue(re.search('data-return-a .* SKIPPED', A.messages[-1])) - self.assertTrue(re.search('data-return-b .* SKIPPED', A.messages[-1])) + self.assertTrue('Skipped 2 jobs' in A.messages[-1]) self.assertIn('Build succeeded', A.messages[-1]) def test_several_zuul_return(self): @@ -4987,7 +4985,7 @@ class TestDataReturn(AnsibleZuulTestCase): self.assertIn( '- several-zuul-return-child https://zuul.example.com/', A.messages[-1]) - self.assertTrue(re.search('data-return .* SKIPPED', A.messages[-1])) + self.assertTrue('Skipped 1 job' in A.messages[-1]) self.assertIn('Build succeeded', A.messages[-1]) def test_data_return_skip_retry(self): @@ -6905,7 +6903,7 @@ class TestJobPause(AnsibleZuulTestCase): dict(name='compile', result='SUCCESS', changes='1,1'), ]) - self.assertTrue(re.search('test .* SKIPPED', A.messages[0])) + self.assertTrue('Skipped 1 job' in A.messages[0]) def test_job_pause_pre_skipped_child(self): """ @@ -6953,7 +6951,7 @@ class TestJobPause(AnsibleZuulTestCase): dict(name='compile', result='SUCCESS', 
changes='1,1'), ]) - self.assertTrue(re.search('test .* SKIPPED', A.messages[0])) + self.assertTrue('Skipped 1 job' in A.messages[0]) def test_job_pause_skipped_child_retry(self): """ @@ -7822,7 +7820,7 @@ class TestProvidesRequiresMysql(ZuulTestCase): dict(name='image-builder', result='FAILURE', changes='1,1'), dict(name='hold', result='SUCCESS', changes='1,1'), ], ordered=False) - self.assertTrue(re.search('image-user .* SKIPPED', A.messages[0])) + self.assertTrue('Skipped 1 job' in A.messages[0]) B = self.fake_gerrit.addFakeChange('org/project1', 'master', 'B') B.data['commitMessage'] = '%s\n\nDepends-On: %s\n' % ( diff --git a/web/public/openapi.yaml b/web/public/openapi.yaml index 312a1907e..b101c66e0 100644 --- a/web/public/openapi.yaml +++ b/web/public/openapi.yaml @@ -474,11 +474,9 @@ components: description: The event id type: string item_ahead: - description: The list of events ahead - items: - type: string - type: array - item_behind: + description: The event ahead + type: string + items_behind: description: The list of events behind items: type: string diff --git a/web/src/containers/status/ChangePanel.jsx b/web/src/containers/status/ChangePanel.jsx index 33fc4687c..dd4fc27e5 100644 --- a/web/src/containers/status/ChangePanel.jsx +++ b/web/src/containers/status/ChangePanel.jsx @@ -18,6 +18,7 @@ import { connect } from 'react-redux' import { Link } from 'react-router-dom' import * as moment from 'moment' import 'moment-duration-format' +import { Button } from '@patternfly/react-core' class ChangePanel extends React.Component { @@ -30,9 +31,11 @@ class ChangePanel extends React.Component { constructor () { super() this.state = { - expanded: false + expanded: false, + showSkipped: false, } this.onClick = this.onClick.bind(this) + this.toggleSkippedJobs = this.toggleSkippedJobs.bind(this) this.clicked = false } @@ -120,12 +123,13 @@ class ChangePanel extends React.Component { } renderProgressBar (change) { - let jobPercent = (100 / change.jobs.length).toFixed(2) + const interesting_jobs = change.jobs.filter(j => this.jobStrResult(j) !== 'skipped') + let jobPercent = (100 / interesting_jobs.length).toFixed(2) return ( <div className='progress zuul-change-total-result'> {change.jobs.map((job, idx) => { let result = this.jobStrResult(job) - if (['queued', 'waiting'].includes(result)) { + if (['queued', 'waiting', 'skipped'].includes(result)) { return '' } let className = '' @@ -144,7 +148,6 @@ class ChangePanel extends React.Component { className = ' progress-bar-warning' break case 'paused': - case 'skipped': className = ' progress-bar-info' break default: @@ -302,15 +305,39 @@ class ChangePanel extends React.Component { </span>) } + toggleSkippedJobs (e) { + // Skip middle mouse button + if (e.button === 1) { + return + } + this.setState({ showSkipped: !this.state.showSkipped }) + } + renderJobList (jobs, times) { + const [buttonText, interestingJobs] = this.state.showSkipped ? 
+ ['Hide', jobs] : + ['Show', jobs.filter(j => this.jobStrResult(j) !== 'skipped')] + const skippedJobCount = jobs.length - interestingJobs.length + return ( - <ul className='list-group zuul-patchset-body'> - {jobs.map((job, idx) => ( - <li key={idx} className='list-group-item zuul-change-job'> - {this.renderJob(job, times.jobs[job.name])} - </li> - ))} - </ul>) + <> + <ul className='list-group zuul-patchset-body'> + {interestingJobs.map((job, idx) => ( + <li key={idx} className='list-group-item zuul-change-job'> + {this.renderJob(job, times.jobs[job.name])} + </li> + ))} + {(this.state.showSkipped || skippedJobCount) ? ( + <li key='last' className='list-group-item zuul-change-job'> + <Button variant="link" className='zuul-skipped-jobs-button' + onClick={this.toggleSkippedJobs}> + {buttonText} {skippedJobCount ? skippedJobCount : ''} skipped job{skippedJobCount === 1 ? '' : 's'} + </Button> + </li> + ) : ''} + </ul> + </> + ) } calculateTimes (change) { diff --git a/web/src/containers/status/ChangePanel.test.jsx b/web/src/containers/status/ChangePanel.test.jsx index 5a4be8602..cd27edc73 100644 --- a/web/src/containers/status/ChangePanel.test.jsx +++ b/web/src/containers/status/ChangePanel.test.jsx @@ -16,6 +16,7 @@ import React from 'react' import { Link, BrowserRouter as Router } from 'react-router-dom' import { Provider } from 'react-redux' import { create } from 'react-test-renderer' +import { Button } from '@patternfly/react-core' import { setTenantAction } from '../../actions/tenant' import configureStore from '../../store' @@ -45,6 +46,8 @@ it('change panel render multi tenant links', () => { const jobLink = application.root.findByType(Link) expect(jobLink.props.to).toEqual( '/t/tenant-one/stream/42') + const skipButton = application.root.findAllByType(Button) + expect(skipButton === undefined) }) it('change panel render white-label tenant links', () => { @@ -60,4 +63,29 @@ it('change panel render white-label tenant links', () => { const jobLink = application.root.findByType(Link) expect(jobLink.props.to).toEqual( '/stream/42') + const skipButton = application.root.findAllByType(Button) + expect(skipButton === undefined) +}) + +it('change panel skip jobs', () => { + const fakeChange = { + project: 'org-project', + jobs: [{ + name: 'job-name', + url: 'stream/42', + result: 'skipped' + }] + } + + const store = configureStore() + store.dispatch(setTenantAction('tenant-one', true)) + const application = create( + <Provider store={store}> + <Router> + <ChangePanel change={fakeChange} globalExpanded={true} /> + </Router> + </Provider> + ) + const skipButton = application.root.findByType(Button) + expect(skipButton.props.children.includes('skipped job')) }) diff --git a/web/src/index.css b/web/src/index.css index e8c67a372..eddbca673 100644 --- a/web/src/index.css +++ b/web/src/index.css @@ -189,6 +189,11 @@ a.refresh { font-size: small; } +.zuul-skipped-jobs-button { + font-size: small; + padding: 0; +} + .zuul-non-voting-desc { font-size: smaller; } diff --git a/zuul/ansible/base/callback/zuul_stream.py b/zuul/ansible/base/callback/zuul_stream.py index 720261cb2..f31983ed6 100644 --- a/zuul/ansible/base/callback/zuul_stream.py +++ b/zuul/ansible/base/callback/zuul_stream.py @@ -48,6 +48,7 @@ from zuul.ansible import paths from zuul.ansible import logconfig LOG_STREAM_PORT = int(os.environ.get("ZUUL_CONSOLE_PORT", 19885)) +LOG_STREAM_VERSION = 0 def zuul_filter_result(result): @@ -103,6 +104,7 @@ class CallbackModule(default.CallbackModule): self._items_done = False self._deferred_result = 
None self._playbook_name = None + self._zuul_console_version = 0 def configure_logger(self): # ansible appends timestamp, user and pid to the log lines emitted @@ -129,9 +131,7 @@ class CallbackModule(default.CallbackModule): else: self._display.display(msg) - def _read_log(self, host, ip, port, log_id, task_name, hosts): - self._log("[%s] Starting to log %s for task %s" - % (host, log_id, task_name), job=False, executor=True) + def _read_log_connect(self, host, ip, port): logger_retries = 0 while True: try: @@ -141,6 +141,7 @@ class CallbackModule(default.CallbackModule): # logs continously. Without this we can easily trip the 5 # second timeout. s.settimeout(None) + return s except socket.timeout: self._log( "Timeout exception waiting for the logger. " @@ -151,7 +152,7 @@ class CallbackModule(default.CallbackModule): "Timeout exception waiting for the logger. " "Please check connectivity to [%s:%s]" % (ip, port)) - return + return None except Exception: if logger_retries % 10 == 0: self._log("[%s] Waiting on logger" % host, @@ -159,31 +160,77 @@ class CallbackModule(default.CallbackModule): logger_retries += 1 time.sleep(0.1) continue - msg = "%s\n" % log_id - s.send(msg.encode("utf-8")) - buff = s.recv(4096) - buffering = True - while buffering: - if b'\n' in buff: - (line, buff) = buff.split(b'\n', 1) - # We can potentially get binary data here. In order to - # being able to handle that use the backslashreplace - # error handling method. This decodes unknown utf-8 - # code points to escape sequences which exactly represent - # the correct data without throwing a decoding exception. - done = self._log_streamline( - host, line.decode("utf-8", "backslashreplace")) - if done: - return + + def _read_log(self, host, ip, port, log_id, task_name, hosts): + self._log("[%s] Starting to log %s for task %s" + % (host, log_id, task_name), job=False, executor=True) + + s = self._read_log_connect(host, ip, port) + if s is None: + # Can't connect; _read_log_connect() already logged an + # error for us, just bail + return + + # Find out what version we are running against + s.send(f'v:{LOG_STREAM_VERSION}\n'.encode('utf-8')) + buff = s.recv(1024).decode('utf-8').strip() + + # NOTE(ianw) 2022-07-21 : zuul_console from < 6.3.0 do not + # understand this protocol. They will assume the send + # above is a log request and send back the not found + # message in a loop. So to handle this we disconnect and + # reconnect. When we decide to remove this, we can remove + # anything in the "version 0" path. + if buff == '[Zuul] Log not found': + s.close() + s = self._read_log_connect(host, ip, port) + if s is None: + return + else: + self._zuul_console_version = int(buff) + self._log('[%s] Reports streaming version: %d' % + (host, self._zuul_console_version), + job=False, executor=True) + + if self._zuul_console_version >= 1: + msg = 's:%s\n' % log_id + else: + msg = '%s\n' % log_id + + s.send(msg.encode("utf-8")) + buff = s.recv(4096) + buffering = True + while buffering: + if b'\n' in buff: + (line, buff) = buff.split(b'\n', 1) + # We can potentially get binary data here. In order to + # being able to handle that use the backslashreplace + # error handling method. This decodes unknown utf-8 + # code points to escape sequences which exactly represent + # the correct data without throwing a decoding exception. 
+ done = self._log_streamline( + host, line.decode("utf-8", "backslashreplace")) + if done: + if self._zuul_console_version > 0: + try: + # reestablish connection and tell console to + # clean up + s = self._read_log_connect(host, ip, port) + s.send(f'f:{log_id}\n'.encode('utf-8')) + s.close() + except Exception: + # Don't worry if this fails + pass + return + else: + more = s.recv(4096) + if not more: + buffering = False else: - more = s.recv(4096) - if not more: - buffering = False - else: - buff += more - if buff: - self._log_streamline( - host, buff.decode("utf-8", "backslashreplace")) + buff += more + if buff: + self._log_streamline( + host, buff.decode("utf-8", "backslashreplace")) def _log_streamline(self, host, line): if "[Zuul] Task exit code" in line: diff --git a/zuul/ansible/base/library/zuul_console.py b/zuul/ansible/base/library/zuul_console.py index 9dffbbc3a..aa999cac1 100755 --- a/zuul/ansible/base/library/zuul_console.py +++ b/zuul/ansible/base/library/zuul_console.py @@ -24,6 +24,14 @@ import subprocess import threading import time +# This is the version we report to the zuul_stream callback. It is +# expected that this (zuul_console) process can be long-lived, so if +# there are updates this ensures a later streaming callback can still +# talk to us. +ZUUL_CONSOLE_PROTO_VERSION = 1 +# This is the template for the file name of the log-file written out +# by the command.py override command in the executor's Ansible +# install. LOG_STREAM_FILE = '/tmp/console-{log_uuid}.log' LOG_STREAM_PORT = 19885 @@ -162,15 +170,49 @@ class Server(object): ret = buffer.decode('utf-8') x = ret.find('\n') if x > 0: - return ret[:x] + return ret[:x].strip() except UnicodeDecodeError: pass - def handleOneConnection(self, conn): - log_uuid = self.get_command(conn) + def _clean_uuid(self, log_uuid): # use path split to make use the input isn't trying to be clever # and construct some path like /tmp/console-/../../something - log_uuid = os.path.split(log_uuid.rstrip())[-1] + return os.path.split(log_uuid)[-1] + + def handleOneConnection(self, conn): + # V1 protocol + # ----------- + # v:<ver> get version number, <ver> is remote version + # s:<uuid> send logs for <uuid> + # f:<uuid> finalise/cleanup <uuid> + while True: + command = self.get_command(conn) + if command.startswith('v:'): + # NOTE(ianw) : remote sends its version. We currently + # don't have anything to do with this value, so ignore + # for now. + cmd = '%s\n' % (ZUUL_CONSOLE_PROTO_VERSION) + conn.send(cmd.encode('utf-8')) + continue + elif command.startswith('f:'): + log_uuid = self._clean_uuid(command[2:]) + try: + os.unlink(self.path.format(log_uuid=log_uuid)) + except Exception: + # something might have cleaned /tmp + pass + continue + elif command.startswith('s:'): + log_uuid = self._clean_uuid(command[2:]) + break + else: + # NOTE(ianw): 2022-07-21 In releases < 6.3.0 the streaming + # side would just send a raw uuid and nothing else; so by + # default assume that is what is coming in here. We can + # remove this fallback when we decide it is no longer + # necessary. 
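Taken together, a current streamer negotiates the protocol with a v: line, requests the log with s:<uuid> (falling back to a bare UUID when an old console answers the handshake with the not-found message), and after the task finishes reconnects with f:<uuid> so the console can remove its spool file. A simplified client-side sketch of that exchange; the address, UUID handling and end-of-stream detection are illustrative only:

    # Simplified sketch of the v1 zuul_console exchange as seen from the
    # streaming side; the real callback watches for the
    # "[Zuul] Task exit code" sentinel rather than a closed socket.
    import socket

    LOG_STREAM_VERSION = 0

    def stream_console(ip, port, log_uuid):
        s = socket.create_connection((ip, port))
        # 1. Version handshake.  A console older than 6.3.0 treats this
        #    as a log request and answers with the not-found message, so
        #    reconnect and fall back to the bare-UUID request.
        s.sendall(f'v:{LOG_STREAM_VERSION}\n'.encode('utf-8'))
        reply = s.recv(1024).decode('utf-8').strip()
        if reply == '[Zuul] Log not found':
            s.close()
            s = socket.create_connection((ip, port))
            console_version = 0
            s.sendall(f'{log_uuid}\n'.encode('utf-8'))
        else:
            console_version = int(reply)
            # 2. Ask for the log stream.
            s.sendall(f's:{log_uuid}\n'.encode('utf-8'))
        # 3. Read the stream until the remote side stops sending.
        while True:
            chunk = s.recv(4096)
            if not chunk:
                break
        s.close()
        # 4. Against a v1 console, reconnect and ask it to clean up the
        #    spool file for this task.
        if console_version >= 1:
            f = socket.create_connection((ip, port))
            f.sendall(f'f:{log_uuid}\n'.encode('utf-8'))
            f.close()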
+ log_uuid = self._clean_uuid(command) + break # FIXME: this won't notice disconnects until it tries to send console = None diff --git a/zuul/driver/gerrit/gerritconnection.py b/zuul/driver/gerrit/gerritconnection.py index 6aea4388b..1ec334915 100644 --- a/zuul/driver/gerrit/gerritconnection.py +++ b/zuul/driver/gerrit/gerritconnection.py @@ -1470,7 +1470,7 @@ class GerritConnection(ZKChangeCacheMixin, ZKBranchCacheMixin, BaseConnection): # for large projects like nova alldata = [] chunk, more_changes = _query_chunk(query, event) - while(chunk): + while chunk: alldata.extend(chunk) if more_changes is None: # continue sortKey based (before Gerrit 2.9) diff --git a/zuul/driver/github/githubreporter.py b/zuul/driver/github/githubreporter.py index de62f2565..73d3a6f13 100644 --- a/zuul/driver/github/githubreporter.py +++ b/zuul/driver/github/githubreporter.py @@ -135,9 +135,12 @@ class GithubReporter(BaseReporter): def _formatItemReportJobs(self, item): # Return the list of jobs portion of the report ret = '' - jobs_fields = self._getItemReportJobsFields(item) + jobs_fields, skipped = self._getItemReportJobsFields(item) for job_fields in jobs_fields: ret += self._formatJobResult(job_fields) + if skipped: + jobtext = 'job' if skipped == 1 else 'jobs' + ret += 'Skipped %i %s\n' % (skipped, jobtext) return ret def addPullComment(self, item, comment=None): diff --git a/zuul/driver/pagure/pagurereporter.py b/zuul/driver/pagure/pagurereporter.py index 0bfdbc9b8..b38035752 100644 --- a/zuul/driver/pagure/pagurereporter.py +++ b/zuul/driver/pagure/pagurereporter.py @@ -67,9 +67,12 @@ class PagureReporter(BaseReporter): def _formatItemReportJobs(self, item): # Return the list of jobs portion of the report ret = '' - jobs_fields = self._getItemReportJobsFields(item) + jobs_fields, skipped = self._getItemReportJobsFields(item) for job_fields in jobs_fields: ret += '- [%s](%s) : %s%s%s%s\n' % job_fields[:6] + if skipped: + jobtext = 'job' if skipped == 1 else 'jobs' + ret += 'Skipped %i %s\n' % (skipped, jobtext) return ret def addPullComment(self, item, comment=None): diff --git a/zuul/driver/smtp/smtpreporter.py b/zuul/driver/smtp/smtpreporter.py index 4815026ab..a5d8938c1 100644 --- a/zuul/driver/smtp/smtpreporter.py +++ b/zuul/driver/smtp/smtpreporter.py @@ -42,7 +42,7 @@ class SMTPReporter(BaseReporter): if 'subject' in self.config: subject = self.config['subject'].format( - change=item.change) + change=item.change, pipeline=item.pipeline.getSafeAttributes()) else: subject = "Report for change {change} against {ref}".format( change=item.change, ref=item.change.ref) diff --git a/zuul/executor/server.py b/zuul/executor/server.py index 2165a797b..e00612e9e 100644 --- a/zuul/executor/server.py +++ b/zuul/executor/server.py @@ -3157,6 +3157,10 @@ class ExecutorServer(BaseMergeServer): self.allow_unzoned = get_default(self.config, 'executor', 'allow_unzoned', False) + # If this executor has no zone configured it is implicitly unzoned + if self.zone is None: + self.allow_unzoned = True + # Those attributes won't change, so it's enough to set them once on the # component info. 
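Splitting the UUID handling into _clean_uuid() keeps the path-traversal guard in one place, whichever of the s:, f: or legacy bare forms the identifier arrived in. A small illustration of what the os.path.split() guard buys, using the LOG_STREAM_FILE template from above:

    # Illustration of the os.path.split() guard used by _clean_uuid().
    import os

    LOG_STREAM_FILE = '/tmp/console-{log_uuid}.log'

    def clean_uuid(log_uuid):
        # Keep only the last path component so a crafted identifier such
        # as "../../etc/passwd" cannot point outside the spool directory.
        return os.path.split(log_uuid)[-1]

    print(LOG_STREAM_FILE.format(log_uuid=clean_uuid('../../etc/passwd')))
    # -> /tmp/console-passwd.log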
self.component_info.zone = self.zone @@ -3735,7 +3739,7 @@ class ExecutorServer(BaseMergeServer): sensor.reportStats(self.statsd, base_key) def finishJob(self, unique): - del(self.job_workers[unique]) + del self.job_workers[unique] self.log.debug( "Finishing Job: %s, queue(%d): %s", unique, diff --git a/zuul/manager/__init__.py b/zuul/manager/__init__.py index 642aededd..a66f5ad22 100644 --- a/zuul/manager/__init__.py +++ b/zuul/manager/__init__.py @@ -235,6 +235,9 @@ class PipelineManager(metaclass=ABCMeta): resolved_changes.append(change) return resolved_changes + def clearCache(self): + self._change_cache.clear() + def _maintainCache(self): active_layout_uuids = set() referenced_change_keys = set() @@ -1768,9 +1771,12 @@ class PipelineManager(metaclass=ABCMeta): build_in_items = [item] if item.bundle: for other_item in item.bundle.items: - if other_item not in build_in_items: - if other_item.current_build_set.getBuild(build.job.name): - build_in_items.append(other_item) + if other_item in build_in_items: + continue + other_build = other_item.current_build_set.getBuild( + build.job.name) + if other_build is not None and other_build is build: + build_in_items.append(other_item) for item in build_in_items: # We don't care about some actions below if this build # isn't in the current buildset, so determine that before @@ -1817,18 +1823,27 @@ class PipelineManager(metaclass=ABCMeta): # We're the second of the files/merger pair, report the stat self.reportPipelineTiming('merge_request_time', build_set.configured_time) + if event.elapsed_time: + self.reportPipelineTiming('merger_files_changes_op_time', + event.elapsed_time, elapsed=True) def onMergeCompleted(self, event, build_set): if build_set.merge_state == build_set.COMPLETE: self._onGlobalRepoStateCompleted(event, build_set) self.reportPipelineTiming('repo_state_time', build_set.repo_state_request_time) + if event.elapsed_time: + self.reportPipelineTiming('merger_repo_state_op_time', + event.elapsed_time, elapsed=True) else: self._onMergeCompleted(event, build_set) if build_set.files_state == build_set.COMPLETE: # We're the second of the files/merger pair, report the stat self.reportPipelineTiming('merge_request_time', build_set.configured_time) + if event.elapsed_time: + self.reportPipelineTiming('merger_merge_op_time', + event.elapsed_time, elapsed=True) def _onMergeCompleted(self, event, build_set): @@ -2120,7 +2135,7 @@ class PipelineManager(metaclass=ABCMeta): except Exception: self.log.exception("Exception reporting pipeline stats") - def reportPipelineTiming(self, key, start, end=None): + def reportPipelineTiming(self, key, start, end=None, elapsed=False): if not self.sched.statsd: return if not start: @@ -2130,5 +2145,8 @@ class PipelineManager(metaclass=ABCMeta): pipeline = self.pipeline tenant = pipeline.tenant stats_key = f'zuul.tenant.{tenant.name}.pipeline.{pipeline.name}' - dt = (end - start) * 1000 + if elapsed: + dt = start + else: + dt = (end - start) * 1000 self.sched.statsd.timing(f'{stats_key}.{key}', dt) diff --git a/zuul/merger/client.py b/zuul/merger/client.py index 362644b98..29fa39aaf 100644 --- a/zuul/merger/client.py +++ b/zuul/merger/client.py @@ -159,7 +159,9 @@ class MergeClient(object): "via result event for %s", merge_request) if merge_request.job_type == MergeRequest.FILES_CHANGES: event = FilesChangesCompletedEvent( - merge_request.build_set_uuid, files=None + merge_request.build_set_uuid, + files=None, + elapsed_time=None, ) else: event = MergeCompletedEvent( @@ -172,6 +174,7 @@ class 
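reportPipelineTiming() can now be handed a pre-computed duration: the new merger_*_op_time statistics arrive on the result event as an elapsed value rather than as a start timestamp. A hedged sketch of the two reporting modes; the statsd client and the key prefix are stand-ins for the real scheduler wiring:

    # Hedged sketch of the two reportPipelineTiming() modes; FakeStatsd
    # and the key prefix are placeholders, not the real implementation.
    import time

    class FakeStatsd:
        def timing(self, key, value):
            print('%s -> %s' % (key, value))

    statsd = FakeStatsd()

    def report_pipeline_timing(key, start, end=None, elapsed=False):
        if not start:
            return
        if end is None:
            end = time.time()
        # elapsed=True means "start" is already a duration from the
        # merger; otherwise derive the duration from the timestamps.
        dt = start if elapsed else (end - start) * 1000
        statsd.timing('zuul.tenant.example.pipeline.check.%s' % key, dt)

    # Timestamp-based stat (e.g. merge_request_time):
    report_pipeline_timing('merge_request_time', time.time() - 2.5)
    # Pre-computed stat from a merger result event:
    report_pipeline_timing('merger_merge_op_time', 1.2, elapsed=True)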
MergeClient(object): repo_state=None, item_in_branches=None, errors=None, + elapsed_time=None, ) try: self.result_events[merge_request.tenant_name][ diff --git a/zuul/merger/server.py b/zuul/merger/server.py index 91597714f..fe5b938a1 100644 --- a/zuul/merger/server.py +++ b/zuul/merger/server.py @@ -256,6 +256,7 @@ class BaseMergeServer(metaclass=ABCMeta): def executeMergeJob(self, merge_request, params): result = None + start = time.monotonic() if merge_request.job_type == MergeRequest.MERGE: result = self.merge(merge_request, params) elif merge_request.job_type == MergeRequest.CAT: @@ -264,6 +265,8 @@ class BaseMergeServer(metaclass=ABCMeta): result = self.refstate(merge_request, params) elif merge_request.job_type == MergeRequest.FILES_CHANGES: result = self.fileschanges(merge_request, params) + end = time.monotonic() + result['elapsed_time'] = end - start return result def cat(self, merge_request, args): @@ -376,6 +379,7 @@ class BaseMergeServer(metaclass=ABCMeta): item_in_branches = result.get("item_in_branches", []) files = result.get("files", {}) errors = result.get("errors", []) + elapsed_time = result.get("elapsed_time") log.info( "Merge %s complete, merged: %s, updated: %s, commit: %s, " @@ -407,7 +411,9 @@ class BaseMergeServer(metaclass=ABCMeta): ) if merge_request.job_type == MergeRequest.FILES_CHANGES: event = FilesChangesCompletedEvent( - merge_request.build_set_uuid, files + merge_request.build_set_uuid, + files, + elapsed_time, ) else: event = MergeCompletedEvent( @@ -420,6 +426,7 @@ class BaseMergeServer(metaclass=ABCMeta): repo_state, item_in_branches, errors, + elapsed_time, ) def put_complete_event(log, merge_request, event): diff --git a/zuul/model.py b/zuul/model.py index a07d5a640..963332826 100644 --- a/zuul/model.py +++ b/zuul/model.py @@ -694,6 +694,11 @@ class PipelineState(zkobject.ZKObject): return json.dumps(data, sort_keys=True).encode("utf8") def deserialize(self, raw, context): + # We may have old change objects in the pipeline cache, so + # make sure they are the same objects we would get from the + # source change cache. 
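On the merger side the duration is taken with time.monotonic() around the actual operation and attached to the result dictionary, from where it is copied into the MergeCompletedEvent or FilesChangesCompletedEvent. A minimal sketch with a fake operation standing in for the real git work:

    # Minimal sketch of how executeMergeJob() measures an operation and
    # attaches the duration to its result; do_merge() is a stand-in.
    import time

    def do_merge():
        time.sleep(0.01)            # stand-in for the real git work
        return {'merged': True}

    start = time.monotonic()
    result = do_merge()
    result['elapsed_time'] = time.monotonic() - start
    # result['elapsed_time'] later becomes the event's elapsed_time.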
+ self.pipeline.manager.clearCache() + data = super().deserialize(raw, context) existing_queues = { q.getPath(): q for q in self.queues + self.old_queues @@ -742,8 +747,56 @@ class PipelineState(zkobject.ZKObject): "queues": queues, "old_queues": old_queues, }) + if context.build_references: + self._fixBuildReferences(data, context) + context.build_references = False return data + def _fixBuildReferences(self, data, context): + # Reconcile duplicate builds; if we find any BuildReference + # objects, look up the actual builds and replace + log = context.log + build_map = {} + to_replace_dicts = [] + to_replace_lists = [] + for queue in data['queues'] + data['old_queues']: + for item in queue.queue: + buildset = item.current_build_set + for build_job, build in buildset.builds.items(): + if isinstance(build, BuildReference): + to_replace_dicts.append((item, + buildset.builds, + build_job, + build._path)) + else: + build_map[build.getPath()] = build + for job_name, build_list in buildset.retry_builds.items(): + for build in build_list: + if isinstance(build, BuildReference): + to_replace_lists.append((item, + build_list, + build, + build._path)) + else: + build_map[build.getPath()] = build + for (item, build_dict, build_job, build_path) in to_replace_dicts: + orig_build = build_map.get(build_path) + if orig_build: + build_dict[build_job] = orig_build + else: + log.warning("Unable to find deduplicated build %s for %s", + build_path, item) + del build_dict[build_job] + for (item, build_list, build, build_path) in to_replace_lists: + idx = build_list.index(build) + orig_build = build_map.get(build_path) + if orig_build: + build_list[idx] = build_map[build_path] + else: + log.warning("Unable to find deduplicated build %s for %s", + build_path, item) + del build_list[idx] + def _getKnownItems(self): items = [] for queue in (*self.old_queues, *self.queues): @@ -3424,6 +3477,11 @@ class BuildRequest(JobRequest): ) +class BuildReference: + def __init__(self, _path): + self._path = _path + + class Build(zkobject.ZKObject): """A Build is an instance of a single execution of a Job. @@ -3852,6 +3910,13 @@ class BuildSet(zkobject.ZKObject): } return json.dumps(data, sort_keys=True).encode("utf8") + def _isMyBuild(self, build_path): + parts = build_path.split('/') + buildset_uuid = parts[-5] + if buildset_uuid == self.uuid: + return True + return False + def deserialize(self, raw, context): data = super().deserialize(raw, context) # Set our UUID so that getPath() returns the correct path for @@ -3944,8 +4009,12 @@ class BuildSet(zkobject.ZKObject): if not build.result: build.refresh(context) else: - build = Build.fromZK( - context, build_path, job=job, build_set=self) + if not self._isMyBuild(build_path): + build = BuildReference(build_path) + context.build_references = True + else: + build = Build.fromZK( + context, build_path, job=job, build_set=self) builds[job_name] = build for retry_path in data["retry_builds"].get(job_name, []): @@ -3954,8 +4023,12 @@ class BuildSet(zkobject.ZKObject): # Retry builds never change. pass else: - retry_build = Build.fromZK( - context, retry_path, job=job, build_set=self) + if not self._isMyBuild(retry_path): + retry_build = BuildReference(retry_path) + context.build_references = True + else: + retry_build = Build.fromZK( + context, retry_path, job=job, build_set=self) retry_builds[job_name].append(retry_build) data.update({ @@ -6180,12 +6253,13 @@ class MergeCompletedEvent(ResultEvent): :arg dict repo_state: The starting repo state before the merge. 
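Because deduplicated builds live under another buildset's ZooKeeper path, deserializing a buildset that does not own a build now records a BuildReference placeholder, and _fixBuildReferences() later swaps each placeholder for the real Build found elsewhere in the pipeline state. A simplified sketch of that two-pass resolution; the real code also walks retry-build lists and logs builds it cannot resolve:

    # Simplified sketch of the BuildReference fix-up: first index every
    # real build by its ZooKeeper path, then replace the placeholders.
    class BuildReference:
        def __init__(self, path):
            self._path = path

    class Build:
        def __init__(self, path):
            self.path = path

    def fix_build_references(buildsets):
        build_map = {}
        placeholders = []
        for builds in buildsets:                 # builds: {job_name: obj}
            for job_name, build in builds.items():
                if isinstance(build, BuildReference):
                    placeholders.append((builds, job_name, build._path))
                else:
                    build_map[build.path] = build
        for builds, job_name, path in placeholders:
            real = build_map.get(path)
            if real is not None:
                builds[job_name] = real
            else:
                del builds[job_name]             # deduplicated build is gone

    shared = Build('/zuul/builds/abc')
    owner = {'unit-tests': shared}
    duplicate = {'unit-tests': BuildReference('/zuul/builds/abc')}
    fix_build_references([owner, duplicate])
    assert duplicate['unit-tests'] is shared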
:arg list item_in_branches: A list of branches in which the final commit in the merge list appears (changes without refs). - :arg list errors: A list of error message strings + :arg list errors: A list of error message strings. + :arg float elapsed_time: Elapsed time of merge op in seconds. """ def __init__(self, request_uuid, build_set_uuid, merged, updated, commit, files, repo_state, item_in_branches, - errors): + errors, elapsed_time): self.request_uuid = request_uuid self.build_set_uuid = build_set_uuid self.merged = merged @@ -6195,6 +6269,7 @@ class MergeCompletedEvent(ResultEvent): self.repo_state = repo_state or {} self.item_in_branches = item_in_branches or [] self.errors = errors or [] + self.elapsed_time = elapsed_time def __repr__(self): return ('<MergeCompletedEvent job: %s buildset: %s merged: %s ' @@ -6214,6 +6289,7 @@ class MergeCompletedEvent(ResultEvent): "repo_state": dict(self.repo_state), "item_in_branches": list(self.item_in_branches), "errors": list(self.errors), + "elapsed_time": self.elapsed_time, } @classmethod @@ -6228,6 +6304,7 @@ class MergeCompletedEvent(ResultEvent): dict(data.get("repo_state", {})), list(data.get("item_in_branches", [])), list(data.get("errors", [])), + data.get("elapsed_time"), ) @@ -6236,16 +6313,19 @@ class FilesChangesCompletedEvent(ResultEvent): :arg BuildSet build_set: The build_set which is ready. :arg list files: List of files changed. + :arg float elapsed_time: Elapsed time of merge op in seconds. """ - def __init__(self, build_set_uuid, files): + def __init__(self, build_set_uuid, files, elapsed_time): self.build_set_uuid = build_set_uuid self.files = files or [] + self.elapsed_time = elapsed_time def toDict(self): return { "build_set_uuid": self.build_set_uuid, "files": list(self.files), + "elapsed_time": self.elapsed_time, } @classmethod @@ -6253,6 +6333,7 @@ class FilesChangesCompletedEvent(ResultEvent): return cls( data.get("build_set_uuid"), list(data.get("files", [])), + data.get("elapsed_time"), ) @@ -7078,6 +7159,7 @@ class Layout(object): noop = Job('noop') noop.description = 'A job that will always succeed, no operation.' 
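The completion events carry elapsed_time through ZooKeeper as part of their JSON form, and fromDict() reads it with .get(), so events written before the upgrade simply come back with elapsed_time of None. A small round-trip sketch showing only a subset of the real FilesChangesCompletedEvent fields:

    # Round-trip sketch for the new elapsed_time field; only a subset of
    # the real event fields is shown here.
    import json

    class FilesChangesCompletedEvent:
        def __init__(self, build_set_uuid, files, elapsed_time):
            self.build_set_uuid = build_set_uuid
            self.files = files or []
            self.elapsed_time = elapsed_time

        def toDict(self):
            return {'build_set_uuid': self.build_set_uuid,
                    'files': list(self.files),
                    'elapsed_time': self.elapsed_time}

        @classmethod
        def fromDict(cls, data):
            return cls(data.get('build_set_uuid'),
                       list(data.get('files', [])),
                       data.get('elapsed_time'))

    old = json.dumps({'build_set_uuid': 'abc', 'files': []})  # pre-upgrade form
    event = FilesChangesCompletedEvent.fromDict(json.loads(old))
    assert event.elapsed_time is None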
noop.parent = noop.BASE_JOB_MARKER + noop.deduplicate = False noop.run = (PlaybookContext(None, 'noop.yaml', [], []),) self.jobs = {'noop': [noop]} self.nodesets = {} diff --git a/zuul/reporter/__init__.py b/zuul/reporter/__init__.py index 9b8f2c11c..5723316a3 100644 --- a/zuul/reporter/__init__.py +++ b/zuul/reporter/__init__.py @@ -257,9 +257,13 @@ class BaseReporter(object, metaclass=abc.ABCMeta): # Extract the report elements from an item config = self.connection.sched.config jobs_fields = [] + skipped = 0 for job in item.getJobs(): build = item.current_build_set.getBuild(job.name) (result, url) = item.formatJobResult(job) + if result == 'SKIPPED': + skipped += 1 + continue if not job.voting: voting = ' (non-voting)' else: @@ -300,12 +304,15 @@ class BaseReporter(object, metaclass=abc.ABCMeta): success_message = job.success_message jobs_fields.append( (name, url, result, error, elapsed, voting, success_message)) - return jobs_fields + return jobs_fields, skipped def _formatItemReportJobs(self, item): # Return the list of jobs portion of the report ret = '' - jobs_fields = self._getItemReportJobsFields(item) + jobs_fields, skipped = self._getItemReportJobsFields(item) for job_fields in jobs_fields: ret += '- %s%s : %s%s%s%s\n' % job_fields[:6] + if skipped: + jobtext = 'job' if skipped == 1 else 'jobs' + ret += 'Skipped %i %s\n' % (skipped, jobtext) return ret diff --git a/zuul/scheduler.py b/zuul/scheduler.py index 272235757..dfc922cf1 100644 --- a/zuul/scheduler.py +++ b/zuul/scheduler.py @@ -438,12 +438,12 @@ class Scheduler(threading.Thread): mergers_online = 0 for executor_component in self.component_registry.all("executor"): - if executor_component.allow_unzoned or not executor_component.zone: + if executor_component.allow_unzoned: if executor_component.state == BaseComponent.RUNNING: executors_unzoned_online += 1 if executor_component.accepting_work: executors_unzoned_accepting += 1 - else: + if executor_component.zone: zone_stats = zoned_executor_stats.setdefault( executor_component.zone, executor_stats_default.copy()) diff --git a/zuul/zk/nodepool.py b/zuul/zk/nodepool.py index 109053f11..1be4dc2b5 100644 --- a/zuul/zk/nodepool.py +++ b/zuul/zk/nodepool.py @@ -12,6 +12,7 @@ import json import logging +import os import time from enum import Enum from typing import Optional, List @@ -43,7 +44,7 @@ class ZooKeeperNodepool(ZooKeeperBase): Class implementing Nodepool related ZooKeeper interface. """ NODES_ROOT = "/nodepool/nodes" - LAUNCHER_ROOT = "/nodepool/launchers" + COMPONENT_ROOT = "/nodepool/components" REQUEST_ROOT = '/nodepool/requests' REQUEST_LOCK_ROOT = "/nodepool/requests-lock" HOLD_REQUEST_ROOT = '/zuul/hold-requests' @@ -95,9 +96,6 @@ class ZooKeeperNodepool(ZooKeeperBase): self._node_tree.close() self._node_tree = None - def _launcherPath(self, launcher): - return "%s/%s" % (self.LAUNCHER_ROOT, launcher) - def _nodePath(self, node): return "%s/%s" % (self.NODES_ROOT, node) @@ -113,15 +111,15 @@ class ZooKeeperNodepool(ZooKeeperBase): :returns: A list of Launcher objects, or empty list if none are found. 
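With _getItemReportJobsFields() now returning a (fields, skipped) pair, each driver reporter prints a single summary line instead of listing every skipped job. A sketch of how a reporter consumes the new return value, with the per-job field tuple abbreviated to three elements:

    # Sketch of a reporter consuming the (jobs_fields, skipped) pair;
    # the per-job tuple is abbreviated to (name, url, result).
    def format_item_report_jobs(jobs_fields, skipped):
        ret = ''
        for name, url, result in jobs_fields:
            ret += '- %s %s : %s\n' % (name, url, result)
        if skipped:
            jobtext = 'job' if skipped == 1 else 'jobs'
            ret += 'Skipped %i %s\n' % (skipped, jobtext)
        return ret

    print(format_item_report_jobs(
        [('unit-tests', 'https://logs.example.com/1', 'SUCCESS')], skipped=2))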
""" + root_path = os.path.join(self.COMPONENT_ROOT, 'pool') try: - launcher_ids = self.kazoo_client\ - .get_children(self.LAUNCHER_ROOT) + pools = self.kazoo_client.get_children(root_path) except NoNodeError: return [] objs = [] - for launcher in launcher_ids: - path = self._launcherPath(launcher) + for pool in pools: + path = os.path.join(root_path, pool) try: data, _ = self.kazoo_client.get(path) except NoNodeError: diff --git a/zuul/zk/zkobject.py b/zuul/zk/zkobject.py index aa32b8b9b..8de3f34ba 100644 --- a/zuul/zk/zkobject.py +++ b/zuul/zk/zkobject.py @@ -43,6 +43,7 @@ class ZKContext: self.cumulative_write_znodes = 0 self.cumulative_read_bytes = 0 self.cumulative_write_bytes = 0 + self.build_references = False def sessionIsValid(self): return ((not self.lock or self.lock.is_still_valid()) and |