summaryrefslogtreecommitdiff
path: root/lib/gitlab/ci/queue/metrics.rb
blob: 7ecb9a1db167ea26d568c961b857131faab9cae7 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
# frozen_string_literal: true

module Gitlab
  module Ci
    module Queue
      class Metrics
        extend Gitlab::Utils::StrongMemoize

        # Bucket boundaries passed to the job_queue_duration_seconds histogram.
        QUEUE_DURATION_SECONDS_BUCKETS = [
          1, 3, 10, 30, 60, 300, 900, 1800, 3600
        ].freeze

        # Bucket boundaries passed to the gitlab_ci_queue_active_runners_total histogram.
        # NOTE(review): same values as the duration buckets above although this
        # histogram observes a runner count - confirm these boundaries are intended.
        QUEUE_ACTIVE_RUNNERS_BUCKETS = [
          1, 3, 10, 30, 60, 300, 900, 1800, 3600
        ].freeze

        # Bucket boundaries passed to the gitlab_ci_queue_depth_total histogram.
        QUEUE_DEPTH_TOTAL_BUCKETS = [
          1, 2, 3, 5, 8, 16, 32, 50, 100, 250, 500, 1000, 2000, 5000
        ].freeze

        # Bucket boundaries passed to the gitlab_ci_queue_size_total histogram.
        QUEUE_SIZE_TOTAL_BUCKETS = [
          1, 5, 10, 50, 100, 500, 1000, 2000, 5000, 7500, 10_000, 15_000, 20_000
        ].freeze

        # Bucket boundaries shared by the queue iteration and queue retrieval
        # duration histograms.
        QUEUE_PROCESSING_DURATION_SECONDS_BUCKETS = [
          0.01, 0.05, 0.1, 0.3, 0.5, 1, 5, 10, 30, 60, 180, 300
        ].freeze

        # Runner tags starting with this prefix select the `shard` metric label.
        METRICS_SHARD_TAG_PREFIX = 'metrics_shard::'
        # `shard` label value used when no shard tag applies.
        DEFAULT_METRICS_SHARD = 'default'
        # Running-job counts at or above this value are reported as "<value>+".
        JOBS_RUNNING_FOR_PROJECT_MAX_BUCKET = 5

        # Operation names accepted by #increment_queue_operation; outside of
        # production any other name raises ArgumentError there.
        OPERATION_COUNTERS = %i[
          build_can_pick
          build_not_pick
          build_not_pending
          build_temporary_locked
          build_conflict_lock
          build_conflict_exception
          build_conflict_transition
          queue_attempt
          queue_conflict
          queue_iteration
          queue_depth_limit
          queue_replication_lag
          runner_pre_assign_checks_failed
          runner_pre_assign_checks_success
          runner_queue_tick
        ].to_set.freeze

        # `queue` label values accepted by #observe_queue_depth; outside of
        # production any other value raises ArgumentError there.
        QUEUE_DEPTH_HISTOGRAMS = %i[found not_found conflict].to_set.freeze

        # The runner these metrics are recorded for; read by #register_success
        # and #jobs_running_for_project.
        attr_reader :runner

        # @param runner [Object] runner whose queue activity is being measured.
        #   #register_success calls #instance_type? and #tag_list on it.
        def initialize(runner)
          @runner = runner
        end

        # Records a failed job registration attempt by incrementing the
        # failed-attempts counter and then the overall attempts counter.
        #
        # @return [Object] whatever the attempts counter's #increment returns
        def register_failure
          metrics = self.class

          metrics.failed_attempt_counter.increment
          metrics.attempt_counter.increment
        end

        # Records a successful job registration: observes how long the job
        # waited in the queue and increments the attempts counter.
        #
        # The histogram observation carries three labels:
        # - shared_runner: result of runner.instance_type?
        # - jobs_running_for_project: result of #jobs_running_for_project
        # - shard: DEFAULT_METRICS_SHARD, unless the runner is instance-type and
        #   has a tag starting with METRICS_SHARD_TAG_PREFIX; then the remainder
        #   of the alphabetically first such tag is used.
        #
        # The duration is not observed when job.queued_at is nil.
        #
        # @param job [Object] the job that was assigned; #queued_at is read here
        def register_success(job)
          shared_runner = runner.instance_type?

          labels = { shared_runner: shared_runner,
                     jobs_running_for_project: jobs_running_for_project(job),
                     shard: DEFAULT_METRICS_SHARD }

          if shared_runner
            shard = runner.tag_list.sort.find { |name| name.starts_with?(METRICS_SHARD_TAG_PREFIX) }

            # Strip only the leading prefix. The previous gsub also removed any
            # later occurrence of the prefix inside the tag, mangling the label.
            labels[:shard] = shard.delete_prefix(METRICS_SHARD_TAG_PREFIX) if shard
          end

          self.class.job_queue_duration_seconds.observe(labels, Time.current - job.queued_at) unless job.queued_at.nil?
          self.class.attempt_counter.increment
        end

        # rubocop: disable CodeReuse/ActiveRecord
        # Computes the `jobs_running_for_project` label value for a job.
        #
        # Returns '+Inf' when the runner is not instance-type. Otherwise counts
        # the project's running builds on instance-type runners (the query is
        # limited to JOBS_RUNNING_FOR_PROJECT_MAX_BUCKET + 1 rows), subtracts one
        # to exclude the job that was just started, and reports
        # "<JOBS_RUNNING_FOR_PROJECT_MAX_BUCKET>+" once the cap is reached.
        #
        # @param job [Object] job whose project builds are counted
        # @return [Integer, String]
        def jobs_running_for_project(job)
          return '+Inf' unless runner.instance_type?

          shared_runner_builds = job.project.builds.running.where(runner: ::Ci::Runner.instance_type)
          capped_total = shared_runner_builds.limit(JOBS_RUNNING_FOR_PROJECT_MAX_BUCKET + 1).count

          # excluding currently started job
          other_running = capped_total - 1

          if other_running < JOBS_RUNNING_FOR_PROJECT_MAX_BUCKET
            other_running
          else
            "#{JOBS_RUNNING_FOR_PROJECT_MAX_BUCKET}+"
          end
        end
        # rubocop: enable CodeReuse/ActiveRecord

        # Increments the queue operations counter with an `operation` label.
        #
        # @param operation [Symbol] expected to be one of OPERATION_COUNTERS
        # @raise [ArgumentError] outside of production, when the operation is
        #   not listed in OPERATION_COUNTERS
        def increment_queue_operation(operation)
          # Unknown operations are rejected everywhere except production.
          strict = !Rails.env.production?
          raise ArgumentError, "unknown queue operation: #{operation}" if strict && !OPERATION_COUNTERS.include?(operation)

          counter = self.class.queue_operations_total
          counter.increment(operation: operation)
        end

        # Observes a queue depth value on the queue depth histogram, labelled
        # with `queue`. Does nothing (returns nil) unless the
        # :gitlab_ci_builds_queuing_metrics feature flag is enabled.
        #
        # @param queue [Symbol] expected to be one of QUEUE_DEPTH_HISTOGRAMS
        # @param size [#to_f] depth to record
        # @raise [ArgumentError] outside of production, when the flag is enabled
        #   and `queue` is not listed in QUEUE_DEPTH_HISTOGRAMS
        def observe_queue_depth(queue, size)
          return unless Feature.enabled?(:gitlab_ci_builds_queuing_metrics, default_enabled: false)

          # Unknown labels are rejected everywhere except production.
          strict = !Rails.env.production?
          raise ArgumentError, "unknown queue depth label: #{queue}" if strict && !QUEUE_DEPTH_HISTOGRAMS.include?(queue)

          histogram = self.class.queue_depth_total
          histogram.observe({ queue: queue }, size.to_f)
        end

        # Observes the queue size on the queue size histogram, labelled with
        # `runner_type`. Does nothing (returns nil) unless the
        # :gitlab_ci_builds_queuing_metrics feature flag is enabled; in that
        # case `size_proc` is never called.
        #
        # @param size_proc [#call] returns the size; the result is converted with #to_f
        # @param runner_type [Object] value for the `runner_type` label
        def observe_queue_size(size_proc, runner_type)
          return unless Feature.enabled?(:gitlab_ci_builds_queuing_metrics, default_enabled: false)

          histogram = self.class.queue_size_total
          labels = { runner_type: runner_type }

          histogram.observe(labels, size_proc.call.to_f)
        end

        # Runs the given block, measures its duration with the monotonic clock
        # and, when the :gitlab_ci_builds_queuing_metrics feature flag is
        # enabled, observes it on the histogram selected by `metric`:
        #
        # - :process  -> queue iteration duration
        # - :retrieve -> queue retrieval duration
        #
        # The block always runs, and its result is returned, regardless of the
        # feature flag. The metric name is only checked after the block has run
        # and only when the flag is enabled.
        #
        # @param metric [Symbol] :process or :retrieve
        # @param runner_type [Object] value for the `runner_type` label
        # @yield the work to be timed
        # @return [Object] the block's return value
        # @raise [ArgumentError] outside of production, when the flag is enabled
        #   and `metric` is neither :process nor :retrieve
        def observe_queue_time(metric, runner_type)
          start_time = ::Gitlab::Metrics::System.monotonic_time

          result = yield

          return result unless Feature.enabled?(:gitlab_ci_builds_queuing_metrics, default_enabled: false)

          seconds = ::Gitlab::Metrics::System.monotonic_time - start_time

          case metric
          when :process
            self.class.queue_iteration_duration_seconds.observe({ runner_type: runner_type }, seconds.to_f)
          when :retrieve
            self.class.queue_retrieval_duration_seconds.observe({ runner_type: runner_type }, seconds.to_f)
          else
            # Same message style as the other validators in this class, so the
            # offending value shows up in the backtrace.
            raise ArgumentError, "unknown queue time metric: #{metric}" unless Rails.env.production?
          end

          result
        end

        # Observes the number of active runners on the active runners histogram
        # (no labels). Does nothing (returns nil) unless the
        # :gitlab_ci_builds_queuing_metrics feature flag is enabled; in that
        # case `runners_proc` is never called.
        #
        # @param runners_proc [#call] returns the count; the result is converted with #to_f
        def self.observe_active_runners(runners_proc)
          return unless Feature.enabled?(:gitlab_ci_builds_queuing_metrics, default_enabled: false)

          histogram = queue_active_runners_total
          no_labels = {}

          histogram.observe(no_labels, runners_proc.call.to_f)
        end

        # Convenience wrapper: builds a metrics object for `runner` and
        # increments the :runner_queue_tick queue operation on it.
        #
        # @param runner [Object] passed straight to .new
        def self.increment_runner_tick(runner)
          metrics = new(runner)
          metrics.increment_queue_operation(:runner_queue_tick)
        end

        # Counter of failed job registration attempts
        # (:job_register_attempts_failed_total), created through
        # Gitlab::Metrics.counter inside strong_memoize(:failed_attempt_counter).
        def self.failed_attempt_counter
          strong_memoize(:failed_attempt_counter) do
            name = :job_register_attempts_failed_total
            # The help text used to be a copy of the attempt counter's
            # ("tries to register"), which misdescribed this metric.
            comment = 'Counts the times a runner fails to register a job'

            Gitlab::Metrics.counter(name, comment)
          end
        end

        # Counter of all job registration attempts
        # (:job_register_attempts_total), created through
        # Gitlab::Metrics.counter inside strong_memoize(:attempt_counter).
        def self.attempt_counter
          strong_memoize(:attempt_counter) do
            Gitlab::Metrics.counter(
              :job_register_attempts_total,
              'Counts the times a runner tries to register a job'
            )
          end
        end

        # Histogram observed by #register_success with the time a job spent
        # queued (:job_queue_duration_seconds), created through
        # Gitlab::Metrics.histogram inside
        # strong_memoize(:job_queue_duration_seconds).
        #
        # NOTE(review): the help text below reads like a copy from a request
        # metric - confirm whether it should describe job queueing instead.
        def self.job_queue_duration_seconds
          strong_memoize(:job_queue_duration_seconds) do
            Gitlab::Metrics.histogram(
              :job_queue_duration_seconds,
              'Request handling execution time',
              {},
              QUEUE_DURATION_SECONDS_BUCKETS
            )
          end
        end

        # Counter incremented by #increment_queue_operation
        # (:gitlab_ci_queue_operations_total), created through
        # Gitlab::Metrics.counter inside strong_memoize(:queue_operations_total).
        def self.queue_operations_total
          strong_memoize(:queue_operations_total) do
            Gitlab::Metrics.counter(
              :gitlab_ci_queue_operations_total,
              'Counts all the operations that are happening inside a queue'
            )
          end
        end

        # Histogram observed by #observe_queue_depth
        # (:gitlab_ci_queue_depth_total), created through
        # Gitlab::Metrics.histogram inside strong_memoize(:queue_depth_total).
        def self.queue_depth_total
          strong_memoize(:queue_depth_total) do
            Gitlab::Metrics.histogram(
              :gitlab_ci_queue_depth_total,
              'Size of a CI/CD builds queue in relation to the operation result',
              {},
              QUEUE_DEPTH_TOTAL_BUCKETS
            )
          end
        end

        # Histogram observed by #observe_queue_size
        # (:gitlab_ci_queue_size_total), created through
        # Gitlab::Metrics.histogram inside strong_memoize(:queue_size_total).
        def self.queue_size_total
          strong_memoize(:queue_size_total) do
            Gitlab::Metrics.histogram(
              :gitlab_ci_queue_size_total,
              'Size of initialized CI/CD builds queue',
              {},
              QUEUE_SIZE_TOTAL_BUCKETS
            )
          end
        end

        # Histogram observed by #observe_queue_time for the :process metric
        # (:gitlab_ci_queue_iteration_duration_seconds), created through
        # Gitlab::Metrics.histogram inside
        # strong_memoize(:queue_iteration_duration_seconds).
        def self.queue_iteration_duration_seconds
          strong_memoize(:queue_iteration_duration_seconds) do
            Gitlab::Metrics.histogram(
              :gitlab_ci_queue_iteration_duration_seconds,
              'Time it takes to find a build in CI/CD queue',
              {},
              QUEUE_PROCESSING_DURATION_SECONDS_BUCKETS
            )
          end
        end

        # Histogram observed by #observe_queue_time for the :retrieve metric
        # (:gitlab_ci_queue_retrieval_duration_seconds), created through
        # Gitlab::Metrics.histogram inside
        # strong_memoize(:queue_retrieval_duration_seconds).
        def self.queue_retrieval_duration_seconds
          strong_memoize(:queue_retrieval_duration_seconds) do
            Gitlab::Metrics.histogram(
              :gitlab_ci_queue_retrieval_duration_seconds,
              'Time it takes to execute a SQL query to retrieve builds queue',
              {},
              QUEUE_PROCESSING_DURATION_SECONDS_BUCKETS
            )
          end
        end

        # Histogram observed by .observe_active_runners
        # (:gitlab_ci_queue_active_runners_total), created through
        # Gitlab::Metrics.histogram inside
        # strong_memoize(:queue_active_runners_total).
        def self.queue_active_runners_total
          strong_memoize(:queue_active_runners_total) do
            Gitlab::Metrics.histogram(
              :gitlab_ci_queue_active_runners_total,
              'The amount of active runners that can process queue in a project',
              {},
              QUEUE_ACTIVE_RUNNERS_BUCKETS
            )
          end
        end
      end
    end
  end
end