1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
|
# frozen_string_literal: true
# For large tables, PostgreSQL can take a long time to count rows due to MVCC.
# Implements a distinct and ordinary batch counter
# Needs indexes on the column below to calculate max, min and range queries
# For larger tables just set use higher batch_size with index optimization
#
# In order to not use a possible complex time consuming query when calculating min and max for batch_distinct_count
# the start and finish can be sent specifically
#
# Grouped relations can be used as well. However, the preferred batch count should be around 10K because group by count is more expensive.
#
# See https://gitlab.com/gitlab-org/gitlab/-/merge_requests/22705
#
# Examples:
# extend ::Gitlab::Database::BatchCount
# batch_count(User.active)
# batch_count(::Clusters::Cluster.aws_installed.enabled, :cluster_id)
# batch_count(Namespace.group(:type))
# batch_distinct_count(::Project, :creator_id)
# batch_distinct_count(::Project.with_active_services.service_desk_enabled.where(time_period), start: ::User.minimum(:id), finish: ::User.maximum(:id))
# batch_distinct_count(Project.group(:visibility_level), :creator_id)
# batch_sum(User, :sign_in_count)
# batch_sum(Issue.group(:state_id), :weight))
module Gitlab
module Database
module BatchCount
def batch_count(relation, column = nil, batch_size: nil, start: nil, finish: nil)
BatchCounter.new(relation, column: column).count(batch_size: batch_size, start: start, finish: finish)
end
def batch_distinct_count(relation, column = nil, batch_size: nil, start: nil, finish: nil)
BatchCounter.new(relation, column: column).count(mode: :distinct, batch_size: batch_size, start: start, finish: finish)
end
def batch_sum(relation, column, batch_size: nil, start: nil, finish: nil)
BatchCounter.new(relation, column: nil, operation: :sum, operation_args: [column]).count(batch_size: batch_size, start: start, finish: finish)
end
class << self
include BatchCount
end
end
class BatchCounter
FALLBACK = -1
MIN_REQUIRED_BATCH_SIZE = 1_250
DEFAULT_SUM_BATCH_SIZE = 1_000
MAX_ALLOWED_LOOPS = 10_000
SLEEP_TIME_IN_SECONDS = 0.01 # 10 msec sleep
ALLOWED_MODES = [:itself, :distinct].freeze
# Each query should take < 500ms https://gitlab.com/gitlab-org/gitlab/-/merge_requests/22705
DEFAULT_DISTINCT_BATCH_SIZE = 10_000
DEFAULT_BATCH_SIZE = 100_000
def initialize(relation, column: nil, operation: :count, operation_args: nil)
@relation = relation
@column = column || relation.primary_key
@operation = operation
@operation_args = operation_args
end
def unwanted_configuration?(finish, batch_size, start)
(@operation == :count && batch_size <= MIN_REQUIRED_BATCH_SIZE) ||
(@operation == :sum && batch_size < DEFAULT_SUM_BATCH_SIZE) ||
(finish - start) / batch_size >= MAX_ALLOWED_LOOPS ||
start > finish
end
def count(batch_size: nil, mode: :itself, start: nil, finish: nil)
raise 'BatchCount can not be run inside a transaction' if ActiveRecord::Base.connection.transaction_open?
check_mode!(mode)
# non-distinct have better performance
batch_size ||= batch_size_for_mode_and_operation(mode, @operation)
start = actual_start(start)
finish = actual_finish(finish)
raise "Batch counting expects positive values only for #{@column}" if start < 0 || finish < 0
return FALLBACK if unwanted_configuration?(finish, batch_size, start)
results = nil
batch_start = start
while batch_start <= finish
batch_relation = build_relation_batch(batch_start, batch_start + batch_size, mode)
begin
results = merge_results(results, batch_relation.send(@operation, *@operation_args)) # rubocop:disable GitlabSecurity/PublicSend
batch_start += batch_size
rescue ActiveRecord::QueryCanceled => error
# retry with a safe batch size & warmer cache
if batch_size >= 2 * MIN_REQUIRED_BATCH_SIZE
batch_size /= 2
else
log_canceled_batch_fetch(batch_start, mode, batch_relation.to_sql, error)
return FALLBACK
end
end
sleep(SLEEP_TIME_IN_SECONDS)
end
results
end
def merge_results(results, object)
return object unless results
if object.is_a?(Hash)
results.merge!(object) { |_, a, b| a + b }
else
results + object
end
end
private
def build_relation_batch(start, finish, mode)
@relation.select(@column).public_send(mode).where(between_condition(start, finish)) # rubocop:disable GitlabSecurity/PublicSend
end
def batch_size_for_mode_and_operation(mode, operation)
return DEFAULT_SUM_BATCH_SIZE if operation == :sum
mode == :distinct ? DEFAULT_DISTINCT_BATCH_SIZE : DEFAULT_BATCH_SIZE
end
def between_condition(start, finish)
return @column.between(start...finish) if @column.is_a?(Arel::Attributes::Attribute)
{ @column => start...finish }
end
def actual_start(start)
start || @relation.unscope(:group, :having).minimum(@column) || 0
end
def actual_finish(finish)
finish || @relation.unscope(:group, :having).maximum(@column) || 0
end
def check_mode!(mode)
raise "The mode #{mode.inspect} is not supported" unless ALLOWED_MODES.include?(mode)
raise 'Use distinct count for optimized distinct counting' if @relation.limit(1).distinct_value.present? && mode != :distinct
raise 'Use distinct count only with non id fields' if @column == :id && mode == :distinct
end
def log_canceled_batch_fetch(batch_start, mode, query, error)
Gitlab::AppJsonLogger
.error(
event: 'batch_count',
relation: @relation.table_name,
operation: @operation,
operation_args: @operation_args,
start: batch_start,
mode: mode,
query: query,
message: "Query has been canceled with message: #{error.message}"
)
end
end
end
end
|