summaryrefslogtreecommitdiff
path: root/lib/gitlab/database/consistency_checker.rb
blob: e398fef744cc736f0e29c95c94bfff20e02d7120 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
# frozen_string_literal: true

module Gitlab
  module Database
    # Compares rows of a source table against rows of a target table and
    # reports the ones that differ.
    #
    # Both tables are read in batches of BATCH_SIZE consecutive values of the
    # sort column (the first entry of `source_columns` / `target_columns`).
    # The remaining columns are the values that get compared.
    #
    # NOTE: the counters live in an instance-level hash, so calling #execute
    # more than once on the same instance keeps adding to the same totals.
    class ConsistencyChecker
      BATCH_SIZE = 1000
      MAX_BATCHES = 25
      MAX_RUNTIME = 30.seconds # must be less than the scheduling frequency of the ConsistencyCheck jobs

      delegate :monotonic_time, to: :'Gitlab::Metrics::System'

      # @param source_model [Class] model the source rows are read from
      # @param target_model [Class] model the target rows are read from
      # @param source_columns [Array] columns plucked from the source model;
      #   the first one is used as the sort/lookup column
      # @param target_columns [Array] columns plucked from the target model;
      #   the first one is used as the sort/lookup column
      def initialize(source_model:, target_model:, source_columns:, target_columns:)
        @source_model = source_model
        @target_model = target_model
        @source_columns = source_columns
        @target_columns = target_columns
        @source_sort_column = source_columns.first
        @target_sort_column = target_columns.first
        @result = { matches: 0, mismatches: 0, batches: 0, mismatches_details: [] }
      end

      # Checks up to MAX_BATCHES batches starting at `start_id`, stopping early
      # once the source table's maximum sort value has been passed or
      # MAX_RUNTIME has elapsed.
      #
      # @param start_id [Integer] first sort-column value to check
      # @return [Hash] `:next_start_id` plus `:matches`, `:mismatches`,
      #   `:batches` and `:mismatches_details`. `:next_start_id` is nil when
      #   the source table has no rows, wraps around to the minimum sort value
      #   once the maximum has been passed, and otherwise is where the next
      #   run should resume.
      # rubocop:disable Metrics/AbcSize
      def execute(start_id:)
        current_start_id = start_id

        return build_result(next_start_id: nil) if max_id.nil?
        return build_result(next_start_id: min_id) if current_start_id > max_id

        @start_time = monotonic_time

        MAX_BATCHES.times do
          break if current_start_id > max_id || over_time_limit?

          # Exclusive range: covers exactly BATCH_SIZE consecutive ids
          ids_range = current_start_id...(current_start_id + BATCH_SIZE)
          # rubocop: disable CodeReuse/ActiveRecord
          source_data = source_model.where(source_sort_column => ids_range)
                          .order(source_sort_column => :asc).pluck(*source_columns)
          target_data = target_model.where(target_sort_column => ids_range)
                          .order(target_sort_column => :asc).pluck(*target_columns)
          # rubocop: enable CodeReuse/ActiveRecord

          current_start_id += BATCH_SIZE
          result[:matches] += append_mismatches_details(source_data, target_data)
          result[:batches] += 1
        end

        result[:mismatches] = result[:mismatches_details].length
        metrics_counter.increment({ source_table: source_model.table_name, result: "match" }, result[:matches])
        metrics_counter.increment({ source_table: source_model.table_name, result: "mismatch" }, result[:mismatches])

        build_result(next_start_id: current_start_id > max_id ? min_id : current_start_id)
      end
      # rubocop:enable Metrics/AbcSize

      private

      attr_reader :source_model, :target_model, :source_columns, :target_columns,
                  :source_sort_column, :target_sort_column, :start_time, :result

      # Returns the accumulated result hash together with the given resume point.
      def build_result(next_start_id:)
        { next_start_id: next_start_id }.merge(result)
      end

      # True once MAX_RUNTIME has elapsed since the batch loop started.
      def over_time_limit?
        (monotonic_time - start_time) >= MAX_RUNTIME
      end

      # This is where the items are compared and the diff log is built.
      # Every row is expected to start with its sort key, followed by the
      # compared values. Neither `source_data` nor `target_data` is modified.
      #
      # @return [Integer] the number of source rows that have an identical
      #   row in the target data
      def append_mismatches_details(source_data, target_data)
        # Map the sort key to the item values for each side of the difference.
        # Both differences are computed before any indexing so that they are
        # taken over the untouched rows.
        source_only_rows = source_data - target_data
        target_only_rows = target_data - source_data

        source_diff_hash = index_by_sort_key(source_only_rows)
        target_diff_hash = index_by_sort_key(target_only_rows)

        matches = source_data.length - source_diff_hash.length

        # Items that exist only in the source table + items that differ
        source_diff_hash.each do |id, values|
          result[:mismatches_details] << {
            id: id,
            source_table: values,
            target_table: target_diff_hash[id]
          }
        end

        # Items that exist only in the target table
        target_diff_hash.each do |id, values|
          next if source_diff_hash.key?(id) # It's already added

          result[:mismatches_details] << {
            id: id,
            source_table: nil, # no differing source row has this id
            target_table: values
          }
        end

        matches
      end

      # Builds { sort_key => [remaining values] } without mutating the rows.
      # When several rows share a sort key, the last one wins.
      def index_by_sort_key(rows)
        rows.to_h { |sort_key, *values| [sort_key, values] }
      end

      # rubocop: disable CodeReuse/ActiveRecord
      # Smallest value of the sort column in the source table (memoized).
      def min_id
        @min_id ||= source_model.minimum(source_sort_column)
      end

      # Largest value of the sort column in the source table (memoized).
      def max_id
        @max_id ||= source_model.maximum(source_sort_column)
      end
      # rubocop: enable CodeReuse/ActiveRecord

      # Counter used to report the number of matching and mismatching rows.
      def metrics_counter
        @metrics_counter ||= Gitlab::Metrics.counter(
          :consistency_checks,
          "Consistency Check Results"
        )
      end
    end
  end
end