summaryrefslogtreecommitdiff
path: root/lib/gitlab/database/loose_index_scan_distinct_count.rb
blob: 884f4d47ff81dcf97b19d7836310f60ee473a445 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
# frozen_string_literal: true

module Gitlab
  module Database
    # This class builds efficient batched distinct query by using loose index scan.
    # Consider the following example:
    # > Issue.distinct(:project_id).where(project_id: (1...100)).count
    #
    # Note: there is an index on project_id
    #
    # This query will read each element in the index matching the project_id filter.
    # If for a project_id has 100_000 issues, all 100_000 elements will be read.
    #
    # A loose index scan will read only one entry from the index for each project_id to reduce the number of disk reads.
    #
    # Usage:
    #
    # Gitlab::Database::LooseIndexScanDisctinctCount.new(Issue, :project_id).count(from: 1, to: 100)
    #
    # The query will return the number of distinct projects_ids between 1 and 100
    #
    # Getting the Arel query:
    #
    # Gitlab::Database::LooseIndexScanDisctinctCount.new(Issue, :project_id).build_query(from: 1, to: 100)
    class LooseIndexScanDistinctCount
      COLUMN_ALIAS = 'distinct_count_column'

      ColumnConfigurationError = Class.new(StandardError)

      def initialize(scope, column)
        if scope.is_a?(ActiveRecord::Relation)
          @scope = scope
          @model = scope.model
        else
          @scope = scope.where({})
          @model = scope
        end

        @column = transform_column(column)
      end

      def count(from:, to:)
        build_query(from: from, to: to).count(COLUMN_ALIAS)
      end

      def build_query(from:, to:) # rubocop:disable Metrics/AbcSize
        cte = Gitlab::SQL::RecursiveCTE.new(:counter_cte, union_args: { remove_order: false })
        table = model.arel_table

        cte << @scope
          .dup
          .select(column.as(COLUMN_ALIAS))
          .where(column.gteq(from))
          .where(column.lt(to))
          .order(column)
          .limit(1)

        inner_query = @scope
          .dup
          .where(column.gt(cte.table[COLUMN_ALIAS]))
          .where(column.lt(to))
          .select(column.as(COLUMN_ALIAS))
          .order(column)
          .limit(1)

        cte << cte.table
          .project(Arel::Nodes::Grouping.new(Arel.sql(inner_query.to_sql)).as(COLUMN_ALIAS))
          .where(cte.table[COLUMN_ALIAS].lt(to))

        model
          .with
          .recursive(cte.to_arel)
          .from(cte.alias_to(table))
          .unscope(where: :source_type)
          .unscope(where: model.inheritance_column) # Remove STI query, not needed here
      end

      private

      attr_reader :column, :model

      # Transforms the column so it can be used in Arel expressions
      #
      # 'table.column' => 'table.column'
      # 'column' => 'table_name.column'
      # :column => 'table_name.column'
      # Arel::Attributes::Attribute => name of the column
      def transform_column(column)
        if column.is_a?(String) || column.is_a?(Symbol)
          column_as_string = column.to_s
          column_as_string = "#{model.table_name}.#{column_as_string}" unless column_as_string.include?('.')

          Arel.sql(column_as_string)
        elsif column.is_a?(Arel::Attributes::Attribute)
          column
        else
          raise ColumnConfigurationError.new("Cannot transform the column: #{column.inspect}, please provide the column name as string")
        end
      end
    end
  end
end