summaryrefslogtreecommitdiff
path: root/lib/gitlab/pagination/keyset/in_operator_optimization
diff options
context:
space:
mode:
Diffstat (limited to 'lib/gitlab/pagination/keyset/in_operator_optimization')
-rw-r--r--lib/gitlab/pagination/keyset/in_operator_optimization/array_scope_columns.rb59
-rw-r--r--lib/gitlab/pagination/keyset/in_operator_optimization/column_data.rb39
-rw-r--r--lib/gitlab/pagination/keyset/in_operator_optimization/order_by_columns.rb76
-rw-r--r--lib/gitlab/pagination/keyset/in_operator_optimization/query_builder.rb290
4 files changed, 464 insertions, 0 deletions
diff --git a/lib/gitlab/pagination/keyset/in_operator_optimization/array_scope_columns.rb b/lib/gitlab/pagination/keyset/in_operator_optimization/array_scope_columns.rb
new file mode 100644
index 00000000000..95afd5a8595
--- /dev/null
+++ b/lib/gitlab/pagination/keyset/in_operator_optimization/array_scope_columns.rb
@@ -0,0 +1,59 @@
+# frozen_string_literal: true
+
+module Gitlab
+ module Pagination
+ module Keyset
+ module InOperatorOptimization
+ class ArrayScopeColumns
+ ARRAY_SCOPE_CTE_NAME = 'array_cte'
+
+ def initialize(columns)
+ validate_columns!(columns)
+
+ array_scope_table = Arel::Table.new(ARRAY_SCOPE_CTE_NAME)
+ @columns = columns.map do |column|
+ ColumnData.new(column, "array_scope_#{column}", array_scope_table)
+ end
+ end
+
+ def array_scope_cte_name
+ ARRAY_SCOPE_CTE_NAME
+ end
+
+ def array_aggregated_columns
+ columns.map(&:array_aggregated_column)
+ end
+
+ def array_aggregated_column_names
+ columns.map(&:array_aggregated_column_name)
+ end
+
+ def arel_columns
+ columns.map(&:arel_column)
+ end
+
+ def array_lookup_expressions_by_position(table_name)
+ columns.map do |column|
+ Arel.sql("#{table_name}.#{column.array_aggregated_column_name}[position]")
+ end
+ end
+
+ private
+
+ attr_reader :columns
+
+ def validate_columns!(columns)
+ if columns.blank?
+ msg = <<~MSG
+ No array columns were given.
+ Make sure you explicitly select the columns in the array_scope parameter.
+ Example: Project.select(:id)
+ MSG
+ raise StandardError, msg
+ end
+ end
+ end
+ end
+ end
+ end
+end
diff --git a/lib/gitlab/pagination/keyset/in_operator_optimization/column_data.rb b/lib/gitlab/pagination/keyset/in_operator_optimization/column_data.rb
new file mode 100644
index 00000000000..3f620f74eca
--- /dev/null
+++ b/lib/gitlab/pagination/keyset/in_operator_optimization/column_data.rb
@@ -0,0 +1,39 @@
+# frozen_string_literal: true
+
+module Gitlab
+ module Pagination
+ module Keyset
+ module InOperatorOptimization
+ class ColumnData
+ attr_reader :original_column_name, :as, :arel_table
+
+ def initialize(original_column_name, as, arel_table)
+ @original_column_name = original_column_name.to_s
+ @as = as.to_s
+ @arel_table = arel_table
+ end
+
+ def projection
+ arel_column.as(as)
+ end
+
+ def arel_column
+ arel_table[original_column_name]
+ end
+
+ def arel_column_as
+ arel_table[as]
+ end
+
+ def array_aggregated_column_name
+ "#{arel_table.name}_#{original_column_name}_array"
+ end
+
+ def array_aggregated_column
+ Arel::Nodes::NamedFunction.new('ARRAY_AGG', [arel_column]).as(array_aggregated_column_name)
+ end
+ end
+ end
+ end
+ end
+end
diff --git a/lib/gitlab/pagination/keyset/in_operator_optimization/order_by_columns.rb b/lib/gitlab/pagination/keyset/in_operator_optimization/order_by_columns.rb
new file mode 100644
index 00000000000..d8c69a74e6b
--- /dev/null
+++ b/lib/gitlab/pagination/keyset/in_operator_optimization/order_by_columns.rb
@@ -0,0 +1,76 @@
+# frozen_string_literal: true
+
+module Gitlab
+ module Pagination
+ module Keyset
+ module InOperatorOptimization
+ class OrderByColumns
+ include Enumerable
+
+ # This class exposes collection methods for the order by columns
+ #
+ # Example: by modelling the `issues.created_at ASC, issues.id ASC` ORDER BY
+ # SQL clause, this class will receive two ColumnOrderDefinition objects
+ def initialize(columns, arel_table)
+ @columns = columns.map do |column|
+ ColumnData.new(column.attribute_name, "order_by_columns_#{column.attribute_name}", arel_table)
+ end
+ end
+
+ def arel_columns
+ columns.map(&:arel_column)
+ end
+
+ def array_aggregated_columns
+ columns.map(&:array_aggregated_column)
+ end
+
+ def array_aggregated_column_names
+ columns.map(&:array_aggregated_column_name)
+ end
+
+ def original_column_names
+ columns.map(&:original_column_name)
+ end
+
+ def original_column_names_as_arel_string
+ columns.map { |c| Arel.sql(c.original_column_name) }
+ end
+
+ def original_column_names_as_tmp_tamble
+ temp_table = Arel::Table.new('record')
+ original_column_names.map { |c| temp_table[c] }
+ end
+
+ def cursor_values(table_name)
+ columns.each_with_object({}) do |column, hash|
+ hash[column.original_column_name] = Arel.sql("#{table_name}.#{column.array_aggregated_column_name}[position]")
+ end
+ end
+
+ def array_lookup_expressions_by_position(table_name)
+ columns.map do |column|
+ Arel.sql("#{table_name}.#{column.array_aggregated_column_name}[position]")
+ end
+ end
+
+ def replace_value_in_array_by_position_expressions
+ columns.map do |column|
+ name = "#{QueryBuilder::RECURSIVE_CTE_NAME}.#{column.array_aggregated_column_name}"
+ new_value = "next_cursor_values.#{column.original_column_name}"
+ "#{name}[:position_query.position-1]||#{new_value}||#{name}[position_query.position+1:]"
+ end
+ end
+
+ def each(&block)
+ columns.each(&block)
+ end
+
+ private
+
+ attr_reader :columns
+ end
+ end
+ end
+ end
+end
diff --git a/lib/gitlab/pagination/keyset/in_operator_optimization/query_builder.rb b/lib/gitlab/pagination/keyset/in_operator_optimization/query_builder.rb
new file mode 100644
index 00000000000..39d6e016ac7
--- /dev/null
+++ b/lib/gitlab/pagination/keyset/in_operator_optimization/query_builder.rb
@@ -0,0 +1,290 @@
+# frozen_string_literal: true
+
+module Gitlab
+ module Pagination
+ module Keyset
+ module InOperatorOptimization
+ # rubocop: disable CodeReuse/ActiveRecord
+ class QueryBuilder
+ UnsupportedScopeOrder = Class.new(StandardError)
+
+ RECURSIVE_CTE_NAME = 'recursive_keyset_cte'
+ RECORDS_COLUMN = 'records'
+
+ # This class optimizes slow database queries (PostgreSQL specific) where the
+ # IN SQL operator is used with sorting.
+ #
+ # Arguments:
+ # scope - ActiveRecord::Relation supporting keyset pagination
+ # array_scope - ActiveRecord::Relation for the `IN` subselect
+ # array_mapping_scope - Lambda for connecting scope with array_scope
+ # finder_query - ActiveRecord::Relation for finding one row by the passed in cursor values
+ # values - keyset cursor values (optional)
+ #
+ # Example ActiveRecord query: Issues in the namespace hierarchy
+ # > scope = Issue
+ # > .where(project_id: Group.find(9970).all_projects.select(:id))
+ # > .order(:created_at, :id)
+ # > .limit(20);
+ #
+ # Optimized version:
+ #
+ # > scope = Issue.where({}).order(:created_at, :id) # base scope
+ # > array_scope = Group.find(9970).all_projects.select(:id)
+ # > array_mapping_scope = -> (id_expression) { Issue.where(Issue.arel_table[:project_id].eq(id_expression)) }
+ #
+ # # finding the record by id is good enough, we can ignore the created_at_expression
+ # > finder_query = -> (created_at_expression, id_expression) { Issue.where(Issue.arel_table[:id].eq(id_expression)) }
+ #
+ # > Gitlab::Pagination::Keyset::InOperatorOptimization::QueryBuilder.new(
+ # > scope: scope,
+ # > array_scope: array_scope,
+ # > array_mapping_scope: array_mapping_scope,
+ # > finder_query: finder_query
+ # > ).execute.limit(20)
+ def initialize(scope:, array_scope:, array_mapping_scope:, finder_query:, values: {})
+ @scope, success = Gitlab::Pagination::Keyset::SimpleOrderBuilder.build(scope)
+
+ unless success
+ error_message = <<~MSG
+ The order on the scope does not support keyset pagination. You might need to define a custom Order object.\n
+ See https://docs.gitlab.com/ee/development/database/keyset_pagination.html#complex-order-configuration\n
+ Or the Gitlab::Pagination::Keyset::Order class for examples
+ MSG
+ raise(UnsupportedScopeOrder, error_message)
+ end
+
+ @order = Gitlab::Pagination::Keyset::Order.extract_keyset_order_object(scope)
+ @array_scope = array_scope
+ @array_mapping_scope = array_mapping_scope
+ @finder_query = finder_query
+ @values = values
+ @model = @scope.model
+ @table_name = @model.table_name
+ @arel_table = @model.arel_table
+ end
+
+ def execute
+ selector_cte = Gitlab::SQL::CTE.new(:array_cte, array_scope)
+
+ cte = Gitlab::SQL::RecursiveCTE.new(RECURSIVE_CTE_NAME, union_args: { remove_duplicates: false, remove_order: false })
+ cte << initializer_query
+ cte << data_collector_query
+
+ q = cte
+ .apply_to(model.where({})
+ .with(selector_cte.to_arel))
+ .select(result_collector_final_projections)
+ .where("count <> 0") # filter out the initializer row
+
+ model.from(q.arel.as(table_name))
+ end
+
+ private
+
+ attr_reader :array_scope, :scope, :order, :array_mapping_scope, :finder_query, :values, :model, :table_name, :arel_table
+
+ def initializer_query
+ array_column_names = array_scope_columns.array_aggregated_column_names + order_by_columns.array_aggregated_column_names
+
+ projections = [
+ *result_collector_initializer_columns,
+ *array_column_names,
+ '0::bigint AS count'
+ ]
+
+ model.select(projections).from(build_column_arrays_query).limit(1)
+ end
+
+ # This query finds the first cursor values for each item in the array CTE.
+ #
+ # array_cte:
+ #
+ # |project_id|
+ # |----------|
+ # | 1|
+ # | 2|
+ # | 3|
+ # | 4|
+ #
+ # For each project_id, find the first issues row by respecting the created_at, id order.
+ #
+ # The `array_mapping_scope` parameter defines how the `array_scope` and the `scope` can be combined.
+ #
+ # scope = Issue.where({}) # empty scope
+ # array_mapping_scope = Issue.where(project_id: X)
+ #
+ # scope.merge(array_mapping_scope) # Issue.where(project_id: X)
+ #
+ # X will be replaced with a value from the `array_cte` temporary table.
+ #
+ # |created_at|id|
+ # |----------|--|
+ # |2020-01-15| 2|
+ # |2020-01-07| 3|
+ # |2020-01-07| 4|
+ # |2020-01-10| 5|
+ def build_column_arrays_query
+ q = Arel::SelectManager.new
+ .project(array_scope_columns.array_aggregated_columns + order_by_columns.array_aggregated_columns)
+ .from(array_cte)
+ .join(Arel.sql("LEFT JOIN LATERAL (#{initial_keyset_query.to_sql}) #{table_name} ON TRUE"))
+
+ order_by_columns.each { |column| q.where(column.arel_column.not_eq(nil)) }
+
+ q.as('array_scope_lateral_query')
+ end
+
+ def array_cte
+ Arel::SelectManager.new
+ .project(array_scope_columns.arel_columns)
+ .from(Arel.sql(array_scope_columns.array_scope_cte_name))
+ .as(array_scope_columns.array_scope_cte_name)
+ end
+
+ def initial_keyset_query
+ keyset_scope = scope.merge(array_mapping_scope.call(*array_scope_columns.arel_columns))
+ order
+ .apply_cursor_conditions(keyset_scope, values, use_union_optimization: true)
+ .reselect(*order_by_columns.arel_columns)
+ .limit(1)
+ end
+
+ def data_collector_query
+ array_column_list = array_scope_columns.array_aggregated_column_names
+
+ order_column_value_arrays = order_by_columns.replace_value_in_array_by_position_expressions
+
+ select = [
+ *result_collector_columns,
+ *array_column_list,
+ *order_column_value_arrays,
+ "#{RECURSIVE_CTE_NAME}.count + 1"
+ ]
+
+ from = <<~SQL
+ #{RECURSIVE_CTE_NAME},
+ #{array_order_query.lateral.as('position_query').to_sql},
+ #{ensure_one_row(next_cursor_values_query).lateral.as('next_cursor_values').to_sql}
+ SQL
+
+ model.select(select).from(from)
+ end
+
+ # NULL guard. This method ensures that NULL values are returned when the passed in scope returns 0 rows.
+ # Example query: returns issues.id or NULL
+ #
+ # SELECT issues.id FROM (VALUES (NULL)) nulls (id)
+ # LEFT JOIN (SELECT id FROM issues WHERE id = 1 LIMIT 1) issues ON TRUE
+ # LIMIT 1
+ def ensure_one_row(query)
+ q = Arel::SelectManager.new
+ q.projections = order_by_columns.original_column_names_as_tmp_tamble
+
+ null_values = [nil] * order_by_columns.count
+
+ from = Arel::Nodes::Grouping.new(Arel::Nodes::ValuesList.new([null_values])).as('nulls')
+
+ q.from(from)
+ q.join(Arel.sql("LEFT JOIN (#{query.to_sql}) record ON TRUE"))
+ q.limit = 1
+ q
+ end
+
+ # This subquery finds the cursor values for the next record by sorting the generated cursor arrays in memory and taking the first element.
+ # It combines the cursor arrays (UNNEST) together and sorts them according to the originally defined ORDER BY clause.
+ #
+ # Example: issues in the group hierarchy with ORDER BY created_at, id
+ #
+ # |project_id| |created_at|id| # 2 arrays combined: issues_created_at_array, issues_id_array
+ # |----------| |----------|--|
+ # | 1| |2020-01-15| 2|
+ # | 2| |2020-01-07| 3|
+ # | 3| |2020-01-07| 4|
+ # | 4| |2020-01-10| 5|
+ #
+ # The query will return the cursor values: (2020-01-07, 3) and the array position: 1
+ # From the position, we can tell that the record belongs to the project with id 2.
+ def array_order_query
+ q = Arel::SelectManager.new
+ .project([*order_by_columns.original_column_names_as_arel_string, Arel.sql('position')])
+ .from("UNNEST(#{list(order_by_columns.array_aggregated_column_names)}) WITH ORDINALITY AS u(#{list(order_by_columns.original_column_names)}, position)")
+
+ order_by_columns.each { |column| q.where(Arel.sql(column.original_column_name).not_eq(nil)) } # ignore rows where all columns are NULL
+
+ q.order(Arel.sql(order_by_without_table_references)).take(1)
+ end
+
+ # This subquery finds the next cursor values after the previously determined position (from array_order_query).
+ # The current cursor values are passed in as SQL literals since the actual values are encoded into SQL arrays.
+ #
+ # Example: issues in the group hierarchy with ORDER BY created_at, id
+ #
+ # |project_id| |created_at|id| # 2 arrays combined: issues_created_at_array, issues_id_array
+ # |----------| |----------|--|
+ # | 1| |2020-01-15| 2|
+ # | 2| |2020-01-07| 3|
+ # | 3| |2020-01-07| 4|
+ # | 4| |2020-01-10| 5|
+ #
+ # Assuming that the determined position is 1, the cursor values will be the following:
+ # - Filter: project_id = 2
+ # - created_at = 2020-01-07
+ # - id = 3
+ def next_cursor_values_query
+ cursor_values = order_by_columns.cursor_values(RECURSIVE_CTE_NAME)
+ array_mapping_scope_columns = array_scope_columns.array_lookup_expressions_by_position(RECURSIVE_CTE_NAME)
+
+ keyset_scope = scope
+ .reselect(*order_by_columns.arel_columns)
+ .merge(array_mapping_scope.call(*array_mapping_scope_columns))
+
+ order
+ .apply_cursor_conditions(keyset_scope, cursor_values, use_union_optimization: true)
+ .reselect(*order_by_columns.arel_columns)
+ .limit(1)
+ end
+
+ # Generates an ORDER BY clause by using the column position index and the original order clauses.
+ # This method is used to sort the collected arrays in SQL.
+ # Example: "issues".created_at DESC , "issues".id ASC => 1 DESC, 2 ASC
+ def order_by_without_table_references
+ order.column_definitions.each_with_index.map do |column_definition, i|
+ "#{i + 1} #{column_definition.order_direction_as_sql_string}"
+ end.join(", ")
+ end
+
+ def result_collector_initializer_columns
+ ["NULL::#{table_name} AS #{RECORDS_COLUMN}"]
+ end
+
+ def result_collector_columns
+ query = finder_query
+ .call(*order_by_columns.array_lookup_expressions_by_position(RECURSIVE_CTE_NAME))
+ .select("#{table_name}")
+ .limit(1)
+
+ ["(#{query.to_sql})"]
+ end
+
+ def result_collector_final_projections
+ ["(#{RECORDS_COLUMN}).*"]
+ end
+
+ def array_scope_columns
+ @array_scope_columns ||= ArrayScopeColumns.new(array_scope.select_values)
+ end
+
+ def order_by_columns
+ @order_by_columns ||= OrderByColumns.new(order.column_definitions, arel_table)
+ end
+
+ def list(array)
+ array.join(', ')
+ end
+ end
+ # rubocop: enable CodeReuse/ActiveRecord
+ end
+ end
+ end
+end