diff options
Diffstat (limited to 'lib/gitlab/pagination/keyset/in_operator_optimization')
4 files changed, 464 insertions, 0 deletions
diff --git a/lib/gitlab/pagination/keyset/in_operator_optimization/array_scope_columns.rb b/lib/gitlab/pagination/keyset/in_operator_optimization/array_scope_columns.rb new file mode 100644 index 00000000000..95afd5a8595 --- /dev/null +++ b/lib/gitlab/pagination/keyset/in_operator_optimization/array_scope_columns.rb @@ -0,0 +1,59 @@ +# frozen_string_literal: true + +module Gitlab + module Pagination + module Keyset + module InOperatorOptimization + class ArrayScopeColumns + ARRAY_SCOPE_CTE_NAME = 'array_cte' + + def initialize(columns) + validate_columns!(columns) + + array_scope_table = Arel::Table.new(ARRAY_SCOPE_CTE_NAME) + @columns = columns.map do |column| + ColumnData.new(column, "array_scope_#{column}", array_scope_table) + end + end + + def array_scope_cte_name + ARRAY_SCOPE_CTE_NAME + end + + def array_aggregated_columns + columns.map(&:array_aggregated_column) + end + + def array_aggregated_column_names + columns.map(&:array_aggregated_column_name) + end + + def arel_columns + columns.map(&:arel_column) + end + + def array_lookup_expressions_by_position(table_name) + columns.map do |column| + Arel.sql("#{table_name}.#{column.array_aggregated_column_name}[position]") + end + end + + private + + attr_reader :columns + + def validate_columns!(columns) + if columns.blank? + msg = <<~MSG + No array columns were given. + Make sure you explicitly select the columns in the array_scope parameter. + Example: Project.select(:id) + MSG + raise StandardError, msg + end + end + end + end + end + end +end diff --git a/lib/gitlab/pagination/keyset/in_operator_optimization/column_data.rb b/lib/gitlab/pagination/keyset/in_operator_optimization/column_data.rb new file mode 100644 index 00000000000..3f620f74eca --- /dev/null +++ b/lib/gitlab/pagination/keyset/in_operator_optimization/column_data.rb @@ -0,0 +1,39 @@ +# frozen_string_literal: true + +module Gitlab + module Pagination + module Keyset + module InOperatorOptimization + class ColumnData + attr_reader :original_column_name, :as, :arel_table + + def initialize(original_column_name, as, arel_table) + @original_column_name = original_column_name.to_s + @as = as.to_s + @arel_table = arel_table + end + + def projection + arel_column.as(as) + end + + def arel_column + arel_table[original_column_name] + end + + def arel_column_as + arel_table[as] + end + + def array_aggregated_column_name + "#{arel_table.name}_#{original_column_name}_array" + end + + def array_aggregated_column + Arel::Nodes::NamedFunction.new('ARRAY_AGG', [arel_column]).as(array_aggregated_column_name) + end + end + end + end + end +end diff --git a/lib/gitlab/pagination/keyset/in_operator_optimization/order_by_columns.rb b/lib/gitlab/pagination/keyset/in_operator_optimization/order_by_columns.rb new file mode 100644 index 00000000000..d8c69a74e6b --- /dev/null +++ b/lib/gitlab/pagination/keyset/in_operator_optimization/order_by_columns.rb @@ -0,0 +1,76 @@ +# frozen_string_literal: true + +module Gitlab + module Pagination + module Keyset + module InOperatorOptimization + class OrderByColumns + include Enumerable + + # This class exposes collection methods for the order by columns + # + # Example: by modelling the `issues.created_at ASC, issues.id ASC` ORDER BY + # SQL clause, this class will receive two ColumnOrderDefinition objects + def initialize(columns, arel_table) + @columns = columns.map do |column| + ColumnData.new(column.attribute_name, "order_by_columns_#{column.attribute_name}", arel_table) + end + end + + def arel_columns + columns.map(&:arel_column) + end + + def array_aggregated_columns + columns.map(&:array_aggregated_column) + end + + def array_aggregated_column_names + columns.map(&:array_aggregated_column_name) + end + + def original_column_names + columns.map(&:original_column_name) + end + + def original_column_names_as_arel_string + columns.map { |c| Arel.sql(c.original_column_name) } + end + + def original_column_names_as_tmp_tamble + temp_table = Arel::Table.new('record') + original_column_names.map { |c| temp_table[c] } + end + + def cursor_values(table_name) + columns.each_with_object({}) do |column, hash| + hash[column.original_column_name] = Arel.sql("#{table_name}.#{column.array_aggregated_column_name}[position]") + end + end + + def array_lookup_expressions_by_position(table_name) + columns.map do |column| + Arel.sql("#{table_name}.#{column.array_aggregated_column_name}[position]") + end + end + + def replace_value_in_array_by_position_expressions + columns.map do |column| + name = "#{QueryBuilder::RECURSIVE_CTE_NAME}.#{column.array_aggregated_column_name}" + new_value = "next_cursor_values.#{column.original_column_name}" + "#{name}[:position_query.position-1]||#{new_value}||#{name}[position_query.position+1:]" + end + end + + def each(&block) + columns.each(&block) + end + + private + + attr_reader :columns + end + end + end + end +end diff --git a/lib/gitlab/pagination/keyset/in_operator_optimization/query_builder.rb b/lib/gitlab/pagination/keyset/in_operator_optimization/query_builder.rb new file mode 100644 index 00000000000..39d6e016ac7 --- /dev/null +++ b/lib/gitlab/pagination/keyset/in_operator_optimization/query_builder.rb @@ -0,0 +1,290 @@ +# frozen_string_literal: true + +module Gitlab + module Pagination + module Keyset + module InOperatorOptimization + # rubocop: disable CodeReuse/ActiveRecord + class QueryBuilder + UnsupportedScopeOrder = Class.new(StandardError) + + RECURSIVE_CTE_NAME = 'recursive_keyset_cte' + RECORDS_COLUMN = 'records' + + # This class optimizes slow database queries (PostgreSQL specific) where the + # IN SQL operator is used with sorting. + # + # Arguments: + # scope - ActiveRecord::Relation supporting keyset pagination + # array_scope - ActiveRecord::Relation for the `IN` subselect + # array_mapping_scope - Lambda for connecting scope with array_scope + # finder_query - ActiveRecord::Relation for finding one row by the passed in cursor values + # values - keyset cursor values (optional) + # + # Example ActiveRecord query: Issues in the namespace hierarchy + # > scope = Issue + # > .where(project_id: Group.find(9970).all_projects.select(:id)) + # > .order(:created_at, :id) + # > .limit(20); + # + # Optimized version: + # + # > scope = Issue.where({}).order(:created_at, :id) # base scope + # > array_scope = Group.find(9970).all_projects.select(:id) + # > array_mapping_scope = -> (id_expression) { Issue.where(Issue.arel_table[:project_id].eq(id_expression)) } + # + # # finding the record by id is good enough, we can ignore the created_at_expression + # > finder_query = -> (created_at_expression, id_expression) { Issue.where(Issue.arel_table[:id].eq(id_expression)) } + # + # > Gitlab::Pagination::Keyset::InOperatorOptimization::QueryBuilder.new( + # > scope: scope, + # > array_scope: array_scope, + # > array_mapping_scope: array_mapping_scope, + # > finder_query: finder_query + # > ).execute.limit(20) + def initialize(scope:, array_scope:, array_mapping_scope:, finder_query:, values: {}) + @scope, success = Gitlab::Pagination::Keyset::SimpleOrderBuilder.build(scope) + + unless success + error_message = <<~MSG + The order on the scope does not support keyset pagination. You might need to define a custom Order object.\n + See https://docs.gitlab.com/ee/development/database/keyset_pagination.html#complex-order-configuration\n + Or the Gitlab::Pagination::Keyset::Order class for examples + MSG + raise(UnsupportedScopeOrder, error_message) + end + + @order = Gitlab::Pagination::Keyset::Order.extract_keyset_order_object(scope) + @array_scope = array_scope + @array_mapping_scope = array_mapping_scope + @finder_query = finder_query + @values = values + @model = @scope.model + @table_name = @model.table_name + @arel_table = @model.arel_table + end + + def execute + selector_cte = Gitlab::SQL::CTE.new(:array_cte, array_scope) + + cte = Gitlab::SQL::RecursiveCTE.new(RECURSIVE_CTE_NAME, union_args: { remove_duplicates: false, remove_order: false }) + cte << initializer_query + cte << data_collector_query + + q = cte + .apply_to(model.where({}) + .with(selector_cte.to_arel)) + .select(result_collector_final_projections) + .where("count <> 0") # filter out the initializer row + + model.from(q.arel.as(table_name)) + end + + private + + attr_reader :array_scope, :scope, :order, :array_mapping_scope, :finder_query, :values, :model, :table_name, :arel_table + + def initializer_query + array_column_names = array_scope_columns.array_aggregated_column_names + order_by_columns.array_aggregated_column_names + + projections = [ + *result_collector_initializer_columns, + *array_column_names, + '0::bigint AS count' + ] + + model.select(projections).from(build_column_arrays_query).limit(1) + end + + # This query finds the first cursor values for each item in the array CTE. + # + # array_cte: + # + # |project_id| + # |----------| + # | 1| + # | 2| + # | 3| + # | 4| + # + # For each project_id, find the first issues row by respecting the created_at, id order. + # + # The `array_mapping_scope` parameter defines how the `array_scope` and the `scope` can be combined. + # + # scope = Issue.where({}) # empty scope + # array_mapping_scope = Issue.where(project_id: X) + # + # scope.merge(array_mapping_scope) # Issue.where(project_id: X) + # + # X will be replaced with a value from the `array_cte` temporary table. + # + # |created_at|id| + # |----------|--| + # |2020-01-15| 2| + # |2020-01-07| 3| + # |2020-01-07| 4| + # |2020-01-10| 5| + def build_column_arrays_query + q = Arel::SelectManager.new + .project(array_scope_columns.array_aggregated_columns + order_by_columns.array_aggregated_columns) + .from(array_cte) + .join(Arel.sql("LEFT JOIN LATERAL (#{initial_keyset_query.to_sql}) #{table_name} ON TRUE")) + + order_by_columns.each { |column| q.where(column.arel_column.not_eq(nil)) } + + q.as('array_scope_lateral_query') + end + + def array_cte + Arel::SelectManager.new + .project(array_scope_columns.arel_columns) + .from(Arel.sql(array_scope_columns.array_scope_cte_name)) + .as(array_scope_columns.array_scope_cte_name) + end + + def initial_keyset_query + keyset_scope = scope.merge(array_mapping_scope.call(*array_scope_columns.arel_columns)) + order + .apply_cursor_conditions(keyset_scope, values, use_union_optimization: true) + .reselect(*order_by_columns.arel_columns) + .limit(1) + end + + def data_collector_query + array_column_list = array_scope_columns.array_aggregated_column_names + + order_column_value_arrays = order_by_columns.replace_value_in_array_by_position_expressions + + select = [ + *result_collector_columns, + *array_column_list, + *order_column_value_arrays, + "#{RECURSIVE_CTE_NAME}.count + 1" + ] + + from = <<~SQL + #{RECURSIVE_CTE_NAME}, + #{array_order_query.lateral.as('position_query').to_sql}, + #{ensure_one_row(next_cursor_values_query).lateral.as('next_cursor_values').to_sql} + SQL + + model.select(select).from(from) + end + + # NULL guard. This method ensures that NULL values are returned when the passed in scope returns 0 rows. + # Example query: returns issues.id or NULL + # + # SELECT issues.id FROM (VALUES (NULL)) nulls (id) + # LEFT JOIN (SELECT id FROM issues WHERE id = 1 LIMIT 1) issues ON TRUE + # LIMIT 1 + def ensure_one_row(query) + q = Arel::SelectManager.new + q.projections = order_by_columns.original_column_names_as_tmp_tamble + + null_values = [nil] * order_by_columns.count + + from = Arel::Nodes::Grouping.new(Arel::Nodes::ValuesList.new([null_values])).as('nulls') + + q.from(from) + q.join(Arel.sql("LEFT JOIN (#{query.to_sql}) record ON TRUE")) + q.limit = 1 + q + end + + # This subquery finds the cursor values for the next record by sorting the generated cursor arrays in memory and taking the first element. + # It combines the cursor arrays (UNNEST) together and sorts them according to the originally defined ORDER BY clause. + # + # Example: issues in the group hierarchy with ORDER BY created_at, id + # + # |project_id| |created_at|id| # 2 arrays combined: issues_created_at_array, issues_id_array + # |----------| |----------|--| + # | 1| |2020-01-15| 2| + # | 2| |2020-01-07| 3| + # | 3| |2020-01-07| 4| + # | 4| |2020-01-10| 5| + # + # The query will return the cursor values: (2020-01-07, 3) and the array position: 1 + # From the position, we can tell that the record belongs to the project with id 2. + def array_order_query + q = Arel::SelectManager.new + .project([*order_by_columns.original_column_names_as_arel_string, Arel.sql('position')]) + .from("UNNEST(#{list(order_by_columns.array_aggregated_column_names)}) WITH ORDINALITY AS u(#{list(order_by_columns.original_column_names)}, position)") + + order_by_columns.each { |column| q.where(Arel.sql(column.original_column_name).not_eq(nil)) } # ignore rows where all columns are NULL + + q.order(Arel.sql(order_by_without_table_references)).take(1) + end + + # This subquery finds the next cursor values after the previously determined position (from array_order_query). + # The current cursor values are passed in as SQL literals since the actual values are encoded into SQL arrays. + # + # Example: issues in the group hierarchy with ORDER BY created_at, id + # + # |project_id| |created_at|id| # 2 arrays combined: issues_created_at_array, issues_id_array + # |----------| |----------|--| + # | 1| |2020-01-15| 2| + # | 2| |2020-01-07| 3| + # | 3| |2020-01-07| 4| + # | 4| |2020-01-10| 5| + # + # Assuming that the determined position is 1, the cursor values will be the following: + # - Filter: project_id = 2 + # - created_at = 2020-01-07 + # - id = 3 + def next_cursor_values_query + cursor_values = order_by_columns.cursor_values(RECURSIVE_CTE_NAME) + array_mapping_scope_columns = array_scope_columns.array_lookup_expressions_by_position(RECURSIVE_CTE_NAME) + + keyset_scope = scope + .reselect(*order_by_columns.arel_columns) + .merge(array_mapping_scope.call(*array_mapping_scope_columns)) + + order + .apply_cursor_conditions(keyset_scope, cursor_values, use_union_optimization: true) + .reselect(*order_by_columns.arel_columns) + .limit(1) + end + + # Generates an ORDER BY clause by using the column position index and the original order clauses. + # This method is used to sort the collected arrays in SQL. + # Example: "issues".created_at DESC , "issues".id ASC => 1 DESC, 2 ASC + def order_by_without_table_references + order.column_definitions.each_with_index.map do |column_definition, i| + "#{i + 1} #{column_definition.order_direction_as_sql_string}" + end.join(", ") + end + + def result_collector_initializer_columns + ["NULL::#{table_name} AS #{RECORDS_COLUMN}"] + end + + def result_collector_columns + query = finder_query + .call(*order_by_columns.array_lookup_expressions_by_position(RECURSIVE_CTE_NAME)) + .select("#{table_name}") + .limit(1) + + ["(#{query.to_sql})"] + end + + def result_collector_final_projections + ["(#{RECORDS_COLUMN}).*"] + end + + def array_scope_columns + @array_scope_columns ||= ArrayScopeColumns.new(array_scope.select_values) + end + + def order_by_columns + @order_by_columns ||= OrderByColumns.new(order.column_definitions, arel_table) + end + + def list(array) + array.join(', ') + end + end + # rubocop: enable CodeReuse/ActiveRecord + end + end + end +end |