diff options
Diffstat (limited to 'lib/gitlab/database/median.rb')
-rw-r--r-- | lib/gitlab/database/median.rb | 140 |
1 files changed, 100 insertions, 40 deletions
diff --git a/lib/gitlab/database/median.rb b/lib/gitlab/database/median.rb index de2cb040ad6..2f9dd914674 100644 --- a/lib/gitlab/database/median.rb +++ b/lib/gitlab/database/median.rb @@ -2,32 +2,34 @@ module Gitlab module Database module Median + def median_datetime(arel_table, query_so_far, column_sym) + extract_median(execute_queries(arel_table, query_so_far, column_sym)).presence + end + def median_datetimes(arel_table, query_so_far, column_sym, partition_column) - median_queries = - if Gitlab::Database.postgresql? - pg_median_datetime_sql(arel_table, query_so_far, column_sym, partition_column) - elsif Gitlab::Database.mysql? - mysql_median_datetime_sql(arel_table, query_so_far, column_sym) - end - - results = Array.wrap(median_queries).map do |query| - ActiveRecord::Base.connection.execute(query) - end - extract_medians(results).presence + extract_medians(execute_queries(arel_table, query_so_far, column_sym, partition_column)).presence end - def extract_medians(results) + def extract_median(results) result = results.compact.first if Gitlab::Database.postgresql? - result.values.map do |id, median| - [id.to_i, median&.to_f] - end.to_h + result = result.first.presence + + result['median']&.to_f if result elsif Gitlab::Database.mysql? result.to_a.flatten.first end end + def extract_medians(results) + return {} if Gitlab::Database.mysql? + + results.compact.first.values.map do |id, median| + [id.to_i, median&.to_f] + end.to_h + end + def mysql_median_datetime_sql(arel_table, query_so_far, column_sym) query = arel_table .from(arel_table.project(Arel.sql('*')).order(arel_table[column_sym]).as(arel_table.table_name)) @@ -53,7 +55,7 @@ module Gitlab ] end - def pg_median_datetime_sql(arel_table, query_so_far, column_sym, partition_column) + def pg_median_datetime_sql(arel_table, query_so_far, column_sym, partition_column = nil) # Create a CTE with the column we're operating on, row number (after sorting by the column # we're operating on), and count of the table we're operating on (duplicated across) all rows # of the CTE. For example, if we're looking to find the median of the `projects.star_count` @@ -64,49 +66,107 @@ module Gitlab # 5 | 1 | 3 # 9 | 2 | 3 # 15 | 3 | 3 + # + # If a partition column is used we will do the same operation but for separate partitions, + # when that happens the CTE might look like this: + # + # project_id | star_count | row_id | ct + # ------------+------------+--------+---- + # 1 | 5 | 1 | 2 + # 1 | 9 | 2 | 2 + # 2 | 10 | 1 | 3 + # 2 | 15 | 2 | 3 + # 2 | 20 | 3 | 3 cte_table = Arel::Table.new("ordered_records") + cte = Arel::Nodes::As.new( cte_table, - arel_table - .project( - arel_table[partition_column], - arel_table[column_sym].as(column_sym.to_s), - Arel::Nodes::Over.new(Arel::Nodes::NamedFunction.new("rank", []), - Arel::Nodes::Window.new.partition(arel_table[partition_column]) - .order(arel_table[column_sym])).as('row_id'), - arel_table.from(arel_table.alias) - .project("COUNT(*)") - .where(arel_table[partition_column].eq(arel_table.alias[partition_column])).as('ct')). + arel_table.project(*rank_rows(arel_table, column_sym, partition_column)). # Disallow negative values where(arel_table[column_sym].gteq(zero_interval))) # From the CTE, select either the middle row or the middle two rows (this is accomplished # by 'where cte.row_id between cte.ct / 2.0 AND cte.ct / 2.0 + 1'). Find the average of the # selected rows, and this is the median value. - cte_table - .project(cte_table[partition_column]) - .project(average([extract_epoch(cte_table[column_sym])], "median")) - .where( - Arel::Nodes::Between.new( - cte_table[:row_id], - Arel::Nodes::And.new( - [(cte_table[:ct] / Arel.sql('2.0')), - (cte_table[:ct] / Arel.sql('2.0') + 1)] + result = + cte_table + .project(*median_projections(cte_table, column_sym, partition_column)) + .where( + Arel::Nodes::Between.new( + cte_table[:row_id], + Arel::Nodes::And.new( + [(cte_table[:ct] / Arel.sql('2.0')), + (cte_table[:ct] / Arel.sql('2.0') + 1)] + ) ) ) - ) - .with(query_so_far, cte) - .group(cte_table[partition_column]) - .order(cte_table[partition_column]) - .to_sql + .with(query_so_far, cte) + + result.group(cte_table[partition_column]).order(cte_table[partition_column]) if partition_column + + result.to_sql end private + def median_queries(arel_table, query_so_far, column_sym, partition_column = nil) + if Gitlab::Database.postgresql? + pg_median_datetime_sql(arel_table, query_so_far, column_sym, partition_column) + elsif Gitlab::Database.mysql? + mysql_median_datetime_sql(arel_table, query_so_far, column_sym) + end + end + + def execute_queries(arel_table, query_so_far, column_sym, partition_column = nil) + queries = median_queries(arel_table, query_so_far, column_sym, partition_column) + + Array.wrap(queries).map { |query| ActiveRecord::Base.connection.execute(query) } + end + def average(args, as) Arel::Nodes::NamedFunction.new("AVG", args, as) end + def rank_rows(arel_table, column_sym, partition_column) + column_row = arel_table[column_sym].as(column_sym.to_s) + + if partition_column + partition_row = arel_table[partition_column] + row_id = + Arel::Nodes::Over.new( + Arel::Nodes::NamedFunction.new('rank', []), + Arel::Nodes::Window.new.partition(arel_table[partition_column]) + .order(arel_table[column_sym]) + ).as('row_id') + + count = arel_table.from(arel_table.alias) + .project('COUNT(*)') + .where(arel_table[partition_column].eq(arel_table.alias[partition_column])) + .as('ct') + + [partition_row, column_row, row_id, count] + else + row_id = + Arel::Nodes::Over.new( + Arel::Nodes::NamedFunction.new('row_number', []), + Arel::Nodes::Window.new.order(arel_table[column_sym]) + ).as('row_id') + + count = arel_table.project("COUNT(1)").as('ct') + + [column_row, row_id, count] + end + end + + def median_projections(table, column_sym, partition_column) + if partition_column + [table[partition_column], + average([extract_epoch(table[column_sym])], "median")] + else + [average([extract_epoch(table[column_sym])], "median")] + end + end + def extract_epoch(arel_attribute) Arel.sql(%Q{EXTRACT(EPOCH FROM "#{arel_attribute.relation.name}"."#{arel_attribute.name}")}) end |