diff options
author | Tiago Botelho <tiagonbotelho@hotmail.com> | 2018-02-15 13:23:39 +0000 |
---|---|---|
committer | Tiago Botelho <tiagonbotelho@hotmail.com> | 2018-02-28 10:46:20 +0000 |
commit | 335ee79a73fafdf00fac6e8ffc286ce4bad273ff (patch) | |
tree | d30033fd007c992448ce02f42e46f9da48e223cb /lib | |
parent | 3f31da9c69c550d1698a1376e37d36f4e6e309b5 (diff) | |
download | gitlab-ce-335ee79a73fafdf00fac6e8ffc286ce4bad273ff.tar.gz |
Refactors median code to work with both single and multiple projects
Diffstat (limited to 'lib')
-rw-r--r-- | lib/gitlab/cycle_analytics/base_query.rb | 4 | ||||
-rw-r--r-- | lib/gitlab/cycle_analytics/base_stage.rb | 8 | ||||
-rw-r--r-- | lib/gitlab/cycle_analytics/usage_data.rb | 48 | ||||
-rw-r--r-- | lib/gitlab/database/median.rb | 140 | ||||
-rw-r--r-- | lib/gitlab/usage_data.rb | 7 |
5 files changed, 135 insertions, 72 deletions
diff --git a/lib/gitlab/cycle_analytics/base_query.rb b/lib/gitlab/cycle_analytics/base_query.rb index b9075047487..def9885789e 100644 --- a/lib/gitlab/cycle_analytics/base_query.rb +++ b/lib/gitlab/cycle_analytics/base_query.rb @@ -8,14 +8,14 @@ module Gitlab private def base_query - @base_query ||= stage_query([@project.id]) # rubocop:disable Gitlab/ModuleWithInstanceVariables + @base_query ||= stage_query(@project.id) # rubocop:disable Gitlab/ModuleWithInstanceVariables end def stage_query(project_ids) query = mr_closing_issues_table.join(issue_table).on(issue_table[:id].eq(mr_closing_issues_table[:issue_id])) .join(issue_metrics_table).on(issue_table[:id].eq(issue_metrics_table[:issue_id])) .project(issue_table[:project_id].as("project_id")) - .where(issue_table[:project_id].in(project_ids)) + .where(issue_table[:project_id].in(Array(project_ids))) .where(issue_table[:created_at].gteq(@options[:from])) # rubocop:disable Gitlab/ModuleWithInstanceVariables # Load merge_requests diff --git a/lib/gitlab/cycle_analytics/base_stage.rb b/lib/gitlab/cycle_analytics/base_stage.rb index 45386fee622..5c3fd901f79 100644 --- a/lib/gitlab/cycle_analytics/base_stage.rb +++ b/lib/gitlab/cycle_analytics/base_stage.rb @@ -31,8 +31,12 @@ module Gitlab interval_query = Arel::Nodes::As.new(cte_table, subtract_datetimes(stage_query(project_ids), start_time_attrs, end_time_attrs, name.to_s)) - median_datetimes(cte_table, interval_query, name, :project_id)&.each do |project_id, median| - loader.call(project_id, median) + if project_ids.size == 1 + loader.call(@project.id, median_datetime(cte_table, interval_query, name)) + else + median_datetimes(cte_table, interval_query, name, :project_id)&.each do |project_id, median| + loader.call(project_id, median) + end end end end diff --git a/lib/gitlab/cycle_analytics/usage_data.rb b/lib/gitlab/cycle_analytics/usage_data.rb index 0aefd434e14..5122e3417ca 100644 --- a/lib/gitlab/cycle_analytics/usage_data.rb +++ b/lib/gitlab/cycle_analytics/usage_data.rb @@ -5,27 +5,21 @@ module Gitlab attr_reader :projects, :options - def initialize(projects, options) - @projects = projects - @options = options + def initialize + @projects = Project.sorted_by_activity.limit(PROJECTS_LIMIT) + @options = { from: 7.days.ago } end def to_json total = 0 - values = {} - medians_per_stage.each do |stage_name, medians| - medians = medians.map(&:presence).compact + values = + medians_per_stage.each_with_object({}) do |(stage_name, medians), hsh| + calculations = stage_values(medians) - stage_values = { - average: calc_average(medians), - sd: standard_deviation(medians), - missing: projects.length - medians.length - } - - total += stage_values.values.compact.sum - values[stage_name] = stage_values - end + total += calculations.values.compact.sum + hsh[stage_name] = calculations + end values[:total] = total @@ -43,26 +37,36 @@ module Gitlab end end + def stage_values(medians) + medians = medians.map(&:presence).compact + average = calc_average(medians) + + { + average: average, + sd: standard_deviation(medians, average), + missing: projects.length - medians.length + } + end + def calc_average(values) return if values.empty? (values.sum / values.length).to_i end - def sample_variance(values) + def standard_deviation(values, average) + Math.sqrt(sample_variance(values, average)).to_i + end + + def sample_variance(values, average) return 0 if values.length <= 1 - avg = calc_average(values) sum = values.inject(0) do |acc, val| - acc + (val - avg)**2 + acc + (val - average)**2 end sum / (values.length - 1) end - - def standard_deviation(values) - Math.sqrt(sample_variance(values)).to_i - end end end end diff --git a/lib/gitlab/database/median.rb b/lib/gitlab/database/median.rb index de2cb040ad6..2f9dd914674 100644 --- a/lib/gitlab/database/median.rb +++ b/lib/gitlab/database/median.rb @@ -2,32 +2,34 @@ module Gitlab module Database module Median + def median_datetime(arel_table, query_so_far, column_sym) + extract_median(execute_queries(arel_table, query_so_far, column_sym)).presence + end + def median_datetimes(arel_table, query_so_far, column_sym, partition_column) - median_queries = - if Gitlab::Database.postgresql? - pg_median_datetime_sql(arel_table, query_so_far, column_sym, partition_column) - elsif Gitlab::Database.mysql? - mysql_median_datetime_sql(arel_table, query_so_far, column_sym) - end - - results = Array.wrap(median_queries).map do |query| - ActiveRecord::Base.connection.execute(query) - end - extract_medians(results).presence + extract_medians(execute_queries(arel_table, query_so_far, column_sym, partition_column)).presence end - def extract_medians(results) + def extract_median(results) result = results.compact.first if Gitlab::Database.postgresql? - result.values.map do |id, median| - [id.to_i, median&.to_f] - end.to_h + result = result.first.presence + + result['median']&.to_f if result elsif Gitlab::Database.mysql? result.to_a.flatten.first end end + def extract_medians(results) + return {} if Gitlab::Database.mysql? + + results.compact.first.values.map do |id, median| + [id.to_i, median&.to_f] + end.to_h + end + def mysql_median_datetime_sql(arel_table, query_so_far, column_sym) query = arel_table .from(arel_table.project(Arel.sql('*')).order(arel_table[column_sym]).as(arel_table.table_name)) @@ -53,7 +55,7 @@ module Gitlab ] end - def pg_median_datetime_sql(arel_table, query_so_far, column_sym, partition_column) + def pg_median_datetime_sql(arel_table, query_so_far, column_sym, partition_column = nil) # Create a CTE with the column we're operating on, row number (after sorting by the column # we're operating on), and count of the table we're operating on (duplicated across) all rows # of the CTE. For example, if we're looking to find the median of the `projects.star_count` @@ -64,49 +66,107 @@ module Gitlab # 5 | 1 | 3 # 9 | 2 | 3 # 15 | 3 | 3 + # + # If a partition column is used we will do the same operation but for separate partitions, + # when that happens the CTE might look like this: + # + # project_id | star_count | row_id | ct + # ------------+------------+--------+---- + # 1 | 5 | 1 | 2 + # 1 | 9 | 2 | 2 + # 2 | 10 | 1 | 3 + # 2 | 15 | 2 | 3 + # 2 | 20 | 3 | 3 cte_table = Arel::Table.new("ordered_records") + cte = Arel::Nodes::As.new( cte_table, - arel_table - .project( - arel_table[partition_column], - arel_table[column_sym].as(column_sym.to_s), - Arel::Nodes::Over.new(Arel::Nodes::NamedFunction.new("rank", []), - Arel::Nodes::Window.new.partition(arel_table[partition_column]) - .order(arel_table[column_sym])).as('row_id'), - arel_table.from(arel_table.alias) - .project("COUNT(*)") - .where(arel_table[partition_column].eq(arel_table.alias[partition_column])).as('ct')). + arel_table.project(*rank_rows(arel_table, column_sym, partition_column)). # Disallow negative values where(arel_table[column_sym].gteq(zero_interval))) # From the CTE, select either the middle row or the middle two rows (this is accomplished # by 'where cte.row_id between cte.ct / 2.0 AND cte.ct / 2.0 + 1'). Find the average of the # selected rows, and this is the median value. - cte_table - .project(cte_table[partition_column]) - .project(average([extract_epoch(cte_table[column_sym])], "median")) - .where( - Arel::Nodes::Between.new( - cte_table[:row_id], - Arel::Nodes::And.new( - [(cte_table[:ct] / Arel.sql('2.0')), - (cte_table[:ct] / Arel.sql('2.0') + 1)] + result = + cte_table + .project(*median_projections(cte_table, column_sym, partition_column)) + .where( + Arel::Nodes::Between.new( + cte_table[:row_id], + Arel::Nodes::And.new( + [(cte_table[:ct] / Arel.sql('2.0')), + (cte_table[:ct] / Arel.sql('2.0') + 1)] + ) ) ) - ) - .with(query_so_far, cte) - .group(cte_table[partition_column]) - .order(cte_table[partition_column]) - .to_sql + .with(query_so_far, cte) + + result.group(cte_table[partition_column]).order(cte_table[partition_column]) if partition_column + + result.to_sql end private + def median_queries(arel_table, query_so_far, column_sym, partition_column = nil) + if Gitlab::Database.postgresql? + pg_median_datetime_sql(arel_table, query_so_far, column_sym, partition_column) + elsif Gitlab::Database.mysql? + mysql_median_datetime_sql(arel_table, query_so_far, column_sym) + end + end + + def execute_queries(arel_table, query_so_far, column_sym, partition_column = nil) + queries = median_queries(arel_table, query_so_far, column_sym, partition_column) + + Array.wrap(queries).map { |query| ActiveRecord::Base.connection.execute(query) } + end + def average(args, as) Arel::Nodes::NamedFunction.new("AVG", args, as) end + def rank_rows(arel_table, column_sym, partition_column) + column_row = arel_table[column_sym].as(column_sym.to_s) + + if partition_column + partition_row = arel_table[partition_column] + row_id = + Arel::Nodes::Over.new( + Arel::Nodes::NamedFunction.new('rank', []), + Arel::Nodes::Window.new.partition(arel_table[partition_column]) + .order(arel_table[column_sym]) + ).as('row_id') + + count = arel_table.from(arel_table.alias) + .project('COUNT(*)') + .where(arel_table[partition_column].eq(arel_table.alias[partition_column])) + .as('ct') + + [partition_row, column_row, row_id, count] + else + row_id = + Arel::Nodes::Over.new( + Arel::Nodes::NamedFunction.new('row_number', []), + Arel::Nodes::Window.new.order(arel_table[column_sym]) + ).as('row_id') + + count = arel_table.project("COUNT(1)").as('ct') + + [column_row, row_id, count] + end + end + + def median_projections(table, column_sym, partition_column) + if partition_column + [table[partition_column], + average([extract_epoch(table[column_sym])], "median")] + else + [average([extract_epoch(table[column_sym])], "median")] + end + end + def extract_epoch(arel_attribute) Arel.sql(%Q{EXTRACT(EPOCH FROM "#{arel_attribute.relation.name}"."#{arel_attribute.name}")}) end diff --git a/lib/gitlab/usage_data.rb b/lib/gitlab/usage_data.rb index dae0e20e156..37d3512990e 100644 --- a/lib/gitlab/usage_data.rb +++ b/lib/gitlab/usage_data.rb @@ -73,12 +73,7 @@ module Gitlab end def cycle_analytics_usage_data - # We only want to generate this data for instances that use PostgreSQL - return {} if Gitlab::Database.mysql? - - projects = Project.sorted_by_activity.limit(Gitlab::CycleAnalytics::UsageData::PROJECTS_LIMIT) - - Gitlab::CycleAnalytics::UsageData.new(projects, { from: 7.days.ago }).to_json + Gitlab::CycleAnalytics::UsageData.new.to_json end def features_usage_data |