diff options
Diffstat (limited to 'lib/gitlab/usage_data_counters/hll_redis_counter.rb')
-rw-r--r-- | lib/gitlab/usage_data_counters/hll_redis_counter.rb | 244 |
1 files changed, 201 insertions, 43 deletions
diff --git a/lib/gitlab/usage_data_counters/hll_redis_counter.rb b/lib/gitlab/usage_data_counters/hll_redis_counter.rb index eb132ef0967..573ad1dce35 100644 --- a/lib/gitlab/usage_data_counters/hll_redis_counter.rb +++ b/lib/gitlab/usage_data_counters/hll_redis_counter.rb @@ -5,18 +5,28 @@ module Gitlab module HLLRedisCounter DEFAULT_WEEKLY_KEY_EXPIRY_LENGTH = 6.weeks DEFAULT_DAILY_KEY_EXPIRY_LENGTH = 29.days - DEFAULT_REDIS_SLOT = ''.freeze - - UnknownEvent = Class.new(StandardError) - UnknownAggregation = Class.new(StandardError) - - KNOWN_EVENTS_PATH = 'lib/gitlab/usage_data_counters/known_events.yml'.freeze + DEFAULT_REDIS_SLOT = '' + + EventError = Class.new(StandardError) + UnknownEvent = Class.new(EventError) + UnknownAggregation = Class.new(EventError) + AggregationMismatch = Class.new(EventError) + SlotMismatch = Class.new(EventError) + CategoryMismatch = Class.new(EventError) + UnknownAggregationOperator = Class.new(EventError) + InvalidContext = Class.new(EventError) + + KNOWN_EVENTS_PATH = File.expand_path('known_events/*.yml', __dir__) ALLOWED_AGGREGATIONS = %i(daily weekly).freeze + UNION_OF_AGGREGATED_METRICS = 'OR' + INTERSECTION_OF_AGGREGATED_METRICS = 'AND' + ALLOWED_METRICS_AGGREGATIONS = [UNION_OF_AGGREGATED_METRICS, INTERSECTION_OF_AGGREGATED_METRICS].freeze + AGGREGATED_METRICS_PATH = File.expand_path('aggregated_metrics/*.yml', __dir__) # Track event on entity_id # Increment a Redis HLL counter for unique event_name and entity_id # - # All events should be added to know_events file lib/gitlab/usage_data_counters/known_events.yml + # All events should be added to known_events yml files lib/gitlab/usage_data_counters/known_events/ # # Event example: # @@ -25,6 +35,7 @@ module Gitlab # category: compliance # Group events in categories # expiry: 29 # Optional expiration time in days, default value 29 days for daily and 6.weeks for weekly # aggregation: daily # Aggregation level, keys are stored daily or weekly + # feature_flag: # The event feature flag # # Usage: # @@ -33,28 +44,24 @@ module Gitlab class << self include Gitlab::Utils::UsageData - def track_event(entity_id, event_name, time = Time.zone.now) - return unless Gitlab::CurrentSettings.usage_ping_enabled? - - event = event_for(event_name) - - raise UnknownEvent.new("Unknown event #{event_name}") unless event.present? - - Gitlab::Redis::HLL.add(key: redis_key(event, time), value: entity_id, expiry: expiry(event)) + def track_event(value, event_name, time = Time.zone.now) + track(value, event_name, time: time) end - def unique_events(event_names:, start_date:, end_date:) - events = events_for(Array(event_names)) - - raise 'Events should be in same slot' unless events_in_same_slot?(events) - raise 'Events should be in same category' unless events_in_same_category?(events) - raise 'Events should have same aggregation level' unless events_same_aggregation?(events) - - aggregation = events.first[:aggregation] + def track_event_in_context(value, event_name, context, time = Time.zone.now) + return if context.blank? + return unless context.in?(valid_context_list) - keys = keys_for_aggregation(aggregation, events: events, start_date: start_date, end_date: end_date) + track(value, event_name, context: context, time: time) + end - redis_usage_data { Gitlab::Redis::HLL.count(keys: keys) } + def unique_events(event_names:, start_date:, end_date:, context: '') + count_unique_events(event_names: event_names, start_date: start_date, end_date: end_date, context: context) do |events| + raise SlotMismatch, events unless events_in_same_slot?(events) + raise CategoryMismatch, events unless events_in_same_category?(events) + raise AggregationMismatch, events unless events_same_aggregation?(events) + raise InvalidContext if context.present? && !context.in?(valid_context_list) + end end def categories @@ -72,8 +79,8 @@ module Gitlab events_names = events_for_category(category) event_results = events_names.each_with_object({}) do |event, hash| - hash["#{event}_weekly"] = unique_events(event_names: event, start_date: 7.days.ago.to_date, end_date: Date.current) - hash["#{event}_monthly"] = unique_events(event_names: event, start_date: 4.weeks.ago.to_date, end_date: Date.current) + hash["#{event}_weekly"] = unique_events(event_names: [event], start_date: 7.days.ago.to_date, end_date: Date.current) + hash["#{event}_monthly"] = unique_events(event_names: [event], start_date: 4.weeks.ago.to_date, end_date: Date.current) end if eligible_for_totals?(events_names) @@ -89,8 +96,136 @@ module Gitlab event_for(event_name).present? end + def aggregated_metrics_monthly_data + aggregated_metrics_data(4.weeks.ago.to_date) + end + + def aggregated_metrics_weekly_data + aggregated_metrics_data(7.days.ago.to_date) + end + + def known_events + @known_events ||= load_events(KNOWN_EVENTS_PATH) + end + + def aggregated_metrics + @aggregated_metrics ||= load_events(AGGREGATED_METRICS_PATH) + end + private + def track(value, event_name, context: '', time: Time.zone.now) + return unless Gitlab::CurrentSettings.usage_ping_enabled? + + event = event_for(event_name) + raise UnknownEvent, "Unknown event #{event_name}" unless event.present? + + Gitlab::Redis::HLL.add(key: redis_key(event, time, context), value: value, expiry: expiry(event)) + end + + # The aray of valid context on which we allow tracking + def valid_context_list + Plan.all_plans + end + + def aggregated_metrics_data(start_date) + aggregated_metrics.each_with_object({}) do |aggregation, weekly_data| + next if aggregation[:feature_flag] && Feature.disabled?(aggregation[:feature_flag], default_enabled: false, type: :development) + + weekly_data[aggregation[:name]] = calculate_count_for_aggregation(aggregation, start_date: start_date, end_date: Date.current) + end + end + + def calculate_count_for_aggregation(aggregation, start_date:, end_date:) + case aggregation[:operator] + when UNION_OF_AGGREGATED_METRICS + calculate_events_union(event_names: aggregation[:events], start_date: start_date, end_date: end_date) + when INTERSECTION_OF_AGGREGATED_METRICS + calculate_events_intersections(event_names: aggregation[:events], start_date: start_date, end_date: end_date) + else + raise UnknownAggregationOperator, "Events should be aggregated with one of operators #{ALLOWED_METRICS_AGGREGATIONS}" + end + end + + # calculate intersection of 'n' sets based on inclusion exclusion principle https://en.wikipedia.org/wiki/Inclusion%E2%80%93exclusion_principle + # this method will be extracted to dedicated module with https://gitlab.com/gitlab-org/gitlab/-/issues/273391 + def calculate_events_intersections(event_names:, start_date:, end_date:, subset_powers_cache: Hash.new({})) + # calculate power of intersection of all given metrics from inclusion exclusion principle + # |A + B + C| = (|A| + |B| + |C|) - (|A & B| + |A & C| + .. + |C & D|) + (|A & B & C|) => + # |A & B & C| = - (|A| + |B| + |C|) + (|A & B| + |A & C| + .. + |C & D|) + |A + B + C| + # |A + B + C + D| = (|A| + |B| + |C| + |D|) - (|A & B| + |A & C| + .. + |C & D|) + (|A & B & C| + |B & C & D|) - |A & B & C & D| => + # |A & B & C & D| = (|A| + |B| + |C| + |D|) - (|A & B| + |A & C| + .. + |C & D|) + (|A & B & C| + |B & C & D|) - |A + B + C + D| + + # calculate each components of equation except for the last one |A & B & C & D| = (|A| + |B| + |C| + |D|) - (|A & B| + |A & C| + .. + |C & D|) + (|A & B & C| + |B & C & D|) - ... + subset_powers_data = subsets_intersection_powers(event_names, start_date, end_date, subset_powers_cache) + + # calculate last component of the equation |A & B & C & D| = .... - |A + B + C + D| + power_of_union_of_all_events = begin + subset_powers_cache[event_names.size][event_names.join('_+_')] ||= \ + calculate_events_union(event_names: event_names, start_date: start_date, end_date: end_date) + end + + # in order to determine if part of equation (|A & B & C|, |A & B & C & D|), that represents the intersection that we need to calculate, + # is positive or negative in particular equation we need to determine if number of subsets is even or odd. Please take a look at two examples below + # |A + B + C| = (|A| + |B| + |C|) - (|A & B| + |A & C| + .. + |C & D|) + |A & B & C| => + # |A & B & C| = - (|A| + |B| + |C|) + (|A & B| + |A & C| + .. + |C & D|) + |A + B + C| + # |A + B + C + D| = (|A| + |B| + |C| + |D|) - (|A & B| + |A & C| + .. + |C & D|) + (|A & B & C| + |B & C & D|) - |A & B & C & D| => + # |A & B & C & D| = (|A| + |B| + |C| + |D|) - (|A & B| + |A & C| + .. + |C & D|) + (|A & B & C| + |B & C & D|) - |A + B + C + D| + subset_powers_size_even = subset_powers_data.size.even? + + # sum all components of equation except for the last one |A & B & C & D| = (|A| + |B| + |C| + |D|) - (|A & B| + |A & C| + .. + |C & D|) + (|A & B & C| + |B & C & D|) - ... => + sum_of_all_subset_powers = sum_subset_powers(subset_powers_data, subset_powers_size_even) + + # add last component of the equation |A & B & C & D| = sum_of_all_subset_powers - |A + B + C + D| + sum_of_all_subset_powers + (subset_powers_size_even ? power_of_union_of_all_events : -power_of_union_of_all_events) + end + + def sum_subset_powers(subset_powers_data, subset_powers_size_even) + sum_without_sign = subset_powers_data.to_enum.with_index.sum do |value, index| + (index + 1).odd? ? value : -value + end + + (subset_powers_size_even ? -1 : 1) * sum_without_sign + end + + def subsets_intersection_powers(event_names, start_date, end_date, subset_powers_cache) + subset_sizes = (1..(event_names.size - 1)) + + subset_sizes.map do |subset_size| + if subset_size > 1 + # calculate sum of powers of intersection between each subset (with given size) of metrics: #|A + B + C + D| = ... - (|A & B| + |A & C| + .. + |C & D|) + event_names.combination(subset_size).sum do |events_subset| + subset_powers_cache[subset_size][events_subset.join('_&_')] ||= \ + calculate_events_intersections(event_names: events_subset, start_date: start_date, end_date: end_date, subset_powers_cache: subset_powers_cache) + end + else + # calculate sum of powers of each set (metric) alone #|A + B + C + D| = (|A| + |B| + |C| + |D|) - ... + event_names.sum do |event| + subset_powers_cache[subset_size][event] ||= \ + unique_events(event_names: event, start_date: start_date, end_date: end_date) + end + end + end + end + + def calculate_events_union(event_names:, start_date:, end_date:) + count_unique_events(event_names: event_names, start_date: start_date, end_date: end_date) do |events| + raise SlotMismatch, events unless events_in_same_slot?(events) + raise AggregationMismatch, events unless events_same_aggregation?(events) + end + end + + def count_unique_events(event_names:, start_date:, end_date:, context: '') + events = events_for(Array(event_names).map(&:to_s)) + + yield events if block_given? + + aggregation = events.first[:aggregation] + + keys = keys_for_aggregation(aggregation, events: events, start_date: start_date, end_date: end_date, context: context) + redis_usage_data { Gitlab::Redis::HLL.count(keys: keys) } + end + # Allow to add totals for events that are in the same redis slot, category and have the same aggregation level # and if there are more than 1 event def eligible_for_totals?(events_names) @@ -100,16 +235,22 @@ module Gitlab events_in_same_slot?(events) && events_in_same_category?(events) && events_same_aggregation?(events) end - def keys_for_aggregation(aggregation, events:, start_date:, end_date:) + def keys_for_aggregation(aggregation, events:, start_date:, end_date:, context: '') if aggregation.to_sym == :daily - daily_redis_keys(events: events, start_date: start_date, end_date: end_date) + daily_redis_keys(events: events, start_date: start_date, end_date: end_date, context: context) else - weekly_redis_keys(events: events, start_date: start_date, end_date: end_date) + weekly_redis_keys(events: events, start_date: start_date, end_date: end_date, context: context) end end - def known_events - @known_events ||= YAML.load_file(Rails.root.join(KNOWN_EVENTS_PATH)).map(&:with_indifferent_access) + def load_events(wildcard) + Dir[wildcard].each_with_object([]) do |path, events| + events.push(*load_yaml_from_path(path)) + end + end + + def load_yaml_from_path(path) + YAML.safe_load(File.read(path))&.map(&:with_indifferent_access) end def known_events_names @@ -141,7 +282,7 @@ module Gitlab end def event_for(event_name) - known_events.find { |event| event[:name] == event_name } + known_events.find { |event| event[:name] == event_name.to_s } end def events_for(event_names) @@ -153,17 +294,26 @@ module Gitlab end # Compose the key in order to store events daily or weekly - def redis_key(event, time) + def redis_key(event, time, context = '') raise UnknownEvent.new("Unknown event #{event[:name]}") unless known_events_names.include?(event[:name].to_s) raise UnknownAggregation.new("Use :daily or :weekly aggregation") unless ALLOWED_AGGREGATIONS.include?(event[:aggregation].to_sym) + key = apply_slot(event) + key = apply_time_aggregation(key, time, event) + key = "#{context}_#{key}" if context.present? + key + end + + def apply_slot(event) slot = redis_slot(event) - key = if slot.present? - event[:name].to_s.gsub(slot, "{#{slot}}") - else - "{#{event[:name]}}" - end + if slot.present? + event[:name].to_s.gsub(slot, "{#{slot}}") + else + "{#{event[:name]}}" + end + end + def apply_time_aggregation(key, time, event) if event[:aggregation].to_sym == :daily year_day = time.strftime('%G-%j') "#{year_day}-#{key}" @@ -173,21 +323,29 @@ module Gitlab end end - def daily_redis_keys(events:, start_date:, end_date:) + def daily_redis_keys(events:, start_date:, end_date:, context: '') (start_date.to_date..end_date.to_date).map do |date| - events.map { |event| redis_key(event, date) } + events.map { |event| redis_key(event, date, context) } end.flatten end - def weekly_redis_keys(events:, start_date:, end_date:) + def validate_aggregation_operator!(operator) + return true if ALLOWED_METRICS_AGGREGATIONS.include?(operator) + + raise UnknownAggregationOperator.new("Events should be aggregated with one of operators #{ALLOWED_METRICS_AGGREGATIONS}") + end + + def weekly_redis_keys(events:, start_date:, end_date:, context: '') weeks = end_date.to_date.cweek - start_date.to_date.cweek weeks = 1 if weeks == 0 (0..(weeks - 1)).map do |week_increment| - events.map { |event| redis_key(event, start_date + week_increment * 7.days) } + events.map { |event| redis_key(event, start_date + week_increment * 7.days, context) } end.flatten end end end end end + +Gitlab::UsageDataCounters::HLLRedisCounter.prepend_if_ee('EE::Gitlab::UsageDataCounters::HLLRedisCounter') |