Add latest changes from gitlab-org/gitlab@13-8-stable-eev13.8.0-rc42

author: Robert Speicher <rspeicher@gmail.com> 2021-01-20 13:34:23 -0600
committer: Robert Speicher <rspeicher@gmail.com> 2021-01-20 13:34:23 -0600
commit: 6438df3a1e0fb944485cebf07976160184697d72 (patch)
tree: 00b09bfd170e77ae9391b1a2f5a93ef6839f2597 /lib/gitlab/database/postgres_hll
parent: 42bcd54d971da7ef2854b896a7b34f4ef8601067 (diff)
download: gitlab-ce-6438df3a1e0fb944485cebf07976160184697d72.tar.gz
2 files changed, 101 insertions, 38 deletions
diff --git a/lib/gitlab/database/postgres_hll/batch_distinct_counter.rb b/lib/gitlab/database/postgres_hll/batch_distinct_counter.rb
index 33faa2ef1b0..62dfaeeaae3 100644
--- a/lib/gitlab/database/postgres_hll/batch_distinct_counter.rb
+++ b/lib/gitlab/database/postgres_hll/batch_distinct_counter.rb
@@ -16,9 +16,9 @@ module Gitlab
       # Grouped relations are NOT supported yet.
       #
       # @example Usage
-      #  ::Gitlab::Database::PostgresHllBatchDistinctCount.new(::Project, :creator_id).estimate_distinct_count
+      #  ::Gitlab::Database::PostgresHllBatchDistinctCount.new(::Project, :creator_id).execute
       #  ::Gitlab::Database::PostgresHllBatchDistinctCount.new(::Project.with_active_services.service_desk_enabled.where(time_period))
-      #    .estimate_distinct_count(
+      #    .execute(
       #      batch_size: 1_000,
       #      start: ::Project.with_active_services.service_desk_enabled.where(time_period).minimum(:id),
       #      finish: ::Project.with_active_services.service_desk_enabled.where(time_period).maximum(:id)
@@ -30,7 +30,6 @@ module Gitlab
       #  for the most of a cases this value is lower. However, if the exact value is necessary other tools has to be used.
       class BatchDistinctCounter
         ERROR_RATE = 4.9 # max encountered empirical error rate, used in tests
-        FALLBACK = -1
         MIN_REQUIRED_BATCH_SIZE = 750
         SLEEP_TIME_IN_SECONDS = 0.01 # 10 msec sleep
         MAX_DATA_VOLUME = 4_000_000_000
@@ -38,8 +37,10 @@ module Gitlab
         # Each query should take < 500ms https://gitlab.com/gitlab-org/gitlab/-/merge_requests/22705
         DEFAULT_BATCH_SIZE = 10_000
 
+        ZERO_OFFSET = 1
+        BUCKET_ID_MASK = (Buckets::TOTAL_BUCKETS - ZERO_OFFSET).to_s(2)
         BIT_31_MASK = "B'0#{'1' * 31}'"
-        BIT_9_MASK = "B'#{'0' * 23}#{'1' * 9}'"
+        BIT_32_NORMALIZED_BUCKET_ID_MASK = "B'#{'0' * (32 - BUCKET_ID_MASK.size)}#{BUCKET_ID_MASK}'"
         # @example source_query
         #   SELECT CAST(('X' || md5(CAST(%{column} as text))) as bit(32)) attr_hash_32_bits
         #   FROM %{relation}
@@ -48,73 +49,58 @@ module Gitlab
         #   AND %{column} IS NOT NULL
         BUCKETED_DATA_SQL = <<~SQL
           WITH hashed_attributes AS (%{source_query})
-          SELECT (attr_hash_32_bits & #{BIT_9_MASK})::int AS bucket_num,
+          SELECT (attr_hash_32_bits & #{BIT_32_NORMALIZED_BUCKET_ID_MASK})::int AS bucket_num,
             (31 - floor(log(2, min((attr_hash_32_bits & #{BIT_31_MASK})::int))))::int as bucket_hash
           FROM hashed_attributes
           GROUP BY 1
         SQL
 
-        TOTAL_BUCKETS_NUMBER = 512
+        WRONG_CONFIGURATION_ERROR = Class.new(ActiveRecord::StatementInvalid)
 
         def initialize(relation, column = nil)
           @relation = relation
           @column = column || relation.primary_key
         end
 
-        def unwanted_configuration?(finish, batch_size, start)
-          batch_size <= MIN_REQUIRED_BATCH_SIZE ||
-            (finish - start) >= MAX_DATA_VOLUME ||
-            start > finish
-        end
-
-        def estimate_distinct_count(batch_size: nil, start: nil, finish: nil)
+        # Executes counter that iterates over database source and return Gitlab::Database::PostgresHll::Buckets
+        # that can be used to estimation of number of uniq elements in analysed set
+        #
+        # @param batch_size maximal number of rows that will be analysed by single database query
+        # @param start initial pkey range
+        # @param finish final pkey range
+        # @return [Gitlab::Database::PostgresHll::Buckets] HyperLogLog data structure instance that can estimate number of unique elements
+        def execute(batch_size: nil, start: nil, finish: nil)
           raise 'BatchCount can not be run inside a transaction' if ActiveRecord::Base.connection.transaction_open?
 
           batch_size ||= DEFAULT_BATCH_SIZE
-
           start = actual_start(start)
           finish = actual_finish(finish)
 
-          raise "Batch counting expects positive values only for #{@column}" if start < 0 || finish < 0
-          return FALLBACK if unwanted_configuration?(finish, batch_size, start)
+          raise WRONG_CONFIGURATION_ERROR if unwanted_configuration?(start, finish, batch_size)
 
           batch_start = start
-          hll_blob = {}
+          hll_buckets = Buckets.new
 
           while batch_start <= finish
             begin
-              hll_blob.merge!(hll_blob_for_batch(batch_start, batch_start + batch_size)) {|_key, old, new| new > old ? new : old }
+              hll_buckets.merge_hash!(hll_buckets_for_batch(batch_start, batch_start + batch_size))
               batch_start += batch_size
             end
             sleep(SLEEP_TIME_IN_SECONDS)
           end
 
-          estimate_cardinality(hll_blob)
+          hll_buckets
         end
 
         private
 
-        # arbitrary values that are present in #estimate_cardinality
-        # are sourced from https://www.sisense.com/blog/hyperloglog-in-pure-sql/
-        # article, they are not representing any entity and serves as tune value
-        # for the whole equation
-        def estimate_cardinality(hll_blob)
-          num_zero_buckets = TOTAL_BUCKETS_NUMBER - hll_blob.size
-
-          num_uniques = (
-            ((TOTAL_BUCKETS_NUMBER**2) * (0.7213 / (1 + 1.079 / TOTAL_BUCKETS_NUMBER))) /
-              (num_zero_buckets + hll_blob.values.sum { |bucket_hash| 2**(-1 * bucket_hash)} )
-          ).to_i
-
-          if num_zero_buckets > 0 && num_uniques < 2.5 * TOTAL_BUCKETS_NUMBER
-            ((0.7213 / (1 + 1.079 / TOTAL_BUCKETS_NUMBER)) * (TOTAL_BUCKETS_NUMBER *
-              Math.log2(TOTAL_BUCKETS_NUMBER.to_f / num_zero_buckets)))
-          else
-            num_uniques
-          end
+        def unwanted_configuration?(start, finish, batch_size)
+          batch_size <= MIN_REQUIRED_BATCH_SIZE ||
+            (finish - start) >= MAX_DATA_VOLUME ||
+            start > finish || start < 0 || finish < 0
         end
 
-        def hll_blob_for_batch(start, finish)
+        def hll_buckets_for_batch(start, finish)
           @relation
             .connection
             .execute(BUCKETED_DATA_SQL % { source_query: source_query(start, finish) })
diff --git a/lib/gitlab/database/postgres_hll/buckets.rb b/lib/gitlab/database/postgres_hll/buckets.rb
new file mode 100644
index 00000000000..429e823379f
--- /dev/null
+++ b/lib/gitlab/database/postgres_hll/buckets.rb
@@ -0,0 +1,77 @@
+# frozen_string_literal: true
+
+module Gitlab
+  module Database
+    module PostgresHll
+      # Bucket class represent data structure build with HyperLogLog algorithm
+      # that models data distribution in analysed set. This representation than can be used
+      # for following purposes
+      #   1. Estimating number of unique elements that this structure represents
+      #   2. Merging with other Buckets structure to later estimate number of unique elements in sum of two
+      #      represented data sets
+      #   3. Serializing Buckets structure to json format, that can be stored in various persistence layers
+      #
+      # @example Usage
+      #  ::Gitlab::Database::PostgresHll::Buckets.new(141 => 1, 56 => 1).estimated_distinct_count
+      #  ::Gitlab::Database::PostgresHll::Buckets.new(141 => 1, 56 => 1).merge_hash!(141 => 1, 56 => 5).estimated_distinct_count
+      #  ::Gitlab::Database::PostgresHll::Buckets.new(141 => 1, 56 => 1).to_json
+
+      # @note HyperLogLog is an PROBABILISTIC algorithm that ESTIMATES distinct count of given attribute value for supplied relation
+      #  Like all probabilistic algorithm is has ERROR RATE margin, that can affect values,
+      #  for given implementation no higher value was reported (https://gitlab.com/gitlab-org/gitlab/-/merge_requests/45673#accuracy-estimation) than 5.3%
+      #  for the most of a cases this value is lower. However, if the exact value is necessary other tools has to be used.
+      class Buckets
+        TOTAL_BUCKETS = 512
+
+        def initialize(buckets = {})
+          @buckets = buckets
+        end
+
+        # Based on HyperLogLog structure estimates number of unique elements in analysed set.
+        #
+        # @return [Float] Estimate number of unique elements
+        def estimated_distinct_count
+          @estimated_distinct_count ||= estimate_cardinality
+        end
+
+        # Updates instance underlying HyperLogLog structure by merging it with other HyperLogLog structure
+        #
+        # @param other_buckets_hash hash with HyperLogLog structure representation
+        def merge_hash!(other_buckets_hash)
+          buckets.merge!(other_buckets_hash) {|_key, old, new| new > old ? new : old }
+        end
+
+        # Serialize instance underlying HyperLogLog structure to JSON format, that can be stored in various persistence layers
+        #
+        # @return [String] HyperLogLog data structure serialized to JSON
+        def to_json(_ = nil)
+          buckets.to_json
+        end
+
+        private
+
+        attr_accessor :buckets
+
+        # arbitrary values that are present in #estimate_cardinality
+        # are sourced from https://www.sisense.com/blog/hyperloglog-in-pure-sql/
+        # article, they are not representing any entity and serves as tune value
+        # for the whole equation
+        def estimate_cardinality
+          num_zero_buckets = TOTAL_BUCKETS - buckets.size
+
+          num_uniques = (
+            ((TOTAL_BUCKETS**2) * (0.7213 / (1 + 1.079 / TOTAL_BUCKETS))) /
+            (num_zero_buckets + buckets.values.sum { |bucket_hash| 2**(-1 * bucket_hash)} )
+          ).to_i
+
+          if num_zero_buckets > 0 && num_uniques < 2.5 * TOTAL_BUCKETS
+            ((0.7213 / (1 + 1.079 / TOTAL_BUCKETS)) * (TOTAL_BUCKETS *
+              Math.log2(TOTAL_BUCKETS.to_f / num_zero_buckets)))
+          else
+            num_uniques
+          end
+        end
+      end
+    end
+  end
+end
author	Robert Speicher <rspeicher@gmail.com>	2021-01-20 13:34:23 -0600
committer	Robert Speicher <rspeicher@gmail.com>	2021-01-20 13:34:23 -0600
commit	6438df3a1e0fb944485cebf07976160184697d72 (patch)
tree	00b09bfd170e77ae9391b1a2f5a93ef6839f2597 /lib/gitlab/database/postgres_hll
parent	42bcd54d971da7ef2854b896a7b34f4ef8601067 (diff)
download	gitlab-ce-6438df3a1e0fb944485cebf07976160184697d72.tar.gz