diff options
Diffstat (limited to 'lib/gitlab/database/similarity_score.rb')
-rw-r--r-- | lib/gitlab/database/similarity_score.rb | 110 |
1 files changed, 110 insertions, 0 deletions
diff --git a/lib/gitlab/database/similarity_score.rb b/lib/gitlab/database/similarity_score.rb new file mode 100644 index 00000000000..2633c29438a --- /dev/null +++ b/lib/gitlab/database/similarity_score.rb @@ -0,0 +1,110 @@ +# frozen_string_literal: true + +module Gitlab + module Database + class SimilarityScore + EMPTY_STRING = Arel.sql("''").freeze + EXPRESSION_ON_INVALID_INPUT = Arel::Nodes::NamedFunction.new('CAST', [Arel.sql('0').as('integer')]).freeze + DEFAULT_MULTIPLIER = 1 + + # This method returns an Arel expression that can be used in an ActiveRecord query to order the resultset by similarity. + # + # Note: Calculating similarity score for large volume of records is inefficient. use SimilarityScore only for smaller + # resultset which is already filtered by other conditions (< 10_000 records). + # + # ==== Parameters + # * +search+ - [String] the user provided search string + # * +rules+ - [{ column: COLUMN, multiplier: 1 }, { column: COLUMN_2, multiplier: 0.5 }] rules for the scoring. + # * +column+ - Arel column expression, example: Project.arel_table["name"] + # * +multiplier+ - Integer or Float to increase or decrease the score (optional, defaults to 1) + # + # ==== Use case + # + # We'd like to search for projects by path, name and description. We want to rank higher the path and name matches, since + # it's more likely that the user was remembering the path or the name of the project. + # + # Rules: + # [ + # { column: Project.arel_table['path'], multiplier: 1 }, + # { column: Project.arel_table['name'], multiplier: 1 }, + # { column: Project.arel_table['description'], multiplier: 0.5 } + # ] + # + # ==== Examples + # + # Similarity calculation based on one column: + # + # Gitlab::Database::SimilarityScore.build_expession(search: 'my input', rules: [{ column: Project.arel_table['name'] }]) + # + # Similarity calculation based on two column, where the second column has lower priority: + # + # Gitlab::Database::SimilarityScore.build_expession(search: 'my input', rules: [ + # { column: Project.arel_table['name'], multiplier: 1 }, + # { column: Project.arel_table['description'], multiplier: 0.5 } + # ]) + # + # Integration with an ActiveRecord query: + # + # table = Project.arel_table + # + # order_expression = Gitlab::Database::SimilarityScore.build_expession(search: 'input', rules: [ + # { column: table['name'], multiplier: 1 }, + # { column: table['description'], multiplier: 0.5 } + # ]) + # + # Project.where("name LIKE ?", '%' + 'input' + '%').order(order_expression.desc) + # + # The expression can be also used in SELECT: + # + # results = Project.select(order_expression.as('similarity')).where("name LIKE ?", '%' + 'input' + '%').order(similarity: :desc) + # puts results.map(&:similarity) + # + def self.build_expression(search:, rules:) + return EXPRESSION_ON_INVALID_INPUT if search.blank? || rules.empty? + + quoted_search = ActiveRecord::Base.connection.quote(search.to_s) + + first_expression, *expressions = rules.map do |rule| + rule_to_arel(quoted_search, rule) + end + + # (SIMILARITY ...) + (SIMILARITY ...) + expressions.inject(first_expression) do |expression1, expression2| + Arel::Nodes::Addition.new(expression1, expression2) + end + end + + # (SIMILARITY(COALESCE(column, ''), 'search_string') * CAST(multiplier AS numeric)) + def self.rule_to_arel(search, rule) + Arel::Nodes::Grouping.new( + Arel::Nodes::Multiplication.new( + similarity_function_call(search, column_expression(rule)), + multiplier_expression(rule) + ) + ) + end + + # COALESCE(column, '') + def self.column_expression(rule) + Arel::Nodes::NamedFunction.new('COALESCE', [rule.fetch(:column), EMPTY_STRING]) + end + + # SIMILARITY(COALESCE(column, ''), 'search_string') + def self.similarity_function_call(search, column) + Arel::Nodes::NamedFunction.new('SIMILARITY', [column, Arel.sql(search)]) + end + + # CAST(multiplier AS numeric) + def self.multiplier_expression(rule) + quoted_multiplier = ActiveRecord::Base.connection.quote(rule.fetch(:multiplier, DEFAULT_MULTIPLIER).to_s) + + Arel::Nodes::NamedFunction.new('CAST', [Arel.sql(quoted_multiplier).as('numeric')]) + end + + private_class_method :rule_to_arel + private_class_method :column_expression + private_class_method :similarity_function_call + private_class_method :multiplier_expression + end + end +end |