summaryrefslogtreecommitdiff
path: root/lib/gitlab/database/similarity_score.rb
blob: 20bf6fa4d30707b38437052818c53112d969cca3 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
# frozen_string_literal: true

module Gitlab
  module Database
    # Builds Arel expressions that score rows by how similar one or more columns are to a search string, using the
    # SQL SIMILARITY function (presumably provided by PostgreSQL's pg_trgm extension - confirm for your database).
    class SimilarityScore
      EMPTY_STRING = Arel.sql("''").freeze
      # CAST(0 AS integer): constant score returned when there is nothing to compare against.
      EXPRESSION_ON_INVALID_INPUT = Arel::Nodes::NamedFunction.new('CAST', [Arel.sql('0').as('integer')]).freeze
      DEFAULT_MULTIPLIER = 1
      DISPLAY_NAME = self.name.underscore.freeze

      # Adds a "magic" comment in the generated SQL expression in order to be able to tell if we're sorting by similarity.
      # Example: /* gitlab/database/similarity_score */ SIMILARITY(COALESCE...
      SIMILARITY_FUNCTION_CALL_WITH_ANNOTATION = "/* #{DISPLAY_NAME} */ SIMILARITY"

      # This method returns an Arel expression that can be used in an ActiveRecord query to order the result set by
      # similarity.
      #
      # Note: Calculating the similarity score for a large volume of records is inefficient. Use SimilarityScore only
      # for a smaller result set which is already filtered by other conditions (< 10_000 records).
      #
      # ==== Parameters
      # * +search+ - [String] the user provided search string
      # * +rules+ - [{ column: COLUMN, multiplier: 1 }, { column: COLUMN_2, multiplier: 0.5 }] rules for the scoring.
      #   * +column+ - Arel column expression, example: Project.arel_table["name"]
      #   * +multiplier+ - Integer or Float to increase or decrease the score (optional, defaults to 1)
      #
      # ==== Returns
      #
      # An Arel node of the form ROUND(CAST((rule_1 + rule_2 + ...) AS numeric), 2). When +search+ or +rules+ is blank
      # (nil or empty), the constant EXPRESSION_ON_INVALID_INPUT (CAST(0 AS integer)) is returned instead.
      #
      # ==== Use case
      #
      # We'd like to search for projects by path, name and description. We want to rank higher the path and name matches, since
      # it's more likely that the user was remembering the path or the name of the project.
      #
      # Rules:
      #   [
      #     { column: Project.arel_table['path'], multiplier: 1 },
      #     { column: Project.arel_table['name'], multiplier: 1 },
      #     { column: Project.arel_table['description'], multiplier: 0.5 }
      #   ]
      #
      # ==== Examples
      #
      #  Similarity calculation based on one column:
      #
      #  Gitlab::Database::SimilarityScore.build_expression(search: 'my input', rules: [{ column: Project.arel_table['name'] }])
      #
      #  Similarity calculation based on two columns, where the second column has lower priority:
      #
      #  Gitlab::Database::SimilarityScore.build_expression(search: 'my input', rules: [
      #    { column: Project.arel_table['name'], multiplier: 1 },
      #    { column: Project.arel_table['description'], multiplier: 0.5 }
      #  ])
      #
      #  Integration with an ActiveRecord query:
      #
      #  table = Project.arel_table
      #
      #  order_expression = Gitlab::Database::SimilarityScore.build_expression(search: 'input', rules: [
      #    { column: table['name'], multiplier: 1 },
      #    { column: table['description'], multiplier: 0.5 }
      #  ])
      #
      #  Project.where("name LIKE ?", '%' + 'input' + '%').order(order_expression.desc)
      #
      #  The expression can be also used in SELECT:
      #
      #  results = Project.select(order_expression.as('similarity')).where("name LIKE ?", '%' + 'input' + '%').order(similarity: :desc)
      #  puts results.map(&:similarity)
      #
      def self.build_expression(search:, rules:)
        # `blank?` (rather than `empty?`) keeps the guard nil-safe for both arguments.
        return EXPRESSION_ON_INVALID_INPUT if search.blank? || rules.blank?

        # Quote once through the connection; the quoted literal is later embedded verbatim via Arel.sql.
        quoted_search = ActiveRecord::Base.connection.quote(search.to_s)

        rule_expressions = rules.map { |rule| rule_to_arel(quoted_search, rule) }

        # (SIMILARITY ...) + (SIMILARITY ...)
        # `reduce` without an initial value starts from the first expression, so a single rule is returned as-is.
        additions = rule_expressions.reduce do |sum, expression|
          Arel::Nodes::Addition.new(sum, expression)
        end

        score_as_numeric = Arel::Nodes::NamedFunction.new('CAST', [Arel::Nodes::Grouping.new(additions).as('numeric')])

        # Rounding the score to two decimals
        Arel::Nodes::NamedFunction.new('ROUND', [score_as_numeric, 2])
      end

      # Tells whether the SQL generated by +arel_query+ contains the annotated SIMILARITY call emitted by
      # build_expression.
      #
      # ==== Parameters
      # * +arel_query+ - any object responding to +to_sql+
      #
      # Returns true or false, based on a plain substring check of the generated SQL.
      def self.order_by_similarity?(arel_query)
        arel_query.to_sql.include?(SIMILARITY_FUNCTION_CALL_WITH_ANNOTATION)
      end

      # (SIMILARITY(COALESCE(column, ''), 'search_string') * CAST(multiplier AS numeric))
      def self.rule_to_arel(search, rule)
        Arel::Nodes::Grouping.new(
          Arel::Nodes::Multiplication.new(
            similarity_function_call(search, column_expression(rule)),
            multiplier_expression(rule)
          )
        )
      end

      # COALESCE(column, '')
      #
      # Raises KeyError when the rule has no :column key.
      def self.column_expression(rule)
        Arel::Nodes::NamedFunction.new('COALESCE', [rule.fetch(:column), EMPTY_STRING])
      end

      # SIMILARITY(COALESCE(column, ''), 'search_string')
      #
      # +search+ must already be a quoted SQL literal: it is wrapped in Arel.sql and therefore emitted without any
      # further escaping.
      def self.similarity_function_call(search, column)
        Arel::Nodes::NamedFunction.new(SIMILARITY_FUNCTION_CALL_WITH_ANNOTATION, [column, Arel.sql(search)])
      end

      # CAST(multiplier AS numeric)
      #
      # The multiplier is stringified and quoted by the connection before being cast; DEFAULT_MULTIPLIER is used when
      # the rule has no :multiplier key.
      def self.multiplier_expression(rule)
        quoted_multiplier = ActiveRecord::Base.connection.quote(rule.fetch(:multiplier, DEFAULT_MULTIPLIER).to_s)

        Arel::Nodes::NamedFunction.new('CAST', [Arel.sql(quoted_multiplier).as('numeric')])
      end

      private_class_method :rule_to_arel, :column_expression, :similarity_function_call, :multiplier_expression
    end
  end
end