summaryrefslogtreecommitdiff
path: root/app/models/concerns/pg_full_text_searchable.rb
blob: 813827478da9aed1f518afd36bc78d08a323696b (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
# frozen_string_literal: true

# This module adds PG full-text search capabilities to a model.
# A `search_data` association with a `search_vector` column is required.
#
# Declare the fields that will be part of the search vector with their
# corresponding weights. Possible values for weight are A, B, C, or D.
# For example:
#
# include PgFullTextSearchable
# pg_full_text_searchable columns: [{ name: 'title', weight: 'A' }, { name: 'description', weight: 'B' }]
#
# This module sets up an after_commit hook that updates the search data
# when the searchable columns are changed. You will need to implement the
# `#persist_pg_full_text_search_vector` method that does the actual insert or update.
#
# This also adds a `pg_full_text_search` scope so you can do:
#
# Model.pg_full_text_search("some search term")

module PgFullTextSearchable
  extend ActiveSupport::Concern

  LONG_WORDS_REGEX = %r([A-Za-z0-9+/@]{50,}).freeze
  TSVECTOR_MAX_LENGTH = 1.megabyte.freeze
  TEXT_SEARCH_DICTIONARY = 'english'
  URL_SCHEME_REGEX = %r{(?<=\A|\W)\w+://(?=\w+)}.freeze

  def update_search_data!
    tsvector_sql_nodes = self.class.pg_full_text_searchable_columns.map do |column, weight|
      tsvector_arel_node(column, weight)&.to_sql
    end

    persist_pg_full_text_search_vector(Arel.sql(tsvector_sql_nodes.compact.join(' || ')))
  rescue ActiveRecord::StatementInvalid => e
    raise unless e.cause.is_a?(PG::ProgramLimitExceeded) && e.message.include?('string is too long for tsvector')

    Gitlab::AppJsonLogger.error(
      message: 'Error updating search data: string is too long for tsvector',
      class: self.class.name,
      model_id: self.id
    )
  end

  private

  def persist_pg_full_text_search_vector(search_vector)
    raise NotImplementedError
  end

  def tsvector_arel_node(column, weight)
    return if self[column].blank?

    column_text = self[column].gsub(LONG_WORDS_REGEX, ' ')
    column_text = column_text[0..(TSVECTOR_MAX_LENGTH - 1)]
    column_text = ActiveSupport::Inflector.transliterate(column_text)

    Arel::Nodes::NamedFunction.new(
      'setweight',
      [
        Arel::Nodes::NamedFunction.new(
          'to_tsvector',
          [Arel::Nodes.build_quoted(TEXT_SEARCH_DICTIONARY), Arel::Nodes.build_quoted(column_text)]
        ),
        Arel::Nodes.build_quoted(weight)
      ]
    )
  end

  included do
    cattr_reader :pg_full_text_searchable_columns do
      {}
    end
  end

  class_methods do
    def pg_full_text_searchable(columns:)
      raise 'Full text search columns already defined!' if pg_full_text_searchable_columns.present?

      columns.each do |column|
        pg_full_text_searchable_columns[column[:name]] = column[:weight]
      end

      # When multiple updates are done in a transaction, `saved_changes` will only report the latest save
      # and we may miss an update to the searchable columns.
      # As a workaround, we set a dirty flag here and update the search data in `after_save_commit`.
      after_save do
        next unless pg_full_text_searchable_columns.keys.any? { |f| saved_changes.has_key?(f) }

        @update_pg_full_text_search_data = true
      end

      # We update this outside the transaction because this could raise an error if the resulting tsvector
      # is too long. When that happens, we still persist the create / update but the model will not have a
      # search data record. This is fine in most cases because this is a very rare occurrence and only happens
      # with strings that are most likely unsearchable anyway.
      #
      # We also do not want to use a subtransaction here due to: https://gitlab.com/groups/gitlab-org/-/epics/6540
      after_save_commit do
        update_search_data! if @update_pg_full_text_search_data
        @update_pg_full_text_search_data = nil
      end
    end

    def pg_full_text_search(search_term)
      search_data_table = reflect_on_association(:search_data).klass.arel_table

      # This fixes an inconsistency with how to_tsvector and websearch_to_tsquery process URLs
      # See https://gitlab.com/gitlab-org/gitlab/-/issues/354784#note_905431920
      search_term = remove_url_scheme(search_term)

      joins(:search_data).where(
        Arel::Nodes::InfixOperation.new(
          '@@',
          search_data_table[:search_vector],
          Arel::Nodes::NamedFunction.new(
            'websearch_to_tsquery',
            [Arel::Nodes.build_quoted(TEXT_SEARCH_DICTIONARY), Arel::Nodes.build_quoted(search_term)]
          )
        )
      )
    end

    private

    def remove_url_scheme(search_term)
      search_term.gsub(URL_SCHEME_REGEX, '')
    end
  end
end