1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
|
# frozen_string_literal: true
# This module adds PG full-text search capabilities to a model.
# A `search_data` association with a `search_vector` column is required.
#
# Declare the fields that will be part of the search vector with their
# corresponding weights. Possible values for weight are A, B, C, or D.
# For example:
#
#   include PgFullTextSearchable
#   pg_full_text_searchable columns: [{ name: 'title', weight: 'A' }, { name: 'description', weight: 'B' }]
#
# This module sets up an `after_save_commit` hook that updates the search data
# when the searchable columns are changed. You will need to implement the
# `#persist_pg_full_text_search_vector` method that does the actual insert or update.
#
# This also adds a `pg_full_text_search` scope so you can do:
#
#   Model.pg_full_text_search("some search term")
module PgFullTextSearchable
  extend ActiveSupport::Concern

  # Runs of 50 or more of these characters are replaced with a space before
  # a column is indexed (see #tsvector_arel_node).
  LONG_WORDS_REGEX = %r([A-Za-z0-9+/@]{50,}).freeze
  # Maximum number of characters of a column that is handed to `to_tsvector`.
  TSVECTOR_MAX_LENGTH = 1.megabyte
  # Text search configuration passed to both `to_tsvector` and `to_tsquery`.
  TEXT_SEARCH_DICTIONARY = 'english'
  # SQL expression persisted when every searchable column is blank, so that the
  # persistence hook always receives a valid tsvector expression.
  EMPTY_TSVECTOR_SQL = "''::tsvector"
  # Matches a URL scheme such as `https://` at the start of a word.
  URL_SCHEME_REGEX = %r{(?<=\A|\W)\w+://(?=\w+)}.freeze
  # Every character matched here is replaced with a space in search queries.
  TSQUERY_DISALLOWED_CHARACTERS_REGEX = %r{[^a-zA-Z0-9 .@/\-_"]}.freeze

  # Rebuilds the weighted search vector from the declared searchable columns
  # and hands it to #persist_pg_full_text_search_vector.
  #
  # Blank columns are skipped. When all columns are blank an empty tsvector is
  # persisted instead of an empty (and therefore invalid) SQL fragment.
  #
  # A "string is too long for tsvector" failure is logged and swallowed; any
  # other ActiveRecord::StatementInvalid is re-raised.
  def update_search_data!
    tsvector_fragments = self.class.pg_full_text_searchable_columns.map do |column, weight|
      tsvector_arel_node(column, weight)&.to_sql
    end.compact

    search_vector_sql = tsvector_fragments.empty? ? EMPTY_TSVECTOR_SQL : tsvector_fragments.join(' || ')

    persist_pg_full_text_search_vector(Arel.sql(search_vector_sql))
  rescue ActiveRecord::StatementInvalid => e
    raise unless e.cause.is_a?(PG::ProgramLimitExceeded) && e.message.include?('string is too long for tsvector')

    Gitlab::AppJsonLogger.error(
      message: 'Error updating search data: string is too long for tsvector',
      class: self.class.name,
      model_id: self.id
    )
  end

  private

  # Hook to be overridden by the including model: stores the given SQL
  # expression as the record's search vector.
  #
  # @param search_vector [Arel SQL literal] tsvector expression to persist
  # @raise [NotImplementedError] unless overridden
  def persist_pg_full_text_search_vector(search_vector)
    raise NotImplementedError
  end

  # Builds `setweight(to_tsvector('english', <text>), <weight>)` for one column.
  #
  # @param column [String] attribute name read via `self[column]`
  # @param weight [String] weight label given to `setweight`
  # @return [Arel::Nodes::NamedFunction, nil] nil when the column is blank
  def tsvector_arel_node(column, weight)
    return if self[column].blank?

    column_text = self[column].gsub(LONG_WORDS_REGEX, ' ')
    column_text = column_text[0, TSVECTOR_MAX_LENGTH]
    # Transliteration rules are locale-specific. Pin the default locale so the
    # indexed text does not depend on the locale of whoever triggered the save
    # and always matches what .build_tsquery produces for the search term.
    column_text = ::I18n.with_locale(::I18n.default_locale) do
      ActiveSupport::Inflector.transliterate(column_text)
    end

    Arel::Nodes::NamedFunction.new(
      'setweight',
      [
        Arel::Nodes::NamedFunction.new(
          'to_tsvector',
          [Arel::Nodes.build_quoted(TEXT_SEARCH_DICTIONARY), Arel::Nodes.build_quoted(column_text)]
        ),
        Arel::Nodes.build_quoted(weight)
      ]
    )
  end

  included do
    # Maps searchable column name => weight; filled by .pg_full_text_searchable.
    cattr_reader :pg_full_text_searchable_columns do
      {}
    end
  end

  class_methods do
    # Declares the searchable columns and installs the callbacks that keep the
    # search data up to date.
    #
    # @param columns [Array<Hash>] entries with a `:name` and a `:weight` key
    # @raise [RuntimeError] when searchable columns were already declared
    def pg_full_text_searchable(columns:)
      raise 'Full text search columns already defined!' if pg_full_text_searchable_columns.present?

      columns.each do |column|
        pg_full_text_searchable_columns[column[:name]] = column[:weight]
      end

      # When multiple updates are done in a transaction, `saved_changes` will only report the latest save
      # and we may miss an update to the searchable columns.
      # As a workaround, we set a dirty flag here and update the search data in `after_save_commit`.
      after_save do
        next unless pg_full_text_searchable_columns.each_key.any? { |name| saved_changes.key?(name) }

        @update_pg_full_text_search_data = true
      end

      # We update this outside the transaction because this could raise an error if the resulting tsvector
      # is too long. When that happens, we still persist the create / update but the model will not have a
      # search data record. This is fine in most cases because this is a very rare occurrence and only happens
      # with strings that are most likely unsearchable anyway.
      #
      # We also do not want to use a subtransaction here due to: https://gitlab.com/groups/gitlab-org/-/epics/6540
      after_save_commit do
        update_search_data! if @update_pg_full_text_search_data
        @update_pg_full_text_search_data = nil
      end
    end

    # Scope returning the records whose search vector matches the query.
    #
    # @param query [String] user-supplied search term
    # @param matched_columns [Array<String>] searchable column names whose
    #   weights restrict the match; an empty list adds no weight restriction
    # @return [ActiveRecord::Relation]
    def pg_full_text_search(query, matched_columns: [])
      search_data_table = reflect_on_association(:search_data).klass.arel_table

      joins(:search_data).where(
        Arel::Nodes::InfixOperation.new(
          '@@',
          search_data_table[:search_vector],
          Arel::Nodes::NamedFunction.new(
            'to_tsquery',
            [Arel::Nodes.build_quoted(TEXT_SEARCH_DICTIONARY), build_tsquery(query, matched_columns)]
          )
        )
      )
    end

    private

    # Turns the raw search string into a quoted tsquery expression.
    #
    # @param query [String] user-supplied search term
    # @param matched_columns [Array<String>] column names whose weights are appended to every term
    # @return [Arel node] quoted tsquery text
    def build_tsquery(query, matched_columns)
      # URLs get broken up into separate words when : is removed below, so we just remove the whole scheme.
      query = remove_url_scheme(query)
      # Remove accents from search term to match indexed data. The default locale is pinned because
      # transliteration rules are locale-specific and the indexed text is transliterated the same way.
      query = ::I18n.with_locale(::I18n.default_locale) { ActiveSupport::Inflector.transliterate(query) }
      # Prevent users from using tsquery operators that can cause syntax errors.
      query = filter_allowed_characters(query)

      # Unknown column names yield nil and are dropped.
      weights = pg_full_text_searchable_columns.values_at(*matched_columns).compact.join
      prefix_search_suffix = ":*#{weights}"

      tsquery = Gitlab::SQL::Pattern.split_query_to_search_terms(query).map do |search_term|
        case search_term
        when /\A\d+\z/ # Handles https://gitlab.com/gitlab-org/gitlab/-/issues/375337
          # Purely numeric terms also match their `-`-prefixed form.
          "(#{search_term + prefix_search_suffix} | -#{search_term + prefix_search_suffix})"
        when /\s/
          # Multi-word terms: each word must be followed by the next one; no prefix matching.
          search_term.split.map { |t| "#{t}:#{weights}" }.join(' <-> ')
        else
          search_term + prefix_search_suffix
        end
      end.join(' & ')

      Arel::Nodes.build_quoted(tsquery)
    end

    # Strips `scheme://` prefixes from the query.
    def remove_url_scheme(query)
      query.gsub(URL_SCHEME_REGEX, '')
    end

    # Replaces every character outside the allowed set with a space.
    def filter_allowed_characters(query)
      query.gsub(TSQUERY_DISALLOWED_CHARACTERS_REGEX, ' ')
    end
  end
end
|