fts: Do not apply stop-words when tokenizing query search terms

FTS5 indicates the purpose of a tokenization through the flags passed to the xTokenize vfunc. Check for the FTS5_TOKENIZE_QUERY flag, which indicates that the search terms of a query are being tokenized, and do not apply the stop-word list in that case. One example where applying stop words is potentially harmful is "search as you type" UIs: e.g. while typing the word "ontology" you would first type "onto", which is an ignored word. Only after typing the next character would you get matches, which seems like irregular behavior.
author: Carlos Garnacho <carlosg@gnome.org> 2016-06-07 00:49:15 +0200
committer: Carlos Garnacho <carlosg@gnome.org> 2016-06-21 14:32:36 +0200
commit: 63e507865d661fee59a6e9c789cffa7d1073017e (patch)
tree: 71144b8203bac180b0795ee25509f945ef78e168
parent: 9fcbba78b87c4dbf2493997fa75f594f0c583990 (diff)
download: tracker-63e507865d661fee59a6e9c789cffa7d1073017e.tar.gz
1 files changed, 8 insertions, 2 deletions
diff --git a/src/libtracker-fts/tracker-fts-tokenizer.c b/src/libtracker-fts/tracker-fts-tokenizer.c
index 26764aaf3..e0550292d 100644
--- a/src/libtracker-fts/tracker-fts-tokenizer.c
+++ b/src/libtracker-fts/tracker-fts-tokenizer.c
@@ -95,18 +95,24 @@ tracker_tokenizer_tokenize (Fts5Tokenizer *fts5_tokenizer,
 	TrackerTokenizer *tokenizer = (TrackerTokenizer *) fts5_tokenizer;
 	TrackerTokenizerData *data = tokenizer->data;
 	const gchar *token;
-	gboolean stop_word;
+	gboolean stop_word, ignore_stop_words = data->ignore_stop_words;
 	int n_tokens = 0, pos, start, end, len;
 	int rc = SQLITE_OK;
 
 	if (length <= 0)
 		return rc;
 
+	/* When tokenizing the query, we don't want to ignore stop words,
+	 * we might ignore otherwise valid matches.
+	 */
+	if (flags & FTS5_TOKENIZE_QUERY)
+		ignore_stop_words = FALSE;
+
 	tracker_parser_reset (tokenizer->parser, text, length,
 	                      data->max_word_length,
 	                      data->enable_stemmer,
 	                      data->enable_unaccent,
-	                      data->ignore_stop_words,
+	                      ignore_stop_words,
 	                      TRUE,
 	                      data->ignore_numbers);
author	Carlos Garnacho <carlosg@gnome.org>	2016-06-07 00:49:15 +0200
committer	Carlos Garnacho <carlosg@gnome.org>	2016-06-21 14:32:36 +0200
commit	63e507865d661fee59a6e9c789cffa7d1073017e (patch)
tree	71144b8203bac180b0795ee25509f945ef78e168
parent	9fcbba78b87c4dbf2493997fa75f594f0c583990 (diff)
download	tracker-63e507865d661fee59a6e9c789cffa7d1073017e.tar.gz