summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Richard Hughes <richard@hughsie.com> 2017-05-02 09:59:05 +0100
committer: Richard Hughes <richard@hughsie.com> 2017-05-02 09:59:19 +0100
commit: 519dcb04dc0bbfbaab85ecb45dda390340a5122d (patch)
tree: 89df129391c156fe60aa0a10c096aaa17f93cf60
parent: 471baf6f896c5cea5b93df360147f5501384c200 (diff)
download: appstream-glib-519dcb04dc0bbfbaab85ecb45dda390340a5122d.tar.gz
Casefold all stemmed entries
The idea for the stemmer is to take a string as input and return a more generic version of it. We already casefold the search parameters, which means we can reduce the duplication in the stemmer cache and also match more types of mixed-case search values.
-rw-r--r-- libappstream-glib/as-self-test.c | 9
-rw-r--r-- libappstream-glib/as-stemmer.c | 16
2 files changed, 18 insertions, 7 deletions
diff --git a/libappstream-glib/as-self-test.c b/libappstream-glib/as-self-test.c
index 93935e4..218f4b6 100644
--- a/libappstream-glib/as-self-test.c
+++ b/libappstream-glib/as-self-test.c
@@ -2970,13 +2970,14 @@ as_test_app_search_func (void)
const gchar *all[] = { "gnome", "install", "software", NULL };
const gchar *none[] = { "gnome", "xxx", "software", NULL };
const gchar *mime[] = { "application/vnd.oasis.opendocument.text", NULL };
+ g_auto(GStrv) tokens = NULL;
g_autoptr(AsApp) app = NULL;
g_autoptr(GHashTable) search_blacklist = NULL;
g_autoptr(AsStemmer) stemmer = as_stemmer_new ();
app = as_app_new ();
as_app_set_stemmer (app, stemmer);
- as_app_set_id (app, "gnome-software");
+ as_app_set_id (app, "org.gnome.Software.desktop");
as_app_add_pkgname (app, "gnome-software");
as_app_set_name (app, NULL, "GNOME Software X-Plane");
as_app_set_comment (app, NULL, "Install and remove software");
@@ -3001,6 +3002,12 @@ as_test_app_search_func (void)
g_assert_cmpint (as_app_search_matches_all (app, (gchar**) none), ==, 0);
g_assert_cmpint (as_app_search_matches_all (app, (gchar**) mime), ==, 4);
+ /* test searching for all tokenized tokens */
+ tokens = as_utils_search_tokenize ("org.gnome.Software");
+ g_assert_cmpstr (tokens[0], ==, "org.gnome.software");
+ g_assert_cmpstr (tokens[1], ==, NULL);
+ g_assert_cmpint (as_app_search_matches_all (app, tokens), ==, 256);
+
/* test tokenization of hyphenated name */
g_assert_cmpint (as_app_search_matches (app, "x-plane"), ==, 64);
g_assert_cmpint (as_app_search_matches (app, "plane"), ==, 64);
diff --git a/libappstream-glib/as-stemmer.c b/libappstream-glib/as-stemmer.c
index ef7ccd6..3a9478d 100644
--- a/libappstream-glib/as-stemmer.c
+++ b/libappstream-glib/as-stemmer.c
@@ -59,6 +59,7 @@ as_stemmer_process (AsStemmer *stemmer, const gchar *value)
AsRefString *new;
const gchar *tmp;
gsize value_len;
+ g_autofree gchar *value_casefold = NULL;
g_autoptr(GMutexLocker) locker = g_mutex_locker_new (&stemmer->ctx_mutex);
/* look for word in the cache */
@@ -67,25 +68,28 @@ as_stemmer_process (AsStemmer *stemmer, const gchar *value)
return as_ref_string_ref (new);
/* not enabled */
+ value_casefold = g_utf8_casefold (value, -1);
if (stemmer->ctx == NULL || !stemmer->enabled)
- return as_ref_string_new (value);
+ return as_ref_string_new (value_casefold);
/* stem, then add to the cache */
- value_len = strlen (value);
+ value_len = strlen (value_casefold);
tmp = (const gchar *) sb_stemmer_stem (stemmer->ctx,
- (guchar *) value,
+ (guchar *) value_casefold,
(gint) value_len);
if (value_len == (gsize) sb_stemmer_length (stemmer->ctx)) {
- new = as_ref_string_new_with_length (value, value_len);
+ new = as_ref_string_new_with_length (value_casefold, value_len);
} else {
new = as_ref_string_new_copy (tmp);
}
g_hash_table_insert (stemmer->hash,
- as_ref_string_new (value),
+ as_ref_string_new (value_casefold),
as_ref_string_ref (new));
return new;
#else
- return as_ref_string_new (value);
+ g_autofree gchar *value_casefold = NULL;
+ value_casefold = g_utf8_casefold (value, -1);
+ return as_ref_string_new (value_casefold);
#endif
}