From 519dcb04dc0bbfbaab85ecb45dda390340a5122d Mon Sep 17 00:00:00 2001 From: Richard Hughes Date: Tue, 2 May 2017 09:59:05 +0100 Subject: Casefold all stemmed entries The idea for the stemmer is to input a string and to return a more generic version of it. We already casefold the search parameters which means we can reduce the duplication in the stemmer cache and also match more types of mixed-case search values. --- libappstream-glib/as-self-test.c | 9 ++++++++- libappstream-glib/as-stemmer.c | 16 ++++++++++------ 2 files changed, 18 insertions(+), 7 deletions(-) diff --git a/libappstream-glib/as-self-test.c b/libappstream-glib/as-self-test.c index 93935e4..218f4b6 100644 --- a/libappstream-glib/as-self-test.c +++ b/libappstream-glib/as-self-test.c @@ -2970,13 +2970,14 @@ as_test_app_search_func (void) const gchar *all[] = { "gnome", "install", "software", NULL }; const gchar *none[] = { "gnome", "xxx", "software", NULL }; const gchar *mime[] = { "application/vnd.oasis.opendocument.text", NULL }; + g_auto(GStrv) tokens = NULL; g_autoptr(AsApp) app = NULL; g_autoptr(GHashTable) search_blacklist = NULL; g_autoptr(AsStemmer) stemmer = as_stemmer_new (); app = as_app_new (); as_app_set_stemmer (app, stemmer); - as_app_set_id (app, "gnome-software"); + as_app_set_id (app, "org.gnome.Software.desktop"); as_app_add_pkgname (app, "gnome-software"); as_app_set_name (app, NULL, "GNOME Software X-Plane"); as_app_set_comment (app, NULL, "Install and remove software"); @@ -3001,6 +3002,12 @@ as_test_app_search_func (void) g_assert_cmpint (as_app_search_matches_all (app, (gchar**) none), ==, 0); g_assert_cmpint (as_app_search_matches_all (app, (gchar**) mime), ==, 4); + /* test searching for all tokenized tokens */ + tokens = as_utils_search_tokenize ("org.gnome.Software"); + g_assert_cmpstr (tokens[0], ==, "org.gnome.software"); + g_assert_cmpstr (tokens[1], ==, NULL); + g_assert_cmpint (as_app_search_matches_all (app, tokens), ==, 256); + /* test tokenization of hyphenated name */ g_assert_cmpint (as_app_search_matches (app, "x-plane"), ==, 64); g_assert_cmpint (as_app_search_matches (app, "plane"), ==, 64); diff --git a/libappstream-glib/as-stemmer.c b/libappstream-glib/as-stemmer.c index ef7ccd6..3a9478d 100644 --- a/libappstream-glib/as-stemmer.c +++ b/libappstream-glib/as-stemmer.c @@ -59,6 +59,7 @@ as_stemmer_process (AsStemmer *stemmer, const gchar *value) AsRefString *new; const gchar *tmp; gsize value_len; + g_autofree gchar *value_casefold = NULL; g_autoptr(GMutexLocker) locker = g_mutex_locker_new (&stemmer->ctx_mutex); /* look for word in the cache */ @@ -67,25 +68,28 @@ as_stemmer_process (AsStemmer *stemmer, const gchar *value) return as_ref_string_ref (new); /* not enabled */ + value_casefold = g_utf8_casefold (value, -1); if (stemmer->ctx == NULL || !stemmer->enabled) - return as_ref_string_new (value); + return as_ref_string_new (value_casefold); /* stem, then add to the cache */ - value_len = strlen (value); + value_len = strlen (value_casefold); tmp = (const gchar *) sb_stemmer_stem (stemmer->ctx, - (guchar *) value, + (guchar *) value_casefold, (gint) value_len); if (value_len == (gsize) sb_stemmer_length (stemmer->ctx)) { - new = as_ref_string_new_with_length (value, value_len); + new = as_ref_string_new_with_length (value_casefold, value_len); } else { new = as_ref_string_new_copy (tmp); } g_hash_table_insert (stemmer->hash, - as_ref_string_new (value), + as_ref_string_new (value_casefold), as_ref_string_ref (new)); return new; #else - return as_ref_string_new (value); + g_autofree gchar *value_casefold = NULL; + value_casefold = g_utf8_casefold (value, -1); + return as_ref_string_new (value_casefold); #endif } -- cgit v1.2.1