diff options
author | Richard Hughes <richard@hughsie.com> | 2017-05-02 09:59:05 +0100 |
---|---|---|
committer | Richard Hughes <richard@hughsie.com> | 2017-05-02 09:59:19 +0100 |
commit | 519dcb04dc0bbfbaab85ecb45dda390340a5122d (patch) | |
tree | 89df129391c156fe60aa0a10c096aaa17f93cf60 | |
parent | 471baf6f896c5cea5b93df360147f5501384c200 (diff) | |
download | appstream-glib-519dcb04dc0bbfbaab85ecb45dda390340a5122d.tar.gz |
Casefold all stemmed entries
The stemmer takes a string as input and returns a more generic version of it.
We already casefold the search parameters, so casefolding the stemmer input as
well reduces the duplication in the stemmer cache and also lets more types of
mixed-case search values match.
-rw-r--r-- | libappstream-glib/as-self-test.c | 9 | ||||
-rw-r--r-- | libappstream-glib/as-stemmer.c | 16 |
2 files changed, 18 insertions, 7 deletions
diff --git a/libappstream-glib/as-self-test.c b/libappstream-glib/as-self-test.c index 93935e4..218f4b6 100644 --- a/libappstream-glib/as-self-test.c +++ b/libappstream-glib/as-self-test.c @@ -2970,13 +2970,14 @@ as_test_app_search_func (void) const gchar *all[] = { "gnome", "install", "software", NULL }; const gchar *none[] = { "gnome", "xxx", "software", NULL }; const gchar *mime[] = { "application/vnd.oasis.opendocument.text", NULL }; + g_auto(GStrv) tokens = NULL; g_autoptr(AsApp) app = NULL; g_autoptr(GHashTable) search_blacklist = NULL; g_autoptr(AsStemmer) stemmer = as_stemmer_new (); app = as_app_new (); as_app_set_stemmer (app, stemmer); - as_app_set_id (app, "gnome-software"); + as_app_set_id (app, "org.gnome.Software.desktop"); as_app_add_pkgname (app, "gnome-software"); as_app_set_name (app, NULL, "GNOME Software X-Plane"); as_app_set_comment (app, NULL, "Install and remove software"); @@ -3001,6 +3002,12 @@ as_test_app_search_func (void) g_assert_cmpint (as_app_search_matches_all (app, (gchar**) none), ==, 0); g_assert_cmpint (as_app_search_matches_all (app, (gchar**) mime), ==, 4); + /* test searching for all tokenized tokens */ + tokens = as_utils_search_tokenize ("org.gnome.Software"); + g_assert_cmpstr (tokens[0], ==, "org.gnome.software"); + g_assert_cmpstr (tokens[1], ==, NULL); + g_assert_cmpint (as_app_search_matches_all (app, tokens), ==, 256); + /* test tokenization of hyphenated name */ g_assert_cmpint (as_app_search_matches (app, "x-plane"), ==, 64); g_assert_cmpint (as_app_search_matches (app, "plane"), ==, 64); diff --git a/libappstream-glib/as-stemmer.c b/libappstream-glib/as-stemmer.c index ef7ccd6..3a9478d 100644 --- a/libappstream-glib/as-stemmer.c +++ b/libappstream-glib/as-stemmer.c @@ -59,6 +59,7 @@ as_stemmer_process (AsStemmer *stemmer, const gchar *value) AsRefString *new; const gchar *tmp; gsize value_len; + g_autofree gchar *value_casefold = NULL; g_autoptr(GMutexLocker) locker = g_mutex_locker_new (&stemmer->ctx_mutex); /* 
look for word in the cache */ @@ -67,25 +68,28 @@ as_stemmer_process (AsStemmer *stemmer, const gchar *value) return as_ref_string_ref (new); /* not enabled */ + value_casefold = g_utf8_casefold (value, -1); if (stemmer->ctx == NULL || !stemmer->enabled) - return as_ref_string_new (value); + return as_ref_string_new (value_casefold); /* stem, then add to the cache */ - value_len = strlen (value); + value_len = strlen (value_casefold); tmp = (const gchar *) sb_stemmer_stem (stemmer->ctx, - (guchar *) value, + (guchar *) value_casefold, (gint) value_len); if (value_len == (gsize) sb_stemmer_length (stemmer->ctx)) { - new = as_ref_string_new_with_length (value, value_len); + new = as_ref_string_new_with_length (value_casefold, value_len); } else { new = as_ref_string_new_copy (tmp); } g_hash_table_insert (stemmer->hash, - as_ref_string_new (value), + as_ref_string_new (value_casefold), as_ref_string_ref (new)); return new; #else - return as_ref_string_new (value); + g_autofree gchar *value_casefold = NULL; + value_casefold = g_utf8_casefold (value, -1); + return as_ref_string_new (value_casefold); #endif } |