diff options
author | Richard Hughes <richard@hughsie.com> | 2016-06-27 17:11:00 +0100 |
---|---|---|
committer | Richard Hughes <richard@hughsie.com> | 2016-06-28 08:12:01 +0100 |
commit | 63d2d89eca6c185c251c02036328fec5a6692623 (patch) | |
tree | 73e5e7c7892b04ab6bde3c2e2d9f4bfddbdbed08 | |
parent | a520edea8874bb0dd914d9e13e9f34aaff8d7e2f (diff) | |
download | appstream-glib-63d2d89eca6c185c251c02036328fec5a6692623.tar.gz |
Optionally use libstemmer for keyword stemming
This allows us to search for 'networking', 'networks', or 'networked' and to
return results for all network* keywords.
-rw-r--r-- | configure.ac | 14 |
-rw-r--r-- | contrib/libappstream-glib.spec.in | 1 |
-rw-r--r-- | libappstream-glib/Makefile.am | 4 |
-rw-r--r-- | libappstream-glib/as-app.c | 22 |
-rw-r--r-- | libappstream-glib/as-self-test.c | 2 |
-rw-r--r-- | libappstream-glib/as-stemmer.c | 120 |
-rw-r--r-- | libappstream-glib/as-stemmer.h | 39 |
7 files changed, 196 insertions, 6 deletions
diff --git a/configure.ac b/configure.ac index aacf8c1..0024dfc 100644 --- a/configure.ac +++ b/configure.ac @@ -245,6 +245,19 @@ if test x$enable_dep11 = xyes; then AC_DEFINE(AS_BUILD_DEP11,1,[Build DEP-11 code]) fi +dnl --------------------------------------------------------------------------- +dnl - Use libstemmer for search stemming +dnl --------------------------------------------------------------------------- +AC_ARG_ENABLE(stemmer, AS_HELP_STRING([--enable-stemmer],[enable search stemmer]), + enable_stemmer=$enableval,enable_stemmer=yes) +AM_CONDITIONAL(HAVE_LIBSTEMMER, test x$enable_stemmer = xyes) +if test x$enable_stemmer = xyes; then + AC_CHECK_HEADER(libstemmer.h, [], [AC_MSG_ERROR([No libstemmer.h])]) + STEMMER_LIBS="-lstemmer" + AC_SUBST(STEMMER_LIBS) + AC_DEFINE(HAVE_LIBSTEMMER,1,[define if libstemmer is installed]) +fi + AC_CONFIG_FILES([ Makefile client/Makefile @@ -272,6 +285,7 @@ AC_MSG_RESULT([ includedir: ${includedir} lib dir: ${libdir} DEP-11 support: ${enable_dep11} + Stemming support: ${enable_stemmer} Builder support: ${enable_builder} Firmware support: ${enable_firmware} Fonts support: ${enable_fonts} diff --git a/contrib/libappstream-glib.spec.in b/contrib/libappstream-glib.spec.in index eb2ff9f..31ec41f 100644 --- a/contrib/libappstream-glib.spec.in +++ b/contrib/libappstream-glib.spec.in @@ -22,6 +22,7 @@ BuildRequires: gettext BuildRequires: intltool BuildRequires: libgcab1-devel BuildRequires: libuuid-devel +BuildRequires: libstemmer-devel BuildRequires: json-glib-devel >= 1.1.1 # for the builder component diff --git a/libappstream-glib/Makefile.am b/libappstream-glib/Makefile.am index 546d174..0aaeee7 100644 --- a/libappstream-glib/Makefile.am +++ b/libappstream-glib/Makefile.am @@ -127,6 +127,8 @@ libappstream_glib_la_SOURCES = \ as-resources.h \ as-screenshot.c \ as-screenshot-private.h \ + as-stemmer.c \ + as-stemmer.h \ as-store.c \ as-tag.c \ as-translation.c \ @@ -156,6 +158,7 @@ libappstream_glib_la_LIBADD = \ 
$(GDKPIXBUF_LIBS) \ $(LIBARCHIVE_LIBS) \ $(SOUP_LIBS) \ + $(STEMMER_LIBS) \ $(UUID_LIBS) \ $(YAML_LIBS) @@ -177,6 +180,7 @@ as_self_test_LDADD = \ $(GDKPIXBUF_LIBS) \ $(LIBARCHIVE_LIBS) \ $(SOUP_LIBS) \ + $(STEMMER_LIBS) \ $(UUID_LIBS) \ $(YAML_LIBS) \ $(lib_LTLIBRARIES) diff --git a/libappstream-glib/as-app.c b/libappstream-glib/as-app.c index 6ff2414..85297a9 100644 --- a/libappstream-glib/as-app.c +++ b/libappstream-glib/as-app.c @@ -46,6 +46,7 @@ #include "as-provide-private.h" #include "as-release-private.h" #include "as-screenshot-private.h" +#include "as-stemmer.h" #include "as-tag.h" #include "as-translation-private.h" #include "as-utils-private.h" @@ -56,6 +57,7 @@ typedef struct AsAppProblems problems; AsIconKind icon_kind; AsAppKind kind; + AsStemmer *stemmer; GHashTable *comments; /* of locale:string */ GHashTable *developer_names; /* of locale:string */ GHashTable *descriptions; /* of locale:string */ @@ -374,6 +376,7 @@ as_app_finalize (GObject *object) g_hash_table_unref (priv->names); g_hash_table_unref (priv->urls); g_hash_table_unref (priv->token_cache); + g_object_unref (priv->stemmer); g_ptr_array_unref (priv->addons); g_ptr_array_unref (priv->categories); g_ptr_array_unref (priv->compulsory_for_desktops); @@ -402,6 +405,7 @@ static void as_app_init (AsApp *app) { AsAppPrivate *priv = GET_PRIVATE (app); + priv->stemmer = as_stemmer_new (); priv->categories = g_ptr_array_new_with_free_func (g_free); priv->compulsory_for_desktops = g_ptr_array_new_with_free_func (g_free); priv->content_ratings = g_ptr_array_new_with_free_func ((GDestroyNotify) g_object_unref); @@ -4437,17 +4441,21 @@ as_app_value_tokenize (const gchar *value) * as_app_add_token_internal: **/ static void -as_app_add_token_internal (AsApp *app, const gchar *value, AsAppTokenMatch match_flag) +as_app_add_token_internal (AsApp *app, + const gchar *value, + AsAppTokenMatch match_flag) { AsAppPrivate *priv = GET_PRIVATE (app); AsAppTokenType *match_pval; + g_autofree gchar *value_stem = 
NULL; /* invalid */ if (!as_utils_search_token_valid (value)) return; /* does the token already exist */ - match_pval = g_hash_table_lookup (priv->token_cache, value); + value_stem = as_stemmer_process (priv->stemmer, value); + match_pval = g_hash_table_lookup (priv->token_cache, value_stem); if (match_pval != NULL) { *match_pval |= match_flag; return; @@ -4456,7 +4464,9 @@ as_app_add_token_internal (AsApp *app, const gchar *value, AsAppTokenMatch match /* create and add */ match_pval = g_new0 (AsAppTokenType, 1); *match_pval = match_flag; - g_hash_table_insert (priv->token_cache, g_strdup (value), match_pval); + g_hash_table_insert (priv->token_cache, + g_steal_pointer (&value_stem), + match_pval); } /** @@ -4610,6 +4620,7 @@ as_app_search_matches (AsApp *app, const gchar *search) GList *l; AsAppTokenMatch result = 0; g_autoptr(GList) keys = NULL; + g_autofree gchar *search_stem = NULL; /* nothing to do */ if (search == NULL) @@ -4622,7 +4633,8 @@ as_app_search_matches (AsApp *app, const gchar *search) } /* find the exact match (which is more awesome than a partial match) */ - match_pval = g_hash_table_lookup (priv->token_cache, search); + search_stem = as_stemmer_process (priv->stemmer, search); + match_pval = g_hash_table_lookup (priv->token_cache, search_stem); if (match_pval != NULL) return *match_pval << 2; @@ -4630,7 +4642,7 @@ as_app_search_matches (AsApp *app, const gchar *search) keys = g_hash_table_get_keys (priv->token_cache); for (l = keys; l != NULL; l = l->next) { const gchar *key = l->data; - if (g_str_has_prefix (key, search)) { + if (g_str_has_prefix (key, search_stem)) { match_pval = g_hash_table_lookup (priv->token_cache, key); result |= *match_pval; } diff --git a/libappstream-glib/as-self-test.c b/libappstream-glib/as-self-test.c index 86e83af..5300e41 100644 --- a/libappstream-glib/as-self-test.c +++ b/libappstream-glib/as-self-test.c @@ -2634,7 +2634,7 @@ as_test_app_search_func (void) g_assert_cmpint (as_app_search_matches (app, 
"software"), ==, 96); g_assert_cmpint (as_app_search_matches (app, "soft"), ==, 24); - g_assert_cmpint (as_app_search_matches (app, "install"), ==, 32); + g_assert_cmpint (as_app_search_matches (app, "installing"), ==, 32); g_assert_cmpint (as_app_search_matches (app, "awesome"), ==, 128); g_assert_cmpint (as_app_search_matches (app, "c++"), ==, 128); g_assert_cmpint (as_app_search_matches (app, "d-feet"), ==, 128); diff --git a/libappstream-glib/as-stemmer.c b/libappstream-glib/as-stemmer.c new file mode 100644 index 0000000..4af4410 --- /dev/null +++ b/libappstream-glib/as-stemmer.c @@ -0,0 +1,120 @@ +/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- + * + * Copyright (C) 2016 Richard Hughes <richard@hughsie.com> + * + * Licensed under the GNU General Public License Version 2 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ */ + +#include "config.h" + +#include <glib/gi18n.h> + +#ifdef HAVE_LIBSTEMMER + #include "libstemmer.h" +#endif + +#include "as-stemmer.h" + +struct _AsStemmer +{ + GObject parent_instance; + struct sb_stemmer *ctx; + GMutex ctx_mutex; +}; + +G_DEFINE_TYPE (AsStemmer, as_stemmer, G_TYPE_OBJECT) + +static gpointer as_stemmer_object = NULL; + +/** + * as_stemmer_process: + * @stemmer: A #AsStemmer + * @value: The input string + * + * Stems a string using the Porter algorithm. + * + * Since: 0.2.2 + * + * Returns: A new string + **/ +gchar * +as_stemmer_process (AsStemmer *stemmer, const gchar *value) +{ +#ifdef HAVE_LIBSTEMMER + gchar *new; + g_autoptr(GMutexLocker) locker = g_mutex_locker_new (&stemmer->ctx_mutex); + if (stemmer->ctx == NULL) + return g_strdup (value); + new = g_strdup ((gchar *) sb_stemmer_stem (stemmer->ctx, + (guchar *) value, + strlen (value))); +// if (g_strcmp0 (value, new) != 0) +// g_debug ("stemmed %s->%s", value, new); + return new; +#else + return g_strdup (value); +#endif +} + +static void +as_stemmer_finalize (GObject *object) +{ + AsStemmer *stemmer = AS_STEMMER (object); +#ifdef HAVE_LIBSTEMMER + sb_stemmer_delete (stemmer->ctx); + g_mutex_clear (&stemmer->ctx_mutex); +#endif + G_OBJECT_CLASS (as_stemmer_parent_class)->finalize (object); +} + +static void +as_stemmer_class_init (AsStemmerClass *klass) +{ + GObjectClass *object_class = G_OBJECT_CLASS (klass); + object_class->finalize = as_stemmer_finalize; +} + +static void +as_stemmer_init (AsStemmer *stemmer) +{ + /* FIXME: use as_utils_locale_to_language()? */ +#ifdef HAVE_LIBSTEMMER + stemmer->ctx = sb_stemmer_new ("en", NULL); + g_mutex_init (&stemmer->ctx_mutex); +#endif +} + +/** + * as_stemmer_new: + * + * Creates a new #AsStemmer. 
+ * + * Returns: (transfer full): a #AsStemmer + * + * Since: 0.2.2 + **/ +AsStemmer * +as_stemmer_new (void) +{ + if (as_stemmer_object != NULL) { + g_object_ref (as_stemmer_object); + } else { + as_stemmer_object = g_object_new (AS_TYPE_STEMMER, NULL); + g_object_add_weak_pointer (as_stemmer_object, &as_stemmer_object); + } + return AS_STEMMER (as_stemmer_object); +} diff --git a/libappstream-glib/as-stemmer.h b/libappstream-glib/as-stemmer.h new file mode 100644 index 0000000..f277d84 --- /dev/null +++ b/libappstream-glib/as-stemmer.h @@ -0,0 +1,39 @@ +/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- + * + * Copyright (C) 2016 Richard Hughes <richard@hughsie.com> + * + * Licensed under the GNU General Public License Version 2 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#ifndef __AS_STEMMER_H +#define __AS_STEMMER_H + +#include <glib-object.h> + +G_BEGIN_DECLS + +#define AS_TYPE_STEMMER (as_stemmer_get_type ()) + +G_DECLARE_FINAL_TYPE (AsStemmer, as_stemmer, AS, STEMMER, GObject) + +AsStemmer *as_stemmer_new (void); +gchar *as_stemmer_process (AsStemmer *stemmer, + const gchar *value); + +G_END_DECLS + +#endif /* __AS_STEMMER_H */ |