summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorRichard Hughes <richard@hughsie.com>2016-06-27 17:11:00 +0100
committerRichard Hughes <richard@hughsie.com>2016-06-28 08:12:01 +0100
commit63d2d89eca6c185c251c02036328fec5a6692623 (patch)
tree73e5e7c7892b04ab6bde3c2e2d9f4bfddbdbed08
parenta520edea8874bb0dd914d9e13e9f34aaff8d7e2f (diff)
downloadappstream-glib-63d2d89eca6c185c251c02036328fec5a6692623.tar.gz
Optionally use libstemmer for keyword stemming
This allows us to search for 'networking', 'networks', or 'networked' and to return results for all network* keywords.
-rw-r--r--configure.ac14
-rw-r--r--contrib/libappstream-glib.spec.in1
-rw-r--r--libappstream-glib/Makefile.am4
-rw-r--r--libappstream-glib/as-app.c22
-rw-r--r--libappstream-glib/as-self-test.c2
-rw-r--r--libappstream-glib/as-stemmer.c120
-rw-r--r--libappstream-glib/as-stemmer.h39
7 files changed, 196 insertions, 6 deletions
diff --git a/configure.ac b/configure.ac
index aacf8c1..0024dfc 100644
--- a/configure.ac
+++ b/configure.ac
@@ -245,6 +245,19 @@ if test x$enable_dep11 = xyes; then
AC_DEFINE(AS_BUILD_DEP11,1,[Build DEP-11 code])
fi
+dnl ---------------------------------------------------------------------------
+dnl - Use libstemmer for search stemming
+dnl ---------------------------------------------------------------------------
+AC_ARG_ENABLE(stemmer, AS_HELP_STRING([--enable-stemmer],[enable search stemmer]),
+ enable_stemmer=$enableval,enable_stemmer=yes)
+AM_CONDITIONAL(HAVE_LIBSTEMMER, test x$enable_stemmer = xyes)
+if test x$enable_stemmer = xyes; then
+ AC_CHECK_HEADER(libstemmer.h, [], [AC_MSG_ERROR([No libstemmer.h])])
+ STEMMER_LIBS="-lstemmer"
+ AC_SUBST(STEMMER_LIBS)
+ AC_DEFINE(HAVE_LIBSTEMMER,1,[define if libstemmer is installed])
+fi
+
AC_CONFIG_FILES([
Makefile
client/Makefile
@@ -272,6 +285,7 @@ AC_MSG_RESULT([
includedir: ${includedir}
lib dir: ${libdir}
DEP-11 support: ${enable_dep11}
+ Stemming support: ${enable_stemmer}
Builder support: ${enable_builder}
Firmware support: ${enable_firmware}
Fonts support: ${enable_fonts}
diff --git a/contrib/libappstream-glib.spec.in b/contrib/libappstream-glib.spec.in
index eb2ff9f..31ec41f 100644
--- a/contrib/libappstream-glib.spec.in
+++ b/contrib/libappstream-glib.spec.in
@@ -22,6 +22,7 @@ BuildRequires: gettext
BuildRequires: intltool
BuildRequires: libgcab1-devel
BuildRequires: libuuid-devel
+BuildRequires: libstemmer-devel
BuildRequires: json-glib-devel >= 1.1.1
# for the builder component
diff --git a/libappstream-glib/Makefile.am b/libappstream-glib/Makefile.am
index 546d174..0aaeee7 100644
--- a/libappstream-glib/Makefile.am
+++ b/libappstream-glib/Makefile.am
@@ -127,6 +127,8 @@ libappstream_glib_la_SOURCES = \
as-resources.h \
as-screenshot.c \
as-screenshot-private.h \
+ as-stemmer.c \
+ as-stemmer.h \
as-store.c \
as-tag.c \
as-translation.c \
@@ -156,6 +158,7 @@ libappstream_glib_la_LIBADD = \
$(GDKPIXBUF_LIBS) \
$(LIBARCHIVE_LIBS) \
$(SOUP_LIBS) \
+ $(STEMMER_LIBS) \
$(UUID_LIBS) \
$(YAML_LIBS)
@@ -177,6 +180,7 @@ as_self_test_LDADD = \
$(GDKPIXBUF_LIBS) \
$(LIBARCHIVE_LIBS) \
$(SOUP_LIBS) \
+ $(STEMMER_LIBS) \
$(UUID_LIBS) \
$(YAML_LIBS) \
$(lib_LTLIBRARIES)
diff --git a/libappstream-glib/as-app.c b/libappstream-glib/as-app.c
index 6ff2414..85297a9 100644
--- a/libappstream-glib/as-app.c
+++ b/libappstream-glib/as-app.c
@@ -46,6 +46,7 @@
#include "as-provide-private.h"
#include "as-release-private.h"
#include "as-screenshot-private.h"
+#include "as-stemmer.h"
#include "as-tag.h"
#include "as-translation-private.h"
#include "as-utils-private.h"
@@ -56,6 +57,7 @@ typedef struct
AsAppProblems problems;
AsIconKind icon_kind;
AsAppKind kind;
+ AsStemmer *stemmer;
GHashTable *comments; /* of locale:string */
GHashTable *developer_names; /* of locale:string */
GHashTable *descriptions; /* of locale:string */
@@ -374,6 +376,7 @@ as_app_finalize (GObject *object)
g_hash_table_unref (priv->names);
g_hash_table_unref (priv->urls);
g_hash_table_unref (priv->token_cache);
+ g_object_unref (priv->stemmer);
g_ptr_array_unref (priv->addons);
g_ptr_array_unref (priv->categories);
g_ptr_array_unref (priv->compulsory_for_desktops);
@@ -402,6 +405,7 @@ static void
as_app_init (AsApp *app)
{
AsAppPrivate *priv = GET_PRIVATE (app);
+ priv->stemmer = as_stemmer_new ();
priv->categories = g_ptr_array_new_with_free_func (g_free);
priv->compulsory_for_desktops = g_ptr_array_new_with_free_func (g_free);
priv->content_ratings = g_ptr_array_new_with_free_func ((GDestroyNotify) g_object_unref);
@@ -4437,17 +4441,21 @@ as_app_value_tokenize (const gchar *value)
* as_app_add_token_internal:
**/
static void
-as_app_add_token_internal (AsApp *app, const gchar *value, AsAppTokenMatch match_flag)
+as_app_add_token_internal (AsApp *app,
+ const gchar *value,
+ AsAppTokenMatch match_flag)
{
AsAppPrivate *priv = GET_PRIVATE (app);
AsAppTokenType *match_pval;
+ g_autofree gchar *value_stem = NULL;
/* invalid */
if (!as_utils_search_token_valid (value))
return;
/* does the token already exist */
- match_pval = g_hash_table_lookup (priv->token_cache, value);
+ value_stem = as_stemmer_process (priv->stemmer, value);
+ match_pval = g_hash_table_lookup (priv->token_cache, value_stem);
if (match_pval != NULL) {
*match_pval |= match_flag;
return;
@@ -4456,7 +4464,9 @@ as_app_add_token_internal (AsApp *app, const gchar *value, AsAppTokenMatch match
/* create and add */
match_pval = g_new0 (AsAppTokenType, 1);
*match_pval = match_flag;
- g_hash_table_insert (priv->token_cache, g_strdup (value), match_pval);
+ g_hash_table_insert (priv->token_cache,
+ g_steal_pointer (&value_stem),
+ match_pval);
}
/**
@@ -4610,6 +4620,7 @@ as_app_search_matches (AsApp *app, const gchar *search)
GList *l;
AsAppTokenMatch result = 0;
g_autoptr(GList) keys = NULL;
+ g_autofree gchar *search_stem = NULL;
/* nothing to do */
if (search == NULL)
@@ -4622,7 +4633,8 @@ as_app_search_matches (AsApp *app, const gchar *search)
}
/* find the exact match (which is more awesome than a partial match) */
- match_pval = g_hash_table_lookup (priv->token_cache, search);
+ search_stem = as_stemmer_process (priv->stemmer, search);
+ match_pval = g_hash_table_lookup (priv->token_cache, search_stem);
if (match_pval != NULL)
return *match_pval << 2;
@@ -4630,7 +4642,7 @@ as_app_search_matches (AsApp *app, const gchar *search)
keys = g_hash_table_get_keys (priv->token_cache);
for (l = keys; l != NULL; l = l->next) {
const gchar *key = l->data;
- if (g_str_has_prefix (key, search)) {
+ if (g_str_has_prefix (key, search_stem)) {
match_pval = g_hash_table_lookup (priv->token_cache, key);
result |= *match_pval;
}
diff --git a/libappstream-glib/as-self-test.c b/libappstream-glib/as-self-test.c
index 86e83af..5300e41 100644
--- a/libappstream-glib/as-self-test.c
+++ b/libappstream-glib/as-self-test.c
@@ -2634,7 +2634,7 @@ as_test_app_search_func (void)
g_assert_cmpint (as_app_search_matches (app, "software"), ==, 96);
g_assert_cmpint (as_app_search_matches (app, "soft"), ==, 24);
- g_assert_cmpint (as_app_search_matches (app, "install"), ==, 32);
+ g_assert_cmpint (as_app_search_matches (app, "installing"), ==, 32);
g_assert_cmpint (as_app_search_matches (app, "awesome"), ==, 128);
g_assert_cmpint (as_app_search_matches (app, "c++"), ==, 128);
g_assert_cmpint (as_app_search_matches (app, "d-feet"), ==, 128);
diff --git a/libappstream-glib/as-stemmer.c b/libappstream-glib/as-stemmer.c
new file mode 100644
index 0000000..4af4410
--- /dev/null
+++ b/libappstream-glib/as-stemmer.c
@@ -0,0 +1,120 @@
+/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*-
+ *
+ * Copyright (C) 2016 Richard Hughes <richard@hughsie.com>
+ *
+ * Licensed under the GNU General Public License Version 2
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include "config.h"
+
+#include <glib/gi18n.h>
+
+#ifdef HAVE_LIBSTEMMER
+ #include "libstemmer.h"
+#endif
+
+#include "as-stemmer.h"
+
+struct _AsStemmer
+{
+ GObject parent_instance;
+ struct sb_stemmer *ctx; /* libstemmer context; only created when HAVE_LIBSTEMMER is defined, checked for NULL before use */
+ GMutex ctx_mutex; /* locked for the duration of as_stemmer_process() */
+};
+
+G_DEFINE_TYPE (AsStemmer, as_stemmer, G_TYPE_OBJECT)
+
+static gpointer as_stemmer_object = NULL; /* shared instance handed out by as_stemmer_new() */
+
+/**
+ * as_stemmer_process:
+ * @stemmer: A #AsStemmer
+ * @value: The input string
+ *
+ * Stems a string using the libstemmer context owned by @stemmer.
+ *
+ * A plain copy of @value is returned when the library was built without
+ * libstemmer, when @stemmer has no stemming context, or when libstemmer
+ * fails to produce a result.
+ *
+ * Since: 0.2.2
+ *
+ * Returns: (transfer full): A new string to be freed with g_free(),
+ * or %NULL if @value was %NULL
+ **/
+gchar *
+as_stemmer_process (AsStemmer *stemmer, const gchar *value)
+{
+#ifdef HAVE_LIBSTEMMER
+	const guchar *stem;
+	g_autoptr(GMutexLocker) locker = NULL;
+
+	/* nothing to stem; also keeps NULL away from strlen() */
+	if (value == NULL)
+		return NULL;
+
+	/* serialize use of the libstemmer context */
+	locker = g_mutex_locker_new (&stemmer->ctx_mutex);
+	if (stemmer->ctx == NULL)
+		return g_strdup (value);
+	stem = sb_stemmer_stem (stemmer->ctx,
+				(const guchar *) value,
+				(int) strlen (value));
+
+	/* libstemmer reports allocation failure with NULL; degrade to the
+	 * unstemmed word rather than handing NULL to the caller */
+	if (stem == NULL)
+		return g_strdup (value);
+
+	/* the result buffer belongs to the context, so hand back a copy */
+	return g_strdup ((const gchar *) stem);
+#else
+	return g_strdup (value);
+#endif
+}
+
+/* GObject finalize handler: when built with libstemmer, releases the mutex
+ * and the stemming context, then chains up to the parent class. */
+static void
+as_stemmer_finalize (GObject *object)
+{
+	AsStemmer *self = AS_STEMMER (object);
+#ifdef HAVE_LIBSTEMMER
+	g_mutex_clear (&self->ctx_mutex);
+	sb_stemmer_delete (self->ctx);
+#endif
+	G_OBJECT_CLASS (as_stemmer_parent_class)->finalize (object);
+}
+
+/* Class initializer: installs as_stemmer_finalize() as the finalize handler. */
+static void
+as_stemmer_class_init (AsStemmerClass *klass)
+{
+	G_OBJECT_CLASS (klass)->finalize = as_stemmer_finalize;
+}
+
+/* Instance initializer: when built with libstemmer, sets up the mutex and
+ * creates the stemming context; otherwise does nothing. */
+static void
+as_stemmer_init (AsStemmer *stemmer)
+{
+#ifdef HAVE_LIBSTEMMER
+	g_mutex_init (&stemmer->ctx_mutex);
+
+	/* FIXME: use as_utils_locale_to_language()? */
+	stemmer->ctx = sb_stemmer_new ("en", NULL);
+#endif
+}
+
+/**
+ * as_stemmer_new:
+ *
+ * Gets the shared #AsStemmer, creating it if it does not currently exist.
+ * An existing instance is returned with an additional reference; a newly
+ * created one is tracked through a weak pointer only, so the caller owns
+ * the returned reference in both cases.
+ *
+ * Returns: (transfer full): a #AsStemmer
+ *
+ * Since: 0.2.2
+ **/
+AsStemmer *
+as_stemmer_new (void)
+{
+	/* first use: create the instance and let the weak pointer track it */
+	if (as_stemmer_object == NULL) {
+		as_stemmer_object = g_object_new (AS_TYPE_STEMMER, NULL);
+		g_object_add_weak_pointer (as_stemmer_object, &as_stemmer_object);
+		return AS_STEMMER (as_stemmer_object);
+	}
+
+	/* already exists: hand out another reference */
+	g_object_ref (as_stemmer_object);
+	return AS_STEMMER (as_stemmer_object);
+}
diff --git a/libappstream-glib/as-stemmer.h b/libappstream-glib/as-stemmer.h
new file mode 100644
index 0000000..f277d84
--- /dev/null
+++ b/libappstream-glib/as-stemmer.h
@@ -0,0 +1,39 @@
+/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*-
+ *
+ * Copyright (C) 2016 Richard Hughes <richard@hughsie.com>
+ *
+ * Licensed under the GNU General Public License Version 2
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#ifndef __AS_STEMMER_H
+#define __AS_STEMMER_H
+
+#include <glib-object.h>
+
+G_BEGIN_DECLS
+
+#define AS_TYPE_STEMMER (as_stemmer_get_type ())
+
+G_DECLARE_FINAL_TYPE (AsStemmer, as_stemmer, AS, STEMMER, GObject)
+
+AsStemmer *as_stemmer_new (void); /* returns a reference to the shared stemmer instance */
+gchar *as_stemmer_process (AsStemmer *stemmer,
+ const gchar *value); /* returns a newly-allocated copy of the (possibly stemmed) word */
+
+G_END_DECLS
+
+#endif /* __AS_STEMMER_H */