summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorRichard Hughes <richard@hughsie.com>2015-01-15 12:46:11 +0000
committerRichard Hughes <richard@hughsie.com>2015-01-15 13:09:07 +0000
commit5542d872a3f95e330395bf674595e7c0593105f6 (patch)
tree98138493fd8312f1384274b5bd493cf53a113cd7
parent0f336ac127ec3c4d9b8ff2f9283ee5a75f301934 (diff)
downloadappstream-glib-5542d872a3f95e330395bf674595e7c0593105f6.tar.gz
Check if the search entries are valid before searching
We want to do this once when preparing the search action, rather than in as_app_search_matches_all() which would be much slower. This prevents returning hundreds of entries for: $ appstream-util search action and adventure ... and also provides some new API that gnome-software can use.
-rw-r--r--libappstream-glib/as-app.c36
-rw-r--r--libappstream-glib/as-self-test.c16
-rw-r--r--libappstream-glib/as-utils.c74
-rw-r--r--libappstream-glib/as-utils.h2
4 files changed, 96 insertions, 32 deletions
diff --git a/libappstream-glib/as-app.c b/libappstream-glib/as-app.c
index 378237c..c638564 100644
--- a/libappstream-glib/as-app.c
+++ b/libappstream-glib/as-app.c
@@ -3696,37 +3696,6 @@ as_app_value_tokenize (const gchar *value)
}
/**
- * as_app_token_is_valid:
- **/
-static gboolean
-as_app_token_is_valid (const gchar *token)
-{
- guint i;
- const gchar *blacklist[] = {
- "and", "the", "desktop", "application", "for", "you", "your",
- "with", "can", "are", "from", "that", "use", "allows", "also",
- "this", "other", "all", "using", "has", "some", "like", "them",
- "well", "not", "using", "not", "but", "set", "its", "into",
- "such", "was", "they", "where", "want", "only", "about",
- NULL };
- if (strlen (token) < 3)
- return FALSE;
- if (g_strstr_len (token, -1, "<") != NULL)
- return FALSE;
- if (g_strstr_len (token, -1, ">") != NULL)
- return FALSE;
- if (g_strstr_len (token, -1, "(") != NULL)
- return FALSE;
- if (g_strstr_len (token, -1, ")") != NULL)
- return FALSE;
- for (i = 0; blacklist[i] != NULL; i++) {
- if (g_strcmp0 (token, blacklist[i]) == 0)
- return FALSE;
- }
- return TRUE;
-}
-
-/**
* as_app_remove_invalid_tokens:
**/
static void
@@ -3742,7 +3711,7 @@ as_app_remove_invalid_tokens (gchar **tokens)
/* remove any tokens that are invalid and maintain the order */
len = g_strv_length (tokens);
for (i = 0; i < len; i++) {
- if (!as_app_token_is_valid (tokens[i])) {
+ if (!as_utils_search_token_valid (tokens[i])) {
g_free (tokens[i]);
tokens[i] = NULL;
continue;
@@ -3955,6 +3924,9 @@ as_app_get_search_tokens (AsApp *app)
* Returns: a match score, where 0 is no match and larger numbers are better
* matches.
*
+ * It's probably a good idea to use as_utils_search_tokenize() to populate
+ * search, as very short or common keywords will return a lot of matches.
+ *
* Since: 0.1.3
*/
guint
diff --git a/libappstream-glib/as-self-test.c b/libappstream-glib/as-self-test.c
index 20a38ca..b62dddd 100644
--- a/libappstream-glib/as-self-test.c
+++ b/libappstream-glib/as-self-test.c
@@ -2708,6 +2708,7 @@ as_test_utils_func (void)
{
gboolean ret;
gchar *tmp;
+ gchar **tokens;
GError *error = NULL;
/* as_strndup */
@@ -2781,6 +2782,21 @@ as_test_utils_func (void)
//ret = as_utils_check_url_exists ("http://www.bbc.co.uk/", &error);
//g_assert (ret);
//g_assert_no_error (error);
+
+ /* valid tokens */
+ g_assert (as_utils_search_token_valid ("battery"));
+ g_assert (!as_utils_search_token_valid ("and"));
+ g_assert (!as_utils_search_token_valid ("is"));
+ g_assert (!as_utils_search_token_valid ("<b>"));
+
+ /* check tokenisation */
+ tokens = as_utils_search_tokenize ("a c b");
+ g_assert (tokens == NULL);
+ tokens = as_utils_search_tokenize ("batteries are (really) stupid");
+ g_assert_cmpstr (tokens[0], ==, "batteries");
+ g_assert_cmpstr (tokens[1], ==, "stupid");
+ g_assert_cmpstr (tokens[2], ==, NULL);
+ g_strfreev (tokens);
}
static void
diff --git a/libappstream-glib/as-utils.c b/libappstream-glib/as-utils.c
index bd41151..195bf73 100644
--- a/libappstream-glib/as-utils.c
+++ b/libappstream-glib/as-utils.c
@@ -1304,3 +1304,77 @@ as_utils_install_filename (AsUtilsLocation location,
}
return ret;
}
+
+/**
+ * as_utils_search_token_valid:
+ * @token: the search token
+ *
+ * Checks if the search token is valid. Valid tokens are at least 3 bytes in
+ * length, are not common words like "and", and do not contain any of the
+ * markup characters '<', '>', '(' or ')'.
+ *
+ * The comparison against the list of common words is case-sensitive, so
+ * callers should pass an already lowercased or casefolded token.
+ *
+ * Returns: %TRUE if the search token was valid, %FALSE otherwise (including
+ * when @token is %NULL)
+ *
+ * Since: 0.3.4
+ **/
+gboolean
+as_utils_search_token_valid (const gchar *token)
+{
+	guint i;
+	/* static so the table is not rebuilt on every call */
+	static const gchar * const blacklist[] = {
+		"and", "the", "desktop", "application", "for", "you", "your",
+		"with", "can", "are", "from", "that", "use", "allows", "also",
+		"this", "other", "all", "using", "has", "some", "like", "them",
+		"well", "not", "but", "set", "its", "into",
+		"such", "was", "they", "where", "want", "only", "about",
+		NULL };
+
+	/* strlen() below would crash on a NULL token */
+	g_return_val_if_fail (token != NULL, FALSE);
+
+	if (strlen (token) < 3)
+		return FALSE;
+
+	/* a single scan for any markup or bracket character */
+	if (strpbrk (token, "<>()") != NULL)
+		return FALSE;
+
+	for (i = 0; blacklist[i] != NULL; i++) {
+		if (g_strcmp0 (token, blacklist[i]) == 0)
+			return FALSE;
+	}
+	return TRUE;
+}
+
+/**
+ * as_utils_search_tokenize:
+ * @search: the search string
+ *
+ * Splits up a string on spaces and returns the tokens that are suitable for
+ * searching. Each token is casefolded first and then checked with
+ * as_utils_search_token_valid(), so common words are taken out regardless of
+ * the case they were typed in.
+ *
+ * Returns: (transfer full): Valid tokens to search for, or %NULL if @search
+ * was %NULL or contained no valid tokens
+ *
+ * Since: 0.3.4
+ **/
+gchar **
+as_utils_search_tokenize (const gchar *search)
+{
+	gchar **values = NULL;
+	guint i;
+	guint idx = 0;
+	_cleanup_strv_free_ gchar **tmp = NULL;
+
+	g_return_val_if_fail (search != NULL, NULL);
+
+	/* only add keywords that are long enough and not too common */
+	tmp = g_strsplit (search, " ", -1);
+	/* worst case every token is kept, plus the NULL terminator */
+	values = g_new0 (gchar *, g_strv_length (tmp) + 1);
+	for (i = 0; tmp[i] != NULL; i++) {
+		/* validate the casefolded form so "AND" is rejected like "and" */
+		gchar *folded = g_utf8_casefold (tmp[i], -1);
+		if (!as_utils_search_token_valid (folded)) {
+			g_free (folded);
+			continue;
+		}
+		values[idx++] = folded;
+	}
+	if (idx == 0) {
+		g_free (values);
+		return NULL;
+	}
+	return values;
+}
diff --git a/libappstream-glib/as-utils.h b/libappstream-glib/as-utils.h
index e81d65a..5e6910f 100644
--- a/libappstream-glib/as-utils.h
+++ b/libappstream-glib/as-utils.h
@@ -105,6 +105,8 @@ gboolean as_utils_install_filename (AsUtilsLocation location,
const gchar *origin,
const gchar *destdir,
GError **error);
+gboolean as_utils_search_token_valid (const gchar *token);
+gchar **as_utils_search_tokenize (const gchar *search);
G_END_DECLS