diff options
author | Richard Hughes <richard@hughsie.com> | 2015-01-15 12:46:11 +0000 |
---|---|---|
committer | Richard Hughes <richard@hughsie.com> | 2015-01-15 13:09:07 +0000 |
commit | 5542d872a3f95e330395bf674595e7c0593105f6 (patch) | |
tree | 98138493fd8312f1384274b5bd493cf53a113cd7 | |
parent | 0f336ac127ec3c4d9b8ff2f9283ee5a75f301934 (diff) | |
download | appstream-glib-5542d872a3f95e330395bf674595e7c0593105f6.tar.gz |
Check if the search entries are valid before searching
We want to do this once when preparing the search action, rather than in
as_app_search_matches_all(), which would be much slower.
This prevents returning hundreds of entries for:
$ appstream-util search action and adventure
... and also provides some new API that gnome-software can use.
-rw-r--r-- | libappstream-glib/as-app.c | 36 | ||||
-rw-r--r-- | libappstream-glib/as-self-test.c | 16 | ||||
-rw-r--r-- | libappstream-glib/as-utils.c | 74 | ||||
-rw-r--r-- | libappstream-glib/as-utils.h | 2 |
4 files changed, 96 insertions, 32 deletions
diff --git a/libappstream-glib/as-app.c b/libappstream-glib/as-app.c index 378237c..c638564 100644 --- a/libappstream-glib/as-app.c +++ b/libappstream-glib/as-app.c @@ -3696,37 +3696,6 @@ as_app_value_tokenize (const gchar *value) } /** - * as_app_token_is_valid: - **/ -static gboolean -as_app_token_is_valid (const gchar *token) -{ - guint i; - const gchar *blacklist[] = { - "and", "the", "desktop", "application", "for", "you", "your", - "with", "can", "are", "from", "that", "use", "allows", "also", - "this", "other", "all", "using", "has", "some", "like", "them", - "well", "not", "using", "not", "but", "set", "its", "into", - "such", "was", "they", "where", "want", "only", "about", - NULL }; - if (strlen (token) < 3) - return FALSE; - if (g_strstr_len (token, -1, "<") != NULL) - return FALSE; - if (g_strstr_len (token, -1, ">") != NULL) - return FALSE; - if (g_strstr_len (token, -1, "(") != NULL) - return FALSE; - if (g_strstr_len (token, -1, ")") != NULL) - return FALSE; - for (i = 0; blacklist[i] != NULL; i++) { - if (g_strcmp0 (token, blacklist[i]) == 0) - return FALSE; - } - return TRUE; -} - -/** * as_app_remove_invalid_tokens: **/ static void @@ -3742,7 +3711,7 @@ as_app_remove_invalid_tokens (gchar **tokens) /* remove any tokens that are invalid and maintain the order */ len = g_strv_length (tokens); for (i = 0; i < len; i++) { - if (!as_app_token_is_valid (tokens[i])) { + if (!as_utils_search_token_valid (tokens[i])) { g_free (tokens[i]); tokens[i] = NULL; continue; @@ -3955,6 +3924,9 @@ as_app_get_search_tokens (AsApp *app) * Returns: a match scrore, where 0 is no match and larger numbers are better * matches. * + * It's probably a good idea to use as_utils_search_tokenize() to populate + * search as very short or common keywords will return a lot of matches. 
+ * * Since: 0.1.3 */ guint diff --git a/libappstream-glib/as-self-test.c b/libappstream-glib/as-self-test.c index 20a38ca..b62dddd 100644 --- a/libappstream-glib/as-self-test.c +++ b/libappstream-glib/as-self-test.c @@ -2708,6 +2708,7 @@ as_test_utils_func (void) { gboolean ret; gchar *tmp; + gchar **tokens; GError *error = NULL; /* as_strndup */ @@ -2781,6 +2782,21 @@ as_test_utils_func (void) //ret = as_utils_check_url_exists ("http://www.bbc.co.uk/", &error); //g_assert (ret); //g_assert_no_error (error); + + /* valid tokens */ + g_assert (as_utils_search_token_valid ("battery")); + g_assert (!as_utils_search_token_valid ("and")); + g_assert (!as_utils_search_token_valid ("is")); + g_assert (!as_utils_search_token_valid ("<b>")); + + /* check tokenisation */ + tokens = as_utils_search_tokenize ("a c b"); + g_assert (tokens == NULL); + tokens = as_utils_search_tokenize ("batteries are (really) stupid"); + g_assert_cmpstr (tokens[0], ==, "batteries"); + g_assert_cmpstr (tokens[1], ==, "stupid"); + g_assert_cmpstr (tokens[2], ==, NULL); + g_strfreev (tokens); } static void diff --git a/libappstream-glib/as-utils.c b/libappstream-glib/as-utils.c index bd41151..195bf73 100644 --- a/libappstream-glib/as-utils.c +++ b/libappstream-glib/as-utils.c @@ -1304,3 +1304,77 @@ as_utils_install_filename (AsUtilsLocation location, } return ret; } + +/** + * as_utils_search_token_valid: + * @token: the search token + * + * Checks the search token if it is valid. Valid tokens are at least 3 chars in + * length, not common words like "and", and do not contain markup. 
+ * + * Returns: %TRUE is the search token was valid + * + * Since: 0.3.4 + **/ +gboolean +as_utils_search_token_valid (const gchar *token) +{ + guint i; + const gchar *blacklist[] = { + "and", "the", "desktop", "application", "for", "you", "your", + "with", "can", "are", "from", "that", "use", "allows", "also", + "this", "other", "all", "using", "has", "some", "like", "them", + "well", "not", "using", "not", "but", "set", "its", "into", + "such", "was", "they", "where", "want", "only", "about", + NULL }; + if (strlen (token) < 3) + return FALSE; + if (g_strstr_len (token, -1, "<") != NULL) + return FALSE; + if (g_strstr_len (token, -1, ">") != NULL) + return FALSE; + if (g_strstr_len (token, -1, "(") != NULL) + return FALSE; + if (g_strstr_len (token, -1, ")") != NULL) + return FALSE; + for (i = 0; blacklist[i] != NULL; i++) { + if (g_strcmp0 (token, blacklist[i]) == 0) + return FALSE; + } + return TRUE; +} + +/** + * as_utils_search_tokenize: + * @search: the search string + * + * Splits up a string into tokens and returns tokens that are suitable for + * searching. This includes taking out common words and casefolding the + * returned search tokens. 
+ * + * Returns: (transfer full): Valid tokens to search for, or %NULL for error + * + * Since: 0.3.4 + **/ +gchar ** +as_utils_search_tokenize (const gchar *search) +{ + gchar **values = NULL; + guint i; + guint idx = 0; + _cleanup_strv_free_ gchar **tmp = NULL; + + /* only add keywords that are long enough */ + tmp = g_strsplit (search, " ", -1); + values = g_new0 (gchar *, g_strv_length (tmp) + 1); + for (i = 0; tmp[i] != NULL; i++) { + if (!as_utils_search_token_valid (tmp[i])) + continue; + values[idx++] = g_utf8_casefold (tmp[i], -1); + } + if (idx == 0) { + g_free (values); + return NULL; + } + return values; +} diff --git a/libappstream-glib/as-utils.h b/libappstream-glib/as-utils.h index e81d65a..5e6910f 100644 --- a/libappstream-glib/as-utils.h +++ b/libappstream-glib/as-utils.h @@ -105,6 +105,8 @@ gboolean as_utils_install_filename (AsUtilsLocation location, const gchar *origin, const gchar *destdir, GError **error); +gboolean as_utils_search_token_valid (const gchar *token); +gchar **as_utils_search_tokenize (const gchar *search); G_END_DECLS |