diff options
author | Martyn Russell <martyn@lanedo.com> | 2014-12-03 10:31:18 +0000 |
---|---|---|
committer | Martyn Russell <martyn@lanedo.com> | 2014-12-03 10:31:18 +0000 |
commit | 8d14867631b59ecdfadcd77ac407f19fc15ba4d8 (patch) | |
tree | 7502236577c41d633c8cd13e27dd70fb6f9e9580 /tests/libtracker-common | |
parent | 8833933e45e77a67d06f21f47e1c70a1525350eb (diff) | |
download | tracker-8d14867631b59ecdfadcd77ac407f19fc15ba4d8.tar.gz |
libtracker-common: Move and fix tracker-parser unit tests from libtracker-fts
Diffstat (limited to 'tests/libtracker-common')
-rw-r--r-- | tests/libtracker-common/Makefile.am | 9 | ||||
-rw-r--r-- | tests/libtracker-common/tracker-parser-test.c | 450 | ||||
-rw-r--r-- | tests/libtracker-common/tracker-parser.c | 262 |
3 files changed, 720 insertions, 1 deletions
diff --git a/tests/libtracker-common/Makefile.am b/tests/libtracker-common/Makefile.am index 68d6cbef2..d82ca6c99 100644 --- a/tests/libtracker-common/Makefile.am +++ b/tests/libtracker-common/Makefile.am @@ -2,13 +2,16 @@ include $(top_srcdir)/Makefile.decl noinst_PROGRAMS += $(test_programs) +check_PROGRAMS += tracker-parser + test_programs = \ tracker-type-utils \ tracker-dbus \ tracker-file-utils \ tracker-utils \ tracker-sched-test \ - tracker-date-time-test + tracker-date-time-test \ + tracker-parser-test AM_CPPFLAGS = \ -DTOP_SRCDIR=\"$(abs_top_srcdir)\" \ @@ -37,4 +40,8 @@ tracker_sched_test_SOURCES = tracker-sched-test.c tracker_date_time_test_SOURCES = tracker-date-time-test.c +tracker_parser_test_SOURCES = tracker-parser-test.c + +tracker_parser_SOURCES = tracker-parser.c + EXTRA_DIST += non-utf8.txt diff --git a/tests/libtracker-common/tracker-parser-test.c b/tests/libtracker-common/tracker-parser-test.c new file mode 100644 index 000000000..954c212bd --- /dev/null +++ b/tests/libtracker-common/tracker-parser-test.c @@ -0,0 +1,450 @@ +/* + * Copyright (C) 2010, Nokia <ivan.frade@nokia.com> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this library; if not, write to the + * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, + * Boston, MA 02110-1301, USA. 
+ */ + +#include "config.h" + +#include <string.h> + +#include <glib.h> +#include <gio/gio.h> + +#include <libtracker-common/tracker-parser.h> + +/* -------------- COMMON FOR ALL TESTS ----------------- */ + +/* Fixture object type */ +typedef struct { + /* The parser object */ + TrackerParser *parser; + + /* Default parser configuration to use */ + gint max_word_length; + gboolean enable_stemmer; + gboolean enable_unaccent; + gboolean ignore_stop_words; + gboolean ignore_reserved_words; + gboolean ignore_numbers; +} TrackerParserTestFixture; + +/* Common setup for all tests */ +static void +test_common_setup (TrackerParserTestFixture *fixture, + gconstpointer data) +{ + TrackerLanguage *language; + + /* Setup language for parser. We make sure that English is always used + * in the unit tests, because we want the English stemming method to + * be used. */ + language = tracker_language_new ("en"); + if (!language) { + g_critical ("Language setup failed!"); + return; + } + + /* Default conf parameters */ + fixture->max_word_length = 50; + fixture->enable_stemmer = TRUE; + fixture->enable_unaccent = TRUE; + fixture->ignore_stop_words = TRUE; + fixture->ignore_reserved_words = TRUE; + fixture->ignore_numbers = TRUE; + + /* Create the parser */ + fixture->parser = tracker_parser_new (language); + if (!fixture->parser) { + g_critical ("Parser creation failed!"); + return; + } + + g_object_unref (language); +} + +/* Common teardown for all tests */ +static void +test_common_teardown (TrackerParserTestFixture *fixture, + gconstpointer data) +{ + if (fixture->parser) { + tracker_parser_free (fixture->parser); + } +} + +/* -------------- EXPECTED NUMBER OF WORDS TESTS ----------------- */ + +/* Test struct for the expected-nwords tests */ +typedef struct TestDataExpectedNWords TestDataExpectedNWords; +struct TestDataExpectedNWords { + const gchar *str; + gboolean ignore_numbers; + guint expected_nwords; + gint alternate_expected_nwords; +}; + +/* Common expected_nwords test
method */ +static void +expected_nwords_check (TrackerParserTestFixture *fixture, + gconstpointer data) +{ + const TestDataExpectedNWords *testdata = data; + gint position; + gint byte_offset_start; + gint byte_offset_end; + gboolean stop_word; + gint word_length; + guint nwords = 0; + + /* Reset the parser with the test string */ + tracker_parser_reset (fixture->parser, + testdata->str, + strlen (testdata->str), + fixture->max_word_length, + fixture->enable_stemmer, + fixture->enable_unaccent, + fixture->ignore_stop_words, + fixture->ignore_reserved_words, + testdata->ignore_numbers); + + /* Count number of output words */ + while (tracker_parser_next (fixture->parser, + &position, + &byte_offset_start, + &byte_offset_end, + &stop_word, + &word_length)) { + nwords++; + } + + /* Some tests will yield different results when using different versions of + * libicu (e.g. chinese ones). Handle this by allowing an alternate number + * of words expected in the test. Note that our whole purpose is to test + * that we can split different words, not much about the number of words + * itself (although we should check that as well) */ + + if (testdata->alternate_expected_nwords < 0) + /* Check if input is same as expected */ + g_assert_cmpuint (nwords, == , testdata->expected_nwords); + else + /* We'll assert if both expected number of words fail */ + g_assert ((nwords == testdata->expected_nwords) || + (nwords == testdata->alternate_expected_nwords)); +} + +/* -------------- EXPECTED WORD TESTS ----------------- */ + +/* Test struct for the expected-word tests */ +typedef struct TestDataExpectedWord TestDataExpectedWord; +struct TestDataExpectedWord { + const gchar *str; + const gchar *expected; + gboolean enable_stemmer; + gboolean enable_unaccent; +}; + +/* Common expected_word test method */ +static void +expected_word_check (TrackerParserTestFixture *fixture, + gconstpointer data) +{ + const TestDataExpectedWord *testdata = data; + const gchar *word; + gchar
*expected_nfkd; + gint position; + gint byte_offset_start; + gint byte_offset_end; + gboolean stop_word; + gint word_length; + + /* Reset the parser with our string */ + tracker_parser_reset (fixture->parser, + testdata->str, + strlen (testdata->str), + fixture->max_word_length, + testdata->enable_stemmer, + testdata->enable_unaccent, + fixture->ignore_stop_words, + fixture->ignore_reserved_words, + fixture->ignore_numbers); + + /* Process next word */ + word = tracker_parser_next (fixture->parser, + &position, + &byte_offset_start, + &byte_offset_end, + &stop_word, + &word_length); + + /* Expected word MUST always be in NFKD normalization */ + expected_nfkd = g_utf8_normalize (testdata->expected, + -1, + G_NORMALIZE_NFKD); + + /* Check if input is same as expected */ + g_assert_cmpstr (word, == , expected_nfkd); + + g_free (expected_nfkd); +} + +static void +test_stemmer (TrackerParserTestFixture *fixture, + gconstpointer data) +{ +#ifdef HAVE_LIBSTEMMER + expected_word_check (fixture, data); +#else + g_test_skip ("Built without libstemmer"); +#endif +} + +static void +test_unac (TrackerParserTestFixture *fixture, + gconstpointer data) +{ +#ifdef HAVE_UNAC + expected_word_check (fixture, data); +#else + g_test_skip ("Built without UNAC"); +#endif +} + +/* -------------- STOP WORD TESTS ----------------- */ + +/* Test struct for the stop-word tests */ +typedef struct TestDataStopWord TestDataStopWord; +struct TestDataStopWord { + const gchar *str; + gboolean ignore_stop_words; + gboolean is_expected_stop_word; +}; + +/* Common stop_word test method */ +static void +stop_word_check (TrackerParserTestFixture *fixture, + gconstpointer data) +{ + const TestDataStopWord *testdata = data; + gint position; + gint byte_offset_start; + gint byte_offset_end; + gboolean stop_word; + gint word_length; + + /* Reset the parser with our string */ + tracker_parser_reset (fixture->parser, + testdata->str, + strlen (testdata->str), + fixture->max_word_length,
fixture->enable_stemmer, + fixture->enable_unaccent, + testdata->ignore_stop_words, + fixture->ignore_reserved_words, + fixture->ignore_numbers); + + /* Process next word */ + tracker_parser_next (fixture->parser, + &position, + &byte_offset_start, + &byte_offset_end, + &stop_word, + &word_length); + + /* Check if input is same as stop_word */ + g_assert_cmpuint (stop_word, == , testdata->is_expected_stop_word); +} + +/* -------------- LIST OF TESTS ----------------- */ + +/* Normalization-related tests (unaccenting) */ +static const TestDataExpectedWord test_data_normalization[] = { + { "école", "ecole", FALSE, TRUE }, + { "ÉCOLE", "ecole", FALSE, TRUE }, + { "École", "ecole", FALSE, TRUE }, + { "e" "\xCC\x81" "cole", "ecole", FALSE, TRUE }, + { "E" "\xCC\x81" "COLE", "ecole", FALSE, TRUE }, + { "E" "\xCC\x81" "cole", "ecole", FALSE, TRUE }, + { NULL, NULL, FALSE, FALSE } +}; + +/* Unaccenting-related tests */ +static const TestDataExpectedWord test_data_unaccent[] = { + { "Murciélago", "murcielago", FALSE, TRUE }, + { "camión", "camion", FALSE, TRUE }, + { "desagüe", "desague", FALSE, TRUE }, + { "Ὰ", "α", FALSE, TRUE }, /* greek capital alpha with U+0300, composed */ + { "ὰ", "α", FALSE, TRUE }, /* greek small alpha with U+0300, composed */ + { "Ὶ", "ι", FALSE, TRUE }, /* greek capital iotta with U+0300, composed */ + { "ὶ", "ι", FALSE, TRUE }, /* greek small iotta with U+0300, composed */ + { "Ὼ", "ω", FALSE, TRUE }, /* greek capital omega with U+0300, composed */ + { "ὼ", "ω", FALSE, TRUE }, /* greek small omega with U+0300, composed */ + { "Ὰ", "α", FALSE, TRUE }, /* capital alpha with U+0300, decomposed */ + { "ὰ", "α", FALSE, TRUE }, /* small alpha with U+0300, decomposed */ + { "Ὶ", "ι", FALSE, TRUE }, /* capital iotta with U+0300, decomposed */ + { "ὶ", "ι", FALSE, TRUE }, /* small iotta with U+0300, decomposed */ + { "Ὼ", "ω", FALSE, TRUE }, /* capital omega with U+0300, decomposed */ + { "ὼ", "ω", FALSE, TRUE }, /* small omega with U+0300, decomposed */ 
+ { "aN͡Ga", "anga", FALSE, TRUE }, /* 0x0361 affects to two characters */ + { "aNG͡a", "anga", FALSE, TRUE }, /* 0x0361 affects to two characters */ + { "Murciélago", "murciélago", FALSE, FALSE }, + { "camión", "camión", FALSE, FALSE }, + { "desagüe", "desagüe", FALSE, FALSE }, + { NULL, NULL, FALSE, FALSE } +}; + +/* Stemming-related tests */ +static const TestDataExpectedWord test_data_stemming[] = { + { "ecole", "ecol", TRUE, TRUE }, + { "ecole", "ecole", FALSE, TRUE }, + { NULL, NULL, FALSE, FALSE } +}; + +/* Casefolding-related tests */ +static const TestDataExpectedWord test_data_casefolding[] = { + { "gross", "gross", FALSE, TRUE }, + { "GROSS", "gross", FALSE, TRUE }, + { "GrOsS", "gross", FALSE, TRUE }, + { "groß", "gross", FALSE, TRUE }, + { NULL, NULL, FALSE, FALSE } +}; + +/* Number of expected words tests */ +static const TestDataExpectedNWords test_data_nwords[] = { + { "The quick (\"brown\") fox can’t jump 32.3 feet, right?", TRUE, 8, -1 }, + { "The quick (\"brown\") fox can’t jump 32.3 feet, right?", FALSE, 10, -1 }, + /* Note: as of 0.9.15, the dot is always a word breaker, even between + * numbers. 
*/ + { "filename.txt", TRUE, 2, -1 }, + { ".hidden.txt", TRUE, 2, -1 }, + { "noextension.", TRUE, 1, -1 }, + { "ホモ・サピエンス", TRUE, 2, -1 }, /* katakana */ + { "喂人类", TRUE, 2, 3 }, /* chinese */ + { "Американские суда находятся в международных водах.", TRUE, 6, -1 }, /* russian */ + { "Bần chỉ là một anh nghèo xác", TRUE, 7, -1 }, /* vietnamese */ + { "ホモ・サピエンス 喂人类 katakana, chinese, english", TRUE, 7, 8 }, /* mixed */ + { NULL, FALSE, 0, 0 } +}; + +/* Stop-word tests (for english only) */ +static const TestDataStopWord test_data_stop_words[] = { + { "hello", TRUE, TRUE }, /* hello is stop word */ + { "hello", FALSE, FALSE }, + { "world", TRUE, FALSE }, /* world is not stop word */ + { "world", FALSE, FALSE }, + { NULL, FALSE, FALSE } +}; + +int +main (int argc, char **argv) +{ + gint i; + + g_test_init (&argc, &argv, NULL); + + /* We want the tests to properly find the stopwords dictionaries, so we + * need to set the following envvar with the path where the + * dictionaries are. */ + g_setenv ("TRACKER_LANGUAGE_STOP_WORDS_DIR", + TOP_SRCDIR "/src/libtracker-common/stop-words", + TRUE); + + /* Add normalization checks */ + for (i = 0; test_data_normalization[i].str != NULL; i++) { + gchar *testpath; + + testpath = g_strdup_printf ("/libtracker-fts/parser/normalization_%d", i); + g_test_add (testpath, + TrackerParserTestFixture, + &test_data_normalization[i], + test_common_setup, + expected_word_check, + test_common_teardown); + g_free (testpath); + } + + /* Add unaccent checks */ + for (i = 0; test_data_unaccent[i].str != NULL; i++) { + gchar *testpath; + + testpath = g_strdup_printf ("/libtracker-fts/parser/unaccent_%d", i); + g_test_add (testpath, + TrackerParserTestFixture, + &test_data_unaccent[i], + test_common_setup, + test_unac, + test_common_teardown); + g_free (testpath); + } + + /* Add casefolding checks */ + for (i = 0; test_data_casefolding[i].str != NULL; i++) { + gchar *testpath; + + testpath = g_strdup_printf ("/libtracker-fts/parser/casefolding_%d", 
i); + g_test_add (testpath, + TrackerParserTestFixture, + &test_data_casefolding[i], + test_common_setup, + expected_word_check, + test_common_teardown); + g_free (testpath); + } + + /* Add stemming checks */ + for (i = 0; test_data_stemming[i].str != NULL; i++) { + gchar *testpath; + + testpath = g_strdup_printf ("/libtracker-fts/parser/stemming_%d", i); + g_test_add (testpath, + TrackerParserTestFixture, + &test_data_stemming[i], + test_common_setup, + test_stemmer, + test_common_teardown); + g_free (testpath); + } + + /* Add expected number of words checks */ + for (i = 0; test_data_nwords[i].str != NULL; i++) { + gchar *testpath; + + testpath = g_strdup_printf ("/libtracker-fts/parser/nwords_%d", i); + g_test_add (testpath, + TrackerParserTestFixture, + &test_data_nwords[i], + test_common_setup, + expected_nwords_check, + test_common_teardown); + g_free (testpath); + } + + /* Add stop word checks */ + for (i = 0; test_data_stop_words[i].str != NULL; i++) { + gchar *testpath; + + testpath = g_strdup_printf ("/libtracker-fts/parser/stop_words_%d", i); + g_test_add (testpath, + TrackerParserTestFixture, + &test_data_stop_words[i], + test_common_setup, + stop_word_check, + test_common_teardown); + g_free (testpath); + } + + return g_test_run (); +} diff --git a/tests/libtracker-common/tracker-parser.c b/tests/libtracker-common/tracker-parser.c new file mode 100644 index 000000000..932cc2e69 --- /dev/null +++ b/tests/libtracker-common/tracker-parser.c @@ -0,0 +1,262 @@ +/* + * Copyright (C) 2010, Nokia <ivan.frade@nokia.com> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. 
+ * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this library; if not, write to the + * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, + * Boston, MA 02110-1301, USA. + */ + +#include "config.h" + +#include <string.h> +#include <locale.h> + +#include <glib.h> +#include <gio/gio.h> + +#include <libtracker-common/tracker-common.h> + +/* Normally this would be in the libtracker-fts config */ +#define DEFAULT_MAX_WORD_LENGTH 30 +#define DEFAULT_ENABLE_STEMMER FALSE +#define DEFAULT_ENABLE_UNACCENT TRUE +#define DEFAULT_IGNORE_STOP_WORDS TRUE +#define DEFAULT_IGNORE_NUMBERS TRUE + +static gchar *text; +static gchar *filename; +static gboolean verbose; + +/* Command Line options */ +static const GOptionEntry options [] = { + { + "verbose", 'v', G_OPTION_FLAG_NO_ARG, + G_OPTION_ARG_NONE, &verbose, + "Enable verbose output", + NULL + }, + { + "text", 't', 0, + G_OPTION_ARG_STRING, &text, + "Specific text to parse", + NULL + }, + { + "file", 'f', 0, + G_OPTION_ARG_STRING, &filename, + "Specific file to parse its contents", + NULL + }, + { NULL } +}; + +static gboolean +setup_context (gint argc, + gchar **argv) +{ + GOptionContext *context = NULL; + GError *error = NULL; + + /* Setup command line options */ + context = g_option_context_new ("- Test the Tracker FTS parser"); + g_option_context_add_main_entries (context, + options, + argv[0]); + + /* Parse input arguments */ + if (!g_option_context_parse (context, + &argc, + &argv, + &error)) + { + g_printerr ("%s\nRun '%s --help' to see a full list of available " + "command line options.\n", + error->message, + argv[0]); + g_error_free (error); + return FALSE; + } + + g_option_context_free (context); + return TRUE; +} + 
+static gboolean +load_file_contents (void) +{ + GError *error = NULL; + GFile *file; + + file = g_file_new_for_commandline_arg (filename); + if (!g_file_load_contents (file, NULL, &text, NULL, NULL, &error)) { + g_printerr ("Error loading file '%s' contents: '%s'\n", + filename, + error->message); + g_error_free (error); + g_object_unref (file); + return FALSE; + } + g_object_unref (file); + return TRUE; +} + +static gboolean +run_parsing (void) +{ + TrackerLanguage *language; + TrackerParser *parser; + GTimer *timer; + + /* Initialize timing */ + timer = g_timer_new (); + + /* Setup language for parser */ + language = tracker_language_new (NULL); + if (!language) { + g_printerr ("Language setup failed!\n"); + return FALSE; + } + + /* Create the parser */ + parser = tracker_parser_new (language); + if (!parser) { + g_printerr ("Parser creation failed!\n"); + g_object_unref (language); + return FALSE; + } + + /* Reset the parser with our string, reading the current FTS config */ + + tracker_parser_reset (parser, + text, + strlen (text), + DEFAULT_MAX_WORD_LENGTH, + DEFAULT_ENABLE_STEMMER, + DEFAULT_ENABLE_UNACCENT, + DEFAULT_IGNORE_STOP_WORDS, + TRUE, + DEFAULT_IGNORE_NUMBERS); + + /* Loop through all words! 
*/ + while (1) { + const gchar *word; + gint position; + gint byte_offset_start; + gint byte_offset_end; + gboolean stop_word; + gint word_length; + + + /* Process next word */ + word = tracker_parser_next (parser, + &position, + &byte_offset_start, + &byte_offset_end, + &stop_word, + &word_length); + + /* Stop loop if no more words */ + if (!word) { + break; + } + + if (verbose) { + gchar *word_hex; + gchar *original_word; + gchar *original_word_hex; + gint original_word_length; + + /* Get original word */ + original_word_length = byte_offset_end - byte_offset_start; + original_word = g_malloc (original_word_length + 1); + memcpy (original_word, + &text[byte_offset_start], + original_word_length); + original_word[original_word_length] = '\0'; + + /* Get hex strings */ + word_hex = tracker_strhex (word, word_length, ':'); + original_word_hex = tracker_strhex (original_word, + original_word_length, + ':'); + + g_print ("WORD at %d [%d,%d] Original: '%s' (%s), " + "Processed: '%s' (%s) (stop? %s)\n", + position, + byte_offset_start, + byte_offset_end, + original_word, + original_word_hex, + word, + word_hex, + stop_word ? "yes" : "no"); + + g_free (word_hex); + g_free (original_word_hex); + g_free (original_word); + } + } + + g_print ("\n----> Parsing finished after '%lf' seconds\n", + g_timer_elapsed (timer, NULL)); + + g_timer_destroy (timer); + + tracker_parser_free (parser); + g_object_unref (language); + return TRUE; +} + + +int +main (int argc, char **argv) +{ + /* Setup locale */ + setlocale (LC_ALL, ""); + + /* Setup context */ + if (!setup_context (argc, argv)) { + g_printerr ("Context setup failed... 
exiting\n"); + return -1; + } + + /* Either text or file must be given */ + if (filename == NULL && + text == NULL) { + g_printerr ("Either 'file' or 'text' options should be used\n" + "Run '%s --help' to see a full list of available " + "command line options.\n", + argv[0]); + return -2; + } + + /* If required, load file contents */ + if (filename != NULL && + !load_file_contents ()) { + g_printerr ("Loading file '%s' contents failed... exiting\n", + filename); + return -3; + } + + /* Run the parsing! */ + if (!run_parsing ()) { + g_printerr ("Parsing operation failed... exiting\n"); + return -4; + } + + /* Clean exit */ + if (filename) + g_free (text); + return 0; +} |