diff options
author | Martyn Russell <martyn@lanedo.com> | 2014-12-03 10:31:18 +0000 |
---|---|---|
committer | Martyn Russell <martyn@lanedo.com> | 2014-12-03 10:31:18 +0000 |
commit | 8d14867631b59ecdfadcd77ac407f19fc15ba4d8 (patch) | |
tree | 7502236577c41d633c8cd13e27dd70fb6f9e9580 /tests/libtracker-fts | |
parent | 8833933e45e77a67d06f21f47e1c70a1525350eb (diff) | |
download | tracker-8d14867631b59ecdfadcd77ac407f19fc15ba4d8.tar.gz |
libtracker-common: Move and fix tracker-parser unit tests from libtracker-fts
Diffstat (limited to 'tests/libtracker-fts')
-rw-r--r-- | tests/libtracker-fts/Makefile.am | 10 | ||||
-rw-r--r-- | tests/libtracker-fts/tracker-parser-test.c | 450 | ||||
-rw-r--r-- | tests/libtracker-fts/tracker-parser.c | 260 |
3 files changed, 1 insertions, 719 deletions
diff --git a/tests/libtracker-fts/Makefile.am b/tests/libtracker-fts/Makefile.am index aa2be7f2e..7e86b2de7 100644 --- a/tests/libtracker-fts/Makefile.am +++ b/tests/libtracker-fts/Makefile.am @@ -4,14 +4,10 @@ SUBDIRS = \ limits \ prefix -check_PROGRAMS += \ - tracker-parser - noinst_PROGRAMS += $(test_programs) test_programs = \ - tracker-fts-test \ - tracker-parser-test + tracker-fts-test AM_CPPFLAGS = \ $(BUILD_CFLAGS) \ @@ -32,10 +28,6 @@ LDADD = \ tracker_fts_test_SOURCES = tracker-fts-test.c -tracker_parser_test_SOURCES = tracker-parser-test.c - -tracker_parser_SOURCES = tracker-parser.c - EXTRA_DIST += \ data.ontology \ fts3aa-data.rq \ diff --git a/tests/libtracker-fts/tracker-parser-test.c b/tests/libtracker-fts/tracker-parser-test.c deleted file mode 100644 index 954c212bd..000000000 --- a/tests/libtracker-fts/tracker-parser-test.c +++ /dev/null @@ -1,450 +0,0 @@ -/* - * Copyright (C) 2010, Nokia <ivan.frade@nokia.com> - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this library; if not, write to the - * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, - * Boston, MA 02110-1301, USA. - */ - -#include "config.h" - -#include <string.h> - -#include <glib.h> -#include <gio/gio.h> - -#include <libtracker-common/tracker-parser.h> - -/* -------------- COMMON FOR ALL TESTS ----------------- */ - -/* Fixture object type */ -typedef struct { - /* The parser object */ - TrackerParser *parser; - - /* Default parser configuration to use */ - gint max_word_length; - gboolean enable_stemmer; - gboolean enable_unaccent; - gboolean ignore_stop_words; - gboolean ignore_reserved_words; - gboolean ignore_numbers; -} TrackerParserTestFixture; - -/* Common setup for all tests */ -static void -test_common_setup (TrackerParserTestFixture *fixture, - gconstpointer data) -{ - TrackerLanguage *language; - - /* Setup language for parser. We make sure that always English is used - * in the unit tests, because we want the English stemming method to - * be used. */ - language = tracker_language_new ("en"); - if (!language) { - g_critical ("Language setup failed!"); - return; - } - - /* Default conf parameters */ - fixture->max_word_length = 50; - fixture->enable_stemmer = TRUE; - fixture->enable_unaccent = TRUE; - fixture->ignore_stop_words = TRUE; - fixture->ignore_reserved_words = TRUE; - fixture->ignore_numbers = TRUE; - - /* Create the parser */ - fixture->parser = tracker_parser_new (language); - if (!fixture->parser) { - g_critical ("Parser creation failed!"); - return; - } - - g_object_unref (language); -} - -/* Common teardown for all tests */ -static void -test_common_teardown (TrackerParserTestFixture *fixture, - gconstpointer data) -{ - if (fixture->parser) { - tracker_parser_free (fixture->parser); - } -} - -/* -------------- EXPECTED NUMBER OF WORDS TESTS ----------------- */ - -/* Test struct for the expected-nwords tests */ -typedef struct TestDataExpectedNWords TestDataExpectedNWords; -struct TestDataExpectedNWords { - const gchar *str; - gboolean ignore_numbers; - guint expected_nwords; - gint alternate_expected_nwords; -}; - -/* Common expected_word test method */ -static void -expected_nwords_check (TrackerParserTestFixture *fixture, - gconstpointer data) -{ - const TestDataExpectedNWords *testdata = data; - gint position; - gint byte_offset_start; - gint byte_offset_end; - gboolean stop_word; - gint word_length; - guint nwords = 0; - - /* Reset the parser with the test string */ - tracker_parser_reset (fixture->parser, - testdata->str, - strlen (testdata->str), - fixture->max_word_length, - fixture->enable_stemmer, - fixture->enable_unaccent, - fixture->ignore_stop_words, - fixture->ignore_reserved_words, - testdata->ignore_numbers); - - /* Count number of output words */ - while (tracker_parser_next (fixture->parser, - &position, - &byte_offset_start, - &byte_offset_end, - &stop_word, - &word_length)) { - nwords++; - } - - /* Some tests will yield different results when using different versions of - * libicu (e.g. chinese ones). Handle this by allowing an alternate number - * of words expected in the test. Note that our whole purpose is to test - * that we can split different words, not much about the number of words - * itself (althogh we should check that as well) */ - - if (testdata->alternate_expected_nwords < 0) - /* Check if input is same as expected */ - g_assert_cmpuint (nwords, == , testdata->expected_nwords); - else - /* We'll assert if both expected number of words fail */ - g_assert ((nwords == testdata->expected_nwords) || - (nwords == testdata->alternate_expected_nwords)); -} - -/* -------------- EXPECTED WORD TESTS ----------------- */ - -/* Test struct for the expected-word tests */ -typedef struct TestDataExpectedWord TestDataExpectedWord; -struct TestDataExpectedWord { - const gchar *str; - const gchar *expected; - gboolean enable_stemmer; - gboolean enable_unaccent; -}; - -/* Common expected_word test method */ -static void -expected_word_check (TrackerParserTestFixture *fixture, - gconstpointer data) -{ - const TestDataExpectedWord *testdata = data; - const gchar *word; - gchar *expected_nfkd; - gint position; - gint byte_offset_start; - gint byte_offset_end; - gboolean stop_word; - gint word_length; - - /* Reset the parser with our string */ - tracker_parser_reset (fixture->parser, - testdata->str, - strlen (testdata->str), - fixture->max_word_length, - testdata->enable_stemmer, - testdata->enable_unaccent, - fixture->ignore_stop_words, - fixture->ignore_reserved_words, - fixture->ignore_numbers); - - /* Process next word */ - word = tracker_parser_next (fixture->parser, - &position, - &byte_offset_start, - &byte_offset_end, - &stop_word, - &word_length); - - /* Expected word MUST always be in NFKD normalization */ - expected_nfkd = g_utf8_normalize (testdata->expected, - -1, - G_NORMALIZE_NFKD); - - /* Check if input is same as expected */ - g_assert_cmpstr (word, == , expected_nfkd); - - g_free (expected_nfkd); -} - -static void -test_stemmer (TrackerParserTestFixture *fixture, - gconstpointer data) -{ -#ifdef HAVE_LIBSTEMMER - expected_word_check (fixture, data); -#else - g_test_skip ("Built without libstemmer"); -#endif -} - -static void -test_unac (TrackerParserTestFixture *fixture, - gconstpointer data) -{ -#ifdef HAVE_UNAC - expected_word_check (fixture, data); -#else - g_test_skip ("Built without UNAC"); -#endif -} - -/* -------------- STOP WORD TESTS ----------------- */ - -/* Test struct for the stop-word tests */ -typedef struct TestDataStopWord TestDataStopWord; -struct TestDataStopWord { - const gchar *str; - gboolean ignore_stop_words; - gboolean is_expected_stop_word; -}; - -/* Common stop__word test method */ -static void -stop_word_check (TrackerParserTestFixture *fixture, - gconstpointer data) -{ - const TestDataStopWord *testdata = data; - gint position; - gint byte_offset_start; - gint byte_offset_end; - gboolean stop_word; - gint word_length; - - /* Reset the parser with our string */ - tracker_parser_reset (fixture->parser, - testdata->str, - strlen (testdata->str), - fixture->max_word_length, - fixture->enable_stemmer, - fixture->enable_unaccent, - testdata->ignore_stop_words, - fixture->ignore_reserved_words, - fixture->ignore_numbers); - - /* Process next word */ - tracker_parser_next (fixture->parser, - &position, - &byte_offset_start, - &byte_offset_end, - &stop_word, - &word_length); - - /* Check if input is same as stop_word */ - g_assert_cmpuint (stop_word, == , testdata->is_expected_stop_word); -} - -/* -------------- LIST OF TESTS ----------------- */ - -/* Normalization-related tests (unaccenting) */ -static const TestDataExpectedWord test_data_normalization[] = { - { "école", "ecole", FALSE, TRUE }, - { "ÉCOLE", "ecole", FALSE, TRUE }, - { "École", "ecole", FALSE, TRUE }, - { "e" "\xCC\x81" "cole", "ecole", FALSE, TRUE }, - { "E" "\xCC\x81" "COLE", "ecole", FALSE, TRUE }, - { "E" "\xCC\x81" "cole", "ecole", FALSE, TRUE }, - { NULL, NULL, FALSE, FALSE } -}; - -/* Unaccenting-related tests */ -static const TestDataExpectedWord test_data_unaccent[] = { - { "Murciélago", "murcielago", FALSE, TRUE }, - { "camión", "camion", FALSE, TRUE }, - { "desagüe", "desague", FALSE, TRUE }, - { "Ὰ", "α", FALSE, TRUE }, /* greek capital alpha with U+0300, composed */ - { "ὰ", "α", FALSE, TRUE }, /* greek small alpha with U+0300, composed */ - { "Ὶ", "ι", FALSE, TRUE }, /* greek capital iotta with U+0300, composed */ - { "ὶ", "ι", FALSE, TRUE }, /* greek small iotta with U+0300, composed */ - { "Ὼ", "ω", FALSE, TRUE }, /* greek capital omega with U+0300, composed */ - { "ὼ", "ω", FALSE, TRUE }, /* greek small omega with U+0300, composed */ - { "Ὰ", "α", FALSE, TRUE }, /* capital alpha with U+0300, decomposed */ - { "ὰ", "α", FALSE, TRUE }, /* small alpha with U+0300, decomposed */ - { "Ὶ", "ι", FALSE, TRUE }, /* capital iotta with U+0300, decomposed */ - { "ὶ", "ι", FALSE, TRUE }, /* small iotta with U+0300, decomposed */ - { "Ὼ", "ω", FALSE, TRUE }, /* capital omega with U+0300, decomposed */ - { "ὼ", "ω", FALSE, TRUE }, /* small omega with U+0300, decomposed */ - { "aN͡Ga", "anga", FALSE, TRUE }, /* 0x0361 affects to two characters */ - { "aNG͡a", "anga", FALSE, TRUE }, /* 0x0361 affects to two characters */ - { "Murciélago", "murciélago", FALSE, FALSE }, - { "camión", "camión", FALSE, FALSE }, - { "desagüe", "desagüe", FALSE, FALSE }, - { NULL, NULL, FALSE, FALSE } -}; - -/* Stemming-related tests */ -static const TestDataExpectedWord test_data_stemming[] = { - { "ecole", "ecol", TRUE, TRUE }, - { "ecole", "ecole", FALSE, TRUE }, - { NULL, NULL, FALSE, FALSE } -}; - -/* Casefolding-related tests */ -static const TestDataExpectedWord test_data_casefolding[] = { - { "gross", "gross", FALSE, TRUE }, - { "GROSS", "gross", FALSE, TRUE }, - { "GrOsS", "gross", FALSE, TRUE }, - { "groß", "gross", FALSE, TRUE }, - { NULL, NULL, FALSE, FALSE } -}; - -/* Number of expected words tests */ -static const TestDataExpectedNWords test_data_nwords[] = { - { "The quick (\"brown\") fox can’t jump 32.3 feet, right?", TRUE, 8, -1 }, - { "The quick (\"brown\") fox can’t jump 32.3 feet, right?", FALSE, 10, -1 }, - /* Note: as of 0.9.15, the dot is always a word breaker, even between - * numbers. */ - { "filename.txt", TRUE, 2, -1 }, - { ".hidden.txt", TRUE, 2, -1 }, - { "noextension.", TRUE, 1, -1 }, - { "ホモ・サピエンス", TRUE, 2, -1 }, /* katakana */ - { "喂人类", TRUE, 2, 3 }, /* chinese */ - { "Американские суда находятся в международных водах.", TRUE, 6, -1 }, /* russian */ - { "Bần chỉ là một anh nghèo xác", TRUE, 7, -1 }, /* vietnamese */ - { "ホモ・サピエンス 喂人类 katakana, chinese, english", TRUE, 7, 8 }, /* mixed */ - { NULL, FALSE, 0, 0 } -}; - -/* Stop-word tests (for english only) */ -static const TestDataStopWord test_data_stop_words[] = { - { "hello", TRUE, TRUE }, /* hello is stop word */ - { "hello", FALSE, FALSE }, - { "world", TRUE, FALSE }, /* world is not stop word */ - { "world", FALSE, FALSE }, - { NULL, FALSE, FALSE } -}; - -int -main (int argc, char **argv) -{ - gint i; - - g_test_init (&argc, &argv, NULL); - - /* We want the tests to properly find the stopwords dictionaries, so we - * need to set the following envvar with the path where the - * dictionaries are. */ - g_setenv ("TRACKER_LANGUAGE_STOP_WORDS_DIR", - TOP_SRCDIR "/src/libtracker-common/stop-words", - TRUE); - - /* Add normalization checks */ - for (i = 0; test_data_normalization[i].str != NULL; i++) { - gchar *testpath; - - testpath = g_strdup_printf ("/libtracker-fts/parser/normalization_%d", i); - g_test_add (testpath, - TrackerParserTestFixture, - &test_data_normalization[i], - test_common_setup, - expected_word_check, - test_common_teardown); - g_free (testpath); - } - - /* Add unaccent checks */ - for (i = 0; test_data_unaccent[i].str != NULL; i++) { - gchar *testpath; - - testpath = g_strdup_printf ("/libtracker-fts/parser/unaccent_%d", i); - g_test_add (testpath, - TrackerParserTestFixture, - &test_data_unaccent[i], - test_common_setup, - test_unac, - test_common_teardown); - g_free (testpath); - } - - /* Add casefolding checks */ - for (i = 0; test_data_casefolding[i].str != NULL; i++) { - gchar *testpath; - - testpath = g_strdup_printf ("/libtracker-fts/parser/casefolding_%d", i); - g_test_add (testpath, - TrackerParserTestFixture, - &test_data_casefolding[i], - test_common_setup, - expected_word_check, - test_common_teardown); - g_free (testpath); - } - - /* Add stemming checks */ - for (i = 0; test_data_stemming[i].str != NULL; i++) { - gchar *testpath; - - testpath = g_strdup_printf ("/libtracker-fts/parser/stemming_%d", i); - g_test_add (testpath, - TrackerParserTestFixture, - &test_data_stemming[i], - test_common_setup, - test_stemmer, - test_common_teardown); - g_free (testpath); - } - - /* Add expected number of words checks */ - for (i = 0; test_data_nwords[i].str != NULL; i++) { - gchar *testpath; - - testpath = g_strdup_printf ("/libtracker-fts/parser/nwords_%d", i); - g_test_add (testpath, - TrackerParserTestFixture, - &test_data_nwords[i], - test_common_setup, - expected_nwords_check, - test_common_teardown); - g_free (testpath); - } - - /* Add stop word checks */ - for (i = 0; test_data_stop_words[i].str != NULL; i++) { - gchar *testpath; - - testpath = g_strdup_printf ("/libtracker-fts/parser/stop_words_%d", i); - g_test_add (testpath, - TrackerParserTestFixture, - &test_data_stop_words[i], - test_common_setup, - stop_word_check, - test_common_teardown); - g_free (testpath); - } - - return g_test_run (); -} diff --git a/tests/libtracker-fts/tracker-parser.c b/tests/libtracker-fts/tracker-parser.c deleted file mode 100644 index 2c725d81a..000000000 --- a/tests/libtracker-fts/tracker-parser.c +++ /dev/null @@ -1,260 +0,0 @@ -/* - * Copyright (C) 2010, Nokia <ivan.frade@nokia.com> - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this library; if not, write to the - * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, - * Boston, MA 02110-1301, USA. - */ - -#include "config.h" - -#include <string.h> -#include <locale.h> - -#include <glib.h> -#include <gio/gio.h> - -#include <libtracker-fts/tracker-parser.h> -#include <libtracker-fts/tracker-fts-config.h> -#include <libtracker-common/tracker-common.h> - -static gchar *text; -static gchar *filename; -static gboolean verbose; - -/* Command Line options */ -static const GOptionEntry options [] = { - { - "verbose", 'v', G_OPTION_FLAG_NO_ARG, - G_OPTION_ARG_NONE, &verbose, - "Enable verbose output", - NULL - }, - { - "text", 't', 0, - G_OPTION_ARG_STRING, &text, - "Specific text to parse", - NULL - }, - { - "file", 'f', 0, - G_OPTION_ARG_STRING, &filename, - "Specific file to parse its contents", - NULL - }, - { NULL } -}; - -static gboolean -setup_context (gint argc, - gchar **argv) -{ - GOptionContext *context = NULL; - GError *error = NULL; - - /* Setup command line options */ - context = g_option_context_new ("- Test the Tracker FTS parser"); - g_option_context_add_main_entries (context, - options, - argv[0]); - - /* Parse input arguments */ - if (!g_option_context_parse (context, - &argc, - &argv, - &error)) - { - g_printerr ("%s\nRun '%s --help' to see a full list of available " - "command line options.\n", - error->message, - argv[0]); - g_error_free (error); - return FALSE; - } - - g_option_context_free (context); - return TRUE; -} - -static gboolean -load_file_contents (void) -{ - GError *error = NULL; - GFile *file; - - file = g_file_new_for_commandline_arg (filename); - if (!g_file_load_contents (file, NULL, &text, NULL, NULL, &error)) { - g_printerr ("Error loading file '%s' contents: '%s'\n", - filename, - error->message); - g_error_free (error); - g_object_unref (file); - return FALSE; - } - g_object_unref (file); - return TRUE; -} - -static gboolean -run_parsing (void) -{ - TrackerFTSConfig *config; - TrackerLanguage *language; - TrackerParser *parser; - GTimer *timer; - - /* Initialize timing */ - timer = g_timer_new (); - - /* Read config file */ - config = tracker_fts_config_new (); - - /* Setup language for parser */ - language = tracker_language_new (NULL); - if (!language) { - g_printerr ("Language setup failed!\n"); - return FALSE; - } - - /* Create the parser */ - parser = tracker_parser_new (language); - if (!parser) { - g_printerr ("Parser creation failed!\n"); - g_object_unref (language); - return FALSE; - } - - /* Reset the parser with our string, reading the current FTS config */ - tracker_parser_reset (parser, - text, - strlen (text), - tracker_fts_config_get_max_word_length (config), - tracker_fts_config_get_enable_stemmer (config), - tracker_fts_config_get_enable_unaccent (config), - tracker_fts_config_get_ignore_stop_words (config), - TRUE, - tracker_fts_config_get_ignore_numbers (config)); - - /* Loop through all words! */ - while (1) { - const gchar *word; - gint position; - gint byte_offset_start; - gint byte_offset_end; - gboolean stop_word; - gint word_length; - - - /* Process next word */ - word = tracker_parser_next (parser, - &position, - &byte_offset_start, - &byte_offset_end, - &stop_word, - &word_length); - - /* Stop loop if no more words */ - if (!word) { - break; - } - - if (verbose) { - gchar *word_hex; - gchar *original_word; - gchar *original_word_hex; - gint original_word_length; - - /* Get original word */ - original_word_length = byte_offset_end - byte_offset_start; - original_word = g_malloc (original_word_length + 1); - memcpy (original_word, - &text[byte_offset_start], - original_word_length); - original_word[original_word_length] = '\0'; - - /* Get hex strings */ - word_hex = tracker_strhex (word, word_length, ':'); - original_word_hex = tracker_strhex (original_word, - original_word_length, - ':'); - - g_print ("WORD at %d [%d,%d] Original: '%s' (%s), " - "Processed: '%s' (%s) (stop? %s)\n", - position, - byte_offset_start, - byte_offset_end, - original_word, - original_word_hex, - word, - word_hex, - stop_word ? "yes" : "no"); - - g_free (word_hex); - g_free (original_word_hex); - g_free (original_word); - } - } - - g_print ("\n----> Parsing finished after '%lf' seconds\n", - g_timer_elapsed (timer, NULL)); - - g_timer_destroy (timer); - - tracker_parser_free (parser); - g_object_unref (language); - return TRUE; -} - - -int -main (int argc, char **argv) -{ - /* Setup locale */ - setlocale (LC_ALL, ""); - - /* Setup context */ - if (!setup_context (argc, argv)) { - g_printerr ("Context setup failed... exiting\n"); - return -1; - } - - /* Either text or file must be given */ - if (filename == NULL && - text == NULL) { - g_printerr ("Either 'file' or 'text' options should be used\n" - "Run '%s --help' to see a full list of available " - "command line options.\n", - argv[0]); - return -2; - } - - /* If required, load file contents */ - if (filename != NULL && - !load_file_contents ()) { - g_printerr ("Loading file '%s' contents failed... exiting\n", - filename); - return -3; - } - - /* Run the parsing! */ - if (!run_parsing ()) { - g_printerr ("Parsing operation failed... exiting\n"); - return -4; - } - - /* Clean exit */ - if (filename) - g_free (text); - return 0; -} |