summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJamie McCracken <jamiemcc@src.gnome.org>2007-01-20 00:24:54 +0000
committerJamie McCracken <jamiemcc@src.gnome.org>2007-01-20 00:24:54 +0000
commit5c0428636e0ce179977c5073c80a1ddf990657a4 (patch)
treea5c8b2b19e633929ad1be3b72e4b3d9c152b9f8e
parent353f3aa0da750be07f9711eda2bed81e88ea4ff5 (diff)
downloadtracker-5c0428636e0ce179977c5073c80a1ddf990657a4.tar.gz
big speed-up of indexer, also smoothed it out and fixed bugs
svn path=/trunk/; revision=402
-rw-r--r--src/trackerd/depot.c6
-rw-r--r--src/trackerd/tracker-db-sqlite.c132
-rw-r--r--src/trackerd/tracker-indexer.c2
-rw-r--r--src/trackerd/tracker-parser.c3
-rw-r--r--src/trackerd/tracker-utils.c145
-rw-r--r--src/trackerd/tracker-utils.h17
-rw-r--r--src/trackerd/trackerd.c125
7 files changed, 361 insertions, 69 deletions
diff --git a/src/trackerd/depot.c b/src/trackerd/depot.c
index 4ab1d6203..0f350e146 100644
--- a/src/trackerd/depot.c
+++ b/src/trackerd/depot.c
@@ -263,7 +263,7 @@ DEPOT *dpopen(const char *name, int omode, int bnum){
return NULL;
}
msiz = DP_HEADSIZ + bnum * sizeof(int);
- map = mmap(0, msiz, PROT_READ | ((mode & DP_OWRITER) ? PROT_WRITE : 0), MAP_PRIVATE, fd, 0);
+ map = mmap(0, msiz, PROT_READ | ((mode & DP_OWRITER) ? PROT_WRITE : 0), MAP_SHARED, fd, 0);
if(map == MAP_FAILED){
close(fd);
dpecodeset(DP_EMAP, __FILE__, __LINE__);
@@ -900,7 +900,7 @@ int dpoptimize(DEPOT *depot, int bnum){
depot->fbpool[i+1] = -1;
}
depot->msiz = tdepot->msiz;
- depot->map = mmap(0, depot->msiz, PROT_READ | PROT_WRITE, MAP_PRIVATE, depot->fd, 0);
+ depot->map = mmap(0, depot->msiz, PROT_READ | PROT_WRITE, MAP_SHARED, depot->fd, 0);
if(depot->map == MAP_FAILED){
dpecodeset(DP_EMAP, __FILE__, __LINE__);
depot->fatal = TRUE;
@@ -1299,7 +1299,7 @@ char *dpsnaffle(const char *name, const char* kbuf, int ksiz, int *sp){
return NULL;
}
msiz = DP_HEADSIZ + bnum * sizeof(int);
- map = mmap(0, msiz, PROT_READ, MAP_PRIVATE, fd, 0);
+ map = mmap(0, msiz, PROT_READ, MAP_SHARED, fd, 0);
if(map == MAP_FAILED){
close(fd);
dpecodeset(DP_EMAP, __FILE__, __LINE__);
diff --git a/src/trackerd/tracker-db-sqlite.c b/src/trackerd/tracker-db-sqlite.c
index 6eae293ac..578061b73 100644
--- a/src/trackerd/tracker-db-sqlite.c
+++ b/src/trackerd/tracker-db-sqlite.c
@@ -724,6 +724,7 @@ tracker_db_connect_cache (void)
tracker_db_exec_no_reply (db_con, "CREATE INDEX WordWord ON Words (Word)");
tracker_db_exec_no_reply (db_con, "CREATE INDEX WordWordCount ON Words (WordCount)");
tracker_db_exec_no_reply (db_con, "CREATE INDEX ServiceWordID ON ServiceWords (ServiceID)");
+ tracker_db_exec_no_reply (db_con, "ANALYZE");
}
db_con->thread = NULL;
@@ -2806,6 +2807,45 @@ delete_index_for_service (DBConnection *db_con, DBConnection *blob_db_con, guint
}
+/*
+ * GHRFunc for g_hash_table_foreach_remove(): removes the first WordDetails
+ * entry whose id matches the file id packed into @data from the GSList
+ * stored as @value, and keeps the word counters in step.
+ *
+ * The hash table stores the list head pointer by value and a foreach
+ * callback cannot replace it, so the head node must keep its address for
+ * as long as the entry stays in the table. A matching head is therefore
+ * removed by moving the second entry's data into the head node and
+ * unlinking the second node instead.
+ *
+ * Returns TRUE only when the list became empty, which tells GLib to drop
+ * the word from the table; FALSE otherwise.
+ */
+static gboolean
+delete_id_from_list (gpointer key,
+		     gpointer value,
+		     gpointer data)
+{
+	GSList	*head = value;
+	GSList	*l;
+	guint32	file_id = GPOINTER_TO_UINT (data);
+
+	for (l = head; l; l = l->next) {
+		WordDetails *wd = l->data;
+
+		if (wd->id != file_id) {
+			continue;
+		}
+
+		g_slice_free (WordDetails, wd);
+		tracker->word_detail_count--;
+
+		if (l != head) {
+			/* unlinking a non-head node leaves the stored head valid */
+			head = g_slist_delete_link (head, l);
+			return FALSE;
+		}
+
+		if (head->next) {
+			/* keep the head node in place: adopt the second entry's data and unlink the second node */
+			head->data = head->next->data;
+			head = g_slist_delete_link (head, head->next);
+			return FALSE;
+		}
+
+		/* that was the only entry for this word - free the node and drop the word */
+		g_slist_free (head);
+		tracker->word_count--;
+
+		return TRUE;
+	}
+
+	return FALSE;
+}
+
+
+/*
+ * Removes the cached word entries that belong to @file_id from
+ * tracker->cached_table. Does nothing if the table has not been created.
+ */
+static void
+delete_cache_words (guint32 file_id)
+{
+	if (!tracker->cached_table) {
+		return;
+	}
+
+	g_hash_table_foreach_remove (tracker->cached_table, delete_id_from_list, GUINT_TO_POINTER (file_id));
+}
+
void
tracker_db_delete_file (DBConnection *db_con, DBConnection *blob_db_con, guint32 file_id)
@@ -2827,12 +2867,14 @@ tracker_db_delete_file (DBConnection *db_con, DBConnection *blob_db_con, guint32
tracker_exec_proc (db_con, "DeleteFile8", 1, str_file_id);
tracker_exec_proc (db_con, "DeleteFile9", 1, str_file_id);
- if (db_con->user_data) {
+ delete_cache_words (file_id);
+
+/* if (db_con->user_data) {
tracker_exec_proc (db_con->user_data, "DeleteFile10", 1, str_file_id);
} else {
tracker_log ("WARNING: Cache DB not found");
}
-
+*/
tracker_db_end_transaction (db_con);
g_free (str_file_id);
@@ -2864,10 +2906,12 @@ tracker_db_delete_directory (DBConnection *db_con, DBConnection *blob_db_con, gu
id = atoi (row[0]);
delete_index_for_service (db_con, blob_db_con, id);
- if (db_con->user_data) {
+ delete_cache_words (id);
+
+/* if (db_con->user_data) {
tracker_exec_proc (db_con->user_data, "DeleteServiceWordForID", 1, row[0]);
}
-
+*/
}
@@ -3582,6 +3626,8 @@ add_word_to_hash (const char *word, int id, int count)
g_mutex_unlock (tracker->cached_word_table_mutex);
}
+
+
static char *
cache_word_exists (DBConnection *db_con, const char *word)
{
@@ -3600,6 +3646,7 @@ cache_word_exists (DBConnection *db_con, const char *word)
cache_word->count++;
return tracker_int_to_str (cache_word->id);
}
+
}
res = tracker_exec_proc (db_con, "GetWordID", 1, word);
@@ -3657,6 +3704,31 @@ append_cache_word (DBConnection *db_con, const char *word, guint32 service_id, i
// tracker_log ("appending word %s", word);
+
+ if (tracker->use_extra_memory) {
+ WordDetails *word_details;
+ GSList *list;
+
+ word_details = g_slice_new (WordDetails);
+
+ word_details->id = service_id;
+ word_details->amalgamated = tracker_indexer_calc_amalgamated (service_type, score);
+
+ list = g_hash_table_lookup (tracker->cached_table, word);
+
+ if (!list) {
+ tracker->word_count++;
+ }
+
+ list = g_slist_prepend (list, word_details);
+ g_hash_table_insert (tracker->cached_table, g_strdup (word), list);
+
+
+ tracker->word_detail_count++;
+
+ return;
+ }
+
word_id = cache_word_exists (db_con, word);
str_service_id = tracker_uint_to_str (service_id);
@@ -3671,12 +3743,49 @@ append_cache_word (DBConnection *db_con, const char *word, guint32 service_id, i
g_free (str_score);
}
+/* Extracts the service type stored in bits 24-31 of the amalgamated field. */
+static inline guint8
+get_service_type_from_detail (WordDetails *details)
+{
+	guint8 service_type = (details->amalgamated >> 24) & 0xFF;
+
+	return service_type;
+}
-static void
+/*
+ * Extracts the 16-bit score from the amalgamated field: bits 16-23 form
+ * the high byte of the result and bits 8-15 the low byte.
+ */
+static inline guint16
+get_score_from_detail (WordDetails *details)
+{
+	unsigned char high_byte = (details->amalgamated >> 16) & 0xFF;
+	unsigned char low_byte = (details->amalgamated >> 8) & 0xFF;
+
+	return (high_byte << 8) | low_byte;
+}
+
+
+static gboolean
update_cache_word (DBConnection *db_con, const char *word, guint32 service_id, int score)
{
char *str_service_id, *str_score;
+ if (tracker->use_extra_memory) {
+ WordDetails *word_details;
+ GSList *list, *l;
+
+ list = g_hash_table_lookup (tracker->cached_table, word);
+
+ for (l = list; l; l=l->next) {
+ word_details = l->data;
+ if (word_details->id == service_id) {
+ score += get_score_from_detail (word_details);
+ word_details->amalgamated = tracker_indexer_calc_amalgamated (get_service_type_from_detail (word_details), score);
+ return TRUE;
+ }
+ }
+
+ return FALSE;
+ }
+
+
str_service_id = tracker_uint_to_str (service_id);
str_score = tracker_int_to_str (score);
@@ -3684,6 +3793,8 @@ update_cache_word (DBConnection *db_con, const char *word, guint32 service_id, i
g_free (str_service_id);
g_free (str_score);
+
+ return FALSE;
}
@@ -3885,14 +3996,19 @@ update_index_data (gpointer key,
word = (char *) key;
score = GPOINTER_TO_INT (value);
info = user_data;
+
+ if (!update_cache_word (info->db_con, word, info->service_id, score)) {
+ tracker_indexer_update_word (tracker->file_indexer, word, info->service_id, info->service_type_id, score, FALSE);
+ }
+
+ return;
+
char *str_service_id = tracker_uint_to_str (info->service_id);
if (score != 0) {
res = tracker_exec_proc (info->db_con, "ServiceCached", 2, str_service_id, word);
- g_free (str_service_id);
-
if (res) {
if (res[0] && res[0][0]) {
if (res[0][0][0] == '1') {
@@ -3909,6 +4025,8 @@ update_index_data (gpointer key,
update_cache_word (info->db_con, word, info->service_id, score);
}
}
+
+ g_free (str_service_id);
}
diff --git a/src/trackerd/tracker-indexer.c b/src/trackerd/tracker-indexer.c
index eb8c2ded1..88356bdb9 100644
--- a/src/trackerd/tracker-indexer.c
+++ b/src/trackerd/tracker-indexer.c
@@ -32,7 +32,7 @@ typedef struct {
static gboolean shutdown;
-static inline guint8
+static inline guint16
get_score (WordDetails *details)
{
unsigned char a[2];
diff --git a/src/trackerd/tracker-parser.c b/src/trackerd/tracker-parser.c
index 385943a2d..20df34675 100644
--- a/src/trackerd/tracker-parser.c
+++ b/src/trackerd/tracker-parser.c
@@ -100,7 +100,8 @@ word_is_valid (const char *word)
c = g_utf8_get_char (word);
- if (g_unichar_isalnum (c)) {
+ if (tracker->index_numbers && g_unichar_isalnum (c)) {
+
return numbered_word_is_valid (word);
}
diff --git a/src/trackerd/tracker-utils.c b/src/trackerd/tracker-utils.c
index be2c391fc..73d07c673 100644
--- a/src/trackerd/tracker-utils.c
+++ b/src/trackerd/tracker-utils.c
@@ -31,9 +31,11 @@
#include "tracker-dbus.h"
#include "tracker-utils.h"
+#include "tracker-indexer.h"
#include "xdgmime.h"
+
extern Tracker *tracker;
char *implemented_services[] = {"Files", "Folders", "Documents", "Images", "Music", "Videos", "Text Files", "Development Files", "Other Files",
@@ -2075,7 +2077,7 @@ tracker_load_config_file ()
if (g_key_file_has_key (key_file, "Indexing", "Throttle", NULL)) {
tracker->throttle = g_key_file_get_integer (key_file, "Indexing", "Throttle", NULL);
} else {
- tracker->throttle = 5;
+ tracker->throttle = 0;
}
if (g_key_file_has_key (key_file, "Indexing", "EnableIndexing", NULL)) {
@@ -2943,3 +2945,144 @@ tracker_get_snippet (const char *txt, char **terms, int length)
return NULL;
}
}
+
+
+/*
+ * GHFunc: prepends @key to the GSList whose head pointer @data points at.
+ * Declared with the exact GHFunc signature (void return) so that it is
+ * not called through an incompatible function pointer type.
+ */
+static void
+prepend_key_pointer (gpointer key,
+		     gpointer value,
+		     gpointer data)
+{
+	GSList **plist = data;
+
+	*plist = g_slist_prepend (*plist, key);
+}
+
+
+/*
+ * Returns a newly allocated GSList holding every key pointer of @table.
+ * The keys themselves are not copied, so the caller frees only the list
+ * (g_slist_free) and must not use an element after its key has been
+ * removed from the table.
+ */
+static GSList *
+g_hash_table_key_slist (GHashTable *table)
+{
+	GSList *rv = NULL;
+
+	g_hash_table_foreach (table, prepend_key_pointer, &rv);
+
+	return rv;
+}
+
+
+/*
+ * GCompareFunc ordering two words of tracker->cached_table by the number
+ * of entries cached for them, fewest first.
+ *
+ * The lengths are unsigned, so they are compared explicitly rather than
+ * subtracted; a difference would wrap around before being turned into the
+ * signed return value.
+ *
+ * Returns a negative value, zero or a positive value when @a has fewer,
+ * the same number of, or more cached entries than @b.
+ */
+static gint
+sort_func (gconstpointer a, gconstpointer b)
+{
+	guint len_a, len_b;
+
+	len_a = g_slist_length (g_hash_table_lookup (tracker->cached_table, a));
+	len_b = g_slist_length (g_hash_table_lookup (tracker->cached_table, b));
+
+	if (len_a < len_b) {
+		return -1;
+	}
+
+	return (len_a > len_b) ? 1 : 0;
+}
+
+
+/*
+ * Writes all cached entries of @word to the file indexer in one chunk.
+ *
+ * The WordDetails in @list are copied into a contiguous array, each list
+ * element and the list itself are freed, and the array is handed to
+ * tracker_indexer_append_word_chunk(). Afterwards the update counter is
+ * incremented and the cached word/detail counters are reduced.
+ * @list must not be used by the caller once this returns.
+ */
+static void
+flush_list (GSList *list, const char *word)
+{
+	GSList		*node;
+	WordDetails	*chunk;
+	int		total, n;
+
+	total = g_slist_length (list);
+	chunk = g_malloc (sizeof (WordDetails) * total);
+
+	node = list;
+	for (n = 0; node && n < total; n++) {
+		WordDetails *src = node->data;
+
+		chunk[n].id = src->id;
+		chunk[n].amalgamated = src->amalgamated;
+
+		/* the entry has been copied, so the slice can go now */
+		g_slice_free (WordDetails, src);
+
+		node = node->next;
+	}
+
+	g_slist_free (list);
+
+	tracker_indexer_append_word_chunk (tracker->file_indexer, word, chunk, total);
+
+	g_free (chunk);
+
+	tracker->update_count += 1;
+	tracker->word_detail_count -= total;
+	tracker->word_count -= 1;
+}
+
+
+/*
+ * Tells whether enough has been flushed: TRUE once both the cached detail
+ * count and the cached word count are at or below their minimum marks.
+ */
+static inline gboolean
+is_min_flush_done ()
+{
+	if (tracker->word_detail_count > tracker->word_detail_min) {
+		return FALSE;
+	}
+
+	return (tracker->word_count <= tracker->word_count_min);
+}
+
+/* Returns a single slice-allocated WordDetails to the GLib slice allocator. */
+static void
+delete_word_detail (WordDetails *wd)
+{
+	g_slice_free (WordDetails, wd);
+}
+
+/*
+ * Flushes cached words to the indexer, starting with the words that have
+ * the fewest cached entries, and stops as soon as is_min_flush_done()
+ * reports that the counters are back at or below their minimum marks.
+ */
+void
+tracker_flush_rare_words ()
+{
+	GSList *keys, *cur;
+
+	tracker_log ("flushing rare words");
+
+	/* key pointers only - the strings are still owned by the table */
+	keys = g_hash_table_key_slist (tracker->cached_table);
+	keys = g_slist_sort (keys, (GCompareFunc) sort_func);
+
+	cur = keys;
+	while (cur && !is_min_flush_done ()) {
+		char	*word = cur->data;
+		GSList	*details = g_hash_table_lookup (tracker->cached_table, word);
+
+		flush_list (details, word);
+
+		/* word is the table's own key, so it is not touched again after this */
+		g_hash_table_remove (tracker->cached_table, word);
+
+		cur = cur->next;
+	}
+
+	g_slist_free (keys);
+}
+
+
+/*
+ * GHFunc: flushes the entry list (@value) cached for the word @key.
+ * Uses the exact GHFunc signature (void return) so that it is not called
+ * through an incompatible function pointer type.
+ */
+static void
+flush_all (gpointer key,
+	   gpointer value,
+	   gpointer data)
+{
+	flush_list (value, key);
+}
+
+
+/*
+ * Flushes every cached word to the indexer and replaces the cache table
+ * with a fresh, empty one.
+ *
+ * The table may not exist yet on the first call, so the flush and destroy
+ * steps are skipped in that case instead of handing NULL to GLib; the
+ * empty table is still created so later calls find one.
+ */
+void
+tracker_flush_all_words ()
+{
+	tracker_log ("flushing all words");
+
+	if (tracker->cached_table) {
+		g_hash_table_foreach (tracker->cached_table, flush_all, NULL);
+
+		/* flush_list already freed the value lists; destroying the table frees the key strings */
+		g_hash_table_destroy (tracker->cached_table);
+	}
+
+	tracker->cached_table = g_hash_table_new_full (g_str_hash, g_str_equal, g_free, NULL);
+}
+
+
diff --git a/src/trackerd/tracker-utils.h b/src/trackerd/tracker-utils.h
index 8e56d1619..88931f02f 100644
--- a/src/trackerd/tracker-utils.h
+++ b/src/trackerd/tracker-utils.h
@@ -130,7 +130,10 @@ typedef struct {
GHashTable *stop_words; /* table of stop words that are to be ignored by the parser */
gboolean use_pango_word_break;
+ gboolean index_numbers;
+
gboolean first_time_index;
+ gboolean first_flush;
gboolean do_optimize;
@@ -175,7 +178,16 @@ typedef struct {
int flush_count;
int min_flush;
int flush_by_file;
-
+
+ /* cache words when we can use extra memory */
+ GHashTable *cached_table;
+ GMutex *cache_table_mutex;
+ int word_detail_limit;
+ int word_detail_count;
+ int word_detail_min;
+ int word_count;
+ int word_count_limit;
+ int word_count_min;
GSList *poll_list;
@@ -410,6 +422,9 @@ gboolean tracker_is_dir_polled (const char *dir);
void tracker_throttle (int multiplier);
+void tracker_flush_all_words ();
+void tracker_flush_rare_words ();
+
void tracker_notify_file_data_available (void);
void tracker_notify_meta_data_available (void);
void tracker_notify_request_data_available (void);
diff --git a/src/trackerd/trackerd.c b/src/trackerd/trackerd.c
index 34547ac77..228d984cb 100644
--- a/src/trackerd/trackerd.c
+++ b/src/trackerd/trackerd.c
@@ -135,7 +135,7 @@ static char **watch_dirs = NULL;
static char *language = NULL;
static gboolean disable_indexing = FALSE;
static gboolean low_memory, turbo, enable_debug, enable_evolution, enable_thunderbird, enable_kmail;
-static int throttle = 5, throttle_battery = 15;
+static int throttle = 0, throttle_battery = 10;
static GOptionEntry entries[] = {
{"exclude-dir", 'e', 0, G_OPTION_ARG_STRING_ARRAY, &no_watch_dirs, N_("Directory to exclude from indexing"), N_("/PATH/DIR")},
@@ -1221,9 +1221,9 @@ scan_directory (const char *uri, DBConnection *db_con)
}
-
+/*
static gboolean
-start_watching (gpointer data)
+start_watching ()
{
if (!tracker->is_running) {
return FALSE;
@@ -1235,52 +1235,19 @@ start_watching (gpointer data)
exit (1);
} else {
- /* start emails watching */
+
//tracker_email_watch_emails (main_thread_db_con);
- if (data) {
- char *watch_folder;
- int len;
-
- watch_folder = (char *) data;
-
- if (!watch_folder || watch_folder[0] != '/') {
- g_free (watch_folder);
- return FALSE;
- }
-
- len = strlen (watch_folder);
-
- if (watch_folder[len-1] == G_DIR_SEPARATOR) {
- watch_folder[len-1] = '\0';
- }
-
-
- watch_dir (watch_folder, main_thread_db_con);
- schedule_dir_check (watch_folder, main_thread_db_con);
- g_free (watch_folder);
-
- } else {
- g_slist_foreach (tracker->watch_directory_roots_list, (GFunc) watch_dir, main_thread_db_con);
- g_slist_foreach (tracker->watch_directory_roots_list, (GFunc) schedule_dir_check, main_thread_db_con);
- }
+ g_slist_foreach (tracker->watch_directory_roots_list, (GFunc) watch_dir, main_thread_db_con);
+ g_slist_foreach (tracker->watch_directory_roots_list, (GFunc) schedule_dir_check, main_thread_db_con);
tracker_notify_file_data_available ();
- tracker->is_dir_scan = FALSE;
-
- tracker_log ("waiting for file events...");
-
- /*if (tracker->first_time_index) {
- tracker->do_optimize = TRUE;
- g_timeout_add (5000, (GSourceFunc) optimize_when_indexing_finished, NULL);
- }*/
-
}
return FALSE;
}
-
+*/
static void
extract_metadata_thread (void)
@@ -1444,7 +1411,7 @@ extract_metadata_thread (void)
if (!tracker->is_indexing) {
tracker->is_indexing = TRUE;
- g_timeout_add (3000, (GSourceFunc) flush_when_indexing_finished, NULL);
+ //g_timeout_add (3000, (GSourceFunc) flush_when_indexing_finished, NULL);
}
if (info->service_type_id == -1) {
@@ -1686,7 +1653,13 @@ process_files_thread (void)
break;
}
}
+
+ if (tracker->word_detail_count > tracker->word_detail_limit || tracker->word_count > tracker->word_count_limit) {
+ tracker_flush_rare_words ();
+ }
+
+ /*
if (!tracker->in_flush && (tracker->number_of_cached_words > tracker->cache_word_limit)) {
int words_left;
tracker->in_flush = TRUE;
@@ -1715,12 +1688,15 @@ process_files_thread (void)
tracker->number_of_cached_words = words_left;
tracker_log ("flushing data (%d words left) to inverted word index - please wait", words_left);
}
-
+ if (tracker->first_time_index && tracker->first_flush) {
+ tracker->first_flush = FALSE;
+ tracker_db_exec_no_reply (cache_db_con, "ANALYZE");
+ }
tracker->in_flush = FALSE;
}
-
+*/
info = g_async_queue_try_pop (tracker->file_process_queue);
@@ -1763,6 +1739,19 @@ process_files_thread (void)
}
+ /* flush all words if nothing left to do before sleeping */
+ tracker_flush_all_words ();
+
+ if (tracker->is_running && (tracker->first_time_index || tracker->do_optimize || (tracker->update_count > tracker->optimization_count))) {
+
+ tracker_indexer_optimize (tracker->file_indexer);
+
+ tracker->do_optimize = FALSE;
+ tracker->first_time_index = FALSE;
+ tracker->update_count = 0;
+
+ }
+
/* we have no stuff to process so sleep until awoken by a new signal */
g_debug ("File thread sleeping");
@@ -1773,7 +1762,7 @@ process_files_thread (void)
/* determine if wake up call is new stuff or a shutdown signal */
if (!shutdown) {
- continue;
+
} else {
break;
}
@@ -1985,10 +1974,7 @@ process_files_thread (void)
if (need_index) {
- if (!tracker->is_indexing) {
- tracker->is_indexing = TRUE;
- g_timeout_add (5000, (GSourceFunc) flush_when_indexing_finished, NULL);
- }
+
index_entity (db_con, info);
}
@@ -2522,6 +2508,9 @@ set_defaults ()
tracker->language = g_strdup ("en");
tracker->stop_words = NULL;
tracker->use_pango_word_break = FALSE;
+ tracker->index_numbers = FALSE;
+
+ tracker->first_flush = TRUE;
}
@@ -2612,12 +2601,27 @@ sanity_check_option_values ()
tracker_log ("Stemmer enabled : \t\t\t%s", bools[tracker->use_stemmer]);
tracker_log ("Using Pango word breaking : \t\t%s\n\n", bools[tracker->use_pango_word_break]);
+ tracker->word_count = 0;
+ tracker->word_detail_count = 0;
+
if (tracker->use_extra_memory) {
tracker->max_process_queue_size = 5000;
tracker->max_extract_queue_size = 5000;
tracker->cached_word_table = g_hash_table_new_full (g_str_hash, g_str_equal, g_free, NULL);
tracker->cache_word_limit = 20000;
tracker->cache_word_min = 1000;
+
+ tracker->cached_table = g_hash_table_new_full (g_str_hash, g_str_equal, g_free, NULL);
+
+ tracker->word_detail_limit = 200000;
+ tracker->word_detail_min = 100000;
+ tracker->word_count_limit = 20000;
+ tracker->word_count_min = 1000;
+ } else {
+ tracker->word_detail_limit = 100000;
+ tracker->word_detail_min = 50000;
+ tracker->word_count_limit = 10000;
+ tracker->word_count_min = 500;
}
@@ -3028,16 +3032,27 @@ main (int argc, char **argv)
/* schedule the watching of directories so as not to delay start up time*/
+
if (tracker->enable_indexing) {
- g_timeout_add_full (G_PRIORITY_LOW,
- 500,
- (GSourceFunc) start_watching,
- NULL, NULL
- );
+
+ if (!tracker_start_watching ()) {
+ tracker_log ("File monitoring failed to start");
+ do_cleanup ("File watching failure");
+ exit (1);
+ } else {
+
+
+ //tracker_email_watch_emails (main_thread_db_con);
+
+ g_slist_foreach (tracker->watch_directory_roots_list, (GFunc) watch_dir, main_thread_db_con);
+ g_slist_foreach (tracker->watch_directory_roots_list, (GFunc) schedule_dir_check, main_thread_db_con);
+
+ tracker->file_process_thread = g_thread_create ((GThreadFunc) process_files_thread, NULL, FALSE, NULL);
+
+ }
}
- /* execute events and user requests to be processed and indexed in their own threads */
- tracker->file_process_thread = g_thread_create ((GThreadFunc) process_files_thread, NULL, FALSE, NULL);
+
//tracker->file_metadata_thread = g_thread_create ((GThreadFunc) extract_metadata_thread, NULL, FALSE, NULL);
tracker->user_request_thread = g_thread_create ((GThreadFunc) process_user_request_queue_thread, NULL, FALSE, NULL);