diff options
author | Jamie McCracken <jamiemcc@src.gnome.org> | 2007-01-20 00:24:54 +0000 |
---|---|---|
committer | Jamie McCracken <jamiemcc@src.gnome.org> | 2007-01-20 00:24:54 +0000 |
commit | 5c0428636e0ce179977c5073c80a1ddf990657a4 (patch) | |
tree | a5c8b2b19e633929ad1be3b72e4b3d9c152b9f8e | |
parent | 353f3aa0da750be07f9711eda2bed81e88ea4ff5 (diff) | |
download | tracker-5c0428636e0ce179977c5073c80a1ddf990657a4.tar.gz |
big speed-up of indexer, also smoothed it out and fixed bugs
svn path=/trunk/; revision=402
-rw-r--r-- | src/trackerd/depot.c | 6 | ||||
-rw-r--r-- | src/trackerd/tracker-db-sqlite.c | 132 | ||||
-rw-r--r-- | src/trackerd/tracker-indexer.c | 2 | ||||
-rw-r--r-- | src/trackerd/tracker-parser.c | 3 | ||||
-rw-r--r-- | src/trackerd/tracker-utils.c | 145 | ||||
-rw-r--r-- | src/trackerd/tracker-utils.h | 17 | ||||
-rw-r--r-- | src/trackerd/trackerd.c | 125 |
7 files changed, 361 insertions, 69 deletions
diff --git a/src/trackerd/depot.c b/src/trackerd/depot.c index 4ab1d6203..0f350e146 100644 --- a/src/trackerd/depot.c +++ b/src/trackerd/depot.c @@ -263,7 +263,7 @@ DEPOT *dpopen(const char *name, int omode, int bnum){ return NULL; } msiz = DP_HEADSIZ + bnum * sizeof(int); - map = mmap(0, msiz, PROT_READ | ((mode & DP_OWRITER) ? PROT_WRITE : 0), MAP_PRIVATE, fd, 0); + map = mmap(0, msiz, PROT_READ | ((mode & DP_OWRITER) ? PROT_WRITE : 0), MAP_SHARED, fd, 0); if(map == MAP_FAILED){ close(fd); dpecodeset(DP_EMAP, __FILE__, __LINE__); @@ -900,7 +900,7 @@ int dpoptimize(DEPOT *depot, int bnum){ depot->fbpool[i+1] = -1; } depot->msiz = tdepot->msiz; - depot->map = mmap(0, depot->msiz, PROT_READ | PROT_WRITE, MAP_PRIVATE, depot->fd, 0); + depot->map = mmap(0, depot->msiz, PROT_READ | PROT_WRITE, MAP_SHARED, depot->fd, 0); if(depot->map == MAP_FAILED){ dpecodeset(DP_EMAP, __FILE__, __LINE__); depot->fatal = TRUE; @@ -1299,7 +1299,7 @@ char *dpsnaffle(const char *name, const char* kbuf, int ksiz, int *sp){ return NULL; } msiz = DP_HEADSIZ + bnum * sizeof(int); - map = mmap(0, msiz, PROT_READ, MAP_PRIVATE, fd, 0); + map = mmap(0, msiz, PROT_READ, MAP_SHARED, fd, 0); if(map == MAP_FAILED){ close(fd); dpecodeset(DP_EMAP, __FILE__, __LINE__); diff --git a/src/trackerd/tracker-db-sqlite.c b/src/trackerd/tracker-db-sqlite.c index 6eae293ac..578061b73 100644 --- a/src/trackerd/tracker-db-sqlite.c +++ b/src/trackerd/tracker-db-sqlite.c @@ -724,6 +724,7 @@ tracker_db_connect_cache (void) tracker_db_exec_no_reply (db_con, "CREATE INDEX WordWord ON Words (Word)"); tracker_db_exec_no_reply (db_con, "CREATE INDEX WordWordCount ON Words (WordCount)"); tracker_db_exec_no_reply (db_con, "CREATE INDEX ServiceWordID ON ServiceWords (ServiceID)"); + tracker_db_exec_no_reply (db_con, "ANALYZE"); } db_con->thread = NULL; @@ -2806,6 +2807,45 @@ delete_index_for_service (DBConnection *db_con, DBConnection *blob_db_con, guint } +static gint +delete_id_from_list (gpointer key, + gpointer value, + 
gpointer data) +{ + + GSList *list, *l; + WordDetails *wd; + guint32 file_id = GPOINTER_TO_UINT (data); + + list = value; + + for (l=list;l;l=l->next) { + wd = l->data; + if (wd->id == file_id) { + + list = g_slist_remove (list, l->data); + g_slice_free (WordDetails, wd); + value = list; + tracker->word_detail_count--; + + if (g_slist_length (list) == 0) { + tracker->word_count--; + } + + return 1; + } + } + + return 1; +} + + +static void +delete_cache_words (guint32 file_id) +{ + g_hash_table_foreach (tracker->cached_table, (GHFunc) delete_id_from_list, GUINT_TO_POINTER (file_id)); +} + void tracker_db_delete_file (DBConnection *db_con, DBConnection *blob_db_con, guint32 file_id) @@ -2827,12 +2867,14 @@ tracker_db_delete_file (DBConnection *db_con, DBConnection *blob_db_con, guint32 tracker_exec_proc (db_con, "DeleteFile8", 1, str_file_id); tracker_exec_proc (db_con, "DeleteFile9", 1, str_file_id); - if (db_con->user_data) { + delete_cache_words (file_id); + +/* if (db_con->user_data) { tracker_exec_proc (db_con->user_data, "DeleteFile10", 1, str_file_id); } else { tracker_log ("WARNING: Cache DB not found"); } - +*/ tracker_db_end_transaction (db_con); g_free (str_file_id); @@ -2864,10 +2906,12 @@ tracker_db_delete_directory (DBConnection *db_con, DBConnection *blob_db_con, gu id = atoi (row[0]); delete_index_for_service (db_con, blob_db_con, id); - if (db_con->user_data) { + delete_cache_words (id); + +/* if (db_con->user_data) { tracker_exec_proc (db_con->user_data, "DeleteServiceWordForID", 1, row[0]); } - +*/ } @@ -3582,6 +3626,8 @@ add_word_to_hash (const char *word, int id, int count) g_mutex_unlock (tracker->cached_word_table_mutex); } + + static char * cache_word_exists (DBConnection *db_con, const char *word) { @@ -3600,6 +3646,7 @@ cache_word_exists (DBConnection *db_con, const char *word) cache_word->count++; return tracker_int_to_str (cache_word->id); } + } res = tracker_exec_proc (db_con, "GetWordID", 1, word); @@ -3657,6 +3704,31 @@ 
append_cache_word (DBConnection *db_con, const char *word, guint32 service_id, i // tracker_log ("appending word %s", word); + + if (tracker->use_extra_memory) { + WordDetails *word_details; + GSList *list; + + word_details = g_slice_new (WordDetails); + + word_details->id = service_id; + word_details->amalgamated = tracker_indexer_calc_amalgamated (service_type, score); + + list = g_hash_table_lookup (tracker->cached_table, word); + + if (!list) { + tracker->word_count++; + } + + list = g_slist_prepend (list, word_details); + g_hash_table_insert (tracker->cached_table, g_strdup (word), list); + + + tracker->word_detail_count++; + + return; + } + word_id = cache_word_exists (db_con, word); str_service_id = tracker_uint_to_str (service_id); @@ -3671,12 +3743,49 @@ append_cache_word (DBConnection *db_con, const char *word, guint32 service_id, i g_free (str_score); } +static inline guint8 +get_service_type_from_detail (WordDetails *details) +{ + return (details->amalgamated >> 24) & 0xFF; +} -static void +static inline guint16 +get_score_from_detail (WordDetails *details) +{ + unsigned char a[2]; + + a[0] = (details->amalgamated >> 16) & 0xFF; + a[1] = (details->amalgamated >> 8) & 0xFF; + + return (a[0] << 8) | (a[1]); + +} + + +static gboolean update_cache_word (DBConnection *db_con, const char *word, guint32 service_id, int score) { char *str_service_id, *str_score; + if (tracker->use_extra_memory) { + WordDetails *word_details; + GSList *list, *l; + + list = g_hash_table_lookup (tracker->cached_table, word); + + for (l = list; l; l=l->next) { + word_details = l->data; + if (word_details->id == service_id) { + score += get_score_from_detail (word_details); + word_details->amalgamated = tracker_indexer_calc_amalgamated (get_service_type_from_detail (word_details), score); + return TRUE; + } + } + + return FALSE; + } + + str_service_id = tracker_uint_to_str (service_id); str_score = tracker_int_to_str (score); @@ -3684,6 +3793,8 @@ update_cache_word (DBConnection 
*db_con, const char *word, guint32 service_id, i g_free (str_service_id); g_free (str_score); + + return FALSE; } @@ -3885,14 +3996,19 @@ update_index_data (gpointer key, word = (char *) key; score = GPOINTER_TO_INT (value); info = user_data; + + if (!update_cache_word (info->db_con, word, info->service_id, score)) { + tracker_indexer_update_word (tracker->file_indexer, word, info->service_id, info->service_type_id, score, FALSE); + } + + return; + char *str_service_id = tracker_uint_to_str (info->service_id); if (score != 0) { res = tracker_exec_proc (info->db_con, "ServiceCached", 2, str_service_id, word); - g_free (str_service_id); - if (res) { if (res[0] && res[0][0]) { if (res[0][0][0] == '1') { @@ -3909,6 +4025,8 @@ update_index_data (gpointer key, update_cache_word (info->db_con, word, info->service_id, score); } } + + g_free (str_service_id); } diff --git a/src/trackerd/tracker-indexer.c b/src/trackerd/tracker-indexer.c index eb8c2ded1..88356bdb9 100644 --- a/src/trackerd/tracker-indexer.c +++ b/src/trackerd/tracker-indexer.c @@ -32,7 +32,7 @@ typedef struct { static gboolean shutdown; -static inline guint8 +static inline guint16 get_score (WordDetails *details) { unsigned char a[2]; diff --git a/src/trackerd/tracker-parser.c b/src/trackerd/tracker-parser.c index 385943a2d..20df34675 100644 --- a/src/trackerd/tracker-parser.c +++ b/src/trackerd/tracker-parser.c @@ -100,7 +100,8 @@ word_is_valid (const char *word) c = g_utf8_get_char (word); - if (g_unichar_isalnum (c)) { + if (tracker->index_numbers && g_unichar_isalnum (c)) { + return numbered_word_is_valid (word); } diff --git a/src/trackerd/tracker-utils.c b/src/trackerd/tracker-utils.c index be2c391fc..73d07c673 100644 --- a/src/trackerd/tracker-utils.c +++ b/src/trackerd/tracker-utils.c @@ -31,9 +31,11 @@ #include "tracker-dbus.h" #include "tracker-utils.h" +#include "tracker-indexer.h" #include "xdgmime.h" + extern Tracker *tracker; char *implemented_services[] = {"Files", "Folders", "Documents", 
"Images", "Music", "Videos", "Text Files", "Development Files", "Other Files", @@ -2075,7 +2077,7 @@ tracker_load_config_file () if (g_key_file_has_key (key_file, "Indexing", "Throttle", NULL)) { tracker->throttle = g_key_file_get_integer (key_file, "Indexing", "Throttle", NULL); } else { - tracker->throttle = 5; + tracker->throttle = 0; } if (g_key_file_has_key (key_file, "Indexing", "EnableIndexing", NULL)) { @@ -2943,3 +2945,144 @@ tracker_get_snippet (const char *txt, char **terms, int length) return NULL; } } + + +static gint +prepend_key_pointer (gpointer key, + gpointer value, + gpointer data) +{ + GSList **plist = data; + *plist = g_slist_prepend (*plist, key); + return 1; +} + + +static GSList * +g_hash_table_key_slist (GHashTable *table) +{ + GSList *rv = NULL; + g_hash_table_foreach (table, (GHFunc) prepend_key_pointer, &rv); + return rv; +} + + +static gint +sort_func (char *a, char *b) +{ + GSList *lista, *listb; + + lista = g_hash_table_lookup (tracker->cached_table, a); + listb = g_hash_table_lookup (tracker->cached_table, b); + + return (g_slist_length (lista) - g_slist_length (listb)); +} + + +static void +flush_list (GSList *list, const char *word) +{ + WordDetails *word_details, *wd; + int i, count; + GSList *l; + + count = g_slist_length (list); + +// tracker_log ("flushing word %s with count %d", word, count); + + word_details = g_malloc (sizeof (WordDetails) * count); + + i = 0; + for (l=list; (l && i<count); l=l->next) { + + wd = l->data; + word_details[i].id = wd->id; + word_details[i].amalgamated = wd->amalgamated; + i++; + g_slice_free (WordDetails, wd); + + + } + + g_slist_free (list); + + tracker_indexer_append_word_chunk (tracker->file_indexer, word, word_details, count); + + g_free (word_details); + + tracker->update_count++; + tracker->word_detail_count -= count; + tracker->word_count--; + +} + + +static inline gboolean +is_min_flush_done () +{ + return (tracker->word_detail_count <= tracker->word_detail_min) && (tracker->word_count 
<= tracker->word_count_min); + +} + +static void +delete_word_detail (WordDetails *wd) +{ + g_slice_free (WordDetails, wd); + +} + +void +tracker_flush_rare_words () +{ + + GSList *list, *l, *l2; + + tracker_log ("flushing rare words"); + + list = g_hash_table_key_slist (tracker->cached_table); + + list = g_slist_sort (list, (GCompareFunc) sort_func); + + for (l = list; (l && !is_min_flush_done ()); l=l->next) { + char *word = l->data; + + l2 = g_hash_table_lookup (tracker->cached_table, word); + + flush_list (l2, word); + + g_hash_table_remove (tracker->cached_table, word); + + } + + g_slist_free (list); + +} + + +static gint +flush_all (gpointer key, + gpointer value, + gpointer data) +{ + + flush_list (value, key); + + return 1; +} + + +void +tracker_flush_all_words () +{ + tracker_log ("flushing all words"); + + g_hash_table_foreach (tracker->cached_table, (GHFunc) flush_all, NULL); + + g_hash_table_destroy (tracker->cached_table); + + tracker->cached_table = g_hash_table_new_full (g_str_hash, g_str_equal, g_free, NULL); + + + +} + diff --git a/src/trackerd/tracker-utils.h b/src/trackerd/tracker-utils.h index 8e56d1619..88931f02f 100644 --- a/src/trackerd/tracker-utils.h +++ b/src/trackerd/tracker-utils.h @@ -130,7 +130,10 @@ typedef struct { GHashTable *stop_words; /* table of stop words that are to be ignored by the parser */ gboolean use_pango_word_break; + gboolean index_numbers; + gboolean first_time_index; + gboolean first_flush; gboolean do_optimize; @@ -175,7 +178,16 @@ typedef struct { int flush_count; int min_flush; int flush_by_file; - + + /* cache words when we can use extra memory */ + GHashTable *cached_table; + GMutex *cache_table_mutex; + int word_detail_limit; + int word_detail_count; + int word_detail_min; + int word_count; + int word_count_limit; + int word_count_min; GSList *poll_list; @@ -410,6 +422,9 @@ gboolean tracker_is_dir_polled (const char *dir); void tracker_throttle (int multiplier); +void tracker_flush_all_words (); +void 
tracker_flush_rare_words (); + void tracker_notify_file_data_available (void); void tracker_notify_meta_data_available (void); void tracker_notify_request_data_available (void); diff --git a/src/trackerd/trackerd.c b/src/trackerd/trackerd.c index 34547ac77..228d984cb 100644 --- a/src/trackerd/trackerd.c +++ b/src/trackerd/trackerd.c @@ -135,7 +135,7 @@ static char **watch_dirs = NULL; static char *language = NULL; static gboolean disable_indexing = FALSE; static gboolean low_memory, turbo, enable_debug, enable_evolution, enable_thunderbird, enable_kmail; -static int throttle = 5, throttle_battery = 15; +static int throttle = 0, throttle_battery = 10; static GOptionEntry entries[] = { {"exclude-dir", 'e', 0, G_OPTION_ARG_STRING_ARRAY, &no_watch_dirs, N_("Directory to exclude from indexing"), N_("/PATH/DIR")}, @@ -1221,9 +1221,9 @@ scan_directory (const char *uri, DBConnection *db_con) } - +/* static gboolean -start_watching (gpointer data) +start_watching () { if (!tracker->is_running) { return FALSE; @@ -1235,52 +1235,19 @@ start_watching (gpointer data) exit (1); } else { - /* start emails watching */ + //tracker_email_watch_emails (main_thread_db_con); - if (data) { - char *watch_folder; - int len; - - watch_folder = (char *) data; - - if (!watch_folder || watch_folder[0] != '/') { - g_free (watch_folder); - return FALSE; - } - - len = strlen (watch_folder); - - if (watch_folder[len-1] == G_DIR_SEPARATOR) { - watch_folder[len-1] = '\0'; - } - - - watch_dir (watch_folder, main_thread_db_con); - schedule_dir_check (watch_folder, main_thread_db_con); - g_free (watch_folder); - - } else { - g_slist_foreach (tracker->watch_directory_roots_list, (GFunc) watch_dir, main_thread_db_con); - g_slist_foreach (tracker->watch_directory_roots_list, (GFunc) schedule_dir_check, main_thread_db_con); - } + g_slist_foreach (tracker->watch_directory_roots_list, (GFunc) watch_dir, main_thread_db_con); + g_slist_foreach (tracker->watch_directory_roots_list, (GFunc) schedule_dir_check, 
main_thread_db_con); tracker_notify_file_data_available (); - tracker->is_dir_scan = FALSE; - - tracker_log ("waiting for file events..."); - - /*if (tracker->first_time_index) { - tracker->do_optimize = TRUE; - g_timeout_add (5000, (GSourceFunc) optimize_when_indexing_finished, NULL); - }*/ - } return FALSE; } - +*/ static void extract_metadata_thread (void) @@ -1444,7 +1411,7 @@ extract_metadata_thread (void) if (!tracker->is_indexing) { tracker->is_indexing = TRUE; - g_timeout_add (3000, (GSourceFunc) flush_when_indexing_finished, NULL); + //g_timeout_add (3000, (GSourceFunc) flush_when_indexing_finished, NULL); } if (info->service_type_id == -1) { @@ -1686,7 +1653,13 @@ process_files_thread (void) break; } } + + if (tracker->word_detail_count > tracker->word_detail_limit || tracker->word_count > tracker->word_count_limit) { + tracker_flush_rare_words (); + } + + /* if (!tracker->in_flush && (tracker->number_of_cached_words > tracker->cache_word_limit)) { int words_left; tracker->in_flush = TRUE; @@ -1715,12 +1688,15 @@ process_files_thread (void) tracker->number_of_cached_words = words_left; tracker_log ("flushing data (%d words left) to inverted word index - please wait", words_left); } - + if (tracker->first_time_index && tracker->first_flush) { + tracker->first_flush = FALSE; + tracker_db_exec_no_reply (cache_db_con, "ANALYZE"); + } tracker->in_flush = FALSE; } - +*/ info = g_async_queue_try_pop (tracker->file_process_queue); @@ -1763,6 +1739,19 @@ process_files_thread (void) } + /* flush all words if nothing left to do before sleeping */ + tracker_flush_all_words (); + + if (tracker->is_running && (tracker->first_time_index || tracker->do_optimize || (tracker->update_count > tracker->optimization_count))) { + + tracker_indexer_optimize (tracker->file_indexer); + + tracker->do_optimize = FALSE; + tracker->first_time_index = FALSE; + tracker->update_count = 0; + + } + /* we have no stuff to process so sleep until awoken by a new signal */ g_debug ("File 
thread sleeping"); @@ -1773,7 +1762,7 @@ process_files_thread (void) /* determine if wake up call is new stuff or a shutdown signal */ if (!shutdown) { - continue; + } else { break; } @@ -1985,10 +1974,7 @@ process_files_thread (void) if (need_index) { - if (!tracker->is_indexing) { - tracker->is_indexing = TRUE; - g_timeout_add (5000, (GSourceFunc) flush_when_indexing_finished, NULL); - } + index_entity (db_con, info); } @@ -2522,6 +2508,9 @@ set_defaults () tracker->language = g_strdup ("en"); tracker->stop_words = NULL; tracker->use_pango_word_break = FALSE; + tracker->index_numbers = FALSE; + + tracker->first_flush = TRUE; } @@ -2612,12 +2601,27 @@ sanity_check_option_values () tracker_log ("Stemmer enabled : \t\t\t%s", bools[tracker->use_stemmer]); tracker_log ("Using Pango word breaking : \t\t%s\n\n", bools[tracker->use_pango_word_break]); + tracker->word_count = 0; + tracker->word_detail_count = 0; + if (tracker->use_extra_memory) { tracker->max_process_queue_size = 5000; tracker->max_extract_queue_size = 5000; tracker->cached_word_table = g_hash_table_new_full (g_str_hash, g_str_equal, g_free, NULL); tracker->cache_word_limit = 20000; tracker->cache_word_min = 1000; + + tracker->cached_table = g_hash_table_new_full (g_str_hash, g_str_equal, g_free, NULL); + + tracker->word_detail_limit = 200000; + tracker->word_detail_min = 100000; + tracker->word_count_limit = 20000; + tracker->word_count_min = 1000; + } else { + tracker->word_detail_limit = 100000; + tracker->word_detail_min = 50000; + tracker->word_count_limit = 10000; + tracker->word_count_min = 500; } @@ -3028,16 +3032,27 @@ main (int argc, char **argv) /* schedule the watching of directories so as not to delay start up time*/ + if (tracker->enable_indexing) { - g_timeout_add_full (G_PRIORITY_LOW, - 500, - (GSourceFunc) start_watching, - NULL, NULL - ); + + if (!tracker_start_watching ()) { + tracker_log ("File monitoring failed to start"); + do_cleanup ("File watching failure"); + exit (1); + } else { 
+ + + //tracker_email_watch_emails (main_thread_db_con); + + g_slist_foreach (tracker->watch_directory_roots_list, (GFunc) watch_dir, main_thread_db_con); + g_slist_foreach (tracker->watch_directory_roots_list, (GFunc) schedule_dir_check, main_thread_db_con); + + tracker->file_process_thread = g_thread_create ((GThreadFunc) process_files_thread, NULL, FALSE, NULL); + + } } - /* execute events and user requests to be processed and indexed in their own threads */ - tracker->file_process_thread = g_thread_create ((GThreadFunc) process_files_thread, NULL, FALSE, NULL); + //tracker->file_metadata_thread = g_thread_create ((GThreadFunc) extract_metadata_thread, NULL, FALSE, NULL); tracker->user_request_thread = g_thread_create ((GThreadFunc) process_user_request_queue_thread, NULL, FALSE, NULL); |