diff options
author | Nirbhay Choubey <nirbhay@skysql.com> | 2014-08-11 23:55:41 -0400 |
---|---|---|
committer | Nirbhay Choubey <nirbhay@skysql.com> | 2014-08-11 23:55:41 -0400 |
commit | 8358dd53b7406deaa9f50ad09b16a86b7e367632 (patch) | |
tree | ef8995ad0e400cb6a1842649c3c886c7b3474835 /storage/innobase | |
parent | e06e12f5b8dfe0ab2e5976eec1b27b25d318441b (diff) | |
parent | 4105cbf4a230c82ea7dee31d4d2262b798fad9f4 (diff) | |
download | mariadb-git-8358dd53b7406deaa9f50ad09b16a86b7e367632.tar.gz |
bzr merge -r4346 maria/10.0 (maria-10.0.13)
Diffstat (limited to 'storage/innobase')
36 files changed, 1325 insertions, 503 deletions
diff --git a/storage/innobase/btr/btr0cur.cc b/storage/innobase/btr/btr0cur.cc index 1d2f313a07c..34a72f360be 100644 --- a/storage/innobase/btr/btr0cur.cc +++ b/storage/innobase/btr/btr0cur.cc @@ -202,15 +202,6 @@ btr_rec_free_externally_stored_fields( mtr_t* mtr); /*!< in: mini-transaction handle which contains an X-latch to record page and to the index tree */ -/***********************************************************//** -Gets the externally stored size of a record, in units of a database page. -@return externally stored part, in units of a database page */ -static -ulint -btr_rec_get_externally_stored_len( -/*==============================*/ - const rec_t* rec, /*!< in: record */ - const ulint* offsets);/*!< in: array returned by rec_get_offsets() */ #endif /* !UNIV_HOTBACKUP */ /******************************************************//** @@ -271,6 +262,7 @@ btr_cur_latch_leaves( case BTR_MODIFY_TREE: /* x-latch also brothers from left to right */ left_page_no = btr_page_get_prev(page, mtr); + mode = latch_mode; if (left_page_no != FIL_NULL) { get_block = btr_block_get( @@ -4043,15 +4035,15 @@ btr_rec_get_field_ref_offs( #define btr_rec_get_field_ref(rec, offsets, n) \ ((rec) + btr_rec_get_field_ref_offs(offsets, n)) -/***********************************************************//** -Gets the externally stored size of a record, in units of a database page. +/** Gets the externally stored size of a record, in units of a database page. 
+@param[in] rec record +@param[in] offsets array returned by rec_get_offsets() @return externally stored part, in units of a database page */ -static + ulint btr_rec_get_externally_stored_len( -/*==============================*/ - const rec_t* rec, /*!< in: record */ - const ulint* offsets)/*!< in: array returned by rec_get_offsets() */ + const rec_t* rec, + const ulint* offsets) { ulint n_fields; ulint total_extern_len = 0; diff --git a/storage/innobase/buf/buf0flu.cc b/storage/innobase/buf/buf0flu.cc index 3cce75abe74..fa2edb90b8e 100644 --- a/storage/innobase/buf/buf0flu.cc +++ b/storage/innobase/buf/buf0flu.cc @@ -2183,6 +2183,10 @@ af_get_pct_for_dirty() { ulint dirty_pct = buf_get_modified_ratio_pct(); + if (dirty_pct > 0 && srv_max_buf_pool_modified_pct == 0) { + return(100); + } + ut_a(srv_max_dirty_pages_pct_lwm <= srv_max_buf_pool_modified_pct); diff --git a/storage/innobase/buf/buf0lru.cc b/storage/innobase/buf/buf0lru.cc index ec30c063a72..64409e1993d 100644 --- a/storage/innobase/buf/buf0lru.cc +++ b/storage/innobase/buf/buf0lru.cc @@ -2263,6 +2263,24 @@ buf_LRU_block_remove_hashed( " in the hash table\n", (ulong) bpage->space, (ulong) bpage->offset); +#ifdef UNIV_DEBUG + fprintf(stderr, + "InnoDB: in_page_hash %lu in_zip_hash %lu\n" + " in_free_list %lu in_flush_list %lu in_LRU_list %lu\n" + " zip.data %p zip_size %lu page_state %d\n", + bpage->in_page_hash, bpage->in_zip_hash, + bpage->in_free_list, bpage->in_flush_list, + bpage->in_LRU_list, bpage->zip.data, + buf_page_get_zip_size(bpage), + buf_page_get_state(bpage)); +#else + fprintf(stderr, + "InnoDB: zip.data %p zip_size %lu page_state %d\n", + bpage->zip.data, + buf_page_get_zip_size(bpage), + buf_page_get_state(bpage)); +#endif + if (hashed_bpage) { fprintf(stderr, "InnoDB: In hash table we find block" diff --git a/storage/innobase/dict/dict0dict.cc b/storage/innobase/dict/dict0dict.cc index 86a903d925e..c53f7e82f58 100644 --- a/storage/innobase/dict/dict0dict.cc +++ 
b/storage/innobase/dict/dict0dict.cc @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 1996, 2014, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2012, Facebook Inc. This program is free software; you can redistribute it and/or modify it under @@ -50,6 +50,7 @@ UNIV_INTERN dict_index_t* dict_ind_compact; #include "btr0btr.h" #include "btr0cur.h" #include "btr0sea.h" +#include "os0once.h" #include "page0zip.h" #include "page0page.h" #include "pars0pars.h" @@ -102,7 +103,7 @@ UNIV_INTERN ulong zip_pad_max = 50; UNIV_INTERN mysql_pfs_key_t dict_operation_lock_key; UNIV_INTERN mysql_pfs_key_t index_tree_rw_lock_key; UNIV_INTERN mysql_pfs_key_t index_online_log_key; -UNIV_INTERN mysql_pfs_key_t dict_table_stats_latch_key; +UNIV_INTERN mysql_pfs_key_t dict_table_stats_key; #endif /* UNIV_PFS_RWLOCK */ #ifdef UNIV_PFS_MUTEX @@ -121,6 +122,11 @@ UNIV_INTERN mysql_pfs_key_t dict_foreign_err_mutex_key; /** Identifies generated InnoDB foreign key names */ static char dict_ibfk[] = "_ibfk_"; +bool innodb_table_stats_not_found = false; +bool innodb_index_stats_not_found = false; +static bool innodb_table_stats_not_found_reported = false; +static bool innodb_index_stats_not_found_reported = false; + /*******************************************************************//** Tries to find column names for the index and sets the col field of the index. @@ -319,6 +325,82 @@ dict_mutex_exit_for_mysql(void) mutex_exit(&(dict_sys->mutex)); } +/** Allocate and init a dict_table_t's stats latch. +This function must not be called concurrently on the same table object. 
+@param[in,out] table_void table whose stats latch to create */ +static +void +dict_table_stats_latch_alloc( + void* table_void) +{ + dict_table_t* table = static_cast<dict_table_t*>(table_void); + + table->stats_latch = new(std::nothrow) rw_lock_t; + + ut_a(table->stats_latch != NULL); + + rw_lock_create(dict_table_stats_key, table->stats_latch, + SYNC_INDEX_TREE); +} + +/** Deinit and free a dict_table_t's stats latch. +This function must not be called concurrently on the same table object. +@param[in,out] table table whose stats latch to free */ +static +void +dict_table_stats_latch_free( + dict_table_t* table) +{ + rw_lock_free(table->stats_latch); + delete table->stats_latch; +} + +/** Create a dict_table_t's stats latch or delay for lazy creation. +This function is only called from either single threaded environment +or from a thread that has not shared the table object with other threads. +@param[in,out] table table whose stats latch to create +@param[in] enabled if false then the latch is disabled +and dict_table_stats_lock()/unlock() become noop on this table. */ + +void +dict_table_stats_latch_create( + dict_table_t* table, + bool enabled) +{ + if (!enabled) { + table->stats_latch = NULL; + table->stats_latch_created = os_once::DONE; + return; + } + +#ifdef HAVE_ATOMIC_BUILTINS + /* We create this lazily the first time it is used. */ + table->stats_latch = NULL; + table->stats_latch_created = os_once::NEVER_DONE; +#else /* HAVE_ATOMIC_BUILTINS */ + + dict_table_stats_latch_alloc(table); + + table->stats_latch_created = os_once::DONE; +#endif /* HAVE_ATOMIC_BUILTINS */ +} + +/** Destroy a dict_table_t's stats latch. +This function is only called from either single threaded environment +or from a thread that has not shared the table object with other threads. 
+@param[in,out] table table whose stats latch to destroy */ + +void +dict_table_stats_latch_destroy( + dict_table_t* table) +{ + if (table->stats_latch_created == os_once::DONE + && table->stats_latch != NULL) { + + dict_table_stats_latch_free(table); + } +} + /**********************************************************************//** Lock the appropriate latch to protect a given table's statistics. */ UNIV_INTERN @@ -331,6 +413,14 @@ dict_table_stats_lock( ut_ad(table != NULL); ut_ad(table->magic_n == DICT_TABLE_MAGIC_N); +#ifdef HAVE_ATOMIC_BUILTINS + os_once::do_or_wait_for_done( + &table->stats_latch_created, + dict_table_stats_latch_alloc, table); +#else /* HAVE_ATOMIC_BUILTINS */ + ut_ad(table->stats_latch_created == os_once::DONE); +#endif /* HAVE_ATOMIC_BUILTINS */ + if (table->stats_latch == NULL) { /* This is a dummy table object that is private in the current thread and is not shared between multiple threads, thus we @@ -5212,8 +5302,6 @@ dict_table_print( index = UT_LIST_GET_NEXT(indexes, index); } - table->stat_initialized = FALSE; - dict_table_stats_unlock(table, RW_X_LATCH); foreign = UT_LIST_GET_FIRST(table->foreign_list); @@ -6016,14 +6104,34 @@ dict_table_schema_check( table = dict_table_get_low(req_schema->table_name); if (table == NULL) { + bool should_print=true; /* no such table */ - ut_snprintf(errstr, errstr_sz, - "Table %s not found.", - ut_format_name(req_schema->table_name, - TRUE, buf, sizeof(buf))); + if (innobase_strcasecmp(req_schema->table_name, "mysql/innodb_table_stats") == 0) { + if (innodb_table_stats_not_found_reported == false) { + innodb_table_stats_not_found = true; + innodb_table_stats_not_found_reported = true; + } else { + should_print = false; + } + } else if (innobase_strcasecmp(req_schema->table_name, "mysql/innodb_index_stats") == 0 ) { + if (innodb_index_stats_not_found_reported == false) { + innodb_index_stats_not_found = true; + innodb_index_stats_not_found_reported = true; + } else { + should_print = false; + } + } 
- return(DB_TABLE_NOT_FOUND); + if (should_print) { + ut_snprintf(errstr, errstr_sz, + "Table %s not found.", + ut_format_name(req_schema->table_name, + TRUE, buf, sizeof(buf))); + return(DB_TABLE_NOT_FOUND); + } else { + return(DB_STATS_DO_NOT_EXIST); + } } if (table->ibd_file_missing) { diff --git a/storage/innobase/dict/dict0mem.cc b/storage/innobase/dict/dict0mem.cc index 60daeea3a96..6310b2fd225 100644 --- a/storage/innobase/dict/dict0mem.cc +++ b/storage/innobase/dict/dict0mem.cc @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 1996, 2014, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2012, Facebook Inc. This program is free software; you can redistribute it and/or modify it under @@ -95,9 +95,9 @@ dict_mem_table_create( ut_d(table->magic_n = DICT_TABLE_MAGIC_N); - table->stats_latch = new rw_lock_t; - rw_lock_create(dict_table_stats_latch_key, table->stats_latch, - SYNC_INDEX_TREE); + /* true means that the stats latch will be enabled - + dict_table_stats_lock() will not be noop. */ + dict_table_stats_latch_create(table, true); #ifndef UNIV_HOTBACKUP table->autoinc_lock = static_cast<ib_lock_t*>( @@ -154,8 +154,7 @@ dict_mem_table_free( mutex_free(&(table->autoinc_mutex)); #endif /* UNIV_HOTBACKUP */ - rw_lock_free(table->stats_latch); - delete table->stats_latch; + dict_table_stats_latch_destroy(table); ut_free(table->name); mem_heap_free(table->heap); diff --git a/storage/innobase/dict/dict0stats.cc b/storage/innobase/dict/dict0stats.cc index 928bdb3f2ef..1eac9e0df51 100644 --- a/storage/innobase/dict/dict0stats.cc +++ b/storage/innobase/dict/dict0stats.cc @@ -46,6 +46,7 @@ Created Jan 06, 2010 Vasil Dimov #include "ut0rnd.h" /* ut_rnd_interval() */ #include "ut0ut.h" /* ut_format_name(), ut_time() */ +#include <algorithm> #include <map> #include <vector> @@ -127,10 +128,11 @@ where n=1..n_uniq. 
#endif /* UNIV_STATS_DEBUG */ /* Gets the number of leaf pages to sample in persistent stats estimation */ -#define N_SAMPLE_PAGES(index) \ - ((index)->table->stats_sample_pages != 0 ? \ - (index)->table->stats_sample_pages : \ - srv_stats_persistent_sample_pages) +#define N_SAMPLE_PAGES(index) \ + static_cast<ib_uint64_t>( \ + (index)->table->stats_sample_pages != 0 \ + ? (index)->table->stats_sample_pages \ + : srv_stats_persistent_sample_pages) /* number of distinct records on a given level that are required to stop descending to lower levels and fetch N_SAMPLE_PAGES(index) records @@ -268,10 +270,12 @@ dict_stats_persistent_storage_check( mutex_exit(&(dict_sys->mutex)); } - if (ret != DB_SUCCESS) { + if (ret != DB_SUCCESS && ret != DB_STATS_DO_NOT_EXIST) { ut_print_timestamp(stderr); fprintf(stderr, " InnoDB: Error: %s\n", errstr); return(false); + } else if (ret == DB_STATS_DO_NOT_EXIST) { + return false; } /* else */ @@ -430,9 +434,9 @@ dict_stats_table_clone_create( t->corrupted = table->corrupted; /* This private object "t" is not shared with other threads, so - we do not need the stats_latch. The lock/unlock routines will do - nothing if stats_latch is NULL. */ - t->stats_latch = NULL; + we do not need the stats_latch (thus we pass false below). The + dict_table_stats_lock()/unlock() routines will do nothing. */ + dict_table_stats_latch_create(t, false); UT_LIST_INIT(t->indexes); @@ -508,6 +512,7 @@ dict_stats_table_clone_free( /*========================*/ dict_table_t* t) /*!< in: dummy table object to free */ { + dict_table_stats_latch_destroy(t); mem_heap_free(t->heap); } @@ -1283,35 +1288,40 @@ enum page_scan_method_t { }; /* @} */ -/*********************************************************************//** -Scan a page, reading records from left to right and counting the number -of distinct records on that page (looking only at the first n_prefix -columns). 
If scan_method is QUIT_ON_FIRST_NON_BORING then the function +/** Scan a page, reading records from left to right and counting the number +of distinct records (looking only at the first n_prefix +columns) and the number of external pages pointed by records from this page. +If scan_method is QUIT_ON_FIRST_NON_BORING then the function will return as soon as it finds a record that does not match its neighbor to the right, which means that in the case of QUIT_ON_FIRST_NON_BORING the returned n_diff can either be 0 (empty page), 1 (the whole page has all keys equal) or 2 (the function found a non-boring record and returned). +@param[out] out_rec record, or NULL +@param[out] offsets1 rec_get_offsets() working space (must +be big enough) +@param[out] offsets2 rec_get_offsets() working space (must +be big enough) +@param[in] index index of the page +@param[in] page the page to scan +@param[in] n_prefix look at the first n_prefix columns +@param[in] scan_method scan to the end of the page or not +@param[out] n_diff number of distinct records encountered +@param[out] n_external_pages if this is non-NULL then it will be set +to the number of externally stored pages which were encountered @return offsets1 or offsets2 (the offsets of *out_rec), or NULL if the page is empty and does not contain user records. 
*/ -UNIV_INLINE __attribute__((nonnull)) +UNIV_INLINE ulint* dict_stats_scan_page( -/*=================*/ - const rec_t** out_rec, /*!< out: record, or NULL */ - ulint* offsets1, /*!< out: rec_get_offsets() - working space (must be big - enough) */ - ulint* offsets2, /*!< out: rec_get_offsets() - working space (must be big - enough) */ - dict_index_t* index, /*!< in: index of the page */ - const page_t* page, /*!< in: the page to scan */ - ulint n_prefix, /*!< in: look at the first - n_prefix columns */ - page_scan_method_t scan_method, /*!< in: scan to the end of - the page or not */ - ib_uint64_t* n_diff) /*!< out: number of distinct - records encountered */ + const rec_t** out_rec, + ulint* offsets1, + ulint* offsets2, + dict_index_t* index, + const page_t* page, + ulint n_prefix, + page_scan_method_t scan_method, + ib_uint64_t* n_diff, + ib_uint64_t* n_external_pages) { ulint* offsets_rec = offsets1; ulint* offsets_next_rec = offsets2; @@ -1329,6 +1339,12 @@ dict_stats_scan_page( get_next = page_rec_get_next_const; } + const bool should_count_external_pages = n_external_pages != NULL; + + if (should_count_external_pages) { + *n_external_pages = 0; + } + rec = get_next(page_get_infimum_rec(page)); if (page_rec_is_supremum(rec)) { @@ -1341,6 +1357,11 @@ dict_stats_scan_page( offsets_rec = rec_get_offsets(rec, index, offsets_rec, ULINT_UNDEFINED, &heap); + if (should_count_external_pages) { + *n_external_pages += btr_rec_get_externally_stored_len( + rec, offsets_rec); + } + next_rec = get_next(rec); *n_diff = 1; @@ -1391,6 +1412,11 @@ dict_stats_scan_page( offsets_next_rec = offsets_tmp; } + if (should_count_external_pages) { + *n_external_pages += btr_rec_get_externally_stored_len( + rec, offsets_rec); + } + next_rec = get_next(next_rec); } @@ -1401,19 +1427,25 @@ func_exit: return(offsets_rec); } -/*********************************************************************//** -Dive below the current position of a cursor and calculate the number of +/** Dive below the 
current position of a cursor and calculate the number of distinct records on the leaf page, when looking at the fist n_prefix -columns. +columns. Also calculate the number of external pages pointed by records +on the leaf page. +@param[in] cur cursor +@param[in] n_prefix look at the first n_prefix columns +when comparing records +@param[out] n_diff number of distinct records +@param[out] n_external_pages number of external pages +@param[in,out] mtr mini-transaction @return number of distinct records on the leaf page */ static -ib_uint64_t +void dict_stats_analyze_index_below_cur( -/*===============================*/ - const btr_cur_t*cur, /*!< in: cursor */ - ulint n_prefix, /*!< in: look at the first n_prefix - columns when comparing records */ - mtr_t* mtr) /*!< in/out: mini-transaction */ + const btr_cur_t* cur, + ulint n_prefix, + ib_uint64_t* n_diff, + ib_uint64_t* n_external_pages, + mtr_t* mtr) { dict_index_t* index; ulint space; @@ -1426,7 +1458,6 @@ dict_stats_analyze_index_below_cur( ulint* offsets1; ulint* offsets2; ulint* offsets_rec; - ib_uint64_t n_diff; /* the result */ ulint size; index = btr_cur_get_index(cur); @@ -1462,6 +1493,10 @@ dict_stats_analyze_index_below_cur( page_no = btr_node_ptr_get_child_page_no(rec, offsets_rec); + /* assume no external pages by default - in case we quit from this + function without analyzing any leaf pages */ + *n_external_pages = 0; + /* descend to the leaf level on the B-tree */ for (;;) { @@ -1480,20 +1515,24 @@ dict_stats_analyze_index_below_cur( /* search for the first non-boring record on the page */ offsets_rec = dict_stats_scan_page( &rec, offsets1, offsets2, index, page, n_prefix, - QUIT_ON_FIRST_NON_BORING, &n_diff); + QUIT_ON_FIRST_NON_BORING, n_diff, NULL); /* pages on level > 0 are not allowed to be empty */ ut_a(offsets_rec != NULL); /* if page is not empty (offsets_rec != NULL) then n_diff must be > 0, otherwise there is a bug in dict_stats_scan_page() */ - ut_a(n_diff > 0); + ut_a(*n_diff > 0); - if 
(n_diff == 1) { + if (*n_diff == 1) { /* page has all keys equal and the end of the page was reached by dict_stats_scan_page(), no need to descend to the leaf level */ mem_heap_free(heap); - return(1); + /* can't get an estimate for n_external_pages here + because we do not dive to the leaf level, assume no + external pages (*n_external_pages was assigned to 0 + above). */ + return; } /* else */ @@ -1501,7 +1540,7 @@ dict_stats_analyze_index_below_cur( first non-boring record it finds, then the returned n_diff can either be 0 (empty page), 1 (page has all keys equal) or 2 (non-boring record was found) */ - ut_a(n_diff == 2); + ut_a(*n_diff == 2); /* we have a non-boring record in rec, descend below it */ @@ -1512,11 +1551,14 @@ dict_stats_analyze_index_below_cur( ut_ad(btr_page_get_level(page, mtr) == 0); /* scan the leaf page and find the number of distinct keys, - when looking only at the first n_prefix columns */ + when looking only at the first n_prefix columns; also estimate + the number of externally stored pages pointed by records on this + page */ offsets_rec = dict_stats_scan_page( &rec, offsets1, offsets2, index, page, n_prefix, - COUNT_ALL_NON_BORING_AND_SKIP_DEL_MARKED, &n_diff); + COUNT_ALL_NON_BORING_AND_SKIP_DEL_MARKED, n_diff, + n_external_pages); #if 0 DEBUG_PRINTF(" %s(): n_diff below page_no=%lu: " UINT64PF "\n", @@ -1524,133 +1566,146 @@ dict_stats_analyze_index_below_cur( #endif mem_heap_free(heap); - - return(n_diff); } -/*********************************************************************//** -For a given level in an index select N_SAMPLE_PAGES(index) -(or less) records from that level and dive below them to the corresponding -leaf pages, then scan those leaf pages and save the sampling results in -index->stat_n_diff_key_vals[n_prefix - 1] and the number of pages scanned in -index->stat_n_sample_sizes[n_prefix - 1]. 
*/ +/** Input data that is used to calculate dict_index_t::stat_n_diff_key_vals[] +for each n-columns prefix (n from 1 to n_uniq). */ +struct n_diff_data_t { + /** Index of the level on which the descent through the btree + stopped. level 0 is the leaf level. This is >= 1 because we + avoid scanning the leaf level because it may contain too many + pages and doing so is useless when combined with the random dives - + if we are to scan the leaf level, this means a full scan and we can + simply do that instead of fiddling with picking random records higher + in the tree and to dive below them. At the start of the analyzing + we may decide to do full scan of the leaf level, but then this + structure is not used in that code path. */ + ulint level; + + /** Number of records on the level where the descend through the btree + stopped. When we scan the btree from the root, we stop at some mid + level, choose some records from it and dive below them towards a leaf + page to analyze. */ + ib_uint64_t n_recs_on_level; + + /** Number of different key values that were found on the mid level. */ + ib_uint64_t n_diff_on_level; + + /** Number of leaf pages that are analyzed. This is also the same as + the number of records that we pick from the mid level and dive below + them. */ + ib_uint64_t n_leaf_pages_to_analyze; + + /** Cumulative sum of the number of different key values that were + found on all analyzed pages. */ + ib_uint64_t n_diff_all_analyzed_pages; + + /** Cumulative sum of the number of external pages (stored outside of + the btree but in the same file segment). */ + ib_uint64_t n_external_pages_sum; +}; + +/** Estimate the number of different key values in an index when looking at +the first n_prefix columns. For a given level in an index select +n_diff_data->n_leaf_pages_to_analyze records from that level and dive below +them to the corresponding leaf pages, then scan those leaf pages and save the +sampling results in n_diff_data->n_diff_all_analyzed_pages. 
+@param[in] index index +@param[in] n_prefix look at first 'n_prefix' columns when +comparing records +@param[in] boundaries a vector that contains +n_diff_data->n_diff_on_level integers each of which represents the index (on +level 'level', counting from left/smallest to right/biggest from 0) of the +last record from each group of distinct keys +@param[in,out] n_diff_data n_diff_all_analyzed_pages and +n_external_pages_sum in this structure will be set by this function. The +members level, n_diff_on_level and n_leaf_pages_to_analyze must be set by the +caller in advance - they are used by some calculations inside this function +@param[in,out] mtr mini-transaction */ static void dict_stats_analyze_index_for_n_prefix( -/*==================================*/ - dict_index_t* index, /*!< in/out: index */ - ulint level, /*!< in: level, must be >= 1 */ - ib_uint64_t total_recs_on_level, - /*!< in: total number of - records on the given level */ - ulint n_prefix, /*!< in: look at first - n_prefix columns when - comparing records */ - ib_uint64_t n_diff_for_this_prefix, - /*!< in: number of distinct - records on the given level, - when looking at the first - n_prefix columns */ - boundaries_t* boundaries, /*!< in: array that contains - n_diff_for_this_prefix - integers each of which - represents the index (on the - level, counting from - left/smallest to right/biggest - from 0) of the last record - from each group of distinct - keys */ - mtr_t* mtr) /*!< in/out: mini-transaction */ + dict_index_t* index, + ulint n_prefix, + const boundaries_t* boundaries, + n_diff_data_t* n_diff_data, + mtr_t* mtr) { btr_pcur_t pcur; const page_t* page; ib_uint64_t rec_idx; - ib_uint64_t last_idx_on_level; - ib_uint64_t n_recs_to_dive_below; - ib_uint64_t n_diff_sum_of_all_analyzed_pages; ib_uint64_t i; #if 0 DEBUG_PRINTF(" %s(table=%s, index=%s, level=%lu, n_prefix=%lu, " - "n_diff_for_this_prefix=" UINT64PF ")\n", + "n_diff_on_level=" UINT64PF ")\n", __func__, index->table->name, 
index->name, level, - n_prefix, n_diff_for_this_prefix); + n_prefix, n_diff_data->n_diff_on_level); #endif ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index), MTR_MEMO_S_LOCK)); - /* if some of those is 0 then this means that there is exactly one - page in the B-tree and it is empty and we should have done full scan - and should not be here */ - ut_ad(total_recs_on_level > 0); - ut_ad(n_diff_for_this_prefix > 0); - - /* this must be at least 1 */ - ut_ad(N_SAMPLE_PAGES(index) > 0); - /* Position pcur on the leftmost record on the leftmost page on the desired level. */ btr_pcur_open_at_index_side( true, index, BTR_SEARCH_LEAF | BTR_ALREADY_S_LATCHED, - &pcur, true, level, mtr); + &pcur, true, n_diff_data->level, mtr); btr_pcur_move_to_next_on_page(&pcur); page = btr_pcur_get_page(&pcur); + const rec_t* first_rec = btr_pcur_get_rec(&pcur); + + /* We shouldn't be scanning the leaf level. The caller of this function + should have stopped the descend on level 1 or higher. */ + ut_ad(n_diff_data->level > 0); + ut_ad(!page_is_leaf(page)); + /* The page must not be empty, except when it is the root page (and the whole index is empty). 
*/ - ut_ad(btr_pcur_is_on_user_rec(&pcur) || page_is_leaf(page)); - ut_ad(btr_pcur_get_rec(&pcur) - == page_rec_get_next_const(page_get_infimum_rec(page))); + ut_ad(btr_pcur_is_on_user_rec(&pcur)); + ut_ad(first_rec == page_rec_get_next_const(page_get_infimum_rec(page))); /* check that we are indeed on the desired level */ - ut_a(btr_page_get_level(page, mtr) == level); + ut_a(btr_page_get_level(page, mtr) == n_diff_data->level); /* there should not be any pages on the left */ ut_a(btr_page_get_prev(page, mtr) == FIL_NULL); /* check whether the first record on the leftmost page is marked - as such, if we are on a non-leaf level */ - ut_a((level == 0) - == !(REC_INFO_MIN_REC_FLAG & rec_get_info_bits( - btr_pcur_get_rec(&pcur), page_is_comp(page)))); + as such; we are on a non-leaf level */ + ut_a(rec_get_info_bits(first_rec, page_is_comp(page)) + & REC_INFO_MIN_REC_FLAG); - last_idx_on_level = boundaries->at( - static_cast<unsigned int>(n_diff_for_this_prefix - 1)); + const ib_uint64_t last_idx_on_level = boundaries->at( + static_cast<unsigned>(n_diff_data->n_diff_on_level - 1)); rec_idx = 0; - n_diff_sum_of_all_analyzed_pages = 0; - - n_recs_to_dive_below = ut_min(N_SAMPLE_PAGES(index), - n_diff_for_this_prefix); - - for (i = 0; i < n_recs_to_dive_below; i++) { - ib_uint64_t left; - ib_uint64_t right; - ib_uint64_t rnd; - ib_uint64_t dive_below_idx; + n_diff_data->n_diff_all_analyzed_pages = 0; + n_diff_data->n_external_pages_sum = 0; - /* there are n_diff_for_this_prefix elements + for (i = 0; i < n_diff_data->n_leaf_pages_to_analyze; i++) { + /* there are n_diff_on_level elements in 'boundaries' and we divide those elements - into n_recs_to_dive_below segments, for example: + into n_leaf_pages_to_analyze segments, for example: - let n_diff_for_this_prefix=100, n_recs_to_dive_below=4, then: + let n_diff_on_level=100, n_leaf_pages_to_analyze=4, then: segment i=0: [0, 24] segment i=1: [25, 49] segment i=2: [50, 74] segment i=3: [75, 99] or - let 
n_diff_for_this_prefix=1, n_recs_to_dive_below=1, then: + let n_diff_on_level=1, n_leaf_pages_to_analyze=1, then: segment i=0: [0, 0] or - let n_diff_for_this_prefix=2, n_recs_to_dive_below=2, then: + let n_diff_on_level=2, n_leaf_pages_to_analyze=2, then: segment i=0: [0, 0] segment i=1: [1, 1] or - let n_diff_for_this_prefix=13, n_recs_to_dive_below=7, then: + let n_diff_on_level=13, n_leaf_pages_to_analyze=7, then: segment i=0: [0, 0] segment i=1: [1, 2] segment i=2: [3, 4] @@ -1661,9 +1716,12 @@ dict_stats_analyze_index_for_n_prefix( then we select a random record from each segment and dive below it */ - left = n_diff_for_this_prefix * i / n_recs_to_dive_below; - right = n_diff_for_this_prefix * (i + 1) - / n_recs_to_dive_below - 1; + const ib_uint64_t n_diff = n_diff_data->n_diff_on_level; + const ib_uint64_t n_pick + = n_diff_data->n_leaf_pages_to_analyze; + + const ib_uint64_t left = n_diff * i / n_pick; + const ib_uint64_t right = n_diff * (i + 1) / n_pick - 1; ut_a(left <= right); ut_a(right <= last_idx_on_level); @@ -1671,11 +1729,11 @@ dict_stats_analyze_index_for_n_prefix( /* we do not pass (left, right) because we do not want to ask ut_rnd_interval() to work with too big numbers since ib_uint64_t could be bigger than ulint */ - rnd = static_cast<ib_uint64_t>( - ut_rnd_interval(0, static_cast<ulint>(right - left))); + const ulint rnd = ut_rnd_interval( + 0, static_cast<ulint>(right - left)); - dive_below_idx = boundaries->at( - static_cast<unsigned int>(left + rnd)); + const ib_uint64_t dive_below_idx + = boundaries->at(static_cast<unsigned>(left + rnd)); #if 0 DEBUG_PRINTF(" %s(): dive below record with index=" @@ -1711,9 +1769,13 @@ dict_stats_analyze_index_for_n_prefix( ut_a(rec_idx == dive_below_idx); ib_uint64_t n_diff_on_leaf_page; + ib_uint64_t n_external_pages; - n_diff_on_leaf_page = dict_stats_analyze_index_below_cur( - btr_pcur_get_btr_cur(&pcur), n_prefix, mtr); + dict_stats_analyze_index_below_cur(btr_pcur_get_btr_cur(&pcur), + n_prefix, + 
&n_diff_on_leaf_page, + &n_external_pages, + mtr); /* We adjust n_diff_on_leaf_page here to avoid counting one record twice - once as the last on some page and once @@ -1733,37 +1795,86 @@ dict_stats_analyze_index_for_n_prefix( n_diff_on_leaf_page--; } - n_diff_sum_of_all_analyzed_pages += n_diff_on_leaf_page; - } - - /* n_diff_sum_of_all_analyzed_pages can be 0 here if all the leaf - pages sampled contained only delete-marked records. In this case - we should assign 0 to index->stat_n_diff_key_vals[n_prefix - 1], which - the formula below does. */ + n_diff_data->n_diff_all_analyzed_pages += n_diff_on_leaf_page; - /* See REF01 for an explanation of the algorithm */ - index->stat_n_diff_key_vals[n_prefix - 1] - = index->stat_n_leaf_pages - - * n_diff_for_this_prefix - / total_recs_on_level - - * n_diff_sum_of_all_analyzed_pages - / n_recs_to_dive_below; + n_diff_data->n_external_pages_sum += n_external_pages; + } - index->stat_n_sample_sizes[n_prefix - 1] = n_recs_to_dive_below; + btr_pcur_close(&pcur); +} - DEBUG_PRINTF(" %s(): n_diff=" UINT64PF " for n_prefix=%lu " - "(%lu" - " * " UINT64PF " / " UINT64PF - " * " UINT64PF " / " UINT64PF ")\n", - __func__, index->stat_n_diff_key_vals[n_prefix - 1], - n_prefix, - index->stat_n_leaf_pages, - n_diff_for_this_prefix, total_recs_on_level, - n_diff_sum_of_all_analyzed_pages, n_recs_to_dive_below); +/** Set dict_index_t::stat_n_diff_key_vals[] and stat_n_sample_sizes[]. +@param[in] n_diff_data input data to use to derive the results +@param[in,out] index index whose stat_n_diff_key_vals[] to set */ +UNIV_INLINE +void +dict_stats_index_set_n_diff( + const n_diff_data_t* n_diff_data, + dict_index_t* index) +{ + for (ulint n_prefix = dict_index_get_n_unique(index); + n_prefix >= 1; + n_prefix--) { + /* n_diff_all_analyzed_pages can be 0 here if + all the leaf pages sampled contained only + delete-marked records. In this case we should assign + 0 to index->stat_n_diff_key_vals[n_prefix - 1], which + the formula below does. 
*/ + + const n_diff_data_t* data = &n_diff_data[n_prefix - 1]; + + ut_ad(data->n_leaf_pages_to_analyze > 0); + ut_ad(data->n_recs_on_level > 0); + + ulint n_ordinary_leaf_pages; + + if (data->level == 1) { + /* If we know the number of records on level 1, then + this number is the same as the number of pages on + level 0 (leaf). */ + n_ordinary_leaf_pages = data->n_recs_on_level; + } else { + /* If we analyzed D ordinary leaf pages and found E + external pages in total linked from those D ordinary + leaf pages, then this means that the ratio + ordinary/external is D/E. Then the ratio ordinary/total + is D / (D + E). Knowing that the total number of pages + is T (including ordinary and external) then we estimate + that the total number of ordinary leaf pages is + T * D / (D + E). */ + n_ordinary_leaf_pages + = index->stat_n_leaf_pages + * data->n_leaf_pages_to_analyze + / (data->n_leaf_pages_to_analyze + + data->n_external_pages_sum); + } - btr_pcur_close(&pcur); + /* See REF01 for an explanation of the algorithm */ + index->stat_n_diff_key_vals[n_prefix - 1] + = n_ordinary_leaf_pages + + * data->n_diff_on_level + / data->n_recs_on_level + + * data->n_diff_all_analyzed_pages + / data->n_leaf_pages_to_analyze; + + index->stat_n_sample_sizes[n_prefix - 1] + = data->n_leaf_pages_to_analyze; + + DEBUG_PRINTF(" %s(): n_diff=" UINT64PF " for n_prefix=%lu" + " (%lu" + " * " UINT64PF " / " UINT64PF + " * " UINT64PF " / " UINT64PF ")\n", + __func__, + index->stat_n_diff_key_vals[n_prefix - 1], + n_prefix, + index->stat_n_leaf_pages, + data->n_diff_on_level, + data->n_recs_on_level, + data->n_diff_all_analyzed_pages, + data->n_leaf_pages_to_analyze); + } } /*********************************************************************//** @@ -1781,10 +1892,8 @@ dict_stats_analyze_index( bool level_is_analyzed; ulint n_uniq; ulint n_prefix; - ib_uint64_t* n_diff_on_level; ib_uint64_t total_recs; ib_uint64_t total_pages; - boundaries_t* n_diff_boundaries; mtr_t mtr; ulint size; 
DBUG_ENTER("dict_stats_analyze_index"); @@ -1870,11 +1979,18 @@ dict_stats_analyze_index( DBUG_VOID_RETURN; } - /* set to zero */ - n_diff_on_level = reinterpret_cast<ib_uint64_t*> - (mem_zalloc(n_uniq * sizeof(ib_uint64_t))); + /* For each level that is being scanned in the btree, this contains the + number of different key values for all possible n-column prefixes. */ + ib_uint64_t* n_diff_on_level = new ib_uint64_t[n_uniq]; - n_diff_boundaries = new boundaries_t[n_uniq]; + /* For each level that is being scanned in the btree, this contains the + index of the last record from each group of equal records (when + comparing only the first n columns, n=1..n_uniq). */ + boundaries_t* n_diff_boundaries = new boundaries_t[n_uniq]; + + /* For each n-column prefix this array contains the input data that is + used to calculate dict_index_t::stat_n_diff_key_vals[]. */ + n_diff_data_t* n_diff_data = new n_diff_data_t[n_uniq]; /* total_recs is also used to estimate the number of pages on one level below, so at the start we have 1 page (the root) */ @@ -1986,12 +2102,12 @@ dict_stats_analyze_index( level_is_analyzed = true; - if (n_diff_on_level[n_prefix - 1] - >= N_DIFF_REQUIRED(index) - || level == 1) { - /* we found a good level with many distinct - records or we have reached the last level we - could scan */ + if (level == 1 + || n_diff_on_level[n_prefix - 1] + >= N_DIFF_REQUIRED(index)) { + /* we have reached the last level we could scan + or we found a good level with many distinct + records */ break; } @@ -2004,7 +2120,6 @@ found_level: " distinct records for n_prefix=%lu\n", __func__, level, n_diff_on_level[n_prefix - 1], n_prefix); - /* here we are either on level 1 or the level that we are on contains >= N_DIFF_REQUIRED distinct keys or we did not scan deeper levels because they would contain too many pages */ @@ -2013,20 +2128,47 @@ found_level: ut_ad(level_is_analyzed); + /* if any of these is 0 then there is exactly one page in the + B-tree and it is empty and we 
should have done full scan and + should not be here */ + ut_ad(total_recs > 0); + ut_ad(n_diff_on_level[n_prefix - 1] > 0); + + ut_ad(N_SAMPLE_PAGES(index) > 0); + + n_diff_data_t* data = &n_diff_data[n_prefix - 1]; + + data->level = level; + + data->n_recs_on_level = total_recs; + + data->n_diff_on_level = n_diff_on_level[n_prefix - 1]; + + data->n_leaf_pages_to_analyze = std::min( + N_SAMPLE_PAGES(index), + n_diff_on_level[n_prefix - 1]); + /* pick some records from this level and dive below them for the given n_prefix */ dict_stats_analyze_index_for_n_prefix( - index, level, total_recs, n_prefix, - n_diff_on_level[n_prefix - 1], - &n_diff_boundaries[n_prefix - 1], &mtr); + index, n_prefix, &n_diff_boundaries[n_prefix - 1], + data, &mtr); } mtr_commit(&mtr); delete[] n_diff_boundaries; - mem_free(n_diff_on_level); + delete[] n_diff_on_level; + + /* n_prefix == 0 means that the above loop did not end up prematurely + due to tree being changed and so n_diff_data[] is set up. */ + if (n_prefix == 0) { + dict_stats_index_set_n_diff(n_diff_data, index); + } + + delete[] n_diff_data; dict_stats_assert_initialized_index(index); DBUG_VOID_RETURN; @@ -2201,17 +2343,21 @@ dict_stats_save_index_stat( "END;", trx); if (ret != DB_SUCCESS) { - char buf_table[MAX_FULL_NAME_LEN]; - char buf_index[MAX_FULL_NAME_LEN]; - ut_print_timestamp(stderr); - fprintf(stderr, - " InnoDB: Cannot save index statistics for table " - "%s, index %s, stat name \"%s\": %s\n", - ut_format_name(index->table->name, TRUE, - buf_table, sizeof(buf_table)), - ut_format_name(index->name, FALSE, - buf_index, sizeof(buf_index)), - stat_name, ut_strerr(ret)); + if (innodb_index_stats_not_found == false && + index->stats_error_printed == false) { + char buf_table[MAX_FULL_NAME_LEN]; + char buf_index[MAX_FULL_NAME_LEN]; + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Cannot save index statistics for table " + "%s, index %s, stat name \"%s\": %s\n", + ut_format_name(index->table->name, TRUE, + 
buf_table, sizeof(buf_table)), + ut_format_name(index->name, FALSE, + buf_index, sizeof(buf_index)), + stat_name, ut_strerr(ret)); + index->stats_error_printed = true; + } } return(ret); @@ -2900,20 +3046,24 @@ dict_stats_update_for_index( } /* else */ - /* Fall back to transient stats since the persistent - storage is not present or is corrupted */ - char buf_table[MAX_FULL_NAME_LEN]; - char buf_index[MAX_FULL_NAME_LEN]; - ut_print_timestamp(stderr); - fprintf(stderr, - " InnoDB: Recalculation of persistent statistics " - "requested for table %s index %s but the required " - "persistent statistics storage is not present or is " - "corrupted. Using transient stats instead.\n", - ut_format_name(index->table->name, TRUE, - buf_table, sizeof(buf_table)), - ut_format_name(index->name, FALSE, - buf_index, sizeof(buf_index))); + if (innodb_index_stats_not_found == false && + index->stats_error_printed == false) { + /* Fall back to transient stats since the persistent + storage is not present or is corrupted */ + char buf_table[MAX_FULL_NAME_LEN]; + char buf_index[MAX_FULL_NAME_LEN]; + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Recalculation of persistent statistics " + "requested for table %s index %s but the required " + "persistent statistics storage is not present or is " + "corrupted. Using transient stats instead.\n", + ut_format_name(index->table->name, TRUE, + buf_table, sizeof(buf_table)), + ut_format_name(index->name, FALSE, + buf_index, sizeof(buf_index))); + index->stats_error_printed = true; /* warning has been emitted; the guard above now suppresses repeats for this index */ + } } dict_table_stats_lock(index->table, RW_X_LATCH); @@ -2998,13 +3148,17 @@ dict_stats_update( /* Fall back to transient stats since the persistent storage is not present or is corrupted */ - ut_print_timestamp(stderr); - fprintf(stderr, - " InnoDB: Recalculation of persistent statistics " - "requested for table %s but the required persistent " - "statistics storage is not present or is corrupted.
" - "Using transient stats instead.\n", - ut_format_name(table->name, TRUE, buf, sizeof(buf))); + if (innodb_table_stats_not_found == false && + table->stats_error_printed == false) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Recalculation of persistent statistics " + "requested for table %s but the required persistent " + "statistics storage is not present or is corrupted. " + "Using transient stats instead.\n", + ut_format_name(table->name, TRUE, buf, sizeof(buf))); + table->stats_error_printed = true; + } goto transient; @@ -3048,17 +3202,21 @@ dict_stats_update( /* persistent statistics storage does not exist or is corrupted, calculate the transient stats */ - ut_print_timestamp(stderr); - fprintf(stderr, - " InnoDB: Error: Fetch of persistent " - "statistics requested for table %s but the " - "required system tables %s and %s are not " - "present or have unexpected structure. " - "Using transient stats instead.\n", - ut_format_name(table->name, TRUE, - buf, sizeof(buf)), - TABLE_STATS_NAME_PRINT, - INDEX_STATS_NAME_PRINT); + if (innodb_table_stats_not_found == false && + table->stats_error_printed == false) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Error: Fetch of persistent " + "statistics requested for table %s but the " + "required system tables %s and %s are not " + "present or have unexpected structure. " + "Using transient stats instead.\n", + ut_format_name(table->name, TRUE, + buf, sizeof(buf)), + TABLE_STATS_NAME_PRINT, + INDEX_STATS_NAME_PRINT); + table->stats_error_printed = true; + } goto transient; } @@ -3128,16 +3286,19 @@ dict_stats_update( dict_stats_table_clone_free(t); - ut_print_timestamp(stderr); - fprintf(stderr, - " InnoDB: Error fetching persistent statistics " - "for table %s from %s and %s: %s. 
" - "Using transient stats method instead.\n", - ut_format_name(table->name, TRUE, buf, - sizeof(buf)), - TABLE_STATS_NAME, - INDEX_STATS_NAME, - ut_strerr(err)); + if (innodb_table_stats_not_found == false && + table->stats_error_printed == false) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Error fetching persistent statistics " + "for table %s from %s and %s: %s. " + "Using transient stats method instead.\n", + ut_format_name(table->name, TRUE, buf, + sizeof(buf)), + TABLE_STATS_NAME, + INDEX_STATS_NAME, + ut_strerr(err)); + } goto transient; } diff --git a/storage/innobase/fil/fil0fil.cc b/storage/innobase/fil/fil0fil.cc index 81fcba47812..f4e5721caa7 100644 --- a/storage/innobase/fil/fil0fil.cc +++ b/storage/innobase/fil/fil0fil.cc @@ -758,7 +758,7 @@ fil_node_open_file( fprintf(stderr, "InnoDB: Error: the size of single-table" " tablespace file %s\n" - "InnoDB: is only "UINT64PF"," + "InnoDB: is only " UINT64PF "," " should be at least %lu!\n", node->name, size_bytes, @@ -5725,7 +5725,7 @@ fil_io( ret = os_aio(type, mode | wake_later, node->name, node->handle, buf, offset, len, node, message); #endif /* UNIV_HOTBACKUP */ - ut_a(ret); + if (mode == OS_AIO_SYNC) { /* The i/o operation is already completed when we return from @@ -5740,7 +5740,10 @@ fil_io( ut_ad(fil_validate_skip()); } - return(DB_SUCCESS); + if (!ret) { + return(DB_OUT_OF_FILE_SPACE); + } else { + } return(DB_SUCCESS); } #ifndef UNIV_HOTBACKUP diff --git a/storage/innobase/fts/fts0fts.cc b/storage/innobase/fts/fts0fts.cc index 4a667686795..f503cc487b7 100644 --- a/storage/innobase/fts/fts0fts.cc +++ b/storage/innobase/fts/fts0fts.cc @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 2011, 2013, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2011, 2014, Oracle and/or its affiliates. All Rights Reserved. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -44,6 +44,13 @@ Full Text Search interface /** Column name from the FTS config table */ #define FTS_MAX_CACHE_SIZE_IN_MB "cache_size_in_mb" +/** Verify if a aux table name is a obsolete table +by looking up the key word in the obsolete table names */ +#define FTS_IS_OBSOLETE_AUX_TABLE(table_name) \ + (strstr((table_name), "DOC_ID") != NULL \ + || strstr((table_name), "ADDED") != NULL \ + || strstr((table_name), "STOPWORDS") != NULL) + /** This is maximum FTS cache for each table and would be a configurable variable */ UNIV_INTERN ulong fts_max_cache_size; @@ -5837,6 +5844,12 @@ fts_is_aux_table_name( } } + /* Could be obsolete common tables. */ + if (strncmp(ptr, "ADDED", len) == 0 + || strncmp(ptr, "STOPWORDS", len) == 0) { + return(true); + } + /* Try and read the index id. */ if (!fts_read_object_id(&table->index_id, ptr)) { return(FALSE); @@ -6433,6 +6446,56 @@ fts_check_and_drop_orphaned_tables( mem_free(path); } + } else { + if (FTS_IS_OBSOLETE_AUX_TABLE(aux_table->name)) { + + /* Current table could be one of the three + obsolete tables, in this case, we should + always try to drop it but not rename it. + This could happen when we try to upgrade + from older server to later one, which doesn't + contain these obsolete tables. */ + drop = true; + + dberr_t err; + trx_t* trx_drop = + trx_allocate_for_background(); + + trx_drop->op_info = "Drop obsolete aux tables"; + trx_drop->dict_operation_lock_mode = RW_X_LATCH; + + trx_start_for_ddl(trx_drop, TRX_DICT_OP_TABLE); + + err = row_drop_table_for_mysql( + aux_table->name, trx_drop, false, true); + + trx_drop->dict_operation_lock_mode = 0; + + if (err != DB_SUCCESS) { + /* We don't need to worry about the + failure, since server would try to + drop it on next restart, even if + the table was broken. 
*/ + + ib_logf(IB_LOG_LEVEL_WARN, + "Fail to drop obsolete aux" + " table '%s', which is" + " harmless. will try to drop" + " it on next restart.", + aux_table->name); + + fts_sql_rollback(trx_drop); + } else { + ib_logf(IB_LOG_LEVEL_INFO, + "Dropped obsolete aux" + " table '%s'.", + aux_table->name); + + fts_sql_commit(trx_drop); + } + + trx_free_for_background(trx_drop); + } } #ifdef _WIN32 if (!drop && rename) { diff --git a/storage/innobase/fts/fts0opt.cc b/storage/innobase/fts/fts0opt.cc index a9f3a25530d..910a00cd521 100644 --- a/storage/innobase/fts/fts0opt.cc +++ b/storage/innobase/fts/fts0opt.cc @@ -95,7 +95,7 @@ enum fts_msg_type_t { /** Compressed list of words that have been read from FTS INDEX that needs to be optimized. */ struct fts_zip_t { - ulint status; /*!< Status of (un)/zip operation */ + lint status; /*!< Status of (un)/zip operation */ ulint n_words; /*!< Number of words compressed */ diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc index 79c994a78a0..a33d9a1d5bb 100644 --- a/storage/innobase/handler/ha_innodb.cc +++ b/storage/innobase/handler/ha_innodb.cc @@ -4,6 +4,7 @@ Copyright (c) 2000, 2014, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2008, 2009 Google Inc. Copyright (c) 2009, Percona Inc. Copyright (c) 2012, Facebook Inc. +Copyright (c) 2013, 2014 SkySQL Ab. All Rights Reserved. Portions of this file contain modifications contributed and copyrighted by Google, Inc. 
Those modifications are gratefully acknowledged and are described @@ -432,7 +433,7 @@ static PSI_rwlock_info all_innodb_rwlocks[] = { {&trx_purge_latch_key, "trx_purge_latch", 0}, {&index_tree_rw_lock_key, "index_tree_rw_lock", 0}, {&index_online_log_key, "index_online_log", 0}, - {&dict_table_stats_latch_key, "dict_table_stats", 0}, + {&dict_table_stats_key, "dict_table_stats", 0}, {&hash_table_rw_lock_key, "hash_table_locks", 0} }; # endif /* UNIV_PFS_RWLOCK */ @@ -3504,6 +3505,14 @@ innobase_end( if (innodb_inited) { + THD *thd= current_thd; + if (thd) { // may be UNINSTALL PLUGIN statement + trx_t* trx = thd_to_trx(thd); + if (trx) { + trx_free_for_mysql(trx); + } + } + srv_fast_shutdown = (ulint) innobase_fast_shutdown; innodb_inited = 0; @@ -4254,7 +4263,7 @@ innobase_close_connection( sql_print_warning( "MySQL is closing a connection that has an active " - "InnoDB transaction. "TRX_ID_FMT" row modifications " + "InnoDB transaction. " TRX_ID_FMT " row modifications " "will roll back.", trx->undo_no); } @@ -4317,16 +4326,23 @@ innobase_kill_query( #endif /* WITH_WSREP */ trx = thd_to_trx(thd); - if (trx) - { - /* Cancel a pending lock request. */ - lock_mutex_enter(); - trx_mutex_enter(trx); - if (trx->lock.wait_lock) - lock_cancel_waiting_and_release(trx->lock.wait_lock); - trx_mutex_exit(trx); - lock_mutex_exit(); - } + if (trx) { + THD *cur = current_thd; + THD *owner = trx->current_lock_mutex_owner; + + /* Cancel a pending lock request. */ + if (owner != cur) { + lock_mutex_enter(); + } + trx_mutex_enter(trx); + if (trx->lock.wait_lock) { + lock_cancel_waiting_and_release(trx->lock.wait_lock); + } + trx_mutex_exit(trx); + if (owner != cur) { + lock_mutex_exit(); + } + } DBUG_VOID_RETURN; } @@ -4373,14 +4389,11 @@ handler::Table_flags ha_innobase::table_flags() const /*============================*/ { - THD *thd = ha_thd(); /* Need to use tx_isolation here since table flags is (also) called before prebuilt is inited. 
*/ - ulong const tx_isolation = thd_tx_isolation(thd); + ulong const tx_isolation = thd_tx_isolation(ha_thd()); - if (tx_isolation <= ISO_READ_COMMITTED && - !(tx_isolation == ISO_READ_COMMITTED && - thd_rpl_is_parallel(thd))) { + if (tx_isolation <= ISO_READ_COMMITTED) { return(int_table_flags); } @@ -7871,7 +7884,7 @@ calc_row_difference( if (doc_id < prebuilt->table->fts->cache->next_doc_id) { fprintf(stderr, "InnoDB: FTS Doc ID must be larger than" - " "IB_ID_FMT" for table", + " " IB_ID_FMT " for table", innodb_table->fts->cache->next_doc_id - 1); ut_print_name(stderr, trx, @@ -7883,9 +7896,9 @@ calc_row_difference( - prebuilt->table->fts->cache->next_doc_id) >= FTS_DOC_ID_MAX_STEP) { fprintf(stderr, - "InnoDB: Doc ID "UINT64PF" is too" + "InnoDB: Doc ID " UINT64PF " is too" " big. Its difference with largest" - " Doc ID used "UINT64PF" cannot" + " Doc ID used " UINT64PF " cannot" " exceed or equal to %d\n", doc_id, prebuilt->table->fts->cache->next_doc_id - 1, @@ -8625,6 +8638,29 @@ ha_innobase::innobase_get_index( index = innobase_index_lookup(share, keynr); if (index) { + if (!key || ut_strcmp(index->name, key->name) != 0) { + fprintf(stderr, "InnoDB: [Error] Index for key no %u" + " mysql name %s , InnoDB name %s for table %s\n", + keynr, key ? key->name : "NULL", + index->name, + prebuilt->table->name); + + for(uint i=0; i < table->s->keys; i++) { + dict_index_t* idx = innobase_index_lookup(share, i); + /* Diagnostic dump of every key: idx is loop-local so that index and key stay intact for the ut_a() check below; each row reports key number i, not keynr */ + + if (idx) { + + fprintf(stderr, "InnoDB: [Note] Index for key no %u" + " mysql name %s , InnoDB name %s for table %s\n", + i,
table->key_info[i].name, + idx->name, + prebuilt->table->name); + } + } + + } + ut_a(ut_strcmp(index->name, key->name) == 0); } else { /* Can't find index with keynr in the translation @@ -12501,6 +12537,34 @@ ha_innobase::info_low( break; } + DBUG_EXECUTE_IF("ib_ha_innodb_stat_not_initialized", + index->table->stat_initialized = FALSE;); + + if (!ib_table->stat_initialized || + (index->table != ib_table || + !index->table->stat_initialized)) { + fprintf(stderr, + "InnoDB: Warning: Index %s points to table %s" " and ib_table %s statistics is initialized %d " + " but index table %s initialized %d " + " mysql table is %s. Have you mixed " + "up .frm files from different " + "installations? " + "See " REFMAN + "innodb-troubleshooting.html\n", + index->name, + index->table->name, + ib_table->name, + ib_table->stat_initialized, + index->table->name, + index->table->stat_initialized, + table->s->table_name.str + ); + + /* This is better than + assert on below function */ + dict_stats_init(index->table); + } + rec_per_key = innodb_rec_per_key( index, j, stats.records); @@ -18191,6 +18255,11 @@ static MYSQL_SYSVAR_ULONG(saved_page_number_debug, NULL, innodb_save_page_no, 0, 0, UINT_MAX32, 0); #endif /* UNIV_DEBUG */ +static MYSQL_SYSVAR_UINT(simulate_comp_failures, srv_simulate_comp_failures, + PLUGIN_VAR_NOCMDARG, + "Simulate compression failures.", + NULL, NULL, 0, 0, 99, 0); + static struct st_mysql_sys_var* innobase_system_variables[]= { MYSQL_SYSVAR(additional_mem_pool_size), MYSQL_SYSVAR(api_trx_level), @@ -18351,6 +18420,7 @@ static struct st_mysql_sys_var* innobase_system_variables[]= { MYSQL_SYSVAR(fil_make_page_dirty_debug), MYSQL_SYSVAR(saved_page_number_debug), #endif /* UNIV_DEBUG */ + MYSQL_SYSVAR(simulate_comp_failures), NULL }; @@ -18680,7 +18750,7 @@ ib_senderrf( va_start(args, code); - myf l; + myf l=0; switch(level) { case IB_LOG_LEVEL_INFO: diff --git a/storage/innobase/include/btr0cur.h b/storage/innobase/include/btr0cur.h index 833166e783c..f1e4406fcf7
100644 --- a/storage/innobase/include/btr0cur.h +++ b/storage/innobase/include/btr0cur.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1994, 2013, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 1994, 2014, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -576,6 +576,17 @@ void btr_estimate_number_of_different_key_vals( /*======================================*/ dict_index_t* index); /*!< in: index */ + +/** Gets the externally stored size of a record, in units of a database page. +@param[in] rec record +@param[in] offsets array returned by rec_get_offsets() +@return externally stored part, in units of a database page */ + +ulint +btr_rec_get_externally_stored_len( + const rec_t* rec, + const ulint* offsets); + /*******************************************************************//** Marks non-updated off-page fields as disowned by this record. The ownership must be transferred to the updated record which is inserted elsewhere in the diff --git a/storage/innobase/include/dict0dict.h b/storage/innobase/include/dict0dict.h index ce709a2e912..026187b2000 100644 --- a/storage/innobase/include/dict0dict.h +++ b/storage/innobase/include/dict0dict.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 1996, 2014, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2012, Facebook Inc. 
This program is free software; you can redistribute it and/or modify it under @@ -43,6 +43,9 @@ Created 1/8/1996 Heikki Tuuri #include "trx0types.h" #include "row0types.h" +extern bool innodb_table_stats_not_found; +extern bool innodb_index_stats_not_found; + #ifndef UNIV_HOTBACKUP # include "sync0sync.h" # include "sync0rw.h" @@ -1435,6 +1438,28 @@ UNIV_INTERN void dict_mutex_exit_for_mysql(void); /*===========================*/ + +/** Create a dict_table_t's stats latch or delay for lazy creation. +This function is only called from either single threaded environment +or from a thread that has not shared the table object with other threads. +@param[in,out] table table whose stats latch to create +@param[in] enabled if false then the latch is disabled +and dict_table_stats_lock()/unlock() become noop on this table. */ + +void +dict_table_stats_latch_create( + dict_table_t* table, + bool enabled); + +/** Destroy a dict_table_t's stats latch. +This function is only called from either single threaded environment +or from a thread that has not shared the table object with other threads. +@param[in,out] table table whose stats latch to destroy */ + +void +dict_table_stats_latch_destroy( + dict_table_t* table); + /**********************************************************************//** Lock the appropriate latch to protect a given table's statistics. table->id is used to pick the corresponding latch from a global array of diff --git a/storage/innobase/include/dict0mem.h b/storage/innobase/include/dict0mem.h index c5ed8d92cb0..0e3981a2946 100644 --- a/storage/innobase/include/dict0mem.h +++ b/storage/innobase/include/dict0mem.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 1996, 2014, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2012, Facebook Inc. 
This program is free software; you can redistribute it and/or modify it under @@ -46,6 +46,7 @@ Created 1/8/1996 Heikki Tuuri #include "hash0hash.h" #include "trx0types.h" #include "fts0fts.h" +#include "os0once.h" /* Forward declaration. */ struct ib_rbt_t; @@ -627,6 +628,9 @@ struct dict_index_t{ ulint stat_n_leaf_pages; /*!< approximate number of leaf pages in the index tree */ + bool stats_error_printed; + /*!< has a persistent statistics error already + been printed for this index? */ /* @} */ rw_lock_t lock; /*!< read-write lock protecting the upper levels of the index tree */ @@ -842,6 +846,10 @@ struct dict_table_t{ initialized in dict_table_add_to_cache() */ /** Statistics for query optimization */ /* @{ */ + + volatile os_once::state_t stats_latch_created; + /*!< Creation state of 'stats_latch'. */ + rw_lock_t* stats_latch; /*!< this latch protects: dict_table_t::stat_initialized dict_table_t::stat_n_rows (*) @@ -950,6 +958,9 @@ struct dict_table_t{ /*!< see BG_STAT_* above. Writes are covered by dict_sys->mutex. Dirty reads are possible. */ + bool stats_error_printed; + /*!< has a persistent statistics error already + been printed for this table? */ /* @} */ /*----------------------*/ /**!< The following fields are used by the diff --git a/storage/innobase/include/lock0lock.h b/storage/innobase/include/lock0lock.h index 385853bdb68..88246afebdc 100644 --- a/storage/innobase/include/lock0lock.h +++ b/storage/innobase/include/lock0lock.h @@ -289,7 +289,7 @@ lock_rec_insert_check_and_lock( inserted record maybe should inherit LOCK_GAP type locks from the successor record */ - __attribute__((nonnull, warn_unused_result)); + __attribute__((nonnull(2,3,4,6,7), warn_unused_result)); /*********************************************************************//** Checks if locks of other transactions prevent an immediate modify (update, delete mark, or delete unmark) of a clustered index record.
If they do, diff --git a/storage/innobase/include/os0once.h b/storage/innobase/include/os0once.h new file mode 100644 index 00000000000..a8bbaf1d2d4 --- /dev/null +++ b/storage/innobase/include/os0once.h @@ -0,0 +1,125 @@ +/***************************************************************************** + +Copyright (c) 2014, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/os0once.h +A class that aids executing a given function exactly once in a multi-threaded +environment. + +Created Feb 20, 2014 Vasil Dimov +*******************************************************/ + +#ifndef os0once_h +#define os0once_h + +#include "univ.i" + +#include "os0sync.h" +#include "ut0ut.h" + +/** Execute a given function exactly once in a multi-threaded environment +or wait for the function to be executed by another thread. + +Example usage: +First the user must create a control variable of type os_once::state_t and +assign it os_once::NEVER_DONE. +Then the user must pass this variable, together with a function to be +executed to os_once::do_or_wait_for_done(). + +Multiple threads can call os_once::do_or_wait_for_done() simultaneously with +the same (os_once::state_t) control variable. 
The provided function will be +called exactly once and when os_once::do_or_wait_for_done() returns then this +function has completed execution, by this or another thread. In other words +os_once::do_or_wait_for_done() will either execute the provided function or +will wait for its execution to complete if it is already called by another +thread or will do nothing if the function has already completed its execution +earlier. + +This mimics pthread_once(3), but unfortunately pthread_once(3) does not +support passing arguments to the init_routine() function. We should use +std::call_once() when we start compiling with C++11 enabled. */ +class os_once { +public: + /** Control variables' state type */ + typedef ib_uint32_t state_t; + + /** Not yet executed. */ + static const state_t NEVER_DONE = 0; + + /** Currently being executed by this or another thread. */ + static const state_t IN_PROGRESS = 1; + + /** Finished execution. */ + static const state_t DONE = 2; + +#ifdef HAVE_ATOMIC_BUILTINS + /** Call a given function or wait for its execution to complete if it is + already called by another thread. + @param[in,out] state control variable + @param[in] do_func function to call + @param[in,out] do_func_arg an argument to pass to do_func(). */ + static + void + do_or_wait_for_done( + volatile state_t* state, + void (*do_func)(void*), + void* do_func_arg) + { + /* Avoid calling os_compare_and_swap_uint32() in the most + common case. */ + if (*state == DONE) { + return; + } + + if (os_compare_and_swap_uint32(state, + NEVER_DONE, IN_PROGRESS)) { + /* We are the first. Call the function. */ + + do_func(do_func_arg); + + const bool swapped = os_compare_and_swap_uint32( + state, IN_PROGRESS, DONE); + + ut_a(swapped); + } else { + /* The state is not NEVER_DONE, so either it is + IN_PROGRESS (somebody is calling the function right + now) or DONE (it has already been called and completed). + Wait for it to become DONE.
*/ + for (;;) { + const state_t s = *state; + + switch (s) { + case DONE: + return; + case IN_PROGRESS: + break; + case NEVER_DONE: + /* fall through */ + default: + ut_error; + } + + UT_RELAX_CPU(); + } + } + } +#endif /* HAVE_ATOMIC_BUILTINS */ +}; + +#endif /* os0once_h */ diff --git a/storage/innobase/include/os0sync.h b/storage/innobase/include/os0sync.h index 9b4ce2343c5..6d3dd850e08 100644 --- a/storage/innobase/include/os0sync.h +++ b/storage/innobase/include/os0sync.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 1995, 2014, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2008, Google Inc. Portions of this file contain modifications contributed and copyrighted by @@ -434,6 +434,9 @@ Returns the old value of *ptr, atomically sets *ptr to new_val */ # define os_atomic_test_and_set_ulint(ptr, new_val) \ __sync_lock_test_and_set(ptr, new_val) +# define os_atomic_lock_release_byte(ptr) \ + __sync_lock_release(ptr) + #elif defined(HAVE_IB_SOLARIS_ATOMICS) # define HAVE_ATOMIC_BUILTINS @@ -515,6 +518,9 @@ Returns the old value of *ptr, atomically sets *ptr to new_val */ # define os_atomic_test_and_set_ulint(ptr, new_val) \ atomic_swap_ulong(ptr, new_val) +# define os_atomic_lock_release_byte(ptr) \ + (void) atomic_swap_uchar(ptr, 0) + #elif defined(HAVE_WINDOWS_ATOMICS) # define HAVE_ATOMIC_BUILTINS @@ -574,7 +580,8 @@ Returns true if swapped, ptr is pointer to target, old_val is value to compare to, new_val is the value to swap in. 
*/ # define os_compare_and_swap_uint32(ptr, old_val, new_val) \ - (win_cmp_and_xchg_dword(ptr, new_val, old_val) == old_val) + (InterlockedCompareExchange(reinterpret_cast<volatile long*>(ptr), \ + new_val, old_val) == old_val) # define os_compare_and_swap_ulint(ptr, old_val, new_val) \ (win_cmp_and_xchg_ulint(ptr, new_val, old_val) == old_val) @@ -637,6 +644,9 @@ clobbered */ # define os_atomic_test_and_set_ulong(ptr, new_val) \ InterlockedExchange(ptr, new_val) +# define os_atomic_lock_release_byte(ptr) \ + (void) InterlockedExchange(ptr, 0) + #else # define IB_ATOMICS_STARTUP_MSG \ "Mutexes and rw_locks use InnoDB's own implementation" diff --git a/storage/innobase/include/srv0srv.h b/storage/innobase/include/srv0srv.h index 7922b14cc86..2b58e0717fb 100644 --- a/storage/innobase/include/srv0srv.h +++ b/storage/innobase/include/srv0srv.h @@ -3,6 +3,7 @@ Copyright (c) 1995, 2013, Oracle and/or its affiliates. All rights reserved. Copyright (c) 2008, 2009, Google Inc. Copyright (c) 2009, Percona Inc. +Copyright (c) 2013, 2014, SkySQL Ab. All Rights Reserved. Portions of this file contain modifications contributed and copyrighted by Google, Inc. Those modifications are gratefully acknowledged and are described @@ -453,6 +454,9 @@ extern struct export_var_t export_vars; /** Global counters */ extern srv_stats_t srv_stats; +/** Simulate compression failures. */ +extern uint srv_simulate_comp_failures; + # ifdef UNIV_PFS_THREAD /* Keys to register InnoDB threads with performance schema */ extern mysql_pfs_key_t buf_page_cleaner_thread_key; diff --git a/storage/innobase/include/sync0rw.h b/storage/innobase/include/sync0rw.h index 34cd8ef4bd6..b36e04f2810 100644 --- a/storage/innobase/include/sync0rw.h +++ b/storage/innobase/include/sync0rw.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 1995, 2014, Oracle and/or its affiliates. 
All Rights Reserved. Copyright (c) 2008, Google Inc. Portions of this file contain modifications contributed and copyrighted by @@ -108,14 +108,8 @@ extern ib_mutex_t rw_lock_list_mutex; #ifdef UNIV_SYNC_DEBUG /* The global mutex which protects debug info lists of all rw-locks. To modify the debug info list of an rw-lock, this mutex has to be - acquired in addition to the mutex protecting the lock. */ -extern ib_mutex_t rw_lock_debug_mutex; -extern os_event_t rw_lock_debug_event; /*!< If deadlock detection does - not get immediately the mutex it - may wait for this event */ -extern ibool rw_lock_debug_waiters; /*!< This is set to TRUE, if - there may be waiters for the event */ +extern os_fast_mutex_t rw_lock_debug_mutex; #endif /* UNIV_SYNC_DEBUG */ /** Counters for RW locks. */ @@ -141,7 +135,7 @@ extern mysql_pfs_key_t trx_i_s_cache_lock_key; extern mysql_pfs_key_t trx_purge_latch_key; extern mysql_pfs_key_t index_tree_rw_lock_key; extern mysql_pfs_key_t index_online_log_key; -extern mysql_pfs_key_t dict_table_stats_latch_key; +extern mysql_pfs_key_t dict_table_stats_key; extern mysql_pfs_key_t trx_sys_rw_lock_key; extern mysql_pfs_key_t hash_table_rw_lock_key; #endif /* UNIV_PFS_RWLOCK */ diff --git a/storage/innobase/include/sync0sync.ic b/storage/innobase/include/sync0sync.ic index f34f3f90b63..cb6f6efbed8 100644 --- a/storage/innobase/include/sync0sync.ic +++ b/storage/innobase/include/sync0sync.ic @@ -108,10 +108,7 @@ mutex_reset_lock_word( ib_mutex_t* mutex) /*!< in: mutex */ { #if defined(HAVE_ATOMIC_BUILTINS) - /* In theory __sync_lock_release should be used to release the lock. - Unfortunately, it does not work properly alone. The workaround is - that more conservative __sync_lock_test_and_set is used instead. 
*/ - os_atomic_test_and_set_byte(&mutex->lock_word, 0); + os_atomic_lock_release_byte(&mutex->lock_word); #else mutex->lock_word = 0; diff --git a/storage/innobase/include/trx0trx.h b/storage/innobase/include/trx0trx.h index a30bbdbebb2..7c92445b796 100644 --- a/storage/innobase/include/trx0trx.h +++ b/storage/innobase/include/trx0trx.h @@ -992,6 +992,11 @@ struct trx_t{ count of tables being flushed. */ /*------------------------------*/ + THD* current_lock_mutex_owner; + /*!< If this is equal to current_thd, + then in innobase_kill_query() we know we + already hold the lock_sys->mutex. */ + /*------------------------------*/ #ifdef UNIV_DEBUG ulint start_line; /*!< Track where it was started from */ const char* start_file; /*!< Filename where it was started */ diff --git a/storage/innobase/include/univ.i b/storage/innobase/include/univ.i index 98c5512bd0b..bc359746a0b 100644 --- a/storage/innobase/include/univ.i +++ b/storage/innobase/include/univ.i @@ -44,7 +44,7 @@ Created 1/20/1994 Heikki Tuuri #define INNODB_VERSION_MAJOR 5 #define INNODB_VERSION_MINOR 6 -#define INNODB_VERSION_BUGFIX 17 +#define INNODB_VERSION_BUGFIX 19 /* The following is the InnoDB version as shown in SELECT plugin_version FROM information_schema.plugins; @@ -439,10 +439,10 @@ typedef unsigned __int64 ib_uint64_t; typedef unsigned __int32 ib_uint32_t; #else /* Use the integer types and formatting strings defined in the C99 standard. 
*/ -# define UINT32PF "%"PRIu32 -# define INT64PF "%"PRId64 -# define UINT64PF "%"PRIu64 -# define UINT64PFx "%016"PRIx64 +# define UINT32PF "%" PRIu32 +# define INT64PF "%" PRId64 +# define UINT64PF "%" PRIu64 +# define UINT64PFx "%016" PRIx64 # define DBUG_LSN_PF UINT64PF typedef int64_t ib_int64_t; typedef uint64_t ib_uint64_t; diff --git a/storage/innobase/lock/lock0lock.cc b/storage/innobase/lock/lock0lock.cc index f99c34294cd..659b2e5b62a 100644 --- a/storage/innobase/lock/lock0lock.cc +++ b/storage/innobase/lock/lock0lock.cc @@ -49,6 +49,7 @@ Created 5/7/1996 Heikki Tuuri #include "btr0btr.h" #include "dict0boot.h" #include <set> +#include "mysql/plugin.h" #ifdef WITH_WSREP extern my_bool wsrep_debug; @@ -378,6 +379,11 @@ struct lock_stack_t { ulint heap_no; /*!< heap number if rec lock */ }; +extern "C" void thd_report_wait_for(const MYSQL_THD thd, MYSQL_THD other_thd); +extern "C" int thd_need_wait_for(const MYSQL_THD thd); +extern "C" +int thd_need_ordering_with(const MYSQL_THD thd, const MYSQL_THD other_thd); + /** Stack to use during DFS search. Currently only a single stack is required because there is no parallel deadlock check. This stack is protected by the lock_sys_t::mutex. */ @@ -393,6 +399,14 @@ UNIV_INTERN mysql_pfs_key_t lock_sys_mutex_key; UNIV_INTERN mysql_pfs_key_t lock_sys_wait_mutex_key; #endif /* UNIV_PFS_MUTEX */ +/* Buffer to collect THDs to report waits for. 
*/ +struct thd_wait_reports { + struct thd_wait_reports *next; /*!< List link */ + ulint used; /*!< How many elements in waitees[] */ + trx_t *waitees[64]; /*!< Trxs for thd_report_wait_for() */ +}; + + #ifdef UNIV_DEBUG UNIV_INTERN ibool lock_print_waits = FALSE; @@ -1023,6 +1037,32 @@ lock_rec_has_to_wait( return(FALSE); } + if ((type_mode & LOCK_GAP || lock_rec_get_gap(lock2)) && + !thd_need_ordering_with(trx->mysql_thd, + lock2->trx->mysql_thd)) { + /* If the upper server layer has already decided on the + commit order between the transaction requesting the + lock and the transaction owning the lock, we do not + need to wait for gap locks. Such ordering by the upper + server layer happens in parallel replication, where the + commit order is fixed to match the original order on the + master. + + Such gap locks are mainly needed to get serialisability + between transactions so that they will be binlogged in + the correct order so that statement-based replication + will give the correct results. Since the right order + was already determined on the master, we do not need + to enforce it again here. + + Skipping the locks is not essential for correctness, + since in case of deadlock we will just kill the later + transaction and retry it. But it can save some + unnecessary rollbacks and retries. */ + + return (FALSE); + } + #ifdef WITH_WSREP /* if BF thread is locking and has conflict with another BF thread, we need to look at trx ordering and lock types */ @@ -4069,7 +4109,8 @@ static trx_id_t lock_deadlock_search( /*=================*/ - lock_deadlock_ctx_t* ctx) /*!< in/out: deadlock context */ + lock_deadlock_ctx_t* ctx, /*!< in/out: deadlock context */ + struct thd_wait_reports*waitee_ptr) /*!< in/out: list of waitees */ { const lock_t* lock; ulint heap_no; @@ -4149,38 +4190,59 @@ lock_deadlock_search( /* Select the joining transaction as the victim.
*/ return(ctx->start->id); - } else if (lock->trx->lock.que_state == TRX_QUE_LOCK_WAIT) { + } else { + /* We do not need to report autoinc locks to the upper + layer. These locks are released before commit, so they + can not cause deadlocks with binlog-fixed commit + order. */ + if (waitee_ptr && + (lock_get_type_low(lock) != LOCK_TABLE || + lock_get_mode(lock) != LOCK_AUTO_INC)) { + if (waitee_ptr->used == + sizeof(waitee_ptr->waitees) / + sizeof(waitee_ptr->waitees[0])) { + waitee_ptr->next = + (struct thd_wait_reports *) + mem_alloc(sizeof(*waitee_ptr)); + waitee_ptr = waitee_ptr->next; + if (!waitee_ptr) { + ctx->too_deep = TRUE; + return(ctx->start->id); + } + waitee_ptr->next = NULL; + waitee_ptr->used = 0; + } + waitee_ptr->waitees[waitee_ptr->used++] = lock->trx; + } - /* Another trx ahead has requested a lock in an - incompatible mode, and is itself waiting for a lock. */ + if (lock->trx->lock.que_state == TRX_QUE_LOCK_WAIT) { - ++ctx->cost; + /* Another trx ahead has requested a lock in an + incompatible mode, and is itself waiting for a lock. */ - /* Save current search state. */ - if (!lock_deadlock_push(ctx, lock, heap_no)) { + ++ctx->cost; - /* Unable to save current search state, stack - size not big enough. */ + /* Save current search state. */ + if (!lock_deadlock_push(ctx, lock, heap_no)) { - ctx->too_deep = TRUE; + /* Unable to save current search state, stack + size not big enough. 
*/ + + ctx->too_deep = TRUE; -#ifdef WITH_WSREP - if (wsrep_thd_is_BF(ctx->start->mysql_thd, TRUE)) - return(lock->trx->id); - else -#endif /* WITH_WSREP */ return(ctx->start->id); - } + } - ctx->wait_lock = lock->trx->lock.wait_lock; - lock = lock_get_first_lock(ctx, &heap_no); + ctx->wait_lock = lock->trx->lock.wait_lock; + lock = lock_get_first_lock(ctx, &heap_no); - if (lock->trx->lock.deadlock_mark > ctx->mark_start) { + if (lock->trx->lock.deadlock_mark > ctx->mark_start) { + lock = lock_get_next_lock(ctx, lock, heap_no); + } + + } else { lock = lock_get_next_lock(ctx, lock, heap_no); } - - } else { - lock = lock_get_next_lock(ctx, lock, heap_no); } } @@ -4245,6 +4307,48 @@ lock_deadlock_trx_rollback( trx_mutex_exit(trx); } +static +void +lock_report_waiters_to_mysql( +/*=======================*/ + struct thd_wait_reports* waitee_buf_ptr, /*!< in: set of trxs */ + THD* mysql_thd, /*!< in: THD */ + trx_id_t victim_trx_id) /*!< in: Trx selected + as deadlock victim, if + any */ +{ + struct thd_wait_reports* p; + struct thd_wait_reports* q; + ulint i; + + p = waitee_buf_ptr; + while (p) { + i = 0; + while (i < p->used) { + trx_t *w_trx = p->waitees[i]; + /* There is no need to report waits to a trx already + selected as a victim. */ + if (w_trx->id != victim_trx_id) { + /* If thd_report_wait_for() decides to kill the + transaction, then we will get a call back into + innobase_kill_query. We mark this by setting + current_lock_mutex_owner, so we can avoid trying + to recursively take lock_sys->mutex. */ + w_trx->current_lock_mutex_owner = mysql_thd; + thd_report_wait_for(mysql_thd, w_trx->mysql_thd); + w_trx->current_lock_mutex_owner = NULL; + } + ++i; + } + q = p->next; + if (p != waitee_buf_ptr) { + mem_free(p); + } + p = q; + } +} + + /********************************************************************//** Checks if a joining lock request results in a deadlock. 
If a deadlock is found this function will resolve the deadlock by choosing a victim transaction @@ -4260,13 +4364,23 @@ lock_deadlock_check_and_resolve( const lock_t* lock, /*!< in: lock the transaction is requesting */ const trx_t* trx) /*!< in: transaction */ { - trx_id_t victim_trx_id; + trx_id_t victim_trx_id; + struct thd_wait_reports waitee_buf; + struct thd_wait_reports*waitee_buf_ptr; + THD* start_mysql_thd; ut_ad(trx != NULL); ut_ad(lock != NULL); ut_ad(lock_mutex_own()); assert_trx_in_list(trx); + start_mysql_thd = trx->mysql_thd; + if (start_mysql_thd && thd_need_wait_for(start_mysql_thd)) { + waitee_buf_ptr = &waitee_buf; + } else { + waitee_buf_ptr = NULL; + } + /* Try and resolve as many deadlocks as possible. */ do { lock_deadlock_ctx_t ctx; @@ -4279,7 +4393,19 @@ lock_deadlock_check_and_resolve( ctx.wait_lock = lock; ctx.mark_start = lock_mark_counter; - victim_trx_id = lock_deadlock_search(&ctx); + if (waitee_buf_ptr) { + waitee_buf_ptr->next = NULL; + waitee_buf_ptr->used = 0; + } + + victim_trx_id = lock_deadlock_search(&ctx, waitee_buf_ptr); + + /* Report waits to upper layer, as needed. */ + if (waitee_buf_ptr) { + lock_report_waiters_to_mysql(waitee_buf_ptr, + start_mysql_thd, + victim_trx_id); + } /* Search too deep, we rollback the joining transaction. */ if (ctx.too_deep) { diff --git a/storage/innobase/os/os0file.cc b/storage/innobase/os/os0file.cc index 992b1e79b58..1ec08da8a83 100644 --- a/storage/innobase/os/os0file.cc +++ b/storage/innobase/os/os0file.cc @@ -2679,7 +2679,7 @@ try_again: } ib_logf(IB_LOG_LEVEL_ERROR, - "Tried to read "ULINTPF" bytes at offset " UINT64PF". " + "Tried to read " ULINTPF " bytes at offset " UINT64PF ".
" "Was only able to read %ld.", n, offset, (lint) ret); #endif /* __WIN__ */ #ifdef __WIN__ @@ -2866,6 +2866,7 @@ os_file_write_func( DWORD high; ulint n_retries = 0; ulint err; + DWORD saved_error = 0; #ifndef UNIV_HOTBACKUP ulint i; #endif /* !UNIV_HOTBACKUP */ @@ -2955,8 +2956,10 @@ retry: } if (!os_has_said_disk_full) { + char *winmsg = NULL; - err = (ulint) GetLastError(); + saved_error = GetLastError(); + err = (ulint) saved_error; ut_print_timestamp(stderr); @@ -2973,6 +2976,23 @@ retry: name, offset, (ulong) n, (ulong) len, (ulong) err); + /* Ask Windows to prepare a standard message for a + GetLastError() */ + + FormatMessage(FORMAT_MESSAGE_ALLOCATE_BUFFER | + FORMAT_MESSAGE_FROM_SYSTEM | + FORMAT_MESSAGE_IGNORE_INSERTS, + NULL, saved_error, + MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), + (LPSTR)&winmsg, 0, NULL); + + if (winmsg) { + fprintf(stderr, + "InnoDB: FormatMessage: Error number %lu means '%s'.\n", + (ulong) saved_error, winmsg); + LocalFree(winmsg); + } + if (strerror((int) err) != NULL) { fprintf(stderr, "InnoDB: Error number %lu means '%s'.\n", @@ -3001,12 +3021,11 @@ retry: } if (!os_has_said_disk_full) { - ut_print_timestamp(stderr); fprintf(stderr, " InnoDB: Error: Write to file %s failed" - " at offset "UINT64PF".\n" + " at offset " UINT64PF ".\n" "InnoDB: %lu bytes should have been written," " only %ld were written.\n" "InnoDB: Operating system error number %lu.\n" @@ -4592,11 +4611,16 @@ os_aio_func( wake_later = mode & OS_AIO_SIMULATED_WAKE_LATER; mode = mode & (~OS_AIO_SIMULATED_WAKE_LATER); + DBUG_EXECUTE_IF("ib_os_aio_func_io_failure_28", + mode = OS_AIO_SYNC;); + if (mode == OS_AIO_SYNC #ifdef WIN_ASYNC_IO && !srv_use_native_aio #endif /* WIN_ASYNC_IO */ ) { + ibool ret; + /* This is actually an ordinary synchronous read or write: no need to use an i/o-handler thread. 
NOTE that if we use Windows async i/o, Windows does not allow us to use @@ -4611,13 +4635,23 @@ os_aio_func( and os_file_write_func() */ if (type == OS_FILE_READ) { - return(os_file_read_func(file, buf, offset, n)); + ret = os_file_read_func(file, buf, offset, n); + } else { + + ut_ad(!srv_read_only_mode); + ut_a(type == OS_FILE_WRITE); + + ret = os_file_write_func(name, file, buf, offset, n); } - ut_ad(!srv_read_only_mode); - ut_a(type == OS_FILE_WRITE); + DBUG_EXECUTE_IF("ib_os_aio_func_io_failure_28", + os_has_said_disk_full = FALSE;); + DBUG_EXECUTE_IF("ib_os_aio_func_io_failure_28", + ret = 0;); + DBUG_EXECUTE_IF("ib_os_aio_func_io_failure_28", + errno = 28;); - return(os_file_write_func(name, file, buf, offset, n)); + return ret; } try_again: @@ -5442,7 +5476,13 @@ consecutive_loop: aio_slot->offset, total_len); } - ut_a(ret); + DBUG_EXECUTE_IF("ib_os_aio_func_io_failure_28_2", + os_has_said_disk_full = FALSE;); + DBUG_EXECUTE_IF("ib_os_aio_func_io_failure_28_2", + ret = 0;); + DBUG_EXECUTE_IF("ib_os_aio_func_io_failure_28_2", + errno = 28;); + srv_set_io_thread_op_info(global_segment, "file i/o done"); if (aio_slot->type == OS_FILE_READ && n_consecutive > 1) { diff --git a/storage/innobase/page/page0zip.cc b/storage/innobase/page/page0zip.cc index ab7a19795a3..4b19a35925e 100644 --- a/storage/innobase/page/page0zip.cc +++ b/storage/innobase/page/page0zip.cc @@ -1309,6 +1309,30 @@ page_zip_compress( MONITOR_INC(MONITOR_PAGE_COMPRESS); + /* Simulate a compression failure with a probability determined by + innodb_simulate_comp_failures, only if the page has 2 or more + records. 
*/ + + if (srv_simulate_comp_failures + && !dict_index_is_ibuf(index) + && page_get_n_recs(page) >= 2 + && ((ulint)(rand() % 100) < srv_simulate_comp_failures) + && strcasecmp(index->table_name, "IBUF_DUMMY") != 0) { + +#ifdef UNIV_DEBUG + fprintf(stderr, + "InnoDB: Simulating a compression failure" + " for table %s, index %s, page %lu (%s)\n", + index->table_name, + index->name, + page_get_page_no(page), + page_is_leaf(page) ? "leaf" : "non-leaf"); + +#endif + + goto err_exit; + } + heap = mem_heap_create(page_zip_get_size(page_zip) + n_fields * (2 + sizeof(ulint)) + REC_OFFS_HEADER_SIZE diff --git a/storage/innobase/row/row0ins.cc b/storage/innobase/row/row0ins.cc index e6487730a77..c144ca890f8 100644 --- a/storage/innobase/row/row0ins.cc +++ b/storage/innobase/row/row0ins.cc @@ -151,35 +151,37 @@ row_ins_alloc_sys_fields( ut_ad(row && table && heap); ut_ad(dtuple_get_n_fields(row) == dict_table_get_n_cols(table)); - /* 1. Allocate buffer for row id */ + /* allocate buffer to hold the needed system created hidden columns. */ + uint len = DATA_ROW_ID_LEN + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN; + ptr = static_cast<byte*>(mem_heap_zalloc(heap, len)); + /* 1. Populate row-id */ col = dict_table_get_sys_col(table, DATA_ROW_ID); dfield = dtuple_get_nth_field(row, dict_col_get_no(col)); - ptr = static_cast<byte*>(mem_heap_zalloc(heap, DATA_ROW_ID_LEN)); - dfield_set_data(dfield, ptr, DATA_ROW_ID_LEN); node->row_id_buf = ptr; - /* 3. Allocate buffer for trx id */ + ptr += DATA_ROW_ID_LEN; + /* 2. Populate trx id */ col = dict_table_get_sys_col(table, DATA_TRX_ID); dfield = dtuple_get_nth_field(row, dict_col_get_no(col)); - ptr = static_cast<byte*>(mem_heap_zalloc(heap, DATA_TRX_ID_LEN)); dfield_set_data(dfield, ptr, DATA_TRX_ID_LEN); node->trx_id_buf = ptr; - /* 4. Allocate buffer for roll ptr */ + ptr += DATA_TRX_ID_LEN; + + /* 3. 
Populate roll ptr */ col = dict_table_get_sys_col(table, DATA_ROLL_PTR); dfield = dtuple_get_nth_field(row, dict_col_get_no(col)); - ptr = static_cast<byte*>(mem_heap_zalloc(heap, DATA_ROLL_PTR_LEN)); dfield_set_data(dfield, ptr, DATA_ROLL_PTR_LEN); } diff --git a/storage/innobase/row/row0merge.cc b/storage/innobase/row/row0merge.cc index 56cf9f1943c..86b47c9f3bd 100644 --- a/storage/innobase/row/row0merge.cc +++ b/storage/innobase/row/row0merge.cc @@ -786,7 +786,7 @@ row_merge_read( if (UNIV_UNLIKELY(!success)) { ut_print_timestamp(stderr); fprintf(stderr, - " InnoDB: failed to read merge block at "UINT64PF"\n", + " InnoDB: failed to read merge block at " UINT64PF "\n", ofs); } diff --git a/storage/innobase/row/row0mysql.cc b/storage/innobase/row/row0mysql.cc index 93d13ea49ee..dd7af8a3526 100644 --- a/storage/innobase/row/row0mysql.cc +++ b/storage/innobase/row/row0mysql.cc @@ -1359,7 +1359,7 @@ error_exit: if (doc_id < next_doc_id) { fprintf(stderr, "InnoDB: FTS Doc ID must be large than" - " "UINT64PF" for table", + " " UINT64PF " for table", next_doc_id - 1); ut_print_name(stderr, trx, TRUE, table->name); putc('\n', stderr); @@ -1374,9 +1374,9 @@ error_exit: if (doc_id - next_doc_id >= FTS_DOC_ID_MAX_STEP) { fprintf(stderr, - "InnoDB: Doc ID "UINT64PF" is too" + "InnoDB: Doc ID " UINT64PF " is too" " big. Its difference with largest" - " used Doc ID "UINT64PF" cannot" + " used Doc ID " UINT64PF " cannot" " exceed or equal to %d\n", doc_id, next_doc_id - 1, FTS_DOC_ID_MAX_STEP); diff --git a/storage/innobase/row/row0sel.cc b/storage/innobase/row/row0sel.cc index 359ae3f2c21..e5a7694cb93 100644 --- a/storage/innobase/row/row0sel.cc +++ b/storage/innobase/row/row0sel.cc @@ -877,16 +877,15 @@ row_sel_get_clust_rec( if (!node->read_view) { /* Try to place a lock on the index record */ - - /* If innodb_locks_unsafe_for_binlog option is used - or this session is using READ COMMITTED isolation level - we lock only the record, i.e., next-key locking is - not used. 
*/ ulint lock_type; trx_t* trx; trx = thr_get_trx(thr); + /* If innodb_locks_unsafe_for_binlog option is used + or this session is using READ COMMITTED or lower isolation level + we lock only the record, i.e., next-key locking is + not used. */ if (srv_locks_unsafe_for_binlog || trx->isolation_level <= TRX_ISO_READ_COMMITTED) { lock_type = LOCK_REC_NOT_GAP; @@ -1502,12 +1501,6 @@ rec_loop: search result set, resulting in the phantom problem. */ if (!consistent_read) { - - /* If innodb_locks_unsafe_for_binlog option is used - or this session is using READ COMMITTED isolation - level, we lock only the record, i.e., next-key - locking is not used. */ - rec_t* next_rec = page_rec_get_next(rec); ulint lock_type; trx_t* trx; @@ -1517,6 +1510,10 @@ rec_loop: offsets = rec_get_offsets(next_rec, index, offsets, ULINT_UNDEFINED, &heap); + /* If innodb_locks_unsafe_for_binlog option is used + or this session is using READ COMMITTED or lower isolation + level, we lock only the record, i.e., next-key + locking is not used. */ if (srv_locks_unsafe_for_binlog || trx->isolation_level <= TRX_ISO_READ_COMMITTED) { @@ -1565,12 +1562,6 @@ skip_lock: if (!consistent_read) { /* Try to place a lock on the index record */ - - /* If innodb_locks_unsafe_for_binlog option is used - or this session is using READ COMMITTED isolation level, - we lock only the record, i.e., next-key locking is - not used. */ - ulint lock_type; trx_t* trx; @@ -1579,6 +1570,10 @@ skip_lock: trx = thr_get_trx(thr); + /* If innodb_locks_unsafe_for_binlog option is used + or this session is using READ COMMITTED or lower isolation level, + we lock only the record, i.e., next-key locking is + not used. 
*/ if (srv_locks_unsafe_for_binlog || trx->isolation_level <= TRX_ISO_READ_COMMITTED) { @@ -4227,7 +4222,7 @@ rec_loop: /* Try to place a lock on the index record */ /* If innodb_locks_unsafe_for_binlog option is used - or this session is using a READ COMMITTED isolation + or this session is using a READ COMMITTED or lower isolation level we do not lock gaps. Supremum record is really a gap and therefore we do not set locks there. */ @@ -4369,7 +4364,7 @@ wrong_offs: /* Try to place a gap lock on the index record only if innodb_locks_unsafe_for_binlog option is not set or this session is not - using a READ COMMITTED isolation level. */ + using a READ COMMITTED or lower isolation level. */ err = sel_set_rec_lock( btr_pcur_get_block(pcur), @@ -4418,7 +4413,7 @@ wrong_offs: /* Try to place a gap lock on the index record only if innodb_locks_unsafe_for_binlog option is not set or this session is not - using a READ COMMITTED isolation level. */ + using a READ COMMITTED or lower isolation level. 
*/ err = sel_set_rec_lock( btr_pcur_get_block(pcur), diff --git a/storage/innobase/srv/srv0mon.cc b/storage/innobase/srv/srv0mon.cc index ea346566e57..64417b1e5fb 100644 --- a/storage/innobase/srv/srv0mon.cc +++ b/storage/innobase/srv/srv0mon.cc @@ -41,8 +41,8 @@ Created 12/9/2009 Jimmy Yang /* Macro to standardize the counter names for counters in the "monitor_buf_page" module as they have very structured defines */ #define MONITOR_BUF_PAGE(name, description, code, op, op_code) \ - {"buffer_page_"op"_"name, "buffer_page_io", \ - "Number of "description" Pages "op, \ + {"buffer_page_" op "_" name, "buffer_page_io", \ + "Number of " description " Pages " op, \ MONITOR_GROUP_MODULE, MONITOR_DEFAULT_START, \ MONITOR_##code##_##op_code} diff --git a/storage/innobase/srv/srv0srv.cc b/storage/innobase/srv/srv0srv.cc index 6a410285f2b..6e03f715f28 100644 --- a/storage/innobase/srv/srv0srv.cc +++ b/storage/innobase/srv/srv0srv.cc @@ -3,6 +3,7 @@ Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2008, 2009 Google Inc. Copyright (c) 2009, Percona Inc. +Copyright (c) 2013, 2014, SkySQL Ab. All Rights Reserved. Portions of this file contain modifications contributed and copyrighted by Google, Inc. Those modifications are gratefully acknowledged and are described @@ -473,6 +474,9 @@ current_time % 5 != 0. */ #endif /* MEM_PERIODIC_CHECK */ # define SRV_MASTER_DICT_LRU_INTERVAL (47) +/** Simulate compression failures. */ +UNIV_INTERN uint srv_simulate_comp_failures = 0; + /** Acquire the system_mutex. 
*/ #define srv_sys_mutex_enter() do { \ mutex_enter(&srv_sys->mutex); \ diff --git a/storage/innobase/srv/srv0start.cc b/storage/innobase/srv/srv0start.cc index 0c04fba421a..1c2bfcbd920 100644 --- a/storage/innobase/srv/srv0start.cc +++ b/storage/innobase/srv/srv0start.cc @@ -2197,9 +2197,9 @@ innobase_start_or_create_for_mysql(void) } else if (size != srv_log_file_size) { ib_logf(IB_LOG_LEVEL_ERROR, "Log file %s is" - " of different size "UINT64PF" bytes" + " of different size " UINT64PF " bytes" " than other log" - " files "UINT64PF" bytes!", + " files " UINT64PF " bytes!", logfilename, size << UNIV_PAGE_SIZE_SHIFT, (os_offset_t) srv_log_file_size diff --git a/storage/innobase/sync/sync0arr.cc b/storage/innobase/sync/sync0arr.cc index 2cfb693f8ba..986010039f9 100644 --- a/storage/innobase/sync/sync0arr.cc +++ b/storage/innobase/sync/sync0arr.cc @@ -182,6 +182,33 @@ sync_array_get_nth_cell( } /******************************************************************//** +Looks for a cell with the given thread id. +@return pointer to cell or NULL if not found */ +static +sync_cell_t* +sync_array_find_thread( +/*===================*/ + sync_array_t* arr, /*!< in: wait array */ + os_thread_id_t thread) /*!< in: thread id */ +{ + ulint i; + sync_cell_t* cell; + + for (i = 0; i < arr->n_cells; i++) { + + cell = sync_array_get_nth_cell(arr, i); + + if (cell->wait_object != NULL + && os_thread_eq(cell->thread, thread)) { + + return(cell); /* Found */ + } + } + + return(NULL); /* Not found */ +} + +/******************************************************************//** Reserves the mutex semaphore protecting a sync array. 
*/ static void @@ -432,8 +459,10 @@ static void sync_array_cell_print( /*==================*/ - FILE* file, /*!< in: file where to print */ - sync_cell_t* cell) /*!< in: sync cell */ + FILE* file, /*!< in: file where to print */ + sync_cell_t* cell, /*!< in: sync cell */ + os_thread_id_t* reserver) /*!< out: write reserver or + 0 */ { ib_mutex_t* mutex; rw_lock_t* rwlock; @@ -454,19 +483,21 @@ sync_array_cell_print( been freed meanwhile */ mutex = cell->old_wait_mutex; - fprintf(file, - "Mutex at %p created file %s line %lu, lock var %lu\n" + if (mutex) { + fprintf(file, + "Mutex at %p created file %s line %lu, lock var %lu\n" #ifdef UNIV_SYNC_DEBUG - "Last time reserved in file %s line %lu, " + "Last time reserved in file %s line %lu, " #endif /* UNIV_SYNC_DEBUG */ - "waiters flag %lu\n", - (void*) mutex, innobase_basename(mutex->cfile_name), - (ulong) mutex->cline, - (ulong) mutex->lock_word, + "waiters flag %lu\n", + (void*) mutex, innobase_basename(mutex->cfile_name), + (ulong) mutex->cline, + (ulong) mutex->lock_word, #ifdef UNIV_SYNC_DEBUG - mutex->file_name, (ulong) mutex->line, + mutex->file_name, (ulong) mutex->line, #endif /* UNIV_SYNC_DEBUG */ - (ulong) mutex->waiters); + (ulong) mutex->waiters); + } } else if (type == RW_LOCK_EX || type == RW_LOCK_WAIT_EX @@ -478,33 +509,36 @@ sync_array_cell_print( rwlock = cell->old_wait_rw_lock; - fprintf(file, - " RW-latch at %p created in file %s line %lu\n", - (void*) rwlock, innobase_basename(rwlock->cfile_name), - (ulong) rwlock->cline); - writer = rw_lock_get_writer(rwlock); - if (writer != RW_LOCK_NOT_LOCKED) { + if (rwlock) { fprintf(file, - "a writer (thread id %lu) has" - " reserved it in mode %s", - (ulong) os_thread_pf(rwlock->writer_thread), - writer == RW_LOCK_EX - ? 
" exclusive\n" - : " wait exclusive\n"); - } + " RW-latch at %p created in file %s line %lu\n", + (void*) rwlock, innobase_basename(rwlock->cfile_name), + (ulong) rwlock->cline); + writer = rw_lock_get_writer(rwlock); + if (writer != RW_LOCK_NOT_LOCKED) { + fprintf(file, + "a writer (thread id %lu) has" + " reserved it in mode %s", + (ulong) os_thread_pf(rwlock->writer_thread), + writer == RW_LOCK_EX + ? " exclusive\n" + : " wait exclusive\n"); + *reserver = rwlock->writer_thread; + } - fprintf(file, - "number of readers %lu, waiters flag %lu, " - "lock_word: %lx\n" - "Last time read locked in file %s line %lu\n" - "Last time write locked in file %s line %lu\n", - (ulong) rw_lock_get_reader_count(rwlock), - (ulong) rwlock->waiters, - rwlock->lock_word, - innobase_basename(rwlock->last_s_file_name), - (ulong) rwlock->last_s_line, - rwlock->last_x_file_name, - (ulong) rwlock->last_x_line); + fprintf(file, + "number of readers %lu, waiters flag %lu, " + "lock_word: %lx\n" + "Last time read locked in file %s line %lu\n" + "Last time write locked in file %s line %lu\n", + (ulong) rw_lock_get_reader_count(rwlock), + (ulong) rwlock->waiters, + rwlock->lock_word, + innobase_basename(rwlock->last_s_file_name), + (ulong) rwlock->last_s_line, + rwlock->last_x_file_name, + (ulong) rwlock->last_x_line); + } } else { ut_error; } @@ -515,32 +549,6 @@ sync_array_cell_print( } #ifdef UNIV_SYNC_DEBUG -/******************************************************************//** -Looks for a cell with the given thread id. 
-@return pointer to cell or NULL if not found */ -static -sync_cell_t* -sync_array_find_thread( -/*===================*/ - sync_array_t* arr, /*!< in: wait array */ - os_thread_id_t thread) /*!< in: thread id */ -{ - ulint i; - sync_cell_t* cell; - - for (i = 0; i < arr->n_cells; i++) { - - cell = sync_array_get_nth_cell(arr, i); - - if (cell->wait_object != NULL - && os_thread_eq(cell->thread, thread)) { - - return(cell); /* Found */ - } - } - - return(NULL); /* Not found */ -} /******************************************************************//** Recursion step for deadlock detection. @@ -602,6 +610,7 @@ sync_array_detect_deadlock( os_thread_id_t thread; ibool ret; rw_lock_debug_t*debug; + os_thread_id_t reserver=0; ut_a(arr); ut_a(start); @@ -637,10 +646,10 @@ sync_array_detect_deadlock( depth); if (ret) { fprintf(stderr, - "Mutex %p owned by thread %lu file %s line %lu\n", + "Mutex %p owned by thread %lu file %s line %lu\n", mutex, (ulong) os_thread_pf(mutex->thread_id), mutex->file_name, (ulong) mutex->line); - sync_array_cell_print(stderr, cell); + sync_array_cell_print(stderr, cell, &reserver); return(TRUE); } @@ -678,7 +687,7 @@ sync_array_detect_deadlock( print: fprintf(stderr, "rw-lock %p ", (void*) lock); - sync_array_cell_print(stderr, cell); + sync_array_cell_print(stderr, cell, &reserver); rw_lock_debug_print(stderr, debug); return(TRUE); } @@ -921,6 +930,7 @@ sync_array_print_long_waits_low( double diff; sync_cell_t* cell; void* wait_object; + os_thread_id_t reserver=0; cell = sync_array_get_nth_cell(arr, i); @@ -936,7 +946,7 @@ sync_array_print_long_waits_low( if (diff > SYNC_ARRAY_TIMEOUT) { fputs("InnoDB: Warning: a long semaphore wait:\n", stderr); - sync_array_cell_print(stderr, cell); + sync_array_cell_print(stderr, cell, &reserver); *noticed = TRUE; } @@ -951,6 +961,60 @@ sync_array_print_long_waits_low( } } + /* We found a long semaphore wait, print all threads that are + waiting for a semaphore.
*/ + if (*noticed) { + for (i = 0; i < arr->n_cells; i++) { + void* wait_object; + os_thread_id_t reserver=(os_thread_id_t)ULINT_UNDEFINED; + sync_cell_t* cell; + ulint loop = 0; + + cell = sync_array_get_nth_cell(arr, i); + + wait_object = cell->wait_object; + + if (wait_object == NULL || !cell->waiting) { + + continue; + } + + fputs("InnoDB: Warning: semaphore wait:\n", + stderr); + sync_array_cell_print(stderr, cell, &reserver); + + /* Try to output cell information for writer recursive way */ + while (reserver != (os_thread_id_t)ULINT_UNDEFINED) { + sync_cell_t* reserver_wait; + + reserver_wait = sync_array_find_thread(arr, reserver); + + if (reserver_wait && + reserver_wait->wait_object != NULL && + reserver_wait->waiting) { + fputs("InnoDB: Warning: Writer thread is waiting this semaphore:\n", + stderr); + reserver = (os_thread_id_t)ULINT_UNDEFINED; + sync_array_cell_print(stderr, reserver_wait, &reserver); + loop++; + + if (reserver_wait->thread == reserver) { + reserver = (os_thread_id_t)ULINT_UNDEFINED; + } + } else { + reserver = (os_thread_id_t)ULINT_UNDEFINED; + } + + /* This is protection against loop */ + if (loop > 100) { + fputs("InnoDB: Warning: Too many waiting threads.\n", stderr); + break; + } + + } + } + } + #undef SYNC_ARRAY_TIMEOUT return(fatal); @@ -1030,6 +1094,7 @@ sync_array_print_info_low( { ulint i; ulint count = 0; + os_thread_id_t r = 0; fprintf(file, "OS WAIT ARRAY INFO: reservation count %ld\n", @@ -1042,7 +1107,7 @@ sync_array_print_info_low( if (cell->wait_object != NULL) { count++; - sync_array_cell_print(file, cell); + sync_array_cell_print(file, cell, &r); } } } diff --git a/storage/innobase/sync/sync0rw.cc b/storage/innobase/sync/sync0rw.cc index ebf73917702..e129d39fc9d 100644 --- a/storage/innobase/sync/sync0rw.cc +++ b/storage/innobase/sync/sync0rw.cc @@ -151,18 +151,12 @@ UNIV_INTERN mysql_pfs_key_t rw_lock_mutex_key; To modify the debug info list of an rw-lock, this mutex has to be acquired in addition to the mutex 
protecting the lock. */ -UNIV_INTERN ib_mutex_t rw_lock_debug_mutex; +UNIV_INTERN os_fast_mutex_t rw_lock_debug_mutex; # ifdef UNIV_PFS_MUTEX UNIV_INTERN mysql_pfs_key_t rw_lock_debug_mutex_key; # endif -/* If deadlock detection does not get immediately the mutex, -it may wait for this event */ -UNIV_INTERN os_event_t rw_lock_debug_event; -/* This is set to TRUE, if there may be waiters for the event */ -UNIV_INTERN ibool rw_lock_debug_waiters; - /******************************************************************//** Creates a debug info struct. */ static @@ -690,22 +684,7 @@ void rw_lock_debug_mutex_enter(void) /*===========================*/ { -loop: - if (0 == mutex_enter_nowait(&rw_lock_debug_mutex)) { - return; - } - - os_event_reset(rw_lock_debug_event); - - rw_lock_debug_waiters = TRUE; - - if (0 == mutex_enter_nowait(&rw_lock_debug_mutex)) { - return; - } - - os_event_wait(rw_lock_debug_event); - - goto loop; + os_fast_mutex_lock(&rw_lock_debug_mutex); } /******************************************************************//** @@ -715,12 +694,7 @@ void rw_lock_debug_mutex_exit(void) /*==========================*/ { - mutex_exit(&rw_lock_debug_mutex); - - if (rw_lock_debug_waiters) { - rw_lock_debug_waiters = FALSE; - os_event_set(rw_lock_debug_event); - } + os_fast_mutex_unlock(&rw_lock_debug_mutex); } /******************************************************************//** diff --git a/storage/innobase/sync/sync0sync.cc b/storage/innobase/sync/sync0sync.cc index 5ef8a02fb3f..54018471abc 100644 --- a/storage/innobase/sync/sync0sync.cc +++ b/storage/innobase/sync/sync0sync.cc @@ -1472,11 +1472,7 @@ sync_init(void) SYNC_NO_ORDER_CHECK); #ifdef UNIV_SYNC_DEBUG - mutex_create(rw_lock_debug_mutex_key, &rw_lock_debug_mutex, - SYNC_NO_ORDER_CHECK); - - rw_lock_debug_event = os_event_create(); - rw_lock_debug_waiters = FALSE; + os_fast_mutex_init(rw_lock_debug_mutex_key, &rw_lock_debug_mutex); #endif /* UNIV_SYNC_DEBUG */ } @@ -1544,6 +1540,7 @@ sync_close(void) 
sync_order_checks_on = FALSE; sync_thread_level_arrays_free(); + os_fast_mutex_free(&rw_lock_debug_mutex); #endif /* UNIV_SYNC_DEBUG */ sync_initialized = FALSE; @@ -1558,12 +1555,12 @@ sync_print_wait_info( FILE* file) /*!< in: file where to print */ { fprintf(file, - "Mutex spin waits "UINT64PF", rounds "UINT64PF", " - "OS waits "UINT64PF"\n" - "RW-shared spins "UINT64PF", rounds "UINT64PF", " - "OS waits "UINT64PF"\n" - "RW-excl spins "UINT64PF", rounds "UINT64PF", " - "OS waits "UINT64PF"\n", + "Mutex spin waits " UINT64PF ", rounds " UINT64PF ", " + "OS waits " UINT64PF "\n" + "RW-shared spins " UINT64PF ", rounds " UINT64PF ", " + "OS waits " UINT64PF "\n" + "RW-excl spins " UINT64PF ", rounds " UINT64PF ", " + "OS waits " UINT64PF "\n", (ib_uint64_t) mutex_spin_wait_count, (ib_uint64_t) mutex_spin_round_count, (ib_uint64_t) mutex_os_wait_count, diff --git a/storage/innobase/trx/trx0i_s.cc b/storage/innobase/trx/trx0i_s.cc index f6360562ae7..01ccfb8a6d0 100644 --- a/storage/innobase/trx/trx0i_s.cc +++ b/storage/innobase/trx/trx0i_s.cc @@ -1639,7 +1639,7 @@ trx_i_s_create_lock_id( } else { /* table lock */ res_len = ut_snprintf(lock_id, lock_id_size, - TRX_ID_FMT":"UINT64PF, + TRX_ID_FMT":" UINT64PF, row->lock_trx_id, row->lock_table_id); } diff --git a/storage/innobase/trx/trx0trx.cc b/storage/innobase/trx/trx0trx.cc index 38b21d8d428..272f8377f68 100644 --- a/storage/innobase/trx/trx0trx.cc +++ b/storage/innobase/trx/trx0trx.cc @@ -50,6 +50,9 @@ Created 3/26/1996 Heikki Tuuri #include<set> +extern "C" +int thd_deadlock_victim_preference(const MYSQL_THD thd1, const MYSQL_THD thd2); + /** Set of table_id */ typedef std::set<table_id_t> table_id_set; @@ -1833,7 +1836,7 @@ state_ok: if (trx->undo_no != 0) { newline = TRUE; - fprintf(f, ", undo log entries "TRX_ID_FMT, trx->undo_no); + fprintf(f, ", undo log entries " TRX_ID_FMT, trx->undo_no); } if (newline) { @@ -1936,9 +1939,8 @@ trx_assert_started( #endif /* UNIV_DEBUG */ 
/*******************************************************************//** -Compares the "weight" (or size) of two transactions. Transactions that -have edited non-transactional tables are considered heavier than ones -that have not. +Compares the "weight" (or size) of two transactions. The heavier the weight, +the more reluctant we will be to choose the transaction as a deadlock victim. @return TRUE if weight(a) >= weight(b) */ UNIV_INTERN ibool @@ -1947,26 +1949,19 @@ trx_weight_ge( const trx_t* a, /*!< in: the first transaction to be compared */ const trx_t* b) /*!< in: the second transaction to be compared */ { - ibool a_notrans_edit; - ibool b_notrans_edit; - - /* If mysql_thd is NULL for a transaction we assume that it has - not edited non-transactional tables. */ - - a_notrans_edit = a->mysql_thd != NULL - && thd_has_edited_nontrans_tables(a->mysql_thd); - - b_notrans_edit = b->mysql_thd != NULL - && thd_has_edited_nontrans_tables(b->mysql_thd); - - if (a_notrans_edit != b_notrans_edit) { + int pref; - return(a_notrans_edit); + /* First ask the upper server layer if it has any preference for which + to prefer as a deadlock victim. */ + pref= thd_deadlock_victim_preference(a->mysql_thd, b->mysql_thd); + if (pref < 0) { + return FALSE; + } else if (pref > 0) { + return TRUE; } - /* Either both had edited non-transactional tables or both had - not, we fall back to comparing the number of altered/locked - rows. */ + /* Upper server layer had no preference, we fall back to comparing the + number of altered/locked rows. */ #if 0 fprintf(stderr, @@ -2133,7 +2128,7 @@ trx_recover_for_mysql( ut_print_timestamp(stderr); fprintf(stderr, " InnoDB: Transaction contains changes" - " to "TRX_ID_FMT" rows\n", + " to " TRX_ID_FMT " rows\n", trx->undo_no); count++; |