author     Nirbhay Choubey <nirbhay@skysql.com>    2014-08-11 23:55:41 -0400
committer  Nirbhay Choubey <nirbhay@skysql.com>    2014-08-11 23:55:41 -0400
commit     8358dd53b7406deaa9f50ad09b16a86b7e367632
tree       ef8995ad0e400cb6a1842649c3c886c7b3474835 /storage/innobase
parent     e06e12f5b8dfe0ab2e5976eec1b27b25d318441b
parent     4105cbf4a230c82ea7dee31d4d2262b798fad9f4
download   mariadb-git-8358dd53b7406deaa9f50ad09b16a86b7e367632.tar.gz

bzr merge -r4346 maria/10.0 (maria-10.0.13)
Diffstat (limited to 'storage/innobase')
 storage/innobase/btr/btr0cur.cc        |  22
 storage/innobase/buf/buf0flu.cc        |   4
 storage/innobase/buf/buf0lru.cc        |  18
 storage/innobase/dict/dict0dict.cc     | 126
 storage/innobase/dict/dict0mem.cc      |  11
 storage/innobase/dict/dict0stats.cc    | 611
 storage/innobase/fil/fil0fil.cc        |   9
 storage/innobase/fts/fts0fts.cc        |  65
 storage/innobase/fts/fts0opt.cc        |   2
 storage/innobase/handler/ha_innodb.cc  | 112
 storage/innobase/include/btr0cur.h     |  13
 storage/innobase/include/dict0dict.h   |  27
 storage/innobase/include/dict0mem.h    |  13
 storage/innobase/include/lock0lock.h   |   2
 storage/innobase/include/os0once.h     | 125
 storage/innobase/include/os0sync.h     |  14
 storage/innobase/include/srv0srv.h     |   4
 storage/innobase/include/sync0rw.h     |  12
 storage/innobase/include/sync0sync.ic  |   5
 storage/innobase/include/trx0trx.h     |   5
 storage/innobase/include/univ.i        |  10
 storage/innobase/lock/lock0lock.cc     | 174
 storage/innobase/os/os0file.cc         |  58
 storage/innobase/page/page0zip.cc      |  24
 storage/innobase/row/row0ins.cc        |  16
 storage/innobase/row/row0merge.cc      |   2
 storage/innobase/row/row0mysql.cc      |   6
 storage/innobase/row/row0sel.cc        |  35
 storage/innobase/srv/srv0mon.cc        |   4
 storage/innobase/srv/srv0srv.cc        |   4
 storage/innobase/srv/srv0start.cc      |   4
 storage/innobase/sync/sync0arr.cc      | 199
 storage/innobase/sync/sync0rw.cc       |  32
 storage/innobase/sync/sync0sync.cc     |  19
 storage/innobase/trx/trx0i_s.cc        |   2
 storage/innobase/trx/trx0trx.cc        |  39
 36 files changed, 1325 insertions, 503 deletions
diff --git a/storage/innobase/btr/btr0cur.cc b/storage/innobase/btr/btr0cur.cc
index 1d2f313a07c..34a72f360be 100644
--- a/storage/innobase/btr/btr0cur.cc
+++ b/storage/innobase/btr/btr0cur.cc
@@ -202,15 +202,6 @@ btr_rec_free_externally_stored_fields(
mtr_t* mtr); /*!< in: mini-transaction handle which contains
an X-latch to record page and to the index
tree */
-/***********************************************************//**
-Gets the externally stored size of a record, in units of a database page.
-@return externally stored part, in units of a database page */
-static
-ulint
-btr_rec_get_externally_stored_len(
-/*==============================*/
- const rec_t* rec, /*!< in: record */
- const ulint* offsets);/*!< in: array returned by rec_get_offsets() */
#endif /* !UNIV_HOTBACKUP */
/******************************************************//**
@@ -271,6 +262,7 @@ btr_cur_latch_leaves(
case BTR_MODIFY_TREE:
/* x-latch also brothers from left to right */
left_page_no = btr_page_get_prev(page, mtr);
+ mode = latch_mode;
if (left_page_no != FIL_NULL) {
get_block = btr_block_get(
@@ -4043,15 +4035,15 @@ btr_rec_get_field_ref_offs(
#define btr_rec_get_field_ref(rec, offsets, n) \
((rec) + btr_rec_get_field_ref_offs(offsets, n))
-/***********************************************************//**
-Gets the externally stored size of a record, in units of a database page.
+/** Gets the externally stored size of a record, in units of a database page.
+@param[in] rec record
+@param[in] offsets array returned by rec_get_offsets()
@return externally stored part, in units of a database page */
-static
+
ulint
btr_rec_get_externally_stored_len(
-/*==============================*/
- const rec_t* rec, /*!< in: record */
- const ulint* offsets)/*!< in: array returned by rec_get_offsets() */
+ const rec_t* rec,
+ const ulint* offsets)
{
ulint n_fields;
ulint total_extern_len = 0;
diff --git a/storage/innobase/buf/buf0flu.cc b/storage/innobase/buf/buf0flu.cc
index 3cce75abe74..fa2edb90b8e 100644
--- a/storage/innobase/buf/buf0flu.cc
+++ b/storage/innobase/buf/buf0flu.cc
@@ -2183,6 +2183,10 @@ af_get_pct_for_dirty()
{
ulint dirty_pct = buf_get_modified_ratio_pct();
+ if (dirty_pct > 0 && srv_max_buf_pool_modified_pct == 0) {
+ return(100);
+ }
+
ut_a(srv_max_dirty_pages_pct_lwm
<= srv_max_buf_pool_modified_pct);
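The hunk above makes af_get_pct_for_dirty() report 100% whenever there are dirty pages but innodb_max_dirty_pages_pct is set to 0, i.e. a zero limit is treated as "no dirty pages tolerated". A minimal, self-contained sketch of just that guard; the rest of the heuristic is omitted and only hinted at in the comments:

    /* Sketch only: mirrors the early return added to af_get_pct_for_dirty();
    the real function continues with the srv_max_dirty_pages_pct_lwm logic. */
    static unsigned long
    af_pct_for_dirty_sketch(
        unsigned long dirty_pct,          /* current % of dirty buffer pages */
        unsigned long max_modified_pct)   /* innodb_max_dirty_pages_pct */
    {
        if (dirty_pct > 0 && max_modified_pct == 0) {
            /* A limit of 0 tolerates no dirty pages at all, so the
            adaptive-flushing factor is driven straight to 100%. */
            return(100);
        }

        return(0);  /* placeholder for the remainder of the heuristic */
    }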
diff --git a/storage/innobase/buf/buf0lru.cc b/storage/innobase/buf/buf0lru.cc
index ec30c063a72..64409e1993d 100644
--- a/storage/innobase/buf/buf0lru.cc
+++ b/storage/innobase/buf/buf0lru.cc
@@ -2263,6 +2263,24 @@ buf_LRU_block_remove_hashed(
" in the hash table\n",
(ulong) bpage->space,
(ulong) bpage->offset);
+#ifdef UNIV_DEBUG
+ fprintf(stderr,
+ "InnoDB: in_page_hash %lu in_zip_hash %lu\n"
+ " in_free_list %lu in_flush_list %lu in_LRU_list %lu\n"
+ " zip.data %p zip_size %lu page_state %d\n",
+ bpage->in_page_hash, bpage->in_zip_hash,
+ bpage->in_free_list, bpage->in_flush_list,
+ bpage->in_LRU_list, bpage->zip.data,
+ buf_page_get_zip_size(bpage),
+ buf_page_get_state(bpage));
+#else
+ fprintf(stderr,
+ "InnoDB: zip.data %p zip_size %lu page_state %d\n",
+ bpage->zip.data,
+ buf_page_get_zip_size(bpage),
+ buf_page_get_state(bpage));
+#endif
+
if (hashed_bpage) {
fprintf(stderr,
"InnoDB: In hash table we find block"
diff --git a/storage/innobase/dict/dict0dict.cc b/storage/innobase/dict/dict0dict.cc
index 86a903d925e..c53f7e82f58 100644
--- a/storage/innobase/dict/dict0dict.cc
+++ b/storage/innobase/dict/dict0dict.cc
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 1996, 2014, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2012, Facebook Inc.
This program is free software; you can redistribute it and/or modify it under
@@ -50,6 +50,7 @@ UNIV_INTERN dict_index_t* dict_ind_compact;
#include "btr0btr.h"
#include "btr0cur.h"
#include "btr0sea.h"
+#include "os0once.h"
#include "page0zip.h"
#include "page0page.h"
#include "pars0pars.h"
@@ -102,7 +103,7 @@ UNIV_INTERN ulong zip_pad_max = 50;
UNIV_INTERN mysql_pfs_key_t dict_operation_lock_key;
UNIV_INTERN mysql_pfs_key_t index_tree_rw_lock_key;
UNIV_INTERN mysql_pfs_key_t index_online_log_key;
-UNIV_INTERN mysql_pfs_key_t dict_table_stats_latch_key;
+UNIV_INTERN mysql_pfs_key_t dict_table_stats_key;
#endif /* UNIV_PFS_RWLOCK */
#ifdef UNIV_PFS_MUTEX
@@ -121,6 +122,11 @@ UNIV_INTERN mysql_pfs_key_t dict_foreign_err_mutex_key;
/** Identifies generated InnoDB foreign key names */
static char dict_ibfk[] = "_ibfk_";
+bool innodb_table_stats_not_found = false;
+bool innodb_index_stats_not_found = false;
+static bool innodb_table_stats_not_found_reported = false;
+static bool innodb_index_stats_not_found_reported = false;
+
/*******************************************************************//**
Tries to find column names for the index and sets the col field of the
index.
@@ -319,6 +325,82 @@ dict_mutex_exit_for_mysql(void)
mutex_exit(&(dict_sys->mutex));
}
+/** Allocate and init a dict_table_t's stats latch.
+This function must not be called concurrently on the same table object.
+@param[in,out] table_void table whose stats latch to create */
+static
+void
+dict_table_stats_latch_alloc(
+ void* table_void)
+{
+ dict_table_t* table = static_cast<dict_table_t*>(table_void);
+
+ table->stats_latch = new(std::nothrow) rw_lock_t;
+
+ ut_a(table->stats_latch != NULL);
+
+ rw_lock_create(dict_table_stats_key, table->stats_latch,
+ SYNC_INDEX_TREE);
+}
+
+/** Deinit and free a dict_table_t's stats latch.
+This function must not be called concurrently on the same table object.
+@param[in,out] table table whose stats latch to free */
+static
+void
+dict_table_stats_latch_free(
+ dict_table_t* table)
+{
+ rw_lock_free(table->stats_latch);
+ delete table->stats_latch;
+}
+
+/** Create a dict_table_t's stats latch or delay for lazy creation.
+This function is only called from either single threaded environment
+or from a thread that has not shared the table object with other threads.
+@param[in,out] table table whose stats latch to create
+@param[in] enabled if false then the latch is disabled
+and dict_table_stats_lock()/unlock() become noop on this table. */
+
+void
+dict_table_stats_latch_create(
+ dict_table_t* table,
+ bool enabled)
+{
+ if (!enabled) {
+ table->stats_latch = NULL;
+ table->stats_latch_created = os_once::DONE;
+ return;
+ }
+
+#ifdef HAVE_ATOMIC_BUILTINS
+ /* We create this lazily the first time it is used. */
+ table->stats_latch = NULL;
+ table->stats_latch_created = os_once::NEVER_DONE;
+#else /* HAVE_ATOMIC_BUILTINS */
+
+ dict_table_stats_latch_alloc(table);
+
+ table->stats_latch_created = os_once::DONE;
+#endif /* HAVE_ATOMIC_BUILTINS */
+}
+
+/** Destroy a dict_table_t's stats latch.
+This function is only called from either single threaded environment
+or from a thread that has not shared the table object with other threads.
+@param[in,out] table table whose stats latch to destroy */
+
+void
+dict_table_stats_latch_destroy(
+ dict_table_t* table)
+{
+ if (table->stats_latch_created == os_once::DONE
+ && table->stats_latch != NULL) {
+
+ dict_table_stats_latch_free(table);
+ }
+}
+
/**********************************************************************//**
Lock the appropriate latch to protect a given table's statistics. */
UNIV_INTERN
@@ -331,6 +413,14 @@ dict_table_stats_lock(
ut_ad(table != NULL);
ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+#ifdef HAVE_ATOMIC_BUILTINS
+ os_once::do_or_wait_for_done(
+ &table->stats_latch_created,
+ dict_table_stats_latch_alloc, table);
+#else /* HAVE_ATOMIC_BUILTINS */
+ ut_ad(table->stats_latch_created == os_once::DONE);
+#endif /* HAVE_ATOMIC_BUILTINS */
+
if (table->stats_latch == NULL) {
/* This is a dummy table object that is private in the current
thread and is not shared between multiple threads, thus we
@@ -5212,8 +5302,6 @@ dict_table_print(
index = UT_LIST_GET_NEXT(indexes, index);
}
- table->stat_initialized = FALSE;
-
dict_table_stats_unlock(table, RW_X_LATCH);
foreign = UT_LIST_GET_FIRST(table->foreign_list);
@@ -6016,14 +6104,34 @@ dict_table_schema_check(
table = dict_table_get_low(req_schema->table_name);
if (table == NULL) {
+ bool should_print=true;
/* no such table */
- ut_snprintf(errstr, errstr_sz,
- "Table %s not found.",
- ut_format_name(req_schema->table_name,
- TRUE, buf, sizeof(buf)));
+ if (innobase_strcasecmp(req_schema->table_name, "mysql/innodb_table_stats") == 0) {
+ if (innodb_table_stats_not_found_reported == false) {
+ innodb_table_stats_not_found = true;
+ innodb_table_stats_not_found_reported = true;
+ } else {
+ should_print = false;
+ }
+ } else if (innobase_strcasecmp(req_schema->table_name, "mysql/innodb_index_stats") == 0 ) {
+ if (innodb_index_stats_not_found_reported == false) {
+ innodb_index_stats_not_found = true;
+ innodb_index_stats_not_found_reported = true;
+ } else {
+ should_print = false;
+ }
+ }
- return(DB_TABLE_NOT_FOUND);
+ if (should_print) {
+ ut_snprintf(errstr, errstr_sz,
+ "Table %s not found.",
+ ut_format_name(req_schema->table_name,
+ TRUE, buf, sizeof(buf)));
+ return(DB_TABLE_NOT_FOUND);
+ } else {
+ return(DB_STATS_DO_NOT_EXIST);
+ }
}
if (table->ibd_file_missing) {
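Most of the dict0dict.cc changes above replace eager creation of table->stats_latch with lazy, race-free creation: dict_table_stats_lock() calls os_once::do_or_wait_for_done() with dict_table_stats_latch_alloc() the first time the latch is actually needed (on platforms with atomic builtins), and dict_table_stats_latch_create(table, false) lets the dummy table objects used by the stats code skip the latch entirely. Since os0once.h is only partially visible at the end of this page, the following is an illustrative analogue of the same do-or-wait pattern built on std::atomic rather than the real os_once class:

    #include <atomic>
    #include <thread>

    /* Illustrative stand-in for os_once: state goes NEVER_DONE -> IN_PROGRESS
    -> DONE.  Exactly one caller wins the compare-and-swap and runs the init
    function; every other caller spins (yielding) until the state is DONE. */
    struct once_sketch {
        enum state_t : unsigned { NEVER_DONE = 0, IN_PROGRESS = 1, DONE = 2 };

        static void do_or_wait_for_done(
            std::atomic<unsigned>*  state,
            void                    (*init_routine)(void*),
            void*                   arg)
        {
            unsigned expected = NEVER_DONE;

            if (state->compare_exchange_strong(expected, IN_PROGRESS)) {
                init_routine(arg);              /* we won: initialize once */
                state->store(DONE);
            } else {
                while (state->load() != DONE) { /* another thread is at work */
                    std::this_thread::yield();
                }
            }
        }
    };

In the patch the init routine is dict_table_stats_latch_alloc(), the state word is dict_table_t::stats_latch_created, and builds without HAVE_ATOMIC_BUILTINS keep creating the latch eagerly inside dict_table_stats_latch_create(), as before.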
diff --git a/storage/innobase/dict/dict0mem.cc b/storage/innobase/dict/dict0mem.cc
index 60daeea3a96..6310b2fd225 100644
--- a/storage/innobase/dict/dict0mem.cc
+++ b/storage/innobase/dict/dict0mem.cc
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 1996, 2014, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2012, Facebook Inc.
This program is free software; you can redistribute it and/or modify it under
@@ -95,9 +95,9 @@ dict_mem_table_create(
ut_d(table->magic_n = DICT_TABLE_MAGIC_N);
- table->stats_latch = new rw_lock_t;
- rw_lock_create(dict_table_stats_latch_key, table->stats_latch,
- SYNC_INDEX_TREE);
+ /* true means that the stats latch will be enabled -
+ dict_table_stats_lock() will not be noop. */
+ dict_table_stats_latch_create(table, true);
#ifndef UNIV_HOTBACKUP
table->autoinc_lock = static_cast<ib_lock_t*>(
@@ -154,8 +154,7 @@ dict_mem_table_free(
mutex_free(&(table->autoinc_mutex));
#endif /* UNIV_HOTBACKUP */
- rw_lock_free(table->stats_latch);
- delete table->stats_latch;
+ dict_table_stats_latch_destroy(table);
ut_free(table->name);
mem_heap_free(table->heap);
diff --git a/storage/innobase/dict/dict0stats.cc b/storage/innobase/dict/dict0stats.cc
index 928bdb3f2ef..1eac9e0df51 100644
--- a/storage/innobase/dict/dict0stats.cc
+++ b/storage/innobase/dict/dict0stats.cc
@@ -46,6 +46,7 @@ Created Jan 06, 2010 Vasil Dimov
#include "ut0rnd.h" /* ut_rnd_interval() */
#include "ut0ut.h" /* ut_format_name(), ut_time() */
+#include <algorithm>
#include <map>
#include <vector>
@@ -127,10 +128,11 @@ where n=1..n_uniq.
#endif /* UNIV_STATS_DEBUG */
/* Gets the number of leaf pages to sample in persistent stats estimation */
-#define N_SAMPLE_PAGES(index) \
- ((index)->table->stats_sample_pages != 0 ? \
- (index)->table->stats_sample_pages : \
- srv_stats_persistent_sample_pages)
+#define N_SAMPLE_PAGES(index) \
+ static_cast<ib_uint64_t>( \
+ (index)->table->stats_sample_pages != 0 \
+ ? (index)->table->stats_sample_pages \
+ : srv_stats_persistent_sample_pages)
/* number of distinct records on a given level that are required to stop
descending to lower levels and fetch N_SAMPLE_PAGES(index) records
@@ -268,10 +270,12 @@ dict_stats_persistent_storage_check(
mutex_exit(&(dict_sys->mutex));
}
- if (ret != DB_SUCCESS) {
+ if (ret != DB_SUCCESS && ret != DB_STATS_DO_NOT_EXIST) {
ut_print_timestamp(stderr);
fprintf(stderr, " InnoDB: Error: %s\n", errstr);
return(false);
+ } else if (ret == DB_STATS_DO_NOT_EXIST) {
+ return false;
}
/* else */
@@ -430,9 +434,9 @@ dict_stats_table_clone_create(
t->corrupted = table->corrupted;
/* This private object "t" is not shared with other threads, so
- we do not need the stats_latch. The lock/unlock routines will do
- nothing if stats_latch is NULL. */
- t->stats_latch = NULL;
+ we do not need the stats_latch (thus we pass false below). The
+ dict_table_stats_lock()/unlock() routines will do nothing. */
+ dict_table_stats_latch_create(t, false);
UT_LIST_INIT(t->indexes);
@@ -508,6 +512,7 @@ dict_stats_table_clone_free(
/*========================*/
dict_table_t* t) /*!< in: dummy table object to free */
{
+ dict_table_stats_latch_destroy(t);
mem_heap_free(t->heap);
}
@@ -1283,35 +1288,40 @@ enum page_scan_method_t {
};
/* @} */
-/*********************************************************************//**
-Scan a page, reading records from left to right and counting the number
-of distinct records on that page (looking only at the first n_prefix
-columns). If scan_method is QUIT_ON_FIRST_NON_BORING then the function
+/** Scan a page, reading records from left to right and counting the number
+of distinct records (looking only at the first n_prefix
+columns) and the number of external pages pointed by records from this page.
+If scan_method is QUIT_ON_FIRST_NON_BORING then the function
will return as soon as it finds a record that does not match its neighbor
to the right, which means that in the case of QUIT_ON_FIRST_NON_BORING the
returned n_diff can either be 0 (empty page), 1 (the whole page has all keys
equal) or 2 (the function found a non-boring record and returned).
+@param[out] out_rec record, or NULL
+@param[out] offsets1 rec_get_offsets() working space (must
+be big enough)
+@param[out] offsets2 rec_get_offsets() working space (must
+be big enough)
+@param[in] index index of the page
+@param[in] page the page to scan
+@param[in] n_prefix look at the first n_prefix columns
+@param[in] scan_method scan to the end of the page or not
+@param[out] n_diff number of distinct records encountered
+@param[out] n_external_pages if this is non-NULL then it will be set
+to the number of externally stored pages which were encountered
@return offsets1 or offsets2 (the offsets of *out_rec),
or NULL if the page is empty and does not contain user records. */
-UNIV_INLINE __attribute__((nonnull))
+UNIV_INLINE
ulint*
dict_stats_scan_page(
-/*=================*/
- const rec_t** out_rec, /*!< out: record, or NULL */
- ulint* offsets1, /*!< out: rec_get_offsets()
- working space (must be big
- enough) */
- ulint* offsets2, /*!< out: rec_get_offsets()
- working space (must be big
- enough) */
- dict_index_t* index, /*!< in: index of the page */
- const page_t* page, /*!< in: the page to scan */
- ulint n_prefix, /*!< in: look at the first
- n_prefix columns */
- page_scan_method_t scan_method, /*!< in: scan to the end of
- the page or not */
- ib_uint64_t* n_diff) /*!< out: number of distinct
- records encountered */
+ const rec_t** out_rec,
+ ulint* offsets1,
+ ulint* offsets2,
+ dict_index_t* index,
+ const page_t* page,
+ ulint n_prefix,
+ page_scan_method_t scan_method,
+ ib_uint64_t* n_diff,
+ ib_uint64_t* n_external_pages)
{
ulint* offsets_rec = offsets1;
ulint* offsets_next_rec = offsets2;
@@ -1329,6 +1339,12 @@ dict_stats_scan_page(
get_next = page_rec_get_next_const;
}
+ const bool should_count_external_pages = n_external_pages != NULL;
+
+ if (should_count_external_pages) {
+ *n_external_pages = 0;
+ }
+
rec = get_next(page_get_infimum_rec(page));
if (page_rec_is_supremum(rec)) {
@@ -1341,6 +1357,11 @@ dict_stats_scan_page(
offsets_rec = rec_get_offsets(rec, index, offsets_rec,
ULINT_UNDEFINED, &heap);
+ if (should_count_external_pages) {
+ *n_external_pages += btr_rec_get_externally_stored_len(
+ rec, offsets_rec);
+ }
+
next_rec = get_next(rec);
*n_diff = 1;
@@ -1391,6 +1412,11 @@ dict_stats_scan_page(
offsets_next_rec = offsets_tmp;
}
+ if (should_count_external_pages) {
+ *n_external_pages += btr_rec_get_externally_stored_len(
+ rec, offsets_rec);
+ }
+
next_rec = get_next(next_rec);
}
@@ -1401,19 +1427,25 @@ func_exit:
return(offsets_rec);
}
-/*********************************************************************//**
-Dive below the current position of a cursor and calculate the number of
+/** Dive below the current position of a cursor and calculate the number of
distinct records on the leaf page, when looking at the fist n_prefix
-columns.
+columns. Also calculate the number of external pages pointed by records
+on the leaf page.
+@param[in] cur cursor
+@param[in] n_prefix look at the first n_prefix columns
+when comparing records
+@param[out] n_diff number of distinct records
+@param[out] n_external_pages number of external pages
+@param[in,out] mtr mini-transaction
@return number of distinct records on the leaf page */
static
-ib_uint64_t
+void
dict_stats_analyze_index_below_cur(
-/*===============================*/
- const btr_cur_t*cur, /*!< in: cursor */
- ulint n_prefix, /*!< in: look at the first n_prefix
- columns when comparing records */
- mtr_t* mtr) /*!< in/out: mini-transaction */
+ const btr_cur_t* cur,
+ ulint n_prefix,
+ ib_uint64_t* n_diff,
+ ib_uint64_t* n_external_pages,
+ mtr_t* mtr)
{
dict_index_t* index;
ulint space;
@@ -1426,7 +1458,6 @@ dict_stats_analyze_index_below_cur(
ulint* offsets1;
ulint* offsets2;
ulint* offsets_rec;
- ib_uint64_t n_diff; /* the result */
ulint size;
index = btr_cur_get_index(cur);
@@ -1462,6 +1493,10 @@ dict_stats_analyze_index_below_cur(
page_no = btr_node_ptr_get_child_page_no(rec, offsets_rec);
+ /* assume no external pages by default - in case we quit from this
+ function without analyzing any leaf pages */
+ *n_external_pages = 0;
+
/* descend to the leaf level on the B-tree */
for (;;) {
@@ -1480,20 +1515,24 @@ dict_stats_analyze_index_below_cur(
/* search for the first non-boring record on the page */
offsets_rec = dict_stats_scan_page(
&rec, offsets1, offsets2, index, page, n_prefix,
- QUIT_ON_FIRST_NON_BORING, &n_diff);
+ QUIT_ON_FIRST_NON_BORING, n_diff, NULL);
/* pages on level > 0 are not allowed to be empty */
ut_a(offsets_rec != NULL);
/* if page is not empty (offsets_rec != NULL) then n_diff must
be > 0, otherwise there is a bug in dict_stats_scan_page() */
- ut_a(n_diff > 0);
+ ut_a(*n_diff > 0);
- if (n_diff == 1) {
+ if (*n_diff == 1) {
/* page has all keys equal and the end of the page
was reached by dict_stats_scan_page(), no need to
descend to the leaf level */
mem_heap_free(heap);
- return(1);
+ /* can't get an estimate for n_external_pages here
+ because we do not dive to the leaf level, assume no
+ external pages (*n_external_pages was assigned to 0
+ above). */
+ return;
}
/* else */
@@ -1501,7 +1540,7 @@ dict_stats_analyze_index_below_cur(
first non-boring record it finds, then the returned n_diff
can either be 0 (empty page), 1 (page has all keys equal) or
2 (non-boring record was found) */
- ut_a(n_diff == 2);
+ ut_a(*n_diff == 2);
/* we have a non-boring record in rec, descend below it */
@@ -1512,11 +1551,14 @@ dict_stats_analyze_index_below_cur(
ut_ad(btr_page_get_level(page, mtr) == 0);
/* scan the leaf page and find the number of distinct keys,
- when looking only at the first n_prefix columns */
+ when looking only at the first n_prefix columns; also estimate
+ the number of externally stored pages pointed by records on this
+ page */
offsets_rec = dict_stats_scan_page(
&rec, offsets1, offsets2, index, page, n_prefix,
- COUNT_ALL_NON_BORING_AND_SKIP_DEL_MARKED, &n_diff);
+ COUNT_ALL_NON_BORING_AND_SKIP_DEL_MARKED, n_diff,
+ n_external_pages);
#if 0
DEBUG_PRINTF(" %s(): n_diff below page_no=%lu: " UINT64PF "\n",
@@ -1524,133 +1566,146 @@ dict_stats_analyze_index_below_cur(
#endif
mem_heap_free(heap);
-
- return(n_diff);
}
-/*********************************************************************//**
-For a given level in an index select N_SAMPLE_PAGES(index)
-(or less) records from that level and dive below them to the corresponding
-leaf pages, then scan those leaf pages and save the sampling results in
-index->stat_n_diff_key_vals[n_prefix - 1] and the number of pages scanned in
-index->stat_n_sample_sizes[n_prefix - 1]. */
+/** Input data that is used to calculate dict_index_t::stat_n_diff_key_vals[]
+for each n-columns prefix (n from 1 to n_uniq). */
+struct n_diff_data_t {
+ /** Index of the level on which the descent through the btree
+ stopped. level 0 is the leaf level. This is >= 1 because we
+ avoid scanning the leaf level because it may contain too many
+ pages and doing so is useless when combined with the random dives -
+ if we are to scan the leaf level, this means a full scan and we can
+ simply do that instead of fiddling with picking random records higher
+ in the tree and to dive below them. At the start of the analyzing
+ we may decide to do full scan of the leaf level, but then this
+ structure is not used in that code path. */
+ ulint level;
+
+ /** Number of records on the level where the descend through the btree
+ stopped. When we scan the btree from the root, we stop at some mid
+ level, choose some records from it and dive below them towards a leaf
+ page to analyze. */
+ ib_uint64_t n_recs_on_level;
+
+ /** Number of different key values that were found on the mid level. */
+ ib_uint64_t n_diff_on_level;
+
+ /** Number of leaf pages that are analyzed. This is also the same as
+ the number of records that we pick from the mid level and dive below
+ them. */
+ ib_uint64_t n_leaf_pages_to_analyze;
+
+ /** Cumulative sum of the number of different key values that were
+ found on all analyzed pages. */
+ ib_uint64_t n_diff_all_analyzed_pages;
+
+ /** Cumulative sum of the number of external pages (stored outside of
+ the btree but in the same file segment). */
+ ib_uint64_t n_external_pages_sum;
+};
+
+/** Estimate the number of different key values in an index when looking at
+the first n_prefix columns. For a given level in an index select
+n_diff_data->n_leaf_pages_to_analyze records from that level and dive below
+them to the corresponding leaf pages, then scan those leaf pages and save the
+sampling results in n_diff_data->n_diff_all_analyzed_pages.
+@param[in] index index
+@param[in] n_prefix look at first 'n_prefix' columns when
+comparing records
+@param[in] boundaries a vector that contains
+n_diff_data->n_diff_on_level integers each of which represents the index (on
+level 'level', counting from left/smallest to right/biggest from 0) of the
+last record from each group of distinct keys
+@param[in,out] n_diff_data n_diff_all_analyzed_pages and
+n_external_pages_sum in this structure will be set by this function. The
+members level, n_diff_on_level and n_leaf_pages_to_analyze must be set by the
+caller in advance - they are used by some calculations inside this function
+@param[in,out] mtr mini-transaction */
static
void
dict_stats_analyze_index_for_n_prefix(
-/*==================================*/
- dict_index_t* index, /*!< in/out: index */
- ulint level, /*!< in: level, must be >= 1 */
- ib_uint64_t total_recs_on_level,
- /*!< in: total number of
- records on the given level */
- ulint n_prefix, /*!< in: look at first
- n_prefix columns when
- comparing records */
- ib_uint64_t n_diff_for_this_prefix,
- /*!< in: number of distinct
- records on the given level,
- when looking at the first
- n_prefix columns */
- boundaries_t* boundaries, /*!< in: array that contains
- n_diff_for_this_prefix
- integers each of which
- represents the index (on the
- level, counting from
- left/smallest to right/biggest
- from 0) of the last record
- from each group of distinct
- keys */
- mtr_t* mtr) /*!< in/out: mini-transaction */
+ dict_index_t* index,
+ ulint n_prefix,
+ const boundaries_t* boundaries,
+ n_diff_data_t* n_diff_data,
+ mtr_t* mtr)
{
btr_pcur_t pcur;
const page_t* page;
ib_uint64_t rec_idx;
- ib_uint64_t last_idx_on_level;
- ib_uint64_t n_recs_to_dive_below;
- ib_uint64_t n_diff_sum_of_all_analyzed_pages;
ib_uint64_t i;
#if 0
DEBUG_PRINTF(" %s(table=%s, index=%s, level=%lu, n_prefix=%lu, "
- "n_diff_for_this_prefix=" UINT64PF ")\n",
+ "n_diff_on_level=" UINT64PF ")\n",
__func__, index->table->name, index->name, level,
- n_prefix, n_diff_for_this_prefix);
+ n_prefix, n_diff_data->n_diff_on_level);
#endif
ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
MTR_MEMO_S_LOCK));
- /* if some of those is 0 then this means that there is exactly one
- page in the B-tree and it is empty and we should have done full scan
- and should not be here */
- ut_ad(total_recs_on_level > 0);
- ut_ad(n_diff_for_this_prefix > 0);
-
- /* this must be at least 1 */
- ut_ad(N_SAMPLE_PAGES(index) > 0);
-
/* Position pcur on the leftmost record on the leftmost page
on the desired level. */
btr_pcur_open_at_index_side(
true, index, BTR_SEARCH_LEAF | BTR_ALREADY_S_LATCHED,
- &pcur, true, level, mtr);
+ &pcur, true, n_diff_data->level, mtr);
btr_pcur_move_to_next_on_page(&pcur);
page = btr_pcur_get_page(&pcur);
+ const rec_t* first_rec = btr_pcur_get_rec(&pcur);
+
+ /* We shouldn't be scanning the leaf level. The caller of this function
+ should have stopped the descend on level 1 or higher. */
+ ut_ad(n_diff_data->level > 0);
+ ut_ad(!page_is_leaf(page));
+
/* The page must not be empty, except when
it is the root page (and the whole index is empty). */
- ut_ad(btr_pcur_is_on_user_rec(&pcur) || page_is_leaf(page));
- ut_ad(btr_pcur_get_rec(&pcur)
- == page_rec_get_next_const(page_get_infimum_rec(page)));
+ ut_ad(btr_pcur_is_on_user_rec(&pcur));
+ ut_ad(first_rec == page_rec_get_next_const(page_get_infimum_rec(page)));
/* check that we are indeed on the desired level */
- ut_a(btr_page_get_level(page, mtr) == level);
+ ut_a(btr_page_get_level(page, mtr) == n_diff_data->level);
/* there should not be any pages on the left */
ut_a(btr_page_get_prev(page, mtr) == FIL_NULL);
/* check whether the first record on the leftmost page is marked
- as such, if we are on a non-leaf level */
- ut_a((level == 0)
- == !(REC_INFO_MIN_REC_FLAG & rec_get_info_bits(
- btr_pcur_get_rec(&pcur), page_is_comp(page))));
+ as such; we are on a non-leaf level */
+ ut_a(rec_get_info_bits(first_rec, page_is_comp(page))
+ & REC_INFO_MIN_REC_FLAG);
- last_idx_on_level = boundaries->at(
- static_cast<unsigned int>(n_diff_for_this_prefix - 1));
+ const ib_uint64_t last_idx_on_level = boundaries->at(
+ static_cast<unsigned>(n_diff_data->n_diff_on_level - 1));
rec_idx = 0;
- n_diff_sum_of_all_analyzed_pages = 0;
-
- n_recs_to_dive_below = ut_min(N_SAMPLE_PAGES(index),
- n_diff_for_this_prefix);
-
- for (i = 0; i < n_recs_to_dive_below; i++) {
- ib_uint64_t left;
- ib_uint64_t right;
- ib_uint64_t rnd;
- ib_uint64_t dive_below_idx;
+ n_diff_data->n_diff_all_analyzed_pages = 0;
+ n_diff_data->n_external_pages_sum = 0;
- /* there are n_diff_for_this_prefix elements
+ for (i = 0; i < n_diff_data->n_leaf_pages_to_analyze; i++) {
+ /* there are n_diff_on_level elements
in 'boundaries' and we divide those elements
- into n_recs_to_dive_below segments, for example:
+ into n_leaf_pages_to_analyze segments, for example:
- let n_diff_for_this_prefix=100, n_recs_to_dive_below=4, then:
+ let n_diff_on_level=100, n_leaf_pages_to_analyze=4, then:
segment i=0: [0, 24]
segment i=1: [25, 49]
segment i=2: [50, 74]
segment i=3: [75, 99] or
- let n_diff_for_this_prefix=1, n_recs_to_dive_below=1, then:
+ let n_diff_on_level=1, n_leaf_pages_to_analyze=1, then:
segment i=0: [0, 0] or
- let n_diff_for_this_prefix=2, n_recs_to_dive_below=2, then:
+ let n_diff_on_level=2, n_leaf_pages_to_analyze=2, then:
segment i=0: [0, 0]
segment i=1: [1, 1] or
- let n_diff_for_this_prefix=13, n_recs_to_dive_below=7, then:
+ let n_diff_on_level=13, n_leaf_pages_to_analyze=7, then:
segment i=0: [0, 0]
segment i=1: [1, 2]
segment i=2: [3, 4]
@@ -1661,9 +1716,12 @@ dict_stats_analyze_index_for_n_prefix(
then we select a random record from each segment and dive
below it */
- left = n_diff_for_this_prefix * i / n_recs_to_dive_below;
- right = n_diff_for_this_prefix * (i + 1)
- / n_recs_to_dive_below - 1;
+ const ib_uint64_t n_diff = n_diff_data->n_diff_on_level;
+ const ib_uint64_t n_pick
+ = n_diff_data->n_leaf_pages_to_analyze;
+
+ const ib_uint64_t left = n_diff * i / n_pick;
+ const ib_uint64_t right = n_diff * (i + 1) / n_pick - 1;
ut_a(left <= right);
ut_a(right <= last_idx_on_level);
@@ -1671,11 +1729,11 @@ dict_stats_analyze_index_for_n_prefix(
/* we do not pass (left, right) because we do not want to ask
ut_rnd_interval() to work with too big numbers since
ib_uint64_t could be bigger than ulint */
- rnd = static_cast<ib_uint64_t>(
- ut_rnd_interval(0, static_cast<ulint>(right - left)));
+ const ulint rnd = ut_rnd_interval(
+ 0, static_cast<ulint>(right - left));
- dive_below_idx = boundaries->at(
- static_cast<unsigned int>(left + rnd));
+ const ib_uint64_t dive_below_idx
+ = boundaries->at(static_cast<unsigned>(left + rnd));
#if 0
DEBUG_PRINTF(" %s(): dive below record with index="
@@ -1711,9 +1769,13 @@ dict_stats_analyze_index_for_n_prefix(
ut_a(rec_idx == dive_below_idx);
ib_uint64_t n_diff_on_leaf_page;
+ ib_uint64_t n_external_pages;
- n_diff_on_leaf_page = dict_stats_analyze_index_below_cur(
- btr_pcur_get_btr_cur(&pcur), n_prefix, mtr);
+ dict_stats_analyze_index_below_cur(btr_pcur_get_btr_cur(&pcur),
+ n_prefix,
+ &n_diff_on_leaf_page,
+ &n_external_pages,
+ mtr);
/* We adjust n_diff_on_leaf_page here to avoid counting
one record twice - once as the last on some page and once
@@ -1733,37 +1795,86 @@ dict_stats_analyze_index_for_n_prefix(
n_diff_on_leaf_page--;
}
- n_diff_sum_of_all_analyzed_pages += n_diff_on_leaf_page;
- }
-
- /* n_diff_sum_of_all_analyzed_pages can be 0 here if all the leaf
- pages sampled contained only delete-marked records. In this case
- we should assign 0 to index->stat_n_diff_key_vals[n_prefix - 1], which
- the formula below does. */
+ n_diff_data->n_diff_all_analyzed_pages += n_diff_on_leaf_page;
- /* See REF01 for an explanation of the algorithm */
- index->stat_n_diff_key_vals[n_prefix - 1]
- = index->stat_n_leaf_pages
-
- * n_diff_for_this_prefix
- / total_recs_on_level
-
- * n_diff_sum_of_all_analyzed_pages
- / n_recs_to_dive_below;
+ n_diff_data->n_external_pages_sum += n_external_pages;
+ }
- index->stat_n_sample_sizes[n_prefix - 1] = n_recs_to_dive_below;
+ btr_pcur_close(&pcur);
+}
- DEBUG_PRINTF(" %s(): n_diff=" UINT64PF " for n_prefix=%lu "
- "(%lu"
- " * " UINT64PF " / " UINT64PF
- " * " UINT64PF " / " UINT64PF ")\n",
- __func__, index->stat_n_diff_key_vals[n_prefix - 1],
- n_prefix,
- index->stat_n_leaf_pages,
- n_diff_for_this_prefix, total_recs_on_level,
- n_diff_sum_of_all_analyzed_pages, n_recs_to_dive_below);
+/** Set dict_index_t::stat_n_diff_key_vals[] and stat_n_sample_sizes[].
+@param[in] n_diff_data input data to use to derive the results
+@param[in,out] index index whose stat_n_diff_key_vals[] to set */
+UNIV_INLINE
+void
+dict_stats_index_set_n_diff(
+ const n_diff_data_t* n_diff_data,
+ dict_index_t* index)
+{
+ for (ulint n_prefix = dict_index_get_n_unique(index);
+ n_prefix >= 1;
+ n_prefix--) {
+ /* n_diff_all_analyzed_pages can be 0 here if
+ all the leaf pages sampled contained only
+ delete-marked records. In this case we should assign
+ 0 to index->stat_n_diff_key_vals[n_prefix - 1], which
+ the formula below does. */
+
+ const n_diff_data_t* data = &n_diff_data[n_prefix - 1];
+
+ ut_ad(data->n_leaf_pages_to_analyze > 0);
+ ut_ad(data->n_recs_on_level > 0);
+
+ ulint n_ordinary_leaf_pages;
+
+ if (data->level == 1) {
+ /* If we know the number of records on level 1, then
+ this number is the same as the number of pages on
+ level 0 (leaf). */
+ n_ordinary_leaf_pages = data->n_recs_on_level;
+ } else {
+ /* If we analyzed D ordinary leaf pages and found E
+ external pages in total linked from those D ordinary
+ leaf pages, then this means that the ratio
+ ordinary/external is D/E. Then the ratio ordinary/total
+ is D / (D + E). Knowing that the total number of pages
+ is T (including ordinary and external) then we estimate
+ that the total number of ordinary leaf pages is
+ T * D / (D + E). */
+ n_ordinary_leaf_pages
+ = index->stat_n_leaf_pages
+ * data->n_leaf_pages_to_analyze
+ / (data->n_leaf_pages_to_analyze
+ + data->n_external_pages_sum);
+ }
- btr_pcur_close(&pcur);
+ /* See REF01 for an explanation of the algorithm */
+ index->stat_n_diff_key_vals[n_prefix - 1]
+ = n_ordinary_leaf_pages
+
+ * data->n_diff_on_level
+ / data->n_recs_on_level
+
+ * data->n_diff_all_analyzed_pages
+ / data->n_leaf_pages_to_analyze;
+
+ index->stat_n_sample_sizes[n_prefix - 1]
+ = data->n_leaf_pages_to_analyze;
+
+ DEBUG_PRINTF(" %s(): n_diff=" UINT64PF " for n_prefix=%lu"
+ " (%lu"
+ " * " UINT64PF " / " UINT64PF
+ " * " UINT64PF " / " UINT64PF ")\n",
+ __func__,
+ index->stat_n_diff_key_vals[n_prefix - 1],
+ n_prefix,
+ index->stat_n_leaf_pages,
+ data->n_diff_on_level,
+ data->n_recs_on_level,
+ data->n_diff_all_analyzed_pages,
+ data->n_leaf_pages_to_analyze);
+ }
}
/*********************************************************************//**
@@ -1781,10 +1892,8 @@ dict_stats_analyze_index(
bool level_is_analyzed;
ulint n_uniq;
ulint n_prefix;
- ib_uint64_t* n_diff_on_level;
ib_uint64_t total_recs;
ib_uint64_t total_pages;
- boundaries_t* n_diff_boundaries;
mtr_t mtr;
ulint size;
DBUG_ENTER("dict_stats_analyze_index");
@@ -1870,11 +1979,18 @@ dict_stats_analyze_index(
DBUG_VOID_RETURN;
}
- /* set to zero */
- n_diff_on_level = reinterpret_cast<ib_uint64_t*>
- (mem_zalloc(n_uniq * sizeof(ib_uint64_t)));
+ /* For each level that is being scanned in the btree, this contains the
+ number of different key values for all possible n-column prefixes. */
+ ib_uint64_t* n_diff_on_level = new ib_uint64_t[n_uniq];
- n_diff_boundaries = new boundaries_t[n_uniq];
+ /* For each level that is being scanned in the btree, this contains the
+ index of the last record from each group of equal records (when
+ comparing only the first n columns, n=1..n_uniq). */
+ boundaries_t* n_diff_boundaries = new boundaries_t[n_uniq];
+
+ /* For each n-column prefix this array contains the input data that is
+ used to calculate dict_index_t::stat_n_diff_key_vals[]. */
+ n_diff_data_t* n_diff_data = new n_diff_data_t[n_uniq];
/* total_recs is also used to estimate the number of pages on one
level below, so at the start we have 1 page (the root) */
@@ -1986,12 +2102,12 @@ dict_stats_analyze_index(
level_is_analyzed = true;
- if (n_diff_on_level[n_prefix - 1]
- >= N_DIFF_REQUIRED(index)
- || level == 1) {
- /* we found a good level with many distinct
- records or we have reached the last level we
- could scan */
+ if (level == 1
+ || n_diff_on_level[n_prefix - 1]
+ >= N_DIFF_REQUIRED(index)) {
+ /* we have reached the last level we could scan
+ or we found a good level with many distinct
+ records */
break;
}
@@ -2004,7 +2120,6 @@ found_level:
" distinct records for n_prefix=%lu\n",
__func__, level, n_diff_on_level[n_prefix - 1],
n_prefix);
-
/* here we are either on level 1 or the level that we are on
contains >= N_DIFF_REQUIRED distinct keys or we did not scan
deeper levels because they would contain too many pages */
@@ -2013,20 +2128,47 @@ found_level:
ut_ad(level_is_analyzed);
+ /* if any of these is 0 then there is exactly one page in the
+ B-tree and it is empty and we should have done full scan and
+ should not be here */
+ ut_ad(total_recs > 0);
+ ut_ad(n_diff_on_level[n_prefix - 1] > 0);
+
+ ut_ad(N_SAMPLE_PAGES(index) > 0);
+
+ n_diff_data_t* data = &n_diff_data[n_prefix - 1];
+
+ data->level = level;
+
+ data->n_recs_on_level = total_recs;
+
+ data->n_diff_on_level = n_diff_on_level[n_prefix - 1];
+
+ data->n_leaf_pages_to_analyze = std::min(
+ N_SAMPLE_PAGES(index),
+ n_diff_on_level[n_prefix - 1]);
+
/* pick some records from this level and dive below them for
the given n_prefix */
dict_stats_analyze_index_for_n_prefix(
- index, level, total_recs, n_prefix,
- n_diff_on_level[n_prefix - 1],
- &n_diff_boundaries[n_prefix - 1], &mtr);
+ index, n_prefix, &n_diff_boundaries[n_prefix - 1],
+ data, &mtr);
}
mtr_commit(&mtr);
delete[] n_diff_boundaries;
- mem_free(n_diff_on_level);
+ delete[] n_diff_on_level;
+
+ /* n_prefix == 0 means that the above loop did not end up prematurely
+ due to tree being changed and so n_diff_data[] is set up. */
+ if (n_prefix == 0) {
+ dict_stats_index_set_n_diff(n_diff_data, index);
+ }
+
+ delete[] n_diff_data;
dict_stats_assert_initialized_index(index);
DBUG_VOID_RETURN;
@@ -2201,17 +2343,21 @@ dict_stats_save_index_stat(
"END;", trx);
if (ret != DB_SUCCESS) {
- char buf_table[MAX_FULL_NAME_LEN];
- char buf_index[MAX_FULL_NAME_LEN];
- ut_print_timestamp(stderr);
- fprintf(stderr,
- " InnoDB: Cannot save index statistics for table "
- "%s, index %s, stat name \"%s\": %s\n",
- ut_format_name(index->table->name, TRUE,
- buf_table, sizeof(buf_table)),
- ut_format_name(index->name, FALSE,
- buf_index, sizeof(buf_index)),
- stat_name, ut_strerr(ret));
+ if (innodb_index_stats_not_found == false &&
+ index->stats_error_printed == false) {
+ char buf_table[MAX_FULL_NAME_LEN];
+ char buf_index[MAX_FULL_NAME_LEN];
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Cannot save index statistics for table "
+ "%s, index %s, stat name \"%s\": %s\n",
+ ut_format_name(index->table->name, TRUE,
+ buf_table, sizeof(buf_table)),
+ ut_format_name(index->name, FALSE,
+ buf_index, sizeof(buf_index)),
+ stat_name, ut_strerr(ret));
+ index->stats_error_printed = true;
+ }
}
return(ret);
@@ -2900,20 +3046,24 @@ dict_stats_update_for_index(
}
/* else */
- /* Fall back to transient stats since the persistent
- storage is not present or is corrupted */
- char buf_table[MAX_FULL_NAME_LEN];
- char buf_index[MAX_FULL_NAME_LEN];
- ut_print_timestamp(stderr);
- fprintf(stderr,
- " InnoDB: Recalculation of persistent statistics "
- "requested for table %s index %s but the required "
- "persistent statistics storage is not present or is "
- "corrupted. Using transient stats instead.\n",
- ut_format_name(index->table->name, TRUE,
- buf_table, sizeof(buf_table)),
- ut_format_name(index->name, FALSE,
- buf_index, sizeof(buf_index)));
+ if (innodb_index_stats_not_found == false &&
+ index->stats_error_printed == false) {
+ /* Fall back to transient stats since the persistent
+ storage is not present or is corrupted */
+ char buf_table[MAX_FULL_NAME_LEN];
+ char buf_index[MAX_FULL_NAME_LEN];
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Recalculation of persistent statistics "
+ "requested for table %s index %s but the required "
+ "persistent statistics storage is not present or is "
+ "corrupted. Using transient stats instead.\n",
+ ut_format_name(index->table->name, TRUE,
+ buf_table, sizeof(buf_table)),
+ ut_format_name(index->name, FALSE,
+ buf_index, sizeof(buf_index)));
+ index->stats_error_printed = false;
+ }
}
dict_table_stats_lock(index->table, RW_X_LATCH);
@@ -2998,13 +3148,17 @@ dict_stats_update(
/* Fall back to transient stats since the persistent
storage is not present or is corrupted */
- ut_print_timestamp(stderr);
- fprintf(stderr,
- " InnoDB: Recalculation of persistent statistics "
- "requested for table %s but the required persistent "
- "statistics storage is not present or is corrupted. "
- "Using transient stats instead.\n",
- ut_format_name(table->name, TRUE, buf, sizeof(buf)));
+ if (innodb_table_stats_not_found == false &&
+ table->stats_error_printed == false) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Recalculation of persistent statistics "
+ "requested for table %s but the required persistent "
+ "statistics storage is not present or is corrupted. "
+ "Using transient stats instead.\n",
+ ut_format_name(table->name, TRUE, buf, sizeof(buf)));
+ table->stats_error_printed = true;
+ }
goto transient;
@@ -3048,17 +3202,21 @@ dict_stats_update(
/* persistent statistics storage does not exist
or is corrupted, calculate the transient stats */
- ut_print_timestamp(stderr);
- fprintf(stderr,
- " InnoDB: Error: Fetch of persistent "
- "statistics requested for table %s but the "
- "required system tables %s and %s are not "
- "present or have unexpected structure. "
- "Using transient stats instead.\n",
- ut_format_name(table->name, TRUE,
- buf, sizeof(buf)),
- TABLE_STATS_NAME_PRINT,
- INDEX_STATS_NAME_PRINT);
+ if (innodb_table_stats_not_found == false &&
+ table->stats_error_printed == false) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Error: Fetch of persistent "
+ "statistics requested for table %s but the "
+ "required system tables %s and %s are not "
+ "present or have unexpected structure. "
+ "Using transient stats instead.\n",
+ ut_format_name(table->name, TRUE,
+ buf, sizeof(buf)),
+ TABLE_STATS_NAME_PRINT,
+ INDEX_STATS_NAME_PRINT);
+ table->stats_error_printed = true;
+ }
goto transient;
}
@@ -3128,16 +3286,19 @@ dict_stats_update(
dict_stats_table_clone_free(t);
- ut_print_timestamp(stderr);
- fprintf(stderr,
- " InnoDB: Error fetching persistent statistics "
- "for table %s from %s and %s: %s. "
- "Using transient stats method instead.\n",
- ut_format_name(table->name, TRUE, buf,
- sizeof(buf)),
- TABLE_STATS_NAME,
- INDEX_STATS_NAME,
- ut_strerr(err));
+ if (innodb_table_stats_not_found == false &&
+ table->stats_error_printed == false) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Error fetching persistent statistics "
+ "for table %s from %s and %s: %s. "
+ "Using transient stats method instead.\n",
+ ut_format_name(table->name, TRUE, buf,
+ sizeof(buf)),
+ TABLE_STATS_NAME,
+ INDEX_STATS_NAME,
+ ut_strerr(err));
+ }
goto transient;
}
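The reworked sampling above collects, per n-column prefix, the level where the descent through the B-tree stopped, the record and distinct-key counts on that level, the number of leaf pages dived into, and the externally stored (BLOB) pages those leaf pages point to; dict_stats_index_set_n_diff() then scales the per-page counts up to the whole index. A small worked example of that final computation, with field names following n_diff_data_t and purely invented numbers:

    #include <cstdint>
    #include <cstdio>

    int main()
    {
        /* Made-up sample; assumes the descent stopped above level 1 (at
        level 1 the patch uses n_recs_on_level directly as the leaf count). */
        uint64_t stat_n_leaf_pages         = 10000; /* T: leaf pages, incl. BLOB pages */
        uint64_t n_leaf_pages_to_analyze   = 20;    /* D: ordinary leaf pages dived into */
        uint64_t n_external_pages_sum      = 5;     /* E: BLOB pages linked from those D */
        uint64_t n_diff_on_level           = 400;   /* distinct keys on the stop level */
        uint64_t n_recs_on_level           = 1000;  /* records on the stop level */
        uint64_t n_diff_all_analyzed_pages = 1500;  /* distinct keys found on the D pages */

        /* Estimated ordinary (non-BLOB) leaf pages: T * D / (D + E). */
        uint64_t n_ordinary_leaf_pages = stat_n_leaf_pages
            * n_leaf_pages_to_analyze
            / (n_leaf_pages_to_analyze + n_external_pages_sum);

        /* REF01-style estimate stored in stat_n_diff_key_vals[n_prefix - 1]. */
        uint64_t n_diff = n_ordinary_leaf_pages
            * n_diff_on_level / n_recs_on_level
            * n_diff_all_analyzed_pages / n_leaf_pages_to_analyze;

        printf("ordinary leaf pages ~ %llu, estimated n_diff ~ %llu\n",
               (unsigned long long) n_ordinary_leaf_pages,
               (unsigned long long) n_diff);
        /* prints: ordinary leaf pages ~ 8000, estimated n_diff ~ 240000 */
        return 0;
    }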
diff --git a/storage/innobase/fil/fil0fil.cc b/storage/innobase/fil/fil0fil.cc
index 81fcba47812..f4e5721caa7 100644
--- a/storage/innobase/fil/fil0fil.cc
+++ b/storage/innobase/fil/fil0fil.cc
@@ -758,7 +758,7 @@ fil_node_open_file(
fprintf(stderr,
"InnoDB: Error: the size of single-table"
" tablespace file %s\n"
- "InnoDB: is only "UINT64PF","
+ "InnoDB: is only " UINT64PF ","
" should be at least %lu!\n",
node->name,
size_bytes,
@@ -5725,7 +5725,7 @@ fil_io(
ret = os_aio(type, mode | wake_later, node->name, node->handle, buf,
offset, len, node, message);
#endif /* UNIV_HOTBACKUP */
- ut_a(ret);
+
if (mode == OS_AIO_SYNC) {
/* The i/o operation is already completed when we return from
@@ -5740,7 +5740,10 @@ fil_io(
ut_ad(fil_validate_skip());
}
- return(DB_SUCCESS);
+ if (!ret) {
+ return(DB_OUT_OF_FILE_SPACE);
+ } else {
+ return(DB_SUCCESS);
+ }
}
#ifndef UNIV_HOTBACKUP
diff --git a/storage/innobase/fts/fts0fts.cc b/storage/innobase/fts/fts0fts.cc
index 4a667686795..f503cc487b7 100644
--- a/storage/innobase/fts/fts0fts.cc
+++ b/storage/innobase/fts/fts0fts.cc
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 2011, 2013, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2011, 2014, Oracle and/or its affiliates. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -44,6 +44,13 @@ Full Text Search interface
/** Column name from the FTS config table */
#define FTS_MAX_CACHE_SIZE_IN_MB "cache_size_in_mb"
+/** Verify if a aux table name is a obsolete table
+by looking up the key word in the obsolete table names */
+#define FTS_IS_OBSOLETE_AUX_TABLE(table_name) \
+ (strstr((table_name), "DOC_ID") != NULL \
+ || strstr((table_name), "ADDED") != NULL \
+ || strstr((table_name), "STOPWORDS") != NULL)
+
/** This is maximum FTS cache for each table and would be
a configurable variable */
UNIV_INTERN ulong fts_max_cache_size;
@@ -5837,6 +5844,12 @@ fts_is_aux_table_name(
}
}
+ /* Could be obsolete common tables. */
+ if (strncmp(ptr, "ADDED", len) == 0
+ || strncmp(ptr, "STOPWORDS", len) == 0) {
+ return(true);
+ }
+
/* Try and read the index id. */
if (!fts_read_object_id(&table->index_id, ptr)) {
return(FALSE);
@@ -6433,6 +6446,56 @@ fts_check_and_drop_orphaned_tables(
mem_free(path);
}
+ } else {
+ if (FTS_IS_OBSOLETE_AUX_TABLE(aux_table->name)) {
+
+ /* Current table could be one of the three
+ obsolete tables, in this case, we should
+ always try to drop it but not rename it.
+ This could happen when we try to upgrade
+ from older server to later one, which doesn't
+ contain these obsolete tables. */
+ drop = true;
+
+ dberr_t err;
+ trx_t* trx_drop =
+ trx_allocate_for_background();
+
+ trx_drop->op_info = "Drop obsolete aux tables";
+ trx_drop->dict_operation_lock_mode = RW_X_LATCH;
+
+ trx_start_for_ddl(trx_drop, TRX_DICT_OP_TABLE);
+
+ err = row_drop_table_for_mysql(
+ aux_table->name, trx_drop, false, true);
+
+ trx_drop->dict_operation_lock_mode = 0;
+
+ if (err != DB_SUCCESS) {
+ /* We don't need to worry about the
+ failure, since server would try to
+ drop it on next restart, even if
+ the table was broken. */
+
+ ib_logf(IB_LOG_LEVEL_WARN,
+ "Fail to drop obsolete aux"
+ " table '%s', which is"
+ " harmless. will try to drop"
+ " it on next restart.",
+ aux_table->name);
+
+ fts_sql_rollback(trx_drop);
+ } else {
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "Dropped obsolete aux"
+ " table '%s'.",
+ aux_table->name);
+
+ fts_sql_commit(trx_drop);
+ }
+
+ trx_free_for_background(trx_drop);
+ }
}
#ifdef _WIN32
if (!drop && rename) {
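The branch above lets a server that no longer creates the DOC_ID/ADDED/STOPWORDS auxiliary tables drop such leftovers from an older installation instead of renaming them during the orphaned-table check. A tiny standalone illustration of the substring test behind FTS_IS_OBSOLETE_AUX_TABLE (the sample table names are invented):

    #include <cstdio>
    #include <cstring>

    /* Same substring test as FTS_IS_OBSOLETE_AUX_TABLE in the patch. */
    static bool is_obsolete_fts_aux(const char* table_name)
    {
        return(strstr(table_name, "DOC_ID") != NULL
               || strstr(table_name, "ADDED") != NULL
               || strstr(table_name, "STOPWORDS") != NULL);
    }

    int main()
    {
        /* Invented names, just to show the classification. */
        const char* samples[] = {
            "test/FTS_0000000000000123_ADDED",
            "test/FTS_0000000000000123_BEING_DELETED",
        };

        for (const char* name : samples) {
            printf("%-45s -> %s\n", name,
                   is_obsolete_fts_aux(name) ? "drop" : "rename/keep");
        }

        return(0);
    }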
diff --git a/storage/innobase/fts/fts0opt.cc b/storage/innobase/fts/fts0opt.cc
index a9f3a25530d..910a00cd521 100644
--- a/storage/innobase/fts/fts0opt.cc
+++ b/storage/innobase/fts/fts0opt.cc
@@ -95,7 +95,7 @@ enum fts_msg_type_t {
/** Compressed list of words that have been read from FTS INDEX
that needs to be optimized. */
struct fts_zip_t {
- ulint status; /*!< Status of (un)/zip operation */
+ lint status; /*!< Status of (un)/zip operation */
ulint n_words; /*!< Number of words compressed */
diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc
index 79c994a78a0..a33d9a1d5bb 100644
--- a/storage/innobase/handler/ha_innodb.cc
+++ b/storage/innobase/handler/ha_innodb.cc
@@ -4,6 +4,7 @@ Copyright (c) 2000, 2014, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2008, 2009 Google Inc.
Copyright (c) 2009, Percona Inc.
Copyright (c) 2012, Facebook Inc.
+Copyright (c) 2013, 2014 SkySQL Ab. All Rights Reserved.
Portions of this file contain modifications contributed and copyrighted by
Google, Inc. Those modifications are gratefully acknowledged and are described
@@ -432,7 +433,7 @@ static PSI_rwlock_info all_innodb_rwlocks[] = {
{&trx_purge_latch_key, "trx_purge_latch", 0},
{&index_tree_rw_lock_key, "index_tree_rw_lock", 0},
{&index_online_log_key, "index_online_log", 0},
- {&dict_table_stats_latch_key, "dict_table_stats", 0},
+ {&dict_table_stats_key, "dict_table_stats", 0},
{&hash_table_rw_lock_key, "hash_table_locks", 0}
};
# endif /* UNIV_PFS_RWLOCK */
@@ -3504,6 +3505,14 @@ innobase_end(
if (innodb_inited) {
+ THD *thd= current_thd;
+ if (thd) { // may be UNINSTALL PLUGIN statement
+ trx_t* trx = thd_to_trx(thd);
+ if (trx) {
+ trx_free_for_mysql(trx);
+ }
+ }
+
srv_fast_shutdown = (ulint) innobase_fast_shutdown;
innodb_inited = 0;
@@ -4254,7 +4263,7 @@ innobase_close_connection(
sql_print_warning(
"MySQL is closing a connection that has an active "
- "InnoDB transaction. "TRX_ID_FMT" row modifications "
+ "InnoDB transaction. " TRX_ID_FMT " row modifications "
"will roll back.",
trx->undo_no);
}
@@ -4317,16 +4326,23 @@ innobase_kill_query(
#endif /* WITH_WSREP */
trx = thd_to_trx(thd);
- if (trx)
- {
- /* Cancel a pending lock request. */
- lock_mutex_enter();
- trx_mutex_enter(trx);
- if (trx->lock.wait_lock)
- lock_cancel_waiting_and_release(trx->lock.wait_lock);
- trx_mutex_exit(trx);
- lock_mutex_exit();
- }
+ if (trx) {
+ THD *cur = current_thd;
+ THD *owner = trx->current_lock_mutex_owner;
+
+ /* Cancel a pending lock request. */
+ if (owner != cur) {
+ lock_mutex_enter();
+ }
+ trx_mutex_enter(trx);
+ if (trx->lock.wait_lock) {
+ lock_cancel_waiting_and_release(trx->lock.wait_lock);
+ }
+ trx_mutex_exit(trx);
+ if (owner != cur) {
+ lock_mutex_exit();
+ }
+ }
DBUG_VOID_RETURN;
}
@@ -4373,14 +4389,11 @@ handler::Table_flags
ha_innobase::table_flags() const
/*============================*/
{
- THD *thd = ha_thd();
/* Need to use tx_isolation here since table flags is (also)
called before prebuilt is inited. */
- ulong const tx_isolation = thd_tx_isolation(thd);
+ ulong const tx_isolation = thd_tx_isolation(ha_thd());
- if (tx_isolation <= ISO_READ_COMMITTED &&
- !(tx_isolation == ISO_READ_COMMITTED &&
- thd_rpl_is_parallel(thd))) {
+ if (tx_isolation <= ISO_READ_COMMITTED) {
return(int_table_flags);
}
@@ -7871,7 +7884,7 @@ calc_row_difference(
if (doc_id < prebuilt->table->fts->cache->next_doc_id) {
fprintf(stderr,
"InnoDB: FTS Doc ID must be larger than"
- " "IB_ID_FMT" for table",
+ " " IB_ID_FMT " for table",
innodb_table->fts->cache->next_doc_id
- 1);
ut_print_name(stderr, trx,
@@ -7883,9 +7896,9 @@ calc_row_difference(
- prebuilt->table->fts->cache->next_doc_id)
>= FTS_DOC_ID_MAX_STEP) {
fprintf(stderr,
- "InnoDB: Doc ID "UINT64PF" is too"
+ "InnoDB: Doc ID " UINT64PF " is too"
" big. Its difference with largest"
- " Doc ID used "UINT64PF" cannot"
+ " Doc ID used " UINT64PF " cannot"
" exceed or equal to %d\n",
doc_id,
prebuilt->table->fts->cache->next_doc_id - 1,
@@ -8625,6 +8638,29 @@ ha_innobase::innobase_get_index(
index = innobase_index_lookup(share, keynr);
if (index) {
+ if (!key || ut_strcmp(index->name, key->name) != 0) {
+ fprintf(stderr, "InnoDB: [Error] Index for key no %u"
+ " mysql name %s , InnoDB name %s for table %s\n",
+ keynr, key ? key->name : "NULL",
+ index->name,
+ prebuilt->table->name);
+
+ for(ulint i=0; i < table->s->keys; i++) {
+ index = innobase_index_lookup(share, i);
+ key = table->key_info + keynr;
+
+ if (index) {
+
+ fprintf(stderr, "InnoDB: [Note] Index for key no %u"
+ " mysql name %s , InnoDB name %s for table %s\n",
+ keynr, key ? key->name : "NULL",
+ index->name,
+ prebuilt->table->name);
+ }
+ }
+
+ }
+
ut_a(ut_strcmp(index->name, key->name) == 0);
} else {
/* Can't find index with keynr in the translation
@@ -12501,6 +12537,34 @@ ha_innobase::info_low(
break;
}
+ DBUG_EXECUTE_IF("ib_ha_innodb_stat_not_initialized",
+ index->table->stat_initialized = FALSE;);
+
+ if (!ib_table->stat_initialized ||
+ (index->table != ib_table ||
+ !index->table->stat_initialized)) {
+ fprintf(stderr,
+ "InnoDB: Warning: Index %s points to table %s" " and ib_table %s statistics is initialized %d "
+ " but index table %s initialized %d "
+ " mysql table is %s. Have you mixed "
+ "up .frm files from different "
+ "installations? "
+ "See " REFMAN
+ "innodb-troubleshooting.html\n",
+ index->name,
+ index->table->name,
+ ib_table->name,
+ ib_table->stat_initialized,
+ index->table->name,
+ index->table->stat_initialized,
+ table->s->table_name.str
+ );
+
+ /* This is better than
+ assert on below function */
+ dict_stats_init(index->table);
+ }
+
rec_per_key = innodb_rec_per_key(
index, j, stats.records);
@@ -18191,6 +18255,11 @@ static MYSQL_SYSVAR_ULONG(saved_page_number_debug,
NULL, innodb_save_page_no, 0, 0, UINT_MAX32, 0);
#endif /* UNIV_DEBUG */
+static MYSQL_SYSVAR_UINT(simulate_comp_failures, srv_simulate_comp_failures,
+ PLUGIN_VAR_NOCMDARG,
+ "Simulate compression failures.",
+ NULL, NULL, 0, 0, 99, 0);
+
static struct st_mysql_sys_var* innobase_system_variables[]= {
MYSQL_SYSVAR(additional_mem_pool_size),
MYSQL_SYSVAR(api_trx_level),
@@ -18351,6 +18420,7 @@ static struct st_mysql_sys_var* innobase_system_variables[]= {
MYSQL_SYSVAR(fil_make_page_dirty_debug),
MYSQL_SYSVAR(saved_page_number_debug),
#endif /* UNIV_DEBUG */
+ MYSQL_SYSVAR(simulate_comp_failures),
NULL
};
@@ -18680,7 +18750,7 @@ ib_senderrf(
va_start(args, code);
- myf l;
+ myf l=0;
switch(level) {
case IB_LOG_LEVEL_INFO:
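Two of the ha_innodb.cc changes above are easy to miss: myf l is now initialized before the switch in ib_senderrf(), and innobase_kill_query() only takes the global lock mutex when the thread issuing the kill is not already holding it, tracked through trx->current_lock_mutex_owner (maintained elsewhere in this merge, presumably by the lock0lock.cc changes), which avoids a self-deadlock on the kill path. The conditional-acquisition pattern, reduced to plain std::mutex as a sketch rather than the real InnoDB locking code:

    #include <mutex>

    struct lock_sys_sketch {
        std::mutex  mutex;
        const void* owner_tag;  /* set by whoever currently holds 'mutex' */
    };

    /* Cancel a pending wait on behalf of 'killer'.  The global mutex is
    acquired only if 'killer' does not already hold it. */
    static void kill_query_sketch(
        lock_sys_sketch&    lock_sys,
        const void*         killer,
        void                (*cancel_waiting)())
    {
        const bool already_owner = (lock_sys.owner_tag == killer);

        if (!already_owner) {
            lock_sys.mutex.lock();
        }

        cancel_waiting();           /* runs under the mutex either way */

        if (!already_owner) {
            lock_sys.mutex.unlock();
        }
    }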
diff --git a/storage/innobase/include/btr0cur.h b/storage/innobase/include/btr0cur.h
index 833166e783c..f1e4406fcf7 100644
--- a/storage/innobase/include/btr0cur.h
+++ b/storage/innobase/include/btr0cur.h
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1994, 2013, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 1994, 2014, Oracle and/or its affiliates. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -576,6 +576,17 @@ void
btr_estimate_number_of_different_key_vals(
/*======================================*/
dict_index_t* index); /*!< in: index */
+
+/** Gets the externally stored size of a record, in units of a database page.
+@param[in] rec record
+@param[in] offsets array returned by rec_get_offsets()
+@return externally stored part, in units of a database page */
+
+ulint
+btr_rec_get_externally_stored_len(
+ const rec_t* rec,
+ const ulint* offsets);
+
/*******************************************************************//**
Marks non-updated off-page fields as disowned by this record. The ownership
must be transferred to the updated record which is inserted elsewhere in the
diff --git a/storage/innobase/include/dict0dict.h b/storage/innobase/include/dict0dict.h
index ce709a2e912..026187b2000 100644
--- a/storage/innobase/include/dict0dict.h
+++ b/storage/innobase/include/dict0dict.h
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 1996, 2014, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2012, Facebook Inc.
This program is free software; you can redistribute it and/or modify it under
@@ -43,6 +43,9 @@ Created 1/8/1996 Heikki Tuuri
#include "trx0types.h"
#include "row0types.h"
+extern bool innodb_table_stats_not_found;
+extern bool innodb_index_stats_not_found;
+
#ifndef UNIV_HOTBACKUP
# include "sync0sync.h"
# include "sync0rw.h"
@@ -1435,6 +1438,28 @@ UNIV_INTERN
void
dict_mutex_exit_for_mysql(void);
/*===========================*/
+
+/** Create a dict_table_t's stats latch or delay for lazy creation.
+This function is only called from either single threaded environment
+or from a thread that has not shared the table object with other threads.
+@param[in,out] table table whose stats latch to create
+@param[in] enabled if false then the latch is disabled
+and dict_table_stats_lock()/unlock() become noop on this table. */
+
+void
+dict_table_stats_latch_create(
+ dict_table_t* table,
+ bool enabled);
+
+/** Destroy a dict_table_t's stats latch.
+This function is only called from either single threaded environment
+or from a thread that has not shared the table object with other threads.
+@param[in,out] table table whose stats latch to destroy */
+
+void
+dict_table_stats_latch_destroy(
+ dict_table_t* table);
+
/**********************************************************************//**
Lock the appropriate latch to protect a given table's statistics.
table->id is used to pick the corresponding latch from a global array of
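A minimal standalone sketch of the lazy-creation pattern the two declarations above enable: the table carries a creation state next to the latch pointer, and whichever thread first needs the latch creates it while everybody else waits. std::atomic and std::mutex stand in for the InnoDB primitives, and lazy_latch_t / lazy_latch_get() are hypothetical names used only to illustrate the idea, not the actual dict0dict implementation:

    #include <atomic>
    #include <mutex>

    /* Hypothetical stand-in for the stats_latch bookkeeping declared above. */
    struct lazy_latch_t {
        std::atomic<int> state{0};      /* 0 = never, 1 = creating, 2 = done */
        std::mutex*      latch{nullptr};
    };

    /* Create the latch exactly once, however many threads race to use it. */
    static std::mutex* lazy_latch_get(lazy_latch_t* l)
    {
        int expected = 0;

        if (l->state.compare_exchange_strong(expected, 1)) {
            /* We won the race: create the latch and publish it. */
            l->latch = new std::mutex();
            l->state.store(2);
        } else {
            /* Another thread is creating it (or already has): wait. */
            while (l->state.load() != 2) {
                /* spin; InnoDB relaxes the CPU here */
            }
        }
        return(l->latch);
    }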
diff --git a/storage/innobase/include/dict0mem.h b/storage/innobase/include/dict0mem.h
index c5ed8d92cb0..0e3981a2946 100644
--- a/storage/innobase/include/dict0mem.h
+++ b/storage/innobase/include/dict0mem.h
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 1996, 2014, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2012, Facebook Inc.
This program is free software; you can redistribute it and/or modify it under
@@ -46,6 +46,7 @@ Created 1/8/1996 Heikki Tuuri
#include "hash0hash.h"
#include "trx0types.h"
#include "fts0fts.h"
+#include "os0once.h"
/* Forward declaration. */
struct ib_rbt_t;
@@ -627,6 +628,9 @@ struct dict_index_t{
ulint stat_n_leaf_pages;
/*!< approximate number of leaf pages in the
index tree */
+ bool stats_error_printed;
+ /*!< has a persistent statistics error already
+ been printed for this index? */
/* @} */
rw_lock_t lock; /*!< read-write lock protecting the
upper levels of the index tree */
@@ -842,6 +846,10 @@ struct dict_table_t{
initialized in dict_table_add_to_cache() */
/** Statistics for query optimization */
/* @{ */
+
+ volatile os_once::state_t stats_latch_created;
+ /*!< Creation state of 'stats_latch'. */
+
rw_lock_t* stats_latch; /*!< this latch protects:
dict_table_t::stat_initialized
dict_table_t::stat_n_rows (*)
@@ -950,6 +958,9 @@ struct dict_table_t{
/*!< see BG_STAT_* above.
Writes are covered by dict_sys->mutex.
Dirty reads are possible. */
+ bool stats_error_printed;
+ /*!< Has a persistent stats error already
+ been printed for this table? */
/* @} */
/*----------------------*/
/**!< The following fields are used by the
diff --git a/storage/innobase/include/lock0lock.h b/storage/innobase/include/lock0lock.h
index 385853bdb68..88246afebdc 100644
--- a/storage/innobase/include/lock0lock.h
+++ b/storage/innobase/include/lock0lock.h
@@ -289,7 +289,7 @@ lock_rec_insert_check_and_lock(
inserted record maybe should inherit
LOCK_GAP type locks from the successor
record */
- __attribute__((nonnull, warn_unused_result));
+ __attribute__((nonnull(2,3,4,6,7), warn_unused_result));
/*********************************************************************//**
Checks if locks of other transactions prevent an immediate modify (update,
delete mark, or delete unmark) of a clustered index record. If they do,
diff --git a/storage/innobase/include/os0once.h b/storage/innobase/include/os0once.h
new file mode 100644
index 00000000000..a8bbaf1d2d4
--- /dev/null
+++ b/storage/innobase/include/os0once.h
@@ -0,0 +1,125 @@
+/*****************************************************************************
+
+Copyright (c) 2014, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/os0once.h
+A class that aids executing a given function exactly once in a multi-threaded
+environment.
+
+Created Feb 20, 2014 Vasil Dimov
+*******************************************************/
+
+#ifndef os0once_h
+#define os0once_h
+
+#include "univ.i"
+
+#include "os0sync.h"
+#include "ut0ut.h"
+
+/** Execute a given function exactly once in a multi-threaded environment
+or wait for the function to be executed by another thread.
+
+Example usage:
+First the user must create a control variable of type os_once::state_t and
+assign it os_once::NEVER_DONE.
+Then the user must pass this variable, together with a function to be
+executed, to os_once::do_or_wait_for_done().
+
+Multiple threads can call os_once::do_or_wait_for_done() simultaneously with
+the same (os_once::state_t) control variable. The provided function will be
+called exactly once, and when os_once::do_or_wait_for_done() returns, this
+function has completed execution, by this or another thread. In other words,
+os_once::do_or_wait_for_done() will either execute the provided function,
+wait for its execution to complete if another thread is already calling it,
+or do nothing if the function has already completed its execution earlier.
+
+This mimics pthread_once(3), but unfortunately pthread_once(3) does not
+support passing arguments to the init_routine() function. We should use
+std::call_once() when we start compiling with C++11 enabled. */
+class os_once {
+public:
+ /** Control variables' state type */
+ typedef ib_uint32_t state_t;
+
+ /** Not yet executed. */
+ static const state_t NEVER_DONE = 0;
+
+ /** Currently being executed by this or another thread. */
+ static const state_t IN_PROGRESS = 1;
+
+ /** Finished execution. */
+ static const state_t DONE = 2;
+
+#ifdef HAVE_ATOMIC_BUILTINS
+ /** Call a given function, or wait for its execution to complete if it
+ is already being called by another thread.
+ @param[in,out] state control variable
+ @param[in] do_func function to call
+ @param[in,out] do_func_arg an argument to pass to do_func(). */
+ static
+ void
+ do_or_wait_for_done(
+ volatile state_t* state,
+ void (*do_func)(void*),
+ void* do_func_arg)
+ {
+ /* Avoid calling os_compare_and_swap_uint32() in the most
+ common case. */
+ if (*state == DONE) {
+ return;
+ }
+
+ if (os_compare_and_swap_uint32(state,
+ NEVER_DONE, IN_PROGRESS)) {
+ /* We are the first. Call the function. */
+
+ do_func(do_func_arg);
+
+ const bool swapped = os_compare_and_swap_uint32(
+ state, IN_PROGRESS, DONE);
+
+ ut_a(swapped);
+ } else {
+ /* The state is not NEVER_DONE, so either it is
+ IN_PROGRESS (somebody is calling the function right
+ now) or DONE (it has already been called and completed).
+ Wait for it to become DONE. */
+ for (;;) {
+ const state_t s = *state;
+
+ switch (s) {
+ case DONE:
+ return;
+ case IN_PROGRESS:
+ break;
+ case NEVER_DONE:
+ /* fall through */
+ default:
+ ut_error;
+ }
+
+ UT_RELAX_CPU();
+ }
+ }
+ }
+#endif /* HAVE_ATOMIC_BUILTINS */
+};
+
+#endif /* os0once_h */
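A short usage sketch for the class above, following the recipe in its own header comment: a control variable initialized to os_once::NEVER_DONE plus an init function taking a void* argument. The init_point_t struct and its functions are made-up names for illustration, and the sketch assumes HAVE_ATOMIC_BUILTINS so that do_or_wait_for_done() is compiled in:

    #include "os0once.h"

    /* Hypothetical object whose expensive part must be set up exactly once. */
    struct init_point_t {
        volatile os_once::state_t init_state; /* set to os_once::NEVER_DONE
                                              at object creation */
        int                       value;
    };

    /* Callback for os_once; receives the object through the void* argument. */
    static void init_point(void* arg)
    {
        init_point_t* p = static_cast<init_point_t*>(arg);

        p->value = 42;  /* pretend this is the expensive initialization */
    }

    static int get_value(init_point_t* p)
    {
        /* Any number of threads may call this concurrently; init_point()
        runs exactly once and every other caller waits for it to finish. */
        os_once::do_or_wait_for_done(&p->init_state, init_point, p);

        return(p->value);
    }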
diff --git a/storage/innobase/include/os0sync.h b/storage/innobase/include/os0sync.h
index 9b4ce2343c5..6d3dd850e08 100644
--- a/storage/innobase/include/os0sync.h
+++ b/storage/innobase/include/os0sync.h
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 1995, 2014, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2008, Google Inc.
Portions of this file contain modifications contributed and copyrighted by
@@ -434,6 +434,9 @@ Returns the old value of *ptr, atomically sets *ptr to new_val */
# define os_atomic_test_and_set_ulint(ptr, new_val) \
__sync_lock_test_and_set(ptr, new_val)
+# define os_atomic_lock_release_byte(ptr) \
+ __sync_lock_release(ptr)
+
#elif defined(HAVE_IB_SOLARIS_ATOMICS)
# define HAVE_ATOMIC_BUILTINS
@@ -515,6 +518,9 @@ Returns the old value of *ptr, atomically sets *ptr to new_val */
# define os_atomic_test_and_set_ulint(ptr, new_val) \
atomic_swap_ulong(ptr, new_val)
+# define os_atomic_lock_release_byte(ptr) \
+ (void) atomic_swap_uchar(ptr, 0)
+
#elif defined(HAVE_WINDOWS_ATOMICS)
# define HAVE_ATOMIC_BUILTINS
@@ -574,7 +580,8 @@ Returns true if swapped, ptr is pointer to target, old_val is value to
compare to, new_val is the value to swap in. */
# define os_compare_and_swap_uint32(ptr, old_val, new_val) \
- (win_cmp_and_xchg_dword(ptr, new_val, old_val) == old_val)
+ (InterlockedCompareExchange(reinterpret_cast<volatile long*>(ptr), \
+ new_val, old_val) == old_val)
# define os_compare_and_swap_ulint(ptr, old_val, new_val) \
(win_cmp_and_xchg_ulint(ptr, new_val, old_val) == old_val)
@@ -637,6 +644,9 @@ clobbered */
# define os_atomic_test_and_set_ulong(ptr, new_val) \
InterlockedExchange(ptr, new_val)
+# define os_atomic_lock_release_byte(ptr) \
+ (void) InterlockedExchange(ptr, 0)
+
#else
# define IB_ATOMICS_STARTUP_MSG \
"Mutexes and rw_locks use InnoDB's own implementation"
diff --git a/storage/innobase/include/srv0srv.h b/storage/innobase/include/srv0srv.h
index 7922b14cc86..2b58e0717fb 100644
--- a/storage/innobase/include/srv0srv.h
+++ b/storage/innobase/include/srv0srv.h
@@ -3,6 +3,7 @@
Copyright (c) 1995, 2013, Oracle and/or its affiliates. All rights reserved.
Copyright (c) 2008, 2009, Google Inc.
Copyright (c) 2009, Percona Inc.
+Copyright (c) 2013, 2014, SkySQL Ab. All Rights Reserved.
Portions of this file contain modifications contributed and copyrighted by
Google, Inc. Those modifications are gratefully acknowledged and are described
@@ -453,6 +454,9 @@ extern struct export_var_t export_vars;
/** Global counters */
extern srv_stats_t srv_stats;
+/** Simulate compression failures. */
+extern uint srv_simulate_comp_failures;
+
# ifdef UNIV_PFS_THREAD
/* Keys to register InnoDB threads with performance schema */
extern mysql_pfs_key_t buf_page_cleaner_thread_key;
diff --git a/storage/innobase/include/sync0rw.h b/storage/innobase/include/sync0rw.h
index 34cd8ef4bd6..b36e04f2810 100644
--- a/storage/innobase/include/sync0rw.h
+++ b/storage/innobase/include/sync0rw.h
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 1995, 2014, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2008, Google Inc.
Portions of this file contain modifications contributed and copyrighted by
@@ -108,14 +108,8 @@ extern ib_mutex_t rw_lock_list_mutex;
#ifdef UNIV_SYNC_DEBUG
/* The global mutex which protects debug info lists of all rw-locks.
To modify the debug info list of an rw-lock, this mutex has to be
-
acquired in addition to the mutex protecting the lock. */
-extern ib_mutex_t rw_lock_debug_mutex;
-extern os_event_t rw_lock_debug_event; /*!< If deadlock detection does
- not get immediately the mutex it
- may wait for this event */
-extern ibool rw_lock_debug_waiters; /*!< This is set to TRUE, if
- there may be waiters for the event */
+extern os_fast_mutex_t rw_lock_debug_mutex;
#endif /* UNIV_SYNC_DEBUG */
/** Counters for RW locks. */
@@ -141,7 +135,7 @@ extern mysql_pfs_key_t trx_i_s_cache_lock_key;
extern mysql_pfs_key_t trx_purge_latch_key;
extern mysql_pfs_key_t index_tree_rw_lock_key;
extern mysql_pfs_key_t index_online_log_key;
-extern mysql_pfs_key_t dict_table_stats_latch_key;
+extern mysql_pfs_key_t dict_table_stats_key;
extern mysql_pfs_key_t trx_sys_rw_lock_key;
extern mysql_pfs_key_t hash_table_rw_lock_key;
#endif /* UNIV_PFS_RWLOCK */
diff --git a/storage/innobase/include/sync0sync.ic b/storage/innobase/include/sync0sync.ic
index f34f3f90b63..cb6f6efbed8 100644
--- a/storage/innobase/include/sync0sync.ic
+++ b/storage/innobase/include/sync0sync.ic
@@ -108,10 +108,7 @@ mutex_reset_lock_word(
ib_mutex_t* mutex) /*!< in: mutex */
{
#if defined(HAVE_ATOMIC_BUILTINS)
- /* In theory __sync_lock_release should be used to release the lock.
- Unfortunately, it does not work properly alone. The workaround is
- that more conservative __sync_lock_test_and_set is used instead. */
- os_atomic_test_and_set_byte(&mutex->lock_word, 0);
+ os_atomic_lock_release_byte(&mutex->lock_word);
#else
mutex->lock_word = 0;
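With the change above (see the sync0sync.ic hunk), the mutex is released with a plain release store instead of a second atomic exchange. A rough, self-contained illustration of how the two GCC builtins pair up in a minimal test-and-set spinlock; this is a simplified sketch assuming a GCC-compatible compiler, not InnoDB's mutex:

    /* Minimal test-and-set spinlock built on the same GCC builtins. */
    static unsigned char lock_word = 0;

    static void spin_lock()
    {
        /* __sync_lock_test_and_set() stores 1 and returns the previous
        value, with acquire semantics: loop while someone else holds it. */
        while (__sync_lock_test_and_set(&lock_word, 1)) {
            /* busy wait */
        }
    }

    static void spin_unlock()
    {
        /* __sync_lock_release() stores 0 with release semantics, which is
        all that is needed to hand the lock to the next acquirer. */
        __sync_lock_release(&lock_word);
    }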
diff --git a/storage/innobase/include/trx0trx.h b/storage/innobase/include/trx0trx.h
index a30bbdbebb2..7c92445b796 100644
--- a/storage/innobase/include/trx0trx.h
+++ b/storage/innobase/include/trx0trx.h
@@ -992,6 +992,11 @@ struct trx_t{
count of tables being flushed. */
/*------------------------------*/
+ THD* current_lock_mutex_owner;
+ /*!< If this is equal to current_thd,
+ then in innobase_kill_query() we know we
+ already hold the lock_sys->mutex. */
+ /*------------------------------*/
#ifdef UNIV_DEBUG
ulint start_line; /*!< Track where it was started from */
const char* start_file; /*!< Filename where it was started */
diff --git a/storage/innobase/include/univ.i b/storage/innobase/include/univ.i
index 98c5512bd0b..bc359746a0b 100644
--- a/storage/innobase/include/univ.i
+++ b/storage/innobase/include/univ.i
@@ -44,7 +44,7 @@ Created 1/20/1994 Heikki Tuuri
#define INNODB_VERSION_MAJOR 5
#define INNODB_VERSION_MINOR 6
-#define INNODB_VERSION_BUGFIX 17
+#define INNODB_VERSION_BUGFIX 19
/* The following is the InnoDB version as shown in
SELECT plugin_version FROM information_schema.plugins;
@@ -439,10 +439,10 @@ typedef unsigned __int64 ib_uint64_t;
typedef unsigned __int32 ib_uint32_t;
#else
/* Use the integer types and formatting strings defined in the C99 standard. */
-# define UINT32PF "%"PRIu32
-# define INT64PF "%"PRId64
-# define UINT64PF "%"PRIu64
-# define UINT64PFx "%016"PRIx64
+# define UINT32PF "%" PRIu32
+# define INT64PF "%" PRId64
+# define UINT64PF "%" PRIu64
+# define UINT64PFx "%016" PRIx64
# define DBUG_LSN_PF UINT64PF
typedef int64_t ib_int64_t;
typedef uint64_t ib_uint64_t;
diff --git a/storage/innobase/lock/lock0lock.cc b/storage/innobase/lock/lock0lock.cc
index f99c34294cd..659b2e5b62a 100644
--- a/storage/innobase/lock/lock0lock.cc
+++ b/storage/innobase/lock/lock0lock.cc
@@ -49,6 +49,7 @@ Created 5/7/1996 Heikki Tuuri
#include "btr0btr.h"
#include "dict0boot.h"
#include <set>
+#include "mysql/plugin.h"
#ifdef WITH_WSREP
extern my_bool wsrep_debug;
@@ -378,6 +379,11 @@ struct lock_stack_t {
ulint heap_no; /*!< heap number if rec lock */
};
+extern "C" void thd_report_wait_for(const MYSQL_THD thd, MYSQL_THD other_thd);
+extern "C" int thd_need_wait_for(const MYSQL_THD thd);
+extern "C"
+int thd_need_ordering_with(const MYSQL_THD thd, const MYSQL_THD other_thd);
+
/** Stack to use during DFS search. Currently only a single stack is required
because there is no parallel deadlock check. This stack is protected by
the lock_sys_t::mutex. */
@@ -393,6 +399,14 @@ UNIV_INTERN mysql_pfs_key_t lock_sys_mutex_key;
UNIV_INTERN mysql_pfs_key_t lock_sys_wait_mutex_key;
#endif /* UNIV_PFS_MUTEX */
+/* Buffer to collect THDs to report waits for. */
+struct thd_wait_reports {
+ struct thd_wait_reports *next; /*!< List link */
+ ulint used; /*!< How many elements in waitees[] */
+ trx_t *waitees[64]; /*!< Trxs for thd_report_wait_for() */
+};
+
+
#ifdef UNIV_DEBUG
UNIV_INTERN ibool lock_print_waits = FALSE;
@@ -1023,6 +1037,32 @@ lock_rec_has_to_wait(
return(FALSE);
}
+ if ((type_mode & LOCK_GAP || lock_rec_get_gap(lock2)) &&
+ !thd_need_ordering_with(trx->mysql_thd,
+ lock2->trx->mysql_thd)) {
+ /* If the upper server layer has already decided on the
+ commit order between the transaction requesting the
+ lock and the transaction owning the lock, we do not
+ need to wait for gap locks. Such ordering by the upper
+ server layer happens in parallel replication, where the
+ commit order is fixed to match the original order on the
+ master.
+
+ Such gap locks are mainly needed to get serialisability
+ between transactions, so that they are binlogged in
+ the correct order and statement-based replication
+ gives the correct results. Since the right order
+ was already determined on the master, we do not need
+ to enforce it again here.
+
+ Skipping the locks is not essential for correctness,
+ since in case of deadlock we will just kill the later
+ transaction and retry it. But it can save some
+ unnecessary rollbacks and retries. */
+
+ return (FALSE);
+ }
+
#ifdef WITH_WSREP
/* if BF thread is locking and has conflict with another BF
thread, we need to look at trx ordering and lock types */
@@ -4069,7 +4109,8 @@ static
trx_id_t
lock_deadlock_search(
/*=================*/
- lock_deadlock_ctx_t* ctx) /*!< in/out: deadlock context */
+ lock_deadlock_ctx_t* ctx, /*!< in/out: deadlock context */
+ struct thd_wait_reports*waitee_ptr) /*!< in/out: list of waitees */
{
const lock_t* lock;
ulint heap_no;
@@ -4149,38 +4190,59 @@ lock_deadlock_search(
/* Select the joining transaction as the victim. */
return(ctx->start->id);
- } else if (lock->trx->lock.que_state == TRX_QUE_LOCK_WAIT) {
+ } else {
+ /* We do not need to report autoinc locks to the upper
+ layer. These locks are released before commit, so they
+ can not cause deadlocks with binlog-fixed commit
+ order. */
+ if (waitee_ptr &&
+ (lock_get_type_low(lock) != LOCK_TABLE ||
+ lock_get_mode(lock) != LOCK_AUTO_INC)) {
+ if (waitee_ptr->used ==
+ sizeof(waitee_ptr->waitees) /
+ sizeof(waitee_ptr->waitees[0])) {
+ waitee_ptr->next =
+ (struct thd_wait_reports *)
+ mem_alloc(sizeof(*waitee_ptr));
+ waitee_ptr = waitee_ptr->next;
+ if (!waitee_ptr) {
+ ctx->too_deep = TRUE;
+ return(ctx->start->id);
+ }
+ waitee_ptr->next = NULL;
+ waitee_ptr->used = 0;
+ }
+ waitee_ptr->waitees[waitee_ptr->used++] = lock->trx;
+ }
- /* Another trx ahead has requested a lock in an
- incompatible mode, and is itself waiting for a lock. */
+ if (lock->trx->lock.que_state == TRX_QUE_LOCK_WAIT) {
- ++ctx->cost;
+ /* Another trx ahead has requested a lock in an
+ incompatible mode, and is itself waiting for a lock. */
- /* Save current search state. */
- if (!lock_deadlock_push(ctx, lock, heap_no)) {
+ ++ctx->cost;
- /* Unable to save current search state, stack
- size not big enough. */
+ /* Save current search state. */
+ if (!lock_deadlock_push(ctx, lock, heap_no)) {
- ctx->too_deep = TRUE;
+ /* Unable to save current search state, stack
+ size not big enough. */
+
+ ctx->too_deep = TRUE;
-#ifdef WITH_WSREP
- if (wsrep_thd_is_BF(ctx->start->mysql_thd, TRUE))
- return(lock->trx->id);
- else
-#endif /* WITH_WSREP */
return(ctx->start->id);
- }
+ }
- ctx->wait_lock = lock->trx->lock.wait_lock;
- lock = lock_get_first_lock(ctx, &heap_no);
+ ctx->wait_lock = lock->trx->lock.wait_lock;
+ lock = lock_get_first_lock(ctx, &heap_no);
- if (lock->trx->lock.deadlock_mark > ctx->mark_start) {
+ if (lock->trx->lock.deadlock_mark > ctx->mark_start) {
+ lock = lock_get_next_lock(ctx, lock, heap_no);
+ }
+
+ } else {
lock = lock_get_next_lock(ctx, lock, heap_no);
}
-
- } else {
- lock = lock_get_next_lock(ctx, lock, heap_no);
}
}
@@ -4245,6 +4307,48 @@ lock_deadlock_trx_rollback(
trx_mutex_exit(trx);
}
+static
+void
+lock_report_waiters_to_mysql(
+/*=======================*/
+ struct thd_wait_reports* waitee_buf_ptr, /*!< in: set of trxs */
+ THD* mysql_thd, /*!< in: THD */
+ trx_id_t victim_trx_id) /*!< in: Trx selected
+ as deadlock victim, if
+ any */
+{
+ struct thd_wait_reports* p;
+ struct thd_wait_reports* q;
+ ulint i;
+
+ p = waitee_buf_ptr;
+ while (p) {
+ i = 0;
+ while (i < p->used) {
+ trx_t *w_trx = p->waitees[i];
+ /* There is no need to report waits to a trx already
+ selected as a victim. */
+ if (w_trx->id != victim_trx_id) {
+ /* If thd_report_wait_for() decides to kill the
+ transaction, then we will get a call back into
+ innobase_kill_query. We mark this by setting
+ current_lock_mutex_owner, so we can avoid trying
+ to recursively take lock_sys->mutex. */
+ w_trx->current_lock_mutex_owner = mysql_thd;
+ thd_report_wait_for(mysql_thd, w_trx->mysql_thd);
+ w_trx->current_lock_mutex_owner = NULL;
+ }
+ ++i;
+ }
+ q = p->next;
+ if (p != waitee_buf_ptr) {
+ mem_free(p);
+ }
+ p = q;
+ }
+}
+
+
/********************************************************************//**
Checks if a joining lock request results in a deadlock. If a deadlock is
found, this function will resolve the deadlock by choosing a victim transaction
@@ -4260,13 +4364,23 @@ lock_deadlock_check_and_resolve(
const lock_t* lock, /*!< in: lock the transaction is requesting */
const trx_t* trx) /*!< in: transaction */
{
- trx_id_t victim_trx_id;
+ trx_id_t victim_trx_id;
+ struct thd_wait_reports waitee_buf;
+ struct thd_wait_reports*waitee_buf_ptr;
+ THD* start_mysql_thd;
ut_ad(trx != NULL);
ut_ad(lock != NULL);
ut_ad(lock_mutex_own());
assert_trx_in_list(trx);
+ start_mysql_thd = trx->mysql_thd;
+ if (start_mysql_thd && thd_need_wait_for(start_mysql_thd)) {
+ waitee_buf_ptr = &waitee_buf;
+ } else {
+ waitee_buf_ptr = NULL;
+ }
+
/* Try and resolve as many deadlocks as possible. */
do {
lock_deadlock_ctx_t ctx;
@@ -4279,7 +4393,19 @@ lock_deadlock_check_and_resolve(
ctx.wait_lock = lock;
ctx.mark_start = lock_mark_counter;
- victim_trx_id = lock_deadlock_search(&ctx);
+ if (waitee_buf_ptr) {
+ waitee_buf_ptr->next = NULL;
+ waitee_buf_ptr->used = 0;
+ }
+
+ victim_trx_id = lock_deadlock_search(&ctx, waitee_buf_ptr);
+
+ /* Report waits to upper layer, as needed. */
+ if (waitee_buf_ptr) {
+ lock_report_waiters_to_mysql(waitee_buf_ptr,
+ start_mysql_thd,
+ victim_trx_id);
+ }
/* Search too deep, we rollback the joining transaction. */
if (ctx.too_deep) {
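The reporting path added above collects the blocking transactions found during the deadlock search into thd_wait_reports chunks of 64 entries, chains extra heap-allocated chunks on overflow, and hands each collected transaction to thd_report_wait_for() once the search finishes. A simplified standalone sketch of that collect-then-report pattern; wait_report_buf, collect_waitee() and report_and_free() are hypothetical names, and the real code additionally skips AUTO-INC locks and the chosen victim:

    #include <cstdlib>

    /* Simplified version of the chained, fixed-size collection buffer above;
    report_fn stands in for thd_report_wait_for(). */
    struct wait_report_buf {
        wait_report_buf* next;
        unsigned         used;
        void*            waitees[64];
    };

    /* Append one waitee, chaining a new heap chunk when the current one fills. */
    static bool collect_waitee(wait_report_buf*& tail, void* waitee)
    {
        if (tail->used == 64) {
            wait_report_buf* n = static_cast<wait_report_buf*>(
                malloc(sizeof(*n)));

            if (n == NULL) {
                return(false);  /* caller treats this as "too deep" */
            }
            n->next = NULL;
            n->used = 0;
            tail->next = n;
            tail = n;
        }
        tail->waitees[tail->used++] = waitee;
        return(true);
    }

    /* Walk the chain, report every collected waitee, free the overflow chunks. */
    static void report_and_free(wait_report_buf* head, void (*report_fn)(void*))
    {
        wait_report_buf* p = head;

        while (p != NULL) {
            for (unsigned i = 0; i < p->used; i++) {
                report_fn(p->waitees[i]);
            }

            wait_report_buf* q = p->next;

            if (p != head) {
                free(p);        /* the first chunk lives on the stack */
            }
            p = q;
        }
    }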
diff --git a/storage/innobase/os/os0file.cc b/storage/innobase/os/os0file.cc
index 992b1e79b58..1ec08da8a83 100644
--- a/storage/innobase/os/os0file.cc
+++ b/storage/innobase/os/os0file.cc
@@ -2679,7 +2679,7 @@ try_again:
}
ib_logf(IB_LOG_LEVEL_ERROR,
- "Tried to read "ULINTPF" bytes at offset " UINT64PF". "
+ "Tried to read " ULINTPF " bytes at offset " UINT64PF ". "
"Was only able to read %ld.", n, offset, (lint) ret);
#endif /* __WIN__ */
#ifdef __WIN__
@@ -2866,6 +2866,7 @@ os_file_write_func(
DWORD high;
ulint n_retries = 0;
ulint err;
+ DWORD saved_error = 0;
#ifndef UNIV_HOTBACKUP
ulint i;
#endif /* !UNIV_HOTBACKUP */
@@ -2955,8 +2956,10 @@ retry:
}
if (!os_has_said_disk_full) {
+ char *winmsg = NULL;
- err = (ulint) GetLastError();
+ saved_error = GetLastError();
+ err = (ulint) saved_error;
ut_print_timestamp(stderr);
@@ -2973,6 +2976,23 @@ retry:
name, offset,
(ulong) n, (ulong) len, (ulong) err);
+ /* Ask Windows to prepare a standard message for a
+ GetLastError() */
+
+ FormatMessage(FORMAT_MESSAGE_ALLOCATE_BUFFER |
+ FORMAT_MESSAGE_FROM_SYSTEM |
+ FORMAT_MESSAGE_IGNORE_INSERTS,
+ NULL, saved_error,
+ MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT),
+ (LPSTR)&winmsg, 0, NULL);
+
+ if (winmsg) {
+ fprintf(stderr,
+ "InnoDB: FormatMessage: Error number %lu means '%s'.\n",
+ (ulong) saved_error, winmsg);
+ LocalFree(winmsg);
+ }
+
if (strerror((int) err) != NULL) {
fprintf(stderr,
"InnoDB: Error number %lu means '%s'.\n",
@@ -3001,12 +3021,11 @@ retry:
}
if (!os_has_said_disk_full) {
-
ut_print_timestamp(stderr);
fprintf(stderr,
" InnoDB: Error: Write to file %s failed"
- " at offset "UINT64PF".\n"
+ " at offset " UINT64PF ".\n"
"InnoDB: %lu bytes should have been written,"
" only %ld were written.\n"
"InnoDB: Operating system error number %lu.\n"
@@ -4592,11 +4611,16 @@ os_aio_func(
wake_later = mode & OS_AIO_SIMULATED_WAKE_LATER;
mode = mode & (~OS_AIO_SIMULATED_WAKE_LATER);
+ DBUG_EXECUTE_IF("ib_os_aio_func_io_failure_28",
+ mode = OS_AIO_SYNC;);
+
if (mode == OS_AIO_SYNC
#ifdef WIN_ASYNC_IO
&& !srv_use_native_aio
#endif /* WIN_ASYNC_IO */
) {
+ ibool ret;
+
/* This is actually an ordinary synchronous read or write:
no need to use an i/o-handler thread. NOTE that if we use
Windows async i/o, Windows does not allow us to use
@@ -4611,13 +4635,23 @@ os_aio_func(
and os_file_write_func() */
if (type == OS_FILE_READ) {
- return(os_file_read_func(file, buf, offset, n));
+ ret = os_file_read_func(file, buf, offset, n);
+ } else {
+
+ ut_ad(!srv_read_only_mode);
+ ut_a(type == OS_FILE_WRITE);
+
+ ret = os_file_write_func(name, file, buf, offset, n);
}
- ut_ad(!srv_read_only_mode);
- ut_a(type == OS_FILE_WRITE);
+ DBUG_EXECUTE_IF("ib_os_aio_func_io_failure_28",
+ os_has_said_disk_full = FALSE;);
+ DBUG_EXECUTE_IF("ib_os_aio_func_io_failure_28",
+ ret = 0;);
+ DBUG_EXECUTE_IF("ib_os_aio_func_io_failure_28",
+ errno = 28;);
- return(os_file_write_func(name, file, buf, offset, n));
+ return ret;
}
try_again:
@@ -5442,7 +5476,13 @@ consecutive_loop:
aio_slot->offset, total_len);
}
- ut_a(ret);
+ DBUG_EXECUTE_IF("ib_os_aio_func_io_failure_28_2",
+ os_has_said_disk_full = FALSE;);
+ DBUG_EXECUTE_IF("ib_os_aio_func_io_failure_28_2",
+ ret = 0;);
+ DBUG_EXECUTE_IF("ib_os_aio_func_io_failure_28_2",
+ errno = 28;);
+
srv_set_io_thread_op_info(global_segment, "file i/o done");
if (aio_slot->type == OS_FILE_READ && n_consecutive > 1) {
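The DBUG_EXECUTE_IF hooks above let tests force a failed write with errno 28 (ENOSPC) so the disk-full handling can be exercised without actually filling a disk. A rough self-contained sketch of the same fault-injection idea, with a hypothetical fail_point() registry standing in for the mysys DBUG facility:

    #include <cerrno>
    #include <cstddef>
    #include <set>
    #include <string>

    /* Hypothetical registry of active failure keywords; the server keeps
    these in the debug_dbug string, here a plain std::set stands in. */
    static std::set<std::string> active_fail_points;

    static bool fail_point(const char* keyword)
    {
        return(active_fail_points.count(keyword) != 0);
    }

    /* Wrap a write so that a test can force the "disk full" branch. */
    static bool checked_write(int fd, const void* buf, size_t len)
    {
        (void) fd; (void) buf; (void) len;  /* a real version writes here */

        bool ok = true;

        if (fail_point("io_failure_28")) {
            /* Test hook: pretend the write failed for lack of space. */
            ok = false;
            errno = ENOSPC;
        }
        return(ok);
    }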
diff --git a/storage/innobase/page/page0zip.cc b/storage/innobase/page/page0zip.cc
index ab7a19795a3..4b19a35925e 100644
--- a/storage/innobase/page/page0zip.cc
+++ b/storage/innobase/page/page0zip.cc
@@ -1309,6 +1309,30 @@ page_zip_compress(
MONITOR_INC(MONITOR_PAGE_COMPRESS);
+ /* Simulate a compression failure with a probability determined by
+ innodb_simulate_comp_failures, only if the page has 2 or more
+ records. */
+
+ if (srv_simulate_comp_failures
+ && !dict_index_is_ibuf(index)
+ && page_get_n_recs(page) >= 2
+ && ((ulint)(rand() % 100) < srv_simulate_comp_failures)
+ && strcasecmp(index->table_name, "IBUF_DUMMY") != 0) {
+
+#ifdef UNIV_DEBUG
+ fprintf(stderr,
+ "InnoDB: Simulating a compression failure"
+ " for table %s, index %s, page %lu (%s)\n",
+ index->table_name,
+ index->name,
+ page_get_page_no(page),
+ page_is_leaf(page) ? "leaf" : "non-leaf");
+
+#endif
+
+ goto err_exit;
+ }
+
heap = mem_heap_create(page_zip_get_size(page_zip)
+ n_fields * (2 + sizeof(ulint))
+ REC_OFFS_HEADER_SIZE
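The block above makes page compression fail with a probability of innodb_simulate_comp_failures percent (the system variable registered earlier in this patch, range 0 to 99), while skipping the insert buffer and pages with fewer than two records. The probability test on its own looks roughly like this; simulate_failure() is a made-up helper for illustration:

    #include <cstdlib>

    /* Return true with roughly pct percent probability, pct in [0, 99],
    mirroring the rand() % 100 test above. */
    static bool simulate_failure(unsigned pct)
    {
        if (pct == 0) {
            return(false);  /* the feature is disabled */
        }
        return(static_cast<unsigned>(rand() % 100) < pct);
    }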
diff --git a/storage/innobase/row/row0ins.cc b/storage/innobase/row/row0ins.cc
index e6487730a77..c144ca890f8 100644
--- a/storage/innobase/row/row0ins.cc
+++ b/storage/innobase/row/row0ins.cc
@@ -151,35 +151,37 @@ row_ins_alloc_sys_fields(
ut_ad(row && table && heap);
ut_ad(dtuple_get_n_fields(row) == dict_table_get_n_cols(table));
- /* 1. Allocate buffer for row id */
+ /* Allocate a buffer to hold the needed system-created hidden columns. */
+ uint len = DATA_ROW_ID_LEN + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN;
+ ptr = static_cast<byte*>(mem_heap_zalloc(heap, len));
+ /* 1. Populate row-id */
col = dict_table_get_sys_col(table, DATA_ROW_ID);
dfield = dtuple_get_nth_field(row, dict_col_get_no(col));
- ptr = static_cast<byte*>(mem_heap_zalloc(heap, DATA_ROW_ID_LEN));
-
dfield_set_data(dfield, ptr, DATA_ROW_ID_LEN);
node->row_id_buf = ptr;
- /* 3. Allocate buffer for trx id */
+ ptr += DATA_ROW_ID_LEN;
+ /* 2. Populate trx id */
col = dict_table_get_sys_col(table, DATA_TRX_ID);
dfield = dtuple_get_nth_field(row, dict_col_get_no(col));
- ptr = static_cast<byte*>(mem_heap_zalloc(heap, DATA_TRX_ID_LEN));
dfield_set_data(dfield, ptr, DATA_TRX_ID_LEN);
node->trx_id_buf = ptr;
- /* 4. Allocate buffer for roll ptr */
+ ptr += DATA_TRX_ID_LEN;
+
+ /* 3. Populate roll ptr */
col = dict_table_get_sys_col(table, DATA_ROLL_PTR);
dfield = dtuple_get_nth_field(row, dict_col_get_no(col));
- ptr = static_cast<byte*>(mem_heap_zalloc(heap, DATA_ROLL_PTR_LEN));
dfield_set_data(dfield, ptr, DATA_ROLL_PTR_LEN);
}
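The rewrite above replaces three separate mem_heap_zalloc() calls with a single zero-filled buffer that is carved into the row id, transaction id and roll pointer fields. The carving pattern in isolation, with calloc() standing in for the heap allocator; a sketch only (error handling omitted), with length constants mirroring InnoDB's 6 + 6 + 7 byte system columns:

    #include <cstdint>
    #include <cstdlib>

    /* Field sizes mirror InnoDB's hidden system columns; treat the exact
    values as illustrative here. */
    static const size_t ROW_ID_LEN   = 6;
    static const size_t TRX_ID_LEN   = 6;
    static const size_t ROLL_PTR_LEN = 7;

    struct sys_field_bufs {
        uint8_t* row_id;
        uint8_t* trx_id;
        uint8_t* roll_ptr;
    };

    /* One zero-filled allocation, three adjacent sub-buffers. */
    static sys_field_bufs alloc_sys_fields()
    {
        uint8_t* ptr = static_cast<uint8_t*>(
            calloc(1, ROW_ID_LEN + TRX_ID_LEN + ROLL_PTR_LEN));

        sys_field_bufs bufs;
        bufs.row_id   = ptr;
        bufs.trx_id   = ptr + ROW_ID_LEN;
        bufs.roll_ptr = ptr + ROW_ID_LEN + TRX_ID_LEN;
        return(bufs);
    }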
diff --git a/storage/innobase/row/row0merge.cc b/storage/innobase/row/row0merge.cc
index 56cf9f1943c..86b47c9f3bd 100644
--- a/storage/innobase/row/row0merge.cc
+++ b/storage/innobase/row/row0merge.cc
@@ -786,7 +786,7 @@ row_merge_read(
if (UNIV_UNLIKELY(!success)) {
ut_print_timestamp(stderr);
fprintf(stderr,
- " InnoDB: failed to read merge block at "UINT64PF"\n",
+ " InnoDB: failed to read merge block at " UINT64PF "\n",
ofs);
}
diff --git a/storage/innobase/row/row0mysql.cc b/storage/innobase/row/row0mysql.cc
index 93d13ea49ee..dd7af8a3526 100644
--- a/storage/innobase/row/row0mysql.cc
+++ b/storage/innobase/row/row0mysql.cc
@@ -1359,7 +1359,7 @@ error_exit:
if (doc_id < next_doc_id) {
fprintf(stderr,
"InnoDB: FTS Doc ID must be large than"
- " "UINT64PF" for table",
+ " " UINT64PF " for table",
next_doc_id - 1);
ut_print_name(stderr, trx, TRUE, table->name);
putc('\n', stderr);
@@ -1374,9 +1374,9 @@ error_exit:
if (doc_id - next_doc_id >= FTS_DOC_ID_MAX_STEP) {
fprintf(stderr,
- "InnoDB: Doc ID "UINT64PF" is too"
+ "InnoDB: Doc ID " UINT64PF " is too"
" big. Its difference with largest"
- " used Doc ID "UINT64PF" cannot"
+ " used Doc ID " UINT64PF " cannot"
" exceed or equal to %d\n",
doc_id, next_doc_id - 1,
FTS_DOC_ID_MAX_STEP);
diff --git a/storage/innobase/row/row0sel.cc b/storage/innobase/row/row0sel.cc
index 359ae3f2c21..e5a7694cb93 100644
--- a/storage/innobase/row/row0sel.cc
+++ b/storage/innobase/row/row0sel.cc
@@ -877,16 +877,15 @@ row_sel_get_clust_rec(
if (!node->read_view) {
/* Try to place a lock on the index record */
-
- /* If innodb_locks_unsafe_for_binlog option is used
- or this session is using READ COMMITTED isolation level
- we lock only the record, i.e., next-key locking is
- not used. */
ulint lock_type;
trx_t* trx;
trx = thr_get_trx(thr);
+ /* If innodb_locks_unsafe_for_binlog option is used
+ or this session is using READ COMMITTED or lower isolation level
+ we lock only the record, i.e., next-key locking is
+ not used. */
if (srv_locks_unsafe_for_binlog
|| trx->isolation_level <= TRX_ISO_READ_COMMITTED) {
lock_type = LOCK_REC_NOT_GAP;
@@ -1502,12 +1501,6 @@ rec_loop:
search result set, resulting in the phantom problem. */
if (!consistent_read) {
-
- /* If innodb_locks_unsafe_for_binlog option is used
- or this session is using READ COMMITTED isolation
- level, we lock only the record, i.e., next-key
- locking is not used. */
-
rec_t* next_rec = page_rec_get_next(rec);
ulint lock_type;
trx_t* trx;
@@ -1517,6 +1510,10 @@ rec_loop:
offsets = rec_get_offsets(next_rec, index, offsets,
ULINT_UNDEFINED, &heap);
+ /* If innodb_locks_unsafe_for_binlog option is used
+ or this session is using READ COMMITTED or lower isolation
+ level, we lock only the record, i.e., next-key
+ locking is not used. */
if (srv_locks_unsafe_for_binlog
|| trx->isolation_level
<= TRX_ISO_READ_COMMITTED) {
@@ -1565,12 +1562,6 @@ skip_lock:
if (!consistent_read) {
/* Try to place a lock on the index record */
-
- /* If innodb_locks_unsafe_for_binlog option is used
- or this session is using READ COMMITTED isolation level,
- we lock only the record, i.e., next-key locking is
- not used. */
-
ulint lock_type;
trx_t* trx;
@@ -1579,6 +1570,10 @@ skip_lock:
trx = thr_get_trx(thr);
+ /* If innodb_locks_unsafe_for_binlog option is used
+ or this session is using READ COMMITTED or lower isolation level,
+ we lock only the record, i.e., next-key locking is
+ not used. */
if (srv_locks_unsafe_for_binlog
|| trx->isolation_level <= TRX_ISO_READ_COMMITTED) {
@@ -4227,7 +4222,7 @@ rec_loop:
/* Try to place a lock on the index record */
/* If innodb_locks_unsafe_for_binlog option is used
- or this session is using a READ COMMITTED isolation
+ or this session is using a READ COMMITTED or lower isolation
level we do not lock gaps. Supremum record is really
a gap and therefore we do not set locks there. */
@@ -4369,7 +4364,7 @@ wrong_offs:
/* Try to place a gap lock on the index
record only if innodb_locks_unsafe_for_binlog
option is not set or this session is not
- using a READ COMMITTED isolation level. */
+ using a READ COMMITTED or lower isolation level. */
err = sel_set_rec_lock(
btr_pcur_get_block(pcur),
@@ -4418,7 +4413,7 @@ wrong_offs:
/* Try to place a gap lock on the index
record only if innodb_locks_unsafe_for_binlog
option is not set or this session is not
- using a READ COMMITTED isolation level. */
+ using a READ COMMITTED or lower isolation level. */
err = sel_set_rec_lock(
btr_pcur_get_block(pcur),
diff --git a/storage/innobase/srv/srv0mon.cc b/storage/innobase/srv/srv0mon.cc
index ea346566e57..64417b1e5fb 100644
--- a/storage/innobase/srv/srv0mon.cc
+++ b/storage/innobase/srv/srv0mon.cc
@@ -41,8 +41,8 @@ Created 12/9/2009 Jimmy Yang
/* Macro to standardize the counter names for counters in the
"monitor_buf_page" module as they have very structured defines */
#define MONITOR_BUF_PAGE(name, description, code, op, op_code) \
- {"buffer_page_"op"_"name, "buffer_page_io", \
- "Number of "description" Pages "op, \
+ {"buffer_page_" op "_" name, "buffer_page_io", \
+ "Number of " description " Pages " op, \
MONITOR_GROUP_MODULE, MONITOR_DEFAULT_START, \
MONITOR_##code##_##op_code}
diff --git a/storage/innobase/srv/srv0srv.cc b/storage/innobase/srv/srv0srv.cc
index 6a410285f2b..6e03f715f28 100644
--- a/storage/innobase/srv/srv0srv.cc
+++ b/storage/innobase/srv/srv0srv.cc
@@ -3,6 +3,7 @@
Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2008, 2009 Google Inc.
Copyright (c) 2009, Percona Inc.
+Copyright (c) 2013, 2014, SkySQL Ab. All Rights Reserved.
Portions of this file contain modifications contributed and copyrighted by
Google, Inc. Those modifications are gratefully acknowledged and are described
@@ -473,6 +474,9 @@ current_time % 5 != 0. */
#endif /* MEM_PERIODIC_CHECK */
# define SRV_MASTER_DICT_LRU_INTERVAL (47)
+/** Simulate compression failures. */
+UNIV_INTERN uint srv_simulate_comp_failures = 0;
+
/** Acquire the system_mutex. */
#define srv_sys_mutex_enter() do { \
mutex_enter(&srv_sys->mutex); \
diff --git a/storage/innobase/srv/srv0start.cc b/storage/innobase/srv/srv0start.cc
index 0c04fba421a..1c2bfcbd920 100644
--- a/storage/innobase/srv/srv0start.cc
+++ b/storage/innobase/srv/srv0start.cc
@@ -2197,9 +2197,9 @@ innobase_start_or_create_for_mysql(void)
} else if (size != srv_log_file_size) {
ib_logf(IB_LOG_LEVEL_ERROR,
"Log file %s is"
- " of different size "UINT64PF" bytes"
+ " of different size " UINT64PF " bytes"
" than other log"
- " files "UINT64PF" bytes!",
+ " files " UINT64PF " bytes!",
logfilename,
size << UNIV_PAGE_SIZE_SHIFT,
(os_offset_t) srv_log_file_size
diff --git a/storage/innobase/sync/sync0arr.cc b/storage/innobase/sync/sync0arr.cc
index 2cfb693f8ba..986010039f9 100644
--- a/storage/innobase/sync/sync0arr.cc
+++ b/storage/innobase/sync/sync0arr.cc
@@ -182,6 +182,33 @@ sync_array_get_nth_cell(
}
/******************************************************************//**
+Looks for a cell with the given thread id.
+@return pointer to cell or NULL if not found */
+static
+sync_cell_t*
+sync_array_find_thread(
+/*===================*/
+ sync_array_t* arr, /*!< in: wait array */
+ os_thread_id_t thread) /*!< in: thread id */
+{
+ ulint i;
+ sync_cell_t* cell;
+
+ for (i = 0; i < arr->n_cells; i++) {
+
+ cell = sync_array_get_nth_cell(arr, i);
+
+ if (cell->wait_object != NULL
+ && os_thread_eq(cell->thread, thread)) {
+
+ return(cell); /* Found */
+ }
+ }
+
+ return(NULL); /* Not found */
+}
+
+/******************************************************************//**
Reserves the mutex semaphore protecting a sync array. */
static
void
@@ -432,8 +459,10 @@ static
void
sync_array_cell_print(
/*==================*/
- FILE* file, /*!< in: file where to print */
- sync_cell_t* cell) /*!< in: sync cell */
+ FILE* file, /*!< in: file where to print */
+ sync_cell_t* cell, /*!< in: sync cell */
+ os_thread_id_t* reserver) /*!< out: reserving thread
+ id, or 0 if none */
{
ib_mutex_t* mutex;
rw_lock_t* rwlock;
@@ -454,19 +483,21 @@ sync_array_cell_print(
been freed meanwhile */
mutex = cell->old_wait_mutex;
- fprintf(file,
- "Mutex at %p created file %s line %lu, lock var %lu\n"
+ if (mutex) {
+ fprintf(file,
+ "Mutex at %p created file %s line %lu, lock var %lu\n"
#ifdef UNIV_SYNC_DEBUG
- "Last time reserved in file %s line %lu, "
+ "Last time reserved in file %s line %lu, "
#endif /* UNIV_SYNC_DEBUG */
- "waiters flag %lu\n",
- (void*) mutex, innobase_basename(mutex->cfile_name),
- (ulong) mutex->cline,
- (ulong) mutex->lock_word,
+ "waiters flag %lu\n",
+ (void*) mutex, innobase_basename(mutex->cfile_name),
+ (ulong) mutex->cline,
+ (ulong) mutex->lock_word,
#ifdef UNIV_SYNC_DEBUG
- mutex->file_name, (ulong) mutex->line,
+ mutex->file_name, (ulong) mutex->line,
#endif /* UNIV_SYNC_DEBUG */
- (ulong) mutex->waiters);
+ (ulong) mutex->waiters);
+ }
} else if (type == RW_LOCK_EX
|| type == RW_LOCK_WAIT_EX
@@ -478,33 +509,36 @@ sync_array_cell_print(
rwlock = cell->old_wait_rw_lock;
- fprintf(file,
- " RW-latch at %p created in file %s line %lu\n",
- (void*) rwlock, innobase_basename(rwlock->cfile_name),
- (ulong) rwlock->cline);
- writer = rw_lock_get_writer(rwlock);
- if (writer != RW_LOCK_NOT_LOCKED) {
+ if (rwlock) {
fprintf(file,
- "a writer (thread id %lu) has"
- " reserved it in mode %s",
- (ulong) os_thread_pf(rwlock->writer_thread),
- writer == RW_LOCK_EX
- ? " exclusive\n"
- : " wait exclusive\n");
- }
+ " RW-latch at %p created in file %s line %lu\n",
+ (void*) rwlock, innobase_basename(rwlock->cfile_name),
+ (ulong) rwlock->cline);
+ writer = rw_lock_get_writer(rwlock);
+ if (writer != RW_LOCK_NOT_LOCKED) {
+ fprintf(file,
+ "a writer (thread id %lu) has"
+ " reserved it in mode %s",
+ (ulong) os_thread_pf(rwlock->writer_thread),
+ writer == RW_LOCK_EX
+ ? " exclusive\n"
+ : " wait exclusive\n");
+ *reserver = rwlock->writer_thread;
+ }
- fprintf(file,
- "number of readers %lu, waiters flag %lu, "
- "lock_word: %lx\n"
- "Last time read locked in file %s line %lu\n"
- "Last time write locked in file %s line %lu\n",
- (ulong) rw_lock_get_reader_count(rwlock),
- (ulong) rwlock->waiters,
- rwlock->lock_word,
- innobase_basename(rwlock->last_s_file_name),
- (ulong) rwlock->last_s_line,
- rwlock->last_x_file_name,
- (ulong) rwlock->last_x_line);
+ fprintf(file,
+ "number of readers %lu, waiters flag %lu, "
+ "lock_word: %lx\n"
+ "Last time read locked in file %s line %lu\n"
+ "Last time write locked in file %s line %lu\n",
+ (ulong) rw_lock_get_reader_count(rwlock),
+ (ulong) rwlock->waiters,
+ rwlock->lock_word,
+ innobase_basename(rwlock->last_s_file_name),
+ (ulong) rwlock->last_s_line,
+ rwlock->last_x_file_name,
+ (ulong) rwlock->last_x_line);
+ }
} else {
ut_error;
}
@@ -515,32 +549,6 @@ sync_array_cell_print(
}
#ifdef UNIV_SYNC_DEBUG
-/******************************************************************//**
-Looks for a cell with the given thread id.
-@return pointer to cell or NULL if not found */
-static
-sync_cell_t*
-sync_array_find_thread(
-/*===================*/
- sync_array_t* arr, /*!< in: wait array */
- os_thread_id_t thread) /*!< in: thread id */
-{
- ulint i;
- sync_cell_t* cell;
-
- for (i = 0; i < arr->n_cells; i++) {
-
- cell = sync_array_get_nth_cell(arr, i);
-
- if (cell->wait_object != NULL
- && os_thread_eq(cell->thread, thread)) {
-
- return(cell); /* Found */
- }
- }
-
- return(NULL); /* Not found */
-}
/******************************************************************//**
Recursion step for deadlock detection.
@@ -602,6 +610,7 @@ sync_array_detect_deadlock(
os_thread_id_t thread;
ibool ret;
rw_lock_debug_t*debug;
+ os_thread_id_t reserver=0;
ut_a(arr);
ut_a(start);
@@ -637,10 +646,10 @@ sync_array_detect_deadlock(
depth);
if (ret) {
fprintf(stderr,
- "Mutex %p owned by thread %lu file %s line %lu\n",
+ "Mutex %p owned by thread %lu file %s line %lu\n",
mutex, (ulong) os_thread_pf(mutex->thread_id),
mutex->file_name, (ulong) mutex->line);
- sync_array_cell_print(stderr, cell);
+ sync_array_cell_print(stderr, cell, &reserver);
return(TRUE);
}
@@ -678,7 +687,7 @@ sync_array_detect_deadlock(
print:
fprintf(stderr, "rw-lock %p ",
(void*) lock);
- sync_array_cell_print(stderr, cell);
+ sync_array_cell_print(stderr, cell, &reserver);
rw_lock_debug_print(stderr, debug);
return(TRUE);
}
@@ -921,6 +930,7 @@ sync_array_print_long_waits_low(
double diff;
sync_cell_t* cell;
void* wait_object;
+ os_thread_id_t reserver=0;
cell = sync_array_get_nth_cell(arr, i);
@@ -936,7 +946,7 @@ sync_array_print_long_waits_low(
if (diff > SYNC_ARRAY_TIMEOUT) {
fputs("InnoDB: Warning: a long semaphore wait:\n",
stderr);
- sync_array_cell_print(stderr, cell);
+ sync_array_cell_print(stderr, cell, &reserver);
*noticed = TRUE;
}
@@ -951,6 +961,60 @@ sync_array_print_long_waits_low(
}
}
+ /* We found a long semaphore wait, print all threads that are
+ waiting for a semaphore. */
+ if (*noticed) {
+ for (i = 0; i < arr->n_cells; i++) {
+ void* wait_object;
+ os_thread_id_t reserver=(os_thread_id_t)ULINT_UNDEFINED;
+ sync_cell_t* cell;
+ ulint loop = 0;
+
+ cell = sync_array_get_nth_cell(arr, i);
+
+ wait_object = cell->wait_object;
+
+ if (wait_object == NULL || !cell->waiting) {
+
+ continue;
+ }
+
+ fputs("InnoDB: Warning: semaphore wait:\n",
+ stderr);
+ sync_array_cell_print(stderr, cell, &reserver);
+
+ /* Try to output cell information for the writer, recursively */
+ while (reserver != (os_thread_id_t)ULINT_UNDEFINED) {
+ sync_cell_t* reserver_wait;
+
+ reserver_wait = sync_array_find_thread(arr, reserver);
+
+ if (reserver_wait &&
+ reserver_wait->wait_object != NULL &&
+ reserver_wait->waiting) {
+ fputs("InnoDB: Warning: Writer thread is waiting this semaphore:\n",
+ stderr);
+ reserver = (os_thread_id_t)ULINT_UNDEFINED;
+ sync_array_cell_print(stderr, reserver_wait, &reserver);
+ loop++;
+
+ if (reserver_wait->thread == reserver) {
+ reserver = (os_thread_id_t)ULINT_UNDEFINED;
+ }
+ } else {
+ reserver = (os_thread_id_t)ULINT_UNDEFINED;
+ }
+
+ /* This is protection against an infinite loop */
+ if (loop > 100) {
+ fputs("InnoDB: Warning: Too many waiting threads.\n", stderr);
+ break;
+ }
+
+ }
+ }
+ }
+
#undef SYNC_ARRAY_TIMEOUT
return(fatal);
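The loop added above, for every blocked thread, finds the cell of the thread that currently holds the rw-lock being waited on and, if that holder is itself waiting, prints its cell too, following the chain with a hard cap of 100 steps. A stripped-down sketch of that chain walk; the wait_cell struct and print_wait_chain() are hypothetical simplifications of sync_cell_t and the code above:

    #include <cstdio>

    /* Hypothetical, simplified wait cell: each blocked thread records which
    cell, if any, belongs to the thread holding the lock it waits on. */
    struct wait_cell {
        const char*      name;         /* thread name, for printing */
        const wait_cell* holder_cell;  /* NULL if the holder is not
                                       waiting on anything */
    };

    /* Follow the blocked -> holder -> holder's holder ... chain, with the
    same hard cap used above to avoid cycling forever. */
    static void print_wait_chain(const wait_cell* cell)
    {
        int depth = 0;

        while (cell != NULL) {
            printf("thread %s is waiting\n", cell->name);

            if (++depth > 100) {
                printf("too many waiting threads, stopping\n");
                break;
            }
            cell = cell->holder_cell;
        }
    }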
@@ -1030,6 +1094,7 @@ sync_array_print_info_low(
{
ulint i;
ulint count = 0;
+ os_thread_id_t r = 0;
fprintf(file,
"OS WAIT ARRAY INFO: reservation count %ld\n",
@@ -1042,7 +1107,7 @@ sync_array_print_info_low(
if (cell->wait_object != NULL) {
count++;
- sync_array_cell_print(file, cell);
+ sync_array_cell_print(file, cell, &r);
}
}
}
diff --git a/storage/innobase/sync/sync0rw.cc b/storage/innobase/sync/sync0rw.cc
index ebf73917702..e129d39fc9d 100644
--- a/storage/innobase/sync/sync0rw.cc
+++ b/storage/innobase/sync/sync0rw.cc
@@ -151,18 +151,12 @@ UNIV_INTERN mysql_pfs_key_t rw_lock_mutex_key;
To modify the debug info list of an rw-lock, this mutex has to be
acquired in addition to the mutex protecting the lock. */
-UNIV_INTERN ib_mutex_t rw_lock_debug_mutex;
+UNIV_INTERN os_fast_mutex_t rw_lock_debug_mutex;
# ifdef UNIV_PFS_MUTEX
UNIV_INTERN mysql_pfs_key_t rw_lock_debug_mutex_key;
# endif
-/* If deadlock detection does not get immediately the mutex,
-it may wait for this event */
-UNIV_INTERN os_event_t rw_lock_debug_event;
-/* This is set to TRUE, if there may be waiters for the event */
-UNIV_INTERN ibool rw_lock_debug_waiters;
-
/******************************************************************//**
Creates a debug info struct. */
static
@@ -690,22 +684,7 @@ void
rw_lock_debug_mutex_enter(void)
/*===========================*/
{
-loop:
- if (0 == mutex_enter_nowait(&rw_lock_debug_mutex)) {
- return;
- }
-
- os_event_reset(rw_lock_debug_event);
-
- rw_lock_debug_waiters = TRUE;
-
- if (0 == mutex_enter_nowait(&rw_lock_debug_mutex)) {
- return;
- }
-
- os_event_wait(rw_lock_debug_event);
-
- goto loop;
+ os_fast_mutex_lock(&rw_lock_debug_mutex);
}
/******************************************************************//**
@@ -715,12 +694,7 @@ void
rw_lock_debug_mutex_exit(void)
/*==========================*/
{
- mutex_exit(&rw_lock_debug_mutex);
-
- if (rw_lock_debug_waiters) {
- rw_lock_debug_waiters = FALSE;
- os_event_set(rw_lock_debug_event);
- }
+ os_fast_mutex_unlock(&rw_lock_debug_mutex);
}
/******************************************************************//**
diff --git a/storage/innobase/sync/sync0sync.cc b/storage/innobase/sync/sync0sync.cc
index 5ef8a02fb3f..54018471abc 100644
--- a/storage/innobase/sync/sync0sync.cc
+++ b/storage/innobase/sync/sync0sync.cc
@@ -1472,11 +1472,7 @@ sync_init(void)
SYNC_NO_ORDER_CHECK);
#ifdef UNIV_SYNC_DEBUG
- mutex_create(rw_lock_debug_mutex_key, &rw_lock_debug_mutex,
- SYNC_NO_ORDER_CHECK);
-
- rw_lock_debug_event = os_event_create();
- rw_lock_debug_waiters = FALSE;
+ os_fast_mutex_init(rw_lock_debug_mutex_key, &rw_lock_debug_mutex);
#endif /* UNIV_SYNC_DEBUG */
}
@@ -1544,6 +1540,7 @@ sync_close(void)
sync_order_checks_on = FALSE;
sync_thread_level_arrays_free();
+ os_fast_mutex_free(&rw_lock_debug_mutex);
#endif /* UNIV_SYNC_DEBUG */
sync_initialized = FALSE;
@@ -1558,12 +1555,12 @@ sync_print_wait_info(
FILE* file) /*!< in: file where to print */
{
fprintf(file,
- "Mutex spin waits "UINT64PF", rounds "UINT64PF", "
- "OS waits "UINT64PF"\n"
- "RW-shared spins "UINT64PF", rounds "UINT64PF", "
- "OS waits "UINT64PF"\n"
- "RW-excl spins "UINT64PF", rounds "UINT64PF", "
- "OS waits "UINT64PF"\n",
+ "Mutex spin waits " UINT64PF ", rounds " UINT64PF ", "
+ "OS waits " UINT64PF "\n"
+ "RW-shared spins " UINT64PF ", rounds " UINT64PF ", "
+ "OS waits " UINT64PF "\n"
+ "RW-excl spins " UINT64PF ", rounds " UINT64PF ", "
+ "OS waits " UINT64PF "\n",
(ib_uint64_t) mutex_spin_wait_count,
(ib_uint64_t) mutex_spin_round_count,
(ib_uint64_t) mutex_os_wait_count,
diff --git a/storage/innobase/trx/trx0i_s.cc b/storage/innobase/trx/trx0i_s.cc
index f6360562ae7..01ccfb8a6d0 100644
--- a/storage/innobase/trx/trx0i_s.cc
+++ b/storage/innobase/trx/trx0i_s.cc
@@ -1639,7 +1639,7 @@ trx_i_s_create_lock_id(
} else {
/* table lock */
res_len = ut_snprintf(lock_id, lock_id_size,
- TRX_ID_FMT":"UINT64PF,
+ TRX_ID_FMT":" UINT64PF,
row->lock_trx_id,
row->lock_table_id);
}
diff --git a/storage/innobase/trx/trx0trx.cc b/storage/innobase/trx/trx0trx.cc
index 38b21d8d428..272f8377f68 100644
--- a/storage/innobase/trx/trx0trx.cc
+++ b/storage/innobase/trx/trx0trx.cc
@@ -50,6 +50,9 @@ Created 3/26/1996 Heikki Tuuri
#include<set>
+extern "C"
+int thd_deadlock_victim_preference(const MYSQL_THD thd1, const MYSQL_THD thd2);
+
/** Set of table_id */
typedef std::set<table_id_t> table_id_set;
@@ -1833,7 +1836,7 @@ state_ok:
if (trx->undo_no != 0) {
newline = TRUE;
- fprintf(f, ", undo log entries "TRX_ID_FMT, trx->undo_no);
+ fprintf(f, ", undo log entries " TRX_ID_FMT, trx->undo_no);
}
if (newline) {
@@ -1936,9 +1939,8 @@ trx_assert_started(
#endif /* UNIV_DEBUG */
/*******************************************************************//**
-Compares the "weight" (or size) of two transactions. Transactions that
-have edited non-transactional tables are considered heavier than ones
-that have not.
+Compares the "weight" (or size) of two transactions. The heavier the weight,
+the more reluctant we will be to choose the transaction as a deadlock victim.
@return TRUE if weight(a) >= weight(b) */
UNIV_INTERN
ibool
@@ -1947,26 +1949,19 @@ trx_weight_ge(
const trx_t* a, /*!< in: the first transaction to be compared */
const trx_t* b) /*!< in: the second transaction to be compared */
{
- ibool a_notrans_edit;
- ibool b_notrans_edit;
-
- /* If mysql_thd is NULL for a transaction we assume that it has
- not edited non-transactional tables. */
-
- a_notrans_edit = a->mysql_thd != NULL
- && thd_has_edited_nontrans_tables(a->mysql_thd);
-
- b_notrans_edit = b->mysql_thd != NULL
- && thd_has_edited_nontrans_tables(b->mysql_thd);
-
- if (a_notrans_edit != b_notrans_edit) {
+ int pref;
- return(a_notrans_edit);
+ /* First ask the upper server layer if it has any preference for which
+ to prefer as a deadlock victim. */
+ pref= thd_deadlock_victim_preference(a->mysql_thd, b->mysql_thd);
+ if (pref < 0) {
+ return FALSE;
+ } else if (pref > 0) {
+ return TRUE;
}
- /* Either both had edited non-transactional tables or both had
- not, we fall back to comparing the number of altered/locked
- rows. */
+ /* Upper server layer had no preference, we fall back to comparing the
+ number of altered/locked rows. */
#if 0
fprintf(stderr,
@@ -2133,7 +2128,7 @@ trx_recover_for_mysql(
ut_print_timestamp(stderr);
fprintf(stderr,
" InnoDB: Transaction contains changes"
- " to "TRX_ID_FMT" rows\n",
+ " to " TRX_ID_FMT " rows\n",
trx->undo_no);
count++;