summaryrefslogtreecommitdiff
path: root/storage
diff options
context:
space:
mode:
authorMichael Widenius <monty@askmonty.org>2011-05-02 20:58:45 +0300
committerMichael Widenius <monty@askmonty.org>2011-05-02 20:58:45 +0300
commite415ba0fb2c57eaf370e84a3a9c8d831f820a560 (patch)
treef113a8024de4ee4f1bc19aae98c19a2835f5b4e7 /storage
parent046418ad956c98c3788d79650fcb50479844df3b (diff)
parenta1f7ceb281f9d87c9baea125ebab26f99a0370f8 (diff)
downloadmariadb-git-e415ba0fb2c57eaf370e84a3a9c8d831f820a560.tar.gz
Merge with MySQL 5.1.57/58
Moved some BSD string functions from Unireg
Diffstat (limited to 'storage')
-rw-r--r--storage/heap/ha_heap.cc4
-rw-r--r--storage/heap/ha_heap.h2
-rw-r--r--storage/innobase/btr/btr0cur.c156
-rw-r--r--storage/innobase/dict/dict0dict.c10
-rw-r--r--storage/innobase/handler/ha_innodb.cc95
-rw-r--r--storage/innobase/include/btr0cur.h5
-rw-r--r--storage/innobase/include/dict0mem.h6
-rw-r--r--storage/innobase/include/dict0types.h5
-rw-r--r--storage/innobase/include/rem0cmp.h4
-rw-r--r--storage/innobase/include/rem0cmp.ic2
-rw-r--r--storage/innobase/include/srv0srv.h18
-rw-r--r--storage/innobase/include/sync0arr.h11
-rw-r--r--storage/innobase/include/sync0rw.h3
-rw-r--r--storage/innobase/include/trx0rseg.h4
-rw-r--r--storage/innobase/include/trx0trx.h5
-rw-r--r--storage/innobase/rem/rem0cmp.c14
-rw-r--r--storage/innobase/row/row0vers.c10
-rw-r--r--storage/innobase/srv/srv0srv.c24
-rw-r--r--storage/innobase/sync/sync0arr.c40
-rw-r--r--storage/innobase/sync/sync0rw.c19
-rw-r--r--storage/innobase/trx/trx0trx.c26
-rw-r--r--storage/innodb_plugin/ChangeLog86
-rw-r--r--storage/innodb_plugin/btr/btr0btr.c573
-rw-r--r--storage/innodb_plugin/btr/btr0cur.c441
-rw-r--r--storage/innodb_plugin/btr/btr0sea.c6
-rw-r--r--storage/innodb_plugin/buf/buf0buddy.c2
-rw-r--r--storage/innodb_plugin/buf/buf0buf.c85
-rw-r--r--storage/innodb_plugin/buf/buf0lru.c131
-rw-r--r--storage/innodb_plugin/dict/dict0dict.c10
-rw-r--r--storage/innodb_plugin/dict/dict0mem.c9
-rw-r--r--storage/innodb_plugin/fsp/fsp0fsp.c8
-rw-r--r--storage/innodb_plugin/handler/ha_innodb.cc95
-rw-r--r--storage/innodb_plugin/handler/handler0alter.cc12
-rw-r--r--storage/innodb_plugin/ibuf/ibuf0ibuf.c8
-rw-r--r--storage/innodb_plugin/include/btr0btr.h85
-rw-r--r--storage/innodb_plugin/include/btr0cur.h47
-rw-r--r--storage/innodb_plugin/include/btr0types.h125
-rw-r--r--storage/innodb_plugin/include/buf0buf.h18
-rw-r--r--storage/innodb_plugin/include/buf0buf.ic8
-rw-r--r--storage/innodb_plugin/include/buf0lru.h14
-rw-r--r--storage/innodb_plugin/include/dict0mem.h13
-rw-r--r--storage/innodb_plugin/include/dict0types.h5
-rw-r--r--storage/innodb_plugin/include/page0zip.h2
-rw-r--r--storage/innodb_plugin/include/rem0cmp.h4
-rw-r--r--storage/innodb_plugin/include/rem0cmp.ic2
-rw-r--r--storage/innodb_plugin/include/row0upd.h26
-rw-r--r--storage/innodb_plugin/include/srv0srv.h18
-rw-r--r--storage/innodb_plugin/include/sync0arr.h7
-rw-r--r--storage/innodb_plugin/include/sync0rw.h3
-rw-r--r--storage/innodb_plugin/include/trx0rseg.h4
-rw-r--r--storage/innodb_plugin/include/trx0trx.h4
-rw-r--r--storage/innodb_plugin/include/univ.i13
-rw-r--r--storage/innodb_plugin/mem/mem0mem.c2
-rw-r--r--storage/innodb_plugin/mtr/mtr0log.c2
-rw-r--r--storage/innodb_plugin/page/page0cur.c10
-rw-r--r--storage/innodb_plugin/page/page0page.c11
-rw-r--r--storage/innodb_plugin/page/page0zip.c24
-rw-r--r--storage/innodb_plugin/rem/rem0cmp.c14
-rw-r--r--storage/innodb_plugin/row/row0ins.c2
-rw-r--r--storage/innodb_plugin/row/row0purge.c68
-rw-r--r--storage/innodb_plugin/row/row0umod.c35
-rw-r--r--storage/innodb_plugin/row/row0upd.c82
-rw-r--r--storage/innodb_plugin/row/row0vers.c10
-rw-r--r--storage/innodb_plugin/srv/srv0srv.c16
-rw-r--r--storage/innodb_plugin/srv/srv0start.c6
-rw-r--r--storage/innodb_plugin/sync/sync0arr.c36
-rw-r--r--storage/innodb_plugin/sync/sync0rw.c22
-rw-r--r--storage/innodb_plugin/trx/trx0i_s.c2
-rw-r--r--storage/innodb_plugin/trx/trx0roll.c4
-rw-r--r--storage/innodb_plugin/trx/trx0trx.c25
-rw-r--r--storage/maria/ha_maria.cc5
-rw-r--r--storage/maria/ha_maria.h2
-rw-r--r--storage/maria/ma_init.c1
-rw-r--r--storage/myisam/ft_stopwords.c2
-rw-r--r--storage/myisam/ha_myisam.cc9
-rw-r--r--storage/myisam/ha_myisam.h3
-rw-r--r--storage/myisam/mi_create.c2
-rw-r--r--storage/myisam/mi_test1.c1
-rw-r--r--storage/myisam/mi_write.c1
-rw-r--r--storage/myisam/myisamdef.h4
-rw-r--r--storage/myisam/sp_test.c1
-rw-r--r--storage/myisammrg/ha_myisammrg.cc9
-rw-r--r--storage/myisammrg/ha_myisammrg.h2
83 files changed, 2168 insertions, 572 deletions
diff --git a/storage/heap/ha_heap.cc b/storage/heap/ha_heap.cc
index fb7c13e4e41..9f29dee2030 100644
--- a/storage/heap/ha_heap.cc
+++ b/storage/heap/ha_heap.cc
@@ -142,11 +142,11 @@ int ha_heap::close(void)
DESCRIPTION
Do same as default implementation but use file->s->name instead of
table->s->path. This is needed by Windows where the clone() call sees
- '/'-delimited path in table->s->path, while ha_peap::open() was called
+ '/'-delimited path in table->s->path, while ha_heap::open() was called
with '\'-delimited path.
*/
-handler *ha_heap::clone(MEM_ROOT *mem_root)
+handler *ha_heap::clone(const char *name, MEM_ROOT *mem_root)
{
handler *new_handler= get_new_handler(table->s, mem_root, table->s->db_type());
if (new_handler && !new_handler->ha_open(table, file->s->name, table->db_stat,
diff --git a/storage/heap/ha_heap.h b/storage/heap/ha_heap.h
index 22722129f4c..69751101645 100644
--- a/storage/heap/ha_heap.h
+++ b/storage/heap/ha_heap.h
@@ -34,7 +34,7 @@ class ha_heap: public handler
public:
ha_heap(handlerton *hton, TABLE_SHARE *table);
~ha_heap() {}
- handler *clone(MEM_ROOT *mem_root);
+ handler *clone(const char *name, MEM_ROOT *mem_root);
const char *table_type() const
{
return (table->in_use->variables.sql_mode & MODE_MYSQL323) ?
diff --git a/storage/innobase/btr/btr0cur.c b/storage/innobase/btr/btr0cur.c
index a7160d74a32..6c0497cbd41 100644
--- a/storage/innobase/btr/btr0cur.c
+++ b/storage/innobase/btr/btr0cur.c
@@ -66,6 +66,13 @@ this many index pages */
/*--------------------------------------*/
#define BTR_BLOB_HDR_SIZE 8
+/* Estimated table level stats from sampled value. */
+#define BTR_TABLE_STATS_FROM_SAMPLE(value, index, ext_size, not_empty) \
+ ((value * (ib_longlong) index->stat_n_leaf_pages \
+ + BTR_KEY_VAL_ESTIMATE_N_PAGES - 1 + ext_size \
+ + not_empty) \
+ / (BTR_KEY_VAL_ESTIMATE_N_PAGES + ext_size))
+
/***********************************************************************
Marks all extern fields in a record as owned by the record. This function
should be called if the delete mark of a record is removed: a not delete
@@ -2835,9 +2842,54 @@ btr_estimate_n_rows_in_range(
}
/***********************************************************************
+Record the number of non_null key values in a given index for
+each n-column prefix of the index where n < dict_index_get_n_unique(index).
+The estimates are eventually stored in the array:
+index->stat_n_non_null_key_vals. */
+static
+void
+btr_record_not_null_field_in_rec(
+/*=============================*/
+ rec_t* rec, /* in: physical record */
+ ulint n_unique, /* in: dict_index_get_n_unique(index),
+ number of columns uniquely determine
+ an index entry */
+ const ulint* offsets, /* in: rec_get_offsets(rec, index),
+ its size could be for all fields or
+ that of "n_unique" */
+ ib_longlong* n_not_null) /* in/out: array to record number of
+ not null rows for n-column prefix */
+{
+ ulint i;
+
+ ut_ad(rec_offs_n_fields(offsets) >= n_unique);
+
+ if (n_not_null == NULL) {
+ return;
+ }
+
+ for (i = 0; i < n_unique; i++) {
+ ulint rec_len;
+ byte* field;
+
+ field = rec_get_nth_field(rec, offsets, i, &rec_len);
+
+ if (rec_len != UNIV_SQL_NULL) {
+ n_not_null[i]++;
+ } else {
+ /* Break if we hit the first NULL value */
+ break;
+ }
+ }
+}
+
+/***********************************************************************
Estimates the number of different key values in a given index, for
each n-column prefix of the index where n <= dict_index_get_n_unique(index).
-The estimates are stored in the array index->stat_n_diff_key_vals. */
+The estimates are stored in the array index->stat_n_diff_key_vals.
+If innodb_stats_method is "nulls_ignored", we also record the number of
+non-null values for each prefix and store the estimates in
+array index->stat_n_non_null_key_vals. */
void
btr_estimate_number_of_different_key_vals(
@@ -2851,6 +2903,8 @@ btr_estimate_number_of_different_key_vals(
ulint matched_fields;
ulint matched_bytes;
ib_longlong* n_diff;
+ ib_longlong* n_not_null;
+ ibool stats_null_not_equal;
ulint not_empty_flag = 0;
ulint total_external_size = 0;
ulint i;
@@ -2858,24 +2912,47 @@ btr_estimate_number_of_different_key_vals(
ulint add_on;
mtr_t mtr;
mem_heap_t* heap = NULL;
- ulint offsets_rec_[REC_OFFS_NORMAL_SIZE];
- ulint offsets_next_rec_[REC_OFFS_NORMAL_SIZE];
- ulint* offsets_rec = offsets_rec_;
- ulint* offsets_next_rec= offsets_next_rec_;
- *offsets_rec_ = (sizeof offsets_rec_) / sizeof *offsets_rec_;
- *offsets_next_rec_
- = (sizeof offsets_next_rec_) / sizeof *offsets_next_rec_;
+ ulint* offsets_rec = NULL;
+ ulint* offsets_next_rec = NULL;
n_cols = dict_index_get_n_unique(index);
- n_diff = mem_alloc((n_cols + 1) * sizeof(ib_longlong));
+ heap = mem_heap_create((sizeof *n_diff + sizeof *n_not_null)
+ * (n_cols + 1)
+ + dict_index_get_n_fields(index)
+ * (sizeof *offsets_rec
+ + sizeof *offsets_next_rec));
+
+ n_diff = mem_heap_zalloc(heap, (n_cols + 1) * sizeof(ib_longlong));
+
+ n_not_null = NULL;
+
+ /* Check srv_innodb_stats_method setting, and decide whether we
+ need to record non-null value and also decide if NULL is
+ considered equal (by setting stats_null_not_equal value) */
+ switch (srv_innodb_stats_method) {
+ case SRV_STATS_NULLS_IGNORED:
+ n_not_null = mem_heap_zalloc(heap, (n_cols + 1)
+ * sizeof *n_not_null);
+ /* fall through */
+
+ case SRV_STATS_NULLS_UNEQUAL:
+ /* for both SRV_STATS_NULLS_IGNORED and SRV_STATS_NULLS_UNEQUAL
+ case, we will treat NULLs as unequal value */
+ stats_null_not_equal = TRUE;
+ break;
- memset(n_diff, 0, (n_cols + 1) * sizeof(ib_longlong));
+ case SRV_STATS_NULLS_EQUAL:
+ stats_null_not_equal = FALSE;
+ break;
+
+ default:
+ ut_error;
+ }
/* We sample some pages in the index to get an estimate */
for (i = 0; i < BTR_KEY_VAL_ESTIMATE_N_PAGES; i++) {
- rec_t* supremum;
mtr_start(&mtr);
btr_cur_open_at_rnd_pos(index, BTR_SEARCH_LEAF, &cursor, &mtr);
@@ -2888,18 +2965,25 @@ btr_estimate_number_of_different_key_vals(
page = btr_cur_get_page(&cursor);
- supremum = page_get_supremum_rec(page);
rec = page_rec_get_next(page_get_infimum_rec(page));
- if (rec != supremum) {
+ if (!page_rec_is_supremum(rec)) {
not_empty_flag = 1;
offsets_rec = rec_get_offsets(rec, index, offsets_rec,
ULINT_UNDEFINED, &heap);
+
+ if (n_not_null) {
+ btr_record_not_null_field_in_rec(
+ rec, n_cols, offsets_rec, n_not_null);
+ }
}
- while (rec != supremum) {
+ while (!page_rec_is_supremum(rec)) {
rec_t* next_rec = page_rec_get_next(rec);
- if (next_rec == supremum) {
+ if (page_rec_is_supremum(next_rec)) {
+ total_external_size +=
+ btr_rec_get_externally_stored_len(
+ rec, offsets_rec);
break;
}
@@ -2907,11 +2991,13 @@ btr_estimate_number_of_different_key_vals(
matched_bytes = 0;
offsets_next_rec = rec_get_offsets(next_rec, index,
offsets_next_rec,
- n_cols, &heap);
+ ULINT_UNDEFINED,
+ &heap);
cmp_rec_rec_with_match(rec, next_rec,
offsets_rec, offsets_next_rec,
- index, &matched_fields,
+ index, stats_null_not_equal,
+ &matched_fields,
&matched_bytes);
for (j = matched_fields + 1; j <= n_cols; j++) {
@@ -2921,6 +3007,12 @@ btr_estimate_number_of_different_key_vals(
n_diff[j]++;
}
+ if (n_not_null) {
+ btr_record_not_null_field_in_rec(
+ next_rec, n_cols, offsets_next_rec,
+ n_not_null);
+ }
+
total_external_size
+= btr_rec_get_externally_stored_len(
rec, offsets_rec);
@@ -2955,10 +3047,6 @@ btr_estimate_number_of_different_key_vals(
}
}
- offsets_rec = rec_get_offsets(rec, index, offsets_rec,
- ULINT_UNDEFINED, &heap);
- total_external_size += btr_rec_get_externally_stored_len(
- rec, offsets_rec);
mtr_commit(&mtr);
}
@@ -2971,14 +3059,8 @@ btr_estimate_number_of_different_key_vals(
included in index->stat_n_leaf_pages) */
for (j = 0; j <= n_cols; j++) {
- index->stat_n_diff_key_vals[j]
- = ((n_diff[j]
- * (ib_longlong)index->stat_n_leaf_pages
- + BTR_KEY_VAL_ESTIMATE_N_PAGES - 1
- + total_external_size
- + not_empty_flag)
- / (BTR_KEY_VAL_ESTIMATE_N_PAGES
- + total_external_size));
+ index->stat_n_diff_key_vals[j] = BTR_TABLE_STATS_FROM_SAMPLE(
+ n_diff[j], index, total_external_size, not_empty_flag);
/* If the tree is small, smaller than
10 * BTR_KEY_VAL_ESTIMATE_N_PAGES + total_external_size, then
@@ -2997,12 +3079,20 @@ btr_estimate_number_of_different_key_vals(
}
index->stat_n_diff_key_vals[j] += add_on;
- }
- mem_free(n_diff);
- if (UNIV_LIKELY_NULL(heap)) {
- mem_heap_free(heap);
+ /* Update the stat_n_non_null_key_vals[] with our
+ sampled result. stat_n_non_null_key_vals[] is created
+ and initialized to zero in dict_index_add_to_cache(),
+ along with stat_n_diff_key_vals[] array */
+ if (n_not_null != NULL && (j < n_cols)) {
+ index->stat_n_non_null_key_vals[j] =
+ BTR_TABLE_STATS_FROM_SAMPLE(
+ n_not_null[j], index,
+ total_external_size, not_empty_flag);
+ }
}
+
+ mem_heap_free(heap);
}
/*================== EXTERNAL STORAGE OF BIG FIELDS ===================*/
diff --git a/storage/innobase/dict/dict0dict.c b/storage/innobase/dict/dict0dict.c
index fda6555e082..beea0a2f411 100644
--- a/storage/innobase/dict/dict0dict.c
+++ b/storage/innobase/dict/dict0dict.c
@@ -1358,6 +1358,12 @@ dict_index_add_to_cache(
new_index->heap,
(1 + dict_index_get_n_unique(new_index))
* sizeof(ib_longlong));
+
+ new_index->stat_n_non_null_key_vals = mem_heap_zalloc(
+ new_index->heap,
+ (1 + dict_index_get_n_unique(new_index))
+ * sizeof(*new_index->stat_n_non_null_key_vals));
+
/* Give some sensible values to stat_n_... in case we do
not calculate statistics quickly enough */
@@ -3817,6 +3823,10 @@ dict_update_statistics_low(
for (i = dict_index_get_n_unique(index); i; ) {
index->stat_n_diff_key_vals[i--] = 1;
}
+
+ memset(index->stat_n_non_null_key_vals, 0,
+ (1 + dict_index_get_n_unique(index))
+ * sizeof(*index->stat_n_non_null_key_vals));
}
index = dict_table_get_next_index(index);
diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc
index 4c52326a58a..6f58fd70fbd 100644
--- a/storage/innobase/handler/ha_innodb.cc
+++ b/storage/innobase/handler/ha_innodb.cc
@@ -130,6 +130,25 @@ static my_bool innobase_adaptive_hash_index = TRUE;
static char* internal_innobase_data_file_path = NULL;
+/* Possible values for system variable "innodb_stats_method". The values
+are defined the same as its corresponding MyISAM system variable
+"myisam_stats_method"(see "myisam_stats_method_names"), for better usability */
+static const char* innodb_stats_method_names[] = {
+ "nulls_equal",
+ "nulls_unequal",
+ "nulls_ignored",
+ NullS
+};
+
+/* Used to define an enumerate type of the system variable innodb_stats_method.
+This is the same as "myisam_stats_method_typelib" */
+static TYPELIB innodb_stats_method_typelib = {
+ array_elements(innodb_stats_method_names) - 1,
+ "innodb_stats_method_typelib",
+ innodb_stats_method_names,
+ NULL
+};
+
/* The following counter is used to convey information to InnoDB
about server activity: in selects it is not sensible to call
srv_active_wake_master_thread after each fetch or search, we only do
@@ -6363,6 +6382,65 @@ ha_innobase::read_time(
}
/*************************************************************************
+Calculate Record Per Key value. Need to exclude the NULL value if
+innodb_stats_method is set to "nulls_ignored" */
+static
+ha_rows
+innodb_rec_per_key(
+/*===============*/
+ /* out: estimated record per key
+ value */
+ dict_index_t* index, /* in: dict_index_t structure */
+ ulint i, /* in: the column we are
+ calculating rec per key */
+ ha_rows records) /* in: estimated total records */
+{
+ ha_rows rec_per_key;
+
+ ut_ad(i < dict_index_get_n_unique(index));
+
+ /* Note the stat_n_diff_key_vals[] stores the diff value with
+ n-prefix indexing, so it is always stat_n_diff_key_vals[i + 1] */
+ if (index->stat_n_diff_key_vals[i + 1] == 0) {
+
+ rec_per_key = records;
+ } else if (srv_innodb_stats_method == SRV_STATS_NULLS_IGNORED) {
+ ib_longlong num_null;
+
+ /* Number of rows with NULL value in this
+ field */
+ num_null = records - index->stat_n_non_null_key_vals[i];
+
+ /* In theory, index->stat_n_non_null_key_vals[i]
+ should always be less than the number of records.
+ Since this is statistics value, the value could
+ have slight discrepancy. But we will make sure
+ the number of null values is not a negative number. */
+ num_null = (num_null < 0) ? 0 : num_null;
+
+ /* If the number of NULL values is the same as or
+ large than that of the distinct values, we could
+ consider that the table consists mostly of NULL value.
+ Set rec_per_key to 1. */
+ if (index->stat_n_diff_key_vals[i + 1] <= num_null) {
+ rec_per_key = 1;
+ } else {
+ /* Need to exclude rows with NULL values from
+ rec_per_key calculation */
+ rec_per_key = (ha_rows)(
+ (records - num_null)
+ / (index->stat_n_diff_key_vals[i + 1]
+ - num_null));
+ }
+ } else {
+ rec_per_key = (ha_rows)
+ (records / index->stat_n_diff_key_vals[i + 1]);
+ }
+
+ return(rec_per_key);
+}
+
+/*************************************************************************
Returns statistics information of the table to the MySQL interpreter,
in various fields of the handle object. */
@@ -6568,13 +6646,8 @@ ha_innobase::info_low(
break;
}
- if (index->stat_n_diff_key_vals[j + 1] == 0) {
-
- rec_per_key = stats.records;
- } else {
- rec_per_key = (ha_rows)(stats.records /
- index->stat_n_diff_key_vals[j + 1]);
- }
+ rec_per_key = innodb_rec_per_key(
+ index, j, stats.records);
/* Since MySQL seems to favor table scans
too much over index searches, we pretend
@@ -8990,6 +9063,13 @@ static MYSQL_SYSVAR_LONG(autoinc_lock_mode, innobase_autoinc_lock_mode,
AUTOINC_OLD_STYLE_LOCKING, /* Minimum value */
AUTOINC_NO_LOCKING, 0); /* Maximum value */
+static MYSQL_SYSVAR_ENUM(stats_method, srv_innodb_stats_method,
+ PLUGIN_VAR_RQCMDARG,
+ "Specifies how InnoDB index statistics collection code should "
+ "treat NULLs. Possible values are NULLS_EQUAL (default), "
+ "NULLS_UNEQUAL and NULLS_IGNORED",
+ NULL, NULL, SRV_STATS_NULLS_EQUAL, &innodb_stats_method_typelib);
+
#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
static MYSQL_SYSVAR_UINT(change_buffering_debug, ibuf_debug,
PLUGIN_VAR_RQCMDARG,
@@ -9031,6 +9111,7 @@ static struct st_mysql_sys_var* innobase_system_variables[]= {
MYSQL_SYSVAR(stats_on_metadata),
MYSQL_SYSVAR(use_legacy_cardinality_algorithm),
MYSQL_SYSVAR(adaptive_hash_index),
+ MYSQL_SYSVAR(stats_method),
MYSQL_SYSVAR(status_file),
MYSQL_SYSVAR(support_xa),
MYSQL_SYSVAR(sync_spin_loops),
diff --git a/storage/innobase/include/btr0cur.h b/storage/innobase/include/btr0cur.h
index 213dcb7f568..20235c55f22 100644
--- a/storage/innobase/include/btr0cur.h
+++ b/storage/innobase/include/btr0cur.h
@@ -404,7 +404,10 @@ btr_estimate_n_rows_in_range(
/***********************************************************************
Estimates the number of different key values in a given index, for
each n-column prefix of the index where n <= dict_index_get_n_unique(index).
-The estimates are stored in the array index->stat_n_diff_key_vals. */
+The estimates are stored in the array index->stat_n_diff_key_vals.
+If innodb_stats_method is nulls_ignored, we also record the number of
+non-null values for each prefix and stored the estimates in
+array index->stat_n_non_null_key_vals. */
void
btr_estimate_number_of_different_key_vals(
diff --git a/storage/innobase/include/dict0mem.h b/storage/innobase/include/dict0mem.h
index 2f2a7441478..83dbf65ea41 100644
--- a/storage/innobase/include/dict0mem.h
+++ b/storage/innobase/include/dict0mem.h
@@ -222,6 +222,12 @@ struct dict_index_struct{
for this index, for each n-column prefix
where n <= dict_get_n_unique(index); we
periodically calculate new estimates */
+ ib_longlong* stat_n_non_null_key_vals;
+ /* approximate number of non-null key values
+ for this index, for each column where
+ n < dict_get_n_unique(index); This
+ is used when innodb_stats_method is
+ "nulls_ignored". */
ulint stat_index_size;
/* approximate index size in database pages */
ulint stat_n_leaf_pages;
diff --git a/storage/innobase/include/dict0types.h b/storage/innobase/include/dict0types.h
index b90545f2105..6674b5ff397 100644
--- a/storage/innobase/include/dict0types.h
+++ b/storage/innobase/include/dict0types.h
@@ -16,11 +16,6 @@ typedef struct dict_index_struct dict_index_t;
typedef struct dict_table_struct dict_table_t;
typedef struct dict_foreign_struct dict_foreign_t;
-/* A cluster object is a table object with the type field set to
-DICT_CLUSTERED */
-
-typedef dict_table_t dict_cluster_t;
-
typedef struct ind_node_struct ind_node_t;
typedef struct tab_node_struct tab_node_t;
diff --git a/storage/innobase/include/rem0cmp.h b/storage/innobase/include/rem0cmp.h
index c6a6e5de4db..22a22d13e17 100644
--- a/storage/innobase/include/rem0cmp.h
+++ b/storage/innobase/include/rem0cmp.h
@@ -141,6 +141,10 @@ cmp_rec_rec_with_match(
const ulint* offsets1,/* in: rec_get_offsets(rec1, index) */
const ulint* offsets2,/* in: rec_get_offsets(rec2, index) */
dict_index_t* index, /* in: data dictionary index */
+ ibool nulls_unequal,
+ /* in: TRUE if this is for index statistics
+ cardinality estimation, and innodb_stats_method
+ is "nulls_unequal" or "nulls_ignored" */
ulint* matched_fields, /* in/out: number of already completely
matched fields; when the function returns,
contains the value the for current
diff --git a/storage/innobase/include/rem0cmp.ic b/storage/innobase/include/rem0cmp.ic
index 52dc7ff5dc9..45e12301a3c 100644
--- a/storage/innobase/include/rem0cmp.ic
+++ b/storage/innobase/include/rem0cmp.ic
@@ -72,5 +72,5 @@ cmp_rec_rec(
ulint match_b = 0;
return(cmp_rec_rec_with_match(rec1, rec2, offsets1, offsets2, index,
- &match_f, &match_b));
+ FALSE, &match_f, &match_b));
}
diff --git a/storage/innobase/include/srv0srv.h b/storage/innobase/include/srv0srv.h
index 3dd4bb961f9..811074b2be8 100644
--- a/storage/innobase/include/srv0srv.h
+++ b/storage/innobase/include/srv0srv.h
@@ -91,6 +91,11 @@ extern ulint srv_lock_table_size;
extern ulint srv_n_file_io_threads;
+/* The "innodb_stats_method" setting, decides how InnoDB is going
+to treat NULL value when collecting statistics. It is not defined
+as enum type because the configure option takes unsigned integer type. */
+extern ulong srv_innodb_stats_method;
+
#ifdef UNIV_LOG_ARCHIVE
extern ibool srv_log_archive_on;
extern ibool srv_archive_recovery;
@@ -286,6 +291,19 @@ of lower numbers are included. */
#define SRV_FORCE_NO_LOG_REDO 6 /* do not do the log roll-forward
in connection with recovery */
+/* Alternatives for srv_innodb_stats_method, which could be changed by
+setting innodb_stats_method */
+enum srv_stats_method_name_enum {
+ SRV_STATS_NULLS_EQUAL, /* All NULL values are treated as
+ equal. This is the default setting
+ for innodb_stats_method */
+ SRV_STATS_NULLS_UNEQUAL, /* All NULL values are treated as
+ NOT equal. */
+ SRV_STATS_NULLS_IGNORED /* NULL values are ignored */
+};
+
+typedef enum srv_stats_method_name_enum srv_stats_method_name_t;
+
/*************************************************************************
Boots Innobase server. */
diff --git a/storage/innobase/include/sync0arr.h b/storage/innobase/include/sync0arr.h
index fae26b7a63e..ec48059dbcb 100644
--- a/storage/innobase/include/sync0arr.h
+++ b/storage/innobase/include/sync0arr.h
@@ -93,10 +93,13 @@ sync_arr_wake_threads_if_sema_free(void);
Prints warnings of long semaphore waits to stderr. */
ibool
-sync_array_print_long_waits(void);
-/*=============================*/
- /* out: TRUE if fatal semaphore wait threshold
- was exceeded */
+sync_array_print_long_waits(
+/*========================*/
+ /* out: TRUE if fatal semaphore wait threshold
+ was exceeded */
+ os_thread_id_t* waiter, /* out: longest waiting thread */
+ const void** sema) /* out: longest-waited-for semaphore */
+ __attribute__((nonnull));
/************************************************************************
Validates the integrity of the wait array. Checks
that the number of reserved cells equals the count variable. */
diff --git a/storage/innobase/include/sync0rw.h b/storage/innobase/include/sync0rw.h
index 008df80a2c7..dd898557d6e 100644
--- a/storage/innobase/include/sync0rw.h
+++ b/storage/innobase/include/sync0rw.h
@@ -1,7 +1,7 @@
/******************************************************
The read-write lock (for threads, not for database transactions)
-(c) 1995 Innobase Oy
+Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved.
Created 9/11/1995 Heikki Tuuri
*******************************************************/
@@ -409,6 +409,7 @@ Prints info of a debug struct. */
void
rw_lock_debug_print(
/*================*/
+ FILE* f, /* in: output stream */
rw_lock_debug_t* info); /* in: debug struct */
#endif /* UNIV_SYNC_DEBUG */
diff --git a/storage/innobase/include/trx0rseg.h b/storage/innobase/include/trx0rseg.h
index 46ba010bd1d..22f8aa89181 100644
--- a/storage/innobase/include/trx0rseg.h
+++ b/storage/innobase/include/trx0rseg.h
@@ -121,9 +121,7 @@ struct trx_rseg_struct{
ulint id; /* rollback segment id == the index of
its slot in the trx system file copy */
mutex_t mutex; /* mutex protecting the fields in this
- struct except id; NOTE that the latching
- order must always be kernel mutex ->
- rseg mutex */
+ struct except id, which is constant */
ulint space; /* space where the rollback segment is
header is placed */
ulint page_no;/* page number of the rollback segment
diff --git a/storage/innobase/include/trx0trx.h b/storage/innobase/include/trx0trx.h
index 97a47d9f46e..4652f45892e 100644
--- a/storage/innobase/include/trx0trx.h
+++ b/storage/innobase/include/trx0trx.h
@@ -198,8 +198,9 @@ which is in the prepared state */
trx_t *
trx_get_trx_by_xid(
/*===============*/
- /* out: trx or NULL */
- XID* xid); /* in: X/Open XA transaction identification */
+ /* out: trx or NULL;
+ on match, the trx->xid will be invalidated */
+ const XID* xid); /* in: X/Open XA transaction identifier */
/**************************************************************************
If required, flushes the log to disk if we called trx_commit_for_mysql()
with trx->flush_log_later == TRUE. */
diff --git a/storage/innobase/rem/rem0cmp.c b/storage/innobase/rem/rem0cmp.c
index ca0ec663548..2939c119e2e 100644
--- a/storage/innobase/rem/rem0cmp.c
+++ b/storage/innobase/rem/rem0cmp.c
@@ -720,6 +720,10 @@ cmp_rec_rec_with_match(
const ulint* offsets1,/* in: rec_get_offsets(rec1, index) */
const ulint* offsets2,/* in: rec_get_offsets(rec2, index) */
dict_index_t* index, /* in: data dictionary index */
+ ibool nulls_unequal,
+ /* in: TRUE if this is for index statistics
+ cardinality estimation, and innodb_stats_method
+ is "nulls_unequal" or "nulls_ignored" */
ulint* matched_fields, /* in/out: number of already completely
matched fields; when the function returns,
contains the value the for current
@@ -821,9 +825,13 @@ cmp_rec_rec_with_match(
|| rec2_f_len == UNIV_SQL_NULL) {
if (rec1_f_len == rec2_f_len) {
-
- goto next_field;
-
+ /* This is limited to stats collection,
+ cannot use it for regular search */
+ if (nulls_unequal) {
+ ret = -1;
+ } else {
+ goto next_field;
+ }
} else if (rec2_f_len == UNIV_SQL_NULL) {
/* We define the SQL null to be the
diff --git a/storage/innobase/row/row0vers.c b/storage/innobase/row/row0vers.c
index f4adfa855df..23aca8c3f2e 100644
--- a/storage/innobase/row/row0vers.c
+++ b/storage/innobase/row/row0vers.c
@@ -593,11 +593,15 @@ row_vers_build_for_semi_consistent_read(
mutex_enter(&kernel_mutex);
version_trx = trx_get_on_id(version_trx_id);
+ if (version_trx
+ && (version_trx->conc_state == TRX_COMMITTED_IN_MEMORY
+ || version_trx->conc_state == TRX_NOT_STARTED)) {
+
+ version_trx = NULL;
+ }
mutex_exit(&kernel_mutex);
- if (!version_trx
- || version_trx->conc_state == TRX_NOT_STARTED
- || version_trx->conc_state == TRX_COMMITTED_IN_MEMORY) {
+ if (!version_trx) {
/* We found a version that belongs to a
committed transaction: return it. */
diff --git a/storage/innobase/srv/srv0srv.c b/storage/innobase/srv/srv0srv.c
index 5b1184fb416..3f6f1982992 100644
--- a/storage/innobase/srv/srv0srv.c
+++ b/storage/innobase/srv/srv0srv.c
@@ -218,6 +218,11 @@ ulong srv_max_buf_pool_modified_pct = 90;
/* variable counts amount of data read in total (in bytes) */
ulint srv_data_read = 0;
+/* Internal setting for "innodb_stats_method". Decides how InnoDB treats
+NULL value when collecting statistics. By default, it is set to
+SRV_STATS_NULLS_EQUAL(0), ie. all NULL value are treated equal */
+ulong srv_innodb_stats_method = SRV_STATS_NULLS_EQUAL;
+
/* here we count the amount of data written in total (in bytes) */
ulint srv_data_written = 0;
@@ -2175,9 +2180,15 @@ srv_error_monitor_thread(
os_thread_create */
{
/* number of successive fatal timeouts observed */
- ulint fatal_cnt = 0;
- dulint old_lsn;
- dulint new_lsn;
+ ulint fatal_cnt = 0;
+ dulint old_lsn;
+ dulint new_lsn;
+ /* longest waiting thread for a semaphore */
+ os_thread_id_t waiter = os_thread_get_curr_id();
+ os_thread_id_t old_waiter = waiter;
+ /* the semaphore that is being waited for */
+ const void* sema = NULL;
+ const void* old_sema = NULL;
old_lsn = srv_start_lsn;
@@ -2219,10 +2230,11 @@ loop:
/* In case mutex_exit is not a memory barrier, it is
theoretically possible some threads are left waiting though
the semaphore is already released. Wake up those threads: */
-
+
sync_arr_wake_threads_if_sema_free();
- if (sync_array_print_long_waits()) {
+ if (sync_array_print_long_waits(&waiter, &sema)
+ && sema == old_sema && os_thread_eq(waiter, old_waiter)) {
fatal_cnt++;
if (fatal_cnt > 10) {
@@ -2237,6 +2249,8 @@ loop:
}
} else {
fatal_cnt = 0;
+ old_waiter = waiter;
+ old_sema = sema;
}
/* Flush stderr so that a database user gets the output
diff --git a/storage/innobase/sync/sync0arr.c b/storage/innobase/sync/sync0arr.c
index 154593a9035..93a7398f252 100644
--- a/storage/innobase/sync/sync0arr.c
+++ b/storage/innobase/sync/sync0arr.c
@@ -1,7 +1,7 @@
/******************************************************
The wait array used in synchronization primitives
-(c) 1995 Innobase Oy
+Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved.
Created 9/5/1995 Heikki Tuuri
*******************************************************/
@@ -709,7 +709,7 @@ print:
fprintf(stderr, "rw-lock %p ",
(void*) lock);
sync_array_cell_print(stderr, cell);
- rw_lock_debug_print(debug);
+ rw_lock_debug_print(stderr, debug);
return(TRUE);
}
}
@@ -916,10 +916,12 @@ sync_arr_wake_threads_if_sema_free(void)
Prints warnings of long semaphore waits to stderr. */
ibool
-sync_array_print_long_waits(void)
-/*=============================*/
- /* out: TRUE if fatal semaphore wait threshold
- was exceeded */
+sync_array_print_long_waits(
+/*========================*/
+ /* out: TRUE if fatal semaphore wait threshold
+ was exceeded */
+ os_thread_id_t* waiter, /* out: longest waiting thread */
+ const void** sema) /* out: longest-waited-for semaphore */
{
sync_cell_t* cell;
ibool old_val;
@@ -927,24 +929,40 @@ sync_array_print_long_waits(void)
ulint i;
ulint fatal_timeout = srv_fatal_semaphore_wait_threshold;
ibool fatal = FALSE;
+ double longest_diff = 0;
for (i = 0; i < sync_primary_wait_array->n_cells; i++) {
+ double diff;
+ void* wait_object;
+
cell = sync_array_get_nth_cell(sync_primary_wait_array, i);
- if (cell->wait_object != NULL && cell->waiting
- && difftime(time(NULL), cell->reservation_time) > 240) {
+ wait_object = cell->wait_object;
+
+ if (wait_object == NULL || !cell->waiting) {
+
+ continue;
+ }
+
+ diff = difftime(time(NULL), cell->reservation_time);
+
+ if (diff > 240) {
fputs("InnoDB: Warning: a long semaphore wait:\n",
stderr);
sync_array_cell_print(stderr, cell);
noticed = TRUE;
}
- if (cell->wait_object != NULL && cell->waiting
- && difftime(time(NULL), cell->reservation_time)
- > fatal_timeout) {
+ if (diff > fatal_timeout) {
fatal = TRUE;
}
+
+ if (diff > longest_diff) {
+ longest_diff = diff;
+ *sema = wait_object;
+ *waiter = cell->thread;
+ }
}
if (noticed) {
diff --git a/storage/innobase/sync/sync0rw.c b/storage/innobase/sync/sync0rw.c
index 0b05fb826ac..ef4c07e8c26 100644
--- a/storage/innobase/sync/sync0rw.c
+++ b/storage/innobase/sync/sync0rw.c
@@ -1,7 +1,7 @@
/******************************************************
The read-write lock (for thread synchronization)
-(c) 1995 Innobase Oy
+Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved.
Created 9/11/1995 Heikki Tuuri
*******************************************************/
@@ -830,7 +830,7 @@ rw_lock_list_print_info(
info = UT_LIST_GET_FIRST(lock->debug_list);
while (info != NULL) {
- rw_lock_debug_print(info);
+ rw_lock_debug_print(file, info);
info = UT_LIST_GET_NEXT(list, info);
}
}
@@ -870,7 +870,7 @@ rw_lock_print(
info = UT_LIST_GET_FIRST(lock->debug_list);
while (info != NULL) {
- rw_lock_debug_print(info);
+ rw_lock_debug_print(stderr, info);
info = UT_LIST_GET_NEXT(list, info);
}
}
@@ -882,28 +882,29 @@ Prints info of a debug struct. */
void
rw_lock_debug_print(
/*================*/
+ FILE* f, /* in: output stream */
rw_lock_debug_t* info) /* in: debug struct */
{
ulint rwt;
rwt = info->lock_type;
- fprintf(stderr, "Locked: thread %lu file %s line %lu ",
+ fprintf(f, "Locked: thread %lu file %s line %lu ",
(ulong) os_thread_pf(info->thread_id), info->file_name,
(ulong) info->line);
if (rwt == RW_LOCK_SHARED) {
- fputs("S-LOCK", stderr);
+ fputs("S-LOCK", f);
} else if (rwt == RW_LOCK_EX) {
- fputs("X-LOCK", stderr);
+ fputs("X-LOCK", f);
} else if (rwt == RW_LOCK_WAIT_EX) {
- fputs("WAIT X-LOCK", stderr);
+ fputs("WAIT X-LOCK", f);
} else {
ut_error;
}
if (info->pass != 0) {
- fprintf(stderr, " pass value %lu", (ulong) info->pass);
+ fprintf(f, " pass value %lu", (ulong) info->pass);
}
- putc('\n', stderr);
+ putc('\n', f);
}
/*******************************************************************
diff --git a/storage/innobase/trx/trx0trx.c b/storage/innobase/trx/trx0trx.c
index 21f75e0818f..a82d7f452fc 100644
--- a/storage/innobase/trx/trx0trx.c
+++ b/storage/innobase/trx/trx0trx.c
@@ -2041,14 +2041,15 @@ which is in the prepared state */
trx_t*
trx_get_trx_by_xid(
/*===============*/
- /* out: trx or NULL */
- XID* xid) /* in: X/Open XA transaction identification */
+ /* out: trx or NULL;
+ on match, the trx->xid will be invalidated */
+ const XID* xid) /* in: X/Open XA transaction identifier */
{
trx_t* trx;
if (xid == NULL) {
- return (NULL);
+ return(NULL);
}
mutex_enter(&kernel_mutex);
@@ -2061,10 +2062,16 @@ trx_get_trx_by_xid(
of gtrid_lenght+bqual_length bytes should be
the same */
- if (xid->gtrid_length == trx->xid.gtrid_length
+ if (trx->conc_state == TRX_PREPARED
+ && xid->gtrid_length == trx->xid.gtrid_length
&& xid->bqual_length == trx->xid.bqual_length
&& memcmp(xid->data, trx->xid.data,
xid->gtrid_length + xid->bqual_length) == 0) {
+
+ /* Invalidate the XID, so that subsequent calls
+ will not find it. */
+ memset(&trx->xid, 0, sizeof(trx->xid));
+ trx->xid.formatID = -1;
break;
}
@@ -2073,14 +2080,5 @@ trx_get_trx_by_xid(
mutex_exit(&kernel_mutex);
- if (trx) {
- if (trx->conc_state != TRX_PREPARED) {
-
- return(NULL);
- }
-
- return(trx);
- } else {
- return(NULL);
- }
+ return(trx);
}
diff --git a/storage/innodb_plugin/ChangeLog b/storage/innodb_plugin/ChangeLog
index bf003b810d2..100cf3690ce 100644
--- a/storage/innodb_plugin/ChangeLog
+++ b/storage/innodb_plugin/ChangeLog
@@ -1,3 +1,89 @@
+2011-03-30 The InnoDB Team
+
+ * srv/srv0srv.c, sync/sync0arr.h, sync/sync0arr.c:
+ Fix Bug#11877216 InnoDB too eager to commit suicide on a busy server
+
+2011-03-15 The InnoDB Team
+
+ * btr/btr0cur.c, page/page0zip.c:
+ Fix Bug#11849231 inflateInit() invoked without initializing all memory
+
+2011-02-28 The InnoDB Team
+
+ * btr/btr0sea.c, buf/buf0buf.c, buf/buf0lru.c:
+ Fix Bug#58549 Race condition in buf_LRU_drop_page_hash_for_tablespace()
+ and compressed tables
+
+2011-02-15 The InnoDB Team
+
+ * sync/sync0rw.c, innodb_bug59307.test:
+ Bug#59307 Valgrind: uninitialized value in
+ rw_lock_set_writer_id_and_recursion_flag()
+
+2011-02-14 The InnoDB Team
+
+ * handler/handler0alter.cc:
+ Bug#59749 Enabling concurrent reads while creating non-primary
+ unique index gives failures
+
+2011-01-31 The InnoDB Team
+
+ * btr/btr0cur.c, include/row0upd.h,
+ row/row0purge.c, row/row0umod.c, row/row0upd.c:
+ Bug#59230 assert 0 row_upd_changes_ord_field_binary()
+ in post-crash rollback or purge
+
+2011-01-27 The InnoDB Team
+
+ * btr/btr0cur.c:
+ Bug#59465 btr_estimate_number_of_different_key_vals use
+ incorrect offset for external_size
+
+2011-01-27 The InnoDB Team
+
+ * include/trx0trx.h, trx/trx0trx.c:
+ Bug#59440 Race condition in XA ROLLBACK and XA COMMIT
+ after server restart
+
+2011-01-25 The InnoDB Team
+
+ * row/row0upd.c:
+ Bug#59585 Fix 58912 introduces compiler warning
+ due to potentially uninitialized variable
+
+2011-01-25 The InnoDB Team
+
+ * mtr/mtr0log.c:
+ Bug#59486 Incorrect usage of UNIV_UNLIKELY() in mlog_parse_string()
+
+2011-01-25 The InnoDB Team
+
+ * row/row0vers.c:
+ Fix Bug#59464 Race condition in row_vers_build_for_semi_consistent_read
+
+2011-01-25 The InnoDB Team
+
+ * btr/btr0btr.c, btr/btr0cur.c, btr/btr0sea.c,
+ buf/buf0buddy.c, buf/buf0buf.c, buf/buf0lru.c,
+ include/buf0buf.h, include/buf0buf.ic, include/buf0lru.h,
+ mem/mem0mem.c, page/page0zip.c:
+ Fix Bug#59707 Unused compression-related parameters
+ in buffer pool functions
+
+2011-01-18 The InnoDB Team
+
+ * include/sync0rw.h, sync/sync0arr.c, sync/sync0rw.c:
+ Fix Bug#59579 rw_lock_debug_print outputs to stderr, not to
+ SHOW ENGINE INNODB STATUS
+
+2011-01-14 The InnoDB Team
+ * btr/btr0cur.c, dict/dict0dict.c, handler/ha_innodb.cc,
+ include/btr0cur.h, include/dict0mem.h, include/rem0cmp.h,
+ include/rem0cmp.ic, include/srv0srv.h, rem/rem0cmp.c,
+ srv/srv0srv.c, innodb_bug30423.test:
+ Fix Bug#30423 InnoDBs treatment of NULL in index stats causes
+ bad "rows examined" estimates
+
2011-01-06 The InnoDB Team
* row/row0merge.c:
Fix Bug#59312 Examine MAX_FULL_NAME_LEN in InnoDB to address
diff --git a/storage/innodb_plugin/btr/btr0btr.c b/storage/innodb_plugin/btr/btr0btr.c
index 32e2caecdb8..46810c011c4 100644
--- a/storage/innodb_plugin/btr/btr0btr.c
+++ b/storage/innodb_plugin/btr/btr0btr.c
@@ -42,6 +42,560 @@ Created 6/2/1994 Heikki Tuuri
#include "ibuf0ibuf.h"
#include "trx0trx.h"
+#ifdef UNIV_BLOB_DEBUG
+# include "srv0srv.h"
+# include "ut0rbt.h"
+
+/** TRUE when messages about index->blobs modification are enabled. */
+static ibool btr_blob_dbg_msg;
+
+/** Issue a message about an operation on index->blobs.
+@param op operation
+@param b the entry being subjected to the operation
+@param ctx the context of the operation */
+#define btr_blob_dbg_msg_issue(op, b, ctx) \
+ fprintf(stderr, op " %u:%u:%u->%u %s(%u,%u,%u)\n", \
+ (b)->ref_page_no, (b)->ref_heap_no, \
+ (b)->ref_field_no, (b)->blob_page_no, ctx, \
+ (b)->owner, (b)->always_owner, (b)->del)
+
+/** Insert to index->blobs a reference to an off-page column.
+@param index the index tree
+@param b the reference
+@param ctx context (for logging) */
+UNIV_INTERN
+void
+btr_blob_dbg_rbt_insert(
+/*====================*/
+ dict_index_t* index, /*!< in/out: index tree */
+ const btr_blob_dbg_t* b, /*!< in: the reference */
+ const char* ctx) /*!< in: context (for logging) */
+{
+ if (btr_blob_dbg_msg) {
+ btr_blob_dbg_msg_issue("insert", b, ctx);
+ }
+ mutex_enter(&index->blobs_mutex);
+ rbt_insert(index->blobs, b, b);
+ mutex_exit(&index->blobs_mutex);
+}
+
+/** Remove from index->blobs a reference to an off-page column.
+@param index the index tree
+@param b the reference
+@param ctx context (for logging) */
+UNIV_INTERN
+void
+btr_blob_dbg_rbt_delete(
+/*====================*/
+ dict_index_t* index, /*!< in/out: index tree */
+ const btr_blob_dbg_t* b, /*!< in: the reference */
+ const char* ctx) /*!< in: context (for logging) */
+{
+ if (btr_blob_dbg_msg) {
+ btr_blob_dbg_msg_issue("delete", b, ctx);
+ }
+ mutex_enter(&index->blobs_mutex);
+ ut_a(rbt_delete(index->blobs, b));
+ mutex_exit(&index->blobs_mutex);
+}
+
+/**************************************************************//**
+Comparator for items (btr_blob_dbg_t) in index->blobs.
+The key in index->blobs is (ref_page_no, ref_heap_no, ref_field_no).
+@return negative, 0 or positive if *a<*b, *a=*b, *a>*b */
+static
+int
+btr_blob_dbg_cmp(
+/*=============*/
+ const void* a, /*!< in: first btr_blob_dbg_t to compare */
+ const void* b) /*!< in: second btr_blob_dbg_t to compare */
+{
+ const btr_blob_dbg_t* aa = a;
+ const btr_blob_dbg_t* bb = b;
+
+ ut_ad(aa != NULL);
+ ut_ad(bb != NULL);
+
+ if (aa->ref_page_no != bb->ref_page_no) {
+ return(aa->ref_page_no < bb->ref_page_no ? -1 : 1);
+ }
+ if (aa->ref_heap_no != bb->ref_heap_no) {
+ return(aa->ref_heap_no < bb->ref_heap_no ? -1 : 1);
+ }
+ if (aa->ref_field_no != bb->ref_field_no) {
+ return(aa->ref_field_no < bb->ref_field_no ? -1 : 1);
+ }
+ return(0);
+}
+
+/**************************************************************//**
+Add a reference to an off-page column to the index->blobs map. */
+UNIV_INTERN
+void
+btr_blob_dbg_add_blob(
+/*==================*/
+ const rec_t* rec, /*!< in: clustered index record */
+ ulint field_no, /*!< in: off-page column number */
+ ulint page_no, /*!< in: start page of the column */
+ dict_index_t* index, /*!< in/out: index tree */
+ const char* ctx) /*!< in: context (for logging) */
+{
+ btr_blob_dbg_t b;
+ const page_t* page = page_align(rec);
+
+ ut_a(index->blobs);
+
+ b.blob_page_no = page_no;
+ b.ref_page_no = page_get_page_no(page);
+ b.ref_heap_no = page_rec_get_heap_no(rec);
+ b.ref_field_no = field_no;
+ ut_a(b.ref_field_no >= index->n_uniq);
+ b.always_owner = b.owner = TRUE;
+ b.del = FALSE;
+ ut_a(!rec_get_deleted_flag(rec, page_is_comp(page)));
+ btr_blob_dbg_rbt_insert(index, &b, ctx);
+}
+
+/**************************************************************//**
+Add to index->blobs any references to off-page columns from a record.
+@return number of references added */
+UNIV_INTERN
+ulint
+btr_blob_dbg_add_rec(
+/*=================*/
+ const rec_t* rec, /*!< in: record */
+ dict_index_t* index, /*!< in/out: index */
+ const ulint* offsets,/*!< in: offsets */
+ const char* ctx) /*!< in: context (for logging) */
+{
+ ulint count = 0;
+ ulint i;
+ btr_blob_dbg_t b;
+ ibool del;
+
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
+ if (!rec_offs_any_extern(offsets)) {
+ return(0);
+ }
+
+ b.ref_page_no = page_get_page_no(page_align(rec));
+ b.ref_heap_no = page_rec_get_heap_no(rec);
+ del = (rec_get_deleted_flag(rec, rec_offs_comp(offsets)) != 0);
+
+ for (i = 0; i < rec_offs_n_fields(offsets); i++) {
+ if (rec_offs_nth_extern(offsets, i)) {
+ ulint len;
+ const byte* field_ref = rec_get_nth_field(
+ rec, offsets, i, &len);
+
+ ut_a(len != UNIV_SQL_NULL);
+ ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE);
+ field_ref += len - BTR_EXTERN_FIELD_REF_SIZE;
+
+ if (!memcmp(field_ref, field_ref_zero,
+ BTR_EXTERN_FIELD_REF_SIZE)) {
+ /* the column has not been stored yet */
+ continue;
+ }
+
+ b.ref_field_no = i;
+ b.blob_page_no = mach_read_from_4(
+ field_ref + BTR_EXTERN_PAGE_NO);
+ ut_a(b.ref_field_no >= index->n_uniq);
+ b.always_owner = b.owner
+ = !(field_ref[BTR_EXTERN_LEN]
+ & BTR_EXTERN_OWNER_FLAG);
+ b.del = del;
+
+ btr_blob_dbg_rbt_insert(index, &b, ctx);
+ count++;
+ }
+ }
+
+ return(count);
+}
+
+/**************************************************************//**
+Display the references to off-page columns.
+This function is to be called from a debugger,
+for example when a breakpoint on ut_dbg_assertion_failed is hit. */
+UNIV_INTERN
+void
+btr_blob_dbg_print(
+/*===============*/
+ const dict_index_t* index) /*!< in: index tree */
+{
+ const ib_rbt_node_t* node;
+
+ if (!index->blobs) {
+ return;
+ }
+
+ /* We intentionally do not acquire index->blobs_mutex here.
+ This function is to be called from a debugger, and the caller
+ should make sure that the index->blobs_mutex is held. */
+
+ for (node = rbt_first(index->blobs);
+ node != NULL; node = rbt_next(index->blobs, node)) {
+ const btr_blob_dbg_t* b
+ = rbt_value(btr_blob_dbg_t, node);
+ fprintf(stderr, "%u:%u:%u->%u%s%s%s\n",
+ b->ref_page_no, b->ref_heap_no, b->ref_field_no,
+ b->blob_page_no,
+ b->owner ? "" : "(disowned)",
+ b->always_owner ? "" : "(has disowned)",
+ b->del ? "(deleted)" : "");
+ }
+}
+
+/**************************************************************//**
+Remove from index->blobs any references to off-page columns from a record.
+@return number of references removed */
+UNIV_INTERN
+ulint
+btr_blob_dbg_remove_rec(
+/*====================*/
+ const rec_t* rec, /*!< in: record */
+ dict_index_t* index, /*!< in/out: index */
+ const ulint* offsets,/*!< in: offsets */
+ const char* ctx) /*!< in: context (for logging) */
+{
+ ulint i;
+ ulint count = 0;
+ btr_blob_dbg_t b;
+
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
+ if (!rec_offs_any_extern(offsets)) {
+ return(0);
+ }
+
+ b.ref_page_no = page_get_page_no(page_align(rec));
+ b.ref_heap_no = page_rec_get_heap_no(rec);
+
+ for (i = 0; i < rec_offs_n_fields(offsets); i++) {
+ if (rec_offs_nth_extern(offsets, i)) {
+ ulint len;
+ const byte* field_ref = rec_get_nth_field(
+ rec, offsets, i, &len);
+
+ ut_a(len != UNIV_SQL_NULL);
+ ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE);
+ field_ref += len - BTR_EXTERN_FIELD_REF_SIZE;
+
+ b.ref_field_no = i;
+ b.blob_page_no = mach_read_from_4(
+ field_ref + BTR_EXTERN_PAGE_NO);
+
+ switch (b.blob_page_no) {
+ case 0:
+ /* The column has not been stored yet.
+ The BLOB pointer must be all zero.
+ There cannot be a BLOB starting at
+ page 0, because page 0 is reserved for
+ the tablespace header. */
+ ut_a(!memcmp(field_ref, field_ref_zero,
+ BTR_EXTERN_FIELD_REF_SIZE));
+ /* fall through */
+ case FIL_NULL:
+ /* the column has been freed already */
+ continue;
+ }
+
+ btr_blob_dbg_rbt_delete(index, &b, ctx);
+ count++;
+ }
+ }
+
+ return(count);
+}
+
+/**************************************************************//**
+Check that there are no references to off-page columns from or to
+the given page. Invoked when freeing or clearing a page.
+@return TRUE when no orphan references exist */
+UNIV_INTERN
+ibool
+btr_blob_dbg_is_empty(
+/*==================*/
+ dict_index_t* index, /*!< in: index */
+ ulint page_no) /*!< in: page number */
+{
+ const ib_rbt_node_t* node;
+ ibool success = TRUE;
+
+ if (!index->blobs) {
+ return(success);
+ }
+
+ mutex_enter(&index->blobs_mutex);
+
+ for (node = rbt_first(index->blobs);
+ node != NULL; node = rbt_next(index->blobs, node)) {
+ const btr_blob_dbg_t* b
+ = rbt_value(btr_blob_dbg_t, node);
+
+ if (b->ref_page_no != page_no && b->blob_page_no != page_no) {
+ continue;
+ }
+
+ fprintf(stderr,
+ "InnoDB: orphan BLOB ref%s%s%s %u:%u:%u->%u\n",
+ b->owner ? "" : "(disowned)",
+ b->always_owner ? "" : "(has disowned)",
+ b->del ? "(deleted)" : "",
+ b->ref_page_no, b->ref_heap_no, b->ref_field_no,
+ b->blob_page_no);
+
+ if (b->blob_page_no != page_no || b->owner || !b->del) {
+ success = FALSE;
+ }
+ }
+
+ mutex_exit(&index->blobs_mutex);
+ return(success);
+}
+
+/**************************************************************//**
+Count and process all references to off-page columns on a page.
+@return number of references processed */
+UNIV_INTERN
+ulint
+btr_blob_dbg_op(
+/*============*/
+ const page_t* page, /*!< in: B-tree leaf page */
+ const rec_t* rec, /*!< in: record to start from
+ (NULL to process the whole page) */
+ dict_index_t* index, /*!< in/out: index */
+ const char* ctx, /*!< in: context (for logging) */
+ const btr_blob_dbg_op_f op) /*!< in: operation on records */
+{
+ ulint count = 0;
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+ ut_a(fil_page_get_type(page) == FIL_PAGE_INDEX);
+ ut_a(!rec || page_align(rec) == page);
+
+ if (!index->blobs || !page_is_leaf(page)
+ || !dict_index_is_clust(index)) {
+ return(0);
+ }
+
+ if (rec == NULL) {
+ rec = page_get_infimum_rec(page);
+ }
+
+ do {
+ offsets = rec_get_offsets(rec, index, offsets,
+ ULINT_UNDEFINED, &heap);
+ count += op(rec, index, offsets, ctx);
+ rec = page_rec_get_next_const(rec);
+ } while (!page_rec_is_supremum(rec));
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+
+ return(count);
+}
+
+/**************************************************************//**
+Count and add to index->blobs any references to off-page columns
+from records on a page.
+@return number of references added */
+UNIV_INTERN
+ulint
+btr_blob_dbg_add(
+/*=============*/
+ const page_t* page, /*!< in: rewritten page */
+ dict_index_t* index, /*!< in/out: index */
+ const char* ctx) /*!< in: context (for logging) */
+{
+ btr_blob_dbg_assert_empty(index, page_get_page_no(page));
+
+ return(btr_blob_dbg_op(page, NULL, index, ctx, btr_blob_dbg_add_rec));
+}
+
+/**************************************************************//**
+Count and remove from index->blobs any references to off-page columns
+from records on a page.
+Used when reorganizing a page, before copying the records.
+@return number of references removed */
+UNIV_INTERN
+ulint
+btr_blob_dbg_remove(
+/*================*/
+ const page_t* page, /*!< in: b-tree page */
+ dict_index_t* index, /*!< in/out: index */
+ const char* ctx) /*!< in: context (for logging) */
+{
+ ulint count;
+
+ count = btr_blob_dbg_op(page, NULL, index, ctx,
+ btr_blob_dbg_remove_rec);
+
+ /* Check that no references exist. */
+ btr_blob_dbg_assert_empty(index, page_get_page_no(page));
+
+ return(count);
+}
+
+/**************************************************************//**
+Restore in index->blobs any references to off-page columns
+Used when page reorganize fails due to compressed page overflow. */
+UNIV_INTERN
+void
+btr_blob_dbg_restore(
+/*=================*/
+ const page_t* npage, /*!< in: page that failed to compress */
+ const page_t* page, /*!< in: copy of original page */
+ dict_index_t* index, /*!< in/out: index */
+ const char* ctx) /*!< in: context (for logging) */
+{
+ ulint removed;
+ ulint added;
+
+ ut_a(page_get_page_no(npage) == page_get_page_no(page));
+ ut_a(page_get_space_id(npage) == page_get_space_id(page));
+
+ removed = btr_blob_dbg_remove(npage, index, ctx);
+ added = btr_blob_dbg_add(page, index, ctx);
+ ut_a(added == removed);
+}
+
+/**************************************************************//**
+Modify the 'deleted' flag of a record. */
+UNIV_INTERN
+void
+btr_blob_dbg_set_deleted_flag(
+/*==========================*/
+ const rec_t* rec, /*!< in: record */
+ dict_index_t* index, /*!< in/out: index */
+ const ulint* offsets,/*!< in: rec_get_offs(rec, index) */
+ ibool del) /*!< in: TRUE=deleted, FALSE=exists */
+{
+ const ib_rbt_node_t* node;
+ btr_blob_dbg_t b;
+ btr_blob_dbg_t* c;
+ ulint i;
+
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ ut_a(dict_index_is_clust(index));
+ ut_a(del == !!del);/* must be FALSE==0 or TRUE==1 */
+
+ if (!rec_offs_any_extern(offsets) || !index->blobs) {
+
+ return;
+ }
+
+ b.ref_page_no = page_get_page_no(page_align(rec));
+ b.ref_heap_no = page_rec_get_heap_no(rec);
+
+ for (i = 0; i < rec_offs_n_fields(offsets); i++) {
+ if (rec_offs_nth_extern(offsets, i)) {
+ ulint len;
+ const byte* field_ref = rec_get_nth_field(
+ rec, offsets, i, &len);
+
+ ut_a(len != UNIV_SQL_NULL);
+ ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE);
+ field_ref += len - BTR_EXTERN_FIELD_REF_SIZE;
+
+ b.ref_field_no = i;
+ b.blob_page_no = mach_read_from_4(
+ field_ref + BTR_EXTERN_PAGE_NO);
+
+ switch (b.blob_page_no) {
+ case 0:
+ ut_a(memcmp(field_ref, field_ref_zero,
+ BTR_EXTERN_FIELD_REF_SIZE));
+ /* page number 0 is for the
+ page allocation bitmap */
+ case FIL_NULL:
+ /* the column has been freed already */
+ ut_error;
+ }
+
+ mutex_enter(&index->blobs_mutex);
+ node = rbt_lookup(index->blobs, &b);
+ ut_a(node);
+
+ c = rbt_value(btr_blob_dbg_t, node);
+ /* The flag should be modified. */
+ c->del = del;
+ if (btr_blob_dbg_msg) {
+ b = *c;
+ mutex_exit(&index->blobs_mutex);
+ btr_blob_dbg_msg_issue("del_mk", &b, "");
+ } else {
+ mutex_exit(&index->blobs_mutex);
+ }
+ }
+ }
+}
+
+/**************************************************************//**
+Change the ownership of an off-page column. */
+UNIV_INTERN
+void
+btr_blob_dbg_owner(
+/*===============*/
+ const rec_t* rec, /*!< in: record */
+ dict_index_t* index, /*!< in/out: index */
+ const ulint* offsets,/*!< in: rec_get_offs(rec, index) */
+ ulint i, /*!< in: ith field in rec */
+ ibool own) /*!< in: TRUE=owned, FALSE=disowned */
+{
+ const ib_rbt_node_t* node;
+ btr_blob_dbg_t b;
+ const byte* field_ref;
+ ulint len;
+
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ ut_a(rec_offs_nth_extern(offsets, i));
+
+ field_ref = rec_get_nth_field(rec, offsets, i, &len);
+ ut_a(len != UNIV_SQL_NULL);
+ ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE);
+ field_ref += len - BTR_EXTERN_FIELD_REF_SIZE;
+
+ b.ref_page_no = page_get_page_no(page_align(rec));
+ b.ref_heap_no = page_rec_get_heap_no(rec);
+ b.ref_field_no = i;
+ b.owner = !(field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_OWNER_FLAG);
+ b.blob_page_no = mach_read_from_4(field_ref + BTR_EXTERN_PAGE_NO);
+
+ ut_a(b.owner == own);
+
+ mutex_enter(&index->blobs_mutex);
+ node = rbt_lookup(index->blobs, &b);
+ /* row_ins_clust_index_entry_by_modify() invokes
+ btr_cur_unmark_extern_fields() also for the newly inserted
+ references, which are all zero bytes until the columns are stored.
+ The node lookup must fail if and only if that is the case. */
+ ut_a(!memcmp(field_ref, field_ref_zero, BTR_EXTERN_FIELD_REF_SIZE)
+ == !node);
+
+ if (node) {
+ btr_blob_dbg_t* c = rbt_value(btr_blob_dbg_t, node);
+ /* Some code sets ownership from TRUE to TRUE.
+ We do not allow changing ownership from FALSE to FALSE. */
+ ut_a(own || c->owner);
+
+ c->owner = own;
+ if (!own) {
+ c->always_owner = FALSE;
+ }
+ }
+
+ mutex_exit(&index->blobs_mutex);
+}
+#endif /* UNIV_BLOB_DEBUG */
+
/*
Latching strategy of the InnoDB B-tree
--------------------------------------
@@ -296,6 +850,7 @@ btr_page_create(
page_t* page = buf_block_get_frame(block);
ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+ btr_blob_dbg_assert_empty(index, buf_block_get_page_no(block));
if (UNIV_LIKELY_NULL(page_zip)) {
page_create_zip(block, index, level, mtr);
@@ -489,6 +1044,7 @@ btr_page_free_low(
modify clock */
buf_block_modify_clock_inc(block);
+ btr_blob_dbg_assert_empty(index, buf_block_get_page_no(block));
if (dict_index_is_ibuf(index)) {
@@ -773,6 +1329,13 @@ btr_create(
block = buf_page_get(space, zip_size, page_no,
RW_X_LATCH, mtr);
} else {
+#ifdef UNIV_BLOB_DEBUG
+ if ((type & DICT_CLUSTERED) && !index->blobs) {
+ mutex_create(&index->blobs_mutex, SYNC_ANY_LATCH);
+ index->blobs = rbt_create(sizeof(btr_blob_dbg_t),
+ btr_blob_dbg_cmp);
+ }
+#endif /* UNIV_BLOB_DEBUG */
block = fseg_create(space, 0,
PAGE_HEADER + PAGE_BTR_SEG_TOP, mtr);
}
@@ -979,7 +1542,7 @@ btr_page_reorganize_low(
log_mode = mtr_set_log_mode(mtr, MTR_LOG_NONE);
#ifndef UNIV_HOTBACKUP
- temp_block = buf_block_alloc(0);
+ temp_block = buf_block_alloc();
#else /* !UNIV_HOTBACKUP */
ut_ad(block == back_block1);
temp_block = back_block2;
@@ -996,6 +1559,7 @@ btr_page_reorganize_low(
block->check_index_page_at_flush = TRUE;
#endif /* !UNIV_HOTBACKUP */
+ btr_blob_dbg_remove(page, index, "btr_page_reorganize");
/* Recreate the page: note that global data on page (possible
segment headers, next page-field, etc.) is preserved intact */
@@ -1024,6 +1588,8 @@ btr_page_reorganize_low(
(!page_zip_compress(page_zip, page, index, NULL))) {
/* Restore the old page and exit. */
+ btr_blob_dbg_restore(page, temp_page, index,
+ "btr_page_reorganize_compress_fail");
#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG
/* Check that the bytes that we skip are identical. */
@@ -1157,6 +1723,7 @@ btr_page_empty(
#endif /* UNIV_ZIP_DEBUG */
btr_search_drop_page_hash_index(block);
+ btr_blob_dbg_remove(page, index, "btr_page_empty");
/* Recreate the page: note that global data on page (possible
segment headers, next page-field, etc.) is preserved intact */
@@ -2497,6 +3064,7 @@ btr_lift_page_up(
index);
}
+ btr_blob_dbg_remove(page, index, "btr_lift_page_up");
lock_update_copy_and_discard(father_block, block);
/* Go upward to root page, decrementing levels by one. */
@@ -2758,6 +3326,7 @@ err_exit:
lock_update_merge_right(merge_block, orig_succ, block);
}
+ btr_blob_dbg_remove(page, index, "btr_compress");
mem_heap_free(heap);
if (!dict_index_is_clust(index) && page_is_leaf(merge_page)) {
@@ -2988,6 +3557,8 @@ btr_discard_page(
block);
}
+ btr_blob_dbg_remove(page, index, "btr_discard_page");
+
/* Free the file page */
btr_page_free(index, block, mtr);
diff --git a/storage/innodb_plugin/btr/btr0cur.c b/storage/innodb_plugin/btr/btr0cur.c
index c57255a25ae..d7b5ed0d135 100644
--- a/storage/innodb_plugin/btr/btr0cur.c
+++ b/storage/innodb_plugin/btr/btr0cur.c
@@ -100,6 +100,18 @@ can be released by page reorganize, then it is reorganized */
/*--------------------------------------*/
#define BTR_BLOB_HDR_SIZE 8 /*!< Size of a BLOB
part header, in bytes */
+
+/** Estimated table level stats from sampled value.
+@param value sampled stats
+@param index index being sampled
+@param sample number of sampled rows
+@param ext_size external stored data size
+@param not_empty table not empty
+@return estimated table wide stats from sampled value */
+#define BTR_TABLE_STATS_FROM_SAMPLE(value, index, sample, ext_size, not_empty)\
+ (((value) * (ib_int64_t) index->stat_n_leaf_pages \
+ + (sample) - 1 + (ext_size) + (not_empty)) / ((sample) + (ext_size)))
+
/* @} */
#endif /* !UNIV_HOTBACKUP */
@@ -174,7 +186,7 @@ static
ulint
btr_rec_get_externally_stored_len(
/*==============================*/
- rec_t* rec, /*!< in: record */
+ const rec_t* rec, /*!< in: record */
const ulint* offsets);/*!< in: array returned by rec_get_offsets() */
#endif /* !UNIV_HOTBACKUP */
@@ -1756,8 +1768,8 @@ btr_cur_update_in_place(
NOT call it if index is secondary */
if (!dict_index_is_clust(index)
- || row_upd_changes_ord_field_binary(NULL, NULL,
- index, update)) {
+ || row_upd_changes_ord_field_binary(index, update, thr,
+ NULL, NULL)) {
/* Remove possible hash index pointer to this record */
btr_search_update_hash_on_delete(cursor);
@@ -2560,6 +2572,7 @@ btr_cur_del_mark_set_clust_rec(
page_zip = buf_block_get_page_zip(block);
+ btr_blob_dbg_set_deleted_flag(rec, index, offsets, val);
btr_rec_set_deleted_flag(rec, page_zip, val);
trx = thr_get_trx(thr);
@@ -3201,9 +3214,54 @@ btr_estimate_n_rows_in_range(
}
/*******************************************************************//**
+Record the number of non_null key values in a given index for
+each n-column prefix of the index where n < dict_index_get_n_unique(index).
+The estimates are eventually stored in the array:
+index->stat_n_non_null_key_vals. */
+static
+void
+btr_record_not_null_field_in_rec(
+/*=============================*/
+ rec_t* rec, /*!< in: physical record */
+ ulint n_unique, /*!< in: dict_index_get_n_unique(index),
+ number of columns uniquely determine
+ an index entry */
+ const ulint* offsets, /*!< in: rec_get_offsets(rec, index),
+ its size could be for all fields or
+ that of "n_unique" */
+ ib_int64_t* n_not_null) /*!< in/out: array to record number of
+ not null rows for n-column prefix */
+{
+ ulint i;
+
+ ut_ad(rec_offs_n_fields(offsets) >= n_unique);
+
+ if (n_not_null == NULL) {
+ return;
+ }
+
+ for (i = 0; i < n_unique; i++) {
+ ulint rec_len;
+ byte* field;
+
+ field = rec_get_nth_field(rec, offsets, i, &rec_len);
+
+ if (rec_len != UNIV_SQL_NULL) {
+ n_not_null[i]++;
+ } else {
+ /* Break if we hit the first NULL value */
+ break;
+ }
+ }
+}
+
+/*******************************************************************//**
Estimates the number of different key values in a given index, for
each n-column prefix of the index where n <= dict_index_get_n_unique(index).
-The estimates are stored in the array index->stat_n_diff_key_vals. */
+The estimates are stored in the array index->stat_n_diff_key_vals.
+If innodb_stats_method is "nulls_ignored", we also record the number of
+non-null values for each prefix and store the estimates in
+array index->stat_n_non_null_key_vals. */
UNIV_INTERN
void
btr_estimate_number_of_different_key_vals(
@@ -3217,6 +3275,8 @@ btr_estimate_number_of_different_key_vals(
ulint matched_fields;
ulint matched_bytes;
ib_int64_t* n_diff;
+ ib_int64_t* n_not_null;
+ ibool stats_null_not_equal;
ullint n_sample_pages; /* number of pages to sample */
ulint not_empty_flag = 0;
ulint total_external_size = 0;
@@ -3225,16 +3285,43 @@ btr_estimate_number_of_different_key_vals(
ullint add_on;
mtr_t mtr;
mem_heap_t* heap = NULL;
- ulint offsets_rec_[REC_OFFS_NORMAL_SIZE];
- ulint offsets_next_rec_[REC_OFFS_NORMAL_SIZE];
- ulint* offsets_rec = offsets_rec_;
- ulint* offsets_next_rec= offsets_next_rec_;
- rec_offs_init(offsets_rec_);
- rec_offs_init(offsets_next_rec_);
+ ulint* offsets_rec = NULL;
+ ulint* offsets_next_rec = NULL;
n_cols = dict_index_get_n_unique(index);
- n_diff = mem_zalloc((n_cols + 1) * sizeof(ib_int64_t));
+ heap = mem_heap_create((sizeof *n_diff + sizeof *n_not_null)
+ * (n_cols + 1)
+ + dict_index_get_n_fields(index)
+ * (sizeof *offsets_rec
+ + sizeof *offsets_next_rec));
+
+ n_diff = mem_heap_zalloc(heap, (n_cols + 1) * sizeof(ib_int64_t));
+
+ n_not_null = NULL;
+
+ /* Check srv_innodb_stats_method setting, and decide whether we
+ need to record non-null value and also decide if NULL is
+ considered equal (by setting stats_null_not_equal value) */
+ switch (srv_innodb_stats_method) {
+ case SRV_STATS_NULLS_IGNORED:
+ n_not_null = mem_heap_zalloc(heap, (n_cols + 1)
+ * sizeof *n_not_null);
+ /* fall through */
+
+ case SRV_STATS_NULLS_UNEQUAL:
+ /* for both SRV_STATS_NULLS_IGNORED and SRV_STATS_NULLS_UNEQUAL
+ case, we will treat NULLs as unequal value */
+ stats_null_not_equal = TRUE;
+ break;
+
+ case SRV_STATS_NULLS_EQUAL:
+ stats_null_not_equal = FALSE;
+ break;
+
+ default:
+ ut_error;
+ }
/* It makes no sense to test more pages than are contained
in the index, thus we lower the number if it is too high */
@@ -3251,7 +3338,6 @@ btr_estimate_number_of_different_key_vals(
/* We sample some pages in the index to get an estimate */
for (i = 0; i < n_sample_pages; i++) {
- rec_t* supremum;
mtr_start(&mtr);
btr_cur_open_at_rnd_pos(index, BTR_SEARCH_LEAF, &cursor, &mtr);
@@ -3264,18 +3350,25 @@ btr_estimate_number_of_different_key_vals(
page = btr_cur_get_page(&cursor);
- supremum = page_get_supremum_rec(page);
rec = page_rec_get_next(page_get_infimum_rec(page));
- if (rec != supremum) {
+ if (!page_rec_is_supremum(rec)) {
not_empty_flag = 1;
offsets_rec = rec_get_offsets(rec, index, offsets_rec,
ULINT_UNDEFINED, &heap);
+
+ if (n_not_null) {
+ btr_record_not_null_field_in_rec(
+ rec, n_cols, offsets_rec, n_not_null);
+ }
}
- while (rec != supremum) {
+ while (!page_rec_is_supremum(rec)) {
rec_t* next_rec = page_rec_get_next(rec);
- if (next_rec == supremum) {
+ if (page_rec_is_supremum(next_rec)) {
+ total_external_size +=
+ btr_rec_get_externally_stored_len(
+ rec, offsets_rec);
break;
}
@@ -3283,11 +3376,13 @@ btr_estimate_number_of_different_key_vals(
matched_bytes = 0;
offsets_next_rec = rec_get_offsets(next_rec, index,
offsets_next_rec,
- n_cols, &heap);
+ ULINT_UNDEFINED,
+ &heap);
cmp_rec_rec_with_match(rec, next_rec,
offsets_rec, offsets_next_rec,
- index, &matched_fields,
+ index, stats_null_not_equal,
+ &matched_fields,
&matched_bytes);
for (j = matched_fields + 1; j <= n_cols; j++) {
@@ -3297,6 +3392,12 @@ btr_estimate_number_of_different_key_vals(
n_diff[j]++;
}
+ if (n_not_null) {
+ btr_record_not_null_field_in_rec(
+ next_rec, n_cols, offsets_next_rec,
+ n_not_null);
+ }
+
total_external_size
+= btr_rec_get_externally_stored_len(
rec, offsets_rec);
@@ -3331,10 +3432,6 @@ btr_estimate_number_of_different_key_vals(
}
}
- offsets_rec = rec_get_offsets(rec, index, offsets_rec,
- ULINT_UNDEFINED, &heap);
- total_external_size += btr_rec_get_externally_stored_len(
- rec, offsets_rec);
mtr_commit(&mtr);
}
@@ -3348,13 +3445,9 @@ btr_estimate_number_of_different_key_vals(
for (j = 0; j <= n_cols; j++) {
index->stat_n_diff_key_vals[j]
- = ((n_diff[j]
- * (ib_int64_t)index->stat_n_leaf_pages
- + n_sample_pages - 1
- + total_external_size
- + not_empty_flag)
- / (n_sample_pages
- + total_external_size));
+ = BTR_TABLE_STATS_FROM_SAMPLE(
+ n_diff[j], index, n_sample_pages,
+ total_external_size, not_empty_flag);
/* If the tree is small, smaller than
10 * n_sample_pages + total_external_size, then
@@ -3373,45 +3466,81 @@ btr_estimate_number_of_different_key_vals(
}
index->stat_n_diff_key_vals[j] += add_on;
- }
- mem_free(n_diff);
- if (UNIV_LIKELY_NULL(heap)) {
- mem_heap_free(heap);
+ /* Update the stat_n_non_null_key_vals[] with our
+ sampled result. stat_n_non_null_key_vals[] is created
+ and initialized to zero in dict_index_add_to_cache(),
+ along with stat_n_diff_key_vals[] array */
+ if (n_not_null != NULL && (j < n_cols)) {
+ index->stat_n_non_null_key_vals[j] =
+ BTR_TABLE_STATS_FROM_SAMPLE(
+ n_not_null[j], index, n_sample_pages,
+ total_external_size, not_empty_flag);
+ }
}
+
+ mem_heap_free(heap);
}
/*================== EXTERNAL STORAGE OF BIG FIELDS ===================*/
/***********************************************************//**
+Gets the offset of the pointer to the externally stored part of a field.
+@return offset of the pointer to the externally stored part */
+static
+ulint
+btr_rec_get_field_ref_offs(
+/*=======================*/
+ const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
+ ulint n) /*!< in: index of the external field */
+{
+ ulint field_ref_offs;
+ ulint local_len;
+
+ ut_a(rec_offs_nth_extern(offsets, n));
+ field_ref_offs = rec_get_nth_field_offs(offsets, n, &local_len);
+ ut_a(local_len != UNIV_SQL_NULL);
+ ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
+
+ return(field_ref_offs + local_len - BTR_EXTERN_FIELD_REF_SIZE);
+}
+
+/** Gets a pointer to the externally stored part of a field.
+@param rec record
+@param offsets rec_get_offsets(rec)
+@param n index of the externally stored field
+@return pointer to the externally stored part */
+#define btr_rec_get_field_ref(rec, offsets, n) \
+ ((rec) + btr_rec_get_field_ref_offs(offsets, n))
+
+/***********************************************************//**
Gets the externally stored size of a record, in units of a database page.
@return externally stored part, in units of a database page */
static
ulint
btr_rec_get_externally_stored_len(
/*==============================*/
- rec_t* rec, /*!< in: record */
+ const rec_t* rec, /*!< in: record */
const ulint* offsets)/*!< in: array returned by rec_get_offsets() */
{
ulint n_fields;
- byte* data;
- ulint local_len;
- ulint extern_len;
ulint total_extern_len = 0;
ulint i;
ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec));
+
+ if (!rec_offs_any_extern(offsets)) {
+ return(0);
+ }
+
n_fields = rec_offs_n_fields(offsets);
for (i = 0; i < n_fields; i++) {
if (rec_offs_nth_extern(offsets, i)) {
- data = rec_get_nth_field(rec, offsets, i, &local_len);
-
- local_len -= BTR_EXTERN_FIELD_REF_SIZE;
-
- extern_len = mach_read_from_4(data + local_len
- + BTR_EXTERN_LEN + 4);
+ ulint extern_len = mach_read_from_4(
+ btr_rec_get_field_ref(rec, offsets, i)
+ + BTR_EXTERN_LEN + 4);
total_extern_len += ut_calc_align(extern_len,
UNIV_PAGE_SIZE);
@@ -3441,7 +3570,7 @@ btr_cur_set_ownership_of_extern_field(
ulint byte_val;
data = rec_get_nth_field(rec, offsets, i, &local_len);
-
+ ut_ad(rec_offs_nth_extern(offsets, i));
ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
local_len -= BTR_EXTERN_FIELD_REF_SIZE;
@@ -3451,6 +3580,9 @@ btr_cur_set_ownership_of_extern_field(
if (val) {
byte_val = byte_val & (~BTR_EXTERN_OWNER_FLAG);
} else {
+#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
+ ut_a(!(byte_val & BTR_EXTERN_OWNER_FLAG));
+#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
byte_val = byte_val | BTR_EXTERN_OWNER_FLAG;
}
@@ -3464,6 +3596,8 @@ btr_cur_set_ownership_of_extern_field(
} else {
mach_write_to_1(data + local_len + BTR_EXTERN_LEN, byte_val);
}
+
+ btr_blob_dbg_owner(rec, index, offsets, i, val);
}
/*******************************************************************//**
@@ -3667,13 +3801,12 @@ btr_blob_free(
&& buf_block_get_space(block) == space
&& buf_block_get_page_no(block) == page_no) {
- if (buf_LRU_free_block(&block->page, all, NULL)
- != BUF_LRU_FREED
+ if (buf_LRU_free_block(&block->page, all) != BUF_LRU_FREED
&& all && block->page.zip.data) {
/* Attempt to deallocate the uncompressed page
if the whole block cannot be deallocted. */
- buf_LRU_free_block(&block->page, FALSE, NULL);
+ buf_LRU_free_block(&block->page, FALSE);
}
}
@@ -3689,8 +3822,8 @@ file segment of the index tree.
@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
UNIV_INTERN
ulint
-btr_store_big_rec_extern_fields(
-/*============================*/
+btr_store_big_rec_extern_fields_func(
+/*=================================*/
dict_index_t* index, /*!< in: index of rec; the index tree
MUST be X-latched */
buf_block_t* rec_block, /*!< in/out: block containing rec */
@@ -3699,11 +3832,17 @@ btr_store_big_rec_extern_fields(
the "external storage" flags in offsets
will not correspond to rec when
this function returns */
- big_rec_t* big_rec_vec, /*!< in: vector containing fields
+#ifdef UNIV_DEBUG
+ mtr_t* local_mtr, /*!< in: mtr containing the
+ latch to rec and to the tree */
+#endif /* UNIV_DEBUG */
+#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
+ ibool update_in_place,/*! in: TRUE if the record is updated
+ in place (not delete+insert) */
+#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
+ const big_rec_t*big_rec_vec) /*!< in: vector containing fields
to be stored externally */
- mtr_t* local_mtr __attribute__((unused))) /*!< in: mtr
- containing the latch to rec and to the
- tree */
+
{
ulint rec_page_no;
byte* field_ref;
@@ -3721,6 +3860,7 @@ btr_store_big_rec_extern_fields(
z_stream c_stream;
ut_ad(rec_offs_validate(rec, index, offsets));
+ ut_ad(rec_offs_any_extern(offsets));
ut_ad(mtr_memo_contains(local_mtr, dict_index_get_lock(index),
MTR_MEMO_X_LOCK));
ut_ad(mtr_memo_contains(local_mtr, rec_block, MTR_MEMO_PAGE_X_FIX));
@@ -3752,21 +3892,37 @@ btr_store_big_rec_extern_fields(
ut_a(err == Z_OK);
}
+#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
+ /* All pointers to externally stored columns in the record
+ must either be zero or they must be pointers to inherited
+ columns, owned by this record or an earlier record version. */
+ for (i = 0; i < rec_offs_n_fields(offsets); i++) {
+ if (!rec_offs_nth_extern(offsets, i)) {
+ continue;
+ }
+ field_ref = btr_rec_get_field_ref(rec, offsets, i);
+
+ ut_a(!(field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_OWNER_FLAG));
+ /* Either this must be an update in place,
+ or the BLOB must be inherited, or the BLOB pointer
+ must be zero (will be written in this function). */
+ ut_a(update_in_place
+ || (field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_INHERITED_FLAG)
+ || !memcmp(field_ref, field_ref_zero,
+ BTR_EXTERN_FIELD_REF_SIZE));
+ }
+#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
/* We have to create a file segment to the tablespace
for each field and put the pointer to the field in rec */
for (i = 0; i < big_rec_vec->n_fields; i++) {
- ut_ad(rec_offs_nth_extern(offsets,
- big_rec_vec->fields[i].field_no));
- {
- ulint local_len;
- field_ref = rec_get_nth_field(
- rec, offsets, big_rec_vec->fields[i].field_no,
- &local_len);
- ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
- local_len -= BTR_EXTERN_FIELD_REF_SIZE;
- field_ref += local_len;
- }
+ field_ref = btr_rec_get_field_ref(
+ rec, offsets, big_rec_vec->fields[i].field_no);
+#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
+ /* A zero BLOB pointer should have been initially inserted. */
+ ut_a(!memcmp(field_ref, field_ref_zero,
+ BTR_EXTERN_FIELD_REF_SIZE));
+#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
extern_len = big_rec_vec->fields[i].len;
UNIV_MEM_ASSERT_RW(big_rec_vec->fields[i].data,
extern_len);
@@ -3941,6 +4097,11 @@ btr_store_big_rec_extern_fields(
}
if (prev_page_no == FIL_NULL) {
+ btr_blob_dbg_add_blob(
+ rec, big_rec_vec->fields[i]
+ .field_no, page_no, index,
+ "store");
+
mach_write_to_4(field_ref
+ BTR_EXTERN_SPACE_ID,
space_id);
@@ -4016,6 +4177,11 @@ next_zip_page:
MLOG_4BYTES, &mtr);
if (prev_page_no == FIL_NULL) {
+ btr_blob_dbg_add_blob(
+ rec, big_rec_vec->fields[i]
+ .field_no, page_no, index,
+ "store");
+
mlog_write_ulint(field_ref
+ BTR_EXTERN_SPACE_ID,
space_id,
@@ -4048,6 +4214,23 @@ next_zip_page:
mem_heap_free(heap);
}
+#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
+ /* All pointers to externally stored columns in the record
+ must be valid. */
+ for (i = 0; i < rec_offs_n_fields(offsets); i++) {
+ if (!rec_offs_nth_extern(offsets, i)) {
+ continue;
+ }
+
+ field_ref = btr_rec_get_field_ref(rec, offsets, i);
+
+ /* The pointer must not be zero. */
+ ut_a(0 != memcmp(field_ref, field_ref_zero,
+ BTR_EXTERN_FIELD_REF_SIZE));
+ /* The column must not be disowned by this record. */
+ ut_a(!(field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_OWNER_FLAG));
+ }
+#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
return(DB_SUCCESS);
}
@@ -4070,6 +4253,7 @@ btr_check_blob_fil_page_type(
if (UNIV_UNLIKELY(type != FIL_PAGE_TYPE_BLOB)) {
ulint flags = fil_space_get_flags(space_id);
+#ifndef UNIV_DEBUG /* Improve debug test coverage */
if (UNIV_LIKELY
((flags & DICT_TF_FORMAT_MASK) == DICT_TF_FORMAT_51)) {
/* Old versions of InnoDB did not initialize
@@ -4078,6 +4262,7 @@ btr_check_blob_fil_page_type(
a BLOB page that is in Antelope format.*/
return;
}
+#endif /* !UNIV_DEBUG */
ut_print_timestamp(stderr);
fprintf(stderr,
@@ -4127,23 +4312,13 @@ btr_free_externally_stored_field(
ulint page_no;
ulint next_page_no;
mtr_t mtr;
-#ifdef UNIV_DEBUG
+
ut_ad(mtr_memo_contains(local_mtr, dict_index_get_lock(index),
MTR_MEMO_X_LOCK));
ut_ad(mtr_memo_contains_page(local_mtr, field_ref,
MTR_MEMO_PAGE_X_FIX));
ut_ad(!rec || rec_offs_validate(rec, index, offsets));
-
- if (rec) {
- ulint local_len;
- const byte* f = rec_get_nth_field(rec, offsets,
- i, &local_len);
- ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
- local_len -= BTR_EXTERN_FIELD_REF_SIZE;
- f += local_len;
- ut_ad(f == field_ref);
- }
-#endif /* UNIV_DEBUG */
+ ut_ad(!rec || field_ref == btr_rec_get_field_ref(rec, offsets, i));
if (UNIV_UNLIKELY(!memcmp(field_ref, field_ref_zero,
BTR_EXTERN_FIELD_REF_SIZE))) {
@@ -4175,6 +4350,37 @@ btr_free_externally_stored_field(
rec_zip_size = 0;
}
+#ifdef UNIV_BLOB_DEBUG
+ if (!(field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_OWNER_FLAG)
+ && !((field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_INHERITED_FLAG)
+ && (rb_ctx == RB_NORMAL || rb_ctx == RB_RECOVERY))) {
+ /* This off-page column will be freed.
+ Check that no references remain. */
+
+ btr_blob_dbg_t b;
+
+ b.blob_page_no = mach_read_from_4(
+ field_ref + BTR_EXTERN_PAGE_NO);
+
+ if (rec) {
+ /* Remove the reference from the record to the
+ BLOB. If the BLOB were not freed, the
+ reference would be removed when the record is
+ removed. Freeing the BLOB will overwrite the
+ BTR_EXTERN_PAGE_NO in the field_ref of the
+ record with FIL_NULL, which would make the
+ btr_blob_dbg information inconsistent with the
+ record. */
+ b.ref_page_no = page_get_page_no(page_align(rec));
+ b.ref_heap_no = page_rec_get_heap_no(rec);
+ b.ref_field_no = i;
+ btr_blob_dbg_rbt_delete(index, &b, "free");
+ }
+
+ btr_blob_dbg_assert_empty(index, b.blob_page_no);
+ }
+#endif /* UNIV_BLOB_DEBUG */
+
for (;;) {
#ifdef UNIV_SYNC_DEBUG
buf_block_t* rec_block;
@@ -4308,13 +4514,8 @@ btr_rec_free_externally_stored_fields(
for (i = 0; i < n_fields; i++) {
if (rec_offs_nth_extern(offsets, i)) {
- ulint len;
- byte* data
- = rec_get_nth_field(rec, offsets, i, &len);
- ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE);
-
btr_free_externally_stored_field(
- index, data + len - BTR_EXTERN_FIELD_REF_SIZE,
+ index, btr_rec_get_field_ref(rec, offsets, i),
rec, offsets, page_zip, i, rb_ctx, mtr);
}
}
@@ -4426,27 +4627,45 @@ btr_copy_blob_prefix(
/*******************************************************************//**
Copies the prefix of a compressed BLOB. The clustered index record
-that points to this BLOB must be protected by a lock or a page latch. */
+that points to this BLOB must be protected by a lock or a page latch.
+@return number of bytes written to buf */
static
-void
+ulint
btr_copy_zblob_prefix(
/*==================*/
- z_stream* d_stream,/*!< in/out: the decompressing stream */
+ byte* buf, /*!< out: the externally stored part of
+ the field, or a prefix of it */
+ ulint len, /*!< in: length of buf, in bytes */
ulint zip_size,/*!< in: compressed BLOB page size */
ulint space_id,/*!< in: space id of the BLOB pages */
ulint page_no,/*!< in: page number of the first BLOB page */
ulint offset) /*!< in: offset on the first BLOB page */
{
- ulint page_type = FIL_PAGE_TYPE_ZBLOB;
+ ulint page_type = FIL_PAGE_TYPE_ZBLOB;
+ mem_heap_t* heap;
+ int err;
+ z_stream d_stream;
+
+ d_stream.next_out = buf;
+ d_stream.avail_out = len;
+ d_stream.next_in = Z_NULL;
+ d_stream.avail_in = 0;
+
+ /* Zlib inflate needs 32 kilobytes for the default
+ window size, plus a few kilobytes for small objects. */
+ heap = mem_heap_create(40000);
+ page_zip_set_alloc(&d_stream, heap);
ut_ad(ut_is_2pow(zip_size));
ut_ad(zip_size >= PAGE_ZIP_MIN_SIZE);
ut_ad(zip_size <= UNIV_PAGE_SIZE);
ut_ad(space_id);
+ err = inflateInit(&d_stream);
+ ut_a(err == Z_OK);
+
for (;;) {
buf_page_t* bpage;
- int err;
ulint next_page_no;
/* There is no latch on bpage directly. Instead,
@@ -4462,7 +4681,7 @@ btr_copy_zblob_prefix(
" compressed BLOB"
" page %lu space %lu\n",
(ulong) page_no, (ulong) space_id);
- return;
+ goto func_exit;
}
if (UNIV_UNLIKELY
@@ -4488,13 +4707,13 @@ btr_copy_zblob_prefix(
offset += 4;
}
- d_stream->next_in = bpage->zip.data + offset;
- d_stream->avail_in = zip_size - offset;
+ d_stream.next_in = bpage->zip.data + offset;
+ d_stream.avail_in = zip_size - offset;
- err = inflate(d_stream, Z_NO_FLUSH);
+ err = inflate(&d_stream, Z_NO_FLUSH);
switch (err) {
case Z_OK:
- if (!d_stream->avail_out) {
+ if (!d_stream.avail_out) {
goto end_of_blob;
}
break;
@@ -4511,13 +4730,13 @@ inflate_error:
" compressed BLOB"
" page %lu space %lu returned %d (%s)\n",
(ulong) page_no, (ulong) space_id,
- err, d_stream->msg);
+ err, d_stream.msg);
case Z_BUF_ERROR:
goto end_of_blob;
}
if (next_page_no == FIL_NULL) {
- if (!d_stream->avail_in) {
+ if (!d_stream.avail_in) {
ut_print_timestamp(stderr);
fprintf(stderr,
" InnoDB: unexpected end of"
@@ -4526,7 +4745,7 @@ inflate_error:
(ulong) page_no,
(ulong) space_id);
} else {
- err = inflate(d_stream, Z_FINISH);
+ err = inflate(&d_stream, Z_FINISH);
switch (err) {
case Z_STREAM_END:
case Z_BUF_ERROR:
@@ -4538,7 +4757,7 @@ inflate_error:
end_of_blob:
buf_page_release_zip(bpage);
- return;
+ goto func_exit;
}
buf_page_release_zip(bpage);
@@ -4550,6 +4769,12 @@ end_of_blob:
offset = FIL_PAGE_NEXT;
page_type = FIL_PAGE_TYPE_ZBLOB2;
}
+
+func_exit:
+ inflateEnd(&d_stream);
+ mem_heap_free(heap);
+ UNIV_MEM_ASSERT_RW(buf, d_stream.total_out);
+ return(d_stream.total_out);
}
/*******************************************************************//**
@@ -4575,28 +4800,8 @@ btr_copy_externally_stored_field_prefix_low(
}
if (UNIV_UNLIKELY(zip_size)) {
- int err;
- z_stream d_stream;
- mem_heap_t* heap;
-
- /* Zlib inflate needs 32 kilobytes for the default
- window size, plus a few kilobytes for small objects. */
- heap = mem_heap_create(40000);
- page_zip_set_alloc(&d_stream, heap);
-
- err = inflateInit(&d_stream);
- ut_a(err == Z_OK);
-
- d_stream.next_out = buf;
- d_stream.avail_out = len;
- d_stream.avail_in = 0;
-
- btr_copy_zblob_prefix(&d_stream, zip_size,
- space_id, page_no, offset);
- inflateEnd(&d_stream);
- mem_heap_free(heap);
- UNIV_MEM_ASSERT_RW(buf, d_stream.total_out);
- return(d_stream.total_out);
+ return(btr_copy_zblob_prefix(buf, len, zip_size,
+ space_id, page_no, offset));
} else {
return(btr_copy_blob_prefix(buf, len, space_id,
page_no, offset));
diff --git a/storage/innodb_plugin/btr/btr0sea.c b/storage/innodb_plugin/btr/btr0sea.c
index 035fdbb61d2..cd0eadbb1b8 100644
--- a/storage/innodb_plugin/btr/btr0sea.c
+++ b/storage/innodb_plugin/btr/btr0sea.c
@@ -141,7 +141,7 @@ btr_search_check_free_space_in_heap(void)
be enough free space in the hash table. */
if (heap->free_block == NULL) {
- buf_block_t* block = buf_block_alloc(0);
+ buf_block_t* block = buf_block_alloc();
rw_lock_x_lock(&btr_search_latch);
@@ -1201,8 +1201,8 @@ btr_search_drop_page_hash_when_freed(
having to fear a deadlock. */
block = buf_page_get_gen(space, zip_size, page_no, RW_S_LATCH, NULL,
- BUF_GET_IF_IN_POOL, __FILE__, __LINE__,
- &mtr);
+ BUF_PEEK_IF_IN_POOL, __FILE__, __LINE__,
+ &mtr);
/* Because the buffer pool mutex was released by
buf_page_peek_if_search_hashed(), it is possible that the
block was removed from the buffer pool by another thread
diff --git a/storage/innodb_plugin/buf/buf0buddy.c b/storage/innodb_plugin/buf/buf0buddy.c
index ee5a569c3ff..63c99571510 100644
--- a/storage/innodb_plugin/buf/buf0buddy.c
+++ b/storage/innodb_plugin/buf/buf0buddy.c
@@ -327,7 +327,7 @@ buf_buddy_alloc_low(
/* Try replacing an uncompressed page in the buffer pool. */
buf_pool_mutex_exit();
- block = buf_LRU_get_free_block(0);
+ block = buf_LRU_get_free_block();
*lru = TRUE;
buf_pool_mutex_enter();
diff --git a/storage/innodb_plugin/buf/buf0buf.c b/storage/innodb_plugin/buf/buf0buf.c
index dac416f9472..14ec7b75911 100644
--- a/storage/innodb_plugin/buf/buf0buf.c
+++ b/storage/innodb_plugin/buf/buf0buf.c
@@ -657,9 +657,9 @@ buf_block_init(
block->modify_clock = 0;
-#ifdef UNIV_DEBUG_FILE_ACCESSES
+#if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG
block->page.file_page_was_freed = FALSE;
-#endif /* UNIV_DEBUG_FILE_ACCESSES */
+#endif /* UNIV_DEBUG_FILE_ACCESSES || UNIV_DEBUG */
block->check_index_page_at_flush = FALSE;
block->index = NULL;
@@ -1283,7 +1283,7 @@ shrink_again:
buf_LRU_make_block_old(&block->page);
dirty++;
- } else if (buf_LRU_free_block(&block->page, TRUE, NULL)
+ } else if (buf_LRU_free_block(&block->page, TRUE)
!= BUF_LRU_FREED) {
nonfree++;
}
@@ -1600,7 +1600,7 @@ buf_page_peek_if_search_hashed(
return(is_hashed);
}
-#ifdef UNIV_DEBUG_FILE_ACCESSES
+#if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG
/********************************************************************//**
Sets file_page_was_freed TRUE if the page is found in the buffer pool.
This function should be called when we free a file page and want the
@@ -1621,6 +1621,8 @@ buf_page_set_file_page_was_freed(
bpage = buf_page_hash_get(space, offset);
if (bpage) {
+ /* bpage->file_page_was_freed can already hold
+ when this code is invoked from dict_drop_index_tree() */
bpage->file_page_was_freed = TRUE;
}
@@ -1656,7 +1658,7 @@ buf_page_reset_file_page_was_freed(
return(bpage);
}
-#endif /* UNIV_DEBUG_FILE_ACCESSES */
+#endif /* UNIV_DEBUG_FILE_ACCESSES || UNIV_DEBUG */
/********************************************************************//**
Get read access to a compressed page (usually of type
@@ -1729,8 +1731,7 @@ err_exit:
mutex_enter(block_mutex);
/* Discard the uncompressed page frame if possible. */
- if (buf_LRU_free_block(bpage, FALSE, NULL)
- == BUF_LRU_FREED) {
+ if (buf_LRU_free_block(bpage, FALSE) == BUF_LRU_FREED) {
mutex_exit(block_mutex);
goto lookup;
@@ -1754,7 +1755,7 @@ got_block:
buf_page_set_accessed_make_young(bpage, access_time);
-#ifdef UNIV_DEBUG_FILE_ACCESSES
+#if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG
ut_a(!bpage->file_page_was_freed);
#endif
@@ -1892,16 +1893,19 @@ buf_block_align(
/* TODO: protect buf_pool->chunks with a mutex (it will
currently remain constant after buf_pool_init()) */
for (chunk = buf_pool->chunks, i = buf_pool->n_chunks; i--; chunk++) {
- lint offs = ptr - chunk->blocks->frame;
+ ulint offs;
- if (UNIV_UNLIKELY(offs < 0)) {
+ if (UNIV_UNLIKELY(ptr < chunk->blocks->frame)) {
continue;
}
+ /* else */
+
+ offs = ptr - chunk->blocks->frame;
offs >>= UNIV_PAGE_SIZE_SHIFT;
- if (UNIV_LIKELY((ulint) offs < chunk->size)) {
+ if (UNIV_LIKELY(offs < chunk->size)) {
buf_block_t* block = &chunk->blocks[offs];
/* The function buf_chunk_init() invokes
@@ -2027,7 +2031,7 @@ buf_page_get_gen(
ulint rw_latch,/*!< in: RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH */
buf_block_t* guess, /*!< in: guessed block or NULL */
ulint mode, /*!< in: BUF_GET, BUF_GET_IF_IN_POOL,
- BUF_GET_NO_LATCH */
+ BUF_PEEK_IF_IN_POOL, BUF_GET_NO_LATCH */
const char* file, /*!< in: file name */
ulint line, /*!< in: line where called */
mtr_t* mtr) /*!< in: mini-transaction */
@@ -2043,9 +2047,19 @@ buf_page_get_gen(
ut_ad((rw_latch == RW_S_LATCH)
|| (rw_latch == RW_X_LATCH)
|| (rw_latch == RW_NO_LATCH));
- ut_ad((mode != BUF_GET_NO_LATCH) || (rw_latch == RW_NO_LATCH));
- ut_ad((mode == BUF_GET) || (mode == BUF_GET_IF_IN_POOL)
- || (mode == BUF_GET_NO_LATCH));
+#ifdef UNIV_DEBUG
+ switch (mode) {
+ case BUF_GET_NO_LATCH:
+ ut_ad(rw_latch == RW_NO_LATCH);
+ break;
+ case BUF_GET:
+ case BUF_GET_IF_IN_POOL:
+ case BUF_PEEK_IF_IN_POOL:
+ break;
+ default:
+ ut_error;
+ }
+#endif /* UNIV_DEBUG */
ut_ad(zip_size == fil_space_get_zip_size(space));
ut_ad(ut_is_2pow(zip_size));
#ifndef UNIV_LOG_DEBUG
@@ -2087,7 +2101,8 @@ loop2:
buf_pool_mutex_exit();
- if (mode == BUF_GET_IF_IN_POOL) {
+ if (mode == BUF_GET_IF_IN_POOL
+ || mode == BUF_PEEK_IF_IN_POOL) {
return(NULL);
}
@@ -2126,7 +2141,8 @@ loop2:
must_read = buf_block_get_io_fix(block) == BUF_IO_READ;
- if (must_read && mode == BUF_GET_IF_IN_POOL) {
+ if (must_read && (mode == BUF_GET_IF_IN_POOL
+ || mode == BUF_PEEK_IF_IN_POOL)) {
/* The page is only being read to buffer */
buf_pool_mutex_exit();
@@ -2165,7 +2181,7 @@ wait_until_unfixed:
buf_pool_mutex_exit();
mutex_exit(&buf_pool_zip_mutex);
- block = buf_LRU_get_free_block(0);
+ block = buf_LRU_get_free_block();
ut_a(block);
buf_pool_mutex_enter();
@@ -2244,6 +2260,7 @@ wait_until_unfixed:
mutex_exit(&buf_pool_zip_mutex);
buf_pool->n_pend_unzip++;
+ bpage->state = BUF_BLOCK_ZIP_FREE;
buf_buddy_free(bpage, sizeof *bpage);
buf_pool_mutex_exit();
@@ -2291,8 +2308,7 @@ wait_until_unfixed:
/* Try to evict the block from the buffer pool, to use the
insert buffer as much as possible. */
- if (buf_LRU_free_block(&block->page, TRUE, NULL)
- == BUF_LRU_FREED) {
+ if (buf_LRU_free_block(&block->page, TRUE) == BUF_LRU_FREED) {
buf_pool_mutex_exit();
mutex_exit(&block->mutex);
fprintf(stderr,
@@ -2321,9 +2337,11 @@ wait_until_unfixed:
buf_pool_mutex_exit();
- buf_page_set_accessed_make_young(&block->page, access_time);
+ if (UNIV_LIKELY(mode != BUF_PEEK_IF_IN_POOL)) {
+ buf_page_set_accessed_make_young(&block->page, access_time);
+ }
-#ifdef UNIV_DEBUG_FILE_ACCESSES
+#if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG
ut_a(!block->page.file_page_was_freed);
#endif
@@ -2374,7 +2392,7 @@ wait_until_unfixed:
mtr_memo_push(mtr, block, fix_type);
- if (!access_time) {
+ if (UNIV_LIKELY(mode != BUF_PEEK_IF_IN_POOL) && !access_time) {
/* In the case of a first access, try to apply linear
read-ahead */
@@ -2481,7 +2499,7 @@ buf_page_optimistic_get(
ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
-#ifdef UNIV_DEBUG_FILE_ACCESSES
+#if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG
ut_a(block->page.file_page_was_freed == FALSE);
#endif
if (UNIV_UNLIKELY(!access_time)) {
@@ -2589,7 +2607,7 @@ buf_page_get_known_nowait(
ut_a(block->page.buf_fix_count > 0);
ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
-#ifdef UNIV_DEBUG_FILE_ACCESSES
+#if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG
ut_a(block->page.file_page_was_freed == FALSE);
#endif
@@ -2672,9 +2690,9 @@ buf_page_try_get_func(
ut_a(block->page.buf_fix_count > 0);
ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
-#ifdef UNIV_DEBUG_FILE_ACCESSES
+#if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG
ut_a(block->page.file_page_was_freed == FALSE);
-#endif /* UNIV_DEBUG_FILE_ACCESSES */
+#endif /* UNIV_DEBUG_FILE_ACCESSES || UNIV_DEBUG */
buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK);
buf_pool->stat.n_page_gets++;
@@ -2703,9 +2721,9 @@ buf_page_init_low(
bpage->newest_modification = 0;
bpage->oldest_modification = 0;
HASH_INVALIDATE(bpage, hash);
-#ifdef UNIV_DEBUG_FILE_ACCESSES
+#if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG
bpage->file_page_was_freed = FALSE;
-#endif /* UNIV_DEBUG_FILE_ACCESSES */
+#endif /* UNIV_DEBUG_FILE_ACCESSES || UNIV_DEBUG */
}
/********************************************************************//**
@@ -2829,7 +2847,7 @@ buf_page_init_for_read(
&& UNIV_LIKELY(!recv_recovery_is_on())) {
block = NULL;
} else {
- block = buf_LRU_get_free_block(0);
+ block = buf_LRU_get_free_block();
ut_ad(block);
}
@@ -2923,6 +2941,7 @@ err_exit:
&& UNIV_LIKELY_NULL(buf_page_hash_get(space, offset))) {
/* The block was added by some other thread. */
+ bpage->state = BUF_BLOCK_ZIP_FREE;
buf_buddy_free(bpage, sizeof *bpage);
buf_buddy_free(data, zip_size);
@@ -3001,7 +3020,7 @@ buf_page_create(
ut_ad(mtr->state == MTR_ACTIVE);
ut_ad(space || !zip_size);
- free_block = buf_LRU_get_free_block(0);
+ free_block = buf_LRU_get_free_block();
buf_pool_mutex_enter();
@@ -3011,9 +3030,9 @@ buf_page_create(
#ifdef UNIV_IBUF_COUNT_DEBUG
ut_a(ibuf_count_get(space, offset) == 0);
#endif
-#ifdef UNIV_DEBUG_FILE_ACCESSES
+#if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG
block->page.file_page_was_freed = FALSE;
-#endif /* UNIV_DEBUG_FILE_ACCESSES */
+#endif /* UNIV_DEBUG_FILE_ACCESSES || UNIV_DEBUG */
/* Page can be found in buf_pool */
buf_pool_mutex_exit();
diff --git a/storage/innodb_plugin/buf/buf0lru.c b/storage/innodb_plugin/buf/buf0lru.c
index e4cf218bf2e..a69b2658c51 100644
--- a/storage/innodb_plugin/buf/buf0lru.c
+++ b/storage/innodb_plugin/buf/buf0lru.c
@@ -246,71 +246,75 @@ buf_LRU_drop_page_hash_for_tablespace(
page_arr = ut_malloc(sizeof(ulint)
* BUF_LRU_DROP_SEARCH_HASH_SIZE);
buf_pool_mutex_enter();
+ num_entries = 0;
scan_again:
- num_entries = 0;
bpage = UT_LIST_GET_LAST(buf_pool->LRU);
while (bpage != NULL) {
- mutex_t* block_mutex = buf_page_get_mutex(bpage);
buf_page_t* prev_bpage;
+ ibool is_fixed;
- mutex_enter(block_mutex);
prev_bpage = UT_LIST_GET_PREV(LRU, bpage);
ut_a(buf_page_in_file(bpage));
if (buf_page_get_state(bpage) != BUF_BLOCK_FILE_PAGE
|| bpage->space != id
- || bpage->buf_fix_count > 0
|| bpage->io_fix != BUF_IO_NONE) {
- /* We leave the fixed pages as is in this scan.
- To be dealt with later in the final scan. */
- mutex_exit(block_mutex);
- goto next_page;
+ /* Compressed pages are never hashed.
+ Skip blocks of other tablespaces.
+ Skip I/O-fixed blocks (to be dealt with later). */
+next_page:
+ bpage = prev_bpage;
+ continue;
}
- if (((buf_block_t*) bpage)->is_hashed) {
+ mutex_enter(&((buf_block_t*) bpage)->mutex);
+ is_fixed = bpage->buf_fix_count > 0
+ || !((buf_block_t*) bpage)->is_hashed;
+ mutex_exit(&((buf_block_t*) bpage)->mutex);
- /* Store the offset(i.e.: page_no) in the array
- so that we can drop hash index in a batch
- later. */
- page_arr[num_entries] = bpage->offset;
- mutex_exit(block_mutex);
- ut_a(num_entries < BUF_LRU_DROP_SEARCH_HASH_SIZE);
- ++num_entries;
+ if (is_fixed) {
+ goto next_page;
+ }
- if (num_entries < BUF_LRU_DROP_SEARCH_HASH_SIZE) {
- goto next_page;
- }
- /* Array full. We release the buf_pool_mutex to
- obey the latching order. */
- buf_pool_mutex_exit();
-
- buf_LRU_drop_page_hash_batch(id, zip_size, page_arr,
- num_entries);
- num_entries = 0;
- buf_pool_mutex_enter();
- } else {
- mutex_exit(block_mutex);
+ /* Store the page number so that we can drop the hash
+ index in a batch later. */
+ page_arr[num_entries] = bpage->offset;
+ ut_a(num_entries < BUF_LRU_DROP_SEARCH_HASH_SIZE);
+ ++num_entries;
+
+ if (num_entries < BUF_LRU_DROP_SEARCH_HASH_SIZE) {
+ goto next_page;
}
-next_page:
- /* Note that we may have released the buf_pool mutex
- above after reading the prev_bpage during processing
- of a page_hash_batch (i.e.: when the array was full).
- This means that prev_bpage can change in LRU list.
- This is OK because this function is a 'best effort'
- to drop as many search hash entries as possible and
- it does not guarantee that ALL such entries will be
- dropped. */
- bpage = prev_bpage;
+ /* Array full. We release the buf_pool_mutex to
+ obey the latching order. */
+ buf_pool_mutex_exit();
+ buf_LRU_drop_page_hash_batch(id, zip_size, page_arr,
+ num_entries);
+ buf_pool_mutex_enter();
+ num_entries = 0;
+
+ /* Note that we released the buf_pool mutex above
+ after reading the prev_bpage during processing of a
+ page_hash_batch (i.e.: when the array was full).
+ Because prev_bpage could belong to a compressed-only
+ block, it may have been relocated, and thus the
+ pointer cannot be trusted. Because bpage is of type
+ buf_block_t, it is safe to dereference.
+
+ bpage can change in the LRU list. This is OK because
+ this function is a 'best effort' to drop as many
+ search hash entries as possible and it does not
+ guarantee that ALL such entries will be dropped. */
/* If, however, bpage has been removed from LRU list
to the free list then we should restart the scan.
bpage->state is protected by buf_pool mutex. */
- if (bpage && !buf_page_in_file(bpage)) {
- ut_a(num_entries == 0);
+ if (bpage
+ && buf_page_get_state(bpage) != BUF_BLOCK_FILE_PAGE) {
goto scan_again;
}
}
@@ -575,7 +579,7 @@ buf_LRU_free_from_unzip_LRU_list(
ut_ad(block->page.in_LRU_list);
mutex_enter(&block->mutex);
- freed = buf_LRU_free_block(&block->page, FALSE, NULL);
+ freed = buf_LRU_free_block(&block->page, FALSE);
mutex_exit(&block->mutex);
switch (freed) {
@@ -636,7 +640,7 @@ buf_LRU_free_from_common_LRU_list(
mutex_enter(block_mutex);
accessed = buf_page_is_accessed(bpage);
- freed = buf_LRU_free_block(bpage, TRUE, NULL);
+ freed = buf_LRU_free_block(bpage, TRUE);
mutex_exit(block_mutex);
switch (freed) {
@@ -798,10 +802,8 @@ LRU list to the free list.
@return the free control block, in state BUF_BLOCK_READY_FOR_USE */
UNIV_INTERN
buf_block_t*
-buf_LRU_get_free_block(
-/*===================*/
- ulint zip_size) /*!< in: compressed page size in bytes,
- or 0 if uncompressed tablespace */
+buf_LRU_get_free_block(void)
+/*========================*/
{
buf_block_t* block = NULL;
ibool freed;
@@ -877,26 +879,10 @@ loop:
/* If there is a block in the free list, take it */
block = buf_LRU_get_free_only();
- if (block) {
-
-#ifdef UNIV_DEBUG
- block->page.zip.m_start =
-#endif /* UNIV_DEBUG */
- block->page.zip.m_end =
- block->page.zip.m_nonempty =
- block->page.zip.n_blobs = 0;
-
- if (UNIV_UNLIKELY(zip_size)) {
- ibool lru;
- page_zip_set_size(&block->page.zip, zip_size);
- block->page.zip.data = buf_buddy_alloc(zip_size, &lru);
- UNIV_MEM_DESC(block->page.zip.data, zip_size, block);
- } else {
- page_zip_set_size(&block->page.zip, 0);
- block->page.zip.data = NULL;
- }
+ buf_pool_mutex_exit();
- buf_pool_mutex_exit();
+ if (block) {
+ memset(&block->page.zip, 0, sizeof block->page.zip);
if (started_monitor) {
srv_print_innodb_monitor = mon_value_was;
@@ -908,8 +894,6 @@ loop:
/* If no block was in the free list, search from the end of the LRU
list and try to free a block there */
- buf_pool_mutex_exit();
-
freed = buf_LRU_search_and_free_block(n_iterations);
if (freed > 0) {
@@ -1378,12 +1362,8 @@ enum buf_lru_free_block_status
buf_LRU_free_block(
/*===============*/
buf_page_t* bpage, /*!< in: block to be freed */
- ibool zip, /*!< in: TRUE if should remove also the
+ ibool zip) /*!< in: TRUE if should remove also the
compressed page of an uncompressed page */
- ibool* buf_pool_mutex_released)
- /*!< in: pointer to a variable that will
- be assigned TRUE if buf_pool_mutex
- was temporarily released, or NULL */
{
buf_page_t* b = NULL;
mutex_t* block_mutex = buf_page_get_mutex(bpage);
@@ -1554,10 +1534,6 @@ alloc:
b->io_fix = BUF_IO_READ;
}
- if (buf_pool_mutex_released) {
- *buf_pool_mutex_released = TRUE;
- }
-
buf_pool_mutex_exit();
mutex_exit(block_mutex);
@@ -1827,6 +1803,7 @@ buf_LRU_block_remove_hashed_page(
buf_pool_mutex_exit_forbid();
buf_buddy_free(bpage->zip.data,
page_zip_get_size(&bpage->zip));
+ bpage->state = BUF_BLOCK_ZIP_FREE;
buf_buddy_free(bpage, sizeof(*bpage));
buf_pool_mutex_exit_allow();
UNIV_MEM_UNDESC(bpage);
diff --git a/storage/innodb_plugin/dict/dict0dict.c b/storage/innodb_plugin/dict/dict0dict.c
index 9474ab3582a..a8787bbda61 100644
--- a/storage/innodb_plugin/dict/dict0dict.c
+++ b/storage/innodb_plugin/dict/dict0dict.c
@@ -1669,6 +1669,12 @@ undo_size_ok:
new_index->heap,
(1 + dict_index_get_n_unique(new_index))
* sizeof(ib_int64_t));
+
+ new_index->stat_n_non_null_key_vals = mem_heap_zalloc(
+ new_index->heap,
+ (1 + dict_index_get_n_unique(new_index))
+ * sizeof(*new_index->stat_n_non_null_key_vals));
+
/* Give some sensible values to stat_n_... in case we do
not calculate statistics quickly enough */
@@ -4291,6 +4297,10 @@ dict_update_statistics(
for (i = dict_index_get_n_unique(index); i; ) {
index->stat_n_diff_key_vals[i--] = 1;
}
+
+ memset(index->stat_n_non_null_key_vals, 0,
+ (1 + dict_index_get_n_unique(index))
+ * sizeof(*index->stat_n_non_null_key_vals));
}
index = dict_table_get_next_index(index);
diff --git a/storage/innodb_plugin/dict/dict0mem.c b/storage/innodb_plugin/dict/dict0mem.c
index 3287247029f..aef815dd2f6 100644
--- a/storage/innodb_plugin/dict/dict0mem.c
+++ b/storage/innodb_plugin/dict/dict0mem.c
@@ -36,6 +36,9 @@ Created 1/8/1996 Heikki Tuuri
#ifndef UNIV_HOTBACKUP
# include "lock0lock.h"
#endif /* !UNIV_HOTBACKUP */
+#ifdef UNIV_BLOB_DEBUG
+# include "ut0rbt.h"
+#endif /* UNIV_BLOB_DEBUG */
#define DICT_HEAP_SIZE 100 /*!< initial memory heap size when
creating a table or index object */
@@ -316,6 +319,12 @@ dict_mem_index_free(
{
ut_ad(index);
ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+#ifdef UNIV_BLOB_DEBUG
+ if (index->blobs) {
+ mutex_free(&index->blobs_mutex);
+ rbt_free(index->blobs);
+ }
+#endif /* UNIV_BLOB_DEBUG */
mem_heap_free(index->heap);
}
diff --git a/storage/innodb_plugin/fsp/fsp0fsp.c b/storage/innodb_plugin/fsp/fsp0fsp.c
index e9d24b8fdf6..d091a14c474 100644
--- a/storage/innodb_plugin/fsp/fsp0fsp.c
+++ b/storage/innodb_plugin/fsp/fsp0fsp.c
@@ -3444,9 +3444,9 @@ fseg_free_page(
fseg_free_page_low(seg_inode, space, zip_size, page, mtr);
-#ifdef UNIV_DEBUG_FILE_ACCESSES
+#if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG
buf_page_set_file_page_was_freed(space, page);
-#endif
+#endif /* UNIV_DEBUG_FILE_ACCESSES || UNIV_DEBUG */
}
/**********************************************************************//**
@@ -3513,13 +3513,13 @@ fseg_free_extent(
fsp_free_extent(space, zip_size, page, mtr);
-#ifdef UNIV_DEBUG_FILE_ACCESSES
+#if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG
for (i = 0; i < FSP_EXTENT_SIZE; i++) {
buf_page_set_file_page_was_freed(space,
first_page_in_extent + i);
}
-#endif
+#endif /* UNIV_DEBUG_FILE_ACCESSES || UNIV_DEBUG */
}
/**********************************************************************//**
diff --git a/storage/innodb_plugin/handler/ha_innodb.cc b/storage/innodb_plugin/handler/ha_innodb.cc
index 18a25fd8c4e..481b89ce0d1 100644
--- a/storage/innodb_plugin/handler/ha_innodb.cc
+++ b/storage/innodb_plugin/handler/ha_innodb.cc
@@ -174,6 +174,25 @@ static char* internal_innobase_data_file_path = NULL;
static char* innodb_version_str = (char*) INNODB_VERSION_STR;
+/** Possible values for system variable "innodb_stats_method". The values
+are defined the same as its corresponding MyISAM system variable
+"myisam_stats_method"(see "myisam_stats_method_names"), for better usability */
+static const char* innodb_stats_method_names[] = {
+ "nulls_equal",
+ "nulls_unequal",
+ "nulls_ignored",
+ NullS
+};
+
+/** Used to define an enumerate type of the system variable innodb_stats_method.
+This is the same as "myisam_stats_method_typelib" */
+static TYPELIB innodb_stats_method_typelib = {
+ array_elements(innodb_stats_method_names) - 1,
+ "innodb_stats_method_typelib",
+ innodb_stats_method_names,
+ NULL
+};
+
/* The following counter is used to convey information to InnoDB
about server activity: in selects it is not sensible to call
srv_active_wake_master_thread after each fetch or search, we only do
@@ -7518,6 +7537,65 @@ innobase_get_mysql_key_number_for_index(
return(0);
}
+
+/*********************************************************************//**
+Calculate Record Per Key value. Need to exclude the NULL value if
+innodb_stats_method is set to "nulls_ignored"
+@return estimated record per key value */
+static
+ha_rows
+innodb_rec_per_key(
+/*===============*/
+ dict_index_t* index, /*!< in: dict_index_t structure */
+ ulint i, /*!< in: the column we are
+ calculating rec per key */
+ ha_rows records) /*!< in: estimated total records */
+{
+ ha_rows rec_per_key;
+
+ ut_ad(i < dict_index_get_n_unique(index));
+
+ /* Note the stat_n_diff_key_vals[] stores the diff value with
+ n-prefix indexing, so it is always stat_n_diff_key_vals[i + 1] */
+ if (index->stat_n_diff_key_vals[i + 1] == 0) {
+
+ rec_per_key = records;
+ } else if (srv_innodb_stats_method == SRV_STATS_NULLS_IGNORED) {
+ ib_int64_t num_null;
+
+ /* Number of rows with NULL value in this
+ field */
+ num_null = records - index->stat_n_non_null_key_vals[i];
+
+ /* In theory, index->stat_n_non_null_key_vals[i]
+ should always be less than the number of records.
+ Since this is statistics value, the value could
+ have slight discrepancy. But we will make sure
+ the number of null values is not a negative number. */
+ num_null = (num_null < 0) ? 0 : num_null;
+
+ /* If the number of NULL values is the same as or
+ large than that of the distinct values, we could
+ consider that the table consists mostly of NULL value.
+ Set rec_per_key to 1. */
+ if (index->stat_n_diff_key_vals[i + 1] <= num_null) {
+ rec_per_key = 1;
+ } else {
+ /* Need to exclude rows with NULL values from
+ rec_per_key calculation */
+ rec_per_key = (ha_rows)(
+ (records - num_null)
+ / (index->stat_n_diff_key_vals[i + 1]
+ - num_null));
+ }
+ } else {
+ rec_per_key = (ha_rows)
+ (records / index->stat_n_diff_key_vals[i + 1]);
+ }
+
+ return(rec_per_key);
+}
+
/*********************************************************************//**
Returns statistics information of the table to the MySQL interpreter,
in various fields of the handle object. */
@@ -7748,13 +7826,8 @@ ha_innobase::info_low(
break;
}
- if (index->stat_n_diff_key_vals[j + 1] == 0) {
-
- rec_per_key = stats.records;
- } else {
- rec_per_key = (ha_rows)(stats.records /
- index->stat_n_diff_key_vals[j + 1]);
- }
+ rec_per_key = innodb_rec_per_key(
+ index, j, stats.records);
/* Since MySQL seems to favor table scans
too much over index searches, we pretend
@@ -10945,6 +11018,13 @@ static MYSQL_SYSVAR_STR(change_buffering, innobase_change_buffering,
innodb_change_buffering_validate,
innodb_change_buffering_update, "inserts");
+static MYSQL_SYSVAR_ENUM(stats_method, srv_innodb_stats_method,
+ PLUGIN_VAR_RQCMDARG,
+ "Specifies how InnoDB index statistics collection code should "
+ "treat NULLs. Possible values are NULLS_EQUAL (default), "
+ "NULLS_UNEQUAL and NULLS_IGNORED",
+ NULL, NULL, SRV_STATS_NULLS_EQUAL, &innodb_stats_method_typelib);
+
#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
static MYSQL_SYSVAR_UINT(change_buffering_debug, ibuf_debug,
PLUGIN_VAR_RQCMDARG,
@@ -10999,6 +11079,7 @@ static struct st_mysql_sys_var* innobase_system_variables[]= {
MYSQL_SYSVAR(stats_on_metadata),
MYSQL_SYSVAR(stats_sample_pages),
MYSQL_SYSVAR(adaptive_hash_index),
+ MYSQL_SYSVAR(stats_method),
MYSQL_SYSVAR(replication_delay),
MYSQL_SYSVAR(status_file),
MYSQL_SYSVAR(strict_mode),
diff --git a/storage/innodb_plugin/handler/handler0alter.cc b/storage/innodb_plugin/handler/handler0alter.cc
index 517445f7e69..dc1317d5c5a 100644
--- a/storage/innodb_plugin/handler/handler0alter.cc
+++ b/storage/innodb_plugin/handler/handler0alter.cc
@@ -782,10 +782,6 @@ err_exit:
ut_ad(error == DB_SUCCESS);
- /* We will need to rebuild index translation table. Set
- valid index entry count in the translation table to zero */
- share->idx_trans_tbl.index_count = 0;
-
/* Commit the data dictionary transaction in order to release
the table locks on the system tables. This means that if
MySQL crashes while creating a new primary key inside
@@ -911,6 +907,14 @@ error:
}
convert_error:
+ if (error == DB_SUCCESS) {
+ /* Build index is successful. We will need to
+ rebuild index translation table. Reset the
+ index entry count in the translation table
+ to zero, so that translation table will be rebuilt */
+ share->idx_trans_tbl.index_count = 0;
+ }
+
error = convert_error_code_to_mysql(error,
innodb_table->flags,
user_thd);
diff --git a/storage/innodb_plugin/ibuf/ibuf0ibuf.c b/storage/innodb_plugin/ibuf/ibuf0ibuf.c
index 701e8f0ef04..23981ac388e 100644
--- a/storage/innodb_plugin/ibuf/ibuf0ibuf.c
+++ b/storage/innodb_plugin/ibuf/ibuf0ibuf.c
@@ -1878,9 +1878,9 @@ ibuf_remove_free_page(void)
fseg_free_page(header_page + IBUF_HEADER + IBUF_TREE_SEG_HEADER,
IBUF_SPACE_ID, page_no, &mtr);
-#ifdef UNIV_DEBUG_FILE_ACCESSES
+#if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG
buf_page_reset_file_page_was_freed(IBUF_SPACE_ID, page_no);
-#endif
+#endif /* UNIV_DEBUG_FILE_ACCESSES || UNIV_DEBUG */
ibuf_enter();
@@ -1922,9 +1922,9 @@ ibuf_remove_free_page(void)
ibuf_bitmap_page_set_bits(
bitmap_page, page_no, zip_size, IBUF_BITMAP_IBUF, FALSE, &mtr);
-#ifdef UNIV_DEBUG_FILE_ACCESSES
+#if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG
buf_page_set_file_page_was_freed(IBUF_SPACE_ID, page_no);
-#endif
+#endif /* UNIV_DEBUG_FILE_ACCESSES || UNIV_DEBUG */
mtr_commit(&mtr);
mutex_exit(&ibuf_mutex);
diff --git a/storage/innodb_plugin/include/btr0btr.h b/storage/innodb_plugin/include/btr0btr.h
index dde3a0bab69..5aa02694e0e 100644
--- a/storage/innodb_plugin/include/btr0btr.h
+++ b/storage/innodb_plugin/include/btr0btr.h
@@ -81,6 +81,91 @@ UNIQUE definition on secondary indexes when we decide if we can use
the insert buffer to speed up inserts */
#define BTR_IGNORE_SEC_UNIQUE 2048
+#ifdef UNIV_BLOB_DEBUG
+# include "ut0rbt.h"
+/** An index->blobs entry for keeping track of off-page column references */
+struct btr_blob_dbg_struct
+{
+ unsigned blob_page_no:32; /*!< first BLOB page number */
+ unsigned ref_page_no:32; /*!< referring page number */
+ unsigned ref_heap_no:16; /*!< referring heap number */
+ unsigned ref_field_no:10; /*!< referring field number */
+ unsigned owner:1; /*!< TRUE if BLOB owner */
+ unsigned always_owner:1; /*!< TRUE if always
+ has been the BLOB owner;
+ reset to TRUE on B-tree
+ page splits and merges */
+ unsigned del:1; /*!< TRUE if currently
+ delete-marked */
+};
+
+/**************************************************************//**
+Add a reference to an off-page column to the index->blobs map. */
+UNIV_INTERN
+void
+btr_blob_dbg_add_blob(
+/*==================*/
+ const rec_t* rec, /*!< in: clustered index record */
+ ulint field_no, /*!< in: number of off-page column */
+ ulint page_no, /*!< in: start page of the column */
+ dict_index_t* index, /*!< in/out: index tree */
+ const char* ctx) /*!< in: context (for logging) */
+ __attribute__((nonnull));
+/**************************************************************//**
+Display the references to off-page columns.
+This function is to be called from a debugger,
+for example when a breakpoint on ut_dbg_assertion_failed is hit. */
+UNIV_INTERN
+void
+btr_blob_dbg_print(
+/*===============*/
+ const dict_index_t* index) /*!< in: index tree */
+ __attribute__((nonnull));
+/**************************************************************//**
+Check that there are no references to off-page columns from or to
+the given page. Invoked when freeing or clearing a page.
+@return TRUE when no orphan references exist */
+UNIV_INTERN
+ibool
+btr_blob_dbg_is_empty(
+/*==================*/
+ dict_index_t* index, /*!< in: index */
+ ulint page_no) /*!< in: page number */
+ __attribute__((nonnull, warn_unused_result));
+
+/**************************************************************//**
+Modify the 'deleted' flag of a record. */
+UNIV_INTERN
+void
+btr_blob_dbg_set_deleted_flag(
+/*==========================*/
+ const rec_t* rec, /*!< in: record */
+ dict_index_t* index, /*!< in/out: index */
+ const ulint* offsets,/*!< in: rec_get_offs(rec, index) */
+ ibool del) /*!< in: TRUE=deleted, FALSE=exists */
+ __attribute__((nonnull));
+/**************************************************************//**
+Change the ownership of an off-page column. */
+UNIV_INTERN
+void
+btr_blob_dbg_owner(
+/*===============*/
+ const rec_t* rec, /*!< in: record */
+ dict_index_t* index, /*!< in/out: index */
+ const ulint* offsets,/*!< in: rec_get_offs(rec, index) */
+ ulint i, /*!< in: ith field in rec */
+ ibool own) /*!< in: TRUE=owned, FALSE=disowned */
+ __attribute__((nonnull));
+/** Assert that there are no BLOB references to or from the given page. */
+# define btr_blob_dbg_assert_empty(index, page_no) \
+ ut_a(btr_blob_dbg_is_empty(index, page_no))
+#else /* UNIV_BLOB_DEBUG */
+# define btr_blob_dbg_add_blob(rec, field_no, page, index, ctx) ((void) 0)
+# define btr_blob_dbg_set_deleted_flag(rec, index, offsets, del)((void) 0)
+# define btr_blob_dbg_owner(rec, index, offsets, i, val) ((void) 0)
+# define btr_blob_dbg_assert_empty(index, page_no) ((void) 0)
+#endif /* UNIV_BLOB_DEBUG */
+
/**************************************************************//**
Gets the root node of a tree and x-latches it.
@return root page, x-latched */
diff --git a/storage/innodb_plugin/include/btr0cur.h b/storage/innodb_plugin/include/btr0cur.h
index b477ad0320a..ece3621fa97 100644
--- a/storage/innodb_plugin/include/btr0cur.h
+++ b/storage/innodb_plugin/include/btr0cur.h
@@ -478,7 +478,10 @@ btr_estimate_n_rows_in_range(
/*******************************************************************//**
Estimates the number of different key values in a given index, for
each n-column prefix of the index where n <= dict_index_get_n_unique(index).
-The estimates are stored in the array index->stat_n_diff_key_vals. */
+The estimates are stored in the array index->stat_n_diff_key_vals.
+If innodb_stats_method is nulls_ignored, we also record the number of
+non-null values for each prefix and stored the estimates in
+array index->stat_n_non_null_key_vals. */
UNIV_INTERN
void
btr_estimate_number_of_different_key_vals(
@@ -509,8 +512,8 @@ file segment of the index tree.
@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
UNIV_INTERN
ulint
-btr_store_big_rec_extern_fields(
-/*============================*/
+btr_store_big_rec_extern_fields_func(
+/*=================================*/
dict_index_t* index, /*!< in: index of rec; the index tree
MUST be X-latched */
buf_block_t* rec_block, /*!< in/out: block containing rec */
@@ -519,10 +522,42 @@ btr_store_big_rec_extern_fields(
the "external storage" flags in offsets
will not correspond to rec when
this function returns */
- big_rec_t* big_rec_vec, /*!< in: vector containing fields
+#ifdef UNIV_DEBUG
+ mtr_t* local_mtr, /*!< in: mtr containing the
+ latch to rec and to the tree */
+#endif /* UNIV_DEBUG */
+#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
+ ibool update_in_place,/*! in: TRUE if the record is updated
+ in place (not delete+insert) */
+#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
+ const big_rec_t*big_rec_vec) /*!< in: vector containing fields
to be stored externally */
- mtr_t* local_mtr); /*!< in: mtr containing the latch to
- rec and to the tree */
+ __attribute__((nonnull));
+
+/** Stores the fields in big_rec_vec to the tablespace and puts pointers to
+them in rec. The extern flags in rec will have to be set beforehand.
+The fields are stored on pages allocated from leaf node
+file segment of the index tree.
+@param index in: clustered index; MUST be X-latched by mtr
+@param b in/out: block containing rec; MUST be X-latched by mtr
+@param rec in/out: clustered index record
+@param offsets in: rec_get_offsets(rec, index);
+ the "external storage" flags in offsets will not be adjusted
+@param mtr in: mini-transaction that holds x-latch on index and b
+@param upd in: TRUE if the record is updated in place (not delete+insert)
+@param big in: vector containing fields to be stored externally
+@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
+#ifdef UNIV_DEBUG
+# define btr_store_big_rec_extern_fields(index,b,rec,offsets,mtr,upd,big) \
+ btr_store_big_rec_extern_fields_func(index,b,rec,offsets,mtr,upd,big)
+#elif defined UNIV_BLOB_LIGHT_DEBUG
+# define btr_store_big_rec_extern_fields(index,b,rec,offsets,mtr,upd,big) \
+ btr_store_big_rec_extern_fields_func(index,b,rec,offsets,upd,big)
+#else
+# define btr_store_big_rec_extern_fields(index,b,rec,offsets,mtr,upd,big) \
+ btr_store_big_rec_extern_fields_func(index,b,rec,offsets,big)
+#endif
+
/*******************************************************************//**
Frees the space in an externally stored field to the file space
management if the field in data is owned the externally stored field,
diff --git a/storage/innodb_plugin/include/btr0types.h b/storage/innodb_plugin/include/btr0types.h
index ef4a6b04b34..07c06fb18d7 100644
--- a/storage/innodb_plugin/include/btr0types.h
+++ b/storage/innodb_plugin/include/btr0types.h
@@ -38,6 +38,131 @@ typedef struct btr_cur_struct btr_cur_t;
/** B-tree search information for the adaptive hash index */
typedef struct btr_search_struct btr_search_t;
+#ifdef UNIV_BLOB_DEBUG
+# include "buf0types.h"
+/** An index->blobs entry for keeping track of off-page column references */
+typedef struct btr_blob_dbg_struct btr_blob_dbg_t;
+
+/** Insert to index->blobs a reference to an off-page column.
+@param index the index tree
+@param b the reference
+@param ctx context (for logging) */
+UNIV_INTERN
+void
+btr_blob_dbg_rbt_insert(
+/*====================*/
+ dict_index_t* index, /*!< in/out: index tree */
+ const btr_blob_dbg_t* b, /*!< in: the reference */
+ const char* ctx) /*!< in: context (for logging) */
+ __attribute__((nonnull));
+
+/** Remove from index->blobs a reference to an off-page column.
+@param index the index tree
+@param b the reference
+@param ctx context (for logging) */
+UNIV_INTERN
+void
+btr_blob_dbg_rbt_delete(
+/*====================*/
+ dict_index_t* index, /*!< in/out: index tree */
+ const btr_blob_dbg_t* b, /*!< in: the reference */
+ const char* ctx) /*!< in: context (for logging) */
+ __attribute__((nonnull));
+
+/**************************************************************//**
+Add to index->blobs any references to off-page columns from a record.
+@return number of references added */
+UNIV_INTERN
+ulint
+btr_blob_dbg_add_rec(
+/*=================*/
+ const rec_t* rec, /*!< in: record */
+ dict_index_t* index, /*!< in/out: index */
+ const ulint* offsets,/*!< in: offsets */
+ const char* ctx) /*!< in: context (for logging) */
+ __attribute__((nonnull));
+/**************************************************************//**
+Remove from index->blobs any references to off-page columns from a record.
+@return number of references removed */
+UNIV_INTERN
+ulint
+btr_blob_dbg_remove_rec(
+/*====================*/
+ const rec_t* rec, /*!< in: record */
+ dict_index_t* index, /*!< in/out: index */
+ const ulint* offsets,/*!< in: offsets */
+ const char* ctx) /*!< in: context (for logging) */
+ __attribute__((nonnull));
+/**************************************************************//**
+Count and add to index->blobs any references to off-page columns
+from records on a page.
+@return number of references added */
+UNIV_INTERN
+ulint
+btr_blob_dbg_add(
+/*=============*/
+ const page_t* page, /*!< in: rewritten page */
+ dict_index_t* index, /*!< in/out: index */
+ const char* ctx) /*!< in: context (for logging) */
+ __attribute__((nonnull));
+/**************************************************************//**
+Count and remove from index->blobs any references to off-page columns
+from records on a page.
+Used when reorganizing a page, before copying the records.
+@return number of references removed */
+UNIV_INTERN
+ulint
+btr_blob_dbg_remove(
+/*================*/
+ const page_t* page, /*!< in: b-tree page */
+ dict_index_t* index, /*!< in/out: index */
+ const char* ctx) /*!< in: context (for logging) */
+ __attribute__((nonnull));
+/**************************************************************//**
+Restore in index->blobs any references to off-page columns
+Used when page reorganize fails due to compressed page overflow. */
+UNIV_INTERN
+void
+btr_blob_dbg_restore(
+/*=================*/
+ const page_t* npage, /*!< in: page that failed to compress */
+ const page_t* page, /*!< in: copy of original page */
+ dict_index_t* index, /*!< in/out: index */
+ const char* ctx) /*!< in: context (for logging) */
+ __attribute__((nonnull));
+
+/** Operation that processes the BLOB references of an index record
+@param[in] rec record on index page
+@param[in/out] index the index tree of the record
+@param[in] offsets rec_get_offsets(rec,index)
+@param[in] ctx context (for logging)
+@return number of BLOB references processed */
+typedef ulint (*btr_blob_dbg_op_f)
+(const rec_t* rec,dict_index_t* index,const ulint* offsets,const char* ctx);
+
+/**************************************************************//**
+Count and process all references to off-page columns on a page.
+@return number of references processed */
+UNIV_INTERN
+ulint
+btr_blob_dbg_op(
+/*============*/
+ const page_t* page, /*!< in: B-tree leaf page */
+ const rec_t* rec, /*!< in: record to start from
+ (NULL to process the whole page) */
+ dict_index_t* index, /*!< in/out: index */
+ const char* ctx, /*!< in: context (for logging) */
+ const btr_blob_dbg_op_f op) /*!< in: operation on records */
+ __attribute__((nonnull(1,3,4,5)));
+#else /* UNIV_BLOB_DEBUG */
+# define btr_blob_dbg_add_rec(rec, index, offsets, ctx) ((void) 0)
+# define btr_blob_dbg_add(page, index, ctx) ((void) 0)
+# define btr_blob_dbg_remove_rec(rec, index, offsets, ctx) ((void) 0)
+# define btr_blob_dbg_remove(page, index, ctx) ((void) 0)
+# define btr_blob_dbg_restore(npage, page, index, ctx) ((void) 0)
+# define btr_blob_dbg_op(page, rec, index, ctx, op) ((void) 0)
+#endif /* UNIV_BLOB_DEBUG */
+
/** The size of a reference to data stored on a different page.
The reference is stored at the end of the prefix of the field
in the index record. */
diff --git a/storage/innodb_plugin/include/buf0buf.h b/storage/innodb_plugin/include/buf0buf.h
index cd4ee5906f0..05dead5ac9e 100644
--- a/storage/innodb_plugin/include/buf0buf.h
+++ b/storage/innodb_plugin/include/buf0buf.h
@@ -41,6 +41,8 @@ Created 11/5/1995 Heikki Tuuri
/* @{ */
#define BUF_GET 10 /*!< get always */
#define BUF_GET_IF_IN_POOL 11 /*!< get if in pool */
+#define BUF_PEEK_IF_IN_POOL 12 /*!< get if in pool, do not make
+ the block young in the LRU list */
#define BUF_GET_NO_LATCH 14 /*!< get and bufferfix, but
set no latch; we have
separated this case, because
@@ -165,10 +167,8 @@ Allocates a buffer block.
@return own: the allocated block, in state BUF_BLOCK_MEMORY */
UNIV_INLINE
buf_block_t*
-buf_block_alloc(
-/*============*/
- ulint zip_size); /*!< in: compressed page size in bytes,
- or 0 if uncompressed tablespace */
+buf_block_alloc(void);
+/*=================*/
/********************************************************************//**
Frees a buffer block which does not contain a file page. */
UNIV_INLINE
@@ -286,7 +286,7 @@ buf_page_get_gen(
ulint rw_latch,/*!< in: RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH */
buf_block_t* guess, /*!< in: guessed block or NULL */
ulint mode, /*!< in: BUF_GET, BUF_GET_IF_IN_POOL,
- BUF_GET_NO_LATCH */
+ BUF_PEEK_IF_IN_POOL, BUF_GET_NO_LATCH */
const char* file, /*!< in: file name */
ulint line, /*!< in: line where called */
mtr_t* mtr); /*!< in: mini-transaction */
@@ -370,7 +370,7 @@ buf_reset_check_index_page_at_flush(
/*================================*/
ulint space, /*!< in: space id */
ulint offset);/*!< in: page number */
-#ifdef UNIV_DEBUG_FILE_ACCESSES
+#if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG
/********************************************************************//**
Sets file_page_was_freed TRUE if the page is found in the buffer pool.
This function should be called when we free a file page and want the
@@ -395,7 +395,7 @@ buf_page_reset_file_page_was_freed(
/*===============================*/
ulint space, /*!< in: space id */
ulint offset); /*!< in: page number */
-#endif /* UNIV_DEBUG_FILE_ACCESSES */
+#endif /* UNIV_DEBUG_FILE_ACCESSES || UNIV_DEBUG */
/********************************************************************//**
Reads the freed_page_clock of a buffer block.
@return freed_page_clock */
@@ -1137,11 +1137,11 @@ struct buf_page_struct{
0 if the block was never accessed
in the buffer pool */
/* @} */
-# ifdef UNIV_DEBUG_FILE_ACCESSES
+# if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG
ibool file_page_was_freed;
/*!< this is set to TRUE when fsp
frees a page in buffer pool */
-# endif /* UNIV_DEBUG_FILE_ACCESSES */
+# endif /* UNIV_DEBUG_FILE_ACCESSES || UNIV_DEBUG */
#endif /* !UNIV_HOTBACKUP */
};
diff --git a/storage/innodb_plugin/include/buf0buf.ic b/storage/innodb_plugin/include/buf0buf.ic
index 23db684806c..0025bef5aac 100644
--- a/storage/innodb_plugin/include/buf0buf.ic
+++ b/storage/innodb_plugin/include/buf0buf.ic
@@ -719,14 +719,12 @@ Allocates a buffer block.
@return own: the allocated block, in state BUF_BLOCK_MEMORY */
UNIV_INLINE
buf_block_t*
-buf_block_alloc(
-/*============*/
- ulint zip_size) /*!< in: compressed page size in bytes,
- or 0 if uncompressed tablespace */
+buf_block_alloc(void)
+/*=================*/
{
buf_block_t* block;
- block = buf_LRU_get_free_block(zip_size);
+ block = buf_LRU_get_free_block();
buf_block_set_state(block, BUF_BLOCK_MEMORY);
diff --git a/storage/innodb_plugin/include/buf0lru.h b/storage/innodb_plugin/include/buf0lru.h
index 5a9cfd059f3..d543bce53cd 100644
--- a/storage/innodb_plugin/include/buf0lru.h
+++ b/storage/innodb_plugin/include/buf0lru.h
@@ -110,12 +110,9 @@ enum buf_lru_free_block_status
buf_LRU_free_block(
/*===============*/
buf_page_t* bpage, /*!< in: block to be freed */
- ibool zip, /*!< in: TRUE if should remove also the
+ ibool zip) /*!< in: TRUE if should remove also the
compressed page of an uncompressed page */
- ibool* buf_pool_mutex_released);
- /*!< in: pointer to a variable that will
- be assigned TRUE if buf_pool_mutex
- was temporarily released, or NULL */
+ __attribute__((nonnull));
/******************************************************************//**
Try to free a replaceable block.
@return TRUE if found and freed */
@@ -146,10 +143,9 @@ LRU list to the free list.
@return the free control block, in state BUF_BLOCK_READY_FOR_USE */
UNIV_INTERN
buf_block_t*
-buf_LRU_get_free_block(
-/*===================*/
- ulint zip_size); /*!< in: compressed page size in bytes,
- or 0 if uncompressed tablespace */
+buf_LRU_get_free_block(void)
+/*========================*/
+ __attribute__((warn_unused_result));
/******************************************************************//**
Puts a block back to the free list. */
diff --git a/storage/innodb_plugin/include/dict0mem.h b/storage/innodb_plugin/include/dict0mem.h
index 19782c2e76a..bd32a239cfd 100644
--- a/storage/innodb_plugin/include/dict0mem.h
+++ b/storage/innodb_plugin/include/dict0mem.h
@@ -321,6 +321,12 @@ struct dict_index_struct{
dict_get_n_unique(index); we
periodically calculate new
estimates */
+ ib_int64_t* stat_n_non_null_key_vals;
+ /* approximate number of non-null key values
+ for this index, for each column where
+ n < dict_get_n_unique(index); This
+ is used when innodb_stats_method is
+ "nulls_ignored". */
ulint stat_index_size;
/*!< approximate index size in
database pages */
@@ -334,6 +340,13 @@ struct dict_index_struct{
index, or 0 if the index existed
when InnoDB was started up */
#endif /* !UNIV_HOTBACKUP */
+#ifdef UNIV_BLOB_DEBUG
+ mutex_t blobs_mutex;
+ /*!< mutex protecting blobs */
+ void* blobs; /*!< map of (page_no,heap_no,field_no)
+ to first_blob_page_no; protected by
+ blobs_mutex; @see btr_blob_dbg_t */
+#endif /* UNIV_BLOB_DEBUG */
#ifdef UNIV_DEBUG
ulint magic_n;/*!< magic number */
/** Value of dict_index_struct::magic_n */
diff --git a/storage/innodb_plugin/include/dict0types.h b/storage/innodb_plugin/include/dict0types.h
index 7ad69193cc9..f14b59a19d4 100644
--- a/storage/innodb_plugin/include/dict0types.h
+++ b/storage/innodb_plugin/include/dict0types.h
@@ -33,11 +33,6 @@ typedef struct dict_index_struct dict_index_t;
typedef struct dict_table_struct dict_table_t;
typedef struct dict_foreign_struct dict_foreign_t;
-/* A cluster object is a table object with the type field set to
-DICT_CLUSTERED */
-
-typedef dict_table_t dict_cluster_t;
-
typedef struct ind_node_struct ind_node_t;
typedef struct tab_node_struct tab_node_t;
diff --git a/storage/innodb_plugin/include/page0zip.h b/storage/innodb_plugin/include/page0zip.h
index 574809e5227..00c1d0516e6 100644
--- a/storage/innodb_plugin/include/page0zip.h
+++ b/storage/innodb_plugin/include/page0zip.h
@@ -420,7 +420,7 @@ page_zip_copy_recs(
const page_t* src, /*!< in: page */
dict_index_t* index, /*!< in: index of the B-tree */
mtr_t* mtr) /*!< in: mini-transaction */
- __attribute__((nonnull(1,2,3,4)));
+ __attribute__((nonnull));
#endif /* !UNIV_HOTBACKUP */
/**********************************************************************//**
diff --git a/storage/innodb_plugin/include/rem0cmp.h b/storage/innodb_plugin/include/rem0cmp.h
index 2f751a38864..a908521c9f7 100644
--- a/storage/innodb_plugin/include/rem0cmp.h
+++ b/storage/innodb_plugin/include/rem0cmp.h
@@ -165,6 +165,10 @@ cmp_rec_rec_with_match(
const ulint* offsets1,/*!< in: rec_get_offsets(rec1, index) */
const ulint* offsets2,/*!< in: rec_get_offsets(rec2, index) */
dict_index_t* index, /*!< in: data dictionary index */
+ ibool nulls_unequal,
+ /* in: TRUE if this is for index statistics
+ cardinality estimation, and innodb_stats_method
+ is "nulls_unequal" or "nulls_ignored" */
ulint* matched_fields, /*!< in/out: number of already completely
matched fields; when the function returns,
contains the value the for current
diff --git a/storage/innodb_plugin/include/rem0cmp.ic b/storage/innodb_plugin/include/rem0cmp.ic
index 39ef5f4fba3..63415fe7837 100644
--- a/storage/innodb_plugin/include/rem0cmp.ic
+++ b/storage/innodb_plugin/include/rem0cmp.ic
@@ -87,5 +87,5 @@ cmp_rec_rec(
ulint match_b = 0;
return(cmp_rec_rec_with_match(rec1, rec2, offsets1, offsets2, index,
- &match_f, &match_b));
+ FALSE, &match_f, &match_b));
}
diff --git a/storage/innodb_plugin/include/row0upd.h b/storage/innodb_plugin/include/row0upd.h
index b61e6b6dca1..97b7ec49a17 100644
--- a/storage/innodb_plugin/include/row0upd.h
+++ b/storage/innodb_plugin/include/row0upd.h
@@ -280,19 +280,29 @@ NOTE: we compare the fields as binary strings!
@return TRUE if update vector changes an ordering field in the index record */
UNIV_INTERN
ibool
-row_upd_changes_ord_field_binary(
-/*=============================*/
+row_upd_changes_ord_field_binary_func(
+/*==================================*/
+ dict_index_t* index, /*!< in: index of the record */
+ const upd_t* update, /*!< in: update vector for the row; NOTE: the
+ field numbers in this MUST be clustered index
+ positions! */
+#ifdef UNIV_DEBUG
+ const que_thr_t*thr, /*!< in: query thread */
+#endif /* UNIV_DEBUG */
const dtuple_t* row, /*!< in: old value of row, or NULL if the
row and the data values in update are not
known when this function is called, e.g., at
compile time */
- const row_ext_t*ext, /*!< NULL, or prefixes of the externally
+ const row_ext_t*ext) /*!< NULL, or prefixes of the externally
stored columns in the old row */
- dict_index_t* index, /*!< in: index of the record */
- const upd_t* update) /*!< in: update vector for the row; NOTE: the
- field numbers in this MUST be clustered index
- positions! */
- __attribute__((nonnull(3,4), warn_unused_result));
+ __attribute__((nonnull(1,2), warn_unused_result));
+#ifdef UNIV_DEBUG
+# define row_upd_changes_ord_field_binary(index,update,thr,row,ext) \
+ row_upd_changes_ord_field_binary_func(index,update,thr,row,ext)
+#else /* UNIV_DEBUG */
+# define row_upd_changes_ord_field_binary(index,update,thr,row,ext) \
+ row_upd_changes_ord_field_binary_func(index,update,row,ext)
+#endif /* UNIV_DEBUG */
/***********************************************************//**
Checks if an update vector changes an ordering field of an index record.
This function is fast if the update vector is short or the number of ordering
diff --git a/storage/innodb_plugin/include/srv0srv.h b/storage/innodb_plugin/include/srv0srv.h
index 7aa2ce74720..91ae895040c 100644
--- a/storage/innodb_plugin/include/srv0srv.h
+++ b/storage/innodb_plugin/include/srv0srv.h
@@ -154,6 +154,11 @@ capacity. PCT_IO(5) -> returns the number of IO operations that
is 5% of the max where max is srv_io_capacity. */
#define PCT_IO(p) ((ulong) (srv_io_capacity * ((double) p / 100.0)))
+/* The "innodb_stats_method" setting, decides how InnoDB is going
+to treat NULL value when collecting statistics. It is not defined
+as enum type because the configure option takes unsigned integer type. */
+extern ulong srv_innodb_stats_method;
+
#ifdef UNIV_LOG_ARCHIVE
extern ibool srv_log_archive_on;
extern ibool srv_archive_recovery;
@@ -363,6 +368,19 @@ enum {
in connection with recovery */
};
+/* Alternatives for srv_innodb_stats_method, which could be changed by
+setting innodb_stats_method */
+enum srv_stats_method_name_enum {
+ SRV_STATS_NULLS_EQUAL, /* All NULL values are treated as
+ equal. This is the default setting
+ for innodb_stats_method */
+ SRV_STATS_NULLS_UNEQUAL, /* All NULL values are treated as
+ NOT equal. */
+ SRV_STATS_NULLS_IGNORED /* NULL values are ignored */
+};
+
+typedef enum srv_stats_method_name_enum srv_stats_method_name_t;
+
#ifndef UNIV_HOTBACKUP
/** Types of threads existing in the system. */
enum srv_thread_type {
diff --git a/storage/innodb_plugin/include/sync0arr.h b/storage/innodb_plugin/include/sync0arr.h
index 5f1280f5e28..6e931346238 100644
--- a/storage/innodb_plugin/include/sync0arr.h
+++ b/storage/innodb_plugin/include/sync0arr.h
@@ -115,8 +115,11 @@ Prints warnings of long semaphore waits to stderr.
@return TRUE if fatal semaphore wait threshold was exceeded */
UNIV_INTERN
ibool
-sync_array_print_long_waits(void);
-/*=============================*/
+sync_array_print_long_waits(
+/*========================*/
+ os_thread_id_t* waiter, /*!< out: longest waiting thread */
+ const void** sema) /*!< out: longest-waited-for semaphore */
+ __attribute__((nonnull));
/********************************************************************//**
Validates the integrity of the wait array. Checks
that the number of reserved cells equals the count variable. */
diff --git a/storage/innodb_plugin/include/sync0rw.h b/storage/innodb_plugin/include/sync0rw.h
index 175f3deb77c..47f7dbfe0eb 100644
--- a/storage/innodb_plugin/include/sync0rw.h
+++ b/storage/innodb_plugin/include/sync0rw.h
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2008, Google Inc.
Portions of this file contain modifications contributed and copyrighted by
@@ -490,6 +490,7 @@ UNIV_INTERN
void
rw_lock_debug_print(
/*================*/
+ FILE* f, /*!< in: output stream */
rw_lock_debug_t* info); /*!< in: debug struct */
#endif /* UNIV_SYNC_DEBUG */
diff --git a/storage/innodb_plugin/include/trx0rseg.h b/storage/innodb_plugin/include/trx0rseg.h
index a25d84f1e84..e3674089735 100644
--- a/storage/innodb_plugin/include/trx0rseg.h
+++ b/storage/innodb_plugin/include/trx0rseg.h
@@ -135,9 +135,7 @@ struct trx_rseg_struct{
ulint id; /*!< rollback segment id == the index of
its slot in the trx system file copy */
mutex_t mutex; /*!< mutex protecting the fields in this
- struct except id; NOTE that the latching
- order must always be kernel mutex ->
- rseg mutex */
+ struct except id, which is constant */
ulint space; /*!< space where the rollback segment is
header is placed */
ulint zip_size;/* compressed page size of space
diff --git a/storage/innodb_plugin/include/trx0trx.h b/storage/innodb_plugin/include/trx0trx.h
index abd175d365b..833bae4a4ff 100644
--- a/storage/innodb_plugin/include/trx0trx.h
+++ b/storage/innodb_plugin/include/trx0trx.h
@@ -214,12 +214,12 @@ trx_recover_for_mysql(
/*******************************************************************//**
This function is used to find one X/Open XA distributed transaction
which is in the prepared state
-@return trx or NULL */
+@return trx or NULL; on match, the trx->xid will be invalidated */
UNIV_INTERN
trx_t *
trx_get_trx_by_xid(
/*===============*/
- XID* xid); /*!< in: X/Open XA transaction identification */
+ const XID* xid); /*!< in: X/Open XA transaction identifier */
/**********************************************************************//**
If required, flushes the log to disk if we called trx_commit_for_mysql()
with trx->flush_log_later == TRUE.
diff --git a/storage/innodb_plugin/include/univ.i b/storage/innodb_plugin/include/univ.i
index b721e0406a8..319db93d137 100644
--- a/storage/innodb_plugin/include/univ.i
+++ b/storage/innodb_plugin/include/univ.i
@@ -46,7 +46,7 @@ Created 1/20/1994 Heikki Tuuri
#define INNODB_VERSION_MAJOR 1
#define INNODB_VERSION_MINOR 0
-#define INNODB_VERSION_BUGFIX 15
+#define INNODB_VERSION_BUGFIX 16
/* The following is the InnoDB version as shown in
SELECT plugin_version FROM information_schema.plugins;
@@ -177,14 +177,15 @@ command. Not tested on Windows. */
debugging without UNIV_DEBUG */
#define UNIV_BUF_DEBUG /* Enable buffer pool
debugging without UNIV_DEBUG */
+#define UNIV_BLOB_LIGHT_DEBUG /* Enable off-page column
+ debugging without UNIV_DEBUG */
#define UNIV_DEBUG /* Enable ut_ad() assertions
and disable UNIV_INLINE */
#define UNIV_DEBUG_LOCK_VALIDATE /* Enable
ut_ad(lock_rec_validate_page())
assertions. */
-#define UNIV_DEBUG_FILE_ACCESSES /* Debug .ibd file access
- (field file_page_was_freed
- in buf_page_t) */
+#define UNIV_DEBUG_FILE_ACCESSES /* Enable freed block access
+ debugging without UNIV_DEBUG */
#define UNIV_LRU_DEBUG /* debug the buffer pool LRU */
#define UNIV_HASH_DEBUG /* debug HASH_ macros */
#define UNIV_LIST_DEBUG /* debug UT_LIST_ macros */
@@ -193,6 +194,8 @@ this will break redo log file compatibility, but it may be useful when
debugging redo log application problems. */
#define UNIV_MEM_DEBUG /* detect memory leaks etc */
#define UNIV_IBUF_DEBUG /* debug the insert buffer */
+#define UNIV_BLOB_DEBUG /* track BLOB ownership;
+assumes that no BLOBs survive server restart */
#define UNIV_IBUF_COUNT_DEBUG /* debug the insert buffer;
this limits the database to IBUF_COUNT_N_SPACES and IBUF_COUNT_N_PAGES,
and the insert buffer must be empty when the database is started */
@@ -424,7 +427,7 @@ it is read or written. */
/* Use sun_prefetch when compile with Sun Studio */
# define UNIV_EXPECT(expr,value) (expr)
# define UNIV_LIKELY_NULL(expr) (expr)
-# define UNIV_PREFETCH_R(addr) sun_prefetch_read_many(addr)
+# define UNIV_PREFETCH_R(addr) sun_prefetch_read_many((void*) addr)
# define UNIV_PREFETCH_RW(addr) sun_prefetch_write_many(addr)
#else
/* Dummy versions of the macros */
diff --git a/storage/innodb_plugin/mem/mem0mem.c b/storage/innodb_plugin/mem/mem0mem.c
index 1dd4db30841..86100b04fd6 100644
--- a/storage/innodb_plugin/mem/mem0mem.c
+++ b/storage/innodb_plugin/mem/mem0mem.c
@@ -347,7 +347,7 @@ mem_heap_create_block(
return(NULL);
}
} else {
- buf_block = buf_block_alloc(0);
+ buf_block = buf_block_alloc();
}
block = (mem_block_t*) buf_block->frame;
diff --git a/storage/innodb_plugin/mtr/mtr0log.c b/storage/innodb_plugin/mtr/mtr0log.c
index 3f3dab36b76..3349036b5b3 100644
--- a/storage/innodb_plugin/mtr/mtr0log.c
+++ b/storage/innodb_plugin/mtr/mtr0log.c
@@ -408,7 +408,7 @@ mlog_parse_string(
ptr += 2;
if (UNIV_UNLIKELY(offset >= UNIV_PAGE_SIZE)
- || UNIV_UNLIKELY(len + offset) > UNIV_PAGE_SIZE) {
+ || UNIV_UNLIKELY(len + offset > UNIV_PAGE_SIZE)) {
recv_sys->found_corrupt_log = TRUE;
return(NULL);
diff --git a/storage/innodb_plugin/page/page0cur.c b/storage/innodb_plugin/page/page0cur.c
index f10f16a7dd9..936762b986a 100644
--- a/storage/innodb_plugin/page/page0cur.c
+++ b/storage/innodb_plugin/page/page0cur.c
@@ -1149,6 +1149,8 @@ use_heap:
current_rec, index, mtr);
}
+ btr_blob_dbg_add_rec(insert_rec, index, offsets, "insert");
+
return(insert_rec);
}
@@ -1195,10 +1197,12 @@ page_cur_insert_rec_zip_reorg(
}
/* Out of space: restore the page */
+ btr_blob_dbg_remove(page, index, "insert_zip_fail");
if (!page_zip_decompress(page_zip, page, FALSE)) {
ut_error; /* Memory corrupted? */
}
ut_ad(page_validate(page, index));
+ btr_blob_dbg_add(page, index, "insert_zip_fail");
return(NULL);
}
@@ -1490,6 +1494,8 @@ use_heap:
page_zip_write_rec(page_zip, insert_rec, index, offsets, 1);
+ btr_blob_dbg_add_rec(insert_rec, index, offsets, "insert_zip_ok");
+
/* 9. Write log record of the insert */
if (UNIV_LIKELY(mtr != NULL)) {
page_cur_insert_rec_write_log(insert_rec, rec_size,
@@ -1697,6 +1703,9 @@ page_copy_rec_list_end_to_created_page(
heap_top += rec_size;
+ rec_offs_make_valid(insert_rec, index, offsets);
+ btr_blob_dbg_add_rec(insert_rec, index, offsets, "copy_end");
+
page_cur_insert_rec_write_log(insert_rec, rec_size, prev_rec,
index, mtr);
prev_rec = insert_rec;
@@ -1944,6 +1953,7 @@ page_cur_delete_rec(
page_dir_slot_set_n_owned(cur_dir_slot, page_zip, cur_n_owned - 1);
/* 6. Free the memory occupied by the record */
+ btr_blob_dbg_remove_rec(current_rec, index, offsets, "delete");
page_mem_free(page, page_zip, current_rec, index, offsets);
/* 7. Now we have decremented the number of owned records of the slot.
diff --git a/storage/innodb_plugin/page/page0page.c b/storage/innodb_plugin/page/page0page.c
index 10008f9ac25..6cae03e8829 100644
--- a/storage/innodb_plugin/page/page0page.c
+++ b/storage/innodb_plugin/page/page0page.c
@@ -685,12 +685,16 @@ page_copy_rec_list_end(
if (UNIV_UNLIKELY
(!page_zip_reorganize(new_block, index, mtr))) {
+ btr_blob_dbg_remove(new_page, index,
+ "copy_end_reorg_fail");
if (UNIV_UNLIKELY
(!page_zip_decompress(new_page_zip,
new_page, FALSE))) {
ut_error;
}
ut_ad(page_validate(new_page, index));
+ btr_blob_dbg_add(new_page, index,
+ "copy_end_reorg_fail");
return(NULL);
} else {
/* The page was reorganized:
@@ -803,12 +807,16 @@ page_copy_rec_list_start(
if (UNIV_UNLIKELY
(!page_zip_reorganize(new_block, index, mtr))) {
+ btr_blob_dbg_remove(new_page, index,
+ "copy_start_reorg_fail");
if (UNIV_UNLIKELY
(!page_zip_decompress(new_page_zip,
new_page, FALSE))) {
ut_error;
}
ut_ad(page_validate(new_page, index));
+ btr_blob_dbg_add(new_page, index,
+ "copy_start_reorg_fail");
return(NULL);
} else {
/* The page was reorganized:
@@ -1080,6 +1088,9 @@ page_delete_rec_list_end(
/* Remove the record chain segment from the record chain */
page_rec_set_next(prev_rec, page_get_supremum_rec(page));
+ btr_blob_dbg_op(page, rec, index, "delete_end",
+ btr_blob_dbg_remove_rec);
+
/* Catenate the deleted chain segment to the page free list */
page_rec_set_next(last_rec, page_header_get_ptr(page, PAGE_FREE));
diff --git a/storage/innodb_plugin/page/page0zip.c b/storage/innodb_plugin/page/page0zip.c
index d3b1edefc6b..6e866b3f016 100644
--- a/storage/innodb_plugin/page/page0zip.c
+++ b/storage/innodb_plugin/page/page0zip.c
@@ -653,13 +653,13 @@ page_zip_dir_encode(
Allocate memory for zlib. */
static
void*
-page_zip_malloc(
+page_zip_zalloc(
/*============*/
void* opaque, /*!< in/out: memory heap */
uInt items, /*!< in: number of items to allocate */
uInt size) /*!< in: size of an item in bytes */
{
- return(mem_heap_alloc(opaque, items * size));
+ return(mem_heap_zalloc(opaque, items * size));
}
/**********************************************************************//**
@@ -684,7 +684,7 @@ page_zip_set_alloc(
{
z_stream* strm = stream;
- strm->zalloc = page_zip_malloc;
+ strm->zalloc = page_zip_zalloc;
strm->zfree = page_zip_free;
strm->opaque = heap;
}
@@ -2912,19 +2912,18 @@ zlib_error:
page_zip_set_alloc(&d_stream, heap);
- if (UNIV_UNLIKELY(inflateInit2(&d_stream, UNIV_PAGE_SIZE_SHIFT)
- != Z_OK)) {
- ut_error;
- }
-
d_stream.next_in = page_zip->data + PAGE_DATA;
/* Subtract the space reserved for
the page header and the end marker of the modification log. */
d_stream.avail_in = page_zip_get_size(page_zip) - (PAGE_DATA + 1);
-
d_stream.next_out = page + PAGE_ZIP_START;
d_stream.avail_out = UNIV_PAGE_SIZE - PAGE_ZIP_START;
+ if (UNIV_UNLIKELY(inflateInit2(&d_stream, UNIV_PAGE_SIZE_SHIFT)
+ != Z_OK)) {
+ ut_error;
+ }
+
/* Decode the zlib header and the index information. */
if (UNIV_UNLIKELY(inflate(&d_stream, Z_BLOCK) != Z_OK)) {
@@ -4439,7 +4438,7 @@ page_zip_reorganize(
log_mode = mtr_set_log_mode(mtr, MTR_LOG_NONE);
#ifndef UNIV_HOTBACKUP
- temp_block = buf_block_alloc(0);
+ temp_block = buf_block_alloc();
btr_search_drop_page_hash_index(block);
block->check_index_page_at_flush = TRUE;
#else /* !UNIV_HOTBACKUP */
@@ -4451,6 +4450,8 @@ page_zip_reorganize(
/* Copy the old page to temporary space */
buf_frame_copy(temp_page, page);
+ btr_blob_dbg_remove(page, index, "zip_reorg");
+
/* Recreate the page: note that global data on page (possible
segment headers, next page-field, etc.) is preserved intact */
@@ -4509,7 +4510,7 @@ page_zip_copy_recs(
mtr_t* mtr) /*!< in: mini-transaction */
{
ut_ad(mtr_memo_contains_page(mtr, page, MTR_MEMO_PAGE_X_FIX));
- ut_ad(mtr_memo_contains_page(mtr, (page_t*) src, MTR_MEMO_PAGE_X_FIX));
+ ut_ad(mtr_memo_contains_page(mtr, src, MTR_MEMO_PAGE_X_FIX));
ut_ad(!dict_index_is_ibuf(index));
#ifdef UNIV_ZIP_DEBUG
/* The B-tree operations that call this function may set
@@ -4579,6 +4580,7 @@ page_zip_copy_recs(
#ifdef UNIV_ZIP_DEBUG
ut_a(page_zip_validate(page_zip, page));
#endif /* UNIV_ZIP_DEBUG */
+ btr_blob_dbg_add(page, index, "page_zip_copy_recs");
page_zip_compress_write_log(page_zip, page, index, mtr);
}
diff --git a/storage/innodb_plugin/rem/rem0cmp.c b/storage/innodb_plugin/rem/rem0cmp.c
index 35b67992558..04d2c15437b 100644
--- a/storage/innodb_plugin/rem/rem0cmp.c
+++ b/storage/innodb_plugin/rem/rem0cmp.c
@@ -862,6 +862,10 @@ cmp_rec_rec_with_match(
const ulint* offsets1,/*!< in: rec_get_offsets(rec1, index) */
const ulint* offsets2,/*!< in: rec_get_offsets(rec2, index) */
dict_index_t* index, /*!< in: data dictionary index */
+ ibool nulls_unequal,
+ /* in: TRUE if this is for index statistics
+ cardinality estimation, and innodb_stats_method
+ is "nulls_unequal" or "nulls_ignored" */
ulint* matched_fields, /*!< in/out: number of already completely
matched fields; when the function returns,
contains the value the for current
@@ -961,9 +965,13 @@ cmp_rec_rec_with_match(
|| rec2_f_len == UNIV_SQL_NULL) {
if (rec1_f_len == rec2_f_len) {
-
- goto next_field;
-
+ /* This is limited to stats collection,
+ cannot use it for regular search */
+ if (nulls_unequal) {
+ ret = -1;
+ } else {
+ goto next_field;
+ }
} else if (rec2_f_len == UNIV_SQL_NULL) {
/* We define the SQL null to be the
diff --git a/storage/innodb_plugin/row/row0ins.c b/storage/innodb_plugin/row/row0ins.c
index 298c601c7e3..8050c099751 100644
--- a/storage/innodb_plugin/row/row0ins.c
+++ b/storage/innodb_plugin/row/row0ins.c
@@ -2130,7 +2130,7 @@ function_exit:
err = btr_store_big_rec_extern_fields(
index, btr_cur_get_block(&cursor),
- rec, offsets, big_rec, &mtr);
+ rec, offsets, &mtr, FALSE, big_rec);
if (modify) {
dtuple_big_rec_free(big_rec);
diff --git a/storage/innodb_plugin/row/row0purge.c b/storage/innodb_plugin/row/row0purge.c
index 8bf2ae0f458..752a2ec9e83 100644
--- a/storage/innodb_plugin/row/row0purge.c
+++ b/storage/innodb_plugin/row/row0purge.c
@@ -387,8 +387,11 @@ Purges an update of an existing record. Also purges an update of a delete
marked record if that record contained an externally stored field. */
static
void
-row_purge_upd_exist_or_extern(
-/*==========================*/
+row_purge_upd_exist_or_extern_func(
+/*===============================*/
+#ifdef UNIV_DEBUG
+ const que_thr_t*thr, /*!< in: query thread */
+#endif /* UNIV_DEBUG */
purge_node_t* node) /*!< in: row purge node */
{
mem_heap_t* heap;
@@ -413,8 +416,8 @@ row_purge_upd_exist_or_extern(
while (node->index != NULL) {
index = node->index;
- if (row_upd_changes_ord_field_binary(NULL, NULL, node->index,
- node->update)) {
+ if (row_upd_changes_ord_field_binary(node->index, node->update,
+ thr, NULL, NULL)) {
/* Build the older version of the index entry */
entry = row_build_index_entry(node->row, NULL,
index, heap);
@@ -496,6 +499,14 @@ skip_secondaries:
}
}
+#ifdef UNIV_DEBUG
+# define row_purge_upd_exist_or_extern(thr,node) \
+ row_purge_upd_exist_or_extern_func(thr,node)
+#else /* UNIV_DEBUG */
+# define row_purge_upd_exist_or_extern(thr,node) \
+ row_purge_upd_exist_or_extern_func(node)
+#endif /* UNIV_DEBUG */
+
/***********************************************************//**
Parses the row reference and other info in a modify undo log record.
@return TRUE if purge operation required: NOTE that then the CALLER
@@ -602,47 +613,32 @@ err_exit:
/***********************************************************//**
Fetches an undo log record and does the purge for the recorded operation.
If none left, or the current purge completed, returns the control to the
-parent node, which is always a query thread node.
-@return DB_SUCCESS if operation successfully completed, else error code */
-static
-ulint
+parent node, which is always a query thread node. */
+static __attribute__((nonnull))
+void
row_purge(
/*======*/
purge_node_t* node, /*!< in: row purge node */
que_thr_t* thr) /*!< in: query thread */
{
- roll_ptr_t roll_ptr;
- ibool purge_needed;
ibool updated_extern;
- trx_t* trx;
- ut_ad(node && thr);
-
- trx = thr_get_trx(thr);
+ ut_ad(node);
+ ut_ad(thr);
- node->undo_rec = trx_purge_fetch_next_rec(&roll_ptr,
- &(node->reservation),
+ node->undo_rec = trx_purge_fetch_next_rec(&node->roll_ptr,
+ &node->reservation,
node->heap);
if (!node->undo_rec) {
/* Purge completed for this query thread */
thr->run_node = que_node_get_parent(node);
- return(DB_SUCCESS);
- }
-
- node->roll_ptr = roll_ptr;
-
- if (node->undo_rec == &trx_purge_dummy_rec) {
- purge_needed = FALSE;
- } else {
- purge_needed = row_purge_parse_undo_rec(node, &updated_extern,
- thr);
- /* If purge_needed == TRUE, we must also remember to unfreeze
- data dictionary! */
+ return;
}
- if (purge_needed) {
+ if (node->undo_rec != &trx_purge_dummy_rec
+ && row_purge_parse_undo_rec(node, &updated_extern, thr)) {
node->found_clust = FALSE;
node->index = dict_table_get_next_index(
@@ -654,14 +650,14 @@ row_purge(
} else if (updated_extern
|| node->rec_type == TRX_UNDO_UPD_EXIST_REC) {
- row_purge_upd_exist_or_extern(node);
+ row_purge_upd_exist_or_extern(thr, node);
}
if (node->found_clust) {
btr_pcur_close(&(node->pcur));
}
- row_mysql_unfreeze_data_dictionary(trx);
+ row_mysql_unfreeze_data_dictionary(thr_get_trx(thr));
}
/* Do some cleanup */
@@ -669,8 +665,6 @@ row_purge(
mem_heap_empty(node->heap);
thr->run_node = node;
-
- return(DB_SUCCESS);
}
/***********************************************************//**
@@ -684,9 +678,6 @@ row_purge_step(
que_thr_t* thr) /*!< in: query thread */
{
purge_node_t* node;
-#ifdef UNIV_DEBUG
- ulint err;
-#endif /* UNIV_DEBUG */
ut_ad(thr);
@@ -694,12 +685,7 @@ row_purge_step(
ut_ad(que_node_get_type(node) == QUE_NODE_PURGE);
-#ifdef UNIV_DEBUG
- err =
-#endif /* UNIV_DEBUG */
row_purge(node, thr);
- ut_ad(err == DB_SUCCESS);
-
return(thr);
}
diff --git a/storage/innodb_plugin/row/row0umod.c b/storage/innodb_plugin/row/row0umod.c
index 562f8093c38..5202a498eed 100644
--- a/storage/innodb_plugin/row/row0umod.c
+++ b/storage/innodb_plugin/row/row0umod.c
@@ -173,40 +173,26 @@ row_undo_mod_remove_clust_low(
mtr_t* mtr, /*!< in: mtr */
ulint mode) /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE */
{
- btr_pcur_t* pcur;
btr_cur_t* btr_cur;
ulint err;
- ibool success;
ut_ad(node->rec_type == TRX_UNDO_UPD_DEL_REC);
- pcur = &(node->pcur);
- btr_cur = btr_pcur_get_btr_cur(pcur);
- success = btr_pcur_restore_position(mode, pcur, mtr);
+ /* Find out if the record has been purged already
+ or if we can remove it. */
- if (!success) {
+ if (!btr_pcur_restore_position(mode, &node->pcur, mtr)
+ || row_vers_must_preserve_del_marked(node->new_trx_id, mtr)) {
return(DB_SUCCESS);
}
- /* Find out if we can remove the whole clustered index record */
-
- if (node->rec_type == TRX_UNDO_UPD_DEL_REC
- && !row_vers_must_preserve_del_marked(node->new_trx_id, mtr)) {
-
- /* Ok, we can remove */
- } else {
- return(DB_SUCCESS);
- }
+ btr_cur = btr_pcur_get_btr_cur(&node->pcur);
if (mode == BTR_MODIFY_LEAF) {
- success = btr_cur_optimistic_delete(btr_cur, mtr);
-
- if (success) {
- err = DB_SUCCESS;
- } else {
- err = DB_FAIL;
- }
+ err = btr_cur_optimistic_delete(btr_cur, mtr)
+ ? DB_SUCCESS
+ : DB_FAIL;
} else {
ut_ad(mode == BTR_MODIFY_TREE);
@@ -668,8 +654,9 @@ row_undo_mod_upd_exist_sec(
while (node->index != NULL) {
index = node->index;
- if (row_upd_changes_ord_field_binary(
- node->row, node->ext, node->index, node->update)) {
+ if (row_upd_changes_ord_field_binary(node->index, node->update,
+ thr,
+ node->row, node->ext)) {
/* Build the newest version of the index entry */
entry = row_build_index_entry(node->row, node->ext,
diff --git a/storage/innodb_plugin/row/row0upd.c b/storage/innodb_plugin/row/row0upd.c
index e1c78949603..13134afd1aa 100644
--- a/storage/innodb_plugin/row/row0upd.c
+++ b/storage/innodb_plugin/row/row0upd.c
@@ -498,14 +498,49 @@ row_upd_rec_in_place(
n_fields = upd_get_n_fields(update);
for (i = 0; i < n_fields; i++) {
+#ifdef UNIV_BLOB_DEBUG
+ btr_blob_dbg_t b;
+ const byte* field_ref = NULL;
+#endif /* UNIV_BLOB_DEBUG */
+
upd_field = upd_get_nth_field(update, i);
new_val = &(upd_field->new_val);
ut_ad(!dfield_is_ext(new_val) ==
!rec_offs_nth_extern(offsets, upd_field->field_no));
+#ifdef UNIV_BLOB_DEBUG
+ if (dfield_is_ext(new_val)) {
+ ulint len;
+ field_ref = rec_get_nth_field(rec, offsets, i, &len);
+ ut_a(len != UNIV_SQL_NULL);
+ ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE);
+ field_ref += len - BTR_EXTERN_FIELD_REF_SIZE;
+
+ b.ref_page_no = page_get_page_no(page_align(rec));
+ b.ref_heap_no = page_rec_get_heap_no(rec);
+ b.ref_field_no = i;
+ b.blob_page_no = mach_read_from_4(
+ field_ref + BTR_EXTERN_PAGE_NO);
+ ut_a(b.ref_field_no >= index->n_uniq);
+ btr_blob_dbg_rbt_delete(index, &b, "upd_in_place");
+ }
+#endif /* UNIV_BLOB_DEBUG */
rec_set_nth_field(rec, offsets, upd_field->field_no,
dfield_get_data(new_val),
dfield_get_len(new_val));
+
+#ifdef UNIV_BLOB_DEBUG
+ if (dfield_is_ext(new_val)) {
+ b.blob_page_no = mach_read_from_4(
+ field_ref + BTR_EXTERN_PAGE_NO);
+ b.always_owner = b.owner = !(field_ref[BTR_EXTERN_LEN]
+ & BTR_EXTERN_OWNER_FLAG);
+ b.del = rec_get_deleted_flag(
+ rec, rec_offs_comp(offsets));
+
+ btr_blob_dbg_rbt_insert(index, &b, "upd_in_place");
+ }
+#endif /* UNIV_BLOB_DEBUG */
}
if (UNIV_LIKELY_NULL(page_zip)) {
@@ -1192,25 +1227,31 @@ NOTE: we compare the fields as binary strings!
@return TRUE if update vector changes an ordering field in the index record */
UNIV_INTERN
ibool
-row_upd_changes_ord_field_binary(
-/*=============================*/
+row_upd_changes_ord_field_binary_func(
+/*==================================*/
+ dict_index_t* index, /*!< in: index of the record */
+ const upd_t* update, /*!< in: update vector for the row; NOTE: the
+ field numbers in this MUST be clustered index
+ positions! */
+#ifdef UNIV_DEBUG
+ const que_thr_t*thr, /*!< in: query thread */
+#endif /* UNIV_DEBUG */
const dtuple_t* row, /*!< in: old value of row, or NULL if the
row and the data values in update are not
known when this function is called, e.g., at
compile time */
- const row_ext_t*ext, /*!< NULL, or prefixes of the externally
+ const row_ext_t*ext) /*!< NULL, or prefixes of the externally
stored columns in the old row */
- dict_index_t* index, /*!< in: index of the record */
- const upd_t* update) /*!< in: update vector for the row; NOTE: the
- field numbers in this MUST be clustered index
- positions! */
{
ulint n_unique;
ulint i;
const dict_index_t* clust_index;
- ut_ad(update);
ut_ad(index);
+ ut_ad(update);
+ ut_ad(thr);
+ ut_ad(thr->graph);
+ ut_ad(thr->graph->trx);
n_unique = dict_index_get_n_unique(index);
@@ -1252,6 +1293,10 @@ row_upd_changes_ord_field_binary(
|| dfield_is_null(dfield)) {
/* do nothing special */
} else if (UNIV_LIKELY_NULL(ext)) {
+ /* Silence a compiler warning without
+ silencing a Valgrind error. */
+ dfield_len = 0;
+ UNIV_MEM_INVALID(&dfield_len, sizeof dfield_len);
/* See if the column is stored externally. */
buf = row_ext_lookup(ext, col_no, &dfield_len);
@@ -1259,9 +1304,14 @@ row_upd_changes_ord_field_binary(
if (UNIV_LIKELY_NULL(buf)) {
if (UNIV_UNLIKELY(buf == field_ref_zero)) {
- /* This should never happen, but
- we try to fail safe here. */
- ut_ad(0);
+ /* The externally stored field
+ was not written yet. This
+ record should only be seen by
+ recv_recovery_rollback_active(),
+ when the server had crashed before
+ storing the field. */
+ ut_ad(thr->graph->trx->is_recovered);
+ ut_ad(trx_is_recv(thr->graph->trx));
return(TRUE);
}
@@ -1608,8 +1658,8 @@ row_upd_sec_step(
ut_ad(!dict_index_is_clust(node->index));
if (node->state == UPD_NODE_UPDATE_ALL_SEC
- || row_upd_changes_ord_field_binary(node->row, node->ext,
- node->index, node->update)) {
+ || row_upd_changes_ord_field_binary(node->index, node->update,
+ thr, node->row, node->ext)) {
return(row_upd_sec_index_entry(node, thr));
}
@@ -1937,7 +1987,7 @@ row_upd_clust_rec(
index, btr_cur_get_block(btr_cur), rec,
rec_get_offsets(rec, index, offsets_,
ULINT_UNDEFINED, &heap),
- big_rec, mtr);
+ mtr, TRUE, big_rec);
mtr_commit(mtr);
}
@@ -2136,8 +2186,8 @@ exit_func:
row_upd_store_row(node);
- if (row_upd_changes_ord_field_binary(node->row, node->ext, index,
- node->update)) {
+ if (row_upd_changes_ord_field_binary(index, node->update, thr,
+ node->row, node->ext)) {
/* Update causes an ordering field (ordering fields within
the B-tree) of the clustered index record to change: perform
diff --git a/storage/innodb_plugin/row/row0vers.c b/storage/innodb_plugin/row/row0vers.c
index b6d35363f08..d4fde0b939b 100644
--- a/storage/innodb_plugin/row/row0vers.c
+++ b/storage/innodb_plugin/row/row0vers.c
@@ -669,11 +669,15 @@ row_vers_build_for_semi_consistent_read(
mutex_enter(&kernel_mutex);
version_trx = trx_get_on_id(version_trx_id);
+ if (version_trx
+ && (version_trx->conc_state == TRX_COMMITTED_IN_MEMORY
+ || version_trx->conc_state == TRX_NOT_STARTED)) {
+
+ version_trx = NULL;
+ }
mutex_exit(&kernel_mutex);
- if (!version_trx
- || version_trx->conc_state == TRX_NOT_STARTED
- || version_trx->conc_state == TRX_COMMITTED_IN_MEMORY) {
+ if (!version_trx) {
/* We found a version that belongs to a
committed transaction: return it. */
diff --git a/storage/innodb_plugin/srv/srv0srv.c b/storage/innodb_plugin/srv/srv0srv.c
index f7e7e351bdc..b1fc1ac67fd 100644
--- a/storage/innodb_plugin/srv/srv0srv.c
+++ b/storage/innodb_plugin/srv/srv0srv.c
@@ -243,6 +243,11 @@ UNIV_INTERN ulong srv_max_buf_pool_modified_pct = 75;
/* variable counts amount of data read in total (in bytes) */
UNIV_INTERN ulint srv_data_read = 0;
+/* Internal setting for "innodb_stats_method". Decides how InnoDB treats
+NULL value when collecting statistics. By default, it is set to
+SRV_STATS_NULLS_EQUAL(0), ie. all NULL value are treated equal */
+ulong srv_innodb_stats_method = SRV_STATS_NULLS_EQUAL;
+
/* here we count the amount of data written in total (in bytes) */
UNIV_INTERN ulint srv_data_written = 0;
@@ -2231,6 +2236,12 @@ srv_error_monitor_thread(
ulint fatal_cnt = 0;
ib_uint64_t old_lsn;
ib_uint64_t new_lsn;
+ /* longest waiting thread for a semaphore */
+ os_thread_id_t waiter = os_thread_get_curr_id();
+ os_thread_id_t old_waiter = waiter;
+ /* the semaphore that is being waited for */
+ const void* sema = NULL;
+ const void* old_sema = NULL;
old_lsn = srv_start_lsn;
@@ -2279,7 +2290,8 @@ loop:
sync_arr_wake_threads_if_sema_free();
- if (sync_array_print_long_waits()) {
+ if (sync_array_print_long_waits(&waiter, &sema)
+ && sema == old_sema && os_thread_eq(waiter, old_waiter)) {
fatal_cnt++;
if (fatal_cnt > 10) {
@@ -2294,6 +2306,8 @@ loop:
}
} else {
fatal_cnt = 0;
+ old_waiter = waiter;
+ old_sema = sema;
}
/* Flush stderr so that a database user gets the output
diff --git a/storage/innodb_plugin/srv/srv0start.c b/storage/innodb_plugin/srv/srv0start.c
index 73f8f319704..f8b5049ca65 100644
--- a/storage/innodb_plugin/srv/srv0start.c
+++ b/storage/innodb_plugin/srv/srv0start.c
@@ -1061,6 +1061,12 @@ innobase_start_or_create_for_mysql(void)
);
#endif
+#ifdef UNIV_BLOB_DEBUG
+ fprintf(stderr,
+ "InnoDB: !!!!!!!! UNIV_BLOB_DEBUG switched on !!!!!!!!!\n"
+ "InnoDB: Server restart may fail with UNIV_BLOB_DEBUG\n");
+#endif /* UNIV_BLOB_DEBUG */
+
#ifdef UNIV_SYNC_DEBUG
fprintf(stderr,
"InnoDB: !!!!!!!! UNIV_SYNC_DEBUG switched on !!!!!!!!!\n");
diff --git a/storage/innodb_plugin/sync/sync0arr.c b/storage/innodb_plugin/sync/sync0arr.c
index 3c825e2202b..13970023573 100644
--- a/storage/innodb_plugin/sync/sync0arr.c
+++ b/storage/innodb_plugin/sync/sync0arr.c
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2008, Google Inc.
Portions of this file contain modifications contributed and copyrighted by
@@ -715,7 +715,7 @@ print:
fprintf(stderr, "rw-lock %p ",
(void*) lock);
sync_array_cell_print(stderr, cell);
- rw_lock_debug_print(debug);
+ rw_lock_debug_print(stderr, debug);
return(TRUE);
}
}
@@ -914,8 +914,10 @@ Prints warnings of long semaphore waits to stderr.
@return TRUE if fatal semaphore wait threshold was exceeded */
UNIV_INTERN
ibool
-sync_array_print_long_waits(void)
-/*=============================*/
+sync_array_print_long_waits(
+/*========================*/
+ os_thread_id_t* waiter, /*!< out: longest waiting thread */
+ const void** sema) /*!< out: longest-waited-for semaphore */
{
sync_cell_t* cell;
ibool old_val;
@@ -923,24 +925,40 @@ sync_array_print_long_waits(void)
ulint i;
ulint fatal_timeout = srv_fatal_semaphore_wait_threshold;
ibool fatal = FALSE;
+ double longest_diff = 0;
for (i = 0; i < sync_primary_wait_array->n_cells; i++) {
+ double diff;
+ void* wait_object;
+
cell = sync_array_get_nth_cell(sync_primary_wait_array, i);
- if (cell->wait_object != NULL && cell->waiting
- && difftime(time(NULL), cell->reservation_time) > 240) {
+ wait_object = cell->wait_object;
+
+ if (wait_object == NULL || !cell->waiting) {
+
+ continue;
+ }
+
+ diff = difftime(time(NULL), cell->reservation_time);
+
+ if (diff > 240) {
fputs("InnoDB: Warning: a long semaphore wait:\n",
stderr);
sync_array_cell_print(stderr, cell);
noticed = TRUE;
}
- if (cell->wait_object != NULL && cell->waiting
- && difftime(time(NULL), cell->reservation_time)
- > fatal_timeout) {
+ if (diff > fatal_timeout) {
fatal = TRUE;
}
+
+ if (diff > longest_diff) {
+ longest_diff = diff;
+ *sema = wait_object;
+ *waiter = cell->thread;
+ }
}
if (noticed) {
diff --git a/storage/innodb_plugin/sync/sync0rw.c b/storage/innodb_plugin/sync/sync0rw.c
index 572c3690a7f..a5da606ad80 100644
--- a/storage/innodb_plugin/sync/sync0rw.c
+++ b/storage/innodb_plugin/sync/sync0rw.c
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2008, Google Inc.
Portions of this file contain modifications contributed and copyrighted by
@@ -260,6 +260,9 @@ rw_lock_create_func(
contains garbage at initialization and cannot be used for
recursive x-locking. */
lock->recursive = FALSE;
+ /* Silence Valgrind when UNIV_DEBUG_VALGRIND is not enabled. */
+ memset((void*) &lock->writer_thread, 0, sizeof lock->writer_thread);
+ UNIV_MEM_INVALID(&lock->writer_thread, sizeof lock->writer_thread);
#ifdef UNIV_SYNC_DEBUG
UT_LIST_INIT(lock->debug_list);
@@ -925,7 +928,7 @@ rw_lock_list_print_info(
info = UT_LIST_GET_FIRST(lock->debug_list);
while (info != NULL) {
- rw_lock_debug_print(info);
+ rw_lock_debug_print(file, info);
info = UT_LIST_GET_NEXT(list, info);
}
}
@@ -973,7 +976,7 @@ rw_lock_print(
info = UT_LIST_GET_FIRST(lock->debug_list);
while (info != NULL) {
- rw_lock_debug_print(info);
+ rw_lock_debug_print(stderr, info);
info = UT_LIST_GET_NEXT(list, info);
}
}
@@ -985,28 +988,29 @@ UNIV_INTERN
void
rw_lock_debug_print(
/*================*/
+ FILE* f, /*!< in: output stream */
rw_lock_debug_t* info) /*!< in: debug struct */
{
ulint rwt;
rwt = info->lock_type;
- fprintf(stderr, "Locked: thread %lu file %s line %lu ",
+ fprintf(f, "Locked: thread %lu file %s line %lu ",
(ulong) os_thread_pf(info->thread_id), info->file_name,
(ulong) info->line);
if (rwt == RW_LOCK_SHARED) {
- fputs("S-LOCK", stderr);
+ fputs("S-LOCK", f);
} else if (rwt == RW_LOCK_EX) {
- fputs("X-LOCK", stderr);
+ fputs("X-LOCK", f);
} else if (rwt == RW_LOCK_WAIT_EX) {
- fputs("WAIT X-LOCK", stderr);
+ fputs("WAIT X-LOCK", f);
} else {
ut_error;
}
if (info->pass != 0) {
- fprintf(stderr, " pass value %lu", (ulong) info->pass);
+ fprintf(f, " pass value %lu", (ulong) info->pass);
}
- putc('\n', stderr);
+ putc('\n', f);
}
/***************************************************************//**
diff --git a/storage/innodb_plugin/trx/trx0i_s.c b/storage/innodb_plugin/trx/trx0i_s.c
index e148234888b..5cc9df2d5c4 100644
--- a/storage/innodb_plugin/trx/trx0i_s.c
+++ b/storage/innodb_plugin/trx/trx0i_s.c
@@ -504,7 +504,7 @@ fill_trx_row(
query[stmt_len] = '\0';
row->trx_query = ha_storage_put_memlim(
- cache->storage, stmt, stmt_len + 1,
+ cache->storage, query, stmt_len + 1,
MAX_ALLOWED_FOR_STORAGE(cache));
row->trx_query_cs = innobase_get_charset(trx->mysql_thd);
diff --git a/storage/innodb_plugin/trx/trx0roll.c b/storage/innodb_plugin/trx/trx0roll.c
index 1a43e419214..a4bbf7fd652 100644
--- a/storage/innodb_plugin/trx/trx0roll.c
+++ b/storage/innodb_plugin/trx/trx0roll.c
@@ -48,8 +48,8 @@ Created 3/26/1996 Heikki Tuuri
rollback */
#define TRX_ROLL_TRUNC_THRESHOLD 1
-/** In crash recovery, the current trx to be rolled back */
-static trx_t* trx_roll_crash_recv_trx = NULL;
+/** In crash recovery, the current trx to be rolled back; NULL otherwise */
+static const trx_t* trx_roll_crash_recv_trx = NULL;
/** In crash recovery we set this to the undo n:o of the current trx to be
rolled back. Then we can print how many % the rollback has progressed. */
diff --git a/storage/innodb_plugin/trx/trx0trx.c b/storage/innodb_plugin/trx/trx0trx.c
index ee744fd58b1..f0bbf220815 100644
--- a/storage/innodb_plugin/trx/trx0trx.c
+++ b/storage/innodb_plugin/trx/trx0trx.c
@@ -2010,18 +2010,18 @@ trx_recover_for_mysql(
/*******************************************************************//**
This function is used to find one X/Open XA distributed transaction
which is in the prepared state
-@return trx or NULL */
+@return trx or NULL; on match, the trx->xid will be invalidated */
UNIV_INTERN
trx_t*
trx_get_trx_by_xid(
/*===============*/
- XID* xid) /*!< in: X/Open XA transaction identification */
+ const XID* xid) /*!< in: X/Open XA transaction identifier */
{
trx_t* trx;
if (xid == NULL) {
- return (NULL);
+ return(NULL);
}
mutex_enter(&kernel_mutex);
@@ -2034,10 +2034,16 @@ trx_get_trx_by_xid(
of gtrid_lenght+bqual_length bytes should be
the same */
- if (xid->gtrid_length == trx->xid.gtrid_length
+ if (trx->conc_state == TRX_PREPARED
+ && xid->gtrid_length == trx->xid.gtrid_length
&& xid->bqual_length == trx->xid.bqual_length
&& memcmp(xid->data, trx->xid.data,
xid->gtrid_length + xid->bqual_length) == 0) {
+
+ /* Invalidate the XID, so that subsequent calls
+ will not find it. */
+ memset(&trx->xid, 0, sizeof(trx->xid));
+ trx->xid.formatID = -1;
break;
}
@@ -2046,14 +2052,5 @@ trx_get_trx_by_xid(
mutex_exit(&kernel_mutex);
- if (trx) {
- if (trx->conc_state != TRX_PREPARED) {
-
- return(NULL);
- }
-
- return(trx);
- } else {
- return(NULL);
- }
+ return(trx);
}
diff --git a/storage/maria/ha_maria.cc b/storage/maria/ha_maria.cc
index 8da234111b2..5661e1f06d7 100644
--- a/storage/maria/ha_maria.cc
+++ b/storage/maria/ha_maria.cc
@@ -789,9 +789,10 @@ can_enable_indexes(1), bulk_insert_single_undo(BULK_INSERT_NONE)
{}
-handler *ha_maria::clone(MEM_ROOT *mem_root)
+handler *ha_maria::clone(const char *name, MEM_ROOT *mem_root)
{
- ha_maria *new_handler= static_cast <ha_maria *>(handler::clone(mem_root));
+ ha_maria *new_handler= static_cast <ha_maria *>(handler::clone(name,
+ mem_root));
if (new_handler)
{
new_handler->file->state= file->state;
diff --git a/storage/maria/ha_maria.h b/storage/maria/ha_maria.h
index 6ca1f6ca797..6171f89cb18 100644
--- a/storage/maria/ha_maria.h
+++ b/storage/maria/ha_maria.h
@@ -51,7 +51,7 @@ class ha_maria :public handler
public:
ha_maria(handlerton *hton, TABLE_SHARE * table_arg);
~ha_maria() {}
- handler *clone(MEM_ROOT *mem_root);
+ handler *clone(const char *name, MEM_ROOT *mem_root);
const char *table_type() const
{ return "MARIA"; }
const char *index_type(uint key_number);
diff --git a/storage/maria/ma_init.c b/storage/maria/ma_init.c
index 9e5b600c972..8f695acfe5a 100644
--- a/storage/maria/ma_init.c
+++ b/storage/maria/ma_init.c
@@ -21,6 +21,7 @@
#include "trnman_public.h"
#include "ma_checkpoint.h"
#include <hash.h>
+#include <my_handler.h>
void history_state_free(MARIA_STATE_HISTORY_CLOSED *closed_history)
{
diff --git a/storage/myisam/ft_stopwords.c b/storage/myisam/ft_stopwords.c
index 05948a936a6..5bfbf7e6c61 100644
--- a/storage/myisam/ft_stopwords.c
+++ b/storage/myisam/ft_stopwords.c
@@ -16,7 +16,7 @@
/* Written by Sergei A. Golubchik, who has a shared copyright to this code */
#include "ftdefs.h"
-#include "my_handler.h"
+#include "my_compare.h"
typedef struct st_ft_stopwords
{
diff --git a/storage/myisam/ha_myisam.cc b/storage/myisam/ha_myisam.cc
index f8f9340ba3a..f736a99df42 100644
--- a/storage/myisam/ha_myisam.cc
+++ b/storage/myisam/ha_myisam.cc
@@ -20,12 +20,8 @@
#define MYSQL_SERVER 1
#include "mysql_priv.h"
-#include <mysql/plugin.h>
-#include <m_ctype.h>
#include <my_bit.h>
-#include <myisampack.h>
#include "ha_myisam.h"
-#include <stdarg.h>
#include "myisamdef.h"
#include "rt_index.h"
@@ -552,9 +548,10 @@ ha_myisam::ha_myisam(handlerton *hton, TABLE_SHARE *table_arg)
can_enable_indexes(1)
{}
-handler *ha_myisam::clone(MEM_ROOT *mem_root)
+handler *ha_myisam::clone(const char *name, MEM_ROOT *mem_root)
{
- ha_myisam *new_handler= static_cast <ha_myisam *>(handler::clone(mem_root));
+ ha_myisam *new_handler= static_cast <ha_myisam *>(handler::clone(name,
+ mem_root));
if (new_handler)
new_handler->file->state= file->state;
return new_handler;
diff --git a/storage/myisam/ha_myisam.h b/storage/myisam/ha_myisam.h
index 0e53bc58c81..7ca9c191bdf 100644
--- a/storage/myisam/ha_myisam.h
+++ b/storage/myisam/ha_myisam.h
@@ -21,7 +21,6 @@
/* class for the the myisam handler */
#include <myisam.h>
-#include <myisamchk.h>
#include <ft_global.h>
#define HA_RECOVER_NONE 0 /* No automatic recover */
@@ -46,7 +45,7 @@ class ha_myisam: public handler
public:
ha_myisam(handlerton *hton, TABLE_SHARE *table_arg);
~ha_myisam() {}
- handler *clone(MEM_ROOT *mem_root);
+ handler *clone(const char *name, MEM_ROOT *mem_root);
const char *table_type() const { return "MyISAM"; }
const char *index_type(uint key_number);
const char **bas_ext() const;
diff --git a/storage/myisam/mi_create.c b/storage/myisam/mi_create.c
index d5ce941bf75..f40f1bad710 100644
--- a/storage/myisam/mi_create.c
+++ b/storage/myisam/mi_create.c
@@ -848,8 +848,8 @@ int mi_create(const char *name,uint keys,MI_KEYDEF *keydefs,
err:
pthread_mutex_unlock(&THR_LOCK_myisam);
-err_no_lock:
+err_no_lock:
save_errno=my_errno;
switch (errpos) {
case 3:
diff --git a/storage/myisam/mi_test1.c b/storage/myisam/mi_test1.c
index a6e870f3cde..199334e8b1b 100644
--- a/storage/myisam/mi_test1.c
+++ b/storage/myisam/mi_test1.c
@@ -16,6 +16,7 @@
/* Testing of the basic functions of a MyISAM table */
#include "myisam.h"
+#include "myisamdef.h"
#include <my_getopt.h>
#include <m_string.h>
diff --git a/storage/myisam/mi_write.c b/storage/myisam/mi_write.c
index 86c4df77817..8229dfef9a5 100644
--- a/storage/myisam/mi_write.c
+++ b/storage/myisam/mi_write.c
@@ -17,6 +17,7 @@
#include "fulltext.h"
#include "rt_index.h"
+#include "my_compare.h"
#define MAX_POINTER_LENGTH 8
diff --git a/storage/myisam/myisamdef.h b/storage/myisam/myisamdef.h
index e03f88b1a1f..ee782cf2ab0 100644
--- a/storage/myisam/myisamdef.h
+++ b/storage/myisam/myisamdef.h
@@ -15,8 +15,8 @@
/* This file is included by all internal myisam files */
-#include "myisam.h" /* Structs & some defines */
-#include "myisampack.h" /* packing of keys */
+#include <myisam.h> /* Structs & some defines */
+#include <myisampack.h> /* packing of keys */
#include <my_tree.h>
#ifdef THREAD
#include <my_pthread.h>
diff --git a/storage/myisam/sp_test.c b/storage/myisam/sp_test.c
index f572c7ab19b..7a30a742fd6 100644
--- a/storage/myisam/sp_test.c
+++ b/storage/myisam/sp_test.c
@@ -17,6 +17,7 @@
/* Written by Alex Barkov, who has a shared copyright to this code */
#include "myisam.h"
+#include "myisamdef.h"
#ifdef HAVE_SPATIAL
#include "sp_defs.h"
diff --git a/storage/myisammrg/ha_myisammrg.cc b/storage/myisammrg/ha_myisammrg.cc
index 99513e2267f..e6fd3ed4939 100644
--- a/storage/myisammrg/ha_myisammrg.cc
+++ b/storage/myisammrg/ha_myisammrg.cc
@@ -459,8 +459,7 @@ int ha_myisammrg::open(const char *name, int mode __attribute__((unused)),
problem because all locking is handled by the original MERGE table
from which this is cloned of.
*/
- if (!(file= myrg_open(table->s->normalized_path.str, table->db_stat,
- HA_OPEN_IGNORE_IF_LOCKED)))
+ if (!(file= myrg_open(name, table->db_stat, HA_OPEN_IGNORE_IF_LOCKED)))
{
DBUG_PRINT("error", ("my_errno %d", my_errno));
DBUG_RETURN(my_errno ? my_errno : -1);
@@ -484,7 +483,7 @@ int ha_myisammrg::open(const char *name, int mode __attribute__((unused)),
@return A cloned handler instance.
*/
-handler *ha_myisammrg::clone(MEM_ROOT *mem_root)
+handler *ha_myisammrg::clone(const char *name, MEM_ROOT *mem_root)
{
MYRG_TABLE *u_table,*newu_table;
ha_myisammrg *new_handler=
@@ -505,8 +504,8 @@ handler *ha_myisammrg::clone(MEM_ROOT *mem_root)
return NULL;
}
- if (new_handler->ha_open(table, table->s->normalized_path.str, table->db_stat,
- HA_OPEN_IGNORE_IF_LOCKED))
+ if (new_handler->ha_open(table, name, table->db_stat,
+ HA_OPEN_IGNORE_IF_LOCKED))
{
delete new_handler;
return NULL;
diff --git a/storage/myisammrg/ha_myisammrg.h b/storage/myisammrg/ha_myisammrg.h
index 4a65a60d97b..aa8f8d3e2be 100644
--- a/storage/myisammrg/ha_myisammrg.h
+++ b/storage/myisammrg/ha_myisammrg.h
@@ -62,7 +62,7 @@ class ha_myisammrg: public handler
int open(const char *name, int mode, uint test_if_locked);
int attach_children(void);
int detach_children(void);
- virtual handler *clone(MEM_ROOT *mem_root);
+ virtual handler *clone(const char *name, MEM_ROOT *mem_root);
int close(void);
int write_row(uchar * buf);
int update_row(const uchar * old_data, uchar * new_data);