diff options
author | Michael Widenius <monty@askmonty.org> | 2011-05-02 20:58:45 +0300 |
---|---|---|
committer | Michael Widenius <monty@askmonty.org> | 2011-05-02 20:58:45 +0300 |
commit | e415ba0fb2c57eaf370e84a3a9c8d831f820a560 (patch) | |
tree | f113a8024de4ee4f1bc19aae98c19a2835f5b4e7 /storage | |
parent | 046418ad956c98c3788d79650fcb50479844df3b (diff) | |
parent | a1f7ceb281f9d87c9baea125ebab26f99a0370f8 (diff) | |
download | mariadb-git-e415ba0fb2c57eaf370e84a3a9c8d831f820a560.tar.gz |
Merge with MySQL 5.1.57/58
Moved some BSD string functions from Unireg
Diffstat (limited to 'storage')
83 files changed, 2168 insertions, 572 deletions
diff --git a/storage/heap/ha_heap.cc b/storage/heap/ha_heap.cc index fb7c13e4e41..9f29dee2030 100644 --- a/storage/heap/ha_heap.cc +++ b/storage/heap/ha_heap.cc @@ -142,11 +142,11 @@ int ha_heap::close(void) DESCRIPTION Do same as default implementation but use file->s->name instead of table->s->path. This is needed by Windows where the clone() call sees - '/'-delimited path in table->s->path, while ha_peap::open() was called + '/'-delimited path in table->s->path, while ha_heap::open() was called with '\'-delimited path. */ -handler *ha_heap::clone(MEM_ROOT *mem_root) +handler *ha_heap::clone(const char *name, MEM_ROOT *mem_root) { handler *new_handler= get_new_handler(table->s, mem_root, table->s->db_type()); if (new_handler && !new_handler->ha_open(table, file->s->name, table->db_stat, diff --git a/storage/heap/ha_heap.h b/storage/heap/ha_heap.h index 22722129f4c..69751101645 100644 --- a/storage/heap/ha_heap.h +++ b/storage/heap/ha_heap.h @@ -34,7 +34,7 @@ class ha_heap: public handler public: ha_heap(handlerton *hton, TABLE_SHARE *table); ~ha_heap() {} - handler *clone(MEM_ROOT *mem_root); + handler *clone(const char *name, MEM_ROOT *mem_root); const char *table_type() const { return (table->in_use->variables.sql_mode & MODE_MYSQL323) ? diff --git a/storage/innobase/btr/btr0cur.c b/storage/innobase/btr/btr0cur.c index a7160d74a32..6c0497cbd41 100644 --- a/storage/innobase/btr/btr0cur.c +++ b/storage/innobase/btr/btr0cur.c @@ -66,6 +66,13 @@ this many index pages */ /*--------------------------------------*/ #define BTR_BLOB_HDR_SIZE 8 +/* Estimated table level stats from sampled value. */ +#define BTR_TABLE_STATS_FROM_SAMPLE(value, index, ext_size, not_empty) \ + ((value * (ib_longlong) index->stat_n_leaf_pages \ + + BTR_KEY_VAL_ESTIMATE_N_PAGES - 1 + ext_size \ + + not_empty) \ + / (BTR_KEY_VAL_ESTIMATE_N_PAGES + ext_size)) + /*********************************************************************** Marks all extern fields in a record as owned by the record. This function should be called if the delete mark of a record is removed: a not delete @@ -2835,9 +2842,54 @@ btr_estimate_n_rows_in_range( } /*********************************************************************** +Record the number of non_null key values in a given index for +each n-column prefix of the index where n < dict_index_get_n_unique(index). +The estimates are eventually stored in the array: +index->stat_n_non_null_key_vals. */ +static +void +btr_record_not_null_field_in_rec( +/*=============================*/ + rec_t* rec, /* in: physical record */ + ulint n_unique, /* in: dict_index_get_n_unique(index), + number of columns uniquely determine + an index entry */ + const ulint* offsets, /* in: rec_get_offsets(rec, index), + its size could be for all fields or + that of "n_unique" */ + ib_longlong* n_not_null) /* in/out: array to record number of + not null rows for n-column prefix */ +{ + ulint i; + + ut_ad(rec_offs_n_fields(offsets) >= n_unique); + + if (n_not_null == NULL) { + return; + } + + for (i = 0; i < n_unique; i++) { + ulint rec_len; + byte* field; + + field = rec_get_nth_field(rec, offsets, i, &rec_len); + + if (rec_len != UNIV_SQL_NULL) { + n_not_null[i]++; + } else { + /* Break if we hit the first NULL value */ + break; + } + } +} + +/*********************************************************************** Estimates the number of different key values in a given index, for each n-column prefix of the index where n <= dict_index_get_n_unique(index). -The estimates are stored in the array index->stat_n_diff_key_vals. */ +The estimates are stored in the array index->stat_n_diff_key_vals. +If innodb_stats_method is "nulls_ignored", we also record the number of +non-null values for each prefix and store the estimates in +array index->stat_n_non_null_key_vals. */ void btr_estimate_number_of_different_key_vals( @@ -2851,6 +2903,8 @@ btr_estimate_number_of_different_key_vals( ulint matched_fields; ulint matched_bytes; ib_longlong* n_diff; + ib_longlong* n_not_null; + ibool stats_null_not_equal; ulint not_empty_flag = 0; ulint total_external_size = 0; ulint i; @@ -2858,24 +2912,47 @@ btr_estimate_number_of_different_key_vals( ulint add_on; mtr_t mtr; mem_heap_t* heap = NULL; - ulint offsets_rec_[REC_OFFS_NORMAL_SIZE]; - ulint offsets_next_rec_[REC_OFFS_NORMAL_SIZE]; - ulint* offsets_rec = offsets_rec_; - ulint* offsets_next_rec= offsets_next_rec_; - *offsets_rec_ = (sizeof offsets_rec_) / sizeof *offsets_rec_; - *offsets_next_rec_ - = (sizeof offsets_next_rec_) / sizeof *offsets_next_rec_; + ulint* offsets_rec = NULL; + ulint* offsets_next_rec = NULL; n_cols = dict_index_get_n_unique(index); - n_diff = mem_alloc((n_cols + 1) * sizeof(ib_longlong)); + heap = mem_heap_create((sizeof *n_diff + sizeof *n_not_null) + * (n_cols + 1) + + dict_index_get_n_fields(index) + * (sizeof *offsets_rec + + sizeof *offsets_next_rec)); + + n_diff = mem_heap_zalloc(heap, (n_cols + 1) * sizeof(ib_longlong)); + + n_not_null = NULL; + + /* Check srv_innodb_stats_method setting, and decide whether we + need to record non-null value and also decide if NULL is + considered equal (by setting stats_null_not_equal value) */ + switch (srv_innodb_stats_method) { + case SRV_STATS_NULLS_IGNORED: + n_not_null = mem_heap_zalloc(heap, (n_cols + 1) + * sizeof *n_not_null); + /* fall through */ + + case SRV_STATS_NULLS_UNEQUAL: + /* for both SRV_STATS_NULLS_IGNORED and SRV_STATS_NULLS_UNEQUAL + case, we will treat NULLs as unequal value */ + stats_null_not_equal = TRUE; + break; - memset(n_diff, 0, (n_cols + 1) * sizeof(ib_longlong)); + case SRV_STATS_NULLS_EQUAL: + stats_null_not_equal = FALSE; + break; + + default: + ut_error; + } /* We sample some pages in the index to get an estimate */ for (i = 0; i < BTR_KEY_VAL_ESTIMATE_N_PAGES; i++) { - rec_t* supremum; mtr_start(&mtr); btr_cur_open_at_rnd_pos(index, BTR_SEARCH_LEAF, &cursor, &mtr); @@ -2888,18 +2965,25 @@ btr_estimate_number_of_different_key_vals( page = btr_cur_get_page(&cursor); - supremum = page_get_supremum_rec(page); rec = page_rec_get_next(page_get_infimum_rec(page)); - if (rec != supremum) { + if (!page_rec_is_supremum(rec)) { not_empty_flag = 1; offsets_rec = rec_get_offsets(rec, index, offsets_rec, ULINT_UNDEFINED, &heap); + + if (n_not_null) { + btr_record_not_null_field_in_rec( + rec, n_cols, offsets_rec, n_not_null); + } } - while (rec != supremum) { + while (!page_rec_is_supremum(rec)) { rec_t* next_rec = page_rec_get_next(rec); - if (next_rec == supremum) { + if (page_rec_is_supremum(next_rec)) { + total_external_size += + btr_rec_get_externally_stored_len( + rec, offsets_rec); break; } @@ -2907,11 +2991,13 @@ btr_estimate_number_of_different_key_vals( matched_bytes = 0; offsets_next_rec = rec_get_offsets(next_rec, index, offsets_next_rec, - n_cols, &heap); + ULINT_UNDEFINED, + &heap); cmp_rec_rec_with_match(rec, next_rec, offsets_rec, offsets_next_rec, - index, &matched_fields, + index, stats_null_not_equal, + &matched_fields, &matched_bytes); for (j = matched_fields + 1; j <= n_cols; j++) { @@ -2921,6 +3007,12 @@ btr_estimate_number_of_different_key_vals( n_diff[j]++; } + if (n_not_null) { + btr_record_not_null_field_in_rec( + next_rec, n_cols, offsets_next_rec, + n_not_null); + } + total_external_size += btr_rec_get_externally_stored_len( rec, offsets_rec); @@ -2955,10 +3047,6 @@ btr_estimate_number_of_different_key_vals( } } - offsets_rec = rec_get_offsets(rec, index, offsets_rec, - ULINT_UNDEFINED, &heap); - total_external_size += btr_rec_get_externally_stored_len( - rec, offsets_rec); mtr_commit(&mtr); } @@ -2971,14 +3059,8 @@ btr_estimate_number_of_different_key_vals( included in index->stat_n_leaf_pages) */ for (j = 0; j <= n_cols; j++) { - index->stat_n_diff_key_vals[j] - = ((n_diff[j] - * (ib_longlong)index->stat_n_leaf_pages - + BTR_KEY_VAL_ESTIMATE_N_PAGES - 1 - + total_external_size - + not_empty_flag) - / (BTR_KEY_VAL_ESTIMATE_N_PAGES - + total_external_size)); + index->stat_n_diff_key_vals[j] = BTR_TABLE_STATS_FROM_SAMPLE( + n_diff[j], index, total_external_size, not_empty_flag); /* If the tree is small, smaller than 10 * BTR_KEY_VAL_ESTIMATE_N_PAGES + total_external_size, then @@ -2997,12 +3079,20 @@ btr_estimate_number_of_different_key_vals( } index->stat_n_diff_key_vals[j] += add_on; - } - mem_free(n_diff); - if (UNIV_LIKELY_NULL(heap)) { - mem_heap_free(heap); + /* Update the stat_n_non_null_key_vals[] with our + sampled result. stat_n_non_null_key_vals[] is created + and initialized to zero in dict_index_add_to_cache(), + along with stat_n_diff_key_vals[] array */ + if (n_not_null != NULL && (j < n_cols)) { + index->stat_n_non_null_key_vals[j] = + BTR_TABLE_STATS_FROM_SAMPLE( + n_not_null[j], index, + total_external_size, not_empty_flag); + } } + + mem_heap_free(heap); } /*================== EXTERNAL STORAGE OF BIG FIELDS ===================*/ diff --git a/storage/innobase/dict/dict0dict.c b/storage/innobase/dict/dict0dict.c index fda6555e082..beea0a2f411 100644 --- a/storage/innobase/dict/dict0dict.c +++ b/storage/innobase/dict/dict0dict.c @@ -1358,6 +1358,12 @@ dict_index_add_to_cache( new_index->heap, (1 + dict_index_get_n_unique(new_index)) * sizeof(ib_longlong)); + + new_index->stat_n_non_null_key_vals = mem_heap_zalloc( + new_index->heap, + (1 + dict_index_get_n_unique(new_index)) + * sizeof(*new_index->stat_n_non_null_key_vals)); + /* Give some sensible values to stat_n_... in case we do not calculate statistics quickly enough */ @@ -3817,6 +3823,10 @@ dict_update_statistics_low( for (i = dict_index_get_n_unique(index); i; ) { index->stat_n_diff_key_vals[i--] = 1; } + + memset(index->stat_n_non_null_key_vals, 0, + (1 + dict_index_get_n_unique(index)) + * sizeof(*index->stat_n_non_null_key_vals)); } index = dict_table_get_next_index(index); diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc index 4c52326a58a..6f58fd70fbd 100644 --- a/storage/innobase/handler/ha_innodb.cc +++ b/storage/innobase/handler/ha_innodb.cc @@ -130,6 +130,25 @@ static my_bool innobase_adaptive_hash_index = TRUE; static char* internal_innobase_data_file_path = NULL; +/* Possible values for system variable "innodb_stats_method". The values +are defined the same as its corresponding MyISAM system variable +"myisam_stats_method"(see "myisam_stats_method_names"), for better usability */ +static const char* innodb_stats_method_names[] = { + "nulls_equal", + "nulls_unequal", + "nulls_ignored", + NullS +}; + +/* Used to define an enumerate type of the system variable innodb_stats_method. +This is the same as "myisam_stats_method_typelib" */ +static TYPELIB innodb_stats_method_typelib = { + array_elements(innodb_stats_method_names) - 1, + "innodb_stats_method_typelib", + innodb_stats_method_names, + NULL +}; + /* The following counter is used to convey information to InnoDB about server activity: in selects it is not sensible to call srv_active_wake_master_thread after each fetch or search, we only do @@ -6363,6 +6382,65 @@ ha_innobase::read_time( } /************************************************************************* +Calculate Record Per Key value. Need to exclude the NULL value if +innodb_stats_method is set to "nulls_ignored" */ +static +ha_rows +innodb_rec_per_key( +/*===============*/ + /* out: estimated record per key + value */ + dict_index_t* index, /* in: dict_index_t structure */ + ulint i, /* in: the column we are + calculating rec per key */ + ha_rows records) /* in: estimated total records */ +{ + ha_rows rec_per_key; + + ut_ad(i < dict_index_get_n_unique(index)); + + /* Note the stat_n_diff_key_vals[] stores the diff value with + n-prefix indexing, so it is always stat_n_diff_key_vals[i + 1] */ + if (index->stat_n_diff_key_vals[i + 1] == 0) { + + rec_per_key = records; + } else if (srv_innodb_stats_method == SRV_STATS_NULLS_IGNORED) { + ib_longlong num_null; + + /* Number of rows with NULL value in this + field */ + num_null = records - index->stat_n_non_null_key_vals[i]; + + /* In theory, index->stat_n_non_null_key_vals[i] + should always be less than the number of records. + Since this is statistics value, the value could + have slight discrepancy. But we will make sure + the number of null values is not a negative number. */ + num_null = (num_null < 0) ? 0 : num_null; + + /* If the number of NULL values is the same as or + large than that of the distinct values, we could + consider that the table consists mostly of NULL value. + Set rec_per_key to 1. */ + if (index->stat_n_diff_key_vals[i + 1] <= num_null) { + rec_per_key = 1; + } else { + /* Need to exclude rows with NULL values from + rec_per_key calculation */ + rec_per_key = (ha_rows)( + (records - num_null) + / (index->stat_n_diff_key_vals[i + 1] + - num_null)); + } + } else { + rec_per_key = (ha_rows) + (records / index->stat_n_diff_key_vals[i + 1]); + } + + return(rec_per_key); +} + +/************************************************************************* Returns statistics information of the table to the MySQL interpreter, in various fields of the handle object. */ @@ -6568,13 +6646,8 @@ ha_innobase::info_low( break; } - if (index->stat_n_diff_key_vals[j + 1] == 0) { - - rec_per_key = stats.records; - } else { - rec_per_key = (ha_rows)(stats.records / - index->stat_n_diff_key_vals[j + 1]); - } + rec_per_key = innodb_rec_per_key( + index, j, stats.records); /* Since MySQL seems to favor table scans too much over index searches, we pretend @@ -8990,6 +9063,13 @@ static MYSQL_SYSVAR_LONG(autoinc_lock_mode, innobase_autoinc_lock_mode, AUTOINC_OLD_STYLE_LOCKING, /* Minimum value */ AUTOINC_NO_LOCKING, 0); /* Maximum value */ +static MYSQL_SYSVAR_ENUM(stats_method, srv_innodb_stats_method, + PLUGIN_VAR_RQCMDARG, + "Specifies how InnoDB index statistics collection code should " + "treat NULLs. Possible values are NULLS_EQUAL (default), " + "NULLS_UNEQUAL and NULLS_IGNORED", + NULL, NULL, SRV_STATS_NULLS_EQUAL, &innodb_stats_method_typelib); + #if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG static MYSQL_SYSVAR_UINT(change_buffering_debug, ibuf_debug, PLUGIN_VAR_RQCMDARG, @@ -9031,6 +9111,7 @@ static struct st_mysql_sys_var* innobase_system_variables[]= { MYSQL_SYSVAR(stats_on_metadata), MYSQL_SYSVAR(use_legacy_cardinality_algorithm), MYSQL_SYSVAR(adaptive_hash_index), + MYSQL_SYSVAR(stats_method), MYSQL_SYSVAR(status_file), MYSQL_SYSVAR(support_xa), MYSQL_SYSVAR(sync_spin_loops), diff --git a/storage/innobase/include/btr0cur.h b/storage/innobase/include/btr0cur.h index 213dcb7f568..20235c55f22 100644 --- a/storage/innobase/include/btr0cur.h +++ b/storage/innobase/include/btr0cur.h @@ -404,7 +404,10 @@ btr_estimate_n_rows_in_range( /*********************************************************************** Estimates the number of different key values in a given index, for each n-column prefix of the index where n <= dict_index_get_n_unique(index). -The estimates are stored in the array index->stat_n_diff_key_vals. */ +The estimates are stored in the array index->stat_n_diff_key_vals. +If innodb_stats_method is nulls_ignored, we also record the number of +non-null values for each prefix and stored the estimates in +array index->stat_n_non_null_key_vals. */ void btr_estimate_number_of_different_key_vals( diff --git a/storage/innobase/include/dict0mem.h b/storage/innobase/include/dict0mem.h index 2f2a7441478..83dbf65ea41 100644 --- a/storage/innobase/include/dict0mem.h +++ b/storage/innobase/include/dict0mem.h @@ -222,6 +222,12 @@ struct dict_index_struct{ for this index, for each n-column prefix where n <= dict_get_n_unique(index); we periodically calculate new estimates */ + ib_longlong* stat_n_non_null_key_vals; + /* approximate number of non-null key values + for this index, for each column where + n < dict_get_n_unique(index); This + is used when innodb_stats_method is + "nulls_ignored". */ ulint stat_index_size; /* approximate index size in database pages */ ulint stat_n_leaf_pages; diff --git a/storage/innobase/include/dict0types.h b/storage/innobase/include/dict0types.h index b90545f2105..6674b5ff397 100644 --- a/storage/innobase/include/dict0types.h +++ b/storage/innobase/include/dict0types.h @@ -16,11 +16,6 @@ typedef struct dict_index_struct dict_index_t; typedef struct dict_table_struct dict_table_t; typedef struct dict_foreign_struct dict_foreign_t; -/* A cluster object is a table object with the type field set to -DICT_CLUSTERED */ - -typedef dict_table_t dict_cluster_t; - typedef struct ind_node_struct ind_node_t; typedef struct tab_node_struct tab_node_t; diff --git a/storage/innobase/include/rem0cmp.h b/storage/innobase/include/rem0cmp.h index c6a6e5de4db..22a22d13e17 100644 --- a/storage/innobase/include/rem0cmp.h +++ b/storage/innobase/include/rem0cmp.h @@ -141,6 +141,10 @@ cmp_rec_rec_with_match( const ulint* offsets1,/* in: rec_get_offsets(rec1, index) */ const ulint* offsets2,/* in: rec_get_offsets(rec2, index) */ dict_index_t* index, /* in: data dictionary index */ + ibool nulls_unequal, + /* in: TRUE if this is for index statistics + cardinality estimation, and innodb_stats_method + is "nulls_unequal" or "nulls_ignored" */ ulint* matched_fields, /* in/out: number of already completely matched fields; when the function returns, contains the value the for current diff --git a/storage/innobase/include/rem0cmp.ic b/storage/innobase/include/rem0cmp.ic index 52dc7ff5dc9..45e12301a3c 100644 --- a/storage/innobase/include/rem0cmp.ic +++ b/storage/innobase/include/rem0cmp.ic @@ -72,5 +72,5 @@ cmp_rec_rec( ulint match_b = 0; return(cmp_rec_rec_with_match(rec1, rec2, offsets1, offsets2, index, - &match_f, &match_b)); + FALSE, &match_f, &match_b)); } diff --git a/storage/innobase/include/srv0srv.h b/storage/innobase/include/srv0srv.h index 3dd4bb961f9..811074b2be8 100644 --- a/storage/innobase/include/srv0srv.h +++ b/storage/innobase/include/srv0srv.h @@ -91,6 +91,11 @@ extern ulint srv_lock_table_size; extern ulint srv_n_file_io_threads; +/* The "innodb_stats_method" setting, decides how InnoDB is going +to treat NULL value when collecting statistics. It is not defined +as enum type because the configure option takes unsigned integer type. */ +extern ulong srv_innodb_stats_method; + #ifdef UNIV_LOG_ARCHIVE extern ibool srv_log_archive_on; extern ibool srv_archive_recovery; @@ -286,6 +291,19 @@ of lower numbers are included. */ #define SRV_FORCE_NO_LOG_REDO 6 /* do not do the log roll-forward in connection with recovery */ +/* Alternatives for srv_innodb_stats_method, which could be changed by +setting innodb_stats_method */ +enum srv_stats_method_name_enum { + SRV_STATS_NULLS_EQUAL, /* All NULL values are treated as + equal. This is the default setting + for innodb_stats_method */ + SRV_STATS_NULLS_UNEQUAL, /* All NULL values are treated as + NOT equal. */ + SRV_STATS_NULLS_IGNORED /* NULL values are ignored */ +}; + +typedef enum srv_stats_method_name_enum srv_stats_method_name_t; + /************************************************************************* Boots Innobase server. */ diff --git a/storage/innobase/include/sync0arr.h b/storage/innobase/include/sync0arr.h index fae26b7a63e..ec48059dbcb 100644 --- a/storage/innobase/include/sync0arr.h +++ b/storage/innobase/include/sync0arr.h @@ -93,10 +93,13 @@ sync_arr_wake_threads_if_sema_free(void); Prints warnings of long semaphore waits to stderr. */ ibool -sync_array_print_long_waits(void); -/*=============================*/ - /* out: TRUE if fatal semaphore wait threshold - was exceeded */ +sync_array_print_long_waits( +/*========================*/ + /* out: TRUE if fatal semaphore wait threshold + was exceeded */ + os_thread_id_t* waiter, /* out: longest waiting thread */ + const void** sema) /* out: longest-waited-for semaphore */ + __attribute__((nonnull)); /************************************************************************ Validates the integrity of the wait array. Checks that the number of reserved cells equals the count variable. */ diff --git a/storage/innobase/include/sync0rw.h b/storage/innobase/include/sync0rw.h index 008df80a2c7..dd898557d6e 100644 --- a/storage/innobase/include/sync0rw.h +++ b/storage/innobase/include/sync0rw.h @@ -1,7 +1,7 @@ /****************************************************** The read-write lock (for threads, not for database transactions) -(c) 1995 Innobase Oy +Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved. Created 9/11/1995 Heikki Tuuri *******************************************************/ @@ -409,6 +409,7 @@ Prints info of a debug struct. */ void rw_lock_debug_print( /*================*/ + FILE* f, /* in: output stream */ rw_lock_debug_t* info); /* in: debug struct */ #endif /* UNIV_SYNC_DEBUG */ diff --git a/storage/innobase/include/trx0rseg.h b/storage/innobase/include/trx0rseg.h index 46ba010bd1d..22f8aa89181 100644 --- a/storage/innobase/include/trx0rseg.h +++ b/storage/innobase/include/trx0rseg.h @@ -121,9 +121,7 @@ struct trx_rseg_struct{ ulint id; /* rollback segment id == the index of its slot in the trx system file copy */ mutex_t mutex; /* mutex protecting the fields in this - struct except id; NOTE that the latching - order must always be kernel mutex -> - rseg mutex */ + struct except id, which is constant */ ulint space; /* space where the rollback segment is header is placed */ ulint page_no;/* page number of the rollback segment diff --git a/storage/innobase/include/trx0trx.h b/storage/innobase/include/trx0trx.h index 97a47d9f46e..4652f45892e 100644 --- a/storage/innobase/include/trx0trx.h +++ b/storage/innobase/include/trx0trx.h @@ -198,8 +198,9 @@ which is in the prepared state */ trx_t * trx_get_trx_by_xid( /*===============*/ - /* out: trx or NULL */ - XID* xid); /* in: X/Open XA transaction identification */ + /* out: trx or NULL; + on match, the trx->xid will be invalidated */ + const XID* xid); /* in: X/Open XA transaction identifier */ /************************************************************************** If required, flushes the log to disk if we called trx_commit_for_mysql() with trx->flush_log_later == TRUE. */ diff --git a/storage/innobase/rem/rem0cmp.c b/storage/innobase/rem/rem0cmp.c index ca0ec663548..2939c119e2e 100644 --- a/storage/innobase/rem/rem0cmp.c +++ b/storage/innobase/rem/rem0cmp.c @@ -720,6 +720,10 @@ cmp_rec_rec_with_match( const ulint* offsets1,/* in: rec_get_offsets(rec1, index) */ const ulint* offsets2,/* in: rec_get_offsets(rec2, index) */ dict_index_t* index, /* in: data dictionary index */ + ibool nulls_unequal, + /* in: TRUE if this is for index statistics + cardinality estimation, and innodb_stats_method + is "nulls_unequal" or "nulls_ignored" */ ulint* matched_fields, /* in/out: number of already completely matched fields; when the function returns, contains the value the for current @@ -821,9 +825,13 @@ cmp_rec_rec_with_match( || rec2_f_len == UNIV_SQL_NULL) { if (rec1_f_len == rec2_f_len) { - - goto next_field; - + /* This is limited to stats collection, + cannot use it for regular search */ + if (nulls_unequal) { + ret = -1; + } else { + goto next_field; + } } else if (rec2_f_len == UNIV_SQL_NULL) { /* We define the SQL null to be the diff --git a/storage/innobase/row/row0vers.c b/storage/innobase/row/row0vers.c index f4adfa855df..23aca8c3f2e 100644 --- a/storage/innobase/row/row0vers.c +++ b/storage/innobase/row/row0vers.c @@ -593,11 +593,15 @@ row_vers_build_for_semi_consistent_read( mutex_enter(&kernel_mutex); version_trx = trx_get_on_id(version_trx_id); + if (version_trx + && (version_trx->conc_state == TRX_COMMITTED_IN_MEMORY + || version_trx->conc_state == TRX_NOT_STARTED)) { + + version_trx = NULL; + } mutex_exit(&kernel_mutex); - if (!version_trx - || version_trx->conc_state == TRX_NOT_STARTED - || version_trx->conc_state == TRX_COMMITTED_IN_MEMORY) { + if (!version_trx) { /* We found a version that belongs to a committed transaction: return it. */ diff --git a/storage/innobase/srv/srv0srv.c b/storage/innobase/srv/srv0srv.c index 5b1184fb416..3f6f1982992 100644 --- a/storage/innobase/srv/srv0srv.c +++ b/storage/innobase/srv/srv0srv.c @@ -218,6 +218,11 @@ ulong srv_max_buf_pool_modified_pct = 90; /* variable counts amount of data read in total (in bytes) */ ulint srv_data_read = 0; +/* Internal setting for "innodb_stats_method". Decides how InnoDB treats +NULL value when collecting statistics. By default, it is set to +SRV_STATS_NULLS_EQUAL(0), ie. all NULL value are treated equal */ +ulong srv_innodb_stats_method = SRV_STATS_NULLS_EQUAL; + /* here we count the amount of data written in total (in bytes) */ ulint srv_data_written = 0; @@ -2175,9 +2180,15 @@ srv_error_monitor_thread( os_thread_create */ { /* number of successive fatal timeouts observed */ - ulint fatal_cnt = 0; - dulint old_lsn; - dulint new_lsn; + ulint fatal_cnt = 0; + dulint old_lsn; + dulint new_lsn; + /* longest waiting thread for a semaphore */ + os_thread_id_t waiter = os_thread_get_curr_id(); + os_thread_id_t old_waiter = waiter; + /* the semaphore that is being waited for */ + const void* sema = NULL; + const void* old_sema = NULL; old_lsn = srv_start_lsn; @@ -2219,10 +2230,11 @@ loop: /* In case mutex_exit is not a memory barrier, it is theoretically possible some threads are left waiting though the semaphore is already released. Wake up those threads: */ - + sync_arr_wake_threads_if_sema_free(); - if (sync_array_print_long_waits()) { + if (sync_array_print_long_waits(&waiter, &sema) + && sema == old_sema && os_thread_eq(waiter, old_waiter)) { fatal_cnt++; if (fatal_cnt > 10) { @@ -2237,6 +2249,8 @@ loop: } } else { fatal_cnt = 0; + old_waiter = waiter; + old_sema = sema; } /* Flush stderr so that a database user gets the output diff --git a/storage/innobase/sync/sync0arr.c b/storage/innobase/sync/sync0arr.c index 154593a9035..93a7398f252 100644 --- a/storage/innobase/sync/sync0arr.c +++ b/storage/innobase/sync/sync0arr.c @@ -1,7 +1,7 @@ /****************************************************** The wait array used in synchronization primitives -(c) 1995 Innobase Oy +Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved. Created 9/5/1995 Heikki Tuuri *******************************************************/ @@ -709,7 +709,7 @@ print: fprintf(stderr, "rw-lock %p ", (void*) lock); sync_array_cell_print(stderr, cell); - rw_lock_debug_print(debug); + rw_lock_debug_print(stderr, debug); return(TRUE); } } @@ -916,10 +916,12 @@ sync_arr_wake_threads_if_sema_free(void) Prints warnings of long semaphore waits to stderr. */ ibool -sync_array_print_long_waits(void) -/*=============================*/ - /* out: TRUE if fatal semaphore wait threshold - was exceeded */ +sync_array_print_long_waits( +/*========================*/ + /* out: TRUE if fatal semaphore wait threshold + was exceeded */ + os_thread_id_t* waiter, /* out: longest waiting thread */ + const void** sema) /* out: longest-waited-for semaphore */ { sync_cell_t* cell; ibool old_val; @@ -927,24 +929,40 @@ sync_array_print_long_waits(void) ulint i; ulint fatal_timeout = srv_fatal_semaphore_wait_threshold; ibool fatal = FALSE; + double longest_diff = 0; for (i = 0; i < sync_primary_wait_array->n_cells; i++) { + double diff; + void* wait_object; + cell = sync_array_get_nth_cell(sync_primary_wait_array, i); - if (cell->wait_object != NULL && cell->waiting - && difftime(time(NULL), cell->reservation_time) > 240) { + wait_object = cell->wait_object; + + if (wait_object == NULL || !cell->waiting) { + + continue; + } + + diff = difftime(time(NULL), cell->reservation_time); + + if (diff > 240) { fputs("InnoDB: Warning: a long semaphore wait:\n", stderr); sync_array_cell_print(stderr, cell); noticed = TRUE; } - if (cell->wait_object != NULL && cell->waiting - && difftime(time(NULL), cell->reservation_time) - > fatal_timeout) { + if (diff > fatal_timeout) { fatal = TRUE; } + + if (diff > longest_diff) { + longest_diff = diff; + *sema = wait_object; + *waiter = cell->thread; + } } if (noticed) { diff --git a/storage/innobase/sync/sync0rw.c b/storage/innobase/sync/sync0rw.c index 0b05fb826ac..ef4c07e8c26 100644 --- a/storage/innobase/sync/sync0rw.c +++ b/storage/innobase/sync/sync0rw.c @@ -1,7 +1,7 @@ /****************************************************** The read-write lock (for thread synchronization) -(c) 1995 Innobase Oy +Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved. Created 9/11/1995 Heikki Tuuri *******************************************************/ @@ -830,7 +830,7 @@ rw_lock_list_print_info( info = UT_LIST_GET_FIRST(lock->debug_list); while (info != NULL) { - rw_lock_debug_print(info); + rw_lock_debug_print(file, info); info = UT_LIST_GET_NEXT(list, info); } } @@ -870,7 +870,7 @@ rw_lock_print( info = UT_LIST_GET_FIRST(lock->debug_list); while (info != NULL) { - rw_lock_debug_print(info); + rw_lock_debug_print(stderr, info); info = UT_LIST_GET_NEXT(list, info); } } @@ -882,28 +882,29 @@ Prints info of a debug struct. */ void rw_lock_debug_print( /*================*/ + FILE* f, /* in: output stream */ rw_lock_debug_t* info) /* in: debug struct */ { ulint rwt; rwt = info->lock_type; - fprintf(stderr, "Locked: thread %lu file %s line %lu ", + fprintf(f, "Locked: thread %lu file %s line %lu ", (ulong) os_thread_pf(info->thread_id), info->file_name, (ulong) info->line); if (rwt == RW_LOCK_SHARED) { - fputs("S-LOCK", stderr); + fputs("S-LOCK", f); } else if (rwt == RW_LOCK_EX) { - fputs("X-LOCK", stderr); + fputs("X-LOCK", f); } else if (rwt == RW_LOCK_WAIT_EX) { - fputs("WAIT X-LOCK", stderr); + fputs("WAIT X-LOCK", f); } else { ut_error; } if (info->pass != 0) { - fprintf(stderr, " pass value %lu", (ulong) info->pass); + fprintf(f, " pass value %lu", (ulong) info->pass); } - putc('\n', stderr); + putc('\n', f); } /******************************************************************* diff --git a/storage/innobase/trx/trx0trx.c b/storage/innobase/trx/trx0trx.c index 21f75e0818f..a82d7f452fc 100644 --- a/storage/innobase/trx/trx0trx.c +++ b/storage/innobase/trx/trx0trx.c @@ -2041,14 +2041,15 @@ which is in the prepared state */ trx_t* trx_get_trx_by_xid( /*===============*/ - /* out: trx or NULL */ - XID* xid) /* in: X/Open XA transaction identification */ + /* out: trx or NULL; + on match, the trx->xid will be invalidated */ + const XID* xid) /* in: X/Open XA transaction identifier */ { trx_t* trx; if (xid == NULL) { - return (NULL); + return(NULL); } mutex_enter(&kernel_mutex); @@ -2061,10 +2062,16 @@ trx_get_trx_by_xid( of gtrid_lenght+bqual_length bytes should be the same */ - if (xid->gtrid_length == trx->xid.gtrid_length + if (trx->conc_state == TRX_PREPARED + && xid->gtrid_length == trx->xid.gtrid_length && xid->bqual_length == trx->xid.bqual_length && memcmp(xid->data, trx->xid.data, xid->gtrid_length + xid->bqual_length) == 0) { + + /* Invalidate the XID, so that subsequent calls + will not find it. */ + memset(&trx->xid, 0, sizeof(trx->xid)); + trx->xid.formatID = -1; break; } @@ -2073,14 +2080,5 @@ trx_get_trx_by_xid( mutex_exit(&kernel_mutex); - if (trx) { - if (trx->conc_state != TRX_PREPARED) { - - return(NULL); - } - - return(trx); - } else { - return(NULL); - } + return(trx); } diff --git a/storage/innodb_plugin/ChangeLog b/storage/innodb_plugin/ChangeLog index bf003b810d2..100cf3690ce 100644 --- a/storage/innodb_plugin/ChangeLog +++ b/storage/innodb_plugin/ChangeLog @@ -1,3 +1,89 @@ +2011-03-30 The InnoDB Team + + * srv/srv0srv.c, sync/sync0arr.h, sync/sync0arr.c: + Fix Bug#11877216 InnoDB too eager to commit suicide on a busy server + +2011-03-15 The InnoDB Team + + * btr/btr0cur.c, page/page0zip.c: + Fix Bug#11849231 inflateInit() invoked without initializing all memory + +2011-02-28 The InnoDB Team + + * btr/btr0sea.c, buf/buf0buf.c, buf/buf0lru.c: + Fix Bug#58549 Race condition in buf_LRU_drop_page_hash_for_tablespace() + and compressed tables + +2011-02-15 The InnoDB Team + + * sync/sync0rw.c, innodb_bug59307.test: + Bug#59307 Valgrind: uninitialized value in + rw_lock_set_writer_id_and_recursion_flag() + +2011-02-14 The InnoDB Team + + * handler/handler0alter.cc: + Bug#59749 Enabling concurrent reads while creating non-primary + unique index gives failures + +2011-01-31 The InnoDB Team + + * btr/btr0cur.c, include/row0upd.h, + row/row0purge.c, row/row0umod.c, row/row0upd.c: + Bug#59230 assert 0 row_upd_changes_ord_field_binary() + in post-crash rollback or purge + +2011-01-27 The InnoDB Team + + * btr/btr0cur.c: + Bug#59465 btr_estimate_number_of_different_key_vals use + incorrect offset for external_size + +2011-01-27 The InnoDB Team + + * include/trx0trx.h, trx/trx0trx.c: + Bug#59440 Race condition in XA ROLLBACK and XA COMMIT + after server restart + +2011-01-25 The InnoDB Team + + * row/row0upd.c: + Bug#59585 Fix 58912 introduces compiler warning + due to potentially uninitialized variable + +2011-01-25 The InnoDB Team + + * mtr/mtr0log.c: + Bug#59486 Incorrect usage of UNIV_UNLIKELY() in mlog_parse_string() + +2011-01-25 The InnoDB Team + + * row/row0vers.c: + Fix Bug#59464 Race condition in row_vers_build_for_semi_consistent_read + +2011-01-25 The InnoDB Team + + * btr/btr0btr.c, btr/btr0cur.c, btr/btr0sea.c, + buf/buf0buddy.c, buf/buf0buf.c, buf/buf0lru.c, + include/buf0buf.h, include/buf0buf.ic, include/buf0lru.h, + mem/mem0mem.c, page/page0zip.c: + Fix Bug#59707 Unused compression-related parameters + in buffer pool functions + +2011-01-18 The InnoDB Team + + * include/sync0rw.h, sync/sync0arr.c, sync/sync0rw.c: + Fix Bug#59579 rw_lock_debug_print outputs to stderr, not to + SHOW ENGINE INNODB STATUS + +2011-01-14 The InnoDB Team + * btr/btr0cur.c, dict/dict0dict.c, handler/ha_innodb.cc, + include/btr0cur.h, include/dict0mem.h, include/rem0cmp.h, + include/rem0cmp.ic, include/srv0srv.h, rem/rem0cmp.c, + srv/srv0srv.c, innodb_bug30423.test: + Fix Bug#30423 InnoDBs treatment of NULL in index stats causes + bad "rows examined" estimates + 2011-01-06 The InnoDB Team * row/row0merge.c: Fix Bug#59312 Examine MAX_FULL_NAME_LEN in InnoDB to address diff --git a/storage/innodb_plugin/btr/btr0btr.c b/storage/innodb_plugin/btr/btr0btr.c index 32e2caecdb8..46810c011c4 100644 --- a/storage/innodb_plugin/btr/btr0btr.c +++ b/storage/innodb_plugin/btr/btr0btr.c @@ -42,6 +42,560 @@ Created 6/2/1994 Heikki Tuuri #include "ibuf0ibuf.h" #include "trx0trx.h" +#ifdef UNIV_BLOB_DEBUG +# include "srv0srv.h" +# include "ut0rbt.h" + +/** TRUE when messages about index->blobs modification are enabled. */ +static ibool btr_blob_dbg_msg; + +/** Issue a message about an operation on index->blobs. +@param op operation +@param b the entry being subjected to the operation +@param ctx the context of the operation */ +#define btr_blob_dbg_msg_issue(op, b, ctx) \ + fprintf(stderr, op " %u:%u:%u->%u %s(%u,%u,%u)\n", \ + (b)->ref_page_no, (b)->ref_heap_no, \ + (b)->ref_field_no, (b)->blob_page_no, ctx, \ + (b)->owner, (b)->always_owner, (b)->del) + +/** Insert to index->blobs a reference to an off-page column. +@param index the index tree +@param b the reference +@param ctx context (for logging) */ +UNIV_INTERN +void +btr_blob_dbg_rbt_insert( +/*====================*/ + dict_index_t* index, /*!< in/out: index tree */ + const btr_blob_dbg_t* b, /*!< in: the reference */ + const char* ctx) /*!< in: context (for logging) */ +{ + if (btr_blob_dbg_msg) { + btr_blob_dbg_msg_issue("insert", b, ctx); + } + mutex_enter(&index->blobs_mutex); + rbt_insert(index->blobs, b, b); + mutex_exit(&index->blobs_mutex); +} + +/** Remove from index->blobs a reference to an off-page column. +@param index the index tree +@param b the reference +@param ctx context (for logging) */ +UNIV_INTERN +void +btr_blob_dbg_rbt_delete( +/*====================*/ + dict_index_t* index, /*!< in/out: index tree */ + const btr_blob_dbg_t* b, /*!< in: the reference */ + const char* ctx) /*!< in: context (for logging) */ +{ + if (btr_blob_dbg_msg) { + btr_blob_dbg_msg_issue("delete", b, ctx); + } + mutex_enter(&index->blobs_mutex); + ut_a(rbt_delete(index->blobs, b)); + mutex_exit(&index->blobs_mutex); +} + +/**************************************************************//** +Comparator for items (btr_blob_dbg_t) in index->blobs. +The key in index->blobs is (ref_page_no, ref_heap_no, ref_field_no). +@return negative, 0 or positive if *a<*b, *a=*b, *a>*b */ +static +int +btr_blob_dbg_cmp( +/*=============*/ + const void* a, /*!< in: first btr_blob_dbg_t to compare */ + const void* b) /*!< in: second btr_blob_dbg_t to compare */ +{ + const btr_blob_dbg_t* aa = a; + const btr_blob_dbg_t* bb = b; + + ut_ad(aa != NULL); + ut_ad(bb != NULL); + + if (aa->ref_page_no != bb->ref_page_no) { + return(aa->ref_page_no < bb->ref_page_no ? -1 : 1); + } + if (aa->ref_heap_no != bb->ref_heap_no) { + return(aa->ref_heap_no < bb->ref_heap_no ? -1 : 1); + } + if (aa->ref_field_no != bb->ref_field_no) { + return(aa->ref_field_no < bb->ref_field_no ? -1 : 1); + } + return(0); +} + +/**************************************************************//** +Add a reference to an off-page column to the index->blobs map. */ +UNIV_INTERN +void +btr_blob_dbg_add_blob( +/*==================*/ + const rec_t* rec, /*!< in: clustered index record */ + ulint field_no, /*!< in: off-page column number */ + ulint page_no, /*!< in: start page of the column */ + dict_index_t* index, /*!< in/out: index tree */ + const char* ctx) /*!< in: context (for logging) */ +{ + btr_blob_dbg_t b; + const page_t* page = page_align(rec); + + ut_a(index->blobs); + + b.blob_page_no = page_no; + b.ref_page_no = page_get_page_no(page); + b.ref_heap_no = page_rec_get_heap_no(rec); + b.ref_field_no = field_no; + ut_a(b.ref_field_no >= index->n_uniq); + b.always_owner = b.owner = TRUE; + b.del = FALSE; + ut_a(!rec_get_deleted_flag(rec, page_is_comp(page))); + btr_blob_dbg_rbt_insert(index, &b, ctx); +} + +/**************************************************************//** +Add to index->blobs any references to off-page columns from a record. +@return number of references added */ +UNIV_INTERN +ulint +btr_blob_dbg_add_rec( +/*=================*/ + const rec_t* rec, /*!< in: record */ + dict_index_t* index, /*!< in/out: index */ + const ulint* offsets,/*!< in: offsets */ + const char* ctx) /*!< in: context (for logging) */ +{ + ulint count = 0; + ulint i; + btr_blob_dbg_t b; + ibool del; + + ut_ad(rec_offs_validate(rec, index, offsets)); + + if (!rec_offs_any_extern(offsets)) { + return(0); + } + + b.ref_page_no = page_get_page_no(page_align(rec)); + b.ref_heap_no = page_rec_get_heap_no(rec); + del = (rec_get_deleted_flag(rec, rec_offs_comp(offsets)) != 0); + + for (i = 0; i < rec_offs_n_fields(offsets); i++) { + if (rec_offs_nth_extern(offsets, i)) { + ulint len; + const byte* field_ref = rec_get_nth_field( + rec, offsets, i, &len); + + ut_a(len != UNIV_SQL_NULL); + ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE); + field_ref += len - BTR_EXTERN_FIELD_REF_SIZE; + + if (!memcmp(field_ref, field_ref_zero, + BTR_EXTERN_FIELD_REF_SIZE)) { + /* the column has not been stored yet */ + continue; + } + + b.ref_field_no = i; + b.blob_page_no = mach_read_from_4( + field_ref + BTR_EXTERN_PAGE_NO); + ut_a(b.ref_field_no >= index->n_uniq); + b.always_owner = b.owner + = !(field_ref[BTR_EXTERN_LEN] + & BTR_EXTERN_OWNER_FLAG); + b.del = del; + + btr_blob_dbg_rbt_insert(index, &b, ctx); + count++; + } + } + + return(count); +} + +/**************************************************************//** +Display the references to off-page columns. +This function is to be called from a debugger, +for example when a breakpoint on ut_dbg_assertion_failed is hit. */ +UNIV_INTERN +void +btr_blob_dbg_print( +/*===============*/ + const dict_index_t* index) /*!< in: index tree */ +{ + const ib_rbt_node_t* node; + + if (!index->blobs) { + return; + } + + /* We intentionally do not acquire index->blobs_mutex here. + This function is to be called from a debugger, and the caller + should make sure that the index->blobs_mutex is held. */ + + for (node = rbt_first(index->blobs); + node != NULL; node = rbt_next(index->blobs, node)) { + const btr_blob_dbg_t* b + = rbt_value(btr_blob_dbg_t, node); + fprintf(stderr, "%u:%u:%u->%u%s%s%s\n", + b->ref_page_no, b->ref_heap_no, b->ref_field_no, + b->blob_page_no, + b->owner ? "" : "(disowned)", + b->always_owner ? "" : "(has disowned)", + b->del ? "(deleted)" : ""); + } +} + +/**************************************************************//** +Remove from index->blobs any references to off-page columns from a record. +@return number of references removed */ +UNIV_INTERN +ulint +btr_blob_dbg_remove_rec( +/*====================*/ + const rec_t* rec, /*!< in: record */ + dict_index_t* index, /*!< in/out: index */ + const ulint* offsets,/*!< in: offsets */ + const char* ctx) /*!< in: context (for logging) */ +{ + ulint i; + ulint count = 0; + btr_blob_dbg_t b; + + ut_ad(rec_offs_validate(rec, index, offsets)); + + if (!rec_offs_any_extern(offsets)) { + return(0); + } + + b.ref_page_no = page_get_page_no(page_align(rec)); + b.ref_heap_no = page_rec_get_heap_no(rec); + + for (i = 0; i < rec_offs_n_fields(offsets); i++) { + if (rec_offs_nth_extern(offsets, i)) { + ulint len; + const byte* field_ref = rec_get_nth_field( + rec, offsets, i, &len); + + ut_a(len != UNIV_SQL_NULL); + ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE); + field_ref += len - BTR_EXTERN_FIELD_REF_SIZE; + + b.ref_field_no = i; + b.blob_page_no = mach_read_from_4( + field_ref + BTR_EXTERN_PAGE_NO); + + switch (b.blob_page_no) { + case 0: + /* The column has not been stored yet. + The BLOB pointer must be all zero. + There cannot be a BLOB starting at + page 0, because page 0 is reserved for + the tablespace header. */ + ut_a(!memcmp(field_ref, field_ref_zero, + BTR_EXTERN_FIELD_REF_SIZE)); + /* fall through */ + case FIL_NULL: + /* the column has been freed already */ + continue; + } + + btr_blob_dbg_rbt_delete(index, &b, ctx); + count++; + } + } + + return(count); +} + +/**************************************************************//** +Check that there are no references to off-page columns from or to +the given page. Invoked when freeing or clearing a page. +@return TRUE when no orphan references exist */ +UNIV_INTERN +ibool +btr_blob_dbg_is_empty( +/*==================*/ + dict_index_t* index, /*!< in: index */ + ulint page_no) /*!< in: page number */ +{ + const ib_rbt_node_t* node; + ibool success = TRUE; + + if (!index->blobs) { + return(success); + } + + mutex_enter(&index->blobs_mutex); + + for (node = rbt_first(index->blobs); + node != NULL; node = rbt_next(index->blobs, node)) { + const btr_blob_dbg_t* b + = rbt_value(btr_blob_dbg_t, node); + + if (b->ref_page_no != page_no && b->blob_page_no != page_no) { + continue; + } + + fprintf(stderr, + "InnoDB: orphan BLOB ref%s%s%s %u:%u:%u->%u\n", + b->owner ? "" : "(disowned)", + b->always_owner ? "" : "(has disowned)", + b->del ? "(deleted)" : "", + b->ref_page_no, b->ref_heap_no, b->ref_field_no, + b->blob_page_no); + + if (b->blob_page_no != page_no || b->owner || !b->del) { + success = FALSE; + } + } + + mutex_exit(&index->blobs_mutex); + return(success); +} + +/**************************************************************//** +Count and process all references to off-page columns on a page. +@return number of references processed */ +UNIV_INTERN +ulint +btr_blob_dbg_op( +/*============*/ + const page_t* page, /*!< in: B-tree leaf page */ + const rec_t* rec, /*!< in: record to start from + (NULL to process the whole page) */ + dict_index_t* index, /*!< in/out: index */ + const char* ctx, /*!< in: context (for logging) */ + const btr_blob_dbg_op_f op) /*!< in: operation on records */ +{ + ulint count = 0; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + rec_offs_init(offsets_); + + ut_a(fil_page_get_type(page) == FIL_PAGE_INDEX); + ut_a(!rec || page_align(rec) == page); + + if (!index->blobs || !page_is_leaf(page) + || !dict_index_is_clust(index)) { + return(0); + } + + if (rec == NULL) { + rec = page_get_infimum_rec(page); + } + + do { + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); + count += op(rec, index, offsets, ctx); + rec = page_rec_get_next_const(rec); + } while (!page_rec_is_supremum(rec)); + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + return(count); +} + +/**************************************************************//** +Count and add to index->blobs any references to off-page columns +from records on a page. +@return number of references added */ +UNIV_INTERN +ulint +btr_blob_dbg_add( +/*=============*/ + const page_t* page, /*!< in: rewritten page */ + dict_index_t* index, /*!< in/out: index */ + const char* ctx) /*!< in: context (for logging) */ +{ + btr_blob_dbg_assert_empty(index, page_get_page_no(page)); + + return(btr_blob_dbg_op(page, NULL, index, ctx, btr_blob_dbg_add_rec)); +} + +/**************************************************************//** +Count and remove from index->blobs any references to off-page columns +from records on a page. +Used when reorganizing a page, before copying the records. +@return number of references removed */ +UNIV_INTERN +ulint +btr_blob_dbg_remove( +/*================*/ + const page_t* page, /*!< in: b-tree page */ + dict_index_t* index, /*!< in/out: index */ + const char* ctx) /*!< in: context (for logging) */ +{ + ulint count; + + count = btr_blob_dbg_op(page, NULL, index, ctx, + btr_blob_dbg_remove_rec); + + /* Check that no references exist. */ + btr_blob_dbg_assert_empty(index, page_get_page_no(page)); + + return(count); +} + +/**************************************************************//** +Restore in index->blobs any references to off-page columns +Used when page reorganize fails due to compressed page overflow. */ +UNIV_INTERN +void +btr_blob_dbg_restore( +/*=================*/ + const page_t* npage, /*!< in: page that failed to compress */ + const page_t* page, /*!< in: copy of original page */ + dict_index_t* index, /*!< in/out: index */ + const char* ctx) /*!< in: context (for logging) */ +{ + ulint removed; + ulint added; + + ut_a(page_get_page_no(npage) == page_get_page_no(page)); + ut_a(page_get_space_id(npage) == page_get_space_id(page)); + + removed = btr_blob_dbg_remove(npage, index, ctx); + added = btr_blob_dbg_add(page, index, ctx); + ut_a(added == removed); +} + +/**************************************************************//** +Modify the 'deleted' flag of a record. */ +UNIV_INTERN +void +btr_blob_dbg_set_deleted_flag( +/*==========================*/ + const rec_t* rec, /*!< in: record */ + dict_index_t* index, /*!< in/out: index */ + const ulint* offsets,/*!< in: rec_get_offs(rec, index) */ + ibool del) /*!< in: TRUE=deleted, FALSE=exists */ +{ + const ib_rbt_node_t* node; + btr_blob_dbg_t b; + btr_blob_dbg_t* c; + ulint i; + + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_a(dict_index_is_clust(index)); + ut_a(del == !!del);/* must be FALSE==0 or TRUE==1 */ + + if (!rec_offs_any_extern(offsets) || !index->blobs) { + + return; + } + + b.ref_page_no = page_get_page_no(page_align(rec)); + b.ref_heap_no = page_rec_get_heap_no(rec); + + for (i = 0; i < rec_offs_n_fields(offsets); i++) { + if (rec_offs_nth_extern(offsets, i)) { + ulint len; + const byte* field_ref = rec_get_nth_field( + rec, offsets, i, &len); + + ut_a(len != UNIV_SQL_NULL); + ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE); + field_ref += len - BTR_EXTERN_FIELD_REF_SIZE; + + b.ref_field_no = i; + b.blob_page_no = mach_read_from_4( + field_ref + BTR_EXTERN_PAGE_NO); + + switch (b.blob_page_no) { + case 0: + ut_a(memcmp(field_ref, field_ref_zero, + BTR_EXTERN_FIELD_REF_SIZE)); + /* page number 0 is for the + page allocation bitmap */ + case FIL_NULL: + /* the column has been freed already */ + ut_error; + } + + mutex_enter(&index->blobs_mutex); + node = rbt_lookup(index->blobs, &b); + ut_a(node); + + c = rbt_value(btr_blob_dbg_t, node); + /* The flag should be modified. */ + c->del = del; + if (btr_blob_dbg_msg) { + b = *c; + mutex_exit(&index->blobs_mutex); + btr_blob_dbg_msg_issue("del_mk", &b, ""); + } else { + mutex_exit(&index->blobs_mutex); + } + } + } +} + +/**************************************************************//** +Change the ownership of an off-page column. */ +UNIV_INTERN +void +btr_blob_dbg_owner( +/*===============*/ + const rec_t* rec, /*!< in: record */ + dict_index_t* index, /*!< in/out: index */ + const ulint* offsets,/*!< in: rec_get_offs(rec, index) */ + ulint i, /*!< in: ith field in rec */ + ibool own) /*!< in: TRUE=owned, FALSE=disowned */ +{ + const ib_rbt_node_t* node; + btr_blob_dbg_t b; + const byte* field_ref; + ulint len; + + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_a(rec_offs_nth_extern(offsets, i)); + + field_ref = rec_get_nth_field(rec, offsets, i, &len); + ut_a(len != UNIV_SQL_NULL); + ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE); + field_ref += len - BTR_EXTERN_FIELD_REF_SIZE; + + b.ref_page_no = page_get_page_no(page_align(rec)); + b.ref_heap_no = page_rec_get_heap_no(rec); + b.ref_field_no = i; + b.owner = !(field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_OWNER_FLAG); + b.blob_page_no = mach_read_from_4(field_ref + BTR_EXTERN_PAGE_NO); + + ut_a(b.owner == own); + + mutex_enter(&index->blobs_mutex); + node = rbt_lookup(index->blobs, &b); + /* row_ins_clust_index_entry_by_modify() invokes + btr_cur_unmark_extern_fields() also for the newly inserted + references, which are all zero bytes until the columns are stored. + The node lookup must fail if and only if that is the case. */ + ut_a(!memcmp(field_ref, field_ref_zero, BTR_EXTERN_FIELD_REF_SIZE) + == !node); + + if (node) { + btr_blob_dbg_t* c = rbt_value(btr_blob_dbg_t, node); + /* Some code sets ownership from TRUE to TRUE. + We do not allow changing ownership from FALSE to FALSE. */ + ut_a(own || c->owner); + + c->owner = own; + if (!own) { + c->always_owner = FALSE; + } + } + + mutex_exit(&index->blobs_mutex); +} +#endif /* UNIV_BLOB_DEBUG */ + /* Latching strategy of the InnoDB B-tree -------------------------------------- @@ -296,6 +850,7 @@ btr_page_create( page_t* page = buf_block_get_frame(block); ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); + btr_blob_dbg_assert_empty(index, buf_block_get_page_no(block)); if (UNIV_LIKELY_NULL(page_zip)) { page_create_zip(block, index, level, mtr); @@ -489,6 +1044,7 @@ btr_page_free_low( modify clock */ buf_block_modify_clock_inc(block); + btr_blob_dbg_assert_empty(index, buf_block_get_page_no(block)); if (dict_index_is_ibuf(index)) { @@ -773,6 +1329,13 @@ btr_create( block = buf_page_get(space, zip_size, page_no, RW_X_LATCH, mtr); } else { +#ifdef UNIV_BLOB_DEBUG + if ((type & DICT_CLUSTERED) && !index->blobs) { + mutex_create(&index->blobs_mutex, SYNC_ANY_LATCH); + index->blobs = rbt_create(sizeof(btr_blob_dbg_t), + btr_blob_dbg_cmp); + } +#endif /* UNIV_BLOB_DEBUG */ block = fseg_create(space, 0, PAGE_HEADER + PAGE_BTR_SEG_TOP, mtr); } @@ -979,7 +1542,7 @@ btr_page_reorganize_low( log_mode = mtr_set_log_mode(mtr, MTR_LOG_NONE); #ifndef UNIV_HOTBACKUP - temp_block = buf_block_alloc(0); + temp_block = buf_block_alloc(); #else /* !UNIV_HOTBACKUP */ ut_ad(block == back_block1); temp_block = back_block2; @@ -996,6 +1559,7 @@ btr_page_reorganize_low( block->check_index_page_at_flush = TRUE; #endif /* !UNIV_HOTBACKUP */ + btr_blob_dbg_remove(page, index, "btr_page_reorganize"); /* Recreate the page: note that global data on page (possible segment headers, next page-field, etc.) is preserved intact */ @@ -1024,6 +1588,8 @@ btr_page_reorganize_low( (!page_zip_compress(page_zip, page, index, NULL))) { /* Restore the old page and exit. */ + btr_blob_dbg_restore(page, temp_page, index, + "btr_page_reorganize_compress_fail"); #if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG /* Check that the bytes that we skip are identical. */ @@ -1157,6 +1723,7 @@ btr_page_empty( #endif /* UNIV_ZIP_DEBUG */ btr_search_drop_page_hash_index(block); + btr_blob_dbg_remove(page, index, "btr_page_empty"); /* Recreate the page: note that global data on page (possible segment headers, next page-field, etc.) is preserved intact */ @@ -2497,6 +3064,7 @@ btr_lift_page_up( index); } + btr_blob_dbg_remove(page, index, "btr_lift_page_up"); lock_update_copy_and_discard(father_block, block); /* Go upward to root page, decrementing levels by one. */ @@ -2758,6 +3326,7 @@ err_exit: lock_update_merge_right(merge_block, orig_succ, block); } + btr_blob_dbg_remove(page, index, "btr_compress"); mem_heap_free(heap); if (!dict_index_is_clust(index) && page_is_leaf(merge_page)) { @@ -2988,6 +3557,8 @@ btr_discard_page( block); } + btr_blob_dbg_remove(page, index, "btr_discard_page"); + /* Free the file page */ btr_page_free(index, block, mtr); diff --git a/storage/innodb_plugin/btr/btr0cur.c b/storage/innodb_plugin/btr/btr0cur.c index c57255a25ae..d7b5ed0d135 100644 --- a/storage/innodb_plugin/btr/btr0cur.c +++ b/storage/innodb_plugin/btr/btr0cur.c @@ -100,6 +100,18 @@ can be released by page reorganize, then it is reorganized */ /*--------------------------------------*/ #define BTR_BLOB_HDR_SIZE 8 /*!< Size of a BLOB part header, in bytes */ + +/** Estimated table level stats from sampled value. +@param value sampled stats +@param index index being sampled +@param sample number of sampled rows +@param ext_size external stored data size +@param not_empty table not empty +@return estimated table wide stats from sampled value */ +#define BTR_TABLE_STATS_FROM_SAMPLE(value, index, sample, ext_size, not_empty)\ + (((value) * (ib_int64_t) index->stat_n_leaf_pages \ + + (sample) - 1 + (ext_size) + (not_empty)) / ((sample) + (ext_size))) + /* @} */ #endif /* !UNIV_HOTBACKUP */ @@ -174,7 +186,7 @@ static ulint btr_rec_get_externally_stored_len( /*==============================*/ - rec_t* rec, /*!< in: record */ + const rec_t* rec, /*!< in: record */ const ulint* offsets);/*!< in: array returned by rec_get_offsets() */ #endif /* !UNIV_HOTBACKUP */ @@ -1756,8 +1768,8 @@ btr_cur_update_in_place( NOT call it if index is secondary */ if (!dict_index_is_clust(index) - || row_upd_changes_ord_field_binary(NULL, NULL, - index, update)) { + || row_upd_changes_ord_field_binary(index, update, thr, + NULL, NULL)) { /* Remove possible hash index pointer to this record */ btr_search_update_hash_on_delete(cursor); @@ -2560,6 +2572,7 @@ btr_cur_del_mark_set_clust_rec( page_zip = buf_block_get_page_zip(block); + btr_blob_dbg_set_deleted_flag(rec, index, offsets, val); btr_rec_set_deleted_flag(rec, page_zip, val); trx = thr_get_trx(thr); @@ -3201,9 +3214,54 @@ btr_estimate_n_rows_in_range( } /*******************************************************************//** +Record the number of non_null key values in a given index for +each n-column prefix of the index where n < dict_index_get_n_unique(index). +The estimates are eventually stored in the array: +index->stat_n_non_null_key_vals. */ +static +void +btr_record_not_null_field_in_rec( +/*=============================*/ + rec_t* rec, /*!< in: physical record */ + ulint n_unique, /*!< in: dict_index_get_n_unique(index), + number of columns uniquely determine + an index entry */ + const ulint* offsets, /*!< in: rec_get_offsets(rec, index), + its size could be for all fields or + that of "n_unique" */ + ib_int64_t* n_not_null) /*!< in/out: array to record number of + not null rows for n-column prefix */ +{ + ulint i; + + ut_ad(rec_offs_n_fields(offsets) >= n_unique); + + if (n_not_null == NULL) { + return; + } + + for (i = 0; i < n_unique; i++) { + ulint rec_len; + byte* field; + + field = rec_get_nth_field(rec, offsets, i, &rec_len); + + if (rec_len != UNIV_SQL_NULL) { + n_not_null[i]++; + } else { + /* Break if we hit the first NULL value */ + break; + } + } +} + +/*******************************************************************//** Estimates the number of different key values in a given index, for each n-column prefix of the index where n <= dict_index_get_n_unique(index). -The estimates are stored in the array index->stat_n_diff_key_vals. */ +The estimates are stored in the array index->stat_n_diff_key_vals. +If innodb_stats_method is "nulls_ignored", we also record the number of +non-null values for each prefix and store the estimates in +array index->stat_n_non_null_key_vals. */ UNIV_INTERN void btr_estimate_number_of_different_key_vals( @@ -3217,6 +3275,8 @@ btr_estimate_number_of_different_key_vals( ulint matched_fields; ulint matched_bytes; ib_int64_t* n_diff; + ib_int64_t* n_not_null; + ibool stats_null_not_equal; ullint n_sample_pages; /* number of pages to sample */ ulint not_empty_flag = 0; ulint total_external_size = 0; @@ -3225,16 +3285,43 @@ btr_estimate_number_of_different_key_vals( ullint add_on; mtr_t mtr; mem_heap_t* heap = NULL; - ulint offsets_rec_[REC_OFFS_NORMAL_SIZE]; - ulint offsets_next_rec_[REC_OFFS_NORMAL_SIZE]; - ulint* offsets_rec = offsets_rec_; - ulint* offsets_next_rec= offsets_next_rec_; - rec_offs_init(offsets_rec_); - rec_offs_init(offsets_next_rec_); + ulint* offsets_rec = NULL; + ulint* offsets_next_rec = NULL; n_cols = dict_index_get_n_unique(index); - n_diff = mem_zalloc((n_cols + 1) * sizeof(ib_int64_t)); + heap = mem_heap_create((sizeof *n_diff + sizeof *n_not_null) + * (n_cols + 1) + + dict_index_get_n_fields(index) + * (sizeof *offsets_rec + + sizeof *offsets_next_rec)); + + n_diff = mem_heap_zalloc(heap, (n_cols + 1) * sizeof(ib_int64_t)); + + n_not_null = NULL; + + /* Check srv_innodb_stats_method setting, and decide whether we + need to record non-null value and also decide if NULL is + considered equal (by setting stats_null_not_equal value) */ + switch (srv_innodb_stats_method) { + case SRV_STATS_NULLS_IGNORED: + n_not_null = mem_heap_zalloc(heap, (n_cols + 1) + * sizeof *n_not_null); + /* fall through */ + + case SRV_STATS_NULLS_UNEQUAL: + /* for both SRV_STATS_NULLS_IGNORED and SRV_STATS_NULLS_UNEQUAL + case, we will treat NULLs as unequal value */ + stats_null_not_equal = TRUE; + break; + + case SRV_STATS_NULLS_EQUAL: + stats_null_not_equal = FALSE; + break; + + default: + ut_error; + } /* It makes no sense to test more pages than are contained in the index, thus we lower the number if it is too high */ @@ -3251,7 +3338,6 @@ btr_estimate_number_of_different_key_vals( /* We sample some pages in the index to get an estimate */ for (i = 0; i < n_sample_pages; i++) { - rec_t* supremum; mtr_start(&mtr); btr_cur_open_at_rnd_pos(index, BTR_SEARCH_LEAF, &cursor, &mtr); @@ -3264,18 +3350,25 @@ btr_estimate_number_of_different_key_vals( page = btr_cur_get_page(&cursor); - supremum = page_get_supremum_rec(page); rec = page_rec_get_next(page_get_infimum_rec(page)); - if (rec != supremum) { + if (!page_rec_is_supremum(rec)) { not_empty_flag = 1; offsets_rec = rec_get_offsets(rec, index, offsets_rec, ULINT_UNDEFINED, &heap); + + if (n_not_null) { + btr_record_not_null_field_in_rec( + rec, n_cols, offsets_rec, n_not_null); + } } - while (rec != supremum) { + while (!page_rec_is_supremum(rec)) { rec_t* next_rec = page_rec_get_next(rec); - if (next_rec == supremum) { + if (page_rec_is_supremum(next_rec)) { + total_external_size += + btr_rec_get_externally_stored_len( + rec, offsets_rec); break; } @@ -3283,11 +3376,13 @@ btr_estimate_number_of_different_key_vals( matched_bytes = 0; offsets_next_rec = rec_get_offsets(next_rec, index, offsets_next_rec, - n_cols, &heap); + ULINT_UNDEFINED, + &heap); cmp_rec_rec_with_match(rec, next_rec, offsets_rec, offsets_next_rec, - index, &matched_fields, + index, stats_null_not_equal, + &matched_fields, &matched_bytes); for (j = matched_fields + 1; j <= n_cols; j++) { @@ -3297,6 +3392,12 @@ btr_estimate_number_of_different_key_vals( n_diff[j]++; } + if (n_not_null) { + btr_record_not_null_field_in_rec( + next_rec, n_cols, offsets_next_rec, + n_not_null); + } + total_external_size += btr_rec_get_externally_stored_len( rec, offsets_rec); @@ -3331,10 +3432,6 @@ btr_estimate_number_of_different_key_vals( } } - offsets_rec = rec_get_offsets(rec, index, offsets_rec, - ULINT_UNDEFINED, &heap); - total_external_size += btr_rec_get_externally_stored_len( - rec, offsets_rec); mtr_commit(&mtr); } @@ -3348,13 +3445,9 @@ btr_estimate_number_of_different_key_vals( for (j = 0; j <= n_cols; j++) { index->stat_n_diff_key_vals[j] - = ((n_diff[j] - * (ib_int64_t)index->stat_n_leaf_pages - + n_sample_pages - 1 - + total_external_size - + not_empty_flag) - / (n_sample_pages - + total_external_size)); + = BTR_TABLE_STATS_FROM_SAMPLE( + n_diff[j], index, n_sample_pages, + total_external_size, not_empty_flag); /* If the tree is small, smaller than 10 * n_sample_pages + total_external_size, then @@ -3373,45 +3466,81 @@ btr_estimate_number_of_different_key_vals( } index->stat_n_diff_key_vals[j] += add_on; - } - mem_free(n_diff); - if (UNIV_LIKELY_NULL(heap)) { - mem_heap_free(heap); + /* Update the stat_n_non_null_key_vals[] with our + sampled result. stat_n_non_null_key_vals[] is created + and initialized to zero in dict_index_add_to_cache(), + along with stat_n_diff_key_vals[] array */ + if (n_not_null != NULL && (j < n_cols)) { + index->stat_n_non_null_key_vals[j] = + BTR_TABLE_STATS_FROM_SAMPLE( + n_not_null[j], index, n_sample_pages, + total_external_size, not_empty_flag); + } } + + mem_heap_free(heap); } /*================== EXTERNAL STORAGE OF BIG FIELDS ===================*/ /***********************************************************//** +Gets the offset of the pointer to the externally stored part of a field. +@return offset of the pointer to the externally stored part */ +static +ulint +btr_rec_get_field_ref_offs( +/*=======================*/ + const ulint* offsets,/*!< in: array returned by rec_get_offsets() */ + ulint n) /*!< in: index of the external field */ +{ + ulint field_ref_offs; + ulint local_len; + + ut_a(rec_offs_nth_extern(offsets, n)); + field_ref_offs = rec_get_nth_field_offs(offsets, n, &local_len); + ut_a(local_len != UNIV_SQL_NULL); + ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE); + + return(field_ref_offs + local_len - BTR_EXTERN_FIELD_REF_SIZE); +} + +/** Gets a pointer to the externally stored part of a field. +@param rec record +@param offsets rec_get_offsets(rec) +@param n index of the externally stored field +@return pointer to the externally stored part */ +#define btr_rec_get_field_ref(rec, offsets, n) \ + ((rec) + btr_rec_get_field_ref_offs(offsets, n)) + +/***********************************************************//** Gets the externally stored size of a record, in units of a database page. @return externally stored part, in units of a database page */ static ulint btr_rec_get_externally_stored_len( /*==============================*/ - rec_t* rec, /*!< in: record */ + const rec_t* rec, /*!< in: record */ const ulint* offsets)/*!< in: array returned by rec_get_offsets() */ { ulint n_fields; - byte* data; - ulint local_len; - ulint extern_len; ulint total_extern_len = 0; ulint i; ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec)); + + if (!rec_offs_any_extern(offsets)) { + return(0); + } + n_fields = rec_offs_n_fields(offsets); for (i = 0; i < n_fields; i++) { if (rec_offs_nth_extern(offsets, i)) { - data = rec_get_nth_field(rec, offsets, i, &local_len); - - local_len -= BTR_EXTERN_FIELD_REF_SIZE; - - extern_len = mach_read_from_4(data + local_len - + BTR_EXTERN_LEN + 4); + ulint extern_len = mach_read_from_4( + btr_rec_get_field_ref(rec, offsets, i) + + BTR_EXTERN_LEN + 4); total_extern_len += ut_calc_align(extern_len, UNIV_PAGE_SIZE); @@ -3441,7 +3570,7 @@ btr_cur_set_ownership_of_extern_field( ulint byte_val; data = rec_get_nth_field(rec, offsets, i, &local_len); - + ut_ad(rec_offs_nth_extern(offsets, i)); ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE); local_len -= BTR_EXTERN_FIELD_REF_SIZE; @@ -3451,6 +3580,9 @@ btr_cur_set_ownership_of_extern_field( if (val) { byte_val = byte_val & (~BTR_EXTERN_OWNER_FLAG); } else { +#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG + ut_a(!(byte_val & BTR_EXTERN_OWNER_FLAG)); +#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ byte_val = byte_val | BTR_EXTERN_OWNER_FLAG; } @@ -3464,6 +3596,8 @@ btr_cur_set_ownership_of_extern_field( } else { mach_write_to_1(data + local_len + BTR_EXTERN_LEN, byte_val); } + + btr_blob_dbg_owner(rec, index, offsets, i, val); } /*******************************************************************//** @@ -3667,13 +3801,12 @@ btr_blob_free( && buf_block_get_space(block) == space && buf_block_get_page_no(block) == page_no) { - if (buf_LRU_free_block(&block->page, all, NULL) - != BUF_LRU_FREED + if (buf_LRU_free_block(&block->page, all) != BUF_LRU_FREED && all && block->page.zip.data) { /* Attempt to deallocate the uncompressed page if the whole block cannot be deallocted. */ - buf_LRU_free_block(&block->page, FALSE, NULL); + buf_LRU_free_block(&block->page, FALSE); } } @@ -3689,8 +3822,8 @@ file segment of the index tree. @return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */ UNIV_INTERN ulint -btr_store_big_rec_extern_fields( -/*============================*/ +btr_store_big_rec_extern_fields_func( +/*=================================*/ dict_index_t* index, /*!< in: index of rec; the index tree MUST be X-latched */ buf_block_t* rec_block, /*!< in/out: block containing rec */ @@ -3699,11 +3832,17 @@ btr_store_big_rec_extern_fields( the "external storage" flags in offsets will not correspond to rec when this function returns */ - big_rec_t* big_rec_vec, /*!< in: vector containing fields +#ifdef UNIV_DEBUG + mtr_t* local_mtr, /*!< in: mtr containing the + latch to rec and to the tree */ +#endif /* UNIV_DEBUG */ +#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG + ibool update_in_place,/*! in: TRUE if the record is updated + in place (not delete+insert) */ +#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ + const big_rec_t*big_rec_vec) /*!< in: vector containing fields to be stored externally */ - mtr_t* local_mtr __attribute__((unused))) /*!< in: mtr - containing the latch to rec and to the - tree */ + { ulint rec_page_no; byte* field_ref; @@ -3721,6 +3860,7 @@ btr_store_big_rec_extern_fields( z_stream c_stream; ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(rec_offs_any_extern(offsets)); ut_ad(mtr_memo_contains(local_mtr, dict_index_get_lock(index), MTR_MEMO_X_LOCK)); ut_ad(mtr_memo_contains(local_mtr, rec_block, MTR_MEMO_PAGE_X_FIX)); @@ -3752,21 +3892,37 @@ btr_store_big_rec_extern_fields( ut_a(err == Z_OK); } +#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG + /* All pointers to externally stored columns in the record + must either be zero or they must be pointers to inherited + columns, owned by this record or an earlier record version. */ + for (i = 0; i < rec_offs_n_fields(offsets); i++) { + if (!rec_offs_nth_extern(offsets, i)) { + continue; + } + field_ref = btr_rec_get_field_ref(rec, offsets, i); + + ut_a(!(field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_OWNER_FLAG)); + /* Either this must be an update in place, + or the BLOB must be inherited, or the BLOB pointer + must be zero (will be written in this function). */ + ut_a(update_in_place + || (field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_INHERITED_FLAG) + || !memcmp(field_ref, field_ref_zero, + BTR_EXTERN_FIELD_REF_SIZE)); + } +#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ /* We have to create a file segment to the tablespace for each field and put the pointer to the field in rec */ for (i = 0; i < big_rec_vec->n_fields; i++) { - ut_ad(rec_offs_nth_extern(offsets, - big_rec_vec->fields[i].field_no)); - { - ulint local_len; - field_ref = rec_get_nth_field( - rec, offsets, big_rec_vec->fields[i].field_no, - &local_len); - ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE); - local_len -= BTR_EXTERN_FIELD_REF_SIZE; - field_ref += local_len; - } + field_ref = btr_rec_get_field_ref( + rec, offsets, big_rec_vec->fields[i].field_no); +#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG + /* A zero BLOB pointer should have been initially inserted. */ + ut_a(!memcmp(field_ref, field_ref_zero, + BTR_EXTERN_FIELD_REF_SIZE)); +#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ extern_len = big_rec_vec->fields[i].len; UNIV_MEM_ASSERT_RW(big_rec_vec->fields[i].data, extern_len); @@ -3941,6 +4097,11 @@ btr_store_big_rec_extern_fields( } if (prev_page_no == FIL_NULL) { + btr_blob_dbg_add_blob( + rec, big_rec_vec->fields[i] + .field_no, page_no, index, + "store"); + mach_write_to_4(field_ref + BTR_EXTERN_SPACE_ID, space_id); @@ -4016,6 +4177,11 @@ next_zip_page: MLOG_4BYTES, &mtr); if (prev_page_no == FIL_NULL) { + btr_blob_dbg_add_blob( + rec, big_rec_vec->fields[i] + .field_no, page_no, index, + "store"); + mlog_write_ulint(field_ref + BTR_EXTERN_SPACE_ID, space_id, @@ -4048,6 +4214,23 @@ next_zip_page: mem_heap_free(heap); } +#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG + /* All pointers to externally stored columns in the record + must be valid. */ + for (i = 0; i < rec_offs_n_fields(offsets); i++) { + if (!rec_offs_nth_extern(offsets, i)) { + continue; + } + + field_ref = btr_rec_get_field_ref(rec, offsets, i); + + /* The pointer must not be zero. */ + ut_a(0 != memcmp(field_ref, field_ref_zero, + BTR_EXTERN_FIELD_REF_SIZE)); + /* The column must not be disowned by this record. */ + ut_a(!(field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_OWNER_FLAG)); + } +#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ return(DB_SUCCESS); } @@ -4070,6 +4253,7 @@ btr_check_blob_fil_page_type( if (UNIV_UNLIKELY(type != FIL_PAGE_TYPE_BLOB)) { ulint flags = fil_space_get_flags(space_id); +#ifndef UNIV_DEBUG /* Improve debug test coverage */ if (UNIV_LIKELY ((flags & DICT_TF_FORMAT_MASK) == DICT_TF_FORMAT_51)) { /* Old versions of InnoDB did not initialize @@ -4078,6 +4262,7 @@ btr_check_blob_fil_page_type( a BLOB page that is in Antelope format.*/ return; } +#endif /* !UNIV_DEBUG */ ut_print_timestamp(stderr); fprintf(stderr, @@ -4127,23 +4312,13 @@ btr_free_externally_stored_field( ulint page_no; ulint next_page_no; mtr_t mtr; -#ifdef UNIV_DEBUG + ut_ad(mtr_memo_contains(local_mtr, dict_index_get_lock(index), MTR_MEMO_X_LOCK)); ut_ad(mtr_memo_contains_page(local_mtr, field_ref, MTR_MEMO_PAGE_X_FIX)); ut_ad(!rec || rec_offs_validate(rec, index, offsets)); - - if (rec) { - ulint local_len; - const byte* f = rec_get_nth_field(rec, offsets, - i, &local_len); - ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE); - local_len -= BTR_EXTERN_FIELD_REF_SIZE; - f += local_len; - ut_ad(f == field_ref); - } -#endif /* UNIV_DEBUG */ + ut_ad(!rec || field_ref == btr_rec_get_field_ref(rec, offsets, i)); if (UNIV_UNLIKELY(!memcmp(field_ref, field_ref_zero, BTR_EXTERN_FIELD_REF_SIZE))) { @@ -4175,6 +4350,37 @@ btr_free_externally_stored_field( rec_zip_size = 0; } +#ifdef UNIV_BLOB_DEBUG + if (!(field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_OWNER_FLAG) + && !((field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_INHERITED_FLAG) + && (rb_ctx == RB_NORMAL || rb_ctx == RB_RECOVERY))) { + /* This off-page column will be freed. + Check that no references remain. */ + + btr_blob_dbg_t b; + + b.blob_page_no = mach_read_from_4( + field_ref + BTR_EXTERN_PAGE_NO); + + if (rec) { + /* Remove the reference from the record to the + BLOB. If the BLOB were not freed, the + reference would be removed when the record is + removed. Freeing the BLOB will overwrite the + BTR_EXTERN_PAGE_NO in the field_ref of the + record with FIL_NULL, which would make the + btr_blob_dbg information inconsistent with the + record. */ + b.ref_page_no = page_get_page_no(page_align(rec)); + b.ref_heap_no = page_rec_get_heap_no(rec); + b.ref_field_no = i; + btr_blob_dbg_rbt_delete(index, &b, "free"); + } + + btr_blob_dbg_assert_empty(index, b.blob_page_no); + } +#endif /* UNIV_BLOB_DEBUG */ + for (;;) { #ifdef UNIV_SYNC_DEBUG buf_block_t* rec_block; @@ -4308,13 +4514,8 @@ btr_rec_free_externally_stored_fields( for (i = 0; i < n_fields; i++) { if (rec_offs_nth_extern(offsets, i)) { - ulint len; - byte* data - = rec_get_nth_field(rec, offsets, i, &len); - ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE); - btr_free_externally_stored_field( - index, data + len - BTR_EXTERN_FIELD_REF_SIZE, + index, btr_rec_get_field_ref(rec, offsets, i), rec, offsets, page_zip, i, rb_ctx, mtr); } } @@ -4426,27 +4627,45 @@ btr_copy_blob_prefix( /*******************************************************************//** Copies the prefix of a compressed BLOB. The clustered index record -that points to this BLOB must be protected by a lock or a page latch. */ +that points to this BLOB must be protected by a lock or a page latch. +@return number of bytes written to buf */ static -void +ulint btr_copy_zblob_prefix( /*==================*/ - z_stream* d_stream,/*!< in/out: the decompressing stream */ + byte* buf, /*!< out: the externally stored part of + the field, or a prefix of it */ + ulint len, /*!< in: length of buf, in bytes */ ulint zip_size,/*!< in: compressed BLOB page size */ ulint space_id,/*!< in: space id of the BLOB pages */ ulint page_no,/*!< in: page number of the first BLOB page */ ulint offset) /*!< in: offset on the first BLOB page */ { - ulint page_type = FIL_PAGE_TYPE_ZBLOB; + ulint page_type = FIL_PAGE_TYPE_ZBLOB; + mem_heap_t* heap; + int err; + z_stream d_stream; + + d_stream.next_out = buf; + d_stream.avail_out = len; + d_stream.next_in = Z_NULL; + d_stream.avail_in = 0; + + /* Zlib inflate needs 32 kilobytes for the default + window size, plus a few kilobytes for small objects. */ + heap = mem_heap_create(40000); + page_zip_set_alloc(&d_stream, heap); ut_ad(ut_is_2pow(zip_size)); ut_ad(zip_size >= PAGE_ZIP_MIN_SIZE); ut_ad(zip_size <= UNIV_PAGE_SIZE); ut_ad(space_id); + err = inflateInit(&d_stream); + ut_a(err == Z_OK); + for (;;) { buf_page_t* bpage; - int err; ulint next_page_no; /* There is no latch on bpage directly. Instead, @@ -4462,7 +4681,7 @@ btr_copy_zblob_prefix( " compressed BLOB" " page %lu space %lu\n", (ulong) page_no, (ulong) space_id); - return; + goto func_exit; } if (UNIV_UNLIKELY @@ -4488,13 +4707,13 @@ btr_copy_zblob_prefix( offset += 4; } - d_stream->next_in = bpage->zip.data + offset; - d_stream->avail_in = zip_size - offset; + d_stream.next_in = bpage->zip.data + offset; + d_stream.avail_in = zip_size - offset; - err = inflate(d_stream, Z_NO_FLUSH); + err = inflate(&d_stream, Z_NO_FLUSH); switch (err) { case Z_OK: - if (!d_stream->avail_out) { + if (!d_stream.avail_out) { goto end_of_blob; } break; @@ -4511,13 +4730,13 @@ inflate_error: " compressed BLOB" " page %lu space %lu returned %d (%s)\n", (ulong) page_no, (ulong) space_id, - err, d_stream->msg); + err, d_stream.msg); case Z_BUF_ERROR: goto end_of_blob; } if (next_page_no == FIL_NULL) { - if (!d_stream->avail_in) { + if (!d_stream.avail_in) { ut_print_timestamp(stderr); fprintf(stderr, " InnoDB: unexpected end of" @@ -4526,7 +4745,7 @@ inflate_error: (ulong) page_no, (ulong) space_id); } else { - err = inflate(d_stream, Z_FINISH); + err = inflate(&d_stream, Z_FINISH); switch (err) { case Z_STREAM_END: case Z_BUF_ERROR: @@ -4538,7 +4757,7 @@ inflate_error: end_of_blob: buf_page_release_zip(bpage); - return; + goto func_exit; } buf_page_release_zip(bpage); @@ -4550,6 +4769,12 @@ end_of_blob: offset = FIL_PAGE_NEXT; page_type = FIL_PAGE_TYPE_ZBLOB2; } + +func_exit: + inflateEnd(&d_stream); + mem_heap_free(heap); + UNIV_MEM_ASSERT_RW(buf, d_stream.total_out); + return(d_stream.total_out); } /*******************************************************************//** @@ -4575,28 +4800,8 @@ btr_copy_externally_stored_field_prefix_low( } if (UNIV_UNLIKELY(zip_size)) { - int err; - z_stream d_stream; - mem_heap_t* heap; - - /* Zlib inflate needs 32 kilobytes for the default - window size, plus a few kilobytes for small objects. */ - heap = mem_heap_create(40000); - page_zip_set_alloc(&d_stream, heap); - - err = inflateInit(&d_stream); - ut_a(err == Z_OK); - - d_stream.next_out = buf; - d_stream.avail_out = len; - d_stream.avail_in = 0; - - btr_copy_zblob_prefix(&d_stream, zip_size, - space_id, page_no, offset); - inflateEnd(&d_stream); - mem_heap_free(heap); - UNIV_MEM_ASSERT_RW(buf, d_stream.total_out); - return(d_stream.total_out); + return(btr_copy_zblob_prefix(buf, len, zip_size, + space_id, page_no, offset)); } else { return(btr_copy_blob_prefix(buf, len, space_id, page_no, offset)); diff --git a/storage/innodb_plugin/btr/btr0sea.c b/storage/innodb_plugin/btr/btr0sea.c index 035fdbb61d2..cd0eadbb1b8 100644 --- a/storage/innodb_plugin/btr/btr0sea.c +++ b/storage/innodb_plugin/btr/btr0sea.c @@ -141,7 +141,7 @@ btr_search_check_free_space_in_heap(void) be enough free space in the hash table. */ if (heap->free_block == NULL) { - buf_block_t* block = buf_block_alloc(0); + buf_block_t* block = buf_block_alloc(); rw_lock_x_lock(&btr_search_latch); @@ -1201,8 +1201,8 @@ btr_search_drop_page_hash_when_freed( having to fear a deadlock. */ block = buf_page_get_gen(space, zip_size, page_no, RW_S_LATCH, NULL, - BUF_GET_IF_IN_POOL, __FILE__, __LINE__, - &mtr); + BUF_PEEK_IF_IN_POOL, __FILE__, __LINE__, + &mtr); /* Because the buffer pool mutex was released by buf_page_peek_if_search_hashed(), it is possible that the block was removed from the buffer pool by another thread diff --git a/storage/innodb_plugin/buf/buf0buddy.c b/storage/innodb_plugin/buf/buf0buddy.c index ee5a569c3ff..63c99571510 100644 --- a/storage/innodb_plugin/buf/buf0buddy.c +++ b/storage/innodb_plugin/buf/buf0buddy.c @@ -327,7 +327,7 @@ buf_buddy_alloc_low( /* Try replacing an uncompressed page in the buffer pool. */ buf_pool_mutex_exit(); - block = buf_LRU_get_free_block(0); + block = buf_LRU_get_free_block(); *lru = TRUE; buf_pool_mutex_enter(); diff --git a/storage/innodb_plugin/buf/buf0buf.c b/storage/innodb_plugin/buf/buf0buf.c index dac416f9472..14ec7b75911 100644 --- a/storage/innodb_plugin/buf/buf0buf.c +++ b/storage/innodb_plugin/buf/buf0buf.c @@ -657,9 +657,9 @@ buf_block_init( block->modify_clock = 0; -#ifdef UNIV_DEBUG_FILE_ACCESSES +#if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG block->page.file_page_was_freed = FALSE; -#endif /* UNIV_DEBUG_FILE_ACCESSES */ +#endif /* UNIV_DEBUG_FILE_ACCESSES || UNIV_DEBUG */ block->check_index_page_at_flush = FALSE; block->index = NULL; @@ -1283,7 +1283,7 @@ shrink_again: buf_LRU_make_block_old(&block->page); dirty++; - } else if (buf_LRU_free_block(&block->page, TRUE, NULL) + } else if (buf_LRU_free_block(&block->page, TRUE) != BUF_LRU_FREED) { nonfree++; } @@ -1600,7 +1600,7 @@ buf_page_peek_if_search_hashed( return(is_hashed); } -#ifdef UNIV_DEBUG_FILE_ACCESSES +#if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG /********************************************************************//** Sets file_page_was_freed TRUE if the page is found in the buffer pool. This function should be called when we free a file page and want the @@ -1621,6 +1621,8 @@ buf_page_set_file_page_was_freed( bpage = buf_page_hash_get(space, offset); if (bpage) { + /* bpage->file_page_was_freed can already hold + when this code is invoked from dict_drop_index_tree() */ bpage->file_page_was_freed = TRUE; } @@ -1656,7 +1658,7 @@ buf_page_reset_file_page_was_freed( return(bpage); } -#endif /* UNIV_DEBUG_FILE_ACCESSES */ +#endif /* UNIV_DEBUG_FILE_ACCESSES || UNIV_DEBUG */ /********************************************************************//** Get read access to a compressed page (usually of type @@ -1729,8 +1731,7 @@ err_exit: mutex_enter(block_mutex); /* Discard the uncompressed page frame if possible. */ - if (buf_LRU_free_block(bpage, FALSE, NULL) - == BUF_LRU_FREED) { + if (buf_LRU_free_block(bpage, FALSE) == BUF_LRU_FREED) { mutex_exit(block_mutex); goto lookup; @@ -1754,7 +1755,7 @@ got_block: buf_page_set_accessed_make_young(bpage, access_time); -#ifdef UNIV_DEBUG_FILE_ACCESSES +#if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG ut_a(!bpage->file_page_was_freed); #endif @@ -1892,16 +1893,19 @@ buf_block_align( /* TODO: protect buf_pool->chunks with a mutex (it will currently remain constant after buf_pool_init()) */ for (chunk = buf_pool->chunks, i = buf_pool->n_chunks; i--; chunk++) { - lint offs = ptr - chunk->blocks->frame; + ulint offs; - if (UNIV_UNLIKELY(offs < 0)) { + if (UNIV_UNLIKELY(ptr < chunk->blocks->frame)) { continue; } + /* else */ + + offs = ptr - chunk->blocks->frame; offs >>= UNIV_PAGE_SIZE_SHIFT; - if (UNIV_LIKELY((ulint) offs < chunk->size)) { + if (UNIV_LIKELY(offs < chunk->size)) { buf_block_t* block = &chunk->blocks[offs]; /* The function buf_chunk_init() invokes @@ -2027,7 +2031,7 @@ buf_page_get_gen( ulint rw_latch,/*!< in: RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH */ buf_block_t* guess, /*!< in: guessed block or NULL */ ulint mode, /*!< in: BUF_GET, BUF_GET_IF_IN_POOL, - BUF_GET_NO_LATCH */ + BUF_PEEK_IF_IN_POOL, BUF_GET_NO_LATCH */ const char* file, /*!< in: file name */ ulint line, /*!< in: line where called */ mtr_t* mtr) /*!< in: mini-transaction */ @@ -2043,9 +2047,19 @@ buf_page_get_gen( ut_ad((rw_latch == RW_S_LATCH) || (rw_latch == RW_X_LATCH) || (rw_latch == RW_NO_LATCH)); - ut_ad((mode != BUF_GET_NO_LATCH) || (rw_latch == RW_NO_LATCH)); - ut_ad((mode == BUF_GET) || (mode == BUF_GET_IF_IN_POOL) - || (mode == BUF_GET_NO_LATCH)); +#ifdef UNIV_DEBUG + switch (mode) { + case BUF_GET_NO_LATCH: + ut_ad(rw_latch == RW_NO_LATCH); + break; + case BUF_GET: + case BUF_GET_IF_IN_POOL: + case BUF_PEEK_IF_IN_POOL: + break; + default: + ut_error; + } +#endif /* UNIV_DEBUG */ ut_ad(zip_size == fil_space_get_zip_size(space)); ut_ad(ut_is_2pow(zip_size)); #ifndef UNIV_LOG_DEBUG @@ -2087,7 +2101,8 @@ loop2: buf_pool_mutex_exit(); - if (mode == BUF_GET_IF_IN_POOL) { + if (mode == BUF_GET_IF_IN_POOL + || mode == BUF_PEEK_IF_IN_POOL) { return(NULL); } @@ -2126,7 +2141,8 @@ loop2: must_read = buf_block_get_io_fix(block) == BUF_IO_READ; - if (must_read && mode == BUF_GET_IF_IN_POOL) { + if (must_read && (mode == BUF_GET_IF_IN_POOL + || mode == BUF_PEEK_IF_IN_POOL)) { /* The page is only being read to buffer */ buf_pool_mutex_exit(); @@ -2165,7 +2181,7 @@ wait_until_unfixed: buf_pool_mutex_exit(); mutex_exit(&buf_pool_zip_mutex); - block = buf_LRU_get_free_block(0); + block = buf_LRU_get_free_block(); ut_a(block); buf_pool_mutex_enter(); @@ -2244,6 +2260,7 @@ wait_until_unfixed: mutex_exit(&buf_pool_zip_mutex); buf_pool->n_pend_unzip++; + bpage->state = BUF_BLOCK_ZIP_FREE; buf_buddy_free(bpage, sizeof *bpage); buf_pool_mutex_exit(); @@ -2291,8 +2308,7 @@ wait_until_unfixed: /* Try to evict the block from the buffer pool, to use the insert buffer as much as possible. */ - if (buf_LRU_free_block(&block->page, TRUE, NULL) - == BUF_LRU_FREED) { + if (buf_LRU_free_block(&block->page, TRUE) == BUF_LRU_FREED) { buf_pool_mutex_exit(); mutex_exit(&block->mutex); fprintf(stderr, @@ -2321,9 +2337,11 @@ wait_until_unfixed: buf_pool_mutex_exit(); - buf_page_set_accessed_make_young(&block->page, access_time); + if (UNIV_LIKELY(mode != BUF_PEEK_IF_IN_POOL)) { + buf_page_set_accessed_make_young(&block->page, access_time); + } -#ifdef UNIV_DEBUG_FILE_ACCESSES +#if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG ut_a(!block->page.file_page_was_freed); #endif @@ -2374,7 +2392,7 @@ wait_until_unfixed: mtr_memo_push(mtr, block, fix_type); - if (!access_time) { + if (UNIV_LIKELY(mode != BUF_PEEK_IF_IN_POOL) && !access_time) { /* In the case of a first access, try to apply linear read-ahead */ @@ -2481,7 +2499,7 @@ buf_page_optimistic_get( ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ -#ifdef UNIV_DEBUG_FILE_ACCESSES +#if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG ut_a(block->page.file_page_was_freed == FALSE); #endif if (UNIV_UNLIKELY(!access_time)) { @@ -2589,7 +2607,7 @@ buf_page_get_known_nowait( ut_a(block->page.buf_fix_count > 0); ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ -#ifdef UNIV_DEBUG_FILE_ACCESSES +#if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG ut_a(block->page.file_page_was_freed == FALSE); #endif @@ -2672,9 +2690,9 @@ buf_page_try_get_func( ut_a(block->page.buf_fix_count > 0); ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ -#ifdef UNIV_DEBUG_FILE_ACCESSES +#if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG ut_a(block->page.file_page_was_freed == FALSE); -#endif /* UNIV_DEBUG_FILE_ACCESSES */ +#endif /* UNIV_DEBUG_FILE_ACCESSES || UNIV_DEBUG */ buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK); buf_pool->stat.n_page_gets++; @@ -2703,9 +2721,9 @@ buf_page_init_low( bpage->newest_modification = 0; bpage->oldest_modification = 0; HASH_INVALIDATE(bpage, hash); -#ifdef UNIV_DEBUG_FILE_ACCESSES +#if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG bpage->file_page_was_freed = FALSE; -#endif /* UNIV_DEBUG_FILE_ACCESSES */ +#endif /* UNIV_DEBUG_FILE_ACCESSES || UNIV_DEBUG */ } /********************************************************************//** @@ -2829,7 +2847,7 @@ buf_page_init_for_read( && UNIV_LIKELY(!recv_recovery_is_on())) { block = NULL; } else { - block = buf_LRU_get_free_block(0); + block = buf_LRU_get_free_block(); ut_ad(block); } @@ -2923,6 +2941,7 @@ err_exit: && UNIV_LIKELY_NULL(buf_page_hash_get(space, offset))) { /* The block was added by some other thread. */ + bpage->state = BUF_BLOCK_ZIP_FREE; buf_buddy_free(bpage, sizeof *bpage); buf_buddy_free(data, zip_size); @@ -3001,7 +3020,7 @@ buf_page_create( ut_ad(mtr->state == MTR_ACTIVE); ut_ad(space || !zip_size); - free_block = buf_LRU_get_free_block(0); + free_block = buf_LRU_get_free_block(); buf_pool_mutex_enter(); @@ -3011,9 +3030,9 @@ buf_page_create( #ifdef UNIV_IBUF_COUNT_DEBUG ut_a(ibuf_count_get(space, offset) == 0); #endif -#ifdef UNIV_DEBUG_FILE_ACCESSES +#if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG block->page.file_page_was_freed = FALSE; -#endif /* UNIV_DEBUG_FILE_ACCESSES */ +#endif /* UNIV_DEBUG_FILE_ACCESSES || UNIV_DEBUG */ /* Page can be found in buf_pool */ buf_pool_mutex_exit(); diff --git a/storage/innodb_plugin/buf/buf0lru.c b/storage/innodb_plugin/buf/buf0lru.c index e4cf218bf2e..a69b2658c51 100644 --- a/storage/innodb_plugin/buf/buf0lru.c +++ b/storage/innodb_plugin/buf/buf0lru.c @@ -246,71 +246,75 @@ buf_LRU_drop_page_hash_for_tablespace( page_arr = ut_malloc(sizeof(ulint) * BUF_LRU_DROP_SEARCH_HASH_SIZE); buf_pool_mutex_enter(); + num_entries = 0; scan_again: - num_entries = 0; bpage = UT_LIST_GET_LAST(buf_pool->LRU); while (bpage != NULL) { - mutex_t* block_mutex = buf_page_get_mutex(bpage); buf_page_t* prev_bpage; + ibool is_fixed; - mutex_enter(block_mutex); prev_bpage = UT_LIST_GET_PREV(LRU, bpage); ut_a(buf_page_in_file(bpage)); if (buf_page_get_state(bpage) != BUF_BLOCK_FILE_PAGE || bpage->space != id - || bpage->buf_fix_count > 0 || bpage->io_fix != BUF_IO_NONE) { - /* We leave the fixed pages as is in this scan. - To be dealt with later in the final scan. */ - mutex_exit(block_mutex); - goto next_page; + /* Compressed pages are never hashed. + Skip blocks of other tablespaces. + Skip I/O-fixed blocks (to be dealt with later). */ +next_page: + bpage = prev_bpage; + continue; } - if (((buf_block_t*) bpage)->is_hashed) { + mutex_enter(&((buf_block_t*) bpage)->mutex); + is_fixed = bpage->buf_fix_count > 0 + || !((buf_block_t*) bpage)->is_hashed; + mutex_exit(&((buf_block_t*) bpage)->mutex); - /* Store the offset(i.e.: page_no) in the array - so that we can drop hash index in a batch - later. */ - page_arr[num_entries] = bpage->offset; - mutex_exit(block_mutex); - ut_a(num_entries < BUF_LRU_DROP_SEARCH_HASH_SIZE); - ++num_entries; + if (is_fixed) { + goto next_page; + } - if (num_entries < BUF_LRU_DROP_SEARCH_HASH_SIZE) { - goto next_page; - } - /* Array full. We release the buf_pool_mutex to - obey the latching order. */ - buf_pool_mutex_exit(); - - buf_LRU_drop_page_hash_batch(id, zip_size, page_arr, - num_entries); - num_entries = 0; - buf_pool_mutex_enter(); - } else { - mutex_exit(block_mutex); + /* Store the page number so that we can drop the hash + index in a batch later. */ + page_arr[num_entries] = bpage->offset; + ut_a(num_entries < BUF_LRU_DROP_SEARCH_HASH_SIZE); + ++num_entries; + + if (num_entries < BUF_LRU_DROP_SEARCH_HASH_SIZE) { + goto next_page; } -next_page: - /* Note that we may have released the buf_pool mutex - above after reading the prev_bpage during processing - of a page_hash_batch (i.e.: when the array was full). - This means that prev_bpage can change in LRU list. - This is OK because this function is a 'best effort' - to drop as many search hash entries as possible and - it does not guarantee that ALL such entries will be - dropped. */ - bpage = prev_bpage; + /* Array full. We release the buf_pool_mutex to + obey the latching order. */ + buf_pool_mutex_exit(); + buf_LRU_drop_page_hash_batch(id, zip_size, page_arr, + num_entries); + buf_pool_mutex_enter(); + num_entries = 0; + + /* Note that we released the buf_pool mutex above + after reading the prev_bpage during processing of a + page_hash_batch (i.e.: when the array was full). + Because prev_bpage could belong to a compressed-only + block, it may have been relocated, and thus the + pointer cannot be trusted. Because bpage is of type + buf_block_t, it is safe to dereference. + + bpage can change in the LRU list. This is OK because + this function is a 'best effort' to drop as many + search hash entries as possible and it does not + guarantee that ALL such entries will be dropped. */ /* If, however, bpage has been removed from LRU list to the free list then we should restart the scan. bpage->state is protected by buf_pool mutex. */ - if (bpage && !buf_page_in_file(bpage)) { - ut_a(num_entries == 0); + if (bpage + && buf_page_get_state(bpage) != BUF_BLOCK_FILE_PAGE) { goto scan_again; } } @@ -575,7 +579,7 @@ buf_LRU_free_from_unzip_LRU_list( ut_ad(block->page.in_LRU_list); mutex_enter(&block->mutex); - freed = buf_LRU_free_block(&block->page, FALSE, NULL); + freed = buf_LRU_free_block(&block->page, FALSE); mutex_exit(&block->mutex); switch (freed) { @@ -636,7 +640,7 @@ buf_LRU_free_from_common_LRU_list( mutex_enter(block_mutex); accessed = buf_page_is_accessed(bpage); - freed = buf_LRU_free_block(bpage, TRUE, NULL); + freed = buf_LRU_free_block(bpage, TRUE); mutex_exit(block_mutex); switch (freed) { @@ -798,10 +802,8 @@ LRU list to the free list. @return the free control block, in state BUF_BLOCK_READY_FOR_USE */ UNIV_INTERN buf_block_t* -buf_LRU_get_free_block( -/*===================*/ - ulint zip_size) /*!< in: compressed page size in bytes, - or 0 if uncompressed tablespace */ +buf_LRU_get_free_block(void) +/*========================*/ { buf_block_t* block = NULL; ibool freed; @@ -877,26 +879,10 @@ loop: /* If there is a block in the free list, take it */ block = buf_LRU_get_free_only(); - if (block) { - -#ifdef UNIV_DEBUG - block->page.zip.m_start = -#endif /* UNIV_DEBUG */ - block->page.zip.m_end = - block->page.zip.m_nonempty = - block->page.zip.n_blobs = 0; - - if (UNIV_UNLIKELY(zip_size)) { - ibool lru; - page_zip_set_size(&block->page.zip, zip_size); - block->page.zip.data = buf_buddy_alloc(zip_size, &lru); - UNIV_MEM_DESC(block->page.zip.data, zip_size, block); - } else { - page_zip_set_size(&block->page.zip, 0); - block->page.zip.data = NULL; - } + buf_pool_mutex_exit(); - buf_pool_mutex_exit(); + if (block) { + memset(&block->page.zip, 0, sizeof block->page.zip); if (started_monitor) { srv_print_innodb_monitor = mon_value_was; @@ -908,8 +894,6 @@ loop: /* If no block was in the free list, search from the end of the LRU list and try to free a block there */ - buf_pool_mutex_exit(); - freed = buf_LRU_search_and_free_block(n_iterations); if (freed > 0) { @@ -1378,12 +1362,8 @@ enum buf_lru_free_block_status buf_LRU_free_block( /*===============*/ buf_page_t* bpage, /*!< in: block to be freed */ - ibool zip, /*!< in: TRUE if should remove also the + ibool zip) /*!< in: TRUE if should remove also the compressed page of an uncompressed page */ - ibool* buf_pool_mutex_released) - /*!< in: pointer to a variable that will - be assigned TRUE if buf_pool_mutex - was temporarily released, or NULL */ { buf_page_t* b = NULL; mutex_t* block_mutex = buf_page_get_mutex(bpage); @@ -1554,10 +1534,6 @@ alloc: b->io_fix = BUF_IO_READ; } - if (buf_pool_mutex_released) { - *buf_pool_mutex_released = TRUE; - } - buf_pool_mutex_exit(); mutex_exit(block_mutex); @@ -1827,6 +1803,7 @@ buf_LRU_block_remove_hashed_page( buf_pool_mutex_exit_forbid(); buf_buddy_free(bpage->zip.data, page_zip_get_size(&bpage->zip)); + bpage->state = BUF_BLOCK_ZIP_FREE; buf_buddy_free(bpage, sizeof(*bpage)); buf_pool_mutex_exit_allow(); UNIV_MEM_UNDESC(bpage); diff --git a/storage/innodb_plugin/dict/dict0dict.c b/storage/innodb_plugin/dict/dict0dict.c index 9474ab3582a..a8787bbda61 100644 --- a/storage/innodb_plugin/dict/dict0dict.c +++ b/storage/innodb_plugin/dict/dict0dict.c @@ -1669,6 +1669,12 @@ undo_size_ok: new_index->heap, (1 + dict_index_get_n_unique(new_index)) * sizeof(ib_int64_t)); + + new_index->stat_n_non_null_key_vals = mem_heap_zalloc( + new_index->heap, + (1 + dict_index_get_n_unique(new_index)) + * sizeof(*new_index->stat_n_non_null_key_vals)); + /* Give some sensible values to stat_n_... in case we do not calculate statistics quickly enough */ @@ -4291,6 +4297,10 @@ dict_update_statistics( for (i = dict_index_get_n_unique(index); i; ) { index->stat_n_diff_key_vals[i--] = 1; } + + memset(index->stat_n_non_null_key_vals, 0, + (1 + dict_index_get_n_unique(index)) + * sizeof(*index->stat_n_non_null_key_vals)); } index = dict_table_get_next_index(index); diff --git a/storage/innodb_plugin/dict/dict0mem.c b/storage/innodb_plugin/dict/dict0mem.c index 3287247029f..aef815dd2f6 100644 --- a/storage/innodb_plugin/dict/dict0mem.c +++ b/storage/innodb_plugin/dict/dict0mem.c @@ -36,6 +36,9 @@ Created 1/8/1996 Heikki Tuuri #ifndef UNIV_HOTBACKUP # include "lock0lock.h" #endif /* !UNIV_HOTBACKUP */ +#ifdef UNIV_BLOB_DEBUG +# include "ut0rbt.h" +#endif /* UNIV_BLOB_DEBUG */ #define DICT_HEAP_SIZE 100 /*!< initial memory heap size when creating a table or index object */ @@ -316,6 +319,12 @@ dict_mem_index_free( { ut_ad(index); ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); +#ifdef UNIV_BLOB_DEBUG + if (index->blobs) { + mutex_free(&index->blobs_mutex); + rbt_free(index->blobs); + } +#endif /* UNIV_BLOB_DEBUG */ mem_heap_free(index->heap); } diff --git a/storage/innodb_plugin/fsp/fsp0fsp.c b/storage/innodb_plugin/fsp/fsp0fsp.c index e9d24b8fdf6..d091a14c474 100644 --- a/storage/innodb_plugin/fsp/fsp0fsp.c +++ b/storage/innodb_plugin/fsp/fsp0fsp.c @@ -3444,9 +3444,9 @@ fseg_free_page( fseg_free_page_low(seg_inode, space, zip_size, page, mtr); -#ifdef UNIV_DEBUG_FILE_ACCESSES +#if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG buf_page_set_file_page_was_freed(space, page); -#endif +#endif /* UNIV_DEBUG_FILE_ACCESSES || UNIV_DEBUG */ } /**********************************************************************//** @@ -3513,13 +3513,13 @@ fseg_free_extent( fsp_free_extent(space, zip_size, page, mtr); -#ifdef UNIV_DEBUG_FILE_ACCESSES +#if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG for (i = 0; i < FSP_EXTENT_SIZE; i++) { buf_page_set_file_page_was_freed(space, first_page_in_extent + i); } -#endif +#endif /* UNIV_DEBUG_FILE_ACCESSES || UNIV_DEBUG */ } /**********************************************************************//** diff --git a/storage/innodb_plugin/handler/ha_innodb.cc b/storage/innodb_plugin/handler/ha_innodb.cc index 18a25fd8c4e..481b89ce0d1 100644 --- a/storage/innodb_plugin/handler/ha_innodb.cc +++ b/storage/innodb_plugin/handler/ha_innodb.cc @@ -174,6 +174,25 @@ static char* internal_innobase_data_file_path = NULL; static char* innodb_version_str = (char*) INNODB_VERSION_STR; +/** Possible values for system variable "innodb_stats_method". The values +are defined the same as its corresponding MyISAM system variable +"myisam_stats_method"(see "myisam_stats_method_names"), for better usability */ +static const char* innodb_stats_method_names[] = { + "nulls_equal", + "nulls_unequal", + "nulls_ignored", + NullS +}; + +/** Used to define an enumerate type of the system variable innodb_stats_method. +This is the same as "myisam_stats_method_typelib" */ +static TYPELIB innodb_stats_method_typelib = { + array_elements(innodb_stats_method_names) - 1, + "innodb_stats_method_typelib", + innodb_stats_method_names, + NULL +}; + /* The following counter is used to convey information to InnoDB about server activity: in selects it is not sensible to call srv_active_wake_master_thread after each fetch or search, we only do @@ -7518,6 +7537,65 @@ innobase_get_mysql_key_number_for_index( return(0); } + +/*********************************************************************//** +Calculate Record Per Key value. Need to exclude the NULL value if +innodb_stats_method is set to "nulls_ignored" +@return estimated record per key value */ +static +ha_rows +innodb_rec_per_key( +/*===============*/ + dict_index_t* index, /*!< in: dict_index_t structure */ + ulint i, /*!< in: the column we are + calculating rec per key */ + ha_rows records) /*!< in: estimated total records */ +{ + ha_rows rec_per_key; + + ut_ad(i < dict_index_get_n_unique(index)); + + /* Note the stat_n_diff_key_vals[] stores the diff value with + n-prefix indexing, so it is always stat_n_diff_key_vals[i + 1] */ + if (index->stat_n_diff_key_vals[i + 1] == 0) { + + rec_per_key = records; + } else if (srv_innodb_stats_method == SRV_STATS_NULLS_IGNORED) { + ib_int64_t num_null; + + /* Number of rows with NULL value in this + field */ + num_null = records - index->stat_n_non_null_key_vals[i]; + + /* In theory, index->stat_n_non_null_key_vals[i] + should always be less than the number of records. + Since this is statistics value, the value could + have slight discrepancy. But we will make sure + the number of null values is not a negative number. */ + num_null = (num_null < 0) ? 0 : num_null; + + /* If the number of NULL values is the same as or + large than that of the distinct values, we could + consider that the table consists mostly of NULL value. + Set rec_per_key to 1. */ + if (index->stat_n_diff_key_vals[i + 1] <= num_null) { + rec_per_key = 1; + } else { + /* Need to exclude rows with NULL values from + rec_per_key calculation */ + rec_per_key = (ha_rows)( + (records - num_null) + / (index->stat_n_diff_key_vals[i + 1] + - num_null)); + } + } else { + rec_per_key = (ha_rows) + (records / index->stat_n_diff_key_vals[i + 1]); + } + + return(rec_per_key); +} + /*********************************************************************//** Returns statistics information of the table to the MySQL interpreter, in various fields of the handle object. */ @@ -7748,13 +7826,8 @@ ha_innobase::info_low( break; } - if (index->stat_n_diff_key_vals[j + 1] == 0) { - - rec_per_key = stats.records; - } else { - rec_per_key = (ha_rows)(stats.records / - index->stat_n_diff_key_vals[j + 1]); - } + rec_per_key = innodb_rec_per_key( + index, j, stats.records); /* Since MySQL seems to favor table scans too much over index searches, we pretend @@ -10945,6 +11018,13 @@ static MYSQL_SYSVAR_STR(change_buffering, innobase_change_buffering, innodb_change_buffering_validate, innodb_change_buffering_update, "inserts"); +static MYSQL_SYSVAR_ENUM(stats_method, srv_innodb_stats_method, + PLUGIN_VAR_RQCMDARG, + "Specifies how InnoDB index statistics collection code should " + "treat NULLs. Possible values are NULLS_EQUAL (default), " + "NULLS_UNEQUAL and NULLS_IGNORED", + NULL, NULL, SRV_STATS_NULLS_EQUAL, &innodb_stats_method_typelib); + #if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG static MYSQL_SYSVAR_UINT(change_buffering_debug, ibuf_debug, PLUGIN_VAR_RQCMDARG, @@ -10999,6 +11079,7 @@ static struct st_mysql_sys_var* innobase_system_variables[]= { MYSQL_SYSVAR(stats_on_metadata), MYSQL_SYSVAR(stats_sample_pages), MYSQL_SYSVAR(adaptive_hash_index), + MYSQL_SYSVAR(stats_method), MYSQL_SYSVAR(replication_delay), MYSQL_SYSVAR(status_file), MYSQL_SYSVAR(strict_mode), diff --git a/storage/innodb_plugin/handler/handler0alter.cc b/storage/innodb_plugin/handler/handler0alter.cc index 517445f7e69..dc1317d5c5a 100644 --- a/storage/innodb_plugin/handler/handler0alter.cc +++ b/storage/innodb_plugin/handler/handler0alter.cc @@ -782,10 +782,6 @@ err_exit: ut_ad(error == DB_SUCCESS); - /* We will need to rebuild index translation table. Set - valid index entry count in the translation table to zero */ - share->idx_trans_tbl.index_count = 0; - /* Commit the data dictionary transaction in order to release the table locks on the system tables. This means that if MySQL crashes while creating a new primary key inside @@ -911,6 +907,14 @@ error: } convert_error: + if (error == DB_SUCCESS) { + /* Build index is successful. We will need to + rebuild index translation table. Reset the + index entry count in the translation table + to zero, so that translation table will be rebuilt */ + share->idx_trans_tbl.index_count = 0; + } + error = convert_error_code_to_mysql(error, innodb_table->flags, user_thd); diff --git a/storage/innodb_plugin/ibuf/ibuf0ibuf.c b/storage/innodb_plugin/ibuf/ibuf0ibuf.c index 701e8f0ef04..23981ac388e 100644 --- a/storage/innodb_plugin/ibuf/ibuf0ibuf.c +++ b/storage/innodb_plugin/ibuf/ibuf0ibuf.c @@ -1878,9 +1878,9 @@ ibuf_remove_free_page(void) fseg_free_page(header_page + IBUF_HEADER + IBUF_TREE_SEG_HEADER, IBUF_SPACE_ID, page_no, &mtr); -#ifdef UNIV_DEBUG_FILE_ACCESSES +#if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG buf_page_reset_file_page_was_freed(IBUF_SPACE_ID, page_no); -#endif +#endif /* UNIV_DEBUG_FILE_ACCESSES || UNIV_DEBUG */ ibuf_enter(); @@ -1922,9 +1922,9 @@ ibuf_remove_free_page(void) ibuf_bitmap_page_set_bits( bitmap_page, page_no, zip_size, IBUF_BITMAP_IBUF, FALSE, &mtr); -#ifdef UNIV_DEBUG_FILE_ACCESSES +#if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG buf_page_set_file_page_was_freed(IBUF_SPACE_ID, page_no); -#endif +#endif /* UNIV_DEBUG_FILE_ACCESSES || UNIV_DEBUG */ mtr_commit(&mtr); mutex_exit(&ibuf_mutex); diff --git a/storage/innodb_plugin/include/btr0btr.h b/storage/innodb_plugin/include/btr0btr.h index dde3a0bab69..5aa02694e0e 100644 --- a/storage/innodb_plugin/include/btr0btr.h +++ b/storage/innodb_plugin/include/btr0btr.h @@ -81,6 +81,91 @@ UNIQUE definition on secondary indexes when we decide if we can use the insert buffer to speed up inserts */ #define BTR_IGNORE_SEC_UNIQUE 2048 +#ifdef UNIV_BLOB_DEBUG +# include "ut0rbt.h" +/** An index->blobs entry for keeping track of off-page column references */ +struct btr_blob_dbg_struct +{ + unsigned blob_page_no:32; /*!< first BLOB page number */ + unsigned ref_page_no:32; /*!< referring page number */ + unsigned ref_heap_no:16; /*!< referring heap number */ + unsigned ref_field_no:10; /*!< referring field number */ + unsigned owner:1; /*!< TRUE if BLOB owner */ + unsigned always_owner:1; /*!< TRUE if always + has been the BLOB owner; + reset to TRUE on B-tree + page splits and merges */ + unsigned del:1; /*!< TRUE if currently + delete-marked */ +}; + +/**************************************************************//** +Add a reference to an off-page column to the index->blobs map. */ +UNIV_INTERN +void +btr_blob_dbg_add_blob( +/*==================*/ + const rec_t* rec, /*!< in: clustered index record */ + ulint field_no, /*!< in: number of off-page column */ + ulint page_no, /*!< in: start page of the column */ + dict_index_t* index, /*!< in/out: index tree */ + const char* ctx) /*!< in: context (for logging) */ + __attribute__((nonnull)); +/**************************************************************//** +Display the references to off-page columns. +This function is to be called from a debugger, +for example when a breakpoint on ut_dbg_assertion_failed is hit. */ +UNIV_INTERN +void +btr_blob_dbg_print( +/*===============*/ + const dict_index_t* index) /*!< in: index tree */ + __attribute__((nonnull)); +/**************************************************************//** +Check that there are no references to off-page columns from or to +the given page. Invoked when freeing or clearing a page. +@return TRUE when no orphan references exist */ +UNIV_INTERN +ibool +btr_blob_dbg_is_empty( +/*==================*/ + dict_index_t* index, /*!< in: index */ + ulint page_no) /*!< in: page number */ + __attribute__((nonnull, warn_unused_result)); + +/**************************************************************//** +Modify the 'deleted' flag of a record. */ +UNIV_INTERN +void +btr_blob_dbg_set_deleted_flag( +/*==========================*/ + const rec_t* rec, /*!< in: record */ + dict_index_t* index, /*!< in/out: index */ + const ulint* offsets,/*!< in: rec_get_offs(rec, index) */ + ibool del) /*!< in: TRUE=deleted, FALSE=exists */ + __attribute__((nonnull)); +/**************************************************************//** +Change the ownership of an off-page column. */ +UNIV_INTERN +void +btr_blob_dbg_owner( +/*===============*/ + const rec_t* rec, /*!< in: record */ + dict_index_t* index, /*!< in/out: index */ + const ulint* offsets,/*!< in: rec_get_offs(rec, index) */ + ulint i, /*!< in: ith field in rec */ + ibool own) /*!< in: TRUE=owned, FALSE=disowned */ + __attribute__((nonnull)); +/** Assert that there are no BLOB references to or from the given page. */ +# define btr_blob_dbg_assert_empty(index, page_no) \ + ut_a(btr_blob_dbg_is_empty(index, page_no)) +#else /* UNIV_BLOB_DEBUG */ +# define btr_blob_dbg_add_blob(rec, field_no, page, index, ctx) ((void) 0) +# define btr_blob_dbg_set_deleted_flag(rec, index, offsets, del)((void) 0) +# define btr_blob_dbg_owner(rec, index, offsets, i, val) ((void) 0) +# define btr_blob_dbg_assert_empty(index, page_no) ((void) 0) +#endif /* UNIV_BLOB_DEBUG */ + /**************************************************************//** Gets the root node of a tree and x-latches it. @return root page, x-latched */ diff --git a/storage/innodb_plugin/include/btr0cur.h b/storage/innodb_plugin/include/btr0cur.h index b477ad0320a..ece3621fa97 100644 --- a/storage/innodb_plugin/include/btr0cur.h +++ b/storage/innodb_plugin/include/btr0cur.h @@ -478,7 +478,10 @@ btr_estimate_n_rows_in_range( /*******************************************************************//** Estimates the number of different key values in a given index, for each n-column prefix of the index where n <= dict_index_get_n_unique(index). -The estimates are stored in the array index->stat_n_diff_key_vals. */ +The estimates are stored in the array index->stat_n_diff_key_vals. +If innodb_stats_method is nulls_ignored, we also record the number of +non-null values for each prefix and stored the estimates in +array index->stat_n_non_null_key_vals. */ UNIV_INTERN void btr_estimate_number_of_different_key_vals( @@ -509,8 +512,8 @@ file segment of the index tree. @return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */ UNIV_INTERN ulint -btr_store_big_rec_extern_fields( -/*============================*/ +btr_store_big_rec_extern_fields_func( +/*=================================*/ dict_index_t* index, /*!< in: index of rec; the index tree MUST be X-latched */ buf_block_t* rec_block, /*!< in/out: block containing rec */ @@ -519,10 +522,42 @@ btr_store_big_rec_extern_fields( the "external storage" flags in offsets will not correspond to rec when this function returns */ - big_rec_t* big_rec_vec, /*!< in: vector containing fields +#ifdef UNIV_DEBUG + mtr_t* local_mtr, /*!< in: mtr containing the + latch to rec and to the tree */ +#endif /* UNIV_DEBUG */ +#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG + ibool update_in_place,/*! in: TRUE if the record is updated + in place (not delete+insert) */ +#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ + const big_rec_t*big_rec_vec) /*!< in: vector containing fields to be stored externally */ - mtr_t* local_mtr); /*!< in: mtr containing the latch to - rec and to the tree */ + __attribute__((nonnull)); + +/** Stores the fields in big_rec_vec to the tablespace and puts pointers to +them in rec. The extern flags in rec will have to be set beforehand. +The fields are stored on pages allocated from leaf node +file segment of the index tree. +@param index in: clustered index; MUST be X-latched by mtr +@param b in/out: block containing rec; MUST be X-latched by mtr +@param rec in/out: clustered index record +@param offsets in: rec_get_offsets(rec, index); + the "external storage" flags in offsets will not be adjusted +@param mtr in: mini-transaction that holds x-latch on index and b +@param upd in: TRUE if the record is updated in place (not delete+insert) +@param big in: vector containing fields to be stored externally +@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */ +#ifdef UNIV_DEBUG +# define btr_store_big_rec_extern_fields(index,b,rec,offsets,mtr,upd,big) \ + btr_store_big_rec_extern_fields_func(index,b,rec,offsets,mtr,upd,big) +#elif defined UNIV_BLOB_LIGHT_DEBUG +# define btr_store_big_rec_extern_fields(index,b,rec,offsets,mtr,upd,big) \ + btr_store_big_rec_extern_fields_func(index,b,rec,offsets,upd,big) +#else +# define btr_store_big_rec_extern_fields(index,b,rec,offsets,mtr,upd,big) \ + btr_store_big_rec_extern_fields_func(index,b,rec,offsets,big) +#endif + /*******************************************************************//** Frees the space in an externally stored field to the file space management if the field in data is owned the externally stored field, diff --git a/storage/innodb_plugin/include/btr0types.h b/storage/innodb_plugin/include/btr0types.h index ef4a6b04b34..07c06fb18d7 100644 --- a/storage/innodb_plugin/include/btr0types.h +++ b/storage/innodb_plugin/include/btr0types.h @@ -38,6 +38,131 @@ typedef struct btr_cur_struct btr_cur_t; /** B-tree search information for the adaptive hash index */ typedef struct btr_search_struct btr_search_t; +#ifdef UNIV_BLOB_DEBUG +# include "buf0types.h" +/** An index->blobs entry for keeping track of off-page column references */ +typedef struct btr_blob_dbg_struct btr_blob_dbg_t; + +/** Insert to index->blobs a reference to an off-page column. +@param index the index tree +@param b the reference +@param ctx context (for logging) */ +UNIV_INTERN +void +btr_blob_dbg_rbt_insert( +/*====================*/ + dict_index_t* index, /*!< in/out: index tree */ + const btr_blob_dbg_t* b, /*!< in: the reference */ + const char* ctx) /*!< in: context (for logging) */ + __attribute__((nonnull)); + +/** Remove from index->blobs a reference to an off-page column. +@param index the index tree +@param b the reference +@param ctx context (for logging) */ +UNIV_INTERN +void +btr_blob_dbg_rbt_delete( +/*====================*/ + dict_index_t* index, /*!< in/out: index tree */ + const btr_blob_dbg_t* b, /*!< in: the reference */ + const char* ctx) /*!< in: context (for logging) */ + __attribute__((nonnull)); + +/**************************************************************//** +Add to index->blobs any references to off-page columns from a record. +@return number of references added */ +UNIV_INTERN +ulint +btr_blob_dbg_add_rec( +/*=================*/ + const rec_t* rec, /*!< in: record */ + dict_index_t* index, /*!< in/out: index */ + const ulint* offsets,/*!< in: offsets */ + const char* ctx) /*!< in: context (for logging) */ + __attribute__((nonnull)); +/**************************************************************//** +Remove from index->blobs any references to off-page columns from a record. +@return number of references removed */ +UNIV_INTERN +ulint +btr_blob_dbg_remove_rec( +/*====================*/ + const rec_t* rec, /*!< in: record */ + dict_index_t* index, /*!< in/out: index */ + const ulint* offsets,/*!< in: offsets */ + const char* ctx) /*!< in: context (for logging) */ + __attribute__((nonnull)); +/**************************************************************//** +Count and add to index->blobs any references to off-page columns +from records on a page. +@return number of references added */ +UNIV_INTERN +ulint +btr_blob_dbg_add( +/*=============*/ + const page_t* page, /*!< in: rewritten page */ + dict_index_t* index, /*!< in/out: index */ + const char* ctx) /*!< in: context (for logging) */ + __attribute__((nonnull)); +/**************************************************************//** +Count and remove from index->blobs any references to off-page columns +from records on a page. +Used when reorganizing a page, before copying the records. +@return number of references removed */ +UNIV_INTERN +ulint +btr_blob_dbg_remove( +/*================*/ + const page_t* page, /*!< in: b-tree page */ + dict_index_t* index, /*!< in/out: index */ + const char* ctx) /*!< in: context (for logging) */ + __attribute__((nonnull)); +/**************************************************************//** +Restore in index->blobs any references to off-page columns +Used when page reorganize fails due to compressed page overflow. */ +UNIV_INTERN +void +btr_blob_dbg_restore( +/*=================*/ + const page_t* npage, /*!< in: page that failed to compress */ + const page_t* page, /*!< in: copy of original page */ + dict_index_t* index, /*!< in/out: index */ + const char* ctx) /*!< in: context (for logging) */ + __attribute__((nonnull)); + +/** Operation that processes the BLOB references of an index record +@param[in] rec record on index page +@param[in/out] index the index tree of the record +@param[in] offsets rec_get_offsets(rec,index) +@param[in] ctx context (for logging) +@return number of BLOB references processed */ +typedef ulint (*btr_blob_dbg_op_f) +(const rec_t* rec,dict_index_t* index,const ulint* offsets,const char* ctx); + +/**************************************************************//** +Count and process all references to off-page columns on a page. +@return number of references processed */ +UNIV_INTERN +ulint +btr_blob_dbg_op( +/*============*/ + const page_t* page, /*!< in: B-tree leaf page */ + const rec_t* rec, /*!< in: record to start from + (NULL to process the whole page) */ + dict_index_t* index, /*!< in/out: index */ + const char* ctx, /*!< in: context (for logging) */ + const btr_blob_dbg_op_f op) /*!< in: operation on records */ + __attribute__((nonnull(1,3,4,5))); +#else /* UNIV_BLOB_DEBUG */ +# define btr_blob_dbg_add_rec(rec, index, offsets, ctx) ((void) 0) +# define btr_blob_dbg_add(page, index, ctx) ((void) 0) +# define btr_blob_dbg_remove_rec(rec, index, offsets, ctx) ((void) 0) +# define btr_blob_dbg_remove(page, index, ctx) ((void) 0) +# define btr_blob_dbg_restore(npage, page, index, ctx) ((void) 0) +# define btr_blob_dbg_op(page, rec, index, ctx, op) ((void) 0) +#endif /* UNIV_BLOB_DEBUG */ + /** The size of a reference to data stored on a different page. The reference is stored at the end of the prefix of the field in the index record. */ diff --git a/storage/innodb_plugin/include/buf0buf.h b/storage/innodb_plugin/include/buf0buf.h index cd4ee5906f0..05dead5ac9e 100644 --- a/storage/innodb_plugin/include/buf0buf.h +++ b/storage/innodb_plugin/include/buf0buf.h @@ -41,6 +41,8 @@ Created 11/5/1995 Heikki Tuuri /* @{ */ #define BUF_GET 10 /*!< get always */ #define BUF_GET_IF_IN_POOL 11 /*!< get if in pool */ +#define BUF_PEEK_IF_IN_POOL 12 /*!< get if in pool, do not make + the block young in the LRU list */ #define BUF_GET_NO_LATCH 14 /*!< get and bufferfix, but set no latch; we have separated this case, because @@ -165,10 +167,8 @@ Allocates a buffer block. @return own: the allocated block, in state BUF_BLOCK_MEMORY */ UNIV_INLINE buf_block_t* -buf_block_alloc( -/*============*/ - ulint zip_size); /*!< in: compressed page size in bytes, - or 0 if uncompressed tablespace */ +buf_block_alloc(void); +/*=================*/ /********************************************************************//** Frees a buffer block which does not contain a file page. */ UNIV_INLINE @@ -286,7 +286,7 @@ buf_page_get_gen( ulint rw_latch,/*!< in: RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH */ buf_block_t* guess, /*!< in: guessed block or NULL */ ulint mode, /*!< in: BUF_GET, BUF_GET_IF_IN_POOL, - BUF_GET_NO_LATCH */ + BUF_PEEK_IF_IN_POOL, BUF_GET_NO_LATCH */ const char* file, /*!< in: file name */ ulint line, /*!< in: line where called */ mtr_t* mtr); /*!< in: mini-transaction */ @@ -370,7 +370,7 @@ buf_reset_check_index_page_at_flush( /*================================*/ ulint space, /*!< in: space id */ ulint offset);/*!< in: page number */ -#ifdef UNIV_DEBUG_FILE_ACCESSES +#if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG /********************************************************************//** Sets file_page_was_freed TRUE if the page is found in the buffer pool. This function should be called when we free a file page and want the @@ -395,7 +395,7 @@ buf_page_reset_file_page_was_freed( /*===============================*/ ulint space, /*!< in: space id */ ulint offset); /*!< in: page number */ -#endif /* UNIV_DEBUG_FILE_ACCESSES */ +#endif /* UNIV_DEBUG_FILE_ACCESSES || UNIV_DEBUG */ /********************************************************************//** Reads the freed_page_clock of a buffer block. @return freed_page_clock */ @@ -1137,11 +1137,11 @@ struct buf_page_struct{ 0 if the block was never accessed in the buffer pool */ /* @} */ -# ifdef UNIV_DEBUG_FILE_ACCESSES +# if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG ibool file_page_was_freed; /*!< this is set to TRUE when fsp frees a page in buffer pool */ -# endif /* UNIV_DEBUG_FILE_ACCESSES */ +# endif /* UNIV_DEBUG_FILE_ACCESSES || UNIV_DEBUG */ #endif /* !UNIV_HOTBACKUP */ }; diff --git a/storage/innodb_plugin/include/buf0buf.ic b/storage/innodb_plugin/include/buf0buf.ic index 23db684806c..0025bef5aac 100644 --- a/storage/innodb_plugin/include/buf0buf.ic +++ b/storage/innodb_plugin/include/buf0buf.ic @@ -719,14 +719,12 @@ Allocates a buffer block. @return own: the allocated block, in state BUF_BLOCK_MEMORY */ UNIV_INLINE buf_block_t* -buf_block_alloc( -/*============*/ - ulint zip_size) /*!< in: compressed page size in bytes, - or 0 if uncompressed tablespace */ +buf_block_alloc(void) +/*=================*/ { buf_block_t* block; - block = buf_LRU_get_free_block(zip_size); + block = buf_LRU_get_free_block(); buf_block_set_state(block, BUF_BLOCK_MEMORY); diff --git a/storage/innodb_plugin/include/buf0lru.h b/storage/innodb_plugin/include/buf0lru.h index 5a9cfd059f3..d543bce53cd 100644 --- a/storage/innodb_plugin/include/buf0lru.h +++ b/storage/innodb_plugin/include/buf0lru.h @@ -110,12 +110,9 @@ enum buf_lru_free_block_status buf_LRU_free_block( /*===============*/ buf_page_t* bpage, /*!< in: block to be freed */ - ibool zip, /*!< in: TRUE if should remove also the + ibool zip) /*!< in: TRUE if should remove also the compressed page of an uncompressed page */ - ibool* buf_pool_mutex_released); - /*!< in: pointer to a variable that will - be assigned TRUE if buf_pool_mutex - was temporarily released, or NULL */ + __attribute__((nonnull)); /******************************************************************//** Try to free a replaceable block. @return TRUE if found and freed */ @@ -146,10 +143,9 @@ LRU list to the free list. @return the free control block, in state BUF_BLOCK_READY_FOR_USE */ UNIV_INTERN buf_block_t* -buf_LRU_get_free_block( -/*===================*/ - ulint zip_size); /*!< in: compressed page size in bytes, - or 0 if uncompressed tablespace */ +buf_LRU_get_free_block(void) +/*========================*/ + __attribute__((warn_unused_result)); /******************************************************************//** Puts a block back to the free list. */ diff --git a/storage/innodb_plugin/include/dict0mem.h b/storage/innodb_plugin/include/dict0mem.h index 19782c2e76a..bd32a239cfd 100644 --- a/storage/innodb_plugin/include/dict0mem.h +++ b/storage/innodb_plugin/include/dict0mem.h @@ -321,6 +321,12 @@ struct dict_index_struct{ dict_get_n_unique(index); we periodically calculate new estimates */ + ib_int64_t* stat_n_non_null_key_vals; + /* approximate number of non-null key values + for this index, for each column where + n < dict_get_n_unique(index); This + is used when innodb_stats_method is + "nulls_ignored". */ ulint stat_index_size; /*!< approximate index size in database pages */ @@ -334,6 +340,13 @@ struct dict_index_struct{ index, or 0 if the index existed when InnoDB was started up */ #endif /* !UNIV_HOTBACKUP */ +#ifdef UNIV_BLOB_DEBUG + mutex_t blobs_mutex; + /*!< mutex protecting blobs */ + void* blobs; /*!< map of (page_no,heap_no,field_no) + to first_blob_page_no; protected by + blobs_mutex; @see btr_blob_dbg_t */ +#endif /* UNIV_BLOB_DEBUG */ #ifdef UNIV_DEBUG ulint magic_n;/*!< magic number */ /** Value of dict_index_struct::magic_n */ diff --git a/storage/innodb_plugin/include/dict0types.h b/storage/innodb_plugin/include/dict0types.h index 7ad69193cc9..f14b59a19d4 100644 --- a/storage/innodb_plugin/include/dict0types.h +++ b/storage/innodb_plugin/include/dict0types.h @@ -33,11 +33,6 @@ typedef struct dict_index_struct dict_index_t; typedef struct dict_table_struct dict_table_t; typedef struct dict_foreign_struct dict_foreign_t; -/* A cluster object is a table object with the type field set to -DICT_CLUSTERED */ - -typedef dict_table_t dict_cluster_t; - typedef struct ind_node_struct ind_node_t; typedef struct tab_node_struct tab_node_t; diff --git a/storage/innodb_plugin/include/page0zip.h b/storage/innodb_plugin/include/page0zip.h index 574809e5227..00c1d0516e6 100644 --- a/storage/innodb_plugin/include/page0zip.h +++ b/storage/innodb_plugin/include/page0zip.h @@ -420,7 +420,7 @@ page_zip_copy_recs( const page_t* src, /*!< in: page */ dict_index_t* index, /*!< in: index of the B-tree */ mtr_t* mtr) /*!< in: mini-transaction */ - __attribute__((nonnull(1,2,3,4))); + __attribute__((nonnull)); #endif /* !UNIV_HOTBACKUP */ /**********************************************************************//** diff --git a/storage/innodb_plugin/include/rem0cmp.h b/storage/innodb_plugin/include/rem0cmp.h index 2f751a38864..a908521c9f7 100644 --- a/storage/innodb_plugin/include/rem0cmp.h +++ b/storage/innodb_plugin/include/rem0cmp.h @@ -165,6 +165,10 @@ cmp_rec_rec_with_match( const ulint* offsets1,/*!< in: rec_get_offsets(rec1, index) */ const ulint* offsets2,/*!< in: rec_get_offsets(rec2, index) */ dict_index_t* index, /*!< in: data dictionary index */ + ibool nulls_unequal, + /* in: TRUE if this is for index statistics + cardinality estimation, and innodb_stats_method + is "nulls_unequal" or "nulls_ignored" */ ulint* matched_fields, /*!< in/out: number of already completely matched fields; when the function returns, contains the value the for current diff --git a/storage/innodb_plugin/include/rem0cmp.ic b/storage/innodb_plugin/include/rem0cmp.ic index 39ef5f4fba3..63415fe7837 100644 --- a/storage/innodb_plugin/include/rem0cmp.ic +++ b/storage/innodb_plugin/include/rem0cmp.ic @@ -87,5 +87,5 @@ cmp_rec_rec( ulint match_b = 0; return(cmp_rec_rec_with_match(rec1, rec2, offsets1, offsets2, index, - &match_f, &match_b)); + FALSE, &match_f, &match_b)); } diff --git a/storage/innodb_plugin/include/row0upd.h b/storage/innodb_plugin/include/row0upd.h index b61e6b6dca1..97b7ec49a17 100644 --- a/storage/innodb_plugin/include/row0upd.h +++ b/storage/innodb_plugin/include/row0upd.h @@ -280,19 +280,29 @@ NOTE: we compare the fields as binary strings! @return TRUE if update vector changes an ordering field in the index record */ UNIV_INTERN ibool -row_upd_changes_ord_field_binary( -/*=============================*/ +row_upd_changes_ord_field_binary_func( +/*==================================*/ + dict_index_t* index, /*!< in: index of the record */ + const upd_t* update, /*!< in: update vector for the row; NOTE: the + field numbers in this MUST be clustered index + positions! */ +#ifdef UNIV_DEBUG + const que_thr_t*thr, /*!< in: query thread */ +#endif /* UNIV_DEBUG */ const dtuple_t* row, /*!< in: old value of row, or NULL if the row and the data values in update are not known when this function is called, e.g., at compile time */ - const row_ext_t*ext, /*!< NULL, or prefixes of the externally + const row_ext_t*ext) /*!< NULL, or prefixes of the externally stored columns in the old row */ - dict_index_t* index, /*!< in: index of the record */ - const upd_t* update) /*!< in: update vector for the row; NOTE: the - field numbers in this MUST be clustered index - positions! */ - __attribute__((nonnull(3,4), warn_unused_result)); + __attribute__((nonnull(1,2), warn_unused_result)); +#ifdef UNIV_DEBUG +# define row_upd_changes_ord_field_binary(index,update,thr,row,ext) \ + row_upd_changes_ord_field_binary_func(index,update,thr,row,ext) +#else /* UNIV_DEBUG */ +# define row_upd_changes_ord_field_binary(index,update,thr,row,ext) \ + row_upd_changes_ord_field_binary_func(index,update,row,ext) +#endif /* UNIV_DEBUG */ /***********************************************************//** Checks if an update vector changes an ordering field of an index record. This function is fast if the update vector is short or the number of ordering diff --git a/storage/innodb_plugin/include/srv0srv.h b/storage/innodb_plugin/include/srv0srv.h index 7aa2ce74720..91ae895040c 100644 --- a/storage/innodb_plugin/include/srv0srv.h +++ b/storage/innodb_plugin/include/srv0srv.h @@ -154,6 +154,11 @@ capacity. PCT_IO(5) -> returns the number of IO operations that is 5% of the max where max is srv_io_capacity. */ #define PCT_IO(p) ((ulong) (srv_io_capacity * ((double) p / 100.0))) +/* The "innodb_stats_method" setting, decides how InnoDB is going +to treat NULL value when collecting statistics. It is not defined +as enum type because the configure option takes unsigned integer type. */ +extern ulong srv_innodb_stats_method; + #ifdef UNIV_LOG_ARCHIVE extern ibool srv_log_archive_on; extern ibool srv_archive_recovery; @@ -363,6 +368,19 @@ enum { in connection with recovery */ }; +/* Alternatives for srv_innodb_stats_method, which could be changed by +setting innodb_stats_method */ +enum srv_stats_method_name_enum { + SRV_STATS_NULLS_EQUAL, /* All NULL values are treated as + equal. This is the default setting + for innodb_stats_method */ + SRV_STATS_NULLS_UNEQUAL, /* All NULL values are treated as + NOT equal. */ + SRV_STATS_NULLS_IGNORED /* NULL values are ignored */ +}; + +typedef enum srv_stats_method_name_enum srv_stats_method_name_t; + #ifndef UNIV_HOTBACKUP /** Types of threads existing in the system. */ enum srv_thread_type { diff --git a/storage/innodb_plugin/include/sync0arr.h b/storage/innodb_plugin/include/sync0arr.h index 5f1280f5e28..6e931346238 100644 --- a/storage/innodb_plugin/include/sync0arr.h +++ b/storage/innodb_plugin/include/sync0arr.h @@ -115,8 +115,11 @@ Prints warnings of long semaphore waits to stderr. @return TRUE if fatal semaphore wait threshold was exceeded */ UNIV_INTERN ibool -sync_array_print_long_waits(void); -/*=============================*/ +sync_array_print_long_waits( +/*========================*/ + os_thread_id_t* waiter, /*!< out: longest waiting thread */ + const void** sema) /*!< out: longest-waited-for semaphore */ + __attribute__((nonnull)); /********************************************************************//** Validates the integrity of the wait array. Checks that the number of reserved cells equals the count variable. */ diff --git a/storage/innodb_plugin/include/sync0rw.h b/storage/innodb_plugin/include/sync0rw.h index 175f3deb77c..47f7dbfe0eb 100644 --- a/storage/innodb_plugin/include/sync0rw.h +++ b/storage/innodb_plugin/include/sync0rw.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved. +Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2008, Google Inc. Portions of this file contain modifications contributed and copyrighted by @@ -490,6 +490,7 @@ UNIV_INTERN void rw_lock_debug_print( /*================*/ + FILE* f, /*!< in: output stream */ rw_lock_debug_t* info); /*!< in: debug struct */ #endif /* UNIV_SYNC_DEBUG */ diff --git a/storage/innodb_plugin/include/trx0rseg.h b/storage/innodb_plugin/include/trx0rseg.h index a25d84f1e84..e3674089735 100644 --- a/storage/innodb_plugin/include/trx0rseg.h +++ b/storage/innodb_plugin/include/trx0rseg.h @@ -135,9 +135,7 @@ struct trx_rseg_struct{ ulint id; /*!< rollback segment id == the index of its slot in the trx system file copy */ mutex_t mutex; /*!< mutex protecting the fields in this - struct except id; NOTE that the latching - order must always be kernel mutex -> - rseg mutex */ + struct except id, which is constant */ ulint space; /*!< space where the rollback segment is header is placed */ ulint zip_size;/* compressed page size of space diff --git a/storage/innodb_plugin/include/trx0trx.h b/storage/innodb_plugin/include/trx0trx.h index abd175d365b..833bae4a4ff 100644 --- a/storage/innodb_plugin/include/trx0trx.h +++ b/storage/innodb_plugin/include/trx0trx.h @@ -214,12 +214,12 @@ trx_recover_for_mysql( /*******************************************************************//** This function is used to find one X/Open XA distributed transaction which is in the prepared state -@return trx or NULL */ +@return trx or NULL; on match, the trx->xid will be invalidated */ UNIV_INTERN trx_t * trx_get_trx_by_xid( /*===============*/ - XID* xid); /*!< in: X/Open XA transaction identification */ + const XID* xid); /*!< in: X/Open XA transaction identifier */ /**********************************************************************//** If required, flushes the log to disk if we called trx_commit_for_mysql() with trx->flush_log_later == TRUE. diff --git a/storage/innodb_plugin/include/univ.i b/storage/innodb_plugin/include/univ.i index b721e0406a8..319db93d137 100644 --- a/storage/innodb_plugin/include/univ.i +++ b/storage/innodb_plugin/include/univ.i @@ -46,7 +46,7 @@ Created 1/20/1994 Heikki Tuuri #define INNODB_VERSION_MAJOR 1 #define INNODB_VERSION_MINOR 0 -#define INNODB_VERSION_BUGFIX 15 +#define INNODB_VERSION_BUGFIX 16 /* The following is the InnoDB version as shown in SELECT plugin_version FROM information_schema.plugins; @@ -177,14 +177,15 @@ command. Not tested on Windows. */ debugging without UNIV_DEBUG */ #define UNIV_BUF_DEBUG /* Enable buffer pool debugging without UNIV_DEBUG */ +#define UNIV_BLOB_LIGHT_DEBUG /* Enable off-page column + debugging without UNIV_DEBUG */ #define UNIV_DEBUG /* Enable ut_ad() assertions and disable UNIV_INLINE */ #define UNIV_DEBUG_LOCK_VALIDATE /* Enable ut_ad(lock_rec_validate_page()) assertions. */ -#define UNIV_DEBUG_FILE_ACCESSES /* Debug .ibd file access - (field file_page_was_freed - in buf_page_t) */ +#define UNIV_DEBUG_FILE_ACCESSES /* Enable freed block access + debugging without UNIV_DEBUG */ #define UNIV_LRU_DEBUG /* debug the buffer pool LRU */ #define UNIV_HASH_DEBUG /* debug HASH_ macros */ #define UNIV_LIST_DEBUG /* debug UT_LIST_ macros */ @@ -193,6 +194,8 @@ this will break redo log file compatibility, but it may be useful when debugging redo log application problems. */ #define UNIV_MEM_DEBUG /* detect memory leaks etc */ #define UNIV_IBUF_DEBUG /* debug the insert buffer */ +#define UNIV_BLOB_DEBUG /* track BLOB ownership; +assumes that no BLOBs survive server restart */ #define UNIV_IBUF_COUNT_DEBUG /* debug the insert buffer; this limits the database to IBUF_COUNT_N_SPACES and IBUF_COUNT_N_PAGES, and the insert buffer must be empty when the database is started */ @@ -424,7 +427,7 @@ it is read or written. */ /* Use sun_prefetch when compile with Sun Studio */ # define UNIV_EXPECT(expr,value) (expr) # define UNIV_LIKELY_NULL(expr) (expr) -# define UNIV_PREFETCH_R(addr) sun_prefetch_read_many(addr) +# define UNIV_PREFETCH_R(addr) sun_prefetch_read_many((void*) addr) # define UNIV_PREFETCH_RW(addr) sun_prefetch_write_many(addr) #else /* Dummy versions of the macros */ diff --git a/storage/innodb_plugin/mem/mem0mem.c b/storage/innodb_plugin/mem/mem0mem.c index 1dd4db30841..86100b04fd6 100644 --- a/storage/innodb_plugin/mem/mem0mem.c +++ b/storage/innodb_plugin/mem/mem0mem.c @@ -347,7 +347,7 @@ mem_heap_create_block( return(NULL); } } else { - buf_block = buf_block_alloc(0); + buf_block = buf_block_alloc(); } block = (mem_block_t*) buf_block->frame; diff --git a/storage/innodb_plugin/mtr/mtr0log.c b/storage/innodb_plugin/mtr/mtr0log.c index 3f3dab36b76..3349036b5b3 100644 --- a/storage/innodb_plugin/mtr/mtr0log.c +++ b/storage/innodb_plugin/mtr/mtr0log.c @@ -408,7 +408,7 @@ mlog_parse_string( ptr += 2; if (UNIV_UNLIKELY(offset >= UNIV_PAGE_SIZE) - || UNIV_UNLIKELY(len + offset) > UNIV_PAGE_SIZE) { + || UNIV_UNLIKELY(len + offset > UNIV_PAGE_SIZE)) { recv_sys->found_corrupt_log = TRUE; return(NULL); diff --git a/storage/innodb_plugin/page/page0cur.c b/storage/innodb_plugin/page/page0cur.c index f10f16a7dd9..936762b986a 100644 --- a/storage/innodb_plugin/page/page0cur.c +++ b/storage/innodb_plugin/page/page0cur.c @@ -1149,6 +1149,8 @@ use_heap: current_rec, index, mtr); } + btr_blob_dbg_add_rec(insert_rec, index, offsets, "insert"); + return(insert_rec); } @@ -1195,10 +1197,12 @@ page_cur_insert_rec_zip_reorg( } /* Out of space: restore the page */ + btr_blob_dbg_remove(page, index, "insert_zip_fail"); if (!page_zip_decompress(page_zip, page, FALSE)) { ut_error; /* Memory corrupted? */ } ut_ad(page_validate(page, index)); + btr_blob_dbg_add(page, index, "insert_zip_fail"); return(NULL); } @@ -1490,6 +1494,8 @@ use_heap: page_zip_write_rec(page_zip, insert_rec, index, offsets, 1); + btr_blob_dbg_add_rec(insert_rec, index, offsets, "insert_zip_ok"); + /* 9. Write log record of the insert */ if (UNIV_LIKELY(mtr != NULL)) { page_cur_insert_rec_write_log(insert_rec, rec_size, @@ -1697,6 +1703,9 @@ page_copy_rec_list_end_to_created_page( heap_top += rec_size; + rec_offs_make_valid(insert_rec, index, offsets); + btr_blob_dbg_add_rec(insert_rec, index, offsets, "copy_end"); + page_cur_insert_rec_write_log(insert_rec, rec_size, prev_rec, index, mtr); prev_rec = insert_rec; @@ -1944,6 +1953,7 @@ page_cur_delete_rec( page_dir_slot_set_n_owned(cur_dir_slot, page_zip, cur_n_owned - 1); /* 6. Free the memory occupied by the record */ + btr_blob_dbg_remove_rec(current_rec, index, offsets, "delete"); page_mem_free(page, page_zip, current_rec, index, offsets); /* 7. Now we have decremented the number of owned records of the slot. diff --git a/storage/innodb_plugin/page/page0page.c b/storage/innodb_plugin/page/page0page.c index 10008f9ac25..6cae03e8829 100644 --- a/storage/innodb_plugin/page/page0page.c +++ b/storage/innodb_plugin/page/page0page.c @@ -685,12 +685,16 @@ page_copy_rec_list_end( if (UNIV_UNLIKELY (!page_zip_reorganize(new_block, index, mtr))) { + btr_blob_dbg_remove(new_page, index, + "copy_end_reorg_fail"); if (UNIV_UNLIKELY (!page_zip_decompress(new_page_zip, new_page, FALSE))) { ut_error; } ut_ad(page_validate(new_page, index)); + btr_blob_dbg_add(new_page, index, + "copy_end_reorg_fail"); return(NULL); } else { /* The page was reorganized: @@ -803,12 +807,16 @@ page_copy_rec_list_start( if (UNIV_UNLIKELY (!page_zip_reorganize(new_block, index, mtr))) { + btr_blob_dbg_remove(new_page, index, + "copy_start_reorg_fail"); if (UNIV_UNLIKELY (!page_zip_decompress(new_page_zip, new_page, FALSE))) { ut_error; } ut_ad(page_validate(new_page, index)); + btr_blob_dbg_add(new_page, index, + "copy_start_reorg_fail"); return(NULL); } else { /* The page was reorganized: @@ -1080,6 +1088,9 @@ page_delete_rec_list_end( /* Remove the record chain segment from the record chain */ page_rec_set_next(prev_rec, page_get_supremum_rec(page)); + btr_blob_dbg_op(page, rec, index, "delete_end", + btr_blob_dbg_remove_rec); + /* Catenate the deleted chain segment to the page free list */ page_rec_set_next(last_rec, page_header_get_ptr(page, PAGE_FREE)); diff --git a/storage/innodb_plugin/page/page0zip.c b/storage/innodb_plugin/page/page0zip.c index d3b1edefc6b..6e866b3f016 100644 --- a/storage/innodb_plugin/page/page0zip.c +++ b/storage/innodb_plugin/page/page0zip.c @@ -653,13 +653,13 @@ page_zip_dir_encode( Allocate memory for zlib. */ static void* -page_zip_malloc( +page_zip_zalloc( /*============*/ void* opaque, /*!< in/out: memory heap */ uInt items, /*!< in: number of items to allocate */ uInt size) /*!< in: size of an item in bytes */ { - return(mem_heap_alloc(opaque, items * size)); + return(mem_heap_zalloc(opaque, items * size)); } /**********************************************************************//** @@ -684,7 +684,7 @@ page_zip_set_alloc( { z_stream* strm = stream; - strm->zalloc = page_zip_malloc; + strm->zalloc = page_zip_zalloc; strm->zfree = page_zip_free; strm->opaque = heap; } @@ -2912,19 +2912,18 @@ zlib_error: page_zip_set_alloc(&d_stream, heap); - if (UNIV_UNLIKELY(inflateInit2(&d_stream, UNIV_PAGE_SIZE_SHIFT) - != Z_OK)) { - ut_error; - } - d_stream.next_in = page_zip->data + PAGE_DATA; /* Subtract the space reserved for the page header and the end marker of the modification log. */ d_stream.avail_in = page_zip_get_size(page_zip) - (PAGE_DATA + 1); - d_stream.next_out = page + PAGE_ZIP_START; d_stream.avail_out = UNIV_PAGE_SIZE - PAGE_ZIP_START; + if (UNIV_UNLIKELY(inflateInit2(&d_stream, UNIV_PAGE_SIZE_SHIFT) + != Z_OK)) { + ut_error; + } + /* Decode the zlib header and the index information. */ if (UNIV_UNLIKELY(inflate(&d_stream, Z_BLOCK) != Z_OK)) { @@ -4439,7 +4438,7 @@ page_zip_reorganize( log_mode = mtr_set_log_mode(mtr, MTR_LOG_NONE); #ifndef UNIV_HOTBACKUP - temp_block = buf_block_alloc(0); + temp_block = buf_block_alloc(); btr_search_drop_page_hash_index(block); block->check_index_page_at_flush = TRUE; #else /* !UNIV_HOTBACKUP */ @@ -4451,6 +4450,8 @@ page_zip_reorganize( /* Copy the old page to temporary space */ buf_frame_copy(temp_page, page); + btr_blob_dbg_remove(page, index, "zip_reorg"); + /* Recreate the page: note that global data on page (possible segment headers, next page-field, etc.) is preserved intact */ @@ -4509,7 +4510,7 @@ page_zip_copy_recs( mtr_t* mtr) /*!< in: mini-transaction */ { ut_ad(mtr_memo_contains_page(mtr, page, MTR_MEMO_PAGE_X_FIX)); - ut_ad(mtr_memo_contains_page(mtr, (page_t*) src, MTR_MEMO_PAGE_X_FIX)); + ut_ad(mtr_memo_contains_page(mtr, src, MTR_MEMO_PAGE_X_FIX)); ut_ad(!dict_index_is_ibuf(index)); #ifdef UNIV_ZIP_DEBUG /* The B-tree operations that call this function may set @@ -4579,6 +4580,7 @@ page_zip_copy_recs( #ifdef UNIV_ZIP_DEBUG ut_a(page_zip_validate(page_zip, page)); #endif /* UNIV_ZIP_DEBUG */ + btr_blob_dbg_add(page, index, "page_zip_copy_recs"); page_zip_compress_write_log(page_zip, page, index, mtr); } diff --git a/storage/innodb_plugin/rem/rem0cmp.c b/storage/innodb_plugin/rem/rem0cmp.c index 35b67992558..04d2c15437b 100644 --- a/storage/innodb_plugin/rem/rem0cmp.c +++ b/storage/innodb_plugin/rem/rem0cmp.c @@ -862,6 +862,10 @@ cmp_rec_rec_with_match( const ulint* offsets1,/*!< in: rec_get_offsets(rec1, index) */ const ulint* offsets2,/*!< in: rec_get_offsets(rec2, index) */ dict_index_t* index, /*!< in: data dictionary index */ + ibool nulls_unequal, + /* in: TRUE if this is for index statistics + cardinality estimation, and innodb_stats_method + is "nulls_unequal" or "nulls_ignored" */ ulint* matched_fields, /*!< in/out: number of already completely matched fields; when the function returns, contains the value the for current @@ -961,9 +965,13 @@ cmp_rec_rec_with_match( || rec2_f_len == UNIV_SQL_NULL) { if (rec1_f_len == rec2_f_len) { - - goto next_field; - + /* This is limited to stats collection, + cannot use it for regular search */ + if (nulls_unequal) { + ret = -1; + } else { + goto next_field; + } } else if (rec2_f_len == UNIV_SQL_NULL) { /* We define the SQL null to be the diff --git a/storage/innodb_plugin/row/row0ins.c b/storage/innodb_plugin/row/row0ins.c index 298c601c7e3..8050c099751 100644 --- a/storage/innodb_plugin/row/row0ins.c +++ b/storage/innodb_plugin/row/row0ins.c @@ -2130,7 +2130,7 @@ function_exit: err = btr_store_big_rec_extern_fields( index, btr_cur_get_block(&cursor), - rec, offsets, big_rec, &mtr); + rec, offsets, &mtr, FALSE, big_rec); if (modify) { dtuple_big_rec_free(big_rec); diff --git a/storage/innodb_plugin/row/row0purge.c b/storage/innodb_plugin/row/row0purge.c index 8bf2ae0f458..752a2ec9e83 100644 --- a/storage/innodb_plugin/row/row0purge.c +++ b/storage/innodb_plugin/row/row0purge.c @@ -387,8 +387,11 @@ Purges an update of an existing record. Also purges an update of a delete marked record if that record contained an externally stored field. */ static void -row_purge_upd_exist_or_extern( -/*==========================*/ +row_purge_upd_exist_or_extern_func( +/*===============================*/ +#ifdef UNIV_DEBUG + const que_thr_t*thr, /*!< in: query thread */ +#endif /* UNIV_DEBUG */ purge_node_t* node) /*!< in: row purge node */ { mem_heap_t* heap; @@ -413,8 +416,8 @@ row_purge_upd_exist_or_extern( while (node->index != NULL) { index = node->index; - if (row_upd_changes_ord_field_binary(NULL, NULL, node->index, - node->update)) { + if (row_upd_changes_ord_field_binary(node->index, node->update, + thr, NULL, NULL)) { /* Build the older version of the index entry */ entry = row_build_index_entry(node->row, NULL, index, heap); @@ -496,6 +499,14 @@ skip_secondaries: } } +#ifdef UNIV_DEBUG +# define row_purge_upd_exist_or_extern(thr,node) \ + row_purge_upd_exist_or_extern_func(thr,node) +#else /* UNIV_DEBUG */ +# define row_purge_upd_exist_or_extern(thr,node) \ + row_purge_upd_exist_or_extern_func(node) +#endif /* UNIV_DEBUG */ + /***********************************************************//** Parses the row reference and other info in a modify undo log record. @return TRUE if purge operation required: NOTE that then the CALLER @@ -602,47 +613,32 @@ err_exit: /***********************************************************//** Fetches an undo log record and does the purge for the recorded operation. If none left, or the current purge completed, returns the control to the -parent node, which is always a query thread node. -@return DB_SUCCESS if operation successfully completed, else error code */ -static -ulint +parent node, which is always a query thread node. */ +static __attribute__((nonnull)) +void row_purge( /*======*/ purge_node_t* node, /*!< in: row purge node */ que_thr_t* thr) /*!< in: query thread */ { - roll_ptr_t roll_ptr; - ibool purge_needed; ibool updated_extern; - trx_t* trx; - ut_ad(node && thr); - - trx = thr_get_trx(thr); + ut_ad(node); + ut_ad(thr); - node->undo_rec = trx_purge_fetch_next_rec(&roll_ptr, - &(node->reservation), + node->undo_rec = trx_purge_fetch_next_rec(&node->roll_ptr, + &node->reservation, node->heap); if (!node->undo_rec) { /* Purge completed for this query thread */ thr->run_node = que_node_get_parent(node); - return(DB_SUCCESS); - } - - node->roll_ptr = roll_ptr; - - if (node->undo_rec == &trx_purge_dummy_rec) { - purge_needed = FALSE; - } else { - purge_needed = row_purge_parse_undo_rec(node, &updated_extern, - thr); - /* If purge_needed == TRUE, we must also remember to unfreeze - data dictionary! */ + return; } - if (purge_needed) { + if (node->undo_rec != &trx_purge_dummy_rec + && row_purge_parse_undo_rec(node, &updated_extern, thr)) { node->found_clust = FALSE; node->index = dict_table_get_next_index( @@ -654,14 +650,14 @@ row_purge( } else if (updated_extern || node->rec_type == TRX_UNDO_UPD_EXIST_REC) { - row_purge_upd_exist_or_extern(node); + row_purge_upd_exist_or_extern(thr, node); } if (node->found_clust) { btr_pcur_close(&(node->pcur)); } - row_mysql_unfreeze_data_dictionary(trx); + row_mysql_unfreeze_data_dictionary(thr_get_trx(thr)); } /* Do some cleanup */ @@ -669,8 +665,6 @@ row_purge( mem_heap_empty(node->heap); thr->run_node = node; - - return(DB_SUCCESS); } /***********************************************************//** @@ -684,9 +678,6 @@ row_purge_step( que_thr_t* thr) /*!< in: query thread */ { purge_node_t* node; -#ifdef UNIV_DEBUG - ulint err; -#endif /* UNIV_DEBUG */ ut_ad(thr); @@ -694,12 +685,7 @@ row_purge_step( ut_ad(que_node_get_type(node) == QUE_NODE_PURGE); -#ifdef UNIV_DEBUG - err = -#endif /* UNIV_DEBUG */ row_purge(node, thr); - ut_ad(err == DB_SUCCESS); - return(thr); } diff --git a/storage/innodb_plugin/row/row0umod.c b/storage/innodb_plugin/row/row0umod.c index 562f8093c38..5202a498eed 100644 --- a/storage/innodb_plugin/row/row0umod.c +++ b/storage/innodb_plugin/row/row0umod.c @@ -173,40 +173,26 @@ row_undo_mod_remove_clust_low( mtr_t* mtr, /*!< in: mtr */ ulint mode) /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE */ { - btr_pcur_t* pcur; btr_cur_t* btr_cur; ulint err; - ibool success; ut_ad(node->rec_type == TRX_UNDO_UPD_DEL_REC); - pcur = &(node->pcur); - btr_cur = btr_pcur_get_btr_cur(pcur); - success = btr_pcur_restore_position(mode, pcur, mtr); + /* Find out if the record has been purged already + or if we can remove it. */ - if (!success) { + if (!btr_pcur_restore_position(mode, &node->pcur, mtr) + || row_vers_must_preserve_del_marked(node->new_trx_id, mtr)) { return(DB_SUCCESS); } - /* Find out if we can remove the whole clustered index record */ - - if (node->rec_type == TRX_UNDO_UPD_DEL_REC - && !row_vers_must_preserve_del_marked(node->new_trx_id, mtr)) { - - /* Ok, we can remove */ - } else { - return(DB_SUCCESS); - } + btr_cur = btr_pcur_get_btr_cur(&node->pcur); if (mode == BTR_MODIFY_LEAF) { - success = btr_cur_optimistic_delete(btr_cur, mtr); - - if (success) { - err = DB_SUCCESS; - } else { - err = DB_FAIL; - } + err = btr_cur_optimistic_delete(btr_cur, mtr) + ? DB_SUCCESS + : DB_FAIL; } else { ut_ad(mode == BTR_MODIFY_TREE); @@ -668,8 +654,9 @@ row_undo_mod_upd_exist_sec( while (node->index != NULL) { index = node->index; - if (row_upd_changes_ord_field_binary( - node->row, node->ext, node->index, node->update)) { + if (row_upd_changes_ord_field_binary(node->index, node->update, + thr, + node->row, node->ext)) { /* Build the newest version of the index entry */ entry = row_build_index_entry(node->row, node->ext, diff --git a/storage/innodb_plugin/row/row0upd.c b/storage/innodb_plugin/row/row0upd.c index e1c78949603..13134afd1aa 100644 --- a/storage/innodb_plugin/row/row0upd.c +++ b/storage/innodb_plugin/row/row0upd.c @@ -498,14 +498,49 @@ row_upd_rec_in_place( n_fields = upd_get_n_fields(update); for (i = 0; i < n_fields; i++) { +#ifdef UNIV_BLOB_DEBUG + btr_blob_dbg_t b; + const byte* field_ref = NULL; +#endif /* UNIV_BLOB_DEBUG */ + upd_field = upd_get_nth_field(update, i); new_val = &(upd_field->new_val); ut_ad(!dfield_is_ext(new_val) == !rec_offs_nth_extern(offsets, upd_field->field_no)); +#ifdef UNIV_BLOB_DEBUG + if (dfield_is_ext(new_val)) { + ulint len; + field_ref = rec_get_nth_field(rec, offsets, i, &len); + ut_a(len != UNIV_SQL_NULL); + ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE); + field_ref += len - BTR_EXTERN_FIELD_REF_SIZE; + + b.ref_page_no = page_get_page_no(page_align(rec)); + b.ref_heap_no = page_rec_get_heap_no(rec); + b.ref_field_no = i; + b.blob_page_no = mach_read_from_4( + field_ref + BTR_EXTERN_PAGE_NO); + ut_a(b.ref_field_no >= index->n_uniq); + btr_blob_dbg_rbt_delete(index, &b, "upd_in_place"); + } +#endif /* UNIV_BLOB_DEBUG */ rec_set_nth_field(rec, offsets, upd_field->field_no, dfield_get_data(new_val), dfield_get_len(new_val)); + +#ifdef UNIV_BLOB_DEBUG + if (dfield_is_ext(new_val)) { + b.blob_page_no = mach_read_from_4( + field_ref + BTR_EXTERN_PAGE_NO); + b.always_owner = b.owner = !(field_ref[BTR_EXTERN_LEN] + & BTR_EXTERN_OWNER_FLAG); + b.del = rec_get_deleted_flag( + rec, rec_offs_comp(offsets)); + + btr_blob_dbg_rbt_insert(index, &b, "upd_in_place"); + } +#endif /* UNIV_BLOB_DEBUG */ } if (UNIV_LIKELY_NULL(page_zip)) { @@ -1192,25 +1227,31 @@ NOTE: we compare the fields as binary strings! @return TRUE if update vector changes an ordering field in the index record */ UNIV_INTERN ibool -row_upd_changes_ord_field_binary( -/*=============================*/ +row_upd_changes_ord_field_binary_func( +/*==================================*/ + dict_index_t* index, /*!< in: index of the record */ + const upd_t* update, /*!< in: update vector for the row; NOTE: the + field numbers in this MUST be clustered index + positions! */ +#ifdef UNIV_DEBUG + const que_thr_t*thr, /*!< in: query thread */ +#endif /* UNIV_DEBUG */ const dtuple_t* row, /*!< in: old value of row, or NULL if the row and the data values in update are not known when this function is called, e.g., at compile time */ - const row_ext_t*ext, /*!< NULL, or prefixes of the externally + const row_ext_t*ext) /*!< NULL, or prefixes of the externally stored columns in the old row */ - dict_index_t* index, /*!< in: index of the record */ - const upd_t* update) /*!< in: update vector for the row; NOTE: the - field numbers in this MUST be clustered index - positions! */ { ulint n_unique; ulint i; const dict_index_t* clust_index; - ut_ad(update); ut_ad(index); + ut_ad(update); + ut_ad(thr); + ut_ad(thr->graph); + ut_ad(thr->graph->trx); n_unique = dict_index_get_n_unique(index); @@ -1252,6 +1293,10 @@ row_upd_changes_ord_field_binary( || dfield_is_null(dfield)) { /* do nothing special */ } else if (UNIV_LIKELY_NULL(ext)) { + /* Silence a compiler warning without + silencing a Valgrind error. */ + dfield_len = 0; + UNIV_MEM_INVALID(&dfield_len, sizeof dfield_len); /* See if the column is stored externally. */ buf = row_ext_lookup(ext, col_no, &dfield_len); @@ -1259,9 +1304,14 @@ row_upd_changes_ord_field_binary( if (UNIV_LIKELY_NULL(buf)) { if (UNIV_UNLIKELY(buf == field_ref_zero)) { - /* This should never happen, but - we try to fail safe here. */ - ut_ad(0); + /* The externally stored field + was not written yet. This + record should only be seen by + recv_recovery_rollback_active(), + when the server had crashed before + storing the field. */ + ut_ad(thr->graph->trx->is_recovered); + ut_ad(trx_is_recv(thr->graph->trx)); return(TRUE); } @@ -1608,8 +1658,8 @@ row_upd_sec_step( ut_ad(!dict_index_is_clust(node->index)); if (node->state == UPD_NODE_UPDATE_ALL_SEC - || row_upd_changes_ord_field_binary(node->row, node->ext, - node->index, node->update)) { + || row_upd_changes_ord_field_binary(node->index, node->update, + thr, node->row, node->ext)) { return(row_upd_sec_index_entry(node, thr)); } @@ -1937,7 +1987,7 @@ row_upd_clust_rec( index, btr_cur_get_block(btr_cur), rec, rec_get_offsets(rec, index, offsets_, ULINT_UNDEFINED, &heap), - big_rec, mtr); + mtr, TRUE, big_rec); mtr_commit(mtr); } @@ -2136,8 +2186,8 @@ exit_func: row_upd_store_row(node); - if (row_upd_changes_ord_field_binary(node->row, node->ext, index, - node->update)) { + if (row_upd_changes_ord_field_binary(index, node->update, thr, + node->row, node->ext)) { /* Update causes an ordering field (ordering fields within the B-tree) of the clustered index record to change: perform diff --git a/storage/innodb_plugin/row/row0vers.c b/storage/innodb_plugin/row/row0vers.c index b6d35363f08..d4fde0b939b 100644 --- a/storage/innodb_plugin/row/row0vers.c +++ b/storage/innodb_plugin/row/row0vers.c @@ -669,11 +669,15 @@ row_vers_build_for_semi_consistent_read( mutex_enter(&kernel_mutex); version_trx = trx_get_on_id(version_trx_id); + if (version_trx + && (version_trx->conc_state == TRX_COMMITTED_IN_MEMORY + || version_trx->conc_state == TRX_NOT_STARTED)) { + + version_trx = NULL; + } mutex_exit(&kernel_mutex); - if (!version_trx - || version_trx->conc_state == TRX_NOT_STARTED - || version_trx->conc_state == TRX_COMMITTED_IN_MEMORY) { + if (!version_trx) { /* We found a version that belongs to a committed transaction: return it. */ diff --git a/storage/innodb_plugin/srv/srv0srv.c b/storage/innodb_plugin/srv/srv0srv.c index f7e7e351bdc..b1fc1ac67fd 100644 --- a/storage/innodb_plugin/srv/srv0srv.c +++ b/storage/innodb_plugin/srv/srv0srv.c @@ -243,6 +243,11 @@ UNIV_INTERN ulong srv_max_buf_pool_modified_pct = 75; /* variable counts amount of data read in total (in bytes) */ UNIV_INTERN ulint srv_data_read = 0; +/* Internal setting for "innodb_stats_method". Decides how InnoDB treats +NULL value when collecting statistics. By default, it is set to +SRV_STATS_NULLS_EQUAL(0), ie. all NULL value are treated equal */ +ulong srv_innodb_stats_method = SRV_STATS_NULLS_EQUAL; + /* here we count the amount of data written in total (in bytes) */ UNIV_INTERN ulint srv_data_written = 0; @@ -2231,6 +2236,12 @@ srv_error_monitor_thread( ulint fatal_cnt = 0; ib_uint64_t old_lsn; ib_uint64_t new_lsn; + /* longest waiting thread for a semaphore */ + os_thread_id_t waiter = os_thread_get_curr_id(); + os_thread_id_t old_waiter = waiter; + /* the semaphore that is being waited for */ + const void* sema = NULL; + const void* old_sema = NULL; old_lsn = srv_start_lsn; @@ -2279,7 +2290,8 @@ loop: sync_arr_wake_threads_if_sema_free(); - if (sync_array_print_long_waits()) { + if (sync_array_print_long_waits(&waiter, &sema) + && sema == old_sema && os_thread_eq(waiter, old_waiter)) { fatal_cnt++; if (fatal_cnt > 10) { @@ -2294,6 +2306,8 @@ loop: } } else { fatal_cnt = 0; + old_waiter = waiter; + old_sema = sema; } /* Flush stderr so that a database user gets the output diff --git a/storage/innodb_plugin/srv/srv0start.c b/storage/innodb_plugin/srv/srv0start.c index 73f8f319704..f8b5049ca65 100644 --- a/storage/innodb_plugin/srv/srv0start.c +++ b/storage/innodb_plugin/srv/srv0start.c @@ -1061,6 +1061,12 @@ innobase_start_or_create_for_mysql(void) ); #endif +#ifdef UNIV_BLOB_DEBUG + fprintf(stderr, + "InnoDB: !!!!!!!! UNIV_BLOB_DEBUG switched on !!!!!!!!!\n" + "InnoDB: Server restart may fail with UNIV_BLOB_DEBUG\n"); +#endif /* UNIV_BLOB_DEBUG */ + #ifdef UNIV_SYNC_DEBUG fprintf(stderr, "InnoDB: !!!!!!!! UNIV_SYNC_DEBUG switched on !!!!!!!!!\n"); diff --git a/storage/innodb_plugin/sync/sync0arr.c b/storage/innodb_plugin/sync/sync0arr.c index 3c825e2202b..13970023573 100644 --- a/storage/innodb_plugin/sync/sync0arr.c +++ b/storage/innodb_plugin/sync/sync0arr.c @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved. +Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2008, Google Inc. Portions of this file contain modifications contributed and copyrighted by @@ -715,7 +715,7 @@ print: fprintf(stderr, "rw-lock %p ", (void*) lock); sync_array_cell_print(stderr, cell); - rw_lock_debug_print(debug); + rw_lock_debug_print(stderr, debug); return(TRUE); } } @@ -914,8 +914,10 @@ Prints warnings of long semaphore waits to stderr. @return TRUE if fatal semaphore wait threshold was exceeded */ UNIV_INTERN ibool -sync_array_print_long_waits(void) -/*=============================*/ +sync_array_print_long_waits( +/*========================*/ + os_thread_id_t* waiter, /*!< out: longest waiting thread */ + const void** sema) /*!< out: longest-waited-for semaphore */ { sync_cell_t* cell; ibool old_val; @@ -923,24 +925,40 @@ sync_array_print_long_waits(void) ulint i; ulint fatal_timeout = srv_fatal_semaphore_wait_threshold; ibool fatal = FALSE; + double longest_diff = 0; for (i = 0; i < sync_primary_wait_array->n_cells; i++) { + double diff; + void* wait_object; + cell = sync_array_get_nth_cell(sync_primary_wait_array, i); - if (cell->wait_object != NULL && cell->waiting - && difftime(time(NULL), cell->reservation_time) > 240) { + wait_object = cell->wait_object; + + if (wait_object == NULL || !cell->waiting) { + + continue; + } + + diff = difftime(time(NULL), cell->reservation_time); + + if (diff > 240) { fputs("InnoDB: Warning: a long semaphore wait:\n", stderr); sync_array_cell_print(stderr, cell); noticed = TRUE; } - if (cell->wait_object != NULL && cell->waiting - && difftime(time(NULL), cell->reservation_time) - > fatal_timeout) { + if (diff > fatal_timeout) { fatal = TRUE; } + + if (diff > longest_diff) { + longest_diff = diff; + *sema = wait_object; + *waiter = cell->thread; + } } if (noticed) { diff --git a/storage/innodb_plugin/sync/sync0rw.c b/storage/innodb_plugin/sync/sync0rw.c index 572c3690a7f..a5da606ad80 100644 --- a/storage/innodb_plugin/sync/sync0rw.c +++ b/storage/innodb_plugin/sync/sync0rw.c @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved. +Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2008, Google Inc. Portions of this file contain modifications contributed and copyrighted by @@ -260,6 +260,9 @@ rw_lock_create_func( contains garbage at initialization and cannot be used for recursive x-locking. */ lock->recursive = FALSE; + /* Silence Valgrind when UNIV_DEBUG_VALGRIND is not enabled. */ + memset((void*) &lock->writer_thread, 0, sizeof lock->writer_thread); + UNIV_MEM_INVALID(&lock->writer_thread, sizeof lock->writer_thread); #ifdef UNIV_SYNC_DEBUG UT_LIST_INIT(lock->debug_list); @@ -925,7 +928,7 @@ rw_lock_list_print_info( info = UT_LIST_GET_FIRST(lock->debug_list); while (info != NULL) { - rw_lock_debug_print(info); + rw_lock_debug_print(file, info); info = UT_LIST_GET_NEXT(list, info); } } @@ -973,7 +976,7 @@ rw_lock_print( info = UT_LIST_GET_FIRST(lock->debug_list); while (info != NULL) { - rw_lock_debug_print(info); + rw_lock_debug_print(stderr, info); info = UT_LIST_GET_NEXT(list, info); } } @@ -985,28 +988,29 @@ UNIV_INTERN void rw_lock_debug_print( /*================*/ + FILE* f, /*!< in: output stream */ rw_lock_debug_t* info) /*!< in: debug struct */ { ulint rwt; rwt = info->lock_type; - fprintf(stderr, "Locked: thread %lu file %s line %lu ", + fprintf(f, "Locked: thread %lu file %s line %lu ", (ulong) os_thread_pf(info->thread_id), info->file_name, (ulong) info->line); if (rwt == RW_LOCK_SHARED) { - fputs("S-LOCK", stderr); + fputs("S-LOCK", f); } else if (rwt == RW_LOCK_EX) { - fputs("X-LOCK", stderr); + fputs("X-LOCK", f); } else if (rwt == RW_LOCK_WAIT_EX) { - fputs("WAIT X-LOCK", stderr); + fputs("WAIT X-LOCK", f); } else { ut_error; } if (info->pass != 0) { - fprintf(stderr, " pass value %lu", (ulong) info->pass); + fprintf(f, " pass value %lu", (ulong) info->pass); } - putc('\n', stderr); + putc('\n', f); } /***************************************************************//** diff --git a/storage/innodb_plugin/trx/trx0i_s.c b/storage/innodb_plugin/trx/trx0i_s.c index e148234888b..5cc9df2d5c4 100644 --- a/storage/innodb_plugin/trx/trx0i_s.c +++ b/storage/innodb_plugin/trx/trx0i_s.c @@ -504,7 +504,7 @@ fill_trx_row( query[stmt_len] = '\0'; row->trx_query = ha_storage_put_memlim( - cache->storage, stmt, stmt_len + 1, + cache->storage, query, stmt_len + 1, MAX_ALLOWED_FOR_STORAGE(cache)); row->trx_query_cs = innobase_get_charset(trx->mysql_thd); diff --git a/storage/innodb_plugin/trx/trx0roll.c b/storage/innodb_plugin/trx/trx0roll.c index 1a43e419214..a4bbf7fd652 100644 --- a/storage/innodb_plugin/trx/trx0roll.c +++ b/storage/innodb_plugin/trx/trx0roll.c @@ -48,8 +48,8 @@ Created 3/26/1996 Heikki Tuuri rollback */ #define TRX_ROLL_TRUNC_THRESHOLD 1 -/** In crash recovery, the current trx to be rolled back */ -static trx_t* trx_roll_crash_recv_trx = NULL; +/** In crash recovery, the current trx to be rolled back; NULL otherwise */ +static const trx_t* trx_roll_crash_recv_trx = NULL; /** In crash recovery we set this to the undo n:o of the current trx to be rolled back. Then we can print how many % the rollback has progressed. */ diff --git a/storage/innodb_plugin/trx/trx0trx.c b/storage/innodb_plugin/trx/trx0trx.c index ee744fd58b1..f0bbf220815 100644 --- a/storage/innodb_plugin/trx/trx0trx.c +++ b/storage/innodb_plugin/trx/trx0trx.c @@ -2010,18 +2010,18 @@ trx_recover_for_mysql( /*******************************************************************//** This function is used to find one X/Open XA distributed transaction which is in the prepared state -@return trx or NULL */ +@return trx or NULL; on match, the trx->xid will be invalidated */ UNIV_INTERN trx_t* trx_get_trx_by_xid( /*===============*/ - XID* xid) /*!< in: X/Open XA transaction identification */ + const XID* xid) /*!< in: X/Open XA transaction identifier */ { trx_t* trx; if (xid == NULL) { - return (NULL); + return(NULL); } mutex_enter(&kernel_mutex); @@ -2034,10 +2034,16 @@ trx_get_trx_by_xid( of gtrid_lenght+bqual_length bytes should be the same */ - if (xid->gtrid_length == trx->xid.gtrid_length + if (trx->conc_state == TRX_PREPARED + && xid->gtrid_length == trx->xid.gtrid_length && xid->bqual_length == trx->xid.bqual_length && memcmp(xid->data, trx->xid.data, xid->gtrid_length + xid->bqual_length) == 0) { + + /* Invalidate the XID, so that subsequent calls + will not find it. */ + memset(&trx->xid, 0, sizeof(trx->xid)); + trx->xid.formatID = -1; break; } @@ -2046,14 +2052,5 @@ trx_get_trx_by_xid( mutex_exit(&kernel_mutex); - if (trx) { - if (trx->conc_state != TRX_PREPARED) { - - return(NULL); - } - - return(trx); - } else { - return(NULL); - } + return(trx); } diff --git a/storage/maria/ha_maria.cc b/storage/maria/ha_maria.cc index 8da234111b2..5661e1f06d7 100644 --- a/storage/maria/ha_maria.cc +++ b/storage/maria/ha_maria.cc @@ -789,9 +789,10 @@ can_enable_indexes(1), bulk_insert_single_undo(BULK_INSERT_NONE) {} -handler *ha_maria::clone(MEM_ROOT *mem_root) +handler *ha_maria::clone(const char *name, MEM_ROOT *mem_root) { - ha_maria *new_handler= static_cast <ha_maria *>(handler::clone(mem_root)); + ha_maria *new_handler= static_cast <ha_maria *>(handler::clone(name, + mem_root)); if (new_handler) { new_handler->file->state= file->state; diff --git a/storage/maria/ha_maria.h b/storage/maria/ha_maria.h index 6ca1f6ca797..6171f89cb18 100644 --- a/storage/maria/ha_maria.h +++ b/storage/maria/ha_maria.h @@ -51,7 +51,7 @@ class ha_maria :public handler public: ha_maria(handlerton *hton, TABLE_SHARE * table_arg); ~ha_maria() {} - handler *clone(MEM_ROOT *mem_root); + handler *clone(const char *name, MEM_ROOT *mem_root); const char *table_type() const { return "MARIA"; } const char *index_type(uint key_number); diff --git a/storage/maria/ma_init.c b/storage/maria/ma_init.c index 9e5b600c972..8f695acfe5a 100644 --- a/storage/maria/ma_init.c +++ b/storage/maria/ma_init.c @@ -21,6 +21,7 @@ #include "trnman_public.h" #include "ma_checkpoint.h" #include <hash.h> +#include <my_handler.h> void history_state_free(MARIA_STATE_HISTORY_CLOSED *closed_history) { diff --git a/storage/myisam/ft_stopwords.c b/storage/myisam/ft_stopwords.c index 05948a936a6..5bfbf7e6c61 100644 --- a/storage/myisam/ft_stopwords.c +++ b/storage/myisam/ft_stopwords.c @@ -16,7 +16,7 @@ /* Written by Sergei A. Golubchik, who has a shared copyright to this code */ #include "ftdefs.h" -#include "my_handler.h" +#include "my_compare.h" typedef struct st_ft_stopwords { diff --git a/storage/myisam/ha_myisam.cc b/storage/myisam/ha_myisam.cc index f8f9340ba3a..f736a99df42 100644 --- a/storage/myisam/ha_myisam.cc +++ b/storage/myisam/ha_myisam.cc @@ -20,12 +20,8 @@ #define MYSQL_SERVER 1 #include "mysql_priv.h" -#include <mysql/plugin.h> -#include <m_ctype.h> #include <my_bit.h> -#include <myisampack.h> #include "ha_myisam.h" -#include <stdarg.h> #include "myisamdef.h" #include "rt_index.h" @@ -552,9 +548,10 @@ ha_myisam::ha_myisam(handlerton *hton, TABLE_SHARE *table_arg) can_enable_indexes(1) {} -handler *ha_myisam::clone(MEM_ROOT *mem_root) +handler *ha_myisam::clone(const char *name, MEM_ROOT *mem_root) { - ha_myisam *new_handler= static_cast <ha_myisam *>(handler::clone(mem_root)); + ha_myisam *new_handler= static_cast <ha_myisam *>(handler::clone(name, + mem_root)); if (new_handler) new_handler->file->state= file->state; return new_handler; diff --git a/storage/myisam/ha_myisam.h b/storage/myisam/ha_myisam.h index 0e53bc58c81..7ca9c191bdf 100644 --- a/storage/myisam/ha_myisam.h +++ b/storage/myisam/ha_myisam.h @@ -21,7 +21,6 @@ /* class for the the myisam handler */ #include <myisam.h> -#include <myisamchk.h> #include <ft_global.h> #define HA_RECOVER_NONE 0 /* No automatic recover */ @@ -46,7 +45,7 @@ class ha_myisam: public handler public: ha_myisam(handlerton *hton, TABLE_SHARE *table_arg); ~ha_myisam() {} - handler *clone(MEM_ROOT *mem_root); + handler *clone(const char *name, MEM_ROOT *mem_root); const char *table_type() const { return "MyISAM"; } const char *index_type(uint key_number); const char **bas_ext() const; diff --git a/storage/myisam/mi_create.c b/storage/myisam/mi_create.c index d5ce941bf75..f40f1bad710 100644 --- a/storage/myisam/mi_create.c +++ b/storage/myisam/mi_create.c @@ -848,8 +848,8 @@ int mi_create(const char *name,uint keys,MI_KEYDEF *keydefs, err: pthread_mutex_unlock(&THR_LOCK_myisam); -err_no_lock: +err_no_lock: save_errno=my_errno; switch (errpos) { case 3: diff --git a/storage/myisam/mi_test1.c b/storage/myisam/mi_test1.c index a6e870f3cde..199334e8b1b 100644 --- a/storage/myisam/mi_test1.c +++ b/storage/myisam/mi_test1.c @@ -16,6 +16,7 @@ /* Testing of the basic functions of a MyISAM table */ #include "myisam.h" +#include "myisamdef.h" #include <my_getopt.h> #include <m_string.h> diff --git a/storage/myisam/mi_write.c b/storage/myisam/mi_write.c index 86c4df77817..8229dfef9a5 100644 --- a/storage/myisam/mi_write.c +++ b/storage/myisam/mi_write.c @@ -17,6 +17,7 @@ #include "fulltext.h" #include "rt_index.h" +#include "my_compare.h" #define MAX_POINTER_LENGTH 8 diff --git a/storage/myisam/myisamdef.h b/storage/myisam/myisamdef.h index e03f88b1a1f..ee782cf2ab0 100644 --- a/storage/myisam/myisamdef.h +++ b/storage/myisam/myisamdef.h @@ -15,8 +15,8 @@ /* This file is included by all internal myisam files */ -#include "myisam.h" /* Structs & some defines */ -#include "myisampack.h" /* packing of keys */ +#include <myisam.h> /* Structs & some defines */ +#include <myisampack.h> /* packing of keys */ #include <my_tree.h> #ifdef THREAD #include <my_pthread.h> diff --git a/storage/myisam/sp_test.c b/storage/myisam/sp_test.c index f572c7ab19b..7a30a742fd6 100644 --- a/storage/myisam/sp_test.c +++ b/storage/myisam/sp_test.c @@ -17,6 +17,7 @@ /* Written by Alex Barkov, who has a shared copyright to this code */ #include "myisam.h" +#include "myisamdef.h" #ifdef HAVE_SPATIAL #include "sp_defs.h" diff --git a/storage/myisammrg/ha_myisammrg.cc b/storage/myisammrg/ha_myisammrg.cc index 99513e2267f..e6fd3ed4939 100644 --- a/storage/myisammrg/ha_myisammrg.cc +++ b/storage/myisammrg/ha_myisammrg.cc @@ -459,8 +459,7 @@ int ha_myisammrg::open(const char *name, int mode __attribute__((unused)), problem because all locking is handled by the original MERGE table from which this is cloned of. */ - if (!(file= myrg_open(table->s->normalized_path.str, table->db_stat, - HA_OPEN_IGNORE_IF_LOCKED))) + if (!(file= myrg_open(name, table->db_stat, HA_OPEN_IGNORE_IF_LOCKED))) { DBUG_PRINT("error", ("my_errno %d", my_errno)); DBUG_RETURN(my_errno ? my_errno : -1); @@ -484,7 +483,7 @@ int ha_myisammrg::open(const char *name, int mode __attribute__((unused)), @return A cloned handler instance. */ -handler *ha_myisammrg::clone(MEM_ROOT *mem_root) +handler *ha_myisammrg::clone(const char *name, MEM_ROOT *mem_root) { MYRG_TABLE *u_table,*newu_table; ha_myisammrg *new_handler= @@ -505,8 +504,8 @@ handler *ha_myisammrg::clone(MEM_ROOT *mem_root) return NULL; } - if (new_handler->ha_open(table, table->s->normalized_path.str, table->db_stat, - HA_OPEN_IGNORE_IF_LOCKED)) + if (new_handler->ha_open(table, name, table->db_stat, + HA_OPEN_IGNORE_IF_LOCKED)) { delete new_handler; return NULL; diff --git a/storage/myisammrg/ha_myisammrg.h b/storage/myisammrg/ha_myisammrg.h index 4a65a60d97b..aa8f8d3e2be 100644 --- a/storage/myisammrg/ha_myisammrg.h +++ b/storage/myisammrg/ha_myisammrg.h @@ -62,7 +62,7 @@ class ha_myisammrg: public handler int open(const char *name, int mode, uint test_if_locked); int attach_children(void); int detach_children(void); - virtual handler *clone(MEM_ROOT *mem_root); + virtual handler *clone(const char *name, MEM_ROOT *mem_root); int close(void); int write_row(uchar * buf); int update_row(const uchar * old_data, uchar * new_data); |